--- trunk2/lib/WebPAC.pm	2004/09/10 22:24:42	421
+++ trunk2/lib/WebPAC.pm	2004/09/15 21:21:36	453
@@ -9,6 +9,7 @@
 use XML::Simple;
 use Template;
 use Log::Log4perl qw(get_logger :levels);
+use Time::HiRes qw(time);
 
 use Data::Dumper;
 
@@ -33,34 +34,19 @@
 
  my $webpac = new WebPAC(
  	config_file => 'name.conf',
-	[code_page => 'ISO-8859-2',]
-	[low_mem => 1,]
+	code_page => 'ISO-8859-2',
+	low_mem => 1,
  );
 
 Default C<code_page> is C<ISO-8859-2>.
 
+By default the C<low_mem> option is not used (see L</"MEMORY USAGE"> below).
+
 This method will also read configuration files
 C<global.conf> (used by indexer and Web font-end)
 and configuration file specified by C<config_file>
 which describes databases to be indexed.
 
-C<low_mem> options is double-edged sword. If enabled, WebPAC
-will run on memory constraint machines (which doesn't have enough
-physical RAM to create memory structure for whole ISIS database).
-
-If your machine has 512Mb or more and database is around 10000 records,
-memory shouldn't be an issue. If you don't have enough physical RAM, you
-might consider using virtual memory (if your operating system is handling it
-well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
-parsed structure of ISIS database.
-
-However, when WebPAC is running on desktop machines (or laptops :-), it's
-highly undesireable for system to start swapping. Using C<low_mem> option can
-reduce WecPAC memory usage to 16Mb for same database with lookup fields and
-sorted indexes which stay in RAM. Performance will suffer, but memory usage
-will really be minimal. It might be also more confortable to run WebPAC reniced
-on those machines.
-
 =cut
 
 # mapping between data type and tag which specify
@@ -77,6 +63,8 @@
         my $self = {@_};
         bless($self, $class);
 
+	$self->{'start_t'} = time();
+
 	my $log_file = $self->{'log'} || "log.conf";
 	Log::Log4perl->init($log_file);
 
@@ -126,7 +114,7 @@
 
 	# running with low_mem flag? well, use DBM::Deep then.
 	if ($self->{'low_mem'}) {
-		$log->info("running with low_mem which impacts performance (<64 Mb memory usage)");
+		$log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
 
 		my $db_file = "data.db";
 
@@ -135,7 +123,7 @@
 			$log->debug("removed '$db_file' from last run");
 		}
 
-		use DBM::Deep;
+		require DBM::Deep;
 
 		my $db = new DBM::Deep $db_file;
 
@@ -144,7 +132,7 @@
 		if ($db->error()) {
 			$log->logdie("can't open '$db_file' under low_mem: ",$db->error());
 		} else {
-			$log->debug("using file $db_file for DBM::Deep");
+			$log->debug("using file '$db_file' for DBM::Deep");
 		}
 
 		$self->{'db'} = $db;
@@ -160,12 +148,16 @@
  $webpac->open_isis(
  	filename => '/data/ISIS/ISIS',
 	code_page => '852',
-	limit_mfn => '500',
+	limit_mfn => 500,
+	start_mfn => 6000,
 	lookup => [ ... ],
  );
 
 By default, ISIS code page is assumed to be C<852>.
 
+If optional parameter C<start_mfn> is set, it will be the first MFN read
+from the database (so you can skip the beginning of your database if you need to).
+
 If optional parametar C<limit_mfn> is set, it will read just 500 records
 from database in example above.
 
@@ -193,6 +185,8 @@
 	$log->logcroak("need filename") if (! $arg->{'filename'});
 	my $code_page = $arg->{'code_page'} || '852';
 
+	$log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
+
 	# store data in object
 	$self->{'isis_filename'} = $arg->{'filename'};
 	$self->{'isis_code_page'} = $code_page;
@@ -210,13 +204,21 @@
 	my $isis_db = OpenIsis::open($arg->{'filename'});
 
 	my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
+	my $startmfn = 1;
+
+	if (my $s = $self->{'start_mfn'}) {
+		$log->info("skipping to MFN $s");
+		$startmfn = $s;
+	} else {
+		$self->{'start_mfn'} = $startmfn;
+	}
 
-	$maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});
+	$maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
 
-	$log->info("processing $maxmfn records...");
+	$log->info("processing ",($maxmfn-$startmfn)." records...");
 
 	# read database
-	for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
+	for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
 
 
 		$log->debug("mfn: $mfn\n");
@@ -264,7 +266,7 @@
 
 	}
 
-	$self->{'current_mfn'} = 1;
+	$self->{'current_mfn'} = -1;
 	$self->{'last_pcnt'} = 0;
 
 	$log->debug("max mfn: $maxmfn");
@@ -287,7 +289,15 @@
 
 	my $log = $self->_get_logger();
 
-	my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");
+	$log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
+
+	if ($self->{'current_mfn'} == -1) {
+		$self->{'current_mfn'} = $self->{'start_mfn'};
+	} else {
+		$self->{'current_mfn'}++;
+	}
+
+	my $mfn = $self->{'current_mfn'};
 
 	if ($mfn > $self->{'max_mfn'}) {
 		$self->{'current_mfn'} = $self->{'max_mfn'};
@@ -304,6 +314,19 @@
 	}
 }
 
+=head2 mfn
+
+Returns MFN (record number) stored as current position in this object.
+
+ my $current = $webpac->mfn;
+
+=cut
+
+sub mfn {
+	my ($self) = @_;
+	return $self->{current_mfn};
+}
+
 =head2 progress_bar
 
 Draw progress bar on STDERR.
@@ -329,16 +352,52 @@
 
 	$self->{'last_pcnt'} ||= 1;
 
-	$self->{'last_pcnt'} = $curr if ($curr < $self->{'last_pcnt'});
-
 	my $p = int($curr * 100 / $max);
+
+	# reset on re-run
+	if ($p < $self->{'last_pcnt'}) {
+		$self->{'last_pcnt'} = $p;
+		$self->{'last_t'} = time();
+		$self->{'last_curr'} = undef;
+	}
+
 	if ($p != $self->{'last_pcnt'}) {
-		printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$curr,$max,"=" x ($p/2).">", $p );
+
+		my $last_curr = $self->{'last_curr'} || $curr;
+		my $t = time();
+		my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
+		my $eta = ($max-$curr) / ($rate || 1);
+		printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
 		$self->{'last_pcnt'} = $p;
+		$self->{'last_t'} = time();
+		$self->{'last_curr'} = $curr;
 	}
 	print STDERR "\n" if ($p == 100);
 }
 
+=head2 fmt_time
+
+Format a duration (in seconds) for display as C<MM:SS> or C<HhMM:SS>.
+
+ print $webpac->fmt_time(3725);	# prints 1h02:05
+
+This method is called by L<progress_bar> to display remaining time.
+
+=cut
+
+sub fmt_time {
+	my $self = shift;
+
+	# undef, zero and negative durations are all shown as 00:00
+	my $t = int(shift(@_) || 0);
+	$t = 0 if ($t < 0);
+
+	# plain arithmetic instead of gmtime() so hours don't wrap after 24h
+	my ($hh, $mm, $ss) = (int($t / 3600), int($t / 60) % 60, $t % 60);
+	# two trailing spaces are appended when there is no hour part (as before)
+	return sprintf("%s%02d:%02d%s", $hh ? "${hh}h" : "", $mm, $ss, $hh ? "" : "  ");
+}
+
 =head2 open_import_xml
 
 Read file from C<import_xml/> directory and parse it.
@@ -718,6 +777,31 @@
 	return @arr;
 }
 
+=head2 sort_arr
+
+Sort array ignoring case and html in data (values are returned unmodified)
+
+ my @sorted = $webpac->sort_arr(@unsorted);
+
+=cut
+
+sub sort_arr {
+	my $self = shift;
+
+	my $log = $self->_get_logger();
+
+	# Schwartzian transform: build the sort key (markup stripped, lower-cased)
+	# on a copy, so neither @_ (aliased to caller's data) nor the returned
+	# values lose their html. NOTE(review): order is descending, as before.
+	my @sorted = map { $_->[1] }
+		sort { $b->[0] cmp $a->[0] }
+		map { (my $k = $_) =~ s#<[^>]+>##g; [ lc($k), $_ ] } @_;
+
+	$log->debug("sorted values: ",sub { join(", ",@sorted) });
+
+	return @sorted;
+}
+
 
 =head2 data_structure
 
@@ -774,6 +858,11 @@
 			}
 			next if (! @v);
 
+			if ($tag->{'sort'}) {
+				@v = $self->sort_arr(@v);
+				$log->warn("sort within tag is usually not what you want!");
+			}
+
 			# use format?
 			if ($tag->{'format_name'}) {
 				@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
@@ -788,12 +877,35 @@
 				next; # don't return headline in data_structure!
 			}
 
-			# does tag have type?
-			if ($tag->{'type'}) {
-				push @{$row->{$tag->{'type'}}}, @v;
-			} else {
-				push @{$row->{'display'}}, @v;
-				push @{$row->{'swish'}}, @v;
+			# delimiter will join repeatable fields
+			if ($tag->{'delimiter'}) {
+				@v = ( join($tag->{'delimiter'}, @v) );
+			}
+
+			# default types 
+			my @types = qw(display swish);
+			# override by type attribute
+			@types = ( $tag->{'type'} ) if ($tag->{'type'});
+
+			foreach my $type (@types) {
+				# append to previous line?
+				$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
+				if ($tag->{'append'}) {
+
+					# I will delimit appended part with
+					# delimiter (or ,)
+					my $d = $tag->{'delimiter'};
+					# default delimiter
+					$d ||= ", ";
+
+					my $last = pop @{$row->{$type}};
+					$d = "" if (! $last);
+					$last .= $d . join($d, @v);
+					push @{$row->{$type}}, $last;
+
+				} else {
+					push @{$row->{$type}}, @v;
+				}
 			}
 
 
@@ -806,6 +918,11 @@
 			my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
 			$row->{'name'} = $name ? $self->_x($name) : $field;
 
+			# post-sort all values in field
+			if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
+				$log->warn("sort at field tag not implemented");
+			}
+
 			push @ds, $row;
 
 			$log->debug("row $field: ",sub { Dumper($row) });
@@ -1021,6 +1138,37 @@
 also use method names, and not only classes (which are just few)
 to filter logging.
 
+
+=head1 MEMORY USAGE
+
+The C<low_mem> option is a double-edged sword. If enabled, WebPAC
+will run on memory-constrained machines (which don't have enough
+physical RAM to create the memory structure for the whole source database).
+
+If your machine has 512Mb or more of RAM and the database is around 10000
+records, memory shouldn't be an issue. If you don't have enough physical RAM,
+you might consider using virtual memory (if your operating system handles it
+well, like FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle the
+parsed structure of the ISIS database (this is what the C<low_mem> option does).
+
+Hitting swap at the end of reading the source database is probably o.k. However,
+hitting swap before 90% will dramatically decrease performance and you will
+be better off with C<low_mem> and using the rest of available memory for the
+operating system disk cache (Linux is particularly good about this).
+However, every access to a database record will require disk access, so the
+generation phase will be 10-100 times slower.
+
+Parsed structures are essential - you just have the option to trade RAM
+(which is fast) for disk space (which is slow). Be sure to have plenty of
+disk space if you are using C<low_mem> and thus L<DBM::Deep>.
+
+However, when WebPAC is running on desktop machines (or laptops :-), it's
+highly undesirable for the system to start swapping. Using the C<low_mem> option
+can reduce WebPAC memory usage to around 64Mb for the same database with lookup
+fields and sorted indexes which stay in RAM. Performance will suffer, but
+memory usage will really be minimal. It might also be more comfortable to
+run WebPAC reniced on those machines.
+
 =cut
 
 1;