--- trunk2/lib/WebPAC.pm 2004/07/20 17:15:48 389 +++ trunk2/lib/WebPAC.pm 2004/09/11 08:36:38 422 @@ -9,6 +9,7 @@ use XML::Simple; use Template; use Log::Log4perl qw(get_logger :levels); +use Time::HiRes qw(time); use Data::Dumper; @@ -29,16 +30,19 @@ =head2 new -This will create new instance of WebPAC using configuration specified by C. +Create new instance of WebPAC using configuration specified by C. my $webpac = new WebPAC( config_file => 'name.conf', [code_page => 'ISO-8859-2',] + [low_mem => 1,] ); Default C is C. -It will also read configuration files +Default is not to use C options (see L below). + +This method will also read configuration files C (used by indexer and Web font-end) and configuration file specified by C which describes databases to be indexed. @@ -59,6 +63,8 @@ my $self = {@_}; bless($self, $class); + $self->{'start_t'} = time(); + my $log_file = $self->{'log'} || "log.conf"; Log::Log4perl->init($log_file); @@ -106,6 +112,32 @@ EVAL_PERL => 1, ); + # running with low_mem flag? well, use DBM::Deep then. + if ($self->{'low_mem'}) { + $log->info("running with low_mem which impacts performance (<32 Mb memory usage)"); + + my $db_file = "data.db"; + + if (-e $db_file) { + unlink $db_file or $log->logdie("can't remove '$db_file' from last run"); + $log->debug("removed '$db_file' from last run"); + } + + use DBM::Deep; + + my $db = new DBM::Deep $db_file; + + $log->logdie("DBM::Deep error: $!") unless ($db); + + if ($db->error()) { + $log->logdie("can't open '$db_file' under low_mem: ",$db->error()); + } else { + $log->debug("using file '$db_file' for DBM::Deep"); + } + + $self->{'db'} = $db; + } + return $self; } @@ -149,6 +181,10 @@ $log->logcroak("need filename") if (! $arg->{'filename'}); my $code_page = $arg->{'code_page'} || '852'; + # store data in object + $self->{'isis_filename'} = $arg->{'filename'}; + $self->{'isis_code_page'} = $code_page; + use OpenIsis; #$self->{'isis_code_page'} = $code_page; @@ -157,6 +193,7 @@ my $cp = Text::Iconv->new($code_page,$self->{'code_page'}); $log->info("reading ISIS database '",$arg->{'filename'},"'"); + $log->debug("isis code page: $code_page"); my $isis_db = OpenIsis::open($arg->{'filename'}); @@ -169,6 +206,11 @@ # read database for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) { + + $log->debug("mfn: $mfn\n"); + + my $rec; + # read record my $row = OpenIsis::read( $isis_db, $mfn ); foreach my $k (keys %{$row}) { @@ -186,16 +228,24 @@ $val = $l; } - push @{$self->{'data'}->{$mfn}->{$k}}, $val; + push @{$rec->{$k}}, $val; } } else { - push @{$self->{'data'}->{$mfn}->{'000'}}, $mfn; + push @{$rec->{'000'}}, $mfn; } } + $log->confess("record $mfn empty?") unless ($rec); + + # store + if ($self->{'low_mem'}) { + $self->{'db'}->put($mfn, $rec); + } else { + $self->{'data'}->{$mfn} = $rec; + } + # create lookup - my $rec = $self->{'data'}->{$mfn}; $self->create_lookup($rec, @{$arg->{'lookup'}}); $self->progress_bar($mfn,$maxmfn); @@ -205,6 +255,8 @@ $self->{'current_mfn'} = 1; $self->{'last_pcnt'} = 0; + $log->debug("max mfn: $maxmfn"); + # store max mfn and return it. return $self->{'max_mfn'} = $maxmfn; } @@ -233,7 +285,11 @@ $self->progress_bar($mfn,$self->{'max_mfn'}); - return $self->{'data'}->{$mfn}; + if ($self->{'low_mem'}) { + return $self->{'db'}->get($mfn); + } else { + return $self->{'data'}->{$mfn}; + } } =head2 progress_bar @@ -261,13 +317,50 @@ $self->{'last_pcnt'} ||= 1; - $self->{'last_pcnt'} = $curr if ($curr < $self->{'last_pcnt'}); - my $p = int($curr * 100 / $max); + + # reset on re-run + if ($p < $self->{'last_pcnt'}) { + $self->{'last_pcnt'} = $p; + $self->{'last_t'} = time(); + $self->{'last_curr'} = 1; + } + if ($p != $self->{'last_pcnt'}) { - printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$curr,$max,"=" x ($p/2).">", $p ); + + my $last_curr = $self->{'last_curr'} || $curr; + my $t = time(); + my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1)); + my $eta = ($max-$curr) / ($rate || 1); + printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta)); $self->{'last_pcnt'} = $p; + $self->{'last_t'} = time(); + $self->{'last_curr'} = $curr; } + print STDERR "\n" if ($p == 100); +} + +=head2 fmt_time + +Format time (in seconds) for display. + + print $webpac->fmt_time(time()); + +This method is called by L to display remaining time. + +=cut + +sub fmt_time { + my $self = shift; + + my $t = shift || 0; + my $out = ""; + + my ($ss,$mm,$hh) = gmtime($t); + $out .= "${hh}h" if ($hh); + $out .= sprintf("%02d:%02d", $mm,$ss); + $out .= " " if ($hh == 0); + return $out; } =head2 open_import_xml @@ -329,21 +422,23 @@ $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); foreach my $i (@_) { - if ($i->{'eval'}) { - my $eval = $self->fill_in($rec,$i->{'eval'}); - my $key = $self->fill_in($rec,$i->{'key'}); - my @val = $self->fill_in($rec,$i->{'val'}); - if ($key && @val && eval $eval) { + $log->logconfess("need key") unless defined($i->{'key'}); + $log->logconfess("need val") unless defined($i->{'val'}); + + if (defined($i->{'eval'})) { + # eval first, so we can skip fill_in for key and val + my $eval = $self->fill_in($rec,$i->{'eval'}) || next; + if ($self->_eval($eval)) { + my $key = $self->fill_in($rec,$i->{'key'}) || next; + my @val = $self->fill_in($rec,$i->{'val'}) || next; $log->debug("stored $key = ",sub { join(" | ",@val) }); push @{$self->{'lookup'}->{$key}}, @val; } } else { - my $key = $self->fill_in($rec,$i->{'key'}); - my @val = $self->fill_in($rec,$i->{'val'}); - if ($key && @val) { - $log->debug("stored $key = ",sub { join(" | ",@val) }); - push @{$self->{'lookup'}->{$key}}, @val; - } + my $key = $self->fill_in($rec,$i->{'key'}) || next; + my @val = $self->fill_in($rec,$i->{'val'}) || next; + $log->debug("stored $key = ",sub { join(" | ",@val) }); + push @{$self->{'lookup'}->{$key}}, @val; } } } @@ -656,7 +751,8 @@ my @ds = $webpac->data_structure($rec); This method will also set C<$webpac->{'currnet_filename'}> if there is - tag in C. + tag in C and C<$webpac->{'headline'}> if there is + tag. =cut @@ -669,6 +765,7 @@ $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); undef $self->{'currnet_filename'}; + undef $self->{'headline'}; my @sorted_tags; if ($self->{tags_by_order}) { @@ -706,6 +803,15 @@ @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v; } + if ($field eq 'filename') { + $self->{'current_filename'} = join('',@v); + $log->debug("filename: ",$self->{'current_filename'}); + } elsif ($field eq 'headline') { + $self->{'headline'} .= join('',@v); + $log->debug("headline: ",$self->{'headline'}); + next; # don't return headline in data_structure! + } + # does tag have type? if ($tag->{'type'}) { push @{$row->{$tag->{'type'}}}, @v; @@ -714,10 +820,6 @@ push @{$row->{'swish'}}, @v; } - if ($field eq 'filename') { - $self->{'current_filename'} = join('',@v); - $log->debug("filename: ",$self->{'current_filename'}); - } } @@ -768,6 +870,38 @@ return $out; } +=head2 output_file + +Create output from in-memory data structure using Template Toolkit template +to a file. + + $webpac->output_file( + file => 'out.txt', + template => 'text.tt', + data => @ds + ); + +=cut + +sub output_file { + my $self = shift; + + my $args = {@_}; + + my $log = $self->_get_logger(); + + my $file = $args->{'file'} || $log->logconfess("need file name"); + + $log->debug("creating file ",$file); + + open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!"); + print $fh $self->output( + template => $args->{'template'}, + data => $args->{'data'}, + ) || $log->logdie("print: $!"); + close($fh) || $log->logdie("close: $!"); +} + =head2 apply_format Apply format specified in tag with C and @@ -911,6 +1045,37 @@ also use method names, and not only classes (which are just few) to filter logging. + +=head1 MEMORY USAGE + +C options is double-edged sword. If enabled, WebPAC +will run on memory constraint machines (which doesn't have enough +physical RAM to create memory structure for whole source database). + +If your machine has 512Mb or more of RAM and database is around 10000 records, +memory shouldn't be an issue. If you don't have enough physical RAM, you +might consider using virtual memory (if your operating system is handling it +well, like on FreeBSD or Linux) instead of dropping to L to handle +parsed structure of ISIS database (this is what C option does). + +Hitting swap at end of reading source database is probably o.k. However, +hitting swap before 90% will dramatically decrease performance and you will +be better off with C and using rest of availble memory for +operating system disk cache (Linux is particuallary good about this). +However, every access to database record will require disk access, so +generation phase will be slower 10-100 times. + +Parsed structures are essential - you just have option to trade RAM memory +(which is fast) for disk space (which is slow). Be sure to have planty of +disk space if you are using C and thus L. + +However, when WebPAC is running on desktop machines (or laptops :-), it's +highly undesireable for system to start swapping. Using C option can +reduce WecPAC memory usage to around 64Mb for same database with lookup +fields and sorted indexes which stay in RAM. Performance will suffer, but +memory usage will really be minimal. It might be also more confortable to +run WebPAC reniced on those machines. + =cut 1;