--- trunk2/lib/WebPAC.pm 2004/09/09 18:08:38 418 +++ trunk2/lib/WebPAC.pm 2004/10/30 23:58:36 563 @@ -9,6 +9,7 @@ use XML::Simple; use Template; use Log::Log4perl qw(get_logger :levels); +use Time::HiRes qw(time); use Data::Dumper; @@ -29,16 +30,26 @@ =head2 new -This will create new instance of WebPAC using configuration specified by C. +Create new instance of WebPAC using configuration specified by C. my $webpac = new WebPAC( config_file => 'name.conf', - [code_page => 'ISO-8859-2',] + code_page => 'ISO-8859-2', + low_mem => 1, + filter => { + 'lower' => sub { lc($_[0]) }, + }, ); Default C is C. -It will also read configuration files +Default is not to use C options (see L below). + +There is optinal parametar C which specify different filters which +can be applied using C notation. +Same filters can be used in Template Toolkit files. + +This method will also read configuration files C (used by indexer and Web font-end) and configuration file specified by C which describes databases to be indexed. @@ -59,6 +70,8 @@ my $self = {@_}; bless($self, $class); + $self->{'start_t'} = time(); + my $log_file = $self->{'log'} || "log.conf"; Log::Log4perl->init($log_file); @@ -100,12 +113,38 @@ # create Template toolkit instance $self->{'tt'} = Template->new( INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'), -# FILTERS => { -# 'foo' => \&foo_filter, -# }, + FILTERS => $self->{'filter'}, EVAL_PERL => 1, ); + # running with low_mem flag? well, use DBM::Deep then. + if ($self->{'low_mem'}) { + $log->info("running with low_mem which impacts performance (<32 Mb memory usage)"); + + my $db_file = "data.db"; + + if (-e $db_file) { + unlink $db_file or $log->logdie("can't remove '$db_file' from last run"); + $log->debug("removed '$db_file' from last run"); + } + + require DBM::Deep; + + my $db = new DBM::Deep $db_file; + + $log->logdie("DBM::Deep error: $!") unless ($db); + + if ($db->error()) { + $log->logdie("can't open '$db_file' under low_mem: ",$db->error()); + } else { + $log->debug("using file '$db_file' for DBM::Deep"); + } + + $self->{'db'} = $db; + } + + $log->debug("filters defined: ",Dumper($self->{'filter'})); + return $self; } @@ -116,12 +155,16 @@ $webpac->open_isis( filename => '/data/ISIS/ISIS', code_page => '852', - limit_mfn => '500', + limit_mfn => 500, + start_mfn => 6000, lookup => [ ... ], ); By default, ISIS code page is assumed to be C<852>. +If optional parametar C is set, this will be first MFN to read +from database (so you can skip beginning of your database if you need to). + If optional parametar C is set, it will read just 500 records from database in example above. @@ -149,6 +192,12 @@ $log->logcroak("need filename") if (! $arg->{'filename'}); my $code_page = $arg->{'code_page'} || '852'; + $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*')); + + # store data in object + $self->{'isis_filename'} = $arg->{'filename'}; + $self->{'isis_code_page'} = $code_page; + use OpenIsis; #$self->{'isis_code_page'} = $code_page; @@ -157,21 +206,32 @@ my $cp = Text::Iconv->new($code_page,$self->{'code_page'}); $log->info("reading ISIS database '",$arg->{'filename'},"'"); + $log->debug("isis code page: $code_page"); my $isis_db = OpenIsis::open($arg->{'filename'}); my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1; + my $startmfn = 1; - $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn}); + if (my $s = $self->{'start_mfn'}) { + $log->info("skipping to MFN $s"); + $startmfn = $s; + } else { + $self->{'start_mfn'} = $startmfn; + } + + $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn}); - $log->info("processing $maxmfn records..."); + $log->info("processing ",($maxmfn-$startmfn)." records..."); # read database - for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) { + for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) { $log->debug("mfn: $mfn\n"); + my $rec; + # read record my $row = OpenIsis::read( $isis_db, $mfn ); foreach my $k (keys %{$row}) { @@ -189,23 +249,31 @@ $val = $l; } - push @{$self->{'data'}->{$mfn}->{$k}}, $val; + push @{$rec->{$k}}, $val; } } else { - push @{$self->{'data'}->{$mfn}->{'000'}}, $mfn; + push @{$rec->{'000'}}, $mfn; } } + $log->confess("record $mfn empty?") unless ($rec); + + # store + if ($self->{'low_mem'}) { + $self->{'db'}->put($mfn, $rec); + } else { + $self->{'data'}->{$mfn} = $rec; + } + # create lookup - my $rec = $self->{'data'}->{$mfn} || $log->confess("record $mfn empty?"); $self->create_lookup($rec, @{$arg->{'lookup'}}); $self->progress_bar($mfn,$maxmfn); } - $self->{'current_mfn'} = 1; + $self->{'current_mfn'} = -1; $self->{'last_pcnt'} = 0; $log->debug("max mfn: $maxmfn"); @@ -228,7 +296,15 @@ my $log = $self->_get_logger(); - my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!"); + $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'}); + + if ($self->{'current_mfn'} == -1) { + $self->{'current_mfn'} = $self->{'start_mfn'}; + } else { + $self->{'current_mfn'}++; + } + + my $mfn = $self->{'current_mfn'}; if ($mfn > $self->{'max_mfn'}) { $self->{'current_mfn'} = $self->{'max_mfn'}; @@ -238,7 +314,24 @@ $self->progress_bar($mfn,$self->{'max_mfn'}); - return $self->{'data'}->{$mfn}; + if ($self->{'low_mem'}) { + return $self->{'db'}->get($mfn); + } else { + return $self->{'data'}->{$mfn}; + } +} + +=head2 mfn + +Returns current record number (MFN). + + print $webpac->mfn; + +=cut + +sub mfn { + my $self = shift; + return $self->{'current_mfn'}; } =head2 progress_bar @@ -266,16 +359,54 @@ $self->{'last_pcnt'} ||= 1; - $self->{'last_pcnt'} = $curr if ($curr < $self->{'last_pcnt'}); + my $p = int($curr * 100 / $max) || 1; + + # reset on re-run + if ($p < $self->{'last_pcnt'}) { + $self->{'last_pcnt'} = $p; + $self->{'last_t'} = time(); + $self->{'last_curr'} = undef; + } + + $self->{'last_t'} ||= time(); - my $p = int($curr * 100 / $max); if ($p != $self->{'last_pcnt'}) { - printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$curr,$max,"=" x ($p/2).">", $p ); + + my $last_curr = $self->{'last_curr'} || $curr; + my $t = time(); + my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1)); + my $eta = ($max-$curr) / ($rate || 1); + printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta)); $self->{'last_pcnt'} = $p; + $self->{'last_t'} = time(); + $self->{'last_curr'} = $curr; } print STDERR "\n" if ($p == 100); } +=head2 fmt_time + +Format time (in seconds) for display. + + print $webpac->fmt_time(time()); + +This method is called by L to display remaining time. + +=cut + +sub fmt_time { + my $self = shift; + + my $t = shift || 0; + my $out = ""; + + my ($ss,$mm,$hh) = gmtime($t); + $out .= "${hh}h" if ($hh); + $out .= sprintf("%02d:%02d", $mm,$ss); + $out .= " " if ($hh == 0); + return $out; +} + =head2 open_import_xml Read file from C directory and parse it. @@ -438,6 +569,8 @@ # iteration (for repeatable fields) my $i = shift || 0; + $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999)); + # FIXME remove for speedup? $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); @@ -451,8 +584,15 @@ # remove eval{...} from beginning $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); + my $filter_name; + # remove filter{...} from beginning + $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s); + # do actual replacement of placeholders + # repeatable fields $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges; + # non-repeatable fields + $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges; if ($found) { $log->debug("format: $format"); @@ -460,6 +600,12 @@ my $eval = $self->fill_in($rec,$eval_code,$i); return if (! $self->_eval($eval)); } + if ($filter_name && $self->{'filter'}->{$filter_name}) { + $log->debug("filter '$filter_name' for $format"); + $format = $self->{'filter'}->{$filter_name}->($format); + return unless(defined($format)); + $log->debug("filter result: $format"); + } # do we have lookups? if ($format =~ /$LOOKUP_REGEX/o) { $log->debug("format '$format' has lookup"); @@ -552,16 +698,24 @@ # remove eval{...} from beginning $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); + my $filter_name; + # remove filter{...} from beginning + $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s); + my $prefix; my $all_found=0; - while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) { + while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) { my $del = $1 || ''; $prefix ||= $del if ($all_found == 0); + # repeatable index + my $r = $i; + $r = 0 if (lc("$2") eq 's'); + my $found = 0; - my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found); + my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found); if ($found) { push @out, $del; @@ -585,10 +739,17 @@ } if ($eval_code) { - my $eval = $self->fill_in($rec,$eval_code,$i); - $log->debug("about to eval{",$eval,"} format: $out"); + my $eval = $self->fill_in($rec,$eval_code,$i) || return; + $log->debug("about to eval{$eval} format: $out"); return if (! $self->_eval($eval)); } + + if ($filter_name && $self->{'filter'}->{$filter_name}) { + $log->debug("about to filter{$filter_name} format: $out"); + $out = $self->{'filter'}->{$filter_name}->($out); + return unless(defined($out)); + $log->debug("filter result: $out"); + } return $out; } @@ -655,6 +816,31 @@ return @arr; } +=head2 sort_arr + +Sort array ignoring case and html in data + + my @sorted = $webpac->sort_arr(@unsorted); + +=cut + +sub sort_arr { + my $self = shift; + + my $log = $self->_get_logger(); + + # FIXME add Schwartzian Transformation? + + my @sorted = sort { + $a =~ s#<[^>]+/*>##; + $b =~ s#<[^>]+/*>##; + lc($b) cmp lc($a) + } @_; + $log->debug("sorted values: ",sub { join(", ",@sorted) }); + + return @sorted; +} + =head2 data_structure @@ -711,6 +897,10 @@ } next if (! @v); + if ($tag->{'sort'}) { + @v = $self->sort_arr(@v); + } + # use format? if ($tag->{'format_name'}) { @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v; @@ -725,12 +915,35 @@ next; # don't return headline in data_structure! } - # does tag have type? - if ($tag->{'type'}) { - push @{$row->{$tag->{'type'}}}, @v; - } else { - push @{$row->{'display'}}, @v; - push @{$row->{'swish'}}, @v; + # delimiter will join repeatable fields + if ($tag->{'delimiter'}) { + @v = ( join($tag->{'delimiter'}, @v) ); + } + + # default types + my @types = qw(display swish); + # override by type attribute + @types = ( $tag->{'type'} ) if ($tag->{'type'}); + + foreach my $type (@types) { + # append to previous line? + $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append'); + if ($tag->{'append'}) { + + # I will delimit appended part with + # delimiter (or ,) + my $d = $tag->{'delimiter'}; + # default delimiter + $d ||= " "; + + my $last = pop @{$row->{$type}}; + $d = "" if (! $last); + $last .= $d . join($d, @v); + push @{$row->{$type}}, $last; + + } else { + push @{$row->{$type}}, @v; + } } @@ -743,6 +956,11 @@ my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'}; $row->{'name'} = $name ? $self->_x($name) : $field; + # post-sort all values in field + if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) { + $log->warn("sort at field tag not implemented"); + } + push @ds, $row; $log->debug("row $field: ",sub { Dumper($row) }); @@ -803,11 +1021,11 @@ my $log = $self->_get_logger(); - $log->logconfess("need file name") if (! $args->{'file'}); + my $file = $args->{'file'} || $log->logconfess("need file name"); - $log->debug("creating file ",$args->{'file'}); + $log->debug("creating file ",$file); - open(my $fh, ">", $args->{'file'}) || $log->logdie("can't open output file '$self->{'file'}': $!"); + open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!"); print $fh $self->output( template => $args->{'template'}, data => $args->{'data'}, @@ -888,7 +1106,7 @@ $log->debug("eval: ",$code," [",$ret,"]"); - return $ret || 0; + return $ret || undef; } =head2 _sort_by_order @@ -958,6 +1176,37 @@ also use method names, and not only classes (which are just few) to filter logging. + +=head1 MEMORY USAGE + +C options is double-edged sword. If enabled, WebPAC +will run on memory constraint machines (which doesn't have enough +physical RAM to create memory structure for whole source database). + +If your machine has 512Mb or more of RAM and database is around 10000 records, +memory shouldn't be an issue. If you don't have enough physical RAM, you +might consider using virtual memory (if your operating system is handling it +well, like on FreeBSD or Linux) instead of dropping to L to handle +parsed structure of ISIS database (this is what C option does). + +Hitting swap at end of reading source database is probably o.k. However, +hitting swap before 90% will dramatically decrease performance and you will +be better off with C and using rest of availble memory for +operating system disk cache (Linux is particuallary good about this). +However, every access to database record will require disk access, so +generation phase will be slower 10-100 times. + +Parsed structures are essential - you just have option to trade RAM memory +(which is fast) for disk space (which is slow). Be sure to have planty of +disk space if you are using C and thus L. + +However, when WebPAC is running on desktop machines (or laptops :-), it's +highly undesireable for system to start swapping. Using C option can +reduce WecPAC memory usage to around 64Mb for same database with lookup +fields and sorted indexes which stay in RAM. Performance will suffer, but +memory usage will really be minimal. It might be also more confortable to +run WebPAC reniced on those machines. + =cut 1;