/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 418 by dpavlin, Thu Sep 9 18:08:38 2004 UTC revision 560 by dpavlin, Sat Oct 30 23:04:37 2004 UTC
# Line 9  use Config::IniFiles; Line 9  use Config::IniFiles;
9  use XML::Simple;  use XML::Simple;
10  use Template;  use Template;
11  use Log::Log4perl qw(get_logger :levels);  use Log::Log4perl qw(get_logger :levels);
12    use Time::HiRes qw(time);
13    
14  use Data::Dumper;  use Data::Dumper;
15    
# Line 29  This module implements methods used by W Line 30  This module implements methods used by W
30    
31  =head2 new  =head2 new
32    
33  This will create new instance of WebPAC using configuration specified by C<config_file>.  Create new instance of WebPAC using configuration specified by C<config_file>.
34    
35   my $webpac = new WebPAC(   my $webpac = new WebPAC(
36          config_file => 'name.conf',          config_file => 'name.conf',
37          [code_page => 'ISO-8859-2',]          code_page => 'ISO-8859-2',
38            low_mem => 1,
39            filter => {
40                    'lower' => sub { lc($_[0]) },
41            },
42   );   );
43    
44  Default C<code_page> is C<ISO-8859-2>.  Default C<code_page> is C<ISO-8859-2>.
45    
46  It will also read configuration files  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
47    
48    There is optinal parametar C<filter> which specify different filters which
49    can be applied using C<filter{name}> notation.
50    Same filters can be used in Template Toolkit files.
51    
52    This method will also read configuration files
53  C<global.conf> (used by indexer and Web font-end)  C<global.conf> (used by indexer and Web font-end)
54  and configuration file specified by C<config_file>  and configuration file specified by C<config_file>
55  which describes databases to be indexed.  which describes databases to be indexed.
# Line 59  sub new { Line 70  sub new {
70          my $self = {@_};          my $self = {@_};
71          bless($self, $class);          bless($self, $class);
72    
73            $self->{'start_t'} = time();
74    
75          my $log_file = $self->{'log'} || "log.conf";          my $log_file = $self->{'log'} || "log.conf";
76          Log::Log4perl->init($log_file);          Log::Log4perl->init($log_file);
77    
# Line 100  sub new { Line 113  sub new {
113          # create Template toolkit instance          # create Template toolkit instance
114          $self->{'tt'} = Template->new(          $self->{'tt'} = Template->new(
115                  INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),                  INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
116  #               FILTERS => {                  FILTERS => $self->{'filter'},
 #                       'foo' => \&foo_filter,  
 #               },  
117                  EVAL_PERL => 1,                  EVAL_PERL => 1,
118          );          );
119    
120            # running with low_mem flag? well, use DBM::Deep then.
121            if ($self->{'low_mem'}) {
122                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
123    
124                    my $db_file = "data.db";
125    
126                    if (-e $db_file) {
127                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
128                            $log->debug("removed '$db_file' from last run");
129                    }
130    
131                    require DBM::Deep;
132    
133                    my $db = new DBM::Deep $db_file;
134    
135                    $log->logdie("DBM::Deep error: $!") unless ($db);
136    
137                    if ($db->error()) {
138                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
139                    } else {
140                            $log->debug("using file '$db_file' for DBM::Deep");
141                    }
142    
143                    $self->{'db'} = $db;
144            }
145    
146            $log->debug("filters defined: ",Dumper($self->{'filter'}));
147    
148          return $self;          return $self;
149  }  }
150    
# Line 116  Open CDS/ISIS database using OpenIsis mo Line 155  Open CDS/ISIS database using OpenIsis mo
155   $webpac->open_isis(   $webpac->open_isis(
156          filename => '/data/ISIS/ISIS',          filename => '/data/ISIS/ISIS',
157          code_page => '852',          code_page => '852',
158          limit_mfn => '500',          limit_mfn => 500,
159            start_mfn => 6000,
160          lookup => [ ... ],          lookup => [ ... ],
161   );   );
162    
163  By default, ISIS code page is assumed to be C<852>.  By default, ISIS code page is assumed to be C<852>.
164    
165    If optional parametar C<start_mfn> is set, this will be first MFN to read
166    from database (so you can skip beginning of your database if you need to).
167    
168  If optional parametar C<limit_mfn> is set, it will read just 500 records  If optional parametar C<limit_mfn> is set, it will read just 500 records
169  from database in example above.  from database in example above.
170    
# Line 149  sub open_isis { Line 192  sub open_isis {
192          $log->logcroak("need filename") if (! $arg->{'filename'});          $log->logcroak("need filename") if (! $arg->{'filename'});
193          my $code_page = $arg->{'code_page'} || '852';          my $code_page = $arg->{'code_page'} || '852';
194    
195            $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
196    
197            # store data in object
198            $self->{'isis_filename'} = $arg->{'filename'};
199            $self->{'isis_code_page'} = $code_page;
200    
201          use OpenIsis;          use OpenIsis;
202    
203          #$self->{'isis_code_page'} = $code_page;          #$self->{'isis_code_page'} = $code_page;
# Line 157  sub open_isis { Line 206  sub open_isis {
206          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
207    
208          $log->info("reading ISIS database '",$arg->{'filename'},"'");          $log->info("reading ISIS database '",$arg->{'filename'},"'");
209            $log->debug("isis code page: $code_page");
210    
211          my $isis_db = OpenIsis::open($arg->{'filename'});          my $isis_db = OpenIsis::open($arg->{'filename'});
212    
213          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
214            my $startmfn = 1;
215    
216          $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});          if (my $s = $self->{'start_mfn'}) {
217                    $log->info("skipping to MFN $s");
218                    $startmfn = $s;
219            } else {
220                    $self->{'start_mfn'} = $startmfn;
221            }
222    
223            $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
224    
225          $log->info("processing $maxmfn records...");          $log->info("processing ",($maxmfn-$startmfn)." records...");
226    
227          # read database          # read database
228          for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {          for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
229    
230    
231                  $log->debug("mfn: $mfn\n");                  $log->debug("mfn: $mfn\n");
232    
233                    my $rec;
234    
235                  # read record                  # read record
236                  my $row = OpenIsis::read( $isis_db, $mfn );                  my $row = OpenIsis::read( $isis_db, $mfn );
237                  foreach my $k (keys %{$row}) {                  foreach my $k (keys %{$row}) {
# Line 189  sub open_isis { Line 249  sub open_isis {
249                                                  $val = $l;                                                  $val = $l;
250                                          }                                          }
251    
252                                          push @{$self->{'data'}->{$mfn}->{$k}}, $val;                                          push @{$rec->{$k}}, $val;
253                                  }                                  }
254                          } else {                          } else {
255                                  push @{$self->{'data'}->{$mfn}->{'000'}}, $mfn;                                  push @{$rec->{'000'}}, $mfn;
256                          }                          }
257    
258                  }                  }
259    
260                    $log->confess("record $mfn empty?") unless ($rec);
261    
262                    # store
263                    if ($self->{'low_mem'}) {
264                            $self->{'db'}->put($mfn, $rec);
265                    } else {
266                            $self->{'data'}->{$mfn} = $rec;
267                    }
268    
269                  # create lookup                  # create lookup
                 my $rec = $self->{'data'}->{$mfn} || $log->confess("record $mfn empty?");  
270                  $self->create_lookup($rec, @{$arg->{'lookup'}});                  $self->create_lookup($rec, @{$arg->{'lookup'}});
271    
272                  $self->progress_bar($mfn,$maxmfn);                  $self->progress_bar($mfn,$maxmfn);
273    
274          }          }
275    
276          $self->{'current_mfn'} = 1;          $self->{'current_mfn'} = -1;
277          $self->{'last_pcnt'} = 0;          $self->{'last_pcnt'} = 0;
278    
279          $log->debug("max mfn: $maxmfn");          $log->debug("max mfn: $maxmfn");
# Line 228  sub fetch_rec { Line 296  sub fetch_rec {
296    
297          my $log = $self->_get_logger();          my $log = $self->_get_logger();
298    
299          my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");          $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
300    
301            if ($self->{'current_mfn'} == -1) {
302                    $self->{'current_mfn'} = $self->{'start_mfn'};
303            } else {
304                    $self->{'current_mfn'}++;
305            }
306    
307            my $mfn = $self->{'current_mfn'};
308    
309          if ($mfn > $self->{'max_mfn'}) {          if ($mfn > $self->{'max_mfn'}) {
310                  $self->{'current_mfn'} = $self->{'max_mfn'};                  $self->{'current_mfn'} = $self->{'max_mfn'};
# Line 238  sub fetch_rec { Line 314  sub fetch_rec {
314    
315          $self->progress_bar($mfn,$self->{'max_mfn'});          $self->progress_bar($mfn,$self->{'max_mfn'});
316    
317          return $self->{'data'}->{$mfn};          if ($self->{'low_mem'}) {
318                    return $self->{'db'}->get($mfn);
319            } else {
320                    return $self->{'data'}->{$mfn};
321            }
322    }
323    
324    =head2 mfn
325    
326    Returns current record number (MFN).
327    
328     print $webpac->mfn;
329    
330    =cut
331    
332    sub mfn {
333            my $self = shift;
334            return $self->{'current_mfn'};
335  }  }
336    
337  =head2 progress_bar  =head2 progress_bar
# Line 266  sub progress_bar { Line 359  sub progress_bar {
359    
360          $self->{'last_pcnt'} ||= 1;          $self->{'last_pcnt'} ||= 1;
361    
362          $self->{'last_pcnt'} = $curr if ($curr < $self->{'last_pcnt'});          my $p = int($curr * 100 / $max) || 1;
363    
364            # reset on re-run
365            if ($p < $self->{'last_pcnt'}) {
366                    $self->{'last_pcnt'} = $p;
367                    $self->{'last_t'} = time();
368                    $self->{'last_curr'} = undef;
369            }
370    
371            $self->{'last_t'} ||= time();
372    
         my $p = int($curr * 100 / $max);  
373          if ($p != $self->{'last_pcnt'}) {          if ($p != $self->{'last_pcnt'}) {
374                  printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$curr,$max,"=" x ($p/2).">", $p );  
375                    my $last_curr = $self->{'last_curr'} || $curr;
376                    my $t = time();
377                    my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
378                    my $eta = ($max-$curr) / ($rate || 1);
379                    printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
380                  $self->{'last_pcnt'} = $p;                  $self->{'last_pcnt'} = $p;
381                    $self->{'last_t'} = time();
382                    $self->{'last_curr'} = $curr;
383          }          }
384          print STDERR "\n" if ($p == 100);          print STDERR "\n" if ($p == 100);
385  }  }
386    
387    =head2 fmt_time
388    
389    Format time (in seconds) for display.
390    
391     print $webpac->fmt_time(time());
392    
393    This method is called by L<progress_bar> to display remaining time.
394    
395    =cut
396    
397    sub fmt_time {
398            my $self = shift;
399    
400            my $t = shift || 0;
401            my $out = "";
402    
403            my ($ss,$mm,$hh) = gmtime($t);
404            $out .= "${hh}h" if ($hh);
405            $out .= sprintf("%02d:%02d", $mm,$ss);
406            $out .= "  " if ($hh == 0);
407            return $out;
408    }
409    
410  =head2 open_import_xml  =head2 open_import_xml
411    
412  Read file from C<import_xml/> directory and parse it.  Read file from C<import_xml/> directory and parse it.
# Line 438  sub fill_in { Line 569  sub fill_in {
569          # iteration (for repeatable fields)          # iteration (for repeatable fields)
570          my $i = shift || 0;          my $i = shift || 0;
571    
572            $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
573    
574          # FIXME remove for speedup?          # FIXME remove for speedup?
575          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
576    
# Line 451  sub fill_in { Line 584  sub fill_in {
584          # remove eval{...} from beginning          # remove eval{...} from beginning
585          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
586    
587            my $filter_name;
588            # remove filter{...} from beginning
589            $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
590    
591          # do actual replacement of placeholders          # do actual replacement of placeholders
592            # repeatable fields
593          $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;          $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
594            # non-repeatable fields
595            $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
596    
597          if ($found) {          if ($found) {
598                  $log->debug("format: $format");                  $log->debug("format: $format");
# Line 460  sub fill_in { Line 600  sub fill_in {
600                          my $eval = $self->fill_in($rec,$eval_code,$i);                          my $eval = $self->fill_in($rec,$eval_code,$i);
601                          return if (! $self->_eval($eval));                          return if (! $self->_eval($eval));
602                  }                  }
603                    if ($filter_name && $self->{'filter'}->{$filter_name}) {
604                            $log->debug("filter '$filter_name' for $format");
605                            $format = $self->{'filter'}->{$filter_name}->($format);
606                            return unless(defined($format));
607                            $log->debug("filter result: $format");
608                    }
609                  # do we have lookups?                  # do we have lookups?
610                  if ($format =~ /$LOOKUP_REGEX/o) {                  if ($format =~ /$LOOKUP_REGEX/o) {
611                          $log->debug("format '$format' has lookup");                          $log->debug("format '$format' has lookup");
# Line 552  sub parse { Line 698  sub parse {
698          # remove eval{...} from beginning          # remove eval{...} from beginning
699          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
700    
701            my $filter_name;
702            # remove filter{...} from beginning
703            $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
704    
705          my $prefix;          my $prefix;
706          my $all_found=0;          my $all_found=0;
707    
708          while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {          while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
709    
710                  my $del = $1 || '';                  my $del = $1 || '';
711                  $prefix ||= $del if ($all_found == 0);                  $prefix ||= $del if ($all_found == 0);
712    
713                    # repeatable index
714                    my $r = $i;
715                    $r = 0 if (lc("$2") eq 's');
716    
717                  my $found = 0;                  my $found = 0;
718                  my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);                  my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
719    
720                  if ($found) {                  if ($found) {
721                          push @out, $del;                          push @out, $del;
# Line 585  sub parse { Line 739  sub parse {
739          }          }
740    
741          if ($eval_code) {          if ($eval_code) {
742                  my $eval = $self->fill_in($rec,$eval_code,$i);                  my $eval = $self->fill_in($rec,$eval_code,$i) || return;
743                  $log->debug("about to eval{",$eval,"} format: $out");                  $log->debug("about to eval{$eval} format: $out");
744                  return if (! $self->_eval($eval));                  return if (! $self->_eval($eval));
745          }          }
746            
747            if ($filter_name && $self->{'filter'}->{$filter_name}) {
748                    $log->debug("about to filter{$filter_name} format: $out");
749                    $out = $self->{'filter'}->{$filter_name}->($out);
750                    return unless(defined($out));
751                    $log->debug("filter result: $out");
752            }
753    
754          return $out;          return $out;
755  }  }
# Line 655  sub fill_in_to_arr { Line 816  sub fill_in_to_arr {
816          return @arr;          return @arr;
817  }  }
818    
819    =head2 sort_arr
820    
821    Sort array ignoring case and html in data
822    
823     my @sorted = $webpac->sort_arr(@unsorted);
824    
825    =cut
826    
827    sub sort_arr {
828            my $self = shift;
829    
830            my $log = $self->_get_logger();
831    
832            # FIXME add Schwartzian Transformation?
833    
834            my @sorted = sort {
835                    $a =~ s#<[^>]+/*>##;
836                    $b =~ s#<[^>]+/*>##;
837                    lc($b) cmp lc($a)
838            } @_;
839            $log->debug("sorted values: ",sub { join(", ",@sorted) });
840    
841            return @sorted;
842    }
843    
844    
845  =head2 data_structure  =head2 data_structure
846    
# Line 711  sub data_structure { Line 897  sub data_structure {
897                          }                          }
898                          next if (! @v);                          next if (! @v);
899    
900                            if ($tag->{'sort'}) {
901                                    @v = $self->sort_arr(@v);
902                                    $log->warn("sort within tag is usually not what you want!");
903                            }
904    
905                          # use format?                          # use format?
906                          if ($tag->{'format_name'}) {                          if ($tag->{'format_name'}) {
907                                  @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;                                  @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
# Line 725  sub data_structure { Line 916  sub data_structure {
916                                  next; # don't return headline in data_structure!                                  next; # don't return headline in data_structure!
917                          }                          }
918    
919                          # does tag have type?                          # delimiter will join repeatable fields
920                          if ($tag->{'type'}) {                          if ($tag->{'delimiter'}) {
921                                  push @{$row->{$tag->{'type'}}}, @v;                                  @v = ( join($tag->{'delimiter'}, @v) );
922                          } else {                          }
923                                  push @{$row->{'display'}}, @v;  
924                                  push @{$row->{'swish'}}, @v;                          # default types
925                            my @types = qw(display swish);
926                            # override by type attribute
927                            @types = ( $tag->{'type'} ) if ($tag->{'type'});
928    
929                            foreach my $type (@types) {
930                                    # append to previous line?
931                                    $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
932                                    if ($tag->{'append'}) {
933    
934                                            # I will delimit appended part with
935                                            # delimiter (or ,)
936                                            my $d = $tag->{'delimiter'};
937                                            # default delimiter
938                                            $d ||= " ";
939    
940                                            my $last = pop @{$row->{$type}};
941                                            $d = "" if (! $last);
942                                            $last .= $d . join($d, @v);
943                                            push @{$row->{$type}}, $last;
944    
945                                    } else {
946                                            push @{$row->{$type}}, @v;
947                                    }
948                          }                          }
949    
950    
# Line 743  sub data_structure { Line 957  sub data_structure {
957                          my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};                          my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
958                          $row->{'name'} = $name ? $self->_x($name) : $field;                          $row->{'name'} = $name ? $self->_x($name) : $field;
959    
960                            # post-sort all values in field
961                            if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
962                                    $log->warn("sort at field tag not implemented");
963                            }
964    
965                          push @ds, $row;                          push @ds, $row;
966    
967                          $log->debug("row $field: ",sub { Dumper($row) });                          $log->debug("row $field: ",sub { Dumper($row) });
# Line 803  sub output_file { Line 1022  sub output_file {
1022    
1023          my $log = $self->_get_logger();          my $log = $self->_get_logger();
1024    
1025          $log->logconfess("need file name") if (! $args->{'file'});          my $file = $args->{'file'} || $log->logconfess("need file name");
1026    
1027          $log->debug("creating file ",$args->{'file'});          $log->debug("creating file ",$file);
1028    
1029          open(my $fh, ">", $args->{'file'}) || $log->logdie("can't open output file '$self->{'file'}': $!");          open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1030          print $fh $self->output(          print $fh $self->output(
1031                  template => $args->{'template'},                  template => $args->{'template'},
1032                  data => $args->{'data'},                  data => $args->{'data'},
# Line 888  sub _eval { Line 1107  sub _eval {
1107    
1108          $log->debug("eval: ",$code," [",$ret,"]");          $log->debug("eval: ",$code," [",$ret,"]");
1109    
1110          return $ret || 0;          return $ret || undef;
1111  }  }
1112    
1113  =head2 _sort_by_order  =head2 _sort_by_order
# Line 958  B<This is different from normal Log4perl Line 1177  B<This is different from normal Log4perl
1177  also use method names, and not only classes (which are just few)  also use method names, and not only classes (which are just few)
1178  to filter logging.  to filter logging.
1179    
1180    
1181    =head1 MEMORY USAGE
1182    
1183    C<low_mem> options is double-edged sword. If enabled, WebPAC
1184    will run on memory constraint machines (which doesn't have enough
1185    physical RAM to create memory structure for whole source database).
1186    
1187    If your machine has 512Mb or more of RAM and database is around 10000 records,
1188    memory shouldn't be an issue. If you don't have enough physical RAM, you
1189    might consider using virtual memory (if your operating system is handling it
1190    well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1191    parsed structure of ISIS database (this is what C<low_mem> option does).
1192    
1193    Hitting swap at end of reading source database is probably o.k. However,
1194    hitting swap before 90% will dramatically decrease performance and you will
1195    be better off with C<low_mem> and using rest of availble memory for
1196    operating system disk cache (Linux is particuallary good about this).
1197    However, every access to database record will require disk access, so
1198    generation phase will be slower 10-100 times.
1199    
1200    Parsed structures are essential - you just have option to trade RAM memory
1201    (which is fast) for disk space (which is slow). Be sure to have planty of
1202    disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1203    
1204    However, when WebPAC is running on desktop machines (or laptops :-), it's
1205    highly undesireable for system to start swapping. Using C<low_mem> option can
1206    reduce WecPAC memory usage to around 64Mb for same database with lookup
1207    fields and sorted indexes which stay in RAM. Performance will suffer, but
1208    memory usage will really be minimal. It might be also more confortable to
1209    run WebPAC reniced on those machines.
1210    
1211  =cut  =cut
1212    
1213  1;  1;

Legend:
Removed from v.418  
changed lines
  Added in v.560

  ViewVC Help
Powered by ViewVC 1.1.26