/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 418 by dpavlin, Thu Sep 9 18:08:38 2004 UTC revision 563 by dpavlin, Sat Oct 30 23:58:36 2004 UTC
# Line 9  use Config::IniFiles; Line 9  use Config::IniFiles;
9  use XML::Simple;  use XML::Simple;
10  use Template;  use Template;
11  use Log::Log4perl qw(get_logger :levels);  use Log::Log4perl qw(get_logger :levels);
12    use Time::HiRes qw(time);
13    
14  use Data::Dumper;  use Data::Dumper;
15    
# Line 29  This module implements methods used by W Line 30  This module implements methods used by W
30    
31  =head2 new  =head2 new
32    
33  This will create new instance of WebPAC using configuration specified by C<config_file>.  Create new instance of WebPAC using configuration specified by C<config_file>.
34    
35   my $webpac = new WebPAC(   my $webpac = new WebPAC(
36          config_file => 'name.conf',          config_file => 'name.conf',
37          [code_page => 'ISO-8859-2',]          code_page => 'ISO-8859-2',
38            low_mem => 1,
39            filter => {
40                    'lower' => sub { lc($_[0]) },
41            },
42   );   );
43    
44  Default C<code_page> is C<ISO-8859-2>.  Default C<code_page> is C<ISO-8859-2>.
45    
46  It will also read configuration files  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
47    
48    There is optinal parametar C<filter> which specify different filters which
49    can be applied using C<filter{name}> notation.
50    Same filters can be used in Template Toolkit files.
51    
52    This method will also read configuration files
53  C<global.conf> (used by indexer and Web font-end)  C<global.conf> (used by indexer and Web font-end)
54  and configuration file specified by C<config_file>  and configuration file specified by C<config_file>
55  which describes databases to be indexed.  which describes databases to be indexed.
# Line 59  sub new { Line 70  sub new {
70          my $self = {@_};          my $self = {@_};
71          bless($self, $class);          bless($self, $class);
72    
73            $self->{'start_t'} = time();
74    
75          my $log_file = $self->{'log'} || "log.conf";          my $log_file = $self->{'log'} || "log.conf";
76          Log::Log4perl->init($log_file);          Log::Log4perl->init($log_file);
77    
# Line 100  sub new { Line 113  sub new {
113          # create Template toolkit instance          # create Template toolkit instance
114          $self->{'tt'} = Template->new(          $self->{'tt'} = Template->new(
115                  INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),                  INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
116  #               FILTERS => {                  FILTERS => $self->{'filter'},
 #                       'foo' => \&foo_filter,  
 #               },  
117                  EVAL_PERL => 1,                  EVAL_PERL => 1,
118          );          );
119    
120            # running with low_mem flag? well, use DBM::Deep then.
121            if ($self->{'low_mem'}) {
122                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
123    
124                    my $db_file = "data.db";
125    
126                    if (-e $db_file) {
127                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
128                            $log->debug("removed '$db_file' from last run");
129                    }
130    
131                    require DBM::Deep;
132    
133                    my $db = new DBM::Deep $db_file;
134    
135                    $log->logdie("DBM::Deep error: $!") unless ($db);
136    
137                    if ($db->error()) {
138                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
139                    } else {
140                            $log->debug("using file '$db_file' for DBM::Deep");
141                    }
142    
143                    $self->{'db'} = $db;
144            }
145    
146            $log->debug("filters defined: ",Dumper($self->{'filter'}));
147    
148          return $self;          return $self;
149  }  }
150    
# Line 116  Open CDS/ISIS database using OpenIsis mo Line 155  Open CDS/ISIS database using OpenIsis mo
155   $webpac->open_isis(   $webpac->open_isis(
156          filename => '/data/ISIS/ISIS',          filename => '/data/ISIS/ISIS',
157          code_page => '852',          code_page => '852',
158          limit_mfn => '500',          limit_mfn => 500,
159            start_mfn => 6000,
160          lookup => [ ... ],          lookup => [ ... ],
161   );   );
162    
163  By default, ISIS code page is assumed to be C<852>.  By default, ISIS code page is assumed to be C<852>.
164    
165    If optional parametar C<start_mfn> is set, this will be first MFN to read
166    from database (so you can skip beginning of your database if you need to).
167    
168  If optional parametar C<limit_mfn> is set, it will read just 500 records  If optional parametar C<limit_mfn> is set, it will read just 500 records
169  from database in example above.  from database in example above.
170    
# Line 149  sub open_isis { Line 192  sub open_isis {
192          $log->logcroak("need filename") if (! $arg->{'filename'});          $log->logcroak("need filename") if (! $arg->{'filename'});
193          my $code_page = $arg->{'code_page'} || '852';          my $code_page = $arg->{'code_page'} || '852';
194    
195            $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
196    
197            # store data in object
198            $self->{'isis_filename'} = $arg->{'filename'};
199            $self->{'isis_code_page'} = $code_page;
200    
201          use OpenIsis;          use OpenIsis;
202    
203          #$self->{'isis_code_page'} = $code_page;          #$self->{'isis_code_page'} = $code_page;
# Line 157  sub open_isis { Line 206  sub open_isis {
206          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
207    
208          $log->info("reading ISIS database '",$arg->{'filename'},"'");          $log->info("reading ISIS database '",$arg->{'filename'},"'");
209            $log->debug("isis code page: $code_page");
210    
211          my $isis_db = OpenIsis::open($arg->{'filename'});          my $isis_db = OpenIsis::open($arg->{'filename'});
212    
213          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
214            my $startmfn = 1;
215    
216          $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});          if (my $s = $self->{'start_mfn'}) {
217                    $log->info("skipping to MFN $s");
218                    $startmfn = $s;
219            } else {
220                    $self->{'start_mfn'} = $startmfn;
221            }
222    
223            $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
224    
225          $log->info("processing $maxmfn records...");          $log->info("processing ",($maxmfn-$startmfn)." records...");
226    
227          # read database          # read database
228          for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {          for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
229    
230    
231                  $log->debug("mfn: $mfn\n");                  $log->debug("mfn: $mfn\n");
232    
233                    my $rec;
234    
235                  # read record                  # read record
236                  my $row = OpenIsis::read( $isis_db, $mfn );                  my $row = OpenIsis::read( $isis_db, $mfn );
237                  foreach my $k (keys %{$row}) {                  foreach my $k (keys %{$row}) {
# Line 189  sub open_isis { Line 249  sub open_isis {
249                                                  $val = $l;                                                  $val = $l;
250                                          }                                          }
251    
252                                          push @{$self->{'data'}->{$mfn}->{$k}}, $val;                                          push @{$rec->{$k}}, $val;
253                                  }                                  }
254                          } else {                          } else {
255                                  push @{$self->{'data'}->{$mfn}->{'000'}}, $mfn;                                  push @{$rec->{'000'}}, $mfn;
256                          }                          }
257    
258                  }                  }
259    
260                    $log->confess("record $mfn empty?") unless ($rec);
261    
262                    # store
263                    if ($self->{'low_mem'}) {
264                            $self->{'db'}->put($mfn, $rec);
265                    } else {
266                            $self->{'data'}->{$mfn} = $rec;
267                    }
268    
269                  # create lookup                  # create lookup
                 my $rec = $self->{'data'}->{$mfn} || $log->confess("record $mfn empty?");  
270                  $self->create_lookup($rec, @{$arg->{'lookup'}});                  $self->create_lookup($rec, @{$arg->{'lookup'}});
271    
272                  $self->progress_bar($mfn,$maxmfn);                  $self->progress_bar($mfn,$maxmfn);
273    
274          }          }
275    
276          $self->{'current_mfn'} = 1;          $self->{'current_mfn'} = -1;
277          $self->{'last_pcnt'} = 0;          $self->{'last_pcnt'} = 0;
278    
279          $log->debug("max mfn: $maxmfn");          $log->debug("max mfn: $maxmfn");
# Line 228  sub fetch_rec { Line 296  sub fetch_rec {
296    
297          my $log = $self->_get_logger();          my $log = $self->_get_logger();
298    
299          my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");          $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
300    
301            if ($self->{'current_mfn'} == -1) {
302                    $self->{'current_mfn'} = $self->{'start_mfn'};
303            } else {
304                    $self->{'current_mfn'}++;
305            }
306    
307            my $mfn = $self->{'current_mfn'};
308    
309          if ($mfn > $self->{'max_mfn'}) {          if ($mfn > $self->{'max_mfn'}) {
310                  $self->{'current_mfn'} = $self->{'max_mfn'};                  $self->{'current_mfn'} = $self->{'max_mfn'};
# Line 238  sub fetch_rec { Line 314  sub fetch_rec {
314    
315          $self->progress_bar($mfn,$self->{'max_mfn'});          $self->progress_bar($mfn,$self->{'max_mfn'});
316    
317          return $self->{'data'}->{$mfn};          if ($self->{'low_mem'}) {
318                    return $self->{'db'}->get($mfn);
319            } else {
320                    return $self->{'data'}->{$mfn};
321            }
322    }
323    
324    =head2 mfn
325    
326    Returns current record number (MFN).
327    
328     print $webpac->mfn;
329    
330    =cut
331    
332    sub mfn {
333            my $self = shift;
334            return $self->{'current_mfn'};
335  }  }
336    
337  =head2 progress_bar  =head2 progress_bar
# Line 266  sub progress_bar { Line 359  sub progress_bar {
359    
360          $self->{'last_pcnt'} ||= 1;          $self->{'last_pcnt'} ||= 1;
361    
362          $self->{'last_pcnt'} = $curr if ($curr < $self->{'last_pcnt'});          my $p = int($curr * 100 / $max) || 1;
363    
364            # reset on re-run
365            if ($p < $self->{'last_pcnt'}) {
366                    $self->{'last_pcnt'} = $p;
367                    $self->{'last_t'} = time();
368                    $self->{'last_curr'} = undef;
369            }
370    
371            $self->{'last_t'} ||= time();
372    
         my $p = int($curr * 100 / $max);  
373          if ($p != $self->{'last_pcnt'}) {          if ($p != $self->{'last_pcnt'}) {
374                  printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$curr,$max,"=" x ($p/2).">", $p );  
375                    my $last_curr = $self->{'last_curr'} || $curr;
376                    my $t = time();
377                    my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
378                    my $eta = ($max-$curr) / ($rate || 1);
379                    printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
380                  $self->{'last_pcnt'} = $p;                  $self->{'last_pcnt'} = $p;
381                    $self->{'last_t'} = time();
382                    $self->{'last_curr'} = $curr;
383          }          }
384          print STDERR "\n" if ($p == 100);          print STDERR "\n" if ($p == 100);
385  }  }
386    
387    =head2 fmt_time
388    
389    Format time (in seconds) for display.
390    
391     print $webpac->fmt_time(time());
392    
393    This method is called by L<progress_bar> to display remaining time.
394    
395    =cut
396    
397    sub fmt_time {
398            my $self = shift;
399    
400            my $t = shift || 0;
401            my $out = "";
402    
403            my ($ss,$mm,$hh) = gmtime($t);
404            $out .= "${hh}h" if ($hh);
405            $out .= sprintf("%02d:%02d", $mm,$ss);
406            $out .= "  " if ($hh == 0);
407            return $out;
408    }
409    
410  =head2 open_import_xml  =head2 open_import_xml
411    
412  Read file from C<import_xml/> directory and parse it.  Read file from C<import_xml/> directory and parse it.
# Line 438  sub fill_in { Line 569  sub fill_in {
569          # iteration (for repeatable fields)          # iteration (for repeatable fields)
570          my $i = shift || 0;          my $i = shift || 0;
571    
572            $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
573    
574          # FIXME remove for speedup?          # FIXME remove for speedup?
575          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
576    
# Line 451  sub fill_in { Line 584  sub fill_in {
584          # remove eval{...} from beginning          # remove eval{...} from beginning
585          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
586    
587            my $filter_name;
588            # remove filter{...} from beginning
589            $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
590    
591          # do actual replacement of placeholders          # do actual replacement of placeholders
592            # repeatable fields
593          $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;          $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
594            # non-repeatable fields
595            $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
596    
597          if ($found) {          if ($found) {
598                  $log->debug("format: $format");                  $log->debug("format: $format");
# Line 460  sub fill_in { Line 600  sub fill_in {
600                          my $eval = $self->fill_in($rec,$eval_code,$i);                          my $eval = $self->fill_in($rec,$eval_code,$i);
601                          return if (! $self->_eval($eval));                          return if (! $self->_eval($eval));
602                  }                  }
603                    if ($filter_name && $self->{'filter'}->{$filter_name}) {
604                            $log->debug("filter '$filter_name' for $format");
605                            $format = $self->{'filter'}->{$filter_name}->($format);
606                            return unless(defined($format));
607                            $log->debug("filter result: $format");
608                    }
609                  # do we have lookups?                  # do we have lookups?
610                  if ($format =~ /$LOOKUP_REGEX/o) {                  if ($format =~ /$LOOKUP_REGEX/o) {
611                          $log->debug("format '$format' has lookup");                          $log->debug("format '$format' has lookup");
# Line 552  sub parse { Line 698  sub parse {
698          # remove eval{...} from beginning          # remove eval{...} from beginning
699          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
700    
701            my $filter_name;
702            # remove filter{...} from beginning
703            $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
704    
705          my $prefix;          my $prefix;
706          my $all_found=0;          my $all_found=0;
707    
708          while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {          while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
709    
710                  my $del = $1 || '';                  my $del = $1 || '';
711                  $prefix ||= $del if ($all_found == 0);                  $prefix ||= $del if ($all_found == 0);
712    
713                    # repeatable index
714                    my $r = $i;
715                    $r = 0 if (lc("$2") eq 's');
716    
717                  my $found = 0;                  my $found = 0;
718                  my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);                  my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
719    
720                  if ($found) {                  if ($found) {
721                          push @out, $del;                          push @out, $del;
# Line 585  sub parse { Line 739  sub parse {
739          }          }
740    
741          if ($eval_code) {          if ($eval_code) {
742                  my $eval = $self->fill_in($rec,$eval_code,$i);                  my $eval = $self->fill_in($rec,$eval_code,$i) || return;
743                  $log->debug("about to eval{",$eval,"} format: $out");                  $log->debug("about to eval{$eval} format: $out");
744                  return if (! $self->_eval($eval));                  return if (! $self->_eval($eval));
745          }          }
746            
747            if ($filter_name && $self->{'filter'}->{$filter_name}) {
748                    $log->debug("about to filter{$filter_name} format: $out");
749                    $out = $self->{'filter'}->{$filter_name}->($out);
750                    return unless(defined($out));
751                    $log->debug("filter result: $out");
752            }
753    
754          return $out;          return $out;
755  }  }
# Line 655  sub fill_in_to_arr { Line 816  sub fill_in_to_arr {
816          return @arr;          return @arr;
817  }  }
818    
819    =head2 sort_arr
820    
821    Sort array ignoring case and html in data
822    
823     my @sorted = $webpac->sort_arr(@unsorted);
824    
825    =cut
826    
827    sub sort_arr {
828            my $self = shift;
829    
830            my $log = $self->_get_logger();
831    
832            # FIXME add Schwartzian Transformation?
833    
834            my @sorted = sort {
835                    $a =~ s#<[^>]+/*>##;
836                    $b =~ s#<[^>]+/*>##;
837                    lc($b) cmp lc($a)
838            } @_;
839            $log->debug("sorted values: ",sub { join(", ",@sorted) });
840    
841            return @sorted;
842    }
843    
844    
845  =head2 data_structure  =head2 data_structure
846    
# Line 711  sub data_structure { Line 897  sub data_structure {
897                          }                          }
898                          next if (! @v);                          next if (! @v);
899    
900                            if ($tag->{'sort'}) {
901                                    @v = $self->sort_arr(@v);
902                            }
903    
904                          # use format?                          # use format?
905                          if ($tag->{'format_name'}) {                          if ($tag->{'format_name'}) {
906                                  @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;                                  @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
# Line 725  sub data_structure { Line 915  sub data_structure {
915                                  next; # don't return headline in data_structure!                                  next; # don't return headline in data_structure!
916                          }                          }
917    
918                          # does tag have type?                          # delimiter will join repeatable fields
919                          if ($tag->{'type'}) {                          if ($tag->{'delimiter'}) {
920                                  push @{$row->{$tag->{'type'}}}, @v;                                  @v = ( join($tag->{'delimiter'}, @v) );
921                          } else {                          }
922                                  push @{$row->{'display'}}, @v;  
923                                  push @{$row->{'swish'}}, @v;                          # default types
924                            my @types = qw(display swish);
925                            # override by type attribute
926                            @types = ( $tag->{'type'} ) if ($tag->{'type'});
927    
928                            foreach my $type (@types) {
929                                    # append to previous line?
930                                    $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
931                                    if ($tag->{'append'}) {
932    
933                                            # I will delimit appended part with
934                                            # delimiter (or ,)
935                                            my $d = $tag->{'delimiter'};
936                                            # default delimiter
937                                            $d ||= " ";
938    
939                                            my $last = pop @{$row->{$type}};
940                                            $d = "" if (! $last);
941                                            $last .= $d . join($d, @v);
942                                            push @{$row->{$type}}, $last;
943    
944                                    } else {
945                                            push @{$row->{$type}}, @v;
946                                    }
947                          }                          }
948    
949    
# Line 743  sub data_structure { Line 956  sub data_structure {
956                          my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};                          my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
957                          $row->{'name'} = $name ? $self->_x($name) : $field;                          $row->{'name'} = $name ? $self->_x($name) : $field;
958    
959                            # post-sort all values in field
960                            if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
961                                    $log->warn("sort at field tag not implemented");
962                            }
963    
964                          push @ds, $row;                          push @ds, $row;
965    
966                          $log->debug("row $field: ",sub { Dumper($row) });                          $log->debug("row $field: ",sub { Dumper($row) });
# Line 803  sub output_file { Line 1021  sub output_file {
1021    
1022          my $log = $self->_get_logger();          my $log = $self->_get_logger();
1023    
1024          $log->logconfess("need file name") if (! $args->{'file'});          my $file = $args->{'file'} || $log->logconfess("need file name");
1025    
1026          $log->debug("creating file ",$args->{'file'});          $log->debug("creating file ",$file);
1027    
1028          open(my $fh, ">", $args->{'file'}) || $log->logdie("can't open output file '$self->{'file'}': $!");          open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1029          print $fh $self->output(          print $fh $self->output(
1030                  template => $args->{'template'},                  template => $args->{'template'},
1031                  data => $args->{'data'},                  data => $args->{'data'},
# Line 888  sub _eval { Line 1106  sub _eval {
1106    
1107          $log->debug("eval: ",$code," [",$ret,"]");          $log->debug("eval: ",$code," [",$ret,"]");
1108    
1109          return $ret || 0;          return $ret || undef;
1110  }  }
1111    
1112  =head2 _sort_by_order  =head2 _sort_by_order
# Line 958  B<This is different from normal Log4perl Line 1176  B<This is different from normal Log4perl
1176  also use method names, and not only classes (which are just few)  also use method names, and not only classes (which are just few)
1177  to filter logging.  to filter logging.
1178    
1179    
1180    =head1 MEMORY USAGE
1181    
1182    C<low_mem> options is double-edged sword. If enabled, WebPAC
1183    will run on memory constraint machines (which doesn't have enough
1184    physical RAM to create memory structure for whole source database).
1185    
1186    If your machine has 512Mb or more of RAM and database is around 10000 records,
1187    memory shouldn't be an issue. If you don't have enough physical RAM, you
1188    might consider using virtual memory (if your operating system is handling it
1189    well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1190    parsed structure of ISIS database (this is what C<low_mem> option does).
1191    
1192    Hitting swap at end of reading source database is probably o.k. However,
1193    hitting swap before 90% will dramatically decrease performance and you will
1194    be better off with C<low_mem> and using rest of availble memory for
1195    operating system disk cache (Linux is particuallary good about this).
1196    However, every access to database record will require disk access, so
1197    generation phase will be slower 10-100 times.
1198    
1199    Parsed structures are essential - you just have option to trade RAM memory
1200    (which is fast) for disk space (which is slow). Be sure to have planty of
1201    disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1202    
1203    However, when WebPAC is running on desktop machines (or laptops :-), it's
1204    highly undesireable for system to start swapping. Using C<low_mem> option can
1205    reduce WecPAC memory usage to around 64Mb for same database with lookup
1206    fields and sorted indexes which stay in RAM. Performance will suffer, but
1207    memory usage will really be minimal. It might be also more confortable to
1208    run WebPAC reniced on those machines.
1209    
1210  =cut  =cut
1211    
1212  1;  1;

Legend:
Removed from v.418  
changed lines
  Added in v.563

  ViewVC Help
Powered by ViewVC 1.1.26