/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 418 by dpavlin, Thu Sep 9 18:08:38 2004 UTC revision 500 by dpavlin, Sun Oct 10 11:04:52 2004 UTC
# Line 9  use Config::IniFiles; Line 9  use Config::IniFiles;
9  use XML::Simple;  use XML::Simple;
10  use Template;  use Template;
11  use Log::Log4perl qw(get_logger :levels);  use Log::Log4perl qw(get_logger :levels);
12    use Time::HiRes qw(time);
13    
14  use Data::Dumper;  use Data::Dumper;
15    
# Line 29  This module implements methods used by W Line 30  This module implements methods used by W
30    
31  =head2 new  =head2 new
32    
33  This will create new instance of WebPAC using configuration specified by C<config_file>.  Create new instance of WebPAC using configuration specified by C<config_file>.
34    
35   my $webpac = new WebPAC(   my $webpac = new WebPAC(
36          config_file => 'name.conf',          config_file => 'name.conf',
37          [code_page => 'ISO-8859-2',]          code_page => 'ISO-8859-2',
38            low_mem => 1,
39            filter => {
40                    'lower' => sub { lc($_[0]) },
41            },
42   );   );
43    
44  Default C<code_page> is C<ISO-8859-2>.  Default C<code_page> is C<ISO-8859-2>.
45    
46  It will also read configuration files  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
47    
48    There is optinal parametar C<filter> which specify different filters which
49    can be applied using C<filter{name}> notation.
50    
51    This method will also read configuration files
52  C<global.conf> (used by indexer and Web font-end)  C<global.conf> (used by indexer and Web font-end)
53  and configuration file specified by C<config_file>  and configuration file specified by C<config_file>
54  which describes databases to be indexed.  which describes databases to be indexed.
# Line 59  sub new { Line 69  sub new {
69          my $self = {@_};          my $self = {@_};
70          bless($self, $class);          bless($self, $class);
71    
72            $self->{'start_t'} = time();
73    
74          my $log_file = $self->{'log'} || "log.conf";          my $log_file = $self->{'log'} || "log.conf";
75          Log::Log4perl->init($log_file);          Log::Log4perl->init($log_file);
76    
# Line 106  sub new { Line 118  sub new {
118                  EVAL_PERL => 1,                  EVAL_PERL => 1,
119          );          );
120    
121            # running with low_mem flag? well, use DBM::Deep then.
122            if ($self->{'low_mem'}) {
123                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
124    
125                    my $db_file = "data.db";
126    
127                    if (-e $db_file) {
128                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
129                            $log->debug("removed '$db_file' from last run");
130                    }
131    
132                    require DBM::Deep;
133    
134                    my $db = new DBM::Deep $db_file;
135    
136                    $log->logdie("DBM::Deep error: $!") unless ($db);
137    
138                    if ($db->error()) {
139                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
140                    } else {
141                            $log->debug("using file '$db_file' for DBM::Deep");
142                    }
143    
144                    $self->{'db'} = $db;
145            }
146    
147            $log->debug("filters defined: ",Dumper($self->{'filter'}));
148    
149          return $self;          return $self;
150  }  }
151    
# Line 116  Open CDS/ISIS database using OpenIsis mo Line 156  Open CDS/ISIS database using OpenIsis mo
156   $webpac->open_isis(   $webpac->open_isis(
157          filename => '/data/ISIS/ISIS',          filename => '/data/ISIS/ISIS',
158          code_page => '852',          code_page => '852',
159          limit_mfn => '500',          limit_mfn => 500,
160            start_mfn => 6000,
161          lookup => [ ... ],          lookup => [ ... ],
162   );   );
163    
164  By default, ISIS code page is assumed to be C<852>.  By default, ISIS code page is assumed to be C<852>.
165    
166    If optional parametar C<start_mfn> is set, this will be first MFN to read
167    from database (so you can skip beginning of your database if you need to).
168    
169  If optional parametar C<limit_mfn> is set, it will read just 500 records  If optional parametar C<limit_mfn> is set, it will read just 500 records
170  from database in example above.  from database in example above.
171    
# Line 149  sub open_isis { Line 193  sub open_isis {
193          $log->logcroak("need filename") if (! $arg->{'filename'});          $log->logcroak("need filename") if (! $arg->{'filename'});
194          my $code_page = $arg->{'code_page'} || '852';          my $code_page = $arg->{'code_page'} || '852';
195    
196            $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
197    
198            # store data in object
199            $self->{'isis_filename'} = $arg->{'filename'};
200            $self->{'isis_code_page'} = $code_page;
201    
202          use OpenIsis;          use OpenIsis;
203    
204          #$self->{'isis_code_page'} = $code_page;          #$self->{'isis_code_page'} = $code_page;
# Line 157  sub open_isis { Line 207  sub open_isis {
207          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
208    
209          $log->info("reading ISIS database '",$arg->{'filename'},"'");          $log->info("reading ISIS database '",$arg->{'filename'},"'");
210            $log->debug("isis code page: $code_page");
211    
212          my $isis_db = OpenIsis::open($arg->{'filename'});          my $isis_db = OpenIsis::open($arg->{'filename'});
213    
214          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
215            my $startmfn = 1;
216    
217          $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});          if (my $s = $self->{'start_mfn'}) {
218                    $log->info("skipping to MFN $s");
219                    $startmfn = $s;
220            } else {
221                    $self->{'start_mfn'} = $startmfn;
222            }
223    
224            $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
225    
226          $log->info("processing $maxmfn records...");          $log->info("processing ",($maxmfn-$startmfn)." records...");
227    
228          # read database          # read database
229          for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {          for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
230    
231    
232                  $log->debug("mfn: $mfn\n");                  $log->debug("mfn: $mfn\n");
233    
234                    my $rec;
235    
236                  # read record                  # read record
237                  my $row = OpenIsis::read( $isis_db, $mfn );                  my $row = OpenIsis::read( $isis_db, $mfn );
238                  foreach my $k (keys %{$row}) {                  foreach my $k (keys %{$row}) {
# Line 189  sub open_isis { Line 250  sub open_isis {
250                                                  $val = $l;                                                  $val = $l;
251                                          }                                          }
252    
253                                          push @{$self->{'data'}->{$mfn}->{$k}}, $val;                                          push @{$rec->{$k}}, $val;
254                                  }                                  }
255                          } else {                          } else {
256                                  push @{$self->{'data'}->{$mfn}->{'000'}}, $mfn;                                  push @{$rec->{'000'}}, $mfn;
257                          }                          }
258    
259                  }                  }
260    
261                    $log->confess("record $mfn empty?") unless ($rec);
262    
263                    # store
264                    if ($self->{'low_mem'}) {
265                            $self->{'db'}->put($mfn, $rec);
266                    } else {
267                            $self->{'data'}->{$mfn} = $rec;
268                    }
269    
270                  # create lookup                  # create lookup
                 my $rec = $self->{'data'}->{$mfn} || $log->confess("record $mfn empty?");  
271                  $self->create_lookup($rec, @{$arg->{'lookup'}});                  $self->create_lookup($rec, @{$arg->{'lookup'}});
272    
273                  $self->progress_bar($mfn,$maxmfn);                  $self->progress_bar($mfn,$maxmfn);
274    
275          }          }
276    
277          $self->{'current_mfn'} = 1;          $self->{'current_mfn'} = -1;
278          $self->{'last_pcnt'} = 0;          $self->{'last_pcnt'} = 0;
279    
280          $log->debug("max mfn: $maxmfn");          $log->debug("max mfn: $maxmfn");
# Line 228  sub fetch_rec { Line 297  sub fetch_rec {
297    
298          my $log = $self->_get_logger();          my $log = $self->_get_logger();
299    
300          my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");          $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
301    
302            if ($self->{'current_mfn'} == -1) {
303                    $self->{'current_mfn'} = $self->{'start_mfn'};
304            } else {
305                    $self->{'current_mfn'}++;
306            }
307    
308            my $mfn = $self->{'current_mfn'};
309    
310          if ($mfn > $self->{'max_mfn'}) {          if ($mfn > $self->{'max_mfn'}) {
311                  $self->{'current_mfn'} = $self->{'max_mfn'};                  $self->{'current_mfn'} = $self->{'max_mfn'};
# Line 238  sub fetch_rec { Line 315  sub fetch_rec {
315    
316          $self->progress_bar($mfn,$self->{'max_mfn'});          $self->progress_bar($mfn,$self->{'max_mfn'});
317    
318          return $self->{'data'}->{$mfn};          if ($self->{'low_mem'}) {
319                    return $self->{'db'}->get($mfn);
320            } else {
321                    return $self->{'data'}->{$mfn};
322            }
323    }
324    
325    =head2 mfn
326    
327    Returns current record number (MFN).
328    
329     print $webpac->mfn;
330    
331    =cut
332    
333    sub mfn {
334            my $self = shift;
335            return $self->{'current_mfn'};
336  }  }
337    
338  =head2 progress_bar  =head2 progress_bar
# Line 266  sub progress_bar { Line 360  sub progress_bar {
360    
361          $self->{'last_pcnt'} ||= 1;          $self->{'last_pcnt'} ||= 1;
362    
363          $self->{'last_pcnt'} = $curr if ($curr < $self->{'last_pcnt'});          my $p = int($curr * 100 / $max) || 1;
364    
365            # reset on re-run
366            if ($p < $self->{'last_pcnt'}) {
367                    $self->{'last_pcnt'} = $p;
368                    $self->{'last_t'} = time();
369                    $self->{'last_curr'} = undef;
370            }
371    
372            $self->{'last_t'} ||= time();
373    
         my $p = int($curr * 100 / $max);  
374          if ($p != $self->{'last_pcnt'}) {          if ($p != $self->{'last_pcnt'}) {
375                  printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$curr,$max,"=" x ($p/2).">", $p );  
376                    my $last_curr = $self->{'last_curr'} || $curr;
377                    my $t = time();
378                    my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
379                    my $eta = ($max-$curr) / ($rate || 1);
380                    printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
381                  $self->{'last_pcnt'} = $p;                  $self->{'last_pcnt'} = $p;
382                    $self->{'last_t'} = time();
383                    $self->{'last_curr'} = $curr;
384          }          }
385          print STDERR "\n" if ($p == 100);          print STDERR "\n" if ($p == 100);
386  }  }
387    
388    =head2 fmt_time
389    
390    Format time (in seconds) for display.
391    
392     print $webpac->fmt_time(time());
393    
394    This method is called by L<progress_bar> to display remaining time.
395    
396    =cut
397    
398    sub fmt_time {
399            my $self = shift;
400    
401            my $t = shift || 0;
402            my $out = "";
403    
404            my ($ss,$mm,$hh) = gmtime($t);
405            $out .= "${hh}h" if ($hh);
406            $out .= sprintf("%02d:%02d", $mm,$ss);
407            $out .= "  " if ($hh == 0);
408            return $out;
409    }
410    
411  =head2 open_import_xml  =head2 open_import_xml
412    
413  Read file from C<import_xml/> directory and parse it.  Read file from C<import_xml/> directory and parse it.
# Line 451  sub fill_in { Line 583  sub fill_in {
583          # remove eval{...} from beginning          # remove eval{...} from beginning
584          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
585    
586            my $filter_name;
587            # remove filter{...} from beginning
588            $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
589    
590          # do actual replacement of placeholders          # do actual replacement of placeholders
591          $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;          $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
592    
# Line 460  sub fill_in { Line 596  sub fill_in {
596                          my $eval = $self->fill_in($rec,$eval_code,$i);                          my $eval = $self->fill_in($rec,$eval_code,$i);
597                          return if (! $self->_eval($eval));                          return if (! $self->_eval($eval));
598                  }                  }
599                    if ($filter_name && $self->{'filter'}->{$filter_name}) {
600                            $log->debug("filter '$filter_name' for $format");
601                            $format = $self->{'filter'}->{$filter_name}->($format);
602                            return unless(defined($format));
603                            $log->debug("filter result: $format");
604                    }
605                  # do we have lookups?                  # do we have lookups?
606                  if ($format =~ /$LOOKUP_REGEX/o) {                  if ($format =~ /$LOOKUP_REGEX/o) {
607                          $log->debug("format '$format' has lookup");                          $log->debug("format '$format' has lookup");
# Line 552  sub parse { Line 694  sub parse {
694          # remove eval{...} from beginning          # remove eval{...} from beginning
695          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
696    
697            my $filter_name;
698            # remove filter{...} from beginning
699            $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
700    
701          my $prefix;          my $prefix;
702          my $all_found=0;          my $all_found=0;
703    
# Line 586  sub parse { Line 732  sub parse {
732    
733          if ($eval_code) {          if ($eval_code) {
734                  my $eval = $self->fill_in($rec,$eval_code,$i);                  my $eval = $self->fill_in($rec,$eval_code,$i);
735                  $log->debug("about to eval{",$eval,"} format: $out");                  $log->debug("about to eval{$eval} format: $out");
736                  return if (! $self->_eval($eval));                  return if (! $self->_eval($eval));
737          }          }
738            
739            if ($filter_name && $self->{'filter'}->{$filter_name}) {
740                    $log->debug("about to filter{$filter_name} format: $out");
741                    $out = $self->{'filter'}->{$filter_name}->($out);
742                    return unless(defined($out));
743                    $log->debug("filter result: $out");
744            }
745    
746          return $out;          return $out;
747  }  }
# Line 655  sub fill_in_to_arr { Line 808  sub fill_in_to_arr {
808          return @arr;          return @arr;
809  }  }
810    
811    =head2 sort_arr
812    
813    Sort array ignoring case and html in data
814    
815     my @sorted = $webpac->sort_arr(@unsorted);
816    
817    =cut
818    
819    sub sort_arr {
820            my $self = shift;
821    
822            my $log = $self->_get_logger();
823    
824            # FIXME add Schwartzian Transformation?
825    
826            my @sorted = sort {
827                    $a =~ s#<[^>]+/*>##;
828                    $b =~ s#<[^>]+/*>##;
829                    lc($b) cmp lc($a)
830            } @_;
831            $log->debug("sorted values: ",sub { join(", ",@sorted) });
832    
833            return @sorted;
834    }
835    
836    
837  =head2 data_structure  =head2 data_structure
838    
# Line 711  sub data_structure { Line 889  sub data_structure {
889                          }                          }
890                          next if (! @v);                          next if (! @v);
891    
892                            if ($tag->{'sort'}) {
893                                    @v = $self->sort_arr(@v);
894                                    $log->warn("sort within tag is usually not what you want!");
895                            }
896    
897                          # use format?                          # use format?
898                          if ($tag->{'format_name'}) {                          if ($tag->{'format_name'}) {
899                                  @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;                                  @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
# Line 725  sub data_structure { Line 908  sub data_structure {
908                                  next; # don't return headline in data_structure!                                  next; # don't return headline in data_structure!
909                          }                          }
910    
911                          # does tag have type?                          # delimiter will join repeatable fields
912                          if ($tag->{'type'}) {                          if ($tag->{'delimiter'}) {
913                                  push @{$row->{$tag->{'type'}}}, @v;                                  @v = ( join($tag->{'delimiter'}, @v) );
914                          } else {                          }
915                                  push @{$row->{'display'}}, @v;  
916                                  push @{$row->{'swish'}}, @v;                          # default types
917                            my @types = qw(display swish);
918                            # override by type attribute
919                            @types = ( $tag->{'type'} ) if ($tag->{'type'});
920    
921                            foreach my $type (@types) {
922                                    # append to previous line?
923                                    $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
924                                    if ($tag->{'append'}) {
925    
926                                            # I will delimit appended part with
927                                            # delimiter (or ,)
928                                            my $d = $tag->{'delimiter'};
929                                            # default delimiter
930                                            $d ||= ", ";
931    
932                                            my $last = pop @{$row->{$type}};
933                                            $d = "" if (! $last);
934                                            $last .= $d . join($d, @v);
935                                            push @{$row->{$type}}, $last;
936    
937                                    } else {
938                                            push @{$row->{$type}}, @v;
939                                    }
940                          }                          }
941    
942    
# Line 743  sub data_structure { Line 949  sub data_structure {
949                          my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};                          my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
950                          $row->{'name'} = $name ? $self->_x($name) : $field;                          $row->{'name'} = $name ? $self->_x($name) : $field;
951    
952                            # post-sort all values in field
953                            if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
954                                    $log->warn("sort at field tag not implemented");
955                            }
956    
957                          push @ds, $row;                          push @ds, $row;
958    
959                          $log->debug("row $field: ",sub { Dumper($row) });                          $log->debug("row $field: ",sub { Dumper($row) });
# Line 803  sub output_file { Line 1014  sub output_file {
1014    
1015          my $log = $self->_get_logger();          my $log = $self->_get_logger();
1016    
1017          $log->logconfess("need file name") if (! $args->{'file'});          my $file = $args->{'file'} || $log->logconfess("need file name");
1018    
1019          $log->debug("creating file ",$args->{'file'});          $log->debug("creating file ",$file);
1020    
1021          open(my $fh, ">", $args->{'file'}) || $log->logdie("can't open output file '$self->{'file'}': $!");          open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1022          print $fh $self->output(          print $fh $self->output(
1023                  template => $args->{'template'},                  template => $args->{'template'},
1024                  data => $args->{'data'},                  data => $args->{'data'},
# Line 888  sub _eval { Line 1099  sub _eval {
1099    
1100          $log->debug("eval: ",$code," [",$ret,"]");          $log->debug("eval: ",$code," [",$ret,"]");
1101    
1102          return $ret || 0;          return $ret || undef;
1103  }  }
1104    
1105  =head2 _sort_by_order  =head2 _sort_by_order
# Line 958  B<This is different from normal Log4perl Line 1169  B<This is different from normal Log4perl
1169  also use method names, and not only classes (which are just few)  also use method names, and not only classes (which are just few)
1170  to filter logging.  to filter logging.
1171    
1172    
1173    =head1 MEMORY USAGE
1174    
1175    C<low_mem> options is double-edged sword. If enabled, WebPAC
1176    will run on memory constraint machines (which doesn't have enough
1177    physical RAM to create memory structure for whole source database).
1178    
1179    If your machine has 512Mb or more of RAM and database is around 10000 records,
1180    memory shouldn't be an issue. If you don't have enough physical RAM, you
1181    might consider using virtual memory (if your operating system is handling it
1182    well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1183    parsed structure of ISIS database (this is what C<low_mem> option does).
1184    
1185    Hitting swap at end of reading source database is probably o.k. However,
1186    hitting swap before 90% will dramatically decrease performance and you will
1187    be better off with C<low_mem> and using rest of availble memory for
1188    operating system disk cache (Linux is particuallary good about this).
1189    However, every access to database record will require disk access, so
1190    generation phase will be slower 10-100 times.
1191    
1192    Parsed structures are essential - you just have option to trade RAM memory
1193    (which is fast) for disk space (which is slow). Be sure to have planty of
1194    disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1195    
1196    However, when WebPAC is running on desktop machines (or laptops :-), it's
1197    highly undesireable for system to start swapping. Using C<low_mem> option can
1198    reduce WecPAC memory usage to around 64Mb for same database with lookup
1199    fields and sorted indexes which stay in RAM. Performance will suffer, but
1200    memory usage will really be minimal. It might be also more confortable to
1201    run WebPAC reniced on those machines.
1202    
1203  =cut  =cut
1204    
1205  1;  1;

Legend:
Removed from v.418  
changed lines
  Added in v.500

  ViewVC Help
Powered by ViewVC 1.1.26