/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 389 by dpavlin, Tue Jul 20 17:15:48 2004 UTC revision 459 by dpavlin, Tue Sep 21 19:08:11 2004 UTC
# Line 9  use Config::IniFiles; Line 9  use Config::IniFiles;
9  use XML::Simple;  use XML::Simple;
10  use Template;  use Template;
11  use Log::Log4perl qw(get_logger :levels);  use Log::Log4perl qw(get_logger :levels);
12    use Time::HiRes qw(time);
13    
14  use Data::Dumper;  use Data::Dumper;
15    
# Line 29  This module implements methods used by W Line 30  This module implements methods used by W
30    
31  =head2 new  =head2 new
32    
33  This will create new instance of WebPAC using configuration specified by C<config_file>.  Create new instance of WebPAC using configuration specified by C<config_file>.
34    
35   my $webpac = new WebPAC(   my $webpac = new WebPAC(
36          config_file => 'name.conf',          config_file => 'name.conf',
37          [code_page => 'ISO-8859-2',]          code_page => 'ISO-8859-2',
38            low_mem => 1,
39   );   );
40    
41  Default C<code_page> is C<ISO-8859-2>.  Default C<code_page> is C<ISO-8859-2>.
42    
43  It will also read configuration files  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44    
45    This method will also read configuration files
46  C<global.conf> (used by indexer and Web font-end)  C<global.conf> (used by indexer and Web font-end)
47  and configuration file specified by C<config_file>  and configuration file specified by C<config_file>
48  which describes databases to be indexed.  which describes databases to be indexed.
# Line 59  sub new { Line 63  sub new {
63          my $self = {@_};          my $self = {@_};
64          bless($self, $class);          bless($self, $class);
65    
66            $self->{'start_t'} = time();
67    
68          my $log_file = $self->{'log'} || "log.conf";          my $log_file = $self->{'log'} || "log.conf";
69          Log::Log4perl->init($log_file);          Log::Log4perl->init($log_file);
70    
# Line 106  sub new { Line 112  sub new {
112                  EVAL_PERL => 1,                  EVAL_PERL => 1,
113          );          );
114    
115            # running with low_mem flag? well, use DBM::Deep then.
116            if ($self->{'low_mem'}) {
117                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118    
119                    my $db_file = "data.db";
120    
121                    if (-e $db_file) {
122                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123                            $log->debug("removed '$db_file' from last run");
124                    }
125    
126                    require DBM::Deep;
127    
128                    my $db = new DBM::Deep $db_file;
129    
130                    $log->logdie("DBM::Deep error: $!") unless ($db);
131    
132                    if ($db->error()) {
133                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134                    } else {
135                            $log->debug("using file '$db_file' for DBM::Deep");
136                    }
137    
138                    $self->{'db'} = $db;
139            }
140    
141          return $self;          return $self;
142  }  }
143    
# Line 116  Open CDS/ISIS database using OpenIsis mo Line 148  Open CDS/ISIS database using OpenIsis mo
148   $webpac->open_isis(   $webpac->open_isis(
149          filename => '/data/ISIS/ISIS',          filename => '/data/ISIS/ISIS',
150          code_page => '852',          code_page => '852',
151          limit_mfn => '500',          limit_mfn => 500,
152            start_mfn => 6000,
153          lookup => [ ... ],          lookup => [ ... ],
154   );   );
155    
156  By default, ISIS code page is assumed to be C<852>.  By default, ISIS code page is assumed to be C<852>.
157    
158    If optional parametar C<start_mfn> is set, this will be first MFN to read
159    from database (so you can skip beginning of your database if you need to).
160    
161  If optional parametar C<limit_mfn> is set, it will read just 500 records  If optional parametar C<limit_mfn> is set, it will read just 500 records
162  from database in example above.  from database in example above.
163    
# Line 149  sub open_isis { Line 185  sub open_isis {
185          $log->logcroak("need filename") if (! $arg->{'filename'});          $log->logcroak("need filename") if (! $arg->{'filename'});
186          my $code_page = $arg->{'code_page'} || '852';          my $code_page = $arg->{'code_page'} || '852';
187    
188            $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
189    
190            # store data in object
191            $self->{'isis_filename'} = $arg->{'filename'};
192            $self->{'isis_code_page'} = $code_page;
193    
194          use OpenIsis;          use OpenIsis;
195    
196          #$self->{'isis_code_page'} = $code_page;          #$self->{'isis_code_page'} = $code_page;
# Line 157  sub open_isis { Line 199  sub open_isis {
199          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
200    
201          $log->info("reading ISIS database '",$arg->{'filename'},"'");          $log->info("reading ISIS database '",$arg->{'filename'},"'");
202            $log->debug("isis code page: $code_page");
203    
204          my $isis_db = OpenIsis::open($arg->{'filename'});          my $isis_db = OpenIsis::open($arg->{'filename'});
205    
206          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
207            my $startmfn = 1;
208    
209            if (my $s = $self->{'start_mfn'}) {
210                    $log->info("skipping to MFN $s");
211                    $startmfn = $s;
212            } else {
213                    $self->{'start_mfn'} = $startmfn;
214            }
215    
216          $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});          $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
217    
218          $log->info("processing $maxmfn records...");          $log->info("processing ",($maxmfn-$startmfn)." records...");
219    
220          # read database          # read database
221          for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {          for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
222    
223    
224                    $log->debug("mfn: $mfn\n");
225    
226                    my $rec;
227    
228                  # read record                  # read record
229                  my $row = OpenIsis::read( $isis_db, $mfn );                  my $row = OpenIsis::read( $isis_db, $mfn );
# Line 186  sub open_isis { Line 242  sub open_isis {
242                                                  $val = $l;                                                  $val = $l;
243                                          }                                          }
244    
245                                          push @{$self->{'data'}->{$mfn}->{$k}}, $val;                                          push @{$rec->{$k}}, $val;
246                                  }                                  }
247                          } else {                          } else {
248                                  push @{$self->{'data'}->{$mfn}->{'000'}}, $mfn;                                  push @{$rec->{'000'}}, $mfn;
249                          }                          }
250    
251                  }                  }
252    
253                    $log->confess("record $mfn empty?") unless ($rec);
254    
255                    # store
256                    if ($self->{'low_mem'}) {
257                            $self->{'db'}->put($mfn, $rec);
258                    } else {
259                            $self->{'data'}->{$mfn} = $rec;
260                    }
261    
262                  # create lookup                  # create lookup
                 my $rec = $self->{'data'}->{$mfn};  
263                  $self->create_lookup($rec, @{$arg->{'lookup'}});                  $self->create_lookup($rec, @{$arg->{'lookup'}});
264    
265                  $self->progress_bar($mfn,$maxmfn);                  $self->progress_bar($mfn,$maxmfn);
266    
267          }          }
268    
269          $self->{'current_mfn'} = 1;          $self->{'current_mfn'} = -1;
270          $self->{'last_pcnt'} = 0;          $self->{'last_pcnt'} = 0;
271    
272            $log->debug("max mfn: $maxmfn");
273    
274          # store max mfn and return it.          # store max mfn and return it.
275          return $self->{'max_mfn'} = $maxmfn;          return $self->{'max_mfn'} = $maxmfn;
276  }  }
# Line 223  sub fetch_rec { Line 289  sub fetch_rec {
289    
290          my $log = $self->_get_logger();          my $log = $self->_get_logger();
291    
292          my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");          $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
293    
294            if ($self->{'current_mfn'} == -1) {
295                    $self->{'current_mfn'} = $self->{'start_mfn'};
296            } else {
297                    $self->{'current_mfn'}++;
298            }
299    
300            my $mfn = $self->{'current_mfn'};
301    
302          if ($mfn > $self->{'max_mfn'}) {          if ($mfn > $self->{'max_mfn'}) {
303                  $self->{'current_mfn'} = $self->{'max_mfn'};                  $self->{'current_mfn'} = $self->{'max_mfn'};
# Line 233  sub fetch_rec { Line 307  sub fetch_rec {
307    
308          $self->progress_bar($mfn,$self->{'max_mfn'});          $self->progress_bar($mfn,$self->{'max_mfn'});
309    
310          return $self->{'data'}->{$mfn};          if ($self->{'low_mem'}) {
311                    return $self->{'db'}->get($mfn);
312            } else {
313                    return $self->{'data'}->{$mfn};
314            }
315    }
316    
317    =head2 mfn
318    
319    Returns current record number (MFN).
320    
321     print $webpac->mfn;
322    
323    =cut
324    
325    sub mfn {
326            my $self = shift;
327            return $self->{'current_mfn'};
328  }  }
329    
330  =head2 progress_bar  =head2 progress_bar
# Line 261  sub progress_bar { Line 352  sub progress_bar {
352    
353          $self->{'last_pcnt'} ||= 1;          $self->{'last_pcnt'} ||= 1;
354    
355          $self->{'last_pcnt'} = $curr if ($curr < $self->{'last_pcnt'});          my $p = int($curr * 100 / $max) || 1;
356    
357            # reset on re-run
358            if ($p < $self->{'last_pcnt'}) {
359                    $self->{'last_pcnt'} = $p;
360                    $self->{'last_t'} = time();
361                    $self->{'last_curr'} = undef;
362            }
363    
364            $self->{'last_t'} ||= time();
365    
         my $p = int($curr * 100 / $max);  
366          if ($p != $self->{'last_pcnt'}) {          if ($p != $self->{'last_pcnt'}) {
367                  printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$curr,$max,"=" x ($p/2).">", $p );  
368                    my $last_curr = $self->{'last_curr'} || $curr;
369                    my $t = time();
370                    my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
371                    my $eta = ($max-$curr) / ($rate || 1);
372                    printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
373                  $self->{'last_pcnt'} = $p;                  $self->{'last_pcnt'} = $p;
374                    $self->{'last_t'} = time();
375                    $self->{'last_curr'} = $curr;
376          }          }
377            print STDERR "\n" if ($p == 100);
378    }
379    
380    =head2 fmt_time
381    
382    Format time (in seconds) for display.
383    
384     print $webpac->fmt_time(time());
385    
386    This method is called by L<progress_bar> to display remaining time.
387    
388    =cut
389    
390    sub fmt_time {
391            my $self = shift;
392    
393            my $t = shift || 0;
394            my $out = "";
395    
396            my ($ss,$mm,$hh) = gmtime($t);
397            $out .= "${hh}h" if ($hh);
398            $out .= sprintf("%02d:%02d", $mm,$ss);
399            $out .= "  " if ($hh == 0);
400            return $out;
401  }  }
402    
403  =head2 open_import_xml  =head2 open_import_xml
# Line 329  sub create_lookup { Line 459  sub create_lookup {
459          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
460    
461          foreach my $i (@_) {          foreach my $i (@_) {
462                  if ($i->{'eval'}) {                  $log->logconfess("need key") unless defined($i->{'key'});
463                          my $eval = $self->fill_in($rec,$i->{'eval'});                  $log->logconfess("need val") unless defined($i->{'val'});
464                          my $key = $self->fill_in($rec,$i->{'key'});  
465                          my @val = $self->fill_in($rec,$i->{'val'});                  if (defined($i->{'eval'})) {
466                          if ($key && @val && eval $eval) {                          # eval first, so we can skip fill_in for key and val
467                            my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
468                            if ($self->_eval($eval)) {
469                                    my $key = $self->fill_in($rec,$i->{'key'}) || next;
470                                    my @val = $self->fill_in($rec,$i->{'val'}) || next;
471                                  $log->debug("stored $key = ",sub { join(" | ",@val) });                                  $log->debug("stored $key = ",sub { join(" | ",@val) });
472                                  push @{$self->{'lookup'}->{$key}}, @val;                                  push @{$self->{'lookup'}->{$key}}, @val;
473                          }                          }
474                  } else {                  } else {
475                          my $key = $self->fill_in($rec,$i->{'key'});                          my $key = $self->fill_in($rec,$i->{'key'}) || next;
476                          my @val = $self->fill_in($rec,$i->{'val'});                          my @val = $self->fill_in($rec,$i->{'val'}) || next;
477                          if ($key && @val) {                          $log->debug("stored $key = ",sub { join(" | ",@val) });
478                                  $log->debug("stored $key = ",sub { join(" | ",@val) });                          push @{$self->{'lookup'}->{$key}}, @val;
                                 push @{$self->{'lookup'}->{$key}}, @val;  
                         }  
479                  }                  }
480          }          }
481  }  }
# Line 647  sub fill_in_to_arr { Line 779  sub fill_in_to_arr {
779          return @arr;          return @arr;
780  }  }
781    
782    =head2 sort_arr
783    
784    Sort array ignoring case and html in data
785    
786     my @sorted = $webpac->sort_arr(@unsorted);
787    
788    =cut
789    
790    sub sort_arr {
791            my $self = shift;
792    
793            my $log = $self->_get_logger();
794    
795            # FIXME add Schwartzian Transformation?
796    
797            my @sorted = sort {
798                    $a =~ s#<[^>]+/*>##;
799                    $b =~ s#<[^>]+/*>##;
800                    lc($b) cmp lc($a)
801            } @_;
802            $log->debug("sorted values: ",sub { join(", ",@sorted) });
803    
804            return @sorted;
805    }
806    
807    
808  =head2 data_structure  =head2 data_structure
809    
# Line 656  It is used later to produce output. Line 813  It is used later to produce output.
813   my @ds = $webpac->data_structure($rec);   my @ds = $webpac->data_structure($rec);
814    
815  This method will also set C<$webpac->{'currnet_filename'}> if there is  This method will also set C<$webpac->{'currnet_filename'}> if there is
816  <filename> tag in C<import_xml>.  <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
817    <headline> tag.
818    
819  =cut  =cut
820    
# Line 669  sub data_structure { Line 827  sub data_structure {
827          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
828    
829          undef $self->{'currnet_filename'};          undef $self->{'currnet_filename'};
830            undef $self->{'headline'};
831    
832          my @sorted_tags;          my @sorted_tags;
833          if ($self->{tags_by_order}) {          if ($self->{tags_by_order}) {
# Line 701  sub data_structure { Line 860  sub data_structure {
860                          }                          }
861                          next if (! @v);                          next if (! @v);
862    
863                            if ($tag->{'sort'}) {
864                                    @v = $self->sort_arr(@v);
865                                    $log->warn("sort within tag is usually not what you want!");
866                            }
867    
868                          # use format?                          # use format?
869                          if ($tag->{'format_name'}) {                          if ($tag->{'format_name'}) {
870                                  @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;                                  @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
871                          }                          }
872    
                         # does tag have type?  
                         if ($tag->{'type'}) {  
                                 push @{$row->{$tag->{'type'}}}, @v;  
                         } else {  
                                 push @{$row->{'display'}}, @v;  
                                 push @{$row->{'swish'}}, @v;  
                         }  
   
873                          if ($field eq 'filename') {                          if ($field eq 'filename') {
874                                  $self->{'current_filename'} = join('',@v);                                  $self->{'current_filename'} = join('',@v);
875                                  $log->debug("filename: ",$self->{'current_filename'});                                  $log->debug("filename: ",$self->{'current_filename'});
876                            } elsif ($field eq 'headline') {
877                                    $self->{'headline'} .= join('',@v);
878                                    $log->debug("headline: ",$self->{'headline'});
879                                    next; # don't return headline in data_structure!
880                            }
881    
882                            # delimiter will join repeatable fields
883                            if ($tag->{'delimiter'}) {
884                                    @v = ( join($tag->{'delimiter'}, @v) );
885                          }                          }
886    
887                            # default types
888                            my @types = qw(display swish);
889                            # override by type attribute
890                            @types = ( $tag->{'type'} ) if ($tag->{'type'});
891    
892                            foreach my $type (@types) {
893                                    # append to previous line?
894                                    $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
895                                    if ($tag->{'append'}) {
896    
897                                            # I will delimit appended part with
898                                            # delimiter (or ,)
899                                            my $d = $tag->{'delimiter'};
900                                            # default delimiter
901                                            $d ||= ", ";
902    
903                                            my $last = pop @{$row->{$type}};
904                                            $d = "" if (! $last);
905                                            $last .= $d . join($d, @v);
906                                            push @{$row->{$type}}, $last;
907    
908                                    } else {
909                                            push @{$row->{$type}}, @v;
910                                    }
911                            }
912    
913    
914                  }                  }
915    
916                  if ($row) {                  if ($row) {
# Line 728  sub data_structure { Line 920  sub data_structure {
920                          my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};                          my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
921                          $row->{'name'} = $name ? $self->_x($name) : $field;                          $row->{'name'} = $name ? $self->_x($name) : $field;
922    
923                            # post-sort all values in field
924                            if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
925                                    $log->warn("sort at field tag not implemented");
926                            }
927    
928                          push @ds, $row;                          push @ds, $row;
929    
930                          $log->debug("row $field: ",sub { Dumper($row) });                          $log->debug("row $field: ",sub { Dumper($row) });
# Line 768  sub output { Line 965  sub output {
965          return $out;          return $out;
966  }  }
967    
968    =head2 output_file
969    
970    Create output from in-memory data structure using Template Toolkit template
971    to a file.
972    
973     $webpac->output_file(
974            file => 'out.txt',
975            template => 'text.tt',
976            data => @ds
977     );
978    
979    =cut
980    
981    sub output_file {
982            my $self = shift;
983    
984            my $args = {@_};
985    
986            my $log = $self->_get_logger();
987    
988            my $file = $args->{'file'} || $log->logconfess("need file name");
989    
990            $log->debug("creating file ",$file);
991    
992            open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
993            print $fh $self->output(
994                    template => $args->{'template'},
995                    data => $args->{'data'},
996            ) || $log->logdie("print: $!");
997            close($fh) || $log->logdie("close: $!");
998    }
999    
1000  =head2 apply_format  =head2 apply_format
1001    
1002  Apply format specified in tag with C<format_name="name"> and  Apply format specified in tag with C<format_name="name"> and
# Line 911  B<This is different from normal Log4perl Line 1140  B<This is different from normal Log4perl
1140  also use method names, and not only classes (which are just few)  also use method names, and not only classes (which are just few)
1141  to filter logging.  to filter logging.
1142    
1143    
1144    =head1 MEMORY USAGE
1145    
1146    C<low_mem> options is double-edged sword. If enabled, WebPAC
1147    will run on memory constraint machines (which doesn't have enough
1148    physical RAM to create memory structure for whole source database).
1149    
1150    If your machine has 512Mb or more of RAM and database is around 10000 records,
1151    memory shouldn't be an issue. If you don't have enough physical RAM, you
1152    might consider using virtual memory (if your operating system is handling it
1153    well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1154    parsed structure of ISIS database (this is what C<low_mem> option does).
1155    
1156    Hitting swap at end of reading source database is probably o.k. However,
1157    hitting swap before 90% will dramatically decrease performance and you will
1158    be better off with C<low_mem> and using rest of availble memory for
1159    operating system disk cache (Linux is particuallary good about this).
1160    However, every access to database record will require disk access, so
1161    generation phase will be slower 10-100 times.
1162    
1163    Parsed structures are essential - you just have option to trade RAM memory
1164    (which is fast) for disk space (which is slow). Be sure to have planty of
1165    disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1166    
1167    However, when WebPAC is running on desktop machines (or laptops :-), it's
1168    highly undesireable for system to start swapping. Using C<low_mem> option can
1169    reduce WecPAC memory usage to around 64Mb for same database with lookup
1170    fields and sorted indexes which stay in RAM. Performance will suffer, but
1171    memory usage will really be minimal. It might be also more confortable to
1172    run WebPAC reniced on those machines.
1173    
1174  =cut  =cut
1175    
1176  1;  1;

Legend:
Removed from v.389  
changed lines
  Added in v.459

  ViewVC Help
Powered by ViewVC 1.1.26