/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 389 by dpavlin, Tue Jul 20 17:15:48 2004 UTC revision 453 by dpavlin, Wed Sep 15 21:21:36 2004 UTC
# Line 9  use Config::IniFiles; Line 9  use Config::IniFiles;
9  use XML::Simple;  use XML::Simple;
10  use Template;  use Template;
11  use Log::Log4perl qw(get_logger :levels);  use Log::Log4perl qw(get_logger :levels);
12    use Time::HiRes qw(time);
13    
14  use Data::Dumper;  use Data::Dumper;
15    
# Line 29  This module implements methods used by W Line 30  This module implements methods used by W
30    
31  =head2 new  =head2 new
32    
33  This will create new instance of WebPAC using configuration specified by C<config_file>.  Create new instance of WebPAC using configuration specified by C<config_file>.
34    
35   my $webpac = new WebPAC(   my $webpac = new WebPAC(
36          config_file => 'name.conf',          config_file => 'name.conf',
37          [code_page => 'ISO-8859-2',]          code_page => 'ISO-8859-2',
38            low_mem => 1,
39   );   );
40    
41  Default C<code_page> is C<ISO-8859-2>.  Default C<code_page> is C<ISO-8859-2>.
42    
43  It will also read configuration files  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44    
45    This method will also read configuration files
46  C<global.conf> (used by indexer and Web font-end)  C<global.conf> (used by indexer and Web font-end)
47  and configuration file specified by C<config_file>  and configuration file specified by C<config_file>
48  which describes databases to be indexed.  which describes databases to be indexed.
# Line 59  sub new { Line 63  sub new {
63          my $self = {@_};          my $self = {@_};
64          bless($self, $class);          bless($self, $class);
65    
66            $self->{'start_t'} = time();
67    
68          my $log_file = $self->{'log'} || "log.conf";          my $log_file = $self->{'log'} || "log.conf";
69          Log::Log4perl->init($log_file);          Log::Log4perl->init($log_file);
70    
# Line 106  sub new { Line 112  sub new {
112                  EVAL_PERL => 1,                  EVAL_PERL => 1,
113          );          );
114    
115            # running with low_mem flag? well, use DBM::Deep then.
116            if ($self->{'low_mem'}) {
117                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118    
119                    my $db_file = "data.db";
120    
121                    if (-e $db_file) {
122                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123                            $log->debug("removed '$db_file' from last run");
124                    }
125    
126                    require DBM::Deep;
127    
128                    my $db = new DBM::Deep $db_file;
129    
130                    $log->logdie("DBM::Deep error: $!") unless ($db);
131    
132                    if ($db->error()) {
133                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134                    } else {
135                            $log->debug("using file '$db_file' for DBM::Deep");
136                    }
137    
138                    $self->{'db'} = $db;
139            }
140    
141          return $self;          return $self;
142  }  }
143    
# Line 116  Open CDS/ISIS database using OpenIsis mo Line 148  Open CDS/ISIS database using OpenIsis mo
148   $webpac->open_isis(   $webpac->open_isis(
149          filename => '/data/ISIS/ISIS',          filename => '/data/ISIS/ISIS',
150          code_page => '852',          code_page => '852',
151          limit_mfn => '500',          limit_mfn => 500,
152            start_mfn => 6000,
153          lookup => [ ... ],          lookup => [ ... ],
154   );   );
155    
156  By default, ISIS code page is assumed to be C<852>.  By default, ISIS code page is assumed to be C<852>.
157    
158    If optional parametar C<start_mfn> is set, this will be first MFN to read
159    from database (so you can skip beginning of your database if you need to).
160    
161  If optional parametar C<limit_mfn> is set, it will read just 500 records  If optional parametar C<limit_mfn> is set, it will read just 500 records
162  from database in example above.  from database in example above.
163    
# Line 149  sub open_isis { Line 185  sub open_isis {
185          $log->logcroak("need filename") if (! $arg->{'filename'});          $log->logcroak("need filename") if (! $arg->{'filename'});
186          my $code_page = $arg->{'code_page'} || '852';          my $code_page = $arg->{'code_page'} || '852';
187    
188            $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
189    
190            # store data in object
191            $self->{'isis_filename'} = $arg->{'filename'};
192            $self->{'isis_code_page'} = $code_page;
193    
194          use OpenIsis;          use OpenIsis;
195    
196          #$self->{'isis_code_page'} = $code_page;          #$self->{'isis_code_page'} = $code_page;
# Line 157  sub open_isis { Line 199  sub open_isis {
199          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
200    
201          $log->info("reading ISIS database '",$arg->{'filename'},"'");          $log->info("reading ISIS database '",$arg->{'filename'},"'");
202            $log->debug("isis code page: $code_page");
203    
204          my $isis_db = OpenIsis::open($arg->{'filename'});          my $isis_db = OpenIsis::open($arg->{'filename'});
205    
206          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
207            my $startmfn = 1;
208    
209            if (my $s = $self->{'start_mfn'}) {
210                    $log->info("skipping to MFN $s");
211                    $startmfn = $s;
212            } else {
213                    $self->{'start_mfn'} = $startmfn;
214            }
215    
216          $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});          $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
217    
218          $log->info("processing $maxmfn records...");          $log->info("processing ",($maxmfn-$startmfn)." records...");
219    
220          # read database          # read database
221          for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {          for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
222    
223    
224                    $log->debug("mfn: $mfn\n");
225    
226                    my $rec;
227    
228                  # read record                  # read record
229                  my $row = OpenIsis::read( $isis_db, $mfn );                  my $row = OpenIsis::read( $isis_db, $mfn );
# Line 186  sub open_isis { Line 242  sub open_isis {
242                                                  $val = $l;                                                  $val = $l;
243                                          }                                          }
244    
245                                          push @{$self->{'data'}->{$mfn}->{$k}}, $val;                                          push @{$rec->{$k}}, $val;
246                                  }                                  }
247                          } else {                          } else {
248                                  push @{$self->{'data'}->{$mfn}->{'000'}}, $mfn;                                  push @{$rec->{'000'}}, $mfn;
249                          }                          }
250    
251                  }                  }
252    
253                    $log->confess("record $mfn empty?") unless ($rec);
254    
255                    # store
256                    if ($self->{'low_mem'}) {
257                            $self->{'db'}->put($mfn, $rec);
258                    } else {
259                            $self->{'data'}->{$mfn} = $rec;
260                    }
261    
262                  # create lookup                  # create lookup
                 my $rec = $self->{'data'}->{$mfn};  
263                  $self->create_lookup($rec, @{$arg->{'lookup'}});                  $self->create_lookup($rec, @{$arg->{'lookup'}});
264    
265                  $self->progress_bar($mfn,$maxmfn);                  $self->progress_bar($mfn,$maxmfn);
266    
267          }          }
268    
269          $self->{'current_mfn'} = 1;          $self->{'current_mfn'} = -1;
270          $self->{'last_pcnt'} = 0;          $self->{'last_pcnt'} = 0;
271    
272            $log->debug("max mfn: $maxmfn");
273    
274          # store max mfn and return it.          # store max mfn and return it.
275          return $self->{'max_mfn'} = $maxmfn;          return $self->{'max_mfn'} = $maxmfn;
276  }  }
# Line 223  sub fetch_rec { Line 289  sub fetch_rec {
289    
290          my $log = $self->_get_logger();          my $log = $self->_get_logger();
291    
292          my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");          $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
293    
294            if ($self->{'current_mfn'} == -1) {
295                    $self->{'current_mfn'} = $self->{'start_mfn'};
296            } else {
297                    $self->{'current_mfn'}++;
298            }
299    
300            my $mfn = $self->{'current_mfn'};
301    
302          if ($mfn > $self->{'max_mfn'}) {          if ($mfn > $self->{'max_mfn'}) {
303                  $self->{'current_mfn'} = $self->{'max_mfn'};                  $self->{'current_mfn'} = $self->{'max_mfn'};
# Line 233  sub fetch_rec { Line 307  sub fetch_rec {
307    
308          $self->progress_bar($mfn,$self->{'max_mfn'});          $self->progress_bar($mfn,$self->{'max_mfn'});
309    
310          return $self->{'data'}->{$mfn};          if ($self->{'low_mem'}) {
311                    return $self->{'db'}->get($mfn);
312            } else {
313                    return $self->{'data'}->{$mfn};
314            }
315    }
316    
317    =head2 mfn
318    
319    Returns current record number (MFN).
320    
321     print $webpac->mfn;
322    
323    =cut
324    
325    sub mfn {
326            my $self = shift;
327            return $self->{'current_mfn'};
328  }  }
329    
330  =head2 progress_bar  =head2 progress_bar
# Line 261  sub progress_bar { Line 352  sub progress_bar {
352    
353          $self->{'last_pcnt'} ||= 1;          $self->{'last_pcnt'} ||= 1;
354    
         $self->{'last_pcnt'} = $curr if ($curr < $self->{'last_pcnt'});  
   
355          my $p = int($curr * 100 / $max);          my $p = int($curr * 100 / $max);
356    
357            # reset on re-run
358            if ($p < $self->{'last_pcnt'}) {
359                    $self->{'last_pcnt'} = $p;
360                    $self->{'last_t'} = time();
361                    $self->{'last_curr'} = undef;
362            }
363    
364          if ($p != $self->{'last_pcnt'}) {          if ($p != $self->{'last_pcnt'}) {
365                  printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$curr,$max,"=" x ($p/2).">", $p );  
366                    my $last_curr = $self->{'last_curr'} || $curr;
367                    my $t = time();
368                    my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
369                    my $eta = ($max-$curr) / ($rate || 1);
370                    printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
371                  $self->{'last_pcnt'} = $p;                  $self->{'last_pcnt'} = $p;
372                    $self->{'last_t'} = time();
373                    $self->{'last_curr'} = $curr;
374          }          }
375            print STDERR "\n" if ($p == 100);
376    }
377    
378    =head2 fmt_time
379    
380    Format time (in seconds) for display.
381    
382     print $webpac->fmt_time(time());
383    
384    This method is called by L<progress_bar> to display remaining time.
385    
386    =cut
387    
388    sub fmt_time {
389            my $self = shift;
390    
391            my $t = shift || 0;
392            my $out = "";
393    
394            my ($ss,$mm,$hh) = gmtime($t);
395            $out .= "${hh}h" if ($hh);
396            $out .= sprintf("%02d:%02d", $mm,$ss);
397            $out .= "  " if ($hh == 0);
398            return $out;
399  }  }
400    
401  =head2 open_import_xml  =head2 open_import_xml
# Line 329  sub create_lookup { Line 457  sub create_lookup {
457          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
458    
459          foreach my $i (@_) {          foreach my $i (@_) {
460                  if ($i->{'eval'}) {                  $log->logconfess("need key") unless defined($i->{'key'});
461                          my $eval = $self->fill_in($rec,$i->{'eval'});                  $log->logconfess("need val") unless defined($i->{'val'});
462                          my $key = $self->fill_in($rec,$i->{'key'});  
463                          my @val = $self->fill_in($rec,$i->{'val'});                  if (defined($i->{'eval'})) {
464                          if ($key && @val && eval $eval) {                          # eval first, so we can skip fill_in for key and val
465                            my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
466                            if ($self->_eval($eval)) {
467                                    my $key = $self->fill_in($rec,$i->{'key'}) || next;
468                                    my @val = $self->fill_in($rec,$i->{'val'}) || next;
469                                  $log->debug("stored $key = ",sub { join(" | ",@val) });                                  $log->debug("stored $key = ",sub { join(" | ",@val) });
470                                  push @{$self->{'lookup'}->{$key}}, @val;                                  push @{$self->{'lookup'}->{$key}}, @val;
471                          }                          }
472                  } else {                  } else {
473                          my $key = $self->fill_in($rec,$i->{'key'});                          my $key = $self->fill_in($rec,$i->{'key'}) || next;
474                          my @val = $self->fill_in($rec,$i->{'val'});                          my @val = $self->fill_in($rec,$i->{'val'}) || next;
475                          if ($key && @val) {                          $log->debug("stored $key = ",sub { join(" | ",@val) });
476                                  $log->debug("stored $key = ",sub { join(" | ",@val) });                          push @{$self->{'lookup'}->{$key}}, @val;
                                 push @{$self->{'lookup'}->{$key}}, @val;  
                         }  
477                  }                  }
478          }          }
479  }  }
# Line 647  sub fill_in_to_arr { Line 777  sub fill_in_to_arr {
777          return @arr;          return @arr;
778  }  }
779    
780    =head2 sort_arr
781    
782    Sort array ignoring case and html in data
783    
784     my @sorted = $webpac->sort_arr(@unsorted);
785    
786    =cut
787    
788    sub sort_arr {
789            my $self = shift;
790    
791            my $log = $self->_get_logger();
792    
793            # FIXME add Schwartzian Transformation?
794    
795            my @sorted = sort {
796                    $a =~ s#<[^>]+/*>##;
797                    $b =~ s#<[^>]+/*>##;
798                    lc($b) cmp lc($a)
799            } @_;
800            $log->debug("sorted values: ",sub { join(", ",@sorted) });
801    
802            return @sorted;
803    }
804    
805    
806  =head2 data_structure  =head2 data_structure
807    
# Line 656  It is used later to produce output. Line 811  It is used later to produce output.
811   my @ds = $webpac->data_structure($rec);   my @ds = $webpac->data_structure($rec);
812    
813  This method will also set C<$webpac->{'currnet_filename'}> if there is  This method will also set C<$webpac->{'currnet_filename'}> if there is
814  <filename> tag in C<import_xml>.  <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
815    <headline> tag.
816    
817  =cut  =cut
818    
# Line 669  sub data_structure { Line 825  sub data_structure {
825          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
826    
827          undef $self->{'currnet_filename'};          undef $self->{'currnet_filename'};
828            undef $self->{'headline'};
829    
830          my @sorted_tags;          my @sorted_tags;
831          if ($self->{tags_by_order}) {          if ($self->{tags_by_order}) {
# Line 701  sub data_structure { Line 858  sub data_structure {
858                          }                          }
859                          next if (! @v);                          next if (! @v);
860    
861                            if ($tag->{'sort'}) {
862                                    @v = $self->sort_arr(@v);
863                                    $log->warn("sort within tag is usually not what you want!");
864                            }
865    
866                          # use format?                          # use format?
867                          if ($tag->{'format_name'}) {                          if ($tag->{'format_name'}) {
868                                  @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;                                  @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
869                          }                          }
870    
                         # does tag have type?  
                         if ($tag->{'type'}) {  
                                 push @{$row->{$tag->{'type'}}}, @v;  
                         } else {  
                                 push @{$row->{'display'}}, @v;  
                                 push @{$row->{'swish'}}, @v;  
                         }  
   
871                          if ($field eq 'filename') {                          if ($field eq 'filename') {
872                                  $self->{'current_filename'} = join('',@v);                                  $self->{'current_filename'} = join('',@v);
873                                  $log->debug("filename: ",$self->{'current_filename'});                                  $log->debug("filename: ",$self->{'current_filename'});
874                            } elsif ($field eq 'headline') {
875                                    $self->{'headline'} .= join('',@v);
876                                    $log->debug("headline: ",$self->{'headline'});
877                                    next; # don't return headline in data_structure!
878                            }
879    
880                            # delimiter will join repeatable fields
881                            if ($tag->{'delimiter'}) {
882                                    @v = ( join($tag->{'delimiter'}, @v) );
883                          }                          }
884    
885                            # default types
886                            my @types = qw(display swish);
887                            # override by type attribute
888                            @types = ( $tag->{'type'} ) if ($tag->{'type'});
889    
890                            foreach my $type (@types) {
891                                    # append to previous line?
892                                    $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
893                                    if ($tag->{'append'}) {
894    
895                                            # I will delimit appended part with
896                                            # delimiter (or ,)
897                                            my $d = $tag->{'delimiter'};
898                                            # default delimiter
899                                            $d ||= ", ";
900    
901                                            my $last = pop @{$row->{$type}};
902                                            $d = "" if (! $last);
903                                            $last .= $d . join($d, @v);
904                                            push @{$row->{$type}}, $last;
905    
906                                    } else {
907                                            push @{$row->{$type}}, @v;
908                                    }
909                            }
910    
911    
912                  }                  }
913    
914                  if ($row) {                  if ($row) {
# Line 728  sub data_structure { Line 918  sub data_structure {
918                          my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};                          my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
919                          $row->{'name'} = $name ? $self->_x($name) : $field;                          $row->{'name'} = $name ? $self->_x($name) : $field;
920    
921                            # post-sort all values in field
922                            if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
923                                    $log->warn("sort at field tag not implemented");
924                            }
925    
926                          push @ds, $row;                          push @ds, $row;
927    
928                          $log->debug("row $field: ",sub { Dumper($row) });                          $log->debug("row $field: ",sub { Dumper($row) });
# Line 768  sub output { Line 963  sub output {
963          return $out;          return $out;
964  }  }
965    
966    =head2 output_file
967    
968    Create output from in-memory data structure using Template Toolkit template
969    to a file.
970    
971     $webpac->output_file(
972            file => 'out.txt',
973            template => 'text.tt',
974            data => @ds
975     );
976    
977    =cut
978    
979    sub output_file {
980            my $self = shift;
981    
982            my $args = {@_};
983    
984            my $log = $self->_get_logger();
985    
986            my $file = $args->{'file'} || $log->logconfess("need file name");
987    
988            $log->debug("creating file ",$file);
989    
990            open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
991            print $fh $self->output(
992                    template => $args->{'template'},
993                    data => $args->{'data'},
994            ) || $log->logdie("print: $!");
995            close($fh) || $log->logdie("close: $!");
996    }
997    
998  =head2 apply_format  =head2 apply_format
999    
1000  Apply format specified in tag with C<format_name="name"> and  Apply format specified in tag with C<format_name="name"> and
# Line 911  B<This is different from normal Log4perl Line 1138  B<This is different from normal Log4perl
1138  also use method names, and not only classes (which are just few)  also use method names, and not only classes (which are just few)
1139  to filter logging.  to filter logging.
1140    
1141    
1142    =head1 MEMORY USAGE
1143    
1144    C<low_mem> options is double-edged sword. If enabled, WebPAC
1145    will run on memory constraint machines (which doesn't have enough
1146    physical RAM to create memory structure for whole source database).
1147    
1148    If your machine has 512Mb or more of RAM and database is around 10000 records,
1149    memory shouldn't be an issue. If you don't have enough physical RAM, you
1150    might consider using virtual memory (if your operating system is handling it
1151    well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1152    parsed structure of ISIS database (this is what C<low_mem> option does).
1153    
1154    Hitting swap at end of reading source database is probably o.k. However,
1155    hitting swap before 90% will dramatically decrease performance and you will
1156    be better off with C<low_mem> and using rest of availble memory for
1157    operating system disk cache (Linux is particuallary good about this).
1158    However, every access to database record will require disk access, so
1159    generation phase will be slower 10-100 times.
1160    
1161    Parsed structures are essential - you just have option to trade RAM memory
1162    (which is fast) for disk space (which is slow). Be sure to have planty of
1163    disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1164    
1165    However, when WebPAC is running on desktop machines (or laptops :-), it's
1166    highly undesireable for system to start swapping. Using C<low_mem> option can
1167    reduce WecPAC memory usage to around 64Mb for same database with lookup
1168    fields and sorted indexes which stay in RAM. Performance will suffer, but
1169    memory usage will really be minimal. It might be also more confortable to
1170    run WebPAC reniced on those machines.
1171    
1172  =cut  =cut
1173    
1174  1;  1;

Legend:
Removed from v.389  
changed lines
  Added in v.453

  ViewVC Help
Powered by ViewVC 1.1.26