/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 374 by dpavlin, Sun Jun 20 16:57:52 2004 UTC revision 422 by dpavlin, Sat Sep 11 08:36:38 2004 UTC
# Line 9  use Config::IniFiles; Line 9  use Config::IniFiles;
9  use XML::Simple;  use XML::Simple;
10  use Template;  use Template;
11  use Log::Log4perl qw(get_logger :levels);  use Log::Log4perl qw(get_logger :levels);
12    use Time::HiRes qw(time);
13    
14  use Data::Dumper;  use Data::Dumper;
15    
# Line 29  This module implements methods used by W Line 30  This module implements methods used by W
30    
31  =head2 new  =head2 new
32    
33  This will create new instance of WebPAC using configuration specified by C<config_file>.  Create new instance of WebPAC using configuration specified by C<config_file>.
34    
35   my $webpac = new WebPAC(   my $webpac = new WebPAC(
36          config_file => 'name.conf',          config_file => 'name.conf',
37          [code_page => 'ISO-8859-2',]          [code_page => 'ISO-8859-2',]
38            [low_mem => 1,]
39   );   );
40    
41  Default C<code_page> is C<ISO-8859-2>.  Default C<code_page> is C<ISO-8859-2>.
42    
43  It will also read configuration files  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44    
45    This method will also read configuration files
46  C<global.conf> (used by indexer and Web font-end)  C<global.conf> (used by indexer and Web font-end)
47  and configuration file specified by C<config_file>  and configuration file specified by C<config_file>
48  which describes databases to be indexed.  which describes databases to be indexed.
# Line 59  sub new { Line 63  sub new {
63          my $self = {@_};          my $self = {@_};
64          bless($self, $class);          bless($self, $class);
65    
66            $self->{'start_t'} = time();
67    
68          my $log_file = $self->{'log'} || "log.conf";          my $log_file = $self->{'log'} || "log.conf";
69          Log::Log4perl->init($log_file);          Log::Log4perl->init($log_file);
70    
# Line 106  sub new { Line 112  sub new {
112                  EVAL_PERL => 1,                  EVAL_PERL => 1,
113          );          );
114    
115            # running with low_mem flag? well, use DBM::Deep then.
116            if ($self->{'low_mem'}) {
117                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118    
119                    my $db_file = "data.db";
120    
121                    if (-e $db_file) {
122                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123                            $log->debug("removed '$db_file' from last run");
124                    }
125    
126                    use DBM::Deep;
127    
128                    my $db = new DBM::Deep $db_file;
129    
130                    $log->logdie("DBM::Deep error: $!") unless ($db);
131    
132                    if ($db->error()) {
133                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134                    } else {
135                            $log->debug("using file '$db_file' for DBM::Deep");
136                    }
137    
138                    $self->{'db'} = $db;
139            }
140    
141          return $self;          return $self;
142  }  }
143    
# Line 149  sub open_isis { Line 181  sub open_isis {
181          $log->logcroak("need filename") if (! $arg->{'filename'});          $log->logcroak("need filename") if (! $arg->{'filename'});
182          my $code_page = $arg->{'code_page'} || '852';          my $code_page = $arg->{'code_page'} || '852';
183    
184            # store data in object
185            $self->{'isis_filename'} = $arg->{'filename'};
186            $self->{'isis_code_page'} = $code_page;
187    
188          use OpenIsis;          use OpenIsis;
189    
190          #$self->{'isis_code_page'} = $code_page;          #$self->{'isis_code_page'} = $code_page;
# Line 157  sub open_isis { Line 193  sub open_isis {
193          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
194    
195          $log->info("reading ISIS database '",$arg->{'filename'},"'");          $log->info("reading ISIS database '",$arg->{'filename'},"'");
196            $log->debug("isis code page: $code_page");
197    
198          my $isis_db = OpenIsis::open($arg->{'filename'});          my $isis_db = OpenIsis::open($arg->{'filename'});
199    
# Line 169  sub open_isis { Line 206  sub open_isis {
206          # read database          # read database
207          for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {          for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
208    
209    
210                    $log->debug("mfn: $mfn\n");
211    
212                    my $rec;
213    
214                  # read record                  # read record
215                  my $row = OpenIsis::read( $isis_db, $mfn );                  my $row = OpenIsis::read( $isis_db, $mfn );
216                  foreach my $k (keys %{$row}) {                  foreach my $k (keys %{$row}) {
# Line 186  sub open_isis { Line 228  sub open_isis {
228                                                  $val = $l;                                                  $val = $l;
229                                          }                                          }
230    
231                                          push @{$self->{'data'}->{$mfn}->{$k}}, $val;                                          push @{$rec->{$k}}, $val;
232                                  }                                  }
233                          } else {                          } else {
234                                  push @{$self->{'data'}->{$mfn}->{'000'}}, $mfn;                                  push @{$rec->{'000'}}, $mfn;
235                          }                          }
236    
237                  }                  }
238    
239                    $log->confess("record $mfn empty?") unless ($rec);
240    
241                    # store
242                    if ($self->{'low_mem'}) {
243                            $self->{'db'}->put($mfn, $rec);
244                    } else {
245                            $self->{'data'}->{$mfn} = $rec;
246                    }
247    
248                  # create lookup                  # create lookup
                 my $rec = $self->{'data'}->{$mfn};  
249                  $self->create_lookup($rec, @{$arg->{'lookup'}});                  $self->create_lookup($rec, @{$arg->{'lookup'}});
250    
251                    $self->progress_bar($mfn,$maxmfn);
252    
253          }          }
254    
255          $self->{'current_mfn'} = 1;          $self->{'current_mfn'} = 1;
256            $self->{'last_pcnt'} = 0;
257    
258            $log->debug("max mfn: $maxmfn");
259    
260          # store max mfn and return it.          # store max mfn and return it.
261          return $self->{'max_mfn'} = $maxmfn;          return $self->{'max_mfn'} = $maxmfn;
# Line 228  sub fetch_rec { Line 283  sub fetch_rec {
283                  return;                  return;
284          }          }
285    
286          return $self->{'data'}->{$mfn};          $self->progress_bar($mfn,$self->{'max_mfn'});
287    
288            if ($self->{'low_mem'}) {
289                    return $self->{'db'}->get($mfn);
290            } else {
291                    return $self->{'data'}->{$mfn};
292            }
293    }
294    
295    =head2 progress_bar
296    
297    Draw progress bar on STDERR.
298    
299     $webpac->progress_bar($current, $max);
300    
301    =cut
302    
303    sub progress_bar {
304            my $self = shift;
305    
306            my ($curr,$max) = @_;
307    
308            my $log = $self->_get_logger();
309    
310            $log->logconfess("no current value!") if (! $curr);
311            $log->logconfess("no maximum value!") if (! $max);
312    
313            if ($curr > $max) {
314                    $max = $curr;
315                    $log->debug("overflow to $curr");
316            }
317    
318            $self->{'last_pcnt'} ||= 1;
319    
320            my $p = int($curr * 100 / $max);
321    
322            # reset on re-run
323            if ($p < $self->{'last_pcnt'}) {
324                    $self->{'last_pcnt'} = $p;
325                    $self->{'last_t'} = time();
326                    $self->{'last_curr'} = 1;
327            }
328    
329            if ($p != $self->{'last_pcnt'}) {
330    
331                    my $last_curr = $self->{'last_curr'} || $curr;
332                    my $t = time();
333                    my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
334                    my $eta = ($max-$curr) / ($rate || 1);
335                    printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
336                    $self->{'last_pcnt'} = $p;
337                    $self->{'last_t'} = time();
338                    $self->{'last_curr'} = $curr;
339            }
340            print STDERR "\n" if ($p == 100);
341    }
342    
343    =head2 fmt_time
344    
345    Format time (in seconds) for display.
346    
347     print $webpac->fmt_time(time());
348    
349    This method is called by L<progress_bar> to display remaining time.
350    
351    =cut
352    
353    sub fmt_time {
354            my $self = shift;
355    
356            my $t = shift || 0;
357            my $out = "";
358    
359            my ($ss,$mm,$hh) = gmtime($t);
360            $out .= "${hh}h" if ($hh);
361            $out .= sprintf("%02d:%02d", $mm,$ss);
362            $out .= "  " if ($hh == 0);
363            return $out;
364  }  }
365    
366  =head2 open_import_xml  =head2 open_import_xml
# Line 254  sub open_import_xml { Line 386  sub open_import_xml {
386    
387          $self->{'tag'} = $type2tag{$type_base};          $self->{'tag'} = $type2tag{$type_base};
388    
389          $log->debug("using type '",$self->{'type'},"' tag <",$self->{'tag'},">") if ($self->{'debug'});          $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
390    
391          my $f = "./import_xml/".$self->{'type'}.".xml";          my $f = "./import_xml/".$self->{'type'}.".xml";
392          $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");          $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
393    
394          $log->debug("reading '$f'") if ($self->{'debug'});          $log->info("reading '$f'");
395    
396            $self->{'import_xml_file'} = $f;
397    
398          $self->{'import_xml'} = XMLin($f,          $self->{'import_xml'} = XMLin($f,
399                  ForceArray => [ $self->{'tag'}, 'config', 'format' ],                  ForceArray => [ $self->{'tag'}, 'config', 'format' ],
400          );          );
401    
402            $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
403    
404  }  }
405    
406  =head2 create_lookup  =head2 create_lookup
# Line 286  sub create_lookup { Line 422  sub create_lookup {
422          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
423    
424          foreach my $i (@_) {          foreach my $i (@_) {
425                  if ($i->{'eval'}) {                  $log->logconfess("need key") unless defined($i->{'key'});
426                          my $eval = $self->fill_in($rec,$i->{'eval'});                  $log->logconfess("need val") unless defined($i->{'val'});
427                          my $key = $self->fill_in($rec,$i->{'key'});  
428                          my @val = $self->fill_in($rec,$i->{'val'});                  if (defined($i->{'eval'})) {
429                          if ($key && @val && eval $eval) {                          # eval first, so we can skip fill_in for key and val
430                            my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
431                            if ($self->_eval($eval)) {
432                                    my $key = $self->fill_in($rec,$i->{'key'}) || next;
433                                    my @val = $self->fill_in($rec,$i->{'val'}) || next;
434                                  $log->debug("stored $key = ",sub { join(" | ",@val) });                                  $log->debug("stored $key = ",sub { join(" | ",@val) });
435                                  push @{$self->{'lookup'}->{$key}}, @val;                                  push @{$self->{'lookup'}->{$key}}, @val;
436                          }                          }
437                  } else {                  } else {
438                          my $key = $self->fill_in($rec,$i->{'key'});                          my $key = $self->fill_in($rec,$i->{'key'}) || next;
439                          my @val = $self->fill_in($rec,$i->{'val'});                          my @val = $self->fill_in($rec,$i->{'val'}) || next;
440                          if ($key && @val) {                          $log->debug("stored $key = ",sub { join(" | ",@val) });
441                                  $log->debug("stored $key = ",sub { join(" | ",@val) });                          push @{$self->{'lookup'}->{$key}}, @val;
                                 push @{$self->{'lookup'}->{$key}}, @val;  
                         }  
442                  }                  }
443          }          }
444  }  }
# Line 331  sub get_data { Line 469  sub get_data {
469    
470          if ($$rec->{$f}) {          if ($$rec->{$f}) {
471                  return '' if (! $$rec->{$f}->[$i]);                  return '' if (! $$rec->{$f}->[$i]);
472                    no strict 'refs';
473                  if ($sf && $$rec->{$f}->[$i]->{$sf}) {                  if ($sf && $$rec->{$f}->[$i]->{$sf}) {
474                          $$found++ if (defined($$found));                          $$found++ if (defined($$found));
475                          return $$rec->{$f}->[$i]->{$sf};                          return $$rec->{$f}->[$i]->{$sf};
# Line 371  Following example will read second value Line 510  Following example will read second value
510  This function B<does not> perform parsing of format to inteligenty skip  This function B<does not> perform parsing of format to inteligenty skip
511  delimiters before fields which aren't used.  delimiters before fields which aren't used.
512    
513    This method will automatically decode UTF-8 string to local code page
514    if needed.
515    
516  =cut  =cut
517    
518  sub fill_in {  sub fill_in {
# Line 386  sub fill_in { Line 528  sub fill_in {
528          # FIXME remove for speedup?          # FIXME remove for speedup?
529          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
530    
531            if (utf8::is_utf8($format)) {
532                    $format = $self->_x($format);
533            }
534    
535          my $found = 0;          my $found = 0;
536    
537          my $eval_code;          my $eval_code;
# Line 483  sub parse { Line 629  sub parse {
629    
630          $i = 0 if (! $i);          $i = 0 if (! $i);
631    
632          my $format = $self->{'utf2cp'}->convert($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});          my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
633    
634          my @out;          my @out;
635    
# Line 605  It is used later to produce output. Line 751  It is used later to produce output.
751   my @ds = $webpac->data_structure($rec);   my @ds = $webpac->data_structure($rec);
752    
753  This method will also set C<$webpac->{'currnet_filename'}> if there is  This method will also set C<$webpac->{'currnet_filename'}> if there is
754  <filename> tag in C<import_xml>.  <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
755    <headline> tag.
756    
757  =cut  =cut
758    
# Line 618  sub data_structure { Line 765  sub data_structure {
765          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
766    
767          undef $self->{'currnet_filename'};          undef $self->{'currnet_filename'};
768            undef $self->{'headline'};
769    
770          my @sorted_tags;          my @sorted_tags;
771          if ($self->{tags_by_order}) {          if ($self->{tags_by_order}) {
# Line 650  sub data_structure { Line 798  sub data_structure {
798                          }                          }
799                          next if (! @v);                          next if (! @v);
800    
801                            # use format?
802                            if ($tag->{'format_name'}) {
803                                    @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
804                            }
805    
806                            if ($field eq 'filename') {
807                                    $self->{'current_filename'} = join('',@v);
808                                    $log->debug("filename: ",$self->{'current_filename'});
809                            } elsif ($field eq 'headline') {
810                                    $self->{'headline'} .= join('',@v);
811                                    $log->debug("headline: ",$self->{'headline'});
812                                    next; # don't return headline in data_structure!
813                            }
814    
815                          # does tag have type?                          # does tag have type?
816                          if ($tag->{'type'}) {                          if ($tag->{'type'}) {
817                                  push @{$row->{$tag->{'type'}}}, @v;                                  push @{$row->{$tag->{'type'}}}, @v;
# Line 658  sub data_structure { Line 820  sub data_structure {
820                                  push @{$row->{'swish'}}, @v;                                  push @{$row->{'swish'}}, @v;
821                          }                          }
822    
                         if ($field eq 'filename') {  
                                 $self->{'current_filename'} = join('',@v);  
                                 $log->debug("filename: ",$self->{'current_filename'});  
                         }  
823    
824                  }                  }
825    
826                  if ($row) {                  if ($row) {
827                          $row->{'tag'} = $field;                          $row->{'tag'} = $field;
828    
829                            # TODO: name_sigular, name_plural
830                            my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
831                            $row->{'name'} = $name ? $self->_x($name) : $field;
832    
833                          push @ds, $row;                          push @ds, $row;
834    
835                          $log->debug("row $field: ",sub { Dumper($row) });                          $log->debug("row $field: ",sub { Dumper($row) });
# Line 707  sub output { Line 870  sub output {
870          return $out;          return $out;
871  }  }
872    
873    =head2 output_file
874    
875    Create output from in-memory data structure using Template Toolkit template
876    to a file.
877    
878     $webpac->output_file(
879            file => 'out.txt',
880            template => 'text.tt',
881            data => @ds
882     );
883    
884    =cut
885    
886    sub output_file {
887            my $self = shift;
888    
889            my $args = {@_};
890    
891            my $log = $self->_get_logger();
892    
893            my $file = $args->{'file'} || $log->logconfess("need file name");
894    
895            $log->debug("creating file ",$file);
896    
897            open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
898            print $fh $self->output(
899                    template => $args->{'template'},
900                    data => $args->{'data'},
901            ) || $log->logdie("print: $!");
902            close($fh) || $log->logdie("close: $!");
903    }
904    
905    =head2 apply_format
906    
907    Apply format specified in tag with C<format_name="name"> and
908    C<format_delimiter=";;">.
909    
910     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
911    
912    Formats can contain C<lookup{...}> if you need them.
913    
914    =cut
915    
916    sub apply_format {
917            my $self = shift;
918    
919            my ($name,$delimiter,$data) = @_;
920    
921            my $log = $self->_get_logger();
922    
923            if (! $self->{'import_xml'}->{'format'}->{$name}) {
924                    $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
925                    return $data;
926            }
927    
928            $log->warn("no delimiter for format $name") if (! $delimiter);
929    
930            my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
931    
932            my @data = split(/\Q$delimiter\E/, $data);
933    
934            my $out = sprintf($format, @data);
935            $log->debug("using format $name [$format] on $data to produce: $out");
936    
937            if ($out =~ m/$LOOKUP_REGEX/o) {
938                    return $self->lookup($out);
939            } else {
940                    return $out;
941            }
942    
943    }
944    
945    
946  #  #
947  #  #
948  #  #
# Line 759  sub _sort_by_order { Line 995  sub _sort_by_order {
995          return $va <=> $vb;          return $va <=> $vb;
996  }  }
997    
998    =head2 _get_logger
999    
1000    Get C<Log::Log4perl> object with a twist: domains are defined for each
1001    method
1002    
1003     my $log = $webpac->_get_logger();
1004    
1005    =cut
1006    
1007  sub _get_logger {  sub _get_logger {
1008          my $self = shift;          my $self = shift;
1009    
# Line 766  sub _get_logger { Line 1011  sub _get_logger {
1011          return get_logger($name);          return get_logger($name);
1012  }  }
1013    
1014    =head2 _x
1015    
1016    Convert string from UTF-8 to code page defined in C<import_xml>.
1017    
1018     my $text = $webpac->_x('utf8 text');
1019    
1020    =cut
1021    
1022    sub _x {
1023            my $self = shift;
1024            my $utf8 = shift || return;
1025    
1026            return $self->{'utf2cp'}->convert($utf8) ||
1027                    $self->_get_logger()->logwarn("can't convert '$utf8'");
1028    }
1029    
1030  #  #
1031  #  #
1032  #  #
# Line 784  B<This is different from normal Log4perl Line 1045  B<This is different from normal Log4perl
1045  also use method names, and not only classes (which are just few)  also use method names, and not only classes (which are just few)
1046  to filter logging.  to filter logging.
1047    
1048    
1049    =head1 MEMORY USAGE
1050    
1051    C<low_mem> options is double-edged sword. If enabled, WebPAC
1052    will run on memory constraint machines (which doesn't have enough
1053    physical RAM to create memory structure for whole source database).
1054    
1055    If your machine has 512Mb or more of RAM and database is around 10000 records,
1056    memory shouldn't be an issue. If you don't have enough physical RAM, you
1057    might consider using virtual memory (if your operating system is handling it
1058    well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1059    parsed structure of ISIS database (this is what C<low_mem> option does).
1060    
1061    Hitting swap at end of reading source database is probably o.k. However,
1062    hitting swap before 90% will dramatically decrease performance and you will
1063    be better off with C<low_mem> and using rest of availble memory for
1064    operating system disk cache (Linux is particuallary good about this).
1065    However, every access to database record will require disk access, so
1066    generation phase will be slower 10-100 times.
1067    
1068    Parsed structures are essential - you just have option to trade RAM memory
1069    (which is fast) for disk space (which is slow). Be sure to have planty of
1070    disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1071    
1072    However, when WebPAC is running on desktop machines (or laptops :-), it's
1073    highly undesireable for system to start swapping. Using C<low_mem> option can
1074    reduce WecPAC memory usage to around 64Mb for same database with lookup
1075    fields and sorted indexes which stay in RAM. Performance will suffer, but
1076    memory usage will really be minimal. It might be also more confortable to
1077    run WebPAC reniced on those machines.
1078    
1079  =cut  =cut
1080    
1081  1;  1;

Legend:
Removed from v.374  
changed lines
  Added in v.422

  ViewVC Help
Powered by ViewVC 1.1.26