/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 374 by dpavlin, Sun Jun 20 16:57:52 2004 UTC revision 436 by dpavlin, Mon Sep 13 14:55:13 2004 UTC
# Line 9  use Config::IniFiles; Line 9  use Config::IniFiles;
9  use XML::Simple;  use XML::Simple;
10  use Template;  use Template;
11  use Log::Log4perl qw(get_logger :levels);  use Log::Log4perl qw(get_logger :levels);
12    use Time::HiRes qw(time);
13    
14  use Data::Dumper;  use Data::Dumper;
15    
# Line 29  This module implements methods used by W Line 30  This module implements methods used by W
30    
31  =head2 new  =head2 new
32    
33  This will create new instance of WebPAC using configuration specified by C<config_file>.  Create new instance of WebPAC using configuration specified by C<config_file>.
34    
35   my $webpac = new WebPAC(   my $webpac = new WebPAC(
36          config_file => 'name.conf',          config_file => 'name.conf',
37          [code_page => 'ISO-8859-2',]          [code_page => 'ISO-8859-2',]
38            [low_mem => 1,]
39   );   );
40    
41  Default C<code_page> is C<ISO-8859-2>.  Default C<code_page> is C<ISO-8859-2>.
42    
43  It will also read configuration files  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44    
45    This method will also read configuration files
46  C<global.conf> (used by indexer and Web font-end)  C<global.conf> (used by indexer and Web font-end)
47  and configuration file specified by C<config_file>  and configuration file specified by C<config_file>
48  which describes databases to be indexed.  which describes databases to be indexed.
# Line 59  sub new { Line 63  sub new {
63          my $self = {@_};          my $self = {@_};
64          bless($self, $class);          bless($self, $class);
65    
66            $self->{'start_t'} = time();
67    
68          my $log_file = $self->{'log'} || "log.conf";          my $log_file = $self->{'log'} || "log.conf";
69          Log::Log4perl->init($log_file);          Log::Log4perl->init($log_file);
70    
# Line 106  sub new { Line 112  sub new {
112                  EVAL_PERL => 1,                  EVAL_PERL => 1,
113          );          );
114    
115            # running with low_mem flag? well, use DBM::Deep then.
116            if ($self->{'low_mem'}) {
117                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118    
119                    my $db_file = "data.db";
120    
121                    if (-e $db_file) {
122                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123                            $log->debug("removed '$db_file' from last run");
124                    }
125    
126                    require DBM::Deep;
127    
128                    my $db = new DBM::Deep $db_file;
129    
130                    $log->logdie("DBM::Deep error: $!") unless ($db);
131    
132                    if ($db->error()) {
133                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134                    } else {
135                            $log->debug("using file '$db_file' for DBM::Deep");
136                    }
137    
138                    $self->{'db'} = $db;
139            }
140    
141          return $self;          return $self;
142  }  }
143    
# Line 116  Open CDS/ISIS database using OpenIsis mo Line 148  Open CDS/ISIS database using OpenIsis mo
148   $webpac->open_isis(   $webpac->open_isis(
149          filename => '/data/ISIS/ISIS',          filename => '/data/ISIS/ISIS',
150          code_page => '852',          code_page => '852',
151          limit_mfn => '500',          limit_mfn => 500,
152            start_mfn => 6000,
153          lookup => [ ... ],          lookup => [ ... ],
154   );   );
155    
156  By default, ISIS code page is assumed to be C<852>.  By default, ISIS code page is assumed to be C<852>.
157    
158    If optional parameter C<start_mfn> is set, it will be the first MFN read
159    from the database (so you can skip the beginning of your database if you need to).
160    
161  If optional parametar C<limit_mfn> is set, it will read just 500 records  If optional parametar C<limit_mfn> is set, it will read just 500 records
162  from database in example above.  from database in example above.
163    
# Line 149  sub open_isis { Line 185  sub open_isis {
185          $log->logcroak("need filename") if (! $arg->{'filename'});          $log->logcroak("need filename") if (! $arg->{'filename'});
186          my $code_page = $arg->{'code_page'} || '852';          my $code_page = $arg->{'code_page'} || '852';
187    
188            $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
189    
190            # store data in object
191            $self->{'isis_filename'} = $arg->{'filename'};
192            $self->{'isis_code_page'} = $code_page;
193    
194          use OpenIsis;          use OpenIsis;
195    
196          #$self->{'isis_code_page'} = $code_page;          #$self->{'isis_code_page'} = $code_page;
# Line 157  sub open_isis { Line 199  sub open_isis {
199          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
200    
201          $log->info("reading ISIS database '",$arg->{'filename'},"'");          $log->info("reading ISIS database '",$arg->{'filename'},"'");
202            $log->debug("isis code page: $code_page");
203    
204          my $isis_db = OpenIsis::open($arg->{'filename'});          my $isis_db = OpenIsis::open($arg->{'filename'});
205    
206          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
207            my $startmfn = 1;
208    
209            if (my $s = $self->{'start_mfn'}) {
210                    $log->info("skipping to MFN $s");
211                    $startmfn = $s;
212            }
213    
214          $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});          $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
215    
216          $log->info("processing $maxmfn records...");          $log->info("processing ",($maxmfn-$startmfn)." records...");
217    
218          # read database          # read database
219          for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {          for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
220    
221    
222                    $log->debug("mfn: $mfn\n");
223    
224                    my $rec;
225    
226                  # read record                  # read record
227                  my $row = OpenIsis::read( $isis_db, $mfn );                  my $row = OpenIsis::read( $isis_db, $mfn );
# Line 186  sub open_isis { Line 240  sub open_isis {
240                                                  $val = $l;                                                  $val = $l;
241                                          }                                          }
242    
243                                          push @{$self->{'data'}->{$mfn}->{$k}}, $val;                                          push @{$rec->{$k}}, $val;
244                                  }                                  }
245                          } else {                          } else {
246                                  push @{$self->{'data'}->{$mfn}->{'000'}}, $mfn;                                  push @{$rec->{'000'}}, $mfn;
247                          }                          }
248    
249                  }                  }
250    
251                    $log->confess("record $mfn empty?") unless ($rec);
252    
253                    # store
254                    if ($self->{'low_mem'}) {
255                            $self->{'db'}->put($mfn, $rec);
256                    } else {
257                            $self->{'data'}->{$mfn} = $rec;
258                    }
259    
260                  # create lookup                  # create lookup
                 my $rec = $self->{'data'}->{$mfn};  
261                  $self->create_lookup($rec, @{$arg->{'lookup'}});                  $self->create_lookup($rec, @{$arg->{'lookup'}});
262    
263                    $self->progress_bar($mfn,$maxmfn);
264    
265          }          }
266    
267          $self->{'current_mfn'} = 1;          $self->{'current_mfn'} = $startmfn;
268            $self->{'last_pcnt'} = 0;
269    
270            $log->debug("max mfn: $maxmfn");
271    
272          # store max mfn and return it.          # store max mfn and return it.
273          return $self->{'max_mfn'} = $maxmfn;          return $self->{'max_mfn'} = $maxmfn;
# Line 228  sub fetch_rec { Line 295  sub fetch_rec {
295                  return;                  return;
296          }          }
297    
298          return $self->{'data'}->{$mfn};          $self->progress_bar($mfn,$self->{'max_mfn'});
299    
300            if ($self->{'low_mem'}) {
301                    return $self->{'db'}->get($mfn);
302            } else {
303                    return $self->{'data'}->{$mfn};
304            }
305    }
306    
307    =head2 progress_bar
308    
309    Draw progress bar on STDERR.
310    
311     $webpac->progress_bar($current, $max);
312    
313    =cut
314    
sub progress_bar {
	my $self = shift;

	# $curr: current position (must be true, i.e. non-zero)
	# $max:  position that corresponds to 100% (must be true)
	my ($curr,$max) = @_;

	my $log = $self->_get_logger();

	$log->logconfess("no current value!") if (! $curr);
	$log->logconfess("no maximum value!") if (! $max);

	# never report more than 100%: stretch maximum to current value
	if ($curr > $max) {
		$max = $curr;
		$log->debug("overflow to $curr");
	}

	$self->{'last_pcnt'} ||= 1;

	my $p = int($curr * 100 / $max);
	my $now = time();

	# reset on re-run (percentage went backwards since last call)
	if ($p < $self->{'last_pcnt'}) {
		$self->{'last_pcnt'} = $p;
		$self->{'last_t'} = $now;
		$self->{'last_curr'} = 1;
	}

	# Very first call may already be past the last seen percentage (small
	# databases, or reading which doesn't start at the beginning), so no
	# reset happened and there is no timestamp yet. Start measuring from
	# now instead of subtracting an undefined value.
	$self->{'last_t'} = $now unless (defined($self->{'last_t'}));

	# redraw only when the integer percentage changes
	if ($p != $self->{'last_pcnt'}) {

		my $last_curr = $self->{'last_curr'} || $curr;
		# "|| 1" guards against division by zero for sub-resolution intervals
		my $elapsed = ($now - $self->{'last_t'}) || 1;
		my $rate = ($curr - $last_curr) / $elapsed;
		my $eta = ($max - $curr) / ($rate || 1);
		printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
		$self->{'last_pcnt'} = $p;
		$self->{'last_t'} = $now;
		$self->{'last_curr'} = $curr;
	}

	# finish the line so following output doesn't overwrite the bar
	print STDERR "\n" if ($p == 100);
}
354    
355    =head2 fmt_time
356    
357    Format time (in seconds) for display.
358    
359     print $webpac->fmt_time(time());
360    
361    This method is called by L<progress_bar> to display remaining time.
362    
363    =cut
364    
sub fmt_time {
	my $self = shift;

	# duration in seconds; undef, 0 and negative values are shown as 00:00
	my $t = shift || 0;
	$t = 0 if ($t < 0);

	# Split by hand instead of using gmtime(): gmtime() wraps hours at 24,
	# so anything longer than a day would be displayed with wrong hours.
	my $secs = int($t);
	my $hh = int($secs / 3600);
	my $mm = int(($secs % 3600) / 60);
	my $ss = $secs % 60;

	my $out = "";
	$out .= "${hh}h" if ($hh);
	$out .= sprintf("%02d:%02d", $mm,$ss);
	# pad so that shorter string overwrites leftovers of a longer one
	$out .= "  " if ($hh == 0);
	return $out;
}
377    
378  =head2 open_import_xml  =head2 open_import_xml
# Line 254  sub open_import_xml { Line 398  sub open_import_xml {
398    
399          $self->{'tag'} = $type2tag{$type_base};          $self->{'tag'} = $type2tag{$type_base};
400    
401          $log->debug("using type '",$self->{'type'},"' tag <",$self->{'tag'},">") if ($self->{'debug'});          $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
402    
403          my $f = "./import_xml/".$self->{'type'}.".xml";          my $f = "./import_xml/".$self->{'type'}.".xml";
404          $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");          $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
405    
406          $log->debug("reading '$f'") if ($self->{'debug'});          $log->info("reading '$f'");
407    
408            $self->{'import_xml_file'} = $f;
409    
410          $self->{'import_xml'} = XMLin($f,          $self->{'import_xml'} = XMLin($f,
411                  ForceArray => [ $self->{'tag'}, 'config', 'format' ],                  ForceArray => [ $self->{'tag'}, 'config', 'format' ],
412          );          );
413    
414            $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
415    
416  }  }
417    
418  =head2 create_lookup  =head2 create_lookup
# Line 286  sub create_lookup { Line 434  sub create_lookup {
434          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
435    
436          foreach my $i (@_) {          foreach my $i (@_) {
437                  if ($i->{'eval'}) {                  $log->logconfess("need key") unless defined($i->{'key'});
438                          my $eval = $self->fill_in($rec,$i->{'eval'});                  $log->logconfess("need val") unless defined($i->{'val'});
439                          my $key = $self->fill_in($rec,$i->{'key'});  
440                          my @val = $self->fill_in($rec,$i->{'val'});                  if (defined($i->{'eval'})) {
441                          if ($key && @val && eval $eval) {                          # eval first, so we can skip fill_in for key and val
442                            my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
443                            if ($self->_eval($eval)) {
444                                    my $key = $self->fill_in($rec,$i->{'key'}) || next;
445                                    my @val = $self->fill_in($rec,$i->{'val'}) || next;
446                                  $log->debug("stored $key = ",sub { join(" | ",@val) });                                  $log->debug("stored $key = ",sub { join(" | ",@val) });
447                                  push @{$self->{'lookup'}->{$key}}, @val;                                  push @{$self->{'lookup'}->{$key}}, @val;
448                          }                          }
449                  } else {                  } else {
450                          my $key = $self->fill_in($rec,$i->{'key'});                          my $key = $self->fill_in($rec,$i->{'key'}) || next;
451                          my @val = $self->fill_in($rec,$i->{'val'});                          my @val = $self->fill_in($rec,$i->{'val'}) || next;
452                          if ($key && @val) {                          $log->debug("stored $key = ",sub { join(" | ",@val) });
453                                  $log->debug("stored $key = ",sub { join(" | ",@val) });                          push @{$self->{'lookup'}->{$key}}, @val;
                                 push @{$self->{'lookup'}->{$key}}, @val;  
                         }  
454                  }                  }
455          }          }
456  }  }
# Line 331  sub get_data { Line 481  sub get_data {
481    
482          if ($$rec->{$f}) {          if ($$rec->{$f}) {
483                  return '' if (! $$rec->{$f}->[$i]);                  return '' if (! $$rec->{$f}->[$i]);
484                    no strict 'refs';
485                  if ($sf && $$rec->{$f}->[$i]->{$sf}) {                  if ($sf && $$rec->{$f}->[$i]->{$sf}) {
486                          $$found++ if (defined($$found));                          $$found++ if (defined($$found));
487                          return $$rec->{$f}->[$i]->{$sf};                          return $$rec->{$f}->[$i]->{$sf};
# Line 371  Following example will read second value Line 522  Following example will read second value
522  This function B<does not> perform parsing of format to inteligenty skip  This function B<does not> perform parsing of format to inteligenty skip
523  delimiters before fields which aren't used.  delimiters before fields which aren't used.
524    
525    This method will automatically decode UTF-8 string to local code page
526    if needed.
527    
528  =cut  =cut
529    
530  sub fill_in {  sub fill_in {
# Line 386  sub fill_in { Line 540  sub fill_in {
540          # FIXME remove for speedup?          # FIXME remove for speedup?
541          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
542    
543            if (utf8::is_utf8($format)) {
544                    $format = $self->_x($format);
545            }
546    
547          my $found = 0;          my $found = 0;
548    
549          my $eval_code;          my $eval_code;
# Line 483  sub parse { Line 641  sub parse {
641    
642          $i = 0 if (! $i);          $i = 0 if (! $i);
643    
644          my $format = $self->{'utf2cp'}->convert($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});          my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
645    
646          my @out;          my @out;
647    
# Line 605  It is used later to produce output. Line 763  It is used later to produce output.
763   my @ds = $webpac->data_structure($rec);   my @ds = $webpac->data_structure($rec);
764    
765  This method will also set C<$webpac->{'currnet_filename'}> if there is  This method will also set C<$webpac->{'currnet_filename'}> if there is
766  <filename> tag in C<import_xml>.  <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
767    <headline> tag.
768    
769  =cut  =cut
770    
# Line 618  sub data_structure { Line 777  sub data_structure {
777          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
778    
779          undef $self->{'currnet_filename'};          undef $self->{'currnet_filename'};
780            undef $self->{'headline'};
781    
782          my @sorted_tags;          my @sorted_tags;
783          if ($self->{tags_by_order}) {          if ($self->{tags_by_order}) {
# Line 650  sub data_structure { Line 810  sub data_structure {
810                          }                          }
811                          next if (! @v);                          next if (! @v);
812    
813                            # use format?
814                            if ($tag->{'format_name'}) {
815                                    @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
816                            }
817    
818                            if ($field eq 'filename') {
819                                    $self->{'current_filename'} = join('',@v);
820                                    $log->debug("filename: ",$self->{'current_filename'});
821                            } elsif ($field eq 'headline') {
822                                    $self->{'headline'} .= join('',@v);
823                                    $log->debug("headline: ",$self->{'headline'});
824                                    next; # don't return headline in data_structure!
825                            }
826    
827                          # does tag have type?                          # does tag have type?
828                          if ($tag->{'type'}) {                          if ($tag->{'type'}) {
829                                  push @{$row->{$tag->{'type'}}}, @v;                                  push @{$row->{$tag->{'type'}}}, @v;
# Line 658  sub data_structure { Line 832  sub data_structure {
832                                  push @{$row->{'swish'}}, @v;                                  push @{$row->{'swish'}}, @v;
833                          }                          }
834    
                         if ($field eq 'filename') {  
                                 $self->{'current_filename'} = join('',@v);  
                                 $log->debug("filename: ",$self->{'current_filename'});  
                         }  
835    
836                  }                  }
837    
838                  if ($row) {                  if ($row) {
839                          $row->{'tag'} = $field;                          $row->{'tag'} = $field;
840    
841                            # TODO: name_sigular, name_plural
842                            my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
843                            $row->{'name'} = $name ? $self->_x($name) : $field;
844    
845                          push @ds, $row;                          push @ds, $row;
846    
847                          $log->debug("row $field: ",sub { Dumper($row) });                          $log->debug("row $field: ",sub { Dumper($row) });
# Line 707  sub output { Line 882  sub output {
882          return $out;          return $out;
883  }  }
884    
885    =head2 output_file
886    
887    Create output from in-memory data structure using Template Toolkit template
888    to a file.
889    
890     $webpac->output_file(
891            file => 'out.txt',
892            template => 'text.tt',
893            data => @ds
894     );
895    
896    =cut
897    
sub output_file {
	my $self = shift;

	# named arguments: file (required), template and data (both are
	# passed unchanged to $self->output)
	my $args = {@_};

	my $log = $self->_get_logger();

	my $file = $args->{'file'} || $log->logconfess("need file name");

	$log->debug("creating file ",$file);

	open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");

	# Render first and test result of print itself. Writing
	# "print $fh EXPR || die" applies || to EXPR, so empty output would
	# die and real print errors would go unnoticed.
	my $out = $self->output(
		template => $args->{'template'},
		data => $args->{'data'},
	);
	print {$fh} $out or $log->logdie("print: $!");

	# buffered write errors surface at close
	close($fh) || $log->logdie("close: $!");
}
916    
917    =head2 apply_format
918    
919    Apply format specified in tag with C<format_name="name"> and
920    C<format_delimiter=";;">.
921    
922     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
923    
924    Formats can contain C<lookup{...}> if you need them.
925    
926    =cut
927    
sub apply_format {
	my $self = shift;

	# $name:      name of <format> element from import_xml
	# $delimiter: string which separates values in $data
	# $data:      delimited values to fill into the format
	my ($name,$delimiter,$data) = @_;

	my $log = $self->_get_logger();

	# unknown format: complain and return data unchanged
	if (! $self->{'import_xml'}->{'format'}->{$name}) {
		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
		return $data;
	}

	# "0" is a usable delimiter, so test for defined and non-empty
	my $have_delimiter = (defined($delimiter) && length($delimiter));
	$log->warn("no delimiter for format $name") if (! $have_delimiter);

	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

	# Without delimiter use data as single value. Empty pattern would
	# make split return every character as separate value.
	my @data = $have_delimiter ? split(/\Q$delimiter\E/, $data) : ($data);

	# format content is used as sprintf template
	my $out = sprintf($format, @data);
	$log->debug("using format $name [$format] on $data to produce: $out");

	# formatted text can contain lookups which are resolved here
	if ($out =~ m/$LOOKUP_REGEX/o) {
		return $self->lookup($out);
	} else {
		return $out;
	}

}
956    
957    
958  #  #
959  #  #
960  #  #
# Line 759  sub _sort_by_order { Line 1007  sub _sort_by_order {
1007          return $va <=> $vb;          return $va <=> $vb;
1008  }  }
1009    
1010    =head2 _get_logger
1011    
1012    Get C<Log::Log4perl> object with a twist: domains are defined for each
1013    method
1014    
1015     my $log = $webpac->_get_logger();
1016    
1017    =cut
1018    
1019  sub _get_logger {  sub _get_logger {
1020          my $self = shift;          my $self = shift;
1021    
# Line 766  sub _get_logger { Line 1023  sub _get_logger {
1023          return get_logger($name);          return get_logger($name);
1024  }  }
1025    
1026    =head2 _x
1027    
1028    Convert string from UTF-8 to code page defined in C<import_xml>.
1029    
1030     my $text = $webpac->_x('utf8 text');
1031    
1032    =cut
1033    
sub _x {
	my $self = shift;
	my $utf8 = shift;

	# nothing to convert; test defined/length so that "0" is still converted
	return unless (defined($utf8) && length($utf8));

	my $converted = $self->{'utf2cp'}->convert($utf8);

	# Return nothing on failure instead of whatever logwarn returns, so
	# callers which do "_x(...) || ..." can detect the problem.
	# NOTE(review): assumes convert returns undef when it fails - confirm
	# against the converter stored in utf2cp.
	if (! defined($converted)) {
		$self->_get_logger()->logwarn("can't convert '$utf8'");
		return;
	}

	return $converted;
}
1041    
1042  #  #
1043  #  #
1044  #  #
# Line 784  B<This is different from normal Log4perl Line 1057  B<This is different from normal Log4perl
1057  also use method names, and not only classes (which are just few)  also use method names, and not only classes (which are just few)
1058  to filter logging.  to filter logging.
1059    
1060    
1061    =head1 MEMORY USAGE
1062    
1063    The C<low_mem> option is a double-edged sword. If enabled, WebPAC
1064    will run on memory-constrained machines (which don't have enough
1065    physical RAM to create the memory structure for the whole source database).
1066    
1067    If your machine has 512Mb or more of RAM and the database is around 10000 records,
1068    memory shouldn't be an issue. If you don't have enough physical RAM, you
1069    might consider using virtual memory (if your operating system handles it
1070    well, like FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
1071    the parsed structure of the ISIS database (this is what the C<low_mem> option does).
1072    
1073    Hitting swap at the end of reading the source database is probably o.k. However,
1074    hitting swap before 90% will dramatically decrease performance and you will
1075    be better off with C<low_mem> and using the rest of the available memory for
1076    the operating system disk cache (Linux is particularly good about this).
1077    However, every access to a database record will require disk access, so
1078    the generation phase will be 10-100 times slower.
1079    
1080    Parsed structures are essential - you just have the option to trade RAM
1081    (which is fast) for disk space (which is slow). Be sure to have plenty of
1082    disk space if you are using C<low_mem> and thus L<DBM::Deep>.
1083    
1084    However, when WebPAC is running on desktop machines (or laptops :-), it's
1085    highly undesirable for the system to start swapping. Using the C<low_mem> option can
1086    reduce WebPAC memory usage to around 64Mb for the same database with lookup
1087    fields and sorted indexes which stay in RAM. Performance will suffer, but
1088    memory usage will really be minimal. It might also be more comfortable to
1089    run WebPAC reniced on those machines.
1090    
1091  =cut  =cut
1092    
1093  1;  1;

Legend:
Removed from v.374  
changed lines
  Added in v.436

  ViewVC Help
Powered by ViewVC 1.1.26