/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 411 by dpavlin, Sun Sep 5 22:22:37 2004 UTC revision 441 by dpavlin, Tue Sep 14 17:07:59 2004 UTC
# Line 9  use Config::IniFiles; Line 9  use Config::IniFiles;
9  use XML::Simple;  use XML::Simple;
10  use Template;  use Template;
11  use Log::Log4perl qw(get_logger :levels);  use Log::Log4perl qw(get_logger :levels);
12    use Time::HiRes qw(time);
13    
14  use Data::Dumper;  use Data::Dumper;
15    
# Line 29  This module implements methods used by W Line 30  This module implements methods used by W
30    
31  =head2 new  =head2 new
32    
33  This will create new instance of WebPAC using configuration specified by C<config_file>.  Create new instance of WebPAC using configuration specified by C<config_file>.
34    
35   my $webpac = new WebPAC(   my $webpac = new WebPAC(
36          config_file => 'name.conf',          config_file => 'name.conf',
37          [code_page => 'ISO-8859-2',]          code_page => 'ISO-8859-2',
38            low_mem => 1,
39   );   );
40    
41  Default C<code_page> is C<ISO-8859-2>.  Default C<code_page> is C<ISO-8859-2>.
42    
43  It will also read configuration files  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44    
45    This method will also read configuration files
46  C<global.conf> (used by indexer and Web font-end)  C<global.conf> (used by indexer and Web font-end)
47  and configuration file specified by C<config_file>  and configuration file specified by C<config_file>
48  which describes databases to be indexed.  which describes databases to be indexed.
# Line 59  sub new { Line 63  sub new {
63          my $self = {@_};          my $self = {@_};
64          bless($self, $class);          bless($self, $class);
65    
66            $self->{'start_t'} = time();
67    
68          my $log_file = $self->{'log'} || "log.conf";          my $log_file = $self->{'log'} || "log.conf";
69          Log::Log4perl->init($log_file);          Log::Log4perl->init($log_file);
70    
# Line 106  sub new { Line 112  sub new {
112                  EVAL_PERL => 1,                  EVAL_PERL => 1,
113          );          );
114    
115            # running with low_mem flag? well, use DBM::Deep then.
116            if ($self->{'low_mem'}) {
117                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118    
119                    my $db_file = "data.db";
120    
121                    if (-e $db_file) {
122                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123                            $log->debug("removed '$db_file' from last run");
124                    }
125    
126                    require DBM::Deep;
127    
128                    my $db = new DBM::Deep $db_file;
129    
130                    $log->logdie("DBM::Deep error: $!") unless ($db);
131    
132                    if ($db->error()) {
133                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134                    } else {
135                            $log->debug("using file '$db_file' for DBM::Deep");
136                    }
137    
138                    $self->{'db'} = $db;
139            }
140    
141          return $self;          return $self;
142  }  }
143    
# Line 116  Open CDS/ISIS database using OpenIsis mo Line 148  Open CDS/ISIS database using OpenIsis mo
148   $webpac->open_isis(   $webpac->open_isis(
149          filename => '/data/ISIS/ISIS',          filename => '/data/ISIS/ISIS',
150          code_page => '852',          code_page => '852',
151          limit_mfn => '500',          limit_mfn => 500,
152            start_mfn => 6000,
153          lookup => [ ... ],          lookup => [ ... ],
154   );   );
155    
156  By default, ISIS code page is assumed to be C<852>.  By default, ISIS code page is assumed to be C<852>.
157    
158    If optional parametar C<start_mfn> is set, this will be first MFN to read
159    from database (so you can skip beginning of your database if you need to).
160    
161  If optional parametar C<limit_mfn> is set, it will read just 500 records  If optional parametar C<limit_mfn> is set, it will read just 500 records
162  from database in example above.  from database in example above.
163    
# Line 149  sub open_isis { Line 185  sub open_isis {
185          $log->logcroak("need filename") if (! $arg->{'filename'});          $log->logcroak("need filename") if (! $arg->{'filename'});
186          my $code_page = $arg->{'code_page'} || '852';          my $code_page = $arg->{'code_page'} || '852';
187    
188            $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
189    
190            # store data in object
191            $self->{'isis_filename'} = $arg->{'filename'};
192            $self->{'isis_code_page'} = $code_page;
193    
194          use OpenIsis;          use OpenIsis;
195    
196          #$self->{'isis_code_page'} = $code_page;          #$self->{'isis_code_page'} = $code_page;
# Line 157  sub open_isis { Line 199  sub open_isis {
199          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
200    
201          $log->info("reading ISIS database '",$arg->{'filename'},"'");          $log->info("reading ISIS database '",$arg->{'filename'},"'");
202            $log->debug("isis code page: $code_page");
203    
204          my $isis_db = OpenIsis::open($arg->{'filename'});          my $isis_db = OpenIsis::open($arg->{'filename'});
205    
206          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
207            my $startmfn = 1;
208    
209          $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});          if (my $s = $self->{'start_mfn'}) {
210                    $log->info("skipping to MFN $s");
211                    $startmfn = $s;
212            }
213    
214            $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
215    
216          $log->info("processing $maxmfn records...");          $log->info("processing ",($maxmfn-$startmfn)." records...");
217    
218          # read database          # read database
219          for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {          for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
220    
221    
222                    $log->debug("mfn: $mfn\n");
223    
224                    my $rec;
225    
226                  # read record                  # read record
227                  my $row = OpenIsis::read( $isis_db, $mfn );                  my $row = OpenIsis::read( $isis_db, $mfn );
# Line 186  sub open_isis { Line 240  sub open_isis {
240                                                  $val = $l;                                                  $val = $l;
241                                          }                                          }
242    
243                                          push @{$self->{'data'}->{$mfn}->{$k}}, $val;                                          push @{$rec->{$k}}, $val;
244                                  }                                  }
245                          } else {                          } else {
246                                  push @{$self->{'data'}->{$mfn}->{'000'}}, $mfn;                                  push @{$rec->{'000'}}, $mfn;
247                          }                          }
248    
249                  }                  }
250    
251                    $log->confess("record $mfn empty?") unless ($rec);
252    
253                    # store
254                    if ($self->{'low_mem'}) {
255                            $self->{'db'}->put($mfn, $rec);
256                    } else {
257                            $self->{'data'}->{$mfn} = $rec;
258                    }
259    
260                  # create lookup                  # create lookup
                 my $rec = $self->{'data'}->{$mfn};  
261                  $self->create_lookup($rec, @{$arg->{'lookup'}});                  $self->create_lookup($rec, @{$arg->{'lookup'}});
262    
263                  $self->progress_bar($mfn,$maxmfn);                  $self->progress_bar($mfn,$maxmfn);
264    
265          }          }
266    
267          $self->{'current_mfn'} = 1;          $self->{'current_mfn'} = $startmfn;
268          $self->{'last_pcnt'} = 0;          $self->{'last_pcnt'} = 0;
269    
270            $log->debug("max mfn: $maxmfn");
271    
272          # store max mfn and return it.          # store max mfn and return it.
273          return $self->{'max_mfn'} = $maxmfn;          return $self->{'max_mfn'} = $maxmfn;
274  }  }
# Line 233  sub fetch_rec { Line 297  sub fetch_rec {
297    
298          $self->progress_bar($mfn,$self->{'max_mfn'});          $self->progress_bar($mfn,$self->{'max_mfn'});
299    
300          return $self->{'data'}->{$mfn};          if ($self->{'low_mem'}) {
301                    return $self->{'db'}->get($mfn);
302            } else {
303                    return $self->{'data'}->{$mfn};
304            }
305  }  }
306    
307  =head2 progress_bar  =head2 progress_bar
# Line 261  sub progress_bar { Line 329  sub progress_bar {
329    
330          $self->{'last_pcnt'} ||= 1;          $self->{'last_pcnt'} ||= 1;
331    
         $self->{'last_pcnt'} = $curr if ($curr < $self->{'last_pcnt'});  
   
332          my $p = int($curr * 100 / $max);          my $p = int($curr * 100 / $max);
333    
334            # reset on re-run
335            if ($p < $self->{'last_pcnt'}) {
336                    $self->{'last_pcnt'} = $p;
337                    $self->{'last_t'} = time();
338                    $self->{'last_curr'} = 1;
339            }
340    
341          if ($p != $self->{'last_pcnt'}) {          if ($p != $self->{'last_pcnt'}) {
342                  printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$curr,$max,"=" x ($p/2).">", $p );  
343                    my $last_curr = $self->{'last_curr'} || $curr;
344                    my $t = time();
345                    my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
346                    my $eta = ($max-$curr) / ($rate || 1);
347                    printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
348                  $self->{'last_pcnt'} = $p;                  $self->{'last_pcnt'} = $p;
349                    $self->{'last_t'} = time();
350                    $self->{'last_curr'} = $curr;
351          }          }
352            print STDERR "\n" if ($p == 100);
353    }
354    
355    =head2 fmt_time
356    
357    Format time (in seconds) for display.
358    
359     print $webpac->fmt_time(time());
360    
361    This method is called by L<progress_bar> to display remaining time.
362    
363    =cut
364    
365    sub fmt_time {
366            my $self = shift;
367    
368            my $t = shift || 0;
369            my $out = "";
370    
371            my ($ss,$mm,$hh) = gmtime($t);
372            $out .= "${hh}h" if ($hh);
373            $out .= sprintf("%02d:%02d", $mm,$ss);
374            $out .= "  " if ($hh == 0);
375            return $out;
376  }  }
377    
378  =head2 open_import_xml  =head2 open_import_xml
# Line 329  sub create_lookup { Line 434  sub create_lookup {
434          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
435    
436          foreach my $i (@_) {          foreach my $i (@_) {
437                  if ($i->{'eval'}) {                  $log->logconfess("need key") unless defined($i->{'key'});
438                          my $eval = $self->fill_in($rec,$i->{'eval'});                  $log->logconfess("need val") unless defined($i->{'val'});
439                          my $key = $self->fill_in($rec,$i->{'key'});  
440                          my @val = $self->fill_in($rec,$i->{'val'});                  if (defined($i->{'eval'})) {
441                          if ($key && @val && eval $eval) {                          # eval first, so we can skip fill_in for key and val
442                            my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
443                            if ($self->_eval($eval)) {
444                                    my $key = $self->fill_in($rec,$i->{'key'}) || next;
445                                    my @val = $self->fill_in($rec,$i->{'val'}) || next;
446                                  $log->debug("stored $key = ",sub { join(" | ",@val) });                                  $log->debug("stored $key = ",sub { join(" | ",@val) });
447                                  push @{$self->{'lookup'}->{$key}}, @val;                                  push @{$self->{'lookup'}->{$key}}, @val;
448                          }                          }
449                  } else {                  } else {
450                          my $key = $self->fill_in($rec,$i->{'key'});                          my $key = $self->fill_in($rec,$i->{'key'}) || next;
451                          my @val = $self->fill_in($rec,$i->{'val'});                          my @val = $self->fill_in($rec,$i->{'val'}) || next;
452                          if ($key && @val) {                          $log->debug("stored $key = ",sub { join(" | ",@val) });
453                                  $log->debug("stored $key = ",sub { join(" | ",@val) });                          push @{$self->{'lookup'}->{$key}}, @val;
                                 push @{$self->{'lookup'}->{$key}}, @val;  
                         }  
454                  }                  }
455          }          }
456  }  }
# Line 647  sub fill_in_to_arr { Line 754  sub fill_in_to_arr {
754          return @arr;          return @arr;
755  }  }
756    
757    =head2 sort_arr
758    
759    Sort array ignoring case and html in data
760    
761     my @sorted = $webpac->sort_arr(@unsorted);
762    
763    =cut
764    
765    sub sort_arr {
766            my $self = shift;
767    
768            my $log = $self->_get_logger();
769    
770            # FIXME add Schwartzian Transformation?
771    
772            my @sorted = sort {
773                    $a =~ s#<[^>]+/*>##;
774                    $b =~ s#<[^>]+/*>##;
775                    lc($b) cmp lc($a)
776            } @_;
777            $log->debug("sorted values: ",sub { join(", ",@sorted) });
778    
779            return @sorted;
780    }
781    
782    
783  =head2 data_structure  =head2 data_structure
784    
# Line 703  sub data_structure { Line 835  sub data_structure {
835                          }                          }
836                          next if (! @v);                          next if (! @v);
837    
838                            if ($tag->{'sort'}) {
839                                    @v = $self->sort_arr(@v);
840                                    $log->warn("sort within tag is usually not what you want!");
841                            }
842    
843                          # use format?                          # use format?
844                          if ($tag->{'format_name'}) {                          if ($tag->{'format_name'}) {
845                                  @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;                                  @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
# Line 717  sub data_structure { Line 854  sub data_structure {
854                                  next; # don't return headline in data_structure!                                  next; # don't return headline in data_structure!
855                          }                          }
856    
857                          # does tag have type?                          # delimiter will join repeatable fields
858                          if ($tag->{'type'}) {                          if ($tag->{'delimiter'}) {
859                                  push @{$row->{$tag->{'type'}}}, @v;                                  @v = ( join($tag->{'delimiter'}, @v) );
860                          } else {                          }
861                                  push @{$row->{'display'}}, @v;  
862                                  push @{$row->{'swish'}}, @v;                          # default types
863                            my @types = qw(display swish);
864                            # override by type attribute
865                            @types = ( $tag->{'type'} ) if ($tag->{'type'});
866    
867                            foreach my $type (@types) {
868                                    # append to previous line?
869                                    $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
870                                    if ($tag->{'append'}) {
871    
872                                            # I will delimit appended part with
873                                            # delimiter (or ,)
874                                            my $d = $tag->{'delimiter'};
875                                            # default delimiter
876                                            $d ||= ", ";
877    
878                                            my $last = pop @{$row->{$type}};
879                                            $d = "" if (! $last);
880                                            $last .= $d . join($d, @v);
881                                            push @{$row->{$type}}, $last;
882    
883                                    } else {
884                                            push @{$row->{$type}}, @v;
885                                    }
886                          }                          }
887    
888    
# Line 735  sub data_structure { Line 895  sub data_structure {
895                          my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};                          my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
896                          $row->{'name'} = $name ? $self->_x($name) : $field;                          $row->{'name'} = $name ? $self->_x($name) : $field;
897    
898                            # post-sort all values in field
899                            if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
900                                    $log->warn("sort at field tag not implemented");
901                            }
902    
903                          push @ds, $row;                          push @ds, $row;
904    
905                          $log->debug("row $field: ",sub { Dumper($row) });                          $log->debug("row $field: ",sub { Dumper($row) });
# Line 795  sub output_file { Line 960  sub output_file {
960    
961          my $log = $self->_get_logger();          my $log = $self->_get_logger();
962    
963          $log->logconfess("need file name") if (! $args->{'file'});          my $file = $args->{'file'} || $log->logconfess("need file name");
964    
965          $log->debug("creating file ",$args->{'file'});          $log->debug("creating file ",$file);
966    
967          open(my $fh, ">", $args->{'file'}) || $log->logdie("can't open output file '$self->{'file'}': $!");          open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
968          print $fh $self->output(          print $fh $self->output(
969                  template => $args->{'template'},                  template => $args->{'template'},
970                  data => $args->{'data'},                  data => $args->{'data'},
# Line 950  B<This is different from normal Log4perl Line 1115  B<This is different from normal Log4perl
1115  also use method names, and not only classes (which are just few)  also use method names, and not only classes (which are just few)
1116  to filter logging.  to filter logging.
1117    
1118    
1119    =head1 MEMORY USAGE
1120    
1121    C<low_mem> options is double-edged sword. If enabled, WebPAC
1122    will run on memory constraint machines (which doesn't have enough
1123    physical RAM to create memory structure for whole source database).
1124    
1125    If your machine has 512Mb or more of RAM and database is around 10000 records,
1126    memory shouldn't be an issue. If you don't have enough physical RAM, you
1127    might consider using virtual memory (if your operating system is handling it
1128    well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1129    parsed structure of ISIS database (this is what C<low_mem> option does).
1130    
1131    Hitting swap at end of reading source database is probably o.k. However,
1132    hitting swap before 90% will dramatically decrease performance and you will
1133    be better off with C<low_mem> and using rest of availble memory for
1134    operating system disk cache (Linux is particuallary good about this).
1135    However, every access to database record will require disk access, so
1136    generation phase will be slower 10-100 times.
1137    
1138    Parsed structures are essential - you just have option to trade RAM memory
1139    (which is fast) for disk space (which is slow). Be sure to have planty of
1140    disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1141    
1142    However, when WebPAC is running on desktop machines (or laptops :-), it's
1143    highly undesireable for system to start swapping. Using C<low_mem> option can
1144    reduce WecPAC memory usage to around 64Mb for same database with lookup
1145    fields and sorted indexes which stay in RAM. Performance will suffer, but
1146    memory usage will really be minimal. It might be also more confortable to
1147    run WebPAC reniced on those machines.
1148    
1149  =cut  =cut
1150    
1151  1;  1;

Legend:
Removed from v.411  
changed lines
  Added in v.441

  ViewVC Help
Powered by ViewVC 1.1.26