/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 421 by dpavlin, Fri Sep 10 22:24:42 2004 UTC revision 555 by dpavlin, Fri Oct 29 22:09:04 2004 UTC
# Line 9  use Config::IniFiles; Line 9  use Config::IniFiles;
9  use XML::Simple;  use XML::Simple;
10  use Template;  use Template;
11  use Log::Log4perl qw(get_logger :levels);  use Log::Log4perl qw(get_logger :levels);
12    use Time::HiRes qw(time);
13    
14  use Data::Dumper;  use Data::Dumper;
15    
# Line 33  Create new instance of WebPAC using conf Line 34  Create new instance of WebPAC using conf
34    
35   my $webpac = new WebPAC(   my $webpac = new WebPAC(
36          config_file => 'name.conf',          config_file => 'name.conf',
37          [code_page => 'ISO-8859-2',]          code_page => 'ISO-8859-2',
38          [low_mem => 1,]          low_mem => 1,
39            filter => {
40                    'lower' => sub { lc($_[0]) },
41            },
42   );   );
43    
44  Default C<code_page> is C<ISO-8859-2>.  Default C<code_page> is C<ISO-8859-2>.
45    
46    Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
47    
48    There is optinal parametar C<filter> which specify different filters which
49    can be applied using C<filter{name}> notation.
50    
51  This method will also read configuration files  This method will also read configuration files
52  C<global.conf> (used by indexer and Web font-end)  C<global.conf> (used by indexer and Web font-end)
53  and configuration file specified by C<config_file>  and configuration file specified by C<config_file>
54  which describes databases to be indexed.  which describes databases to be indexed.
55    
 C<low_mem> options is double-edged sword. If enabled, WebPAC  
 will run on memory constraint machines (which doesn't have enough  
 physical RAM to create memory structure for whole ISIS database).  
   
 If your machine has 512Mb or more and database is around 10000 records,  
 memory shouldn't be an issue. If you don't have enough physical RAM, you  
 might consider using virtual memory (if your operating system is handling it  
 well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle  
 parsed structure of ISIS database.  
   
 However, when WebPAC is running on desktop machines (or laptops :-), it's  
 highly undesireable for system to start swapping. Using C<low_mem> option can  
 reduce WecPAC memory usage to 16Mb for same database with lookup fields and  
 sorted indexes which stay in RAM. Performance will suffer, but memory usage  
 will really be minimal. It might be also more confortable to run WebPAC reniced  
 on those machines.  
   
56  =cut  =cut
57    
58  # mapping between data type and tag which specify  # mapping between data type and tag which specify
# Line 77  sub new { Line 69  sub new {
69          my $self = {@_};          my $self = {@_};
70          bless($self, $class);          bless($self, $class);
71    
72            $self->{'start_t'} = time();
73    
74          my $log_file = $self->{'log'} || "log.conf";          my $log_file = $self->{'log'} || "log.conf";
75          Log::Log4perl->init($log_file);          Log::Log4perl->init($log_file);
76    
# Line 126  sub new { Line 120  sub new {
120    
121          # running with low_mem flag? well, use DBM::Deep then.          # running with low_mem flag? well, use DBM::Deep then.
122          if ($self->{'low_mem'}) {          if ($self->{'low_mem'}) {
123                  $log->info("running with low_mem which impacts performance (<64 Mb memory usage)");                  $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
124    
125                  my $db_file = "data.db";                  my $db_file = "data.db";
126    
# Line 135  sub new { Line 129  sub new {
129                          $log->debug("removed '$db_file' from last run");                          $log->debug("removed '$db_file' from last run");
130                  }                  }
131    
132                  use DBM::Deep;                  require DBM::Deep;
133    
134                  my $db = new DBM::Deep $db_file;                  my $db = new DBM::Deep $db_file;
135    
# Line 144  sub new { Line 138  sub new {
138                  if ($db->error()) {                  if ($db->error()) {
139                          $log->logdie("can't open '$db_file' under low_mem: ",$db->error());                          $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
140                  } else {                  } else {
141                          $log->debug("using file $db_file for DBM::Deep");                          $log->debug("using file '$db_file' for DBM::Deep");
142                  }                  }
143    
144                  $self->{'db'} = $db;                  $self->{'db'} = $db;
145          }          }
146    
147            $log->debug("filters defined: ",Dumper($self->{'filter'}));
148    
149          return $self;          return $self;
150  }  }
151    
# Line 160  Open CDS/ISIS database using OpenIsis mo Line 156  Open CDS/ISIS database using OpenIsis mo
156   $webpac->open_isis(   $webpac->open_isis(
157          filename => '/data/ISIS/ISIS',          filename => '/data/ISIS/ISIS',
158          code_page => '852',          code_page => '852',
159          limit_mfn => '500',          limit_mfn => 500,
160            start_mfn => 6000,
161          lookup => [ ... ],          lookup => [ ... ],
162   );   );
163    
164  By default, ISIS code page is assumed to be C<852>.  By default, ISIS code page is assumed to be C<852>.
165    
166    If optional parametar C<start_mfn> is set, this will be first MFN to read
167    from database (so you can skip beginning of your database if you need to).
168    
169  If optional parametar C<limit_mfn> is set, it will read just 500 records  If optional parametar C<limit_mfn> is set, it will read just 500 records
170  from database in example above.  from database in example above.
171    
# Line 193  sub open_isis { Line 193  sub open_isis {
193          $log->logcroak("need filename") if (! $arg->{'filename'});          $log->logcroak("need filename") if (! $arg->{'filename'});
194          my $code_page = $arg->{'code_page'} || '852';          my $code_page = $arg->{'code_page'} || '852';
195    
196            $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
197    
198          # store data in object          # store data in object
199          $self->{'isis_filename'} = $arg->{'filename'};          $self->{'isis_filename'} = $arg->{'filename'};
200          $self->{'isis_code_page'} = $code_page;          $self->{'isis_code_page'} = $code_page;
# Line 210  sub open_isis { Line 212  sub open_isis {
212          my $isis_db = OpenIsis::open($arg->{'filename'});          my $isis_db = OpenIsis::open($arg->{'filename'});
213    
214          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
215            my $startmfn = 1;
216    
217          $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});          if (my $s = $self->{'start_mfn'}) {
218                    $log->info("skipping to MFN $s");
219                    $startmfn = $s;
220            } else {
221                    $self->{'start_mfn'} = $startmfn;
222            }
223    
224          $log->info("processing $maxmfn records...");          $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
225    
226            $log->info("processing ",($maxmfn-$startmfn)." records...");
227    
228          # read database          # read database
229          for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {          for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
230    
231    
232                  $log->debug("mfn: $mfn\n");                  $log->debug("mfn: $mfn\n");
# Line 264  sub open_isis { Line 274  sub open_isis {
274    
275          }          }
276    
277          $self->{'current_mfn'} = 1;          $self->{'current_mfn'} = -1;
278          $self->{'last_pcnt'} = 0;          $self->{'last_pcnt'} = 0;
279    
280          $log->debug("max mfn: $maxmfn");          $log->debug("max mfn: $maxmfn");
# Line 287  sub fetch_rec { Line 297  sub fetch_rec {
297    
298          my $log = $self->_get_logger();          my $log = $self->_get_logger();
299    
300          my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");          $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
301    
302            if ($self->{'current_mfn'} == -1) {
303                    $self->{'current_mfn'} = $self->{'start_mfn'};
304            } else {
305                    $self->{'current_mfn'}++;
306            }
307    
308            my $mfn = $self->{'current_mfn'};
309    
310          if ($mfn > $self->{'max_mfn'}) {          if ($mfn > $self->{'max_mfn'}) {
311                  $self->{'current_mfn'} = $self->{'max_mfn'};                  $self->{'current_mfn'} = $self->{'max_mfn'};
# Line 304  sub fetch_rec { Line 322  sub fetch_rec {
322          }          }
323  }  }
324    
325    =head2 mfn
326    
327    Returns current record number (MFN).
328    
329     print $webpac->mfn;
330    
331    =cut
332    
333    sub mfn {
334            my $self = shift;
335            return $self->{'current_mfn'};
336    }
337    
338  =head2 progress_bar  =head2 progress_bar
339    
340  Draw progress bar on STDERR.  Draw progress bar on STDERR.
# Line 329  sub progress_bar { Line 360  sub progress_bar {
360    
361          $self->{'last_pcnt'} ||= 1;          $self->{'last_pcnt'} ||= 1;
362    
363          $self->{'last_pcnt'} = $curr if ($curr < $self->{'last_pcnt'});          my $p = int($curr * 100 / $max) || 1;
364    
365            # reset on re-run
366            if ($p < $self->{'last_pcnt'}) {
367                    $self->{'last_pcnt'} = $p;
368                    $self->{'last_t'} = time();
369                    $self->{'last_curr'} = undef;
370            }
371    
372            $self->{'last_t'} ||= time();
373    
         my $p = int($curr * 100 / $max);  
374          if ($p != $self->{'last_pcnt'}) {          if ($p != $self->{'last_pcnt'}) {
375                  printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$curr,$max,"=" x ($p/2).">", $p );  
376                    my $last_curr = $self->{'last_curr'} || $curr;
377                    my $t = time();
378                    my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
379                    my $eta = ($max-$curr) / ($rate || 1);
380                    printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
381                  $self->{'last_pcnt'} = $p;                  $self->{'last_pcnt'} = $p;
382                    $self->{'last_t'} = time();
383                    $self->{'last_curr'} = $curr;
384          }          }
385          print STDERR "\n" if ($p == 100);          print STDERR "\n" if ($p == 100);
386  }  }
387    
388    =head2 fmt_time
389    
390    Format time (in seconds) for display.
391    
392     print $webpac->fmt_time(time());
393    
394    This method is called by L<progress_bar> to display remaining time.
395    
396    =cut
397    
398    sub fmt_time {
399            my $self = shift;
400    
401            my $t = shift || 0;
402            my $out = "";
403    
404            my ($ss,$mm,$hh) = gmtime($t);
405            $out .= "${hh}h" if ($hh);
406            $out .= sprintf("%02d:%02d", $mm,$ss);
407            $out .= "  " if ($hh == 0);
408            return $out;
409    }
410    
411  =head2 open_import_xml  =head2 open_import_xml
412    
413  Read file from C<import_xml/> directory and parse it.  Read file from C<import_xml/> directory and parse it.
# Line 501  sub fill_in { Line 570  sub fill_in {
570          # iteration (for repeatable fields)          # iteration (for repeatable fields)
571          my $i = shift || 0;          my $i = shift || 0;
572    
573            $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
574    
575          # FIXME remove for speedup?          # FIXME remove for speedup?
576          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
577    
# Line 514  sub fill_in { Line 585  sub fill_in {
585          # remove eval{...} from beginning          # remove eval{...} from beginning
586          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
587    
588            my $filter_name;
589            # remove filter{...} from beginning
590            $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
591    
592          # do actual replacement of placeholders          # do actual replacement of placeholders
593            # repeatable fields
594          $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;          $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
595            # non-repeatable fields
596            $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
597    
598          if ($found) {          if ($found) {
599                  $log->debug("format: $format");                  $log->debug("format: $format");
# Line 523  sub fill_in { Line 601  sub fill_in {
601                          my $eval = $self->fill_in($rec,$eval_code,$i);                          my $eval = $self->fill_in($rec,$eval_code,$i);
602                          return if (! $self->_eval($eval));                          return if (! $self->_eval($eval));
603                  }                  }
604                    if ($filter_name && $self->{'filter'}->{$filter_name}) {
605                            $log->debug("filter '$filter_name' for $format");
606                            $format = $self->{'filter'}->{$filter_name}->($format);
607                            return unless(defined($format));
608                            $log->debug("filter result: $format");
609                    }
610                  # do we have lookups?                  # do we have lookups?
611                  if ($format =~ /$LOOKUP_REGEX/o) {                  if ($format =~ /$LOOKUP_REGEX/o) {
612                          $log->debug("format '$format' has lookup");                          $log->debug("format '$format' has lookup");
# Line 615  sub parse { Line 699  sub parse {
699          # remove eval{...} from beginning          # remove eval{...} from beginning
700          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
701    
702            my $filter_name;
703            # remove filter{...} from beginning
704            $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
705    
706          my $prefix;          my $prefix;
707          my $all_found=0;          my $all_found=0;
708    
709          while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {          while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
710    
711                  my $del = $1 || '';                  my $del = $1 || '';
712                  $prefix ||= $del if ($all_found == 0);                  $prefix ||= $del if ($all_found == 0);
713    
714                    # repeatable index
715                    my $r = $i;
716                    $r = 0 if (lc("$2") eq 's');
717    
718                  my $found = 0;                  my $found = 0;
719                  my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);                  my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
720    
721                  if ($found) {                  if ($found) {
722                          push @out, $del;                          push @out, $del;
# Line 648  sub parse { Line 740  sub parse {
740          }          }
741    
742          if ($eval_code) {          if ($eval_code) {
743                  my $eval = $self->fill_in($rec,$eval_code,$i);                  my $eval = $self->fill_in($rec,$eval_code,$i) || return;
744                  $log->debug("about to eval{",$eval,"} format: $out");                  $log->debug("about to eval{$eval} format: $out");
745                  return if (! $self->_eval($eval));                  return if (! $self->_eval($eval));
746          }          }
747            
748            if ($filter_name && $self->{'filter'}->{$filter_name}) {
749                    $log->debug("about to filter{$filter_name} format: $out");
750                    $out = $self->{'filter'}->{$filter_name}->($out);
751                    return unless(defined($out));
752                    $log->debug("filter result: $out");
753            }
754    
755          return $out;          return $out;
756  }  }
# Line 718  sub fill_in_to_arr { Line 817  sub fill_in_to_arr {
817          return @arr;          return @arr;
818  }  }
819    
820    =head2 sort_arr
821    
822    Sort array ignoring case and html in data
823    
824     my @sorted = $webpac->sort_arr(@unsorted);
825    
826    =cut
827    
828    sub sort_arr {
829            my $self = shift;
830    
831            my $log = $self->_get_logger();
832    
833            # FIXME add Schwartzian Transformation?
834    
835            my @sorted = sort {
836                    $a =~ s#<[^>]+/*>##;
837                    $b =~ s#<[^>]+/*>##;
838                    lc($b) cmp lc($a)
839            } @_;
840            $log->debug("sorted values: ",sub { join(", ",@sorted) });
841    
842            return @sorted;
843    }
844    
845    
846  =head2 data_structure  =head2 data_structure
847    
# Line 774  sub data_structure { Line 898  sub data_structure {
898                          }                          }
899                          next if (! @v);                          next if (! @v);
900    
901                            if ($tag->{'sort'}) {
902                                    @v = $self->sort_arr(@v);
903                                    $log->warn("sort within tag is usually not what you want!");
904                            }
905    
906                          # use format?                          # use format?
907                          if ($tag->{'format_name'}) {                          if ($tag->{'format_name'}) {
908                                  @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;                                  @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
# Line 788  sub data_structure { Line 917  sub data_structure {
917                                  next; # don't return headline in data_structure!                                  next; # don't return headline in data_structure!
918                          }                          }
919    
920                          # does tag have type?                          # delimiter will join repeatable fields
921                          if ($tag->{'type'}) {                          if ($tag->{'delimiter'}) {
922                                  push @{$row->{$tag->{'type'}}}, @v;                                  @v = ( join($tag->{'delimiter'}, @v) );
923                          } else {                          }
924                                  push @{$row->{'display'}}, @v;  
925                                  push @{$row->{'swish'}}, @v;                          # default types
926                            my @types = qw(display swish);
927                            # override by type attribute
928                            @types = ( $tag->{'type'} ) if ($tag->{'type'});
929    
930                            foreach my $type (@types) {
931                                    # append to previous line?
932                                    $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
933                                    if ($tag->{'append'}) {
934    
935                                            # I will delimit appended part with
936                                            # delimiter (or ,)
937                                            my $d = $tag->{'delimiter'};
938                                            # default delimiter
939                                            $d ||= " ";
940    
941                                            my $last = pop @{$row->{$type}};
942                                            $d = "" if (! $last);
943                                            $last .= $d . join($d, @v);
944                                            push @{$row->{$type}}, $last;
945    
946                                    } else {
947                                            push @{$row->{$type}}, @v;
948                                    }
949                          }                          }
950    
951    
# Line 806  sub data_structure { Line 958  sub data_structure {
958                          my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};                          my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
959                          $row->{'name'} = $name ? $self->_x($name) : $field;                          $row->{'name'} = $name ? $self->_x($name) : $field;
960    
961                            # post-sort all values in field
962                            if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
963                                    $log->warn("sort at field tag not implemented");
964                            }
965    
966                          push @ds, $row;                          push @ds, $row;
967    
968                          $log->debug("row $field: ",sub { Dumper($row) });                          $log->debug("row $field: ",sub { Dumper($row) });
# Line 951  sub _eval { Line 1108  sub _eval {
1108    
1109          $log->debug("eval: ",$code," [",$ret,"]");          $log->debug("eval: ",$code," [",$ret,"]");
1110    
1111          return $ret || 0;          return $ret || undef;
1112  }  }
1113    
1114  =head2 _sort_by_order  =head2 _sort_by_order
# Line 1021  B<This is different from normal Log4perl Line 1178  B<This is different from normal Log4perl
1178  also use method names, and not only classes (which are just few)  also use method names, and not only classes (which are just few)
1179  to filter logging.  to filter logging.
1180    
1181    
1182    =head1 MEMORY USAGE
1183    
1184    C<low_mem> options is double-edged sword. If enabled, WebPAC
1185    will run on memory constraint machines (which doesn't have enough
1186    physical RAM to create memory structure for whole source database).
1187    
1188    If your machine has 512Mb or more of RAM and database is around 10000 records,
1189    memory shouldn't be an issue. If you don't have enough physical RAM, you
1190    might consider using virtual memory (if your operating system is handling it
1191    well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1192    parsed structure of ISIS database (this is what C<low_mem> option does).
1193    
1194    Hitting swap at end of reading source database is probably o.k. However,
1195    hitting swap before 90% will dramatically decrease performance and you will
1196    be better off with C<low_mem> and using rest of availble memory for
1197    operating system disk cache (Linux is particuallary good about this).
1198    However, every access to database record will require disk access, so
1199    generation phase will be slower 10-100 times.
1200    
1201    Parsed structures are essential - you just have option to trade RAM memory
1202    (which is fast) for disk space (which is slow). Be sure to have planty of
1203    disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1204    
1205    However, when WebPAC is running on desktop machines (or laptops :-), it's
1206    highly undesireable for system to start swapping. Using C<low_mem> option can
1207    reduce WecPAC memory usage to around 64Mb for same database with lookup
1208    fields and sorted indexes which stay in RAM. Performance will suffer, but
1209    memory usage will really be minimal. It might be also more confortable to
1210    run WebPAC reniced on those machines.
1211    
1212  =cut  =cut
1213    
1214  1;  1;

Legend:
Removed from v.421  
changed lines
  Added in v.555

  ViewVC Help
Powered by ViewVC 1.1.26