/[webpac]/trunk2/lib/WebPAC.pm

This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk2/lib/WebPAC.pm

Parent Directory | Revision Log | View Patch Patch

-revision 372 by dpavlin,
Sat Jun 19 18:16:20 2004 UTC
+revision 421 by dpavlin,
Fri Sep 10 22:24:42 2004 UTC
 Line 12 
 use Log::Log4perl qw(get_logger :levels)
  use Data::Dumper;
+ #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
+ #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
+ my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
+ my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
  =head1 NAME
  WebPAC - base class for WebPAC
-Line 24 
 This module implements methods used by W
+Line 29 
 This module implements methods used by W
  =head2 new
- This will create new instance of WebPAC using configuration specified by C<config_file>.
+ Create new instance of WebPAC using configuration specified by C<config_file>.
   my $webpac = new WebPAC(
          config_file => 'name.conf',
          [code_page => 'ISO-8859-2',]
+         [low_mem => 1,]
   );
  Default C<code_page> is C<ISO-8859-2>.
- It will also read configuration files
+ This method will also read configuration files
  C<global.conf> (used by indexer and Web front-end)
  and configuration file specified by C<config_file>
  which describes databases to be indexed.
+ The C<low_mem> option is a double-edged sword. If enabled, WebPAC
+ will run on memory-constrained machines (which don't have enough
+ physical RAM to create memory structure for whole ISIS database).
+ If your machine has 512Mb or more and database is around 10000 records,
+ memory shouldn't be an issue. If you don't have enough physical RAM, you
+ might consider using virtual memory (if your operating system is handling it
+ well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
+ parsed structure of ISIS database.
+ However, when WebPAC is running on desktop machines (or laptops :-), it's
+ highly undesirable for system to start swapping. Using C<low_mem> option can
+ reduce WebPAC memory usage to 16Mb for same database with lookup fields and
+ sorted indexes which stay in RAM. Performance will suffer, but memory usage
+ will really be minimal. It might be also more comfortable to run WebPAC reniced
+ on those machines.
  =cut
  # mapping between data type and tag which specify
-Line 101 
 sub new {
+Line 124 
 sub new {
                  EVAL_PERL => 1,
          );
+         # running with low_mem flag? well, use DBM::Deep then.
+         if ($self->{'low_mem'}) {
+                 $log->info("running with low_mem which impacts performance (<64 Mb memory usage)");
+                 my $db_file = "data.db";
+                 if (-e $db_file) {
+                         unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
+                         $log->debug("removed '$db_file' from last run");
+                 }
+                 use DBM::Deep;
+                 my $db = new DBM::Deep $db_file;
+                 $log->logdie("DBM::Deep error: $!") unless ($db);
+                 if ($db->error()) {
+                         $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
+                 } else {
+                         $log->debug("using file $db_file for DBM::Deep");
+                 }
+                 $self->{'db'} = $db;
+         }
          return $self;
  }
-Line 144 
 sub open_isis {
+Line 193 
 sub open_isis {
          $log->logcroak("need filename") if (! $arg->{'filename'});
          my $code_page = $arg->{'code_page'} || '852';
+         # store data in object
+         $self->{'isis_filename'} = $arg->{'filename'};
+         $self->{'isis_code_page'} = $code_page;
          use OpenIsis;
          #$self->{'isis_code_page'} = $code_page;
-Line 152 
 sub open_isis {
+Line 205 
 sub open_isis {
          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
          $log->info("reading ISIS database '",$arg->{'filename'},"'");
+         $log->debug("isis code page: $code_page");
          my $isis_db = OpenIsis::open($arg->{'filename'});
-Line 164 
 sub open_isis {
+Line 218 
 sub open_isis {
          # read database
          for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
+                 $log->debug("mfn: $mfn\n");
+                 my $rec;
                  # read record
                  my $row = OpenIsis::read( $isis_db, $mfn );
                  foreach my $k (keys %{$row}) {
-Line 181 
 sub open_isis {
+Line 240 
 sub open_isis {
                                                  $val = $l;
                                          }
-                                         push @{$self->{'data'}->{$mfn}->{$k}}, $val;
+                                         push @{$rec->{$k}}, $val;
                                  }
+                         } else {
+                                 push @{$rec->{'000'}}, $mfn;
                          }
                  }
+                 $log->confess("record $mfn empty?") unless ($rec);
+                 # store
+                 if ($self->{'low_mem'}) {
+                         $self->{'db'}->put($mfn, $rec);
+                 } else {
+                         $self->{'data'}->{$mfn} = $rec;
+                 }
                  # create lookup
-                 my $rec = $self->{'data'}->{$mfn};
                  $self->create_lookup($rec, @{$arg->{'lookup'}});
+                 $self->progress_bar($mfn,$maxmfn);
          }
          $self->{'current_mfn'} = 1;
+         $self->{'last_pcnt'} = 0;
+         $log->debug("max mfn: $maxmfn");
          # store max mfn and return it.
          return $self->{'max_mfn'} = $maxmfn;
-Line 217 
 sub fetch_rec {
+Line 291 
 sub fetch_rec {
          if ($mfn > $self->{'max_mfn'}) {
                  $self->{'current_mfn'} = $self->{'max_mfn'};
+                 $log->debug("at EOF");
                  return;
          }
-         return $self->{'data'}->{$mfn};
+         $self->progress_bar($mfn,$self->{'max_mfn'});
+         if ($self->{'low_mem'}) {
+                 return $self->{'db'}->get($mfn);
+         } else {
+                 return $self->{'data'}->{$mfn};
+         }
+ }
+ =head2 progress_bar
+ Draw progress bar on STDERR.
+  $webpac->progress_bar($current, $max);
+ =cut
+ sub progress_bar {
+         my ($self, $curr, $max) = @_;
+         my $log = $self->_get_logger();
+         # both values are mandatory (and must be non-zero)
+         unless ($curr) {
+                 $log->logconfess("no current value!");
+         }
+         unless ($max) {
+                 $log->logconfess("no maximum value!");
+         }
+         # current value past the maximum: stretch the maximum to match
+         if ($curr > $max) {
+                 $max = $curr;
+                 $log->debug("overflow to $curr");
+         }
+         # last_pcnt remembers the percentage drawn by the previous call
+         $self->{'last_pcnt'} = 1 unless ($self->{'last_pcnt'});
+         if ($curr < $self->{'last_pcnt'}) {
+                 $self->{'last_pcnt'} = $curr;
+         }
+         my $p = int($curr * 100 / $max);
+         # redraw only when the integer percentage has changed
+         unless ($p == $self->{'last_pcnt'}) {
+                 my $bar = ("=" x ($p/2)) . ">";
+                 printf STDERR ("%5d / %5d [%-51s] %-2d %% \r", $curr, $max, $bar, $p);
+                 $self->{'last_pcnt'} = $p;
+         }
+         # finish the line once 100% is reached
+         $p == 100 and print STDERR "\n";
  }
  =head2 open_import_xml
-Line 246 
 sub open_import_xml {
+Line 362 
 sub open_import_xml {
          $self->{'tag'} = $type2tag{$type_base};
-         $log->debug("using type '",$self->{'type'},"' tag <",$self->{'tag'},">") if ($self->{'debug'});
+         $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
          my $f = "./import_xml/".$self->{'type'}.".xml";
          $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
-         $log->debug("reading '$f'") if ($self->{'debug'});
+         $log->info("reading '$f'");
+         $self->{'import_xml_file'} = $f;
          $self->{'import_xml'} = XMLin($f,
                  ForceArray => [ $self->{'tag'}, 'config', 'format' ],
-                 ForceContent => 1
          );
+         $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
  }
  =head2 create_lookup
-Line 279 
 sub create_lookup {
+Line 398 
 sub create_lookup {
          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
          foreach my $i (@_) {
-                 if ($i->{'eval'}) {
+                 $log->logconfess("need key") unless defined($i->{'key'});
-                         my $eval = $self->fill_in($rec,$i->{'eval'});
+                 $log->logconfess("need val") unless defined($i->{'val'});
-                         my $key = $self->fill_in($rec,$i->{'key'});
-                         my @val = $self->fill_in($rec,$i->{'val'});
+                 if (defined($i->{'eval'})) {
-                         if ($key && @val && eval $eval) {
+                         # eval first, so we can skip fill_in for key and val
+                         my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
+                         if ($self->_eval($eval)) {
+                                 my $key = $self->fill_in($rec,$i->{'key'}) || next;
+                                 my @val = $self->fill_in($rec,$i->{'val'}) || next;
+                                 $log->debug("stored $key = ",sub { join(" | ",@val) });
                                  push @{$self->{'lookup'}->{$key}}, @val;
                          }
                  } else {
-                         my $key = $self->fill_in($rec,$i->{'key'});
+                         my $key = $self->fill_in($rec,$i->{'key'}) || next;
-                         my @val = $self->fill_in($rec,$i->{'val'});
+                         my @val = $self->fill_in($rec,$i->{'val'}) || next;
-                         if ($key && @val) {
+                         $log->debug("stored $key = ",sub { join(" | ",@val) });
-                                 push @{$self->{'lookup'}->{$key}}, @val;
+                         push @{$self->{'lookup'}->{$key}}, @val;
-                         }
                  }
          }
  }
-Line 322 
 sub get_data {
+Line 445 
 sub get_data {
          if ($$rec->{$f}) {
                  return '' if (! $$rec->{$f}->[$i]);
+                 no strict 'refs';
                  if ($sf && $$rec->{$f}->[$i]->{$sf}) {
                          $$found++ if (defined($$found));
                          return $$rec->{$f}->[$i]->{$sf};
-Line 362 
 Following example will read second value
+Line 486 
 Following example will read second value
  This function B<does not> perform parsing of format to intelligently skip
  delimiters before fields which aren't used.
+ This method will automatically decode UTF-8 string to local code page
+ if needed.
  =cut
  sub fill_in {
-Line 377 
 sub fill_in {
+Line 504 
 sub fill_in {
          # FIXME remove for speedup?
          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+         if (utf8::is_utf8($format)) {
+                 $format = $self->_x($format);
+         }
          my $found = 0;
          my $eval_code;
-Line 384 
 sub fill_in {
+Line 515 
 sub fill_in {
          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
          # do actual replacement of placeholders
-         $format =~ s/v(\d+)(?:\^(\w))*/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
+         $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
          if ($found) {
+                 $log->debug("format: $format");
                  if ($eval_code) {
                          my $eval = $self->fill_in($rec,$eval_code,$i);
                          return if (! $self->_eval($eval));
                  }
                  # do we have lookups?
-                 $log->debug("test format '$format' for lookups");
+                 if ($format =~ /$LOOKUP_REGEX/o) {
-                 if ($format =~ /\[[^\[\]]+\]/o) {
+                         $log->debug("format '$format' has lookup");
                          return $self->lookup($format);
                  } else {
                          return $format;
-Line 420 
 sub lookup {
+Line 552 
 sub lookup {
          my $tmp = shift || $log->logconfess("need format");
-         if ($tmp =~ /\[[^\[\]]+\]/o) {
+         if ($tmp =~ /$LOOKUP_REGEX/o) {
                  my @in = ( $tmp );
                  $log->debug("lookup for: ",$tmp);
                  my @out;
                  while (my $f = shift @in) {
-                         if ($f =~ /\[([^\[\]]+)\]/) {
+                         if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
                                  my $k = $1;
                                  if ($self->{'lookup'}->{$k}) {
                                          foreach my $nv (@{$self->{'lookup'}->{$k}}) {
                                                  my $tmp2 = $f;
-                                                 $tmp2 =~ s/\[$k\]/$nv/g;
+                                                 $tmp2 =~ s/lookup{$k}/$nv/g;
                                                  push @in, $tmp2;
                                          }
                                  } else {
-Line 442 
 sub lookup {
+Line 574 
 sub lookup {
                                  push @out, $f;
                          }
                  }
+                 $log->logconfess("return is array and it's not expected!") unless wantarray;
                  return @out;
          } else {
                  return $tmp;
-Line 472 
 sub parse {
+Line 605 
 sub parse {
          $i = 0 if (! $i);
-         my $format = $self->{'utf2cp'}->convert($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
+         my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
          my @out;
+         $log->debug("format: $format");
          my $eval_code;
          # remove eval{...} from beginning
          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
-Line 483 
 sub parse {
+Line 618 
 sub parse {
          my $prefix;
          my $all_found=0;
-         while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))*//s) {
+         while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
                  my $del = $1 || '';
                  $prefix ||= $del if ($all_found == 0);
-Line 500 
 sub parse {
+Line 635 
 sub parse {
          return if (! $all_found);
-         my $out = join('',@out) . $format;
+         my $out = join('',@out);
-         # add prefix if not there
+         if ($out) {
-         $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
+                 # add rest of format (suffix)
+                 $out .= $format;
+                 # add prefix if not there
+                 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
+                 $log->debug("result: $out");
+         }
          if ($eval_code) {
                  my $eval = $self->fill_in($rec,$eval_code,$i);
-                 $log->debug("about to eval ",$eval," [$out]");
+                 $log->debug("about to eval{",$eval,"} format: $out");
                  return if (! $self->_eval($eval));
          }
-Line 539 
 sub parse_to_arr {
+Line 681 
 sub parse_to_arr {
                  push @arr, $v;
          }
+         $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
          return @arr;
  }
+ =head2 fill_in_to_arr
+ Similar to C<fill_in>, but returns array of all repeatable fields. Usable
+ for fields which have lookups, so they shouldn't be parsed but rather
+ C<fill_in>ed.
+  my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
+ =cut
+ sub fill_in_to_arr {
+         my ($self, $rec, $format_utf8) = @_;
+         my $log = $self->_get_logger();
+         if ($rec !~ /HASH/o) {
+                 $log->logconfess("need HASH as first argument!");
+         }
+         # nothing to do without a format
+         return unless ($format_utf8);
+         my @arr;
+         my $calls = 0;
+         # call fill_in with increasing index until it returns an empty list
+         for (;;) {
+                 my @found = $self->fill_in($rec,$format_utf8,$calls);
+                 $calls++;
+                 last unless (@found);
+                 push @arr, @found;
+         }
+         if (@arr) {
+                 # the last call returned nothing, so don't count it
+                 my $successful = $calls - 1;
+                 $log->debug("format '$format_utf8' returned ",$successful," elements: ", sub { join(" | ",@arr) });
+         }
+         return @arr;
+ }
  =head2 data_structure
  Create in-memory data structure which represents layout from C<import_xml>.
-Line 549 
 It is used later to produce output.
+Line 726 
 It is used later to produce output.
   my @ds = $webpac->data_structure($rec);
+ This method will also set C<$webpac->{'current_filename'}> if there is
+ <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
+ <headline> tag.
  =cut
  sub data_structure {
-Line 559 
 sub data_structure {
+Line 740 
 sub data_structure {
          my $rec = shift;
          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+         undef $self->{'currnet_filename'};
+         undef $self->{'headline'};
          my @sorted_tags;
          if ($self->{tags_by_order}) {
                  @sorted_tags = @{$self->{tags_by_order}};
-Line 569 
 sub data_structure {
+Line 753 
 sub data_structure {
          my @ds;
+         $log->debug("tags: ",sub { join(", ",@sorted_tags) });
          foreach my $field (@sorted_tags) {
                  my $row;
-Line 576 
 sub data_structure {
+Line 762 
 sub data_structure {
  #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
                  foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
-                         my @v = $self->parse_to_arr($rec,$tag->{'content'});
+                         my $format = $tag->{'value'} || $tag->{'content'};
+                         $log->debug("format: $format");
+                         my @v;
+                         if ($format =~ /$LOOKUP_REGEX/o) {
+                                 @v = $self->fill_in_to_arr($rec,$format);
+                         } else {
+                                 @v = $self->parse_to_arr($rec,$format);
+                         }
                          next if (! @v);
+                         # use format?
+                         if ($tag->{'format_name'}) {
+                                 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
+                         }
+                         if ($field eq 'filename') {
+                                 $self->{'current_filename'} = join('',@v);
+                                 $log->debug("filename: ",$self->{'current_filename'});
+                         } elsif ($field eq 'headline') {
+                                 $self->{'headline'} .= join('',@v);
+                                 $log->debug("headline: ",$self->{'headline'});
+                                 next; # don't return headline in data_structure!
+                         }
                          # does tag have type?
                          if ($tag->{'type'}) {
                                  push @{$row->{$tag->{'type'}}}, @v;
-Line 587 
 sub data_structure {
+Line 795 
 sub data_structure {
                                  push @{$row->{'display'}}, @v;
                                  push @{$row->{'swish'}}, @v;
                          }
                  }
                  if ($row) {
                          $row->{'tag'} = $field;
+                         # TODO: name_sigular, name_plural
+                         my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
+                         $row->{'name'} = $name ? $self->_x($name) : $field;
                          push @ds, $row;
+                         $log->debug("row $field: ",sub { Dumper($row) });
                  }
          }
-Line 629 
 sub output {
+Line 846 
 sub output {
          return $out;
  }
+ =head2 output_file
+ Create output from in-memory data structure using Template Toolkit template
+ to a file.
+  $webpac->output_file(
+         file => 'out.txt',
+         template => 'text.tt',
+         data => @ds
+  );
+ =cut
+ sub output_file {
+         # Render in-memory data through a template (via output) and write
+         # the result to a file.
+         #
+         # Named arguments:
+         #   file     - path of file to create (required, dies without it)
+         #   template - passed through to output
+         #   data     - passed through to output
+         #
+         # Dies (through the logger) if file can't be opened, if template
+         # produced no output, or if writing or closing the file fails.
+         my $self = shift;
+         my $args = {@_};
+         my $log = $self->_get_logger();
+         my $file = $args->{'file'} || $log->logconfess("need file name");
+         $log->debug("creating file ",$file);
+         open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
+         my $out = $self->output(
+                 template => $args->{'template'},
+                 data => $args->{'data'},
+         );
+         # Keep result of output apart from result of print: a trailing
+         # "|| logdie" on the print statement would test the rendered text
+         # and never notice a failed write.
+         if (! defined($out) || $out eq '') {
+                 $log->logdie("no output to write to '$file'");
+         }
+         print {$fh} $out or $log->logdie("print: $!");
+         close($fh) || $log->logdie("close: $!");
+ }
+ =head2 apply_format
+ Apply format specified in tag with C<format_name="name"> and
+ C<format_delimiter=";;">.
+  my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
+ Formats can contain C<lookup{...}> if you need them.
+ =cut
+ sub apply_format {
+         # Apply named format (a sprintf template stored under 'format' in
+         # import_xml) to data.
+         #
+         # Arguments:
+         #   $name      - name of format to use
+         #   $delimiter - string on which $data is split into sprintf arguments
+         #   $data      - value to format
+         #
+         # Returns $data unchanged (with a warning) if format isn't defined,
+         # otherwise formatted string, passed through lookup() when it
+         # contains lookup{...}.
+         my $self = shift;
+         my ($name,$delimiter,$data) = @_;
+         my $log = $self->_get_logger();
+         if (! $self->{'import_xml'}->{'format'}->{$name}) {
+                 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
+                 return $data;
+         }
+         $log->warn("no delimiter for format $name") if (! $delimiter);
+         my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+         # split with an empty pattern would chop data into single
+         # characters, so without a delimiter use data as one argument
+         my @data;
+         if (defined($delimiter) && length($delimiter)) {
+                 @data = split(/\Q$delimiter\E/, $data);
+         } else {
+                 @data = ($data);
+         }
+         my $out = sprintf($format, @data);
+         $log->debug("using format $name [$format] on $data to produce: $out");
+         if ($out =~ m/$LOOKUP_REGEX/o) {
+                 return $self->lookup($out);
+         } else {
+                 return $out;
+         }
+ }
  #
  #
  #
-Line 681 
 sub _sort_by_order {
+Line 971 
 sub _sort_by_order {
          return $va <=> $vb;
  }
+ =head2 _get_logger
+ Get C<Log::Log4perl> object with a twist: domains are defined for each
+ method
+  my $log = $webpac->_get_logger();
+ =cut
  sub _get_logger {
          my $self = shift;
-         my @c = caller(1);
+         my $name = (caller(1))[3] || caller;
-         return get_logger($c[3]);
+         return get_logger($name);
+ }
+ =head2 _x
+ Convert string from UTF-8 to code page defined in C<import_xml>.
+  my $text = $webpac->_x('utf8 text');
+ =cut
+ sub _x {
+         my ($self, $utf8) = @_;
+         # empty (or false) input: nothing to convert
+         return unless ($utf8);
+         my $converted = $self->{'utf2cp'}->convert($utf8);
+         return $converted if ($converted);
+         # conversion gave nothing usable: warn through logger and
+         # return whatever logwarn returns
+         return $self->_get_logger()->logwarn("can't convert '$utf8'");
  }
  #

 Legend:



Removed from v.372
 


changed lines


 
Added in v.421
 Legend:



Removed from v.372
 


changed lines


 
Added in v.421
-Removed from v.372
+Added in v.421

	ViewVC Help
Powered by ViewVC 1.1.26