/[webpac]/trunk2/lib/WebPAC.pm

This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk2/lib/WebPAC.pm

Parent Directory | Revision Log | View Patch Patch

-revision 359 by dpavlin,
Wed Jun 16 15:41:16 2004 UTC
+revision 555 by dpavlin,
Fri Oct 29 22:09:04 2004 UTC
 Line 1
  package WebPAC;
+ use warnings;
+ use strict;
  use Carp;
  use Text::Iconv;
  use Config::IniFiles;
+ use XML::Simple;
+ use Template;
+ use Log::Log4perl qw(get_logger :levels);
+ use Time::HiRes qw(time);
  use Data::Dumper;
+ #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
+ #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
+ my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
+ my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
  =head1 NAME
  WebPAC - base class for WebPAC
-Line 18 
 This module implements methods used by W
+Line 30 
 This module implements methods used by W
  =head2 new
- This will create new instance of WebPAC using configuration specified by C<config_file>.
+ Create new instance of WebPAC using configuration specified by C<config_file>.
   my $webpac = new WebPAC(
          config_file => 'name.conf',
-         [code_page => 'ISO-8859-2',]
+         code_page => 'ISO-8859-2',
+         low_mem => 1,
+         filter => {
+                 'lower' => sub { lc($_[0]) },
+         },
   );
  Default C<code_page> is C<ISO-8859-2>.
- It will also read configuration files
+ Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
+ There is optinal parametar C<filter> which specify different filters which
+ can be applied using C<filter{name}> notation.
+ This method will also read configuration files
  C<global.conf> (used by indexer and Web font-end)
  and configuration file specified by C<config_file>
  which describes databases to be indexed.
  =cut
+ # mapping between data type and tag which specify
+ # format in XML file
+ my %type2tag = (
+         'isis' => 'isis',
+ #       'excel' => 'column',
+ #       'marc' => 'marc',
+ #       'feed' => 'feed'
+ );
  sub new {
          my $class = shift;
          my $self = {@_};
          bless($self, $class);
+         $self->{'start_t'} = time();
+         my $log_file = $self->{'log'} || "log.conf";
+         Log::Log4perl->init($log_file);
+         my $log = $self->_get_logger();
          # fill in default values
          # output codepage
          $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
-Line 46 
 sub new {
+Line 83 
 sub new {
          #
          # read global.conf
          #
+         $log->debug("read 'global.conf'");
-         $self->{global_config_file} = new Config::IniFiles( -file => 'global.conf' ) || croak "can't open 'global.conf'";
+         my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
          # read global config parametars
          foreach my $var (qw(
-Line 57 
 sub new {
+Line 95 
 sub new {
                          dbi_passwd
                          show_progress
                          my_unac_filter
+                         output_template
                  )) {
-                 $self->{global_config}->{$var} = $self->{global_config_file}->val('global', $var);
+                 $self->{'global_config'}->{$var} = $config->val('global', $var);
          }
          #
          # read indexer config file
          #
-         $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || croak "can't open '$self->{config_file}'";
+         $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
-         # read global config parametars
+         # create UTF-8 convertor for import_xml files
-         foreach my $var (qw(
+         $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
-                         dbi_dbd
-                         dbi_dsn
+         # create Template toolkit instance
-                         dbi_user
+         $self->{'tt'} = Template->new(
-                         dbi_passwd
+                 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
-                         show_progress
+ #               FILTERS => {
-                         my_unac_filter
+ #                       'foo' => \&foo_filter,
-                 )) {
+ #               },
-                 $self->{global_config}->{$var} = $self->{global_config_file}->val('global', $var);
+                 EVAL_PERL => 1,
+         );
+         # running with low_mem flag? well, use DBM::Deep then.
+         if ($self->{'low_mem'}) {
+                 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
+                 my $db_file = "data.db";
+                 if (-e $db_file) {
+                         unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
+                         $log->debug("removed '$db_file' from last run");
+                 }
+                 require DBM::Deep;
+                 my $db = new DBM::Deep $db_file;
+                 $log->logdie("DBM::Deep error: $!") unless ($db);
+                 if ($db->error()) {
+                         $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
+                 } else {
+                         $log->debug("using file '$db_file' for DBM::Deep");
+                 }
+                 $self->{'db'} = $db;
          }
+         $log->debug("filters defined: ",Dumper($self->{'filter'}));
          return $self;
  }
-Line 89 
 Open CDS/ISIS database using OpenIsis mo
+Line 156 
 Open CDS/ISIS database using OpenIsis mo
   $webpac->open_isis(
          filename => '/data/ISIS/ISIS',
          code_page => '852',
-         limit_mfn => '500',
+         limit_mfn => 500,
+         start_mfn => 6000,
          lookup => [ ... ],
   );
  By default, ISIS code page is assumed to be C<852>.
+ If optional parametar C<start_mfn> is set, this will be first MFN to read
+ from database (so you can skip beginning of your database if you need to).
  If optional parametar C<limit_mfn> is set, it will read just 500 records
  from database in example above.
- Returns number of last record read into memory (size of database, really).
  C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
  C<val>. Optional parametar C<eval> is perl code to evaluate before storing
  value in index.
-Line 111 
 value in index.
+Line 180 
 value in index.
      'val' => 'v900' },
   ]
+ Returns number of last record read into memory (size of database, really).
  =cut
  sub open_isis {
          my $self = shift;
          my $arg = {@_};
-         croak "need filename" if (! $arg->{'filename'});
+         my $log = $self->_get_logger();
+         $log->logcroak("need filename") if (! $arg->{'filename'});
          my $code_page = $arg->{'code_page'} || '852';
+         $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
+         # store data in object
+         $self->{'isis_filename'} = $arg->{'filename'};
+         $self->{'isis_code_page'} = $code_page;
          use OpenIsis;
          #$self->{'isis_code_page'} = $code_page;
-Line 127 
 sub open_isis {
+Line 206 
 sub open_isis {
          # create Text::Iconv object
          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
+         $log->info("reading ISIS database '",$arg->{'filename'},"'");
+         $log->debug("isis code page: $code_page");
          my $isis_db = OpenIsis::open($arg->{'filename'});
          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
+         my $startmfn = 1;
+         if (my $s = $self->{'start_mfn'}) {
+                 $log->info("skipping to MFN $s");
+                 $startmfn = $s;
+         } else {
+                 $self->{'start_mfn'} = $startmfn;
+         }
+         $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
-         $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});
+         $log->info("processing ",($maxmfn-$startmfn)." records...");
          # read database
-         for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
+         for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
+                 $log->debug("mfn: $mfn\n");
+                 my $rec;
                  # read record
                  my $row = OpenIsis::read( $isis_db, $mfn );
-Line 153 
 sub open_isis {
+Line 250 
 sub open_isis {
                                                  $val = $l;
                                          }
-                                         push @{$self->{'data'}->{$mfn}->{$k}}, $val;
+                                         push @{$rec->{$k}}, $val;
                                  }
+                         } else {
+                                 push @{$rec->{'000'}}, $mfn;
                          }
                  }
+                 $log->confess("record $mfn empty?") unless ($rec);
+                 # store
+                 if ($self->{'low_mem'}) {
+                         $self->{'db'}->put($mfn, $rec);
+                 } else {
+                         $self->{'data'}->{$mfn} = $rec;
+                 }
                  # create lookup
-                 my $rec = $self->{'data'}->{$mfn};
                  $self->create_lookup($rec, @{$arg->{'lookup'}});
+                 $self->progress_bar($mfn,$maxmfn);
          }
+         $self->{'current_mfn'} = -1;
+         $self->{'last_pcnt'} = 0;
+         $log->debug("max mfn: $maxmfn");
          # store max mfn and return it.
          return $self->{'max_mfn'} = $maxmfn;
  }
+ =head2 fetch_rec
+ Fetch next record from database. It will also display progress bar (once
+ it's implemented, that is).
+  my $rec = $webpac->fetch_rec;
+ =cut
+ sub fetch_rec {
+         my $self = shift;
+         my $log = $self->_get_logger();
+         $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
+         if ($self->{'current_mfn'} == -1) {
+                 $self->{'current_mfn'} = $self->{'start_mfn'};
+         } else {
+                 $self->{'current_mfn'}++;
+         }
+         my $mfn = $self->{'current_mfn'};
+         if ($mfn > $self->{'max_mfn'}) {
+                 $self->{'current_mfn'} = $self->{'max_mfn'};
+                 $log->debug("at EOF");
+                 return;
+         }
+         $self->progress_bar($mfn,$self->{'max_mfn'});
+         if ($self->{'low_mem'}) {
+                 return $self->{'db'}->get($mfn);
+         } else {
+                 return $self->{'data'}->{$mfn};
+         }
+ }
+ =head2 mfn
+ Returns current record number (MFN).
+  print $webpac->mfn;
+ =cut
+ sub mfn {
+         my $self = shift;
+         return $self->{'current_mfn'};
+ }
+ =head2 progress_bar
+ Draw progress bar on STDERR.
+  $webpac->progress_bar($current, $max);
+ =cut
+ sub progress_bar {
+         my $self = shift;
+         my ($curr,$max) = @_;
+         my $log = $self->_get_logger();
+         $log->logconfess("no current value!") if (! $curr);
+         $log->logconfess("no maximum value!") if (! $max);
+         if ($curr > $max) {
+                 $max = $curr;
+                 $log->debug("overflow to $curr");
+         }
+         $self->{'last_pcnt'} ||= 1;
+         my $p = int($curr * 100 / $max) || 1;
+         # reset on re-run
+         if ($p < $self->{'last_pcnt'}) {
+                 $self->{'last_pcnt'} = $p;
+                 $self->{'last_t'} = time();
+                 $self->{'last_curr'} = undef;
+         }
+         $self->{'last_t'} ||= time();
+         if ($p != $self->{'last_pcnt'}) {
+                 my $last_curr = $self->{'last_curr'} || $curr;
+                 my $t = time();
+                 my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
+                 my $eta = ($max-$curr) / ($rate || 1);
+                 printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
+                 $self->{'last_pcnt'} = $p;
+                 $self->{'last_t'} = time();
+                 $self->{'last_curr'} = $curr;
+         }
+         print STDERR "\n" if ($p == 100);
+ }
+ =head2 fmt_time
+ Format time (in seconds) for display.
+  print $webpac->fmt_time(time());
+ This method is called by L<progress_bar> to display remaining time.
+ =cut
+ sub fmt_time {
+         my $self = shift;
+         my $t = shift || 0;
+         my $out = "";
+         my ($ss,$mm,$hh) = gmtime($t);
+         $out .= "${hh}h" if ($hh);
+         $out .= sprintf("%02d:%02d", $mm,$ss);
+         $out .= "  " if ($hh == 0);
+         return $out;
+ }
+ =head2 open_import_xml
+ Read file from C<import_xml/> directory and parse it.
+  $webpac->open_import_xml(type => 'isis');
+ =cut
+ sub open_import_xml {
+         my $self = shift;
+         my $log = $self->_get_logger();
+         my $arg = {@_};
+         $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
+         $self->{'type'} = $arg->{'type'};
+         my $type_base = $arg->{'type'};
+         $type_base =~ s/_.*$//g;
+         $self->{'tag'} = $type2tag{$type_base};
+         $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
+         my $f = "./import_xml/".$self->{'type'}.".xml";
+         $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
+         $log->info("reading '$f'");
+         $self->{'import_xml_file'} = $f;
+         $self->{'import_xml'} = XMLin($f,
+                 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
+         );
+         $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
+ }
  =head2 create_lookup
  Create lookup from record using lookup definition.
+  $self->create_lookup($rec, @lookups);
+ Called internally by C<open_*> methods.
  =cut
  sub create_lookup {
          my $self = shift;
-         my $rec = shift || confess "need record to create lookup";
+         my $log = $self->_get_logger();
-         confess("need HASH as first argument!") if ($rec !~ /HASH/o);
+         my $rec = shift || $log->logconfess("need record to create lookup");
+         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
          foreach my $i (@_) {
-                 if ($i->{'eval'}) {
+                 $log->logconfess("need key") unless defined($i->{'key'});
-                         my $eval = $self->fill_in($rec,$i->{'eval'});
+                 $log->logconfess("need val") unless defined($i->{'val'});
-                         my $key = $self->fill_in($rec,$i->{'key'});
-                         my @val = $self->fill_in($rec,$i->{'val'});
+                 if (defined($i->{'eval'})) {
-                         if ($key && @val && eval $eval) {
+                         # eval first, so we can skip fill_in for key and val
+                         my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
+                         if ($self->_eval($eval)) {
+                                 my $key = $self->fill_in($rec,$i->{'key'}) || next;
+                                 my @val = $self->fill_in($rec,$i->{'val'}) || next;
+                                 $log->debug("stored $key = ",sub { join(" | ",@val) });
                                  push @{$self->{'lookup'}->{$key}}, @val;
                          }
                  } else {
-                         my $key = $self->fill_in($rec,$i->{'key'});
+                         my $key = $self->fill_in($rec,$i->{'key'}) || next;
-                         my @val = $self->fill_in($rec,$i->{'val'});
+                         my @val = $self->fill_in($rec,$i->{'val'}) || next;
-                         if ($key && @val) {
+                         $log->debug("stored $key = ",sub { join(" | ",@val) });
-                                 push @{$self->{'lookup'}->{$key}}, @val;
+                         push @{$self->{'lookup'}->{$key}}, @val;
-                         }
                  }
          }
  }
-Line 203 
 sub create_lookup {
+Line 492 
 sub create_lookup {
  Returns value from record.
-  $self->get_data(\$rec,$f,$sf,$i,\$found);
+  my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
  Arguments are:
  record reference C<$rec>,
-Line 211 
 field C<$f>,
+Line 500 
 field C<$f>,
  optional subfiled C<$sf>,
  index for repeatable values C<$i>.
- Optinal variable C<$found> will be incremeted if thre
+ Optinal variable C<$found> will be incremeted if there
  is field.
  Returns value or empty string.
-Line 222 
 sub get_data {
+Line 511 
 sub get_data {
          my $self = shift;
          my ($rec,$f,$sf,$i,$found) = @_;
          if ($$rec->{$f}) {
+                 return '' if (! $$rec->{$f}->[$i]);
+                 no strict 'refs';
                  if ($sf && $$rec->{$f}->[$i]->{$sf}) {
                          $$found++ if (defined($$found));
                          return $$rec->{$f}->[$i]->{$sf};
                  } elsif ($$rec->{$f}->[$i]) {
                          $$found++ if (defined($$found));
-                         return $$rec->{$f}->[$i];
+                         # it still might have subfield, just
+                         # not specified, so we'll dump all
+                         if ($$rec->{$f}->[$i] =~ /HASH/o) {
+                                 my $out;
+                                 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
+                                         $out .= $$rec->{$f}->[$i]->{$k}." ";
+                                 }
+                                 return $out;
+                         } else {
+                                 return $$rec->{$f}->[$i];
+                         }
                  }
          } else {
                  return '';
-Line 241 
 Workhourse of all: takes record from in-
+Line 543 
 Workhourse of all: takes record from in-
  strings with placeholders and returns string or array of with substituted
  values from record.
-  $webpac->fill_in($rec,'v250^a');
+  my $text = $webpac->fill_in($rec,'v250^a');
  Optional argument is ordinal number for repeatable fields. By default,
  it's assume to be first repeatable field (fields are perl array, so first
  element is 0).
  Following example will read second value from repeatable field.
-  $webpac->fill_in($rec,'Title: v250^a',1);
+  my $text = $webpac->fill_in($rec,'Title: v250^a',1);
  This function B<does not> perform parsing of format to inteligenty skip
  delimiters before fields which aren't used.
+ This method will automatically decode UTF-8 string to local code page
+ if needed.
  =cut
  sub fill_in {
          my $self = shift;
-         my $rec = shift || confess "need data record";
+         my $log = $self->_get_logger();
-         my $format = shift || confess "need format to parse";
+         my $rec = shift || $log->logconfess("need data record");
+         my $format = shift || $log->logconfess("need format to parse");
          # iteration (for repeatable fields)
          my $i = shift || 0;
+         $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
          # FIXME remove for speedup?
-         confess("need HASH as first argument!") if ($rec !~ /HASH/o);
+         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+         if (utf8::is_utf8($format)) {
+                 $format = $self->_x($format);
+         }
          my $found = 0;
-Line 272 
 sub fill_in {
+Line 585 
 sub fill_in {
          # remove eval{...} from beginning
          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+         my $filter_name;
+         # remove filter{...} from beginning
+         $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
          # do actual replacement of placeholders
-         $format =~ s/v(\d+)(?:\^(\w))*/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
+         # repeatable fields
+         $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
+         # non-repeatable fields
+         $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
          if ($found) {
+                 $log->debug("format: $format");
                  if ($eval_code) {
                          my $eval = $self->fill_in($rec,$eval_code,$i);
-                         return if (! eval $eval);
+                         return if (! $self->_eval($eval));
+                 }
+                 if ($filter_name && $self->{'filter'}->{$filter_name}) {
+                         $log->debug("filter '$filter_name' for $format");
+                         $format = $self->{'filter'}->{$filter_name}->($format);
+                         return unless(defined($format));
+                         $log->debug("filter result: $format");
                  }
                  # do we have lookups?
-                 if ($format =~ /\[[^\[\]]+\]/o) {
+                 if ($format =~ /$LOOKUP_REGEX/o) {
+                         $log->debug("format '$format' has lookup");
                          return $self->lookup($format);
                  } else {
                          return $format;
-Line 295 
 sub fill_in {
+Line 623 
 sub fill_in {
  Perform lookups on format supplied to it.
-  my $txt = $self->lookup('[v900]');
+  my $text = $self->lookup('[v900]');
  Lookups can be nested (like C<[d:[a:[v900]]]>).
-Line 304 
 Lookups can be nested (like C<[d:[a:[v90
+Line 632 
 Lookups can be nested (like C<[d:[a:[v90
  sub lookup {
          my $self = shift;
-         my $tmp = shift || confess "need format";
+         my $log = $self->_get_logger();
+         my $tmp = shift || $log->logconfess("need format");
-         if ($tmp =~ /\[[^\[\]]+\]/o) {
+         if ($tmp =~ /$LOOKUP_REGEX/o) {
                  my @in = ( $tmp );
- #print "##lookup $tmp\n";
+                 $log->debug("lookup for: ",$tmp);
                  my @out;
                  while (my $f = shift @in) {
-                         if ($f =~ /\[([^\[\]]+)\]/) {
+                         if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
                                  my $k = $1;
                                  if ($self->{'lookup'}->{$k}) {
- #print "## lookup key = $k\n";
                                          foreach my $nv (@{$self->{'lookup'}->{$k}}) {
                                                  my $tmp2 = $f;
-                                                 $tmp2 =~ s/\[$k\]/$nv/g;
+                                                 $tmp2 =~ s/lookup{$k}/$nv/g;
                                                  push @in, $tmp2;
- #print "## lookup in => $tmp2\n";
                                          }
                                  } else {
                                          undef $f;
                                  }
                          } elsif ($f) {
                                  push @out, $f;
- #print "## lookup out => $f\n";
                          }
                  }
+                 $log->logconfess("return is array and it's not expected!") unless wantarray;
                  return @out;
          } else {
                  return $tmp;
-Line 341 
 Perform smart parsing of string, skippin
+Line 671 
 Perform smart parsing of string, skippin
  defined. It can also eval code in format starting with C<eval{...}> and
  return output or nothing depending on eval code.
-  $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
+  my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
  =cut
  sub parse {
          my $self = shift;
-         my ($rec, $format, $i) = @_;
+         my ($rec, $format_utf8, $i) = @_;
+         return if (! $format_utf8);
-         confess("need HASH as first argument!") if ($rec !~ /HASH/o);
+         my $log = $self->_get_logger();
+         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+         $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
          $i = 0 if (! $i);
+         my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
          my @out;
+         $log->debug("format: $format");
          my $eval_code;
          # remove eval{...} from beginning
          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+         my $filter_name;
+         # remove filter{...} from beginning
+         $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
          my $prefix;
          my $all_found=0;
- #print "## $format\n";
+         while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
-         while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))*//s) {
- #print "## [ $1 | $2 | $3 ] $format\n";
                  my $del = $1 || '';
                  $prefix ||= $del if ($all_found == 0);
+                 # repeatable index
+                 my $r = $i;
+                 $r = 0 if (lc("$2") eq 's');
                  my $found = 0;
-                 my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
+                 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
                  if ($found) {
                          push @out, $del;
-Line 382 
 sub parse {
+Line 727 
 sub parse {
          return if (! $all_found);
-         my $out = join('',@out) . $format;
+         my $out = join('',@out);
+         if ($out) {
+                 # add rest of format (suffix)
+                 $out .= $format;
+                 # add prefix if not there
+                 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
+                 $log->debug("result: $out");
+         }
-         # add prefix if not there
-         $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
          if ($eval_code) {
-                 my $eval = $self->fill_in($rec,$eval_code,$i);
+                 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
-                 return if (! eval $eval);
+                 $log->debug("about to eval{$eval} format: $out");
+                 return if (! $self->_eval($eval));
+         }
+         if ($filter_name && $self->{'filter'}->{$filter_name}) {
+                 $log->debug("about to filter{$filter_name} format: $out");
+                 $out = $self->{'filter'}->{$filter_name}->($out);
+                 return unless(defined($out));
+                 $log->debug("filter result: $out");
          }
          return $out;
  }
+ =head2 parse_to_arr
+ Similar to C<parse>, but returns array of all repeatable fields
+  my @arr = $webpac->parse_to_arr($rec,'v250^a');
+ =cut
+ sub parse_to_arr {
+         my $self = shift;
+         my ($rec, $format_utf8) = @_;
+         my $log = $self->_get_logger();
+         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+         return if (! $format_utf8);
+         my $i = 0;
+         my @arr;
+         while (my $v = $self->parse($rec,$format_utf8,$i++)) {
+                 push @arr, $v;
+         }
+         $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
+         return @arr;
+ }
+ =head2 fill_in_to_arr
+ Similar to C<fill_in>, but returns array of all repeatable fields. Usable
+ for fields which have lookups, so they shouldn't be parsed but rather
+ C<fill_id>ed.
+  my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
+ =cut
+ sub fill_in_to_arr {
+         my $self = shift;
+         my ($rec, $format_utf8) = @_;
+         my $log = $self->_get_logger();
+         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+         return if (! $format_utf8);
+         my $i = 0;
+         my @arr;
+         while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
+                 push @arr, @v;
+         }
+         $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
+         return @arr;
+ }
+ =head2 sort_arr
+ Sort array ignoring case and html in data
+  my @sorted = $webpac->sort_arr(@unsorted);
+ =cut
+ sub sort_arr {
+         my $self = shift;
+         my $log = $self->_get_logger();
+         # FIXME add Schwartzian Transformation?
+         my @sorted = sort {
+                 $a =~ s#<[^>]+/*>##;
+                 $b =~ s#<[^>]+/*>##;
+                 lc($b) cmp lc($a)
+         } @_;
+         $log->debug("sorted values: ",sub { join(", ",@sorted) });
+         return @sorted;
+ }
+ =head2 data_structure
+ Create in-memory data structure which represents layout from C<import_xml>.
+ It is used later to produce output.
+  my @ds = $webpac->data_structure($rec);
+ This method will also set C<$webpac->{'currnet_filename'}> if there is
+ <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
+ <headline> tag.
+ =cut
+ sub data_structure {
+         my $self = shift;
+         my $log = $self->_get_logger();
+         my $rec = shift;
+         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+         undef $self->{'currnet_filename'};
+         undef $self->{'headline'};
+         my @sorted_tags;
+         if ($self->{tags_by_order}) {
+                 @sorted_tags = @{$self->{tags_by_order}};
+         } else {
+                 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
+                 $self->{tags_by_order} = \@sorted_tags;
+         }
+         my @ds;
+         $log->debug("tags: ",sub { join(", ",@sorted_tags) });
+         foreach my $field (@sorted_tags) {
+                 my $row;
+ #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
+                 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
+                         my $format = $tag->{'value'} || $tag->{'content'};
+                         $log->debug("format: $format");
+                         my @v;
+                         if ($format =~ /$LOOKUP_REGEX/o) {
+                                 @v = $self->fill_in_to_arr($rec,$format);
+                         } else {
+                                 @v = $self->parse_to_arr($rec,$format);
+                         }
+                         next if (! @v);
+                         if ($tag->{'sort'}) {
+                                 @v = $self->sort_arr(@v);
+                                 $log->warn("sort within tag is usually not what you want!");
+                         }
+                         # use format?
+                         if ($tag->{'format_name'}) {
+                                 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
+                         }
+                         if ($field eq 'filename') {
+                                 $self->{'current_filename'} = join('',@v);
+                                 $log->debug("filename: ",$self->{'current_filename'});
+                         } elsif ($field eq 'headline') {
+                                 $self->{'headline'} .= join('',@v);
+                                 $log->debug("headline: ",$self->{'headline'});
+                                 next; # don't return headline in data_structure!
+                         }
+                         # delimiter will join repeatable fields
+                         if ($tag->{'delimiter'}) {
+                                 @v = ( join($tag->{'delimiter'}, @v) );
+                         }
+                         # default types
+                         my @types = qw(display swish);
+                         # override by type attribute
+                         @types = ( $tag->{'type'} ) if ($tag->{'type'});
+                         foreach my $type (@types) {
+                                 # append to previous line?
+                                 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
+                                 if ($tag->{'append'}) {
+                                         # I will delimit appended part with
+                                         # delimiter (or ,)
+                                         my $d = $tag->{'delimiter'};
+                                         # default delimiter
+                                         $d ||= " ";
+                                         my $last = pop @{$row->{$type}};
+                                         $d = "" if (! $last);
+                                         $last .= $d . join($d, @v);
+                                         push @{$row->{$type}}, $last;
+                                 } else {
+                                         push @{$row->{$type}}, @v;
+                                 }
+                         }
+                 }
+                 if ($row) {
+                         $row->{'tag'} = $field;
+                         # TODO: name_sigular, name_plural
+                         my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
+                         $row->{'name'} = $name ? $self->_x($name) : $field;
+                         # post-sort all values in field
+                         if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
+                                 $log->warn("sort at field tag not implemented");
+                         }
+                         push @ds, $row;
+                         $log->debug("row $field: ",sub { Dumper($row) });
+                 }
+         }
+         return @ds;
+ }
+ =head2 output
+ Create output from in-memory data structure using Template Toolkit template.
+ my $text = $webpac->output( template => 'text.tt', data => @ds );
+ =cut
+ sub output {
+         my $self = shift;
+         my $args = {@_};
+         my $log = $self->_get_logger();
+         $log->logconfess("need template name") if (! $args->{'template'});
+         $log->logconfess("need data array") if (! $args->{'data'});
+         my $out;
+         $self->{'tt'}->process(
+                 $args->{'template'},
+                 $args,
+                 \$out
+         ) || confess $self->{'tt'}->error();
+         return $out;
+ }
+ =head2 output_file
+ Create output from in-memory data structure using Template Toolkit template
+ to a file.
+  $webpac->output_file(
+         file => 'out.txt',
+         template => 'text.tt',
+         data => @ds
+  );
+ =cut
+ sub output_file {
+         my $self = shift;
+         my $args = {@_};
+         my $log = $self->_get_logger();
+         my $file = $args->{'file'} || $log->logconfess("need file name");
+         $log->debug("creating file ",$file);
+         open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
+         print $fh $self->output(
+                 template => $args->{'template'},
+                 data => $args->{'data'},
+         ) || $log->logdie("print: $!");
+         close($fh) || $log->logdie("close: $!");
+ }
+ =head2 apply_format
+ Apply format specified in tag with C<format_name="name"> and
+ C<format_delimiter=";;">.
+  my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
+ Formats can contain C<lookup{...}> if you need them.
+ =cut
+ sub apply_format {
+         my $self = shift;
+         my ($name,$delimiter,$data) = @_;
+         my $log = $self->_get_logger();
+         if (! $self->{'import_xml'}->{'format'}->{$name}) {
+                 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
+                 return $data;
+         }
+         $log->warn("no delimiter for format $name") if (! $delimiter);
+         my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+         my @data = split(/\Q$delimiter\E/, $data);
+         my $out = sprintf($format, @data);
+         $log->debug("using format $name [$format] on $data to produce: $out");
+         if ($out =~ m/$LOOKUP_REGEX/o) {
+                 return $self->lookup($out);
+         } else {
+                 return $out;
+         }
+ }
+ #
+ #
+ #
+ =head1 INTERNAL METHODS
+ Here is a quick list of internal methods, mostly useful to turn debugging
+ on them (see L<LOGGING> below for explanation).
+ =cut
+ =head2 _eval
+ Internal function to eval code without C<strict 'subs'>.
+ =cut
+ sub _eval {
+         my $self = shift;
+         my $code = shift || return;
+         my $log = $self->_get_logger();
+         no strict 'subs';
+         my $ret = eval $code;
+         if ($@) {
+                 $log->error("problem with eval code [$code]: $@");
+         }
+         $log->debug("eval: ",$code," [",$ret,"]");
+         return $ret || undef;
+ }
+ =head2 _sort_by_order
+ Sort xml tags data structure accoding to C<order=""> attribute.
+ =cut
+ sub _sort_by_order {
+         my $self = shift;
+         my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
+                 $self->{'import_xml'}->{'indexer'}->{$a};
+         my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
+                 $self->{'import_xml'}->{'indexer'}->{$b};
+         return $va <=> $vb;
+ }
+ =head2 _get_logger
+ Get C<Log::Log4perl> object with a twist: domains are defined for each
+ method
+  my $log = $webpac->_get_logger();
+ =cut
+ sub _get_logger {
+         my $self = shift;
+         my $name = (caller(1))[3] || caller;
+         return get_logger($name);
+ }
+ =head2 _x
+ Convert string from UTF-8 to code page defined in C<import_xml>.
+  my $text = $webpac->_x('utf8 text');
+ =cut
+ sub _x {
+         my $self = shift;
+         my $utf8 = shift || return;
+         return $self->{'utf2cp'}->convert($utf8) ||
+                 $self->_get_logger()->logwarn("can't convert '$utf8'");
+ }
+ #
+ #
+ #
+ =head1 LOGGING
+ Logging in WebPAC is performed by L<Log::Log4perl> with config file
+ C<log.conf>.
+ Methods defined above have different levels of logging, so
+ it's descriptions will be useful to turn (mostry B<debug> logging) on
+ or off to see why WabPAC isn't perforing as you expect it (it might even
+ be a bug!).
+ B<This is different from normal Log4perl behaviour>. To repeat, you can
+ also use method names, and not only classes (which are just few)
+ to filter logging.
+ =head1 MEMORY USAGE
+ C<low_mem> options is double-edged sword. If enabled, WebPAC
+ will run on memory constraint machines (which doesn't have enough
+ physical RAM to create memory structure for whole source database).
+ If your machine has 512Mb or more of RAM and database is around 10000 records,
+ memory shouldn't be an issue. If you don't have enough physical RAM, you
+ might consider using virtual memory (if your operating system is handling it
+ well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
+ parsed structure of ISIS database (this is what C<low_mem> option does).
+ Hitting swap at end of reading source database is probably o.k. However,
+ hitting swap before 90% will dramatically decrease performance and you will
+ be better off with C<low_mem> and using rest of availble memory for
+ operating system disk cache (Linux is particuallary good about this).
+ However, every access to database record will require disk access, so
+ generation phase will be slower 10-100 times.
+ Parsed structures are essential - you just have option to trade RAM memory
+ (which is fast) for disk space (which is slow). Be sure to have planty of
+ disk space if you are using C<low_mem> and thus L<DBD::Deep>.
+ However, when WebPAC is running on desktop machines (or laptops :-), it's
+ highly undesireable for system to start swapping. Using C<low_mem> option can
+ reduce WecPAC memory usage to around 64Mb for same database with lookup
+ fields and sorted indexes which stay in RAM. Performance will suffer, but
+ memory usage will really be minimal. It might be also more confortable to
+ run WebPAC reniced on those machines.
+ =cut
 ;

 Legend:



Removed from v.359
 


changed lines


 
Added in v.555
 Legend:



Removed from v.359
 


changed lines


 
Added in v.555
-Removed from v.359
+Added in v.555

	ViewVC Help
Powered by ViewVC 1.1.26