/[webpac]/trunk2/lib/WebPAC.pm

This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk2/lib/WebPAC.pm

Parent Directory | Revision Log | View Patch Patch

-revision 353 by dpavlin,
Wed Jun 16 11:29:37 2004 UTC
+revision 367 by dpavlin,
Thu Jun 17 12:05:01 2004 UTC
 Line 1
- package WebPac;
+ package WebPAC;
+ use warnings;
+ use strict;
  use Carp;
  use Text::Iconv;
  use Config::IniFiles;
+ use XML::Simple;
+ use Data::Dumper;
  =head1 NAME
- WebPac - base class for WebPac
+ WebPAC - base class for WebPAC
  =head1 DESCRIPTION
- This module implements methods used by WebPac.
+ This module implements methods used by WebPAC.
  =head1 METHODS
  =head2 new
- This will create new instance of WebPac using configuration specified by C<config_file>.
+ This will create new instance of WebPAC using configuration specified by C<config_file>.
-  my $webpac = new WebPac(
+  my $webpac = new WebPAC(
          config_file => 'name.conf',
          [code_page => 'ISO-8859-2',]
   );
-Line 32 
 which describes databases to be indexed.
+Line 38 
 which describes databases to be indexed.
  =cut
+ # mapping between data type and tag which specify
+ # format in XML file
+ my %type2tag = (
+         'isis' => 'isis',
+ #       'excel' => 'column',
+ #       'marc' => 'marc',
+ #       'feed' => 'feed'
+ );
  sub new {
          my $class = shift;
          my $self = {@_};
-Line 65 
 sub new {
+Line 80 
 sub new {
          $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || croak "can't open '$self->{config_file}'";
-         # read global config parametars
+         $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
-         foreach my $var (qw(
-                         dbi_dbd
-                         dbi_dsn
-                         dbi_user
-                         dbi_passwd
-                         show_progress
-                         my_unac_filter
-                 )) {
-                 $self->{global_config}->{$var} = $self->{global_config_file}->val('global', $var);
-         }
          return $self;
  }
-Line 96 
 By default, ISIS code page is assumed to
+Line 100 
 By default, ISIS code page is assumed to
  If optional parametar C<limit_mfn> is set, it will read just 500 records
  from database in example above.
- Returns number of last record read into memory (size of database, really).
  C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
  C<val>. Optional parametar C<eval> is perl code to evaluate before storing
  value in index.
-Line 109 
 value in index.
+Line 111 
 value in index.
      'val' => 'v900' },
   ]
+ Returns number of last record read into memory (size of database, really).
  =cut
  sub open_isis {
-Line 125 
 sub open_isis {
+Line 129 
 sub open_isis {
          # create Text::Iconv object
          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
+         print STDERR "reading ISIS database '",$arg->{'filename'},"'\n" if ($self->{'debug'});
          my $isis_db = OpenIsis::open($arg->{'filename'});
          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
+         $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});
+         print STDERR "processing $maxmfn records...\n" if ($self->{'debug'});
          # read database
          for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
-Line 156 
 sub open_isis {
+Line 166 
 sub open_isis {
                  }
                  # create lookup
+                 my $rec = $self->{'data'}->{$mfn};
+                 $self->create_lookup($rec, @{$arg->{'lookup'}});
+         }
+         $self->{'current_mfn'} = 1;
+         # store max mfn and return it.
+         return $self->{'max_mfn'} = $maxmfn;
+ }
+ =head2 fetch_rec
+ Fetch next record from database. It will also display progress bar (once
+ it's implemented, that is).
+  my $rec = $webpac->fetch_rec;
+ =cut
+ sub fetch_rec {
+         my $self = shift;
+         my $mfn = $self->{'current_mfn'}++ || confess "it seems that you didn't load database!";
+         if ($mfn > $self->{'max_mfn'}) {
+                 $self->{'current_mfn'} = $self->{'max_mfn'};
+                 return;
+         }
+         return $self->{'data'}->{$mfn};
+ }
+ =head2 open_import_xml
+ Read file from C<import_xml/> directory and parse it.
+  $webpac->open_import_xml(type => 'isis');
-                 foreach my $i (@{$arg->{lookup}}) {
+ =cut
-                         my $rec = $self->{'data'}->{$mfn};
-                         if ($i->{'eval'}) {
+ sub open_import_xml {
-                                 my $eval = $self->fill_in($rec,$i->{'eval'});
+         my $self = shift;
-                                 my $key = $self->fill_in($rec,$i->{'key'});
-                                 my @val = $self->fill_in($rec,$i->{'val'});
+         my $arg = {@_};
-                                 if ($key && @val && eval $eval) {
+         confess "need type to load file from import_xml/" if (! $arg->{'type'});
-                                         push @{$self->{'lookup'}->{$key}}, @val;
+         $self->{'type'} = $arg->{'type'};
+         my $type_base = $arg->{'type'};
+         $type_base =~ s/_.*$//g;
+         $self->{'tag'} = $type2tag{$type_base};
+         print STDERR "using type ",$self->{'type'}," tag ",$self->{'tag'},"\n" if ($self->{'debug'});
+         my $f = "./import_xml/".$self->{'type'}.".xml";
+         confess "import_xml file '$f' doesn't exist!" if (! -e "$f");
+         print STDERR "reading '$f'\n" if ($self->{'debug'});
+         $self->{'import_xml'} = XMLin($f,
+                 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
+                 ForceContent => 1
+         );
+         print Dumper($self->{'import_xml'});
+ }
+ =head2 create_lookup
+ Create lookup from record using lookup definition.
+  $self->create_lookup($rec, @lookups);
+ Called internally by C<open_*> methods.
+ =cut
+ sub create_lookup {
+         my $self = shift;
+         my $rec = shift || confess "need record to create lookup";
+         confess("need HASH as first argument!") if ($rec !~ /HASH/o);
+         foreach my $i (@_) {
+                 if ($i->{'eval'}) {
+                         my $eval = $self->fill_in($rec,$i->{'eval'});
+                         my $key = $self->fill_in($rec,$i->{'key'});
+                         my @val = $self->fill_in($rec,$i->{'val'});
+                         if ($key && @val && eval $eval) {
+                                 push @{$self->{'lookup'}->{$key}}, @val;
+                         }
+                 } else {
+                         my $key = $self->fill_in($rec,$i->{'key'});
+                         my @val = $self->fill_in($rec,$i->{'val'});
+                         if ($key && @val) {
+                                 push @{$self->{'lookup'}->{$key}}, @val;
+                         }
+                 }
+         }
+ }
+ =head2 get_data
+ Returns value from record.
+  my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
+ Arguments are:
+ record reference C<$rec>,
+ field C<$f>,
+ optional subfiled C<$sf>,
+ index for repeatable values C<$i>.
+ Optinal variable C<$found> will be incremeted if there
+ is field.
+ Returns value or empty string.
+ =cut
+ sub get_data {
+         my $self = shift;
+         my ($rec,$f,$sf,$i,$found) = @_;
+         if ($$rec->{$f}) {
+                 return '' if (! $$rec->{$f}->[$i]);
+                 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
+                         $$found++ if (defined($$found));
+                         return $$rec->{$f}->[$i]->{$sf};
+                 } elsif ($$rec->{$f}->[$i]) {
+                         $$found++ if (defined($$found));
+                         # it still might have subfield, just
+                         # not specified, so we'll dump all
+                         if ($$rec->{$f}->[$i] =~ /HASH/o) {
+                                 my $out;
+                                 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
+                                         $out .= $$rec->{$f}->[$i]->{$k}." ";
                                  }
+                                 return $out;
                          } else {
-                                 my $key = $self->fill_in($rec,$i->{'key'});
+                                 return $$rec->{$f}->[$i];
-                                 my @val = $self->fill_in($rec,$i->{'val'});
-                                 if ($key && @val) {
-                                         push @{$self->{'lookup'}->{$key}}, @val;
-                                 }
                          }
                  }
+         } else {
+                 return '';
          }
-         # store max mfn and return it.
-         return $self->{'max_mfn'} = $maxmfn;
  }
  =head2 fill_in
-Line 186 
 Workhourse of all: takes record from in-
+Line 324 
 Workhourse of all: takes record from in-
  strings with placeholders and returns string or array of with substituted
  values from record.
-  $webpac->fill_in($rec,'v250^a');
+  my $text = $webpac->fill_in($rec,'v250^a');
  Optional argument is ordinal number for repeatable fields. By default,
  it's assume to be first repeatable field (fields are perl array, so first
  element is 0).
  Following example will read second value from repeatable field.
-  $webpac->fill_in($rec,'Title: v250^a',1);
+  my $text = $webpac->fill_in($rec,'Title: v250^a',1);
  This function B<does not> perform parsing of format to inteligenty skip
  delimiters before fields which aren't used.
-Line 209 
 sub fill_in {
+Line 347 
 sub fill_in {
          my $i = shift || 0;
          # FIXME remove for speedup?
-         if ($rec !~ /HASH/o) {
+         confess("need HASH as first argument!") if ($rec !~ /HASH/o);
-                 confess("need HASH as first argument!");
-         }
          my $found = 0;
-         # get field with subfield
+         my $eval_code;
-         sub get_sf {
+         # remove eval{...} from beginning
-                 my ($found,$rec,$f,$sf,$i) = @_;
+         $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
-                 if ($$rec->{$f} && $$rec->{$f}->[$i]->{$sf}) {
-                         $$found++;
-                         return $$rec->{$f}->[$i]->{$sf};
-                 } else {
-                         return '';
-                 }
-         }
-         # get field (without subfield)
-         sub get_nosf {
-                 my ($found,$rec,$f,$i) = @_;
-                 if ($$rec->{$f} && $$rec->{$f}->[$i]) {
-                         $$found++;
-                         return $$rec->{$f}->[$i];
-                 } else {
-                         return '';
-                 }
-         }
          # do actual replacement of placeholders
-         $format =~ s/v(\d+)\^(\w)/get_sf(\$found,\$rec,$1,$2,$i)/ges;
+         $format =~ s/v(\d+)(?:\^(\w))*/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
-         $format =~ s/v(\d+)/get_nosf(\$found,\$rec,$1,$i)/ges;
          if ($found) {
+                 if ($eval_code) {
+                         my $eval = $self->fill_in($rec,$eval_code,$i);
+                         return if (! eval $eval);
+                 }
                  # do we have lookups?
                  if ($format =~ /\[[^\[\]]+\]/o) {
                          return $self->lookup($format);
-Line 255 
 sub fill_in {
+Line 376 
 sub fill_in {
  =head2 lookup
- This function will perform lookups on format supplied to it.
+ Perform lookups on format supplied to it.
+  my $text = $self->lookup('[v900]');
-  my $txt = $self->lookup('[v900]');
+ Lookups can be nested (like C<[d:[a:[v900]]]>).
  =cut
-Line 268 
 sub lookup {
+Line 391 
 sub lookup {
          if ($tmp =~ /\[[^\[\]]+\]/o) {
                  my @in = ( $tmp );
- #print "##lookup $tmp\n";
                  my @out;
                  while (my $f = shift @in) {
                          if ($f =~ /\[([^\[\]]+)\]/) {
                                  my $k = $1;
                                  if ($self->{'lookup'}->{$k}) {
- #print "## lookup key = $k\n";
                                          foreach my $nv (@{$self->{'lookup'}->{$k}}) {
                                                  my $tmp2 = $f;
                                                  $tmp2 =~ s/\[$k\]/$nv/g;
                                                  push @in, $tmp2;
- #print "## lookup in => $tmp2\n";
                                          }
                                  } else {
                                          undef $f;
                                  }
                          } elsif ($f) {
                                  push @out, $f;
- #print "## lookup out => $f\n";
                          }
                  }
                  return @out;
-Line 295 
 sub lookup {
+Line 414 
 sub lookup {
          }
  }
+ =head2 parse
+ Perform smart parsing of string, skipping delimiters for fields which aren't
+ defined. It can also eval code in format starting with C<eval{...}> and
+ return output or nothing depending on eval code.
+  my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
+ =cut
+ sub parse {
+         my $self = shift;
+         my ($rec, $format_utf8, $i) = @_;
+         return if (! $format_utf8);
+         confess("need HASH as first argument!") if ($rec !~ /HASH/o);
+         confess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
+         $i = 0 if (! $i);
+         my $format = $self->{'utf2cp'}->convert($format_utf8) || confess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
+         my @out;
+         my $eval_code;
+         # remove eval{...} from beginning
+         $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+         my $prefix;
+         my $all_found=0;
+         while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))*//s) {
+                 my $del = $1 || '';
+                 $prefix ||= $del if ($all_found == 0);
+                 my $found = 0;
+                 my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
+                 if ($found) {
+                         push @out, $del;
+                         push @out, $tmp;
+                         $all_found += $found;
+                 }
+         }
+         return if (! $all_found);
+         my $out = join('',@out) . $format;
+         # add prefix if not there
+         $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
+         if ($eval_code) {
+                 my $eval = $self->fill_in($rec,$eval_code,$i);
+                 return if (! eval $eval);
+         }
+         return $out;
+ }
+ =head2 parse_to_arr
+ Similar to C<parse>, but returns array of all repeatable fields
+  my @arr = $webpac->parse_to_arr($rec,'v250^a');
+ =cut
+ sub parse_to_arr {
+         my $self = shift;
+         my ($rec, $format_utf8) = @_;
+         confess("need HASH as first argument!") if ($rec !~ /HASH/o);
+         return if (! $format_utf8);
+         my $i = 0;
+         my @arr;
+         while (my $v = $self->parse($rec,$format_utf8,$i++)) {
+                 push @arr, $v;
+         }
+         return @arr;
+ }
+ =head2 data_structure
+ Create in-memory data structure which represents layout from C<import_xml>.
+ It is used later to produce output.
+  my $ds = $webpac->data_structure($rec);
+ =cut
+ # private method _sort_by_order
+ # sort subrouting using order="" attribute
+ sub _sort_by_order {
+         my $self = shift;
+         my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
+                 $self->{'import_xml'}->{'indexer'}->{$a};
+         my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
+                 $self->{'import_xml'}->{'indexer'}->{$b};
+         return $va <=> $vb;
+ }
+ sub data_structure {
+         my $self = shift;
+         my $rec = shift;
+         confess("need HASH as first argument!") if ($rec !~ /HASH/o);
+         my @sorted_tags;
+         if ($self->{tags_by_order}) {
+                 @sorted_tags = @{$self->{tags_by_order}};
+         } else {
+                 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
+                 $self->{tags_by_order} = \@sorted_tags;
+         }
+         my $ds;
+         foreach my $field (@sorted_tags) {
+                 my $row;
+ #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
+                 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
+                         my @v = $self->parse_to_arr($rec,$tag->{'content'});
+                         next if (! @v);
+                         # does tag have type?
+                         if ($tag->{'type'}) {
+                                 push @{$row->{$tag->{'type'}}}, @v;
+                         } else {
+                                 push @{$row->{'display'}}, @v;
+                                 push @{$row->{'swish'}}, @v;
+                         }
+                 }
+                 push @{$ds->{$field}}, $row if ($row);
+         }
+         print "data_structure => ",Dumper($ds);
+ }
 ;

 Legend:



Removed from v.353
 


changed lines


 
Added in v.367
 Legend:



Removed from v.353
 


changed lines


 
Added in v.367
-Removed from v.353
+Added in v.367

	ViewVC Help
Powered by ViewVC 1.1.26