--- trunk2/lib/WebPAC.pm 2004/06/16 11:31:42 354
+++ trunk2/lib/WebPAC.pm 2004/06/17 01:44:25 366
@@ -3,6 +3,9 @@
use Carp;
use Text::Iconv;
use Config::IniFiles;
+use XML::Simple;
+
+use Data::Dumper;
=head1 NAME
@@ -32,6 +35,15 @@
=cut
+# mapping between data type and tag which specify
+# format in XML file
+my %type2tag = (
+ 'isis' => 'isis',
+# 'excel' => 'column',
+# 'marc' => 'marc',
+# 'feed' => 'feed'
+);
+
sub new {
my $class = shift;
my $self = {@_};
@@ -65,18 +77,7 @@
$self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || croak "can't open '$self->{config_file}'";
- # read global config parametars
- foreach my $var (qw(
- dbi_dbd
- dbi_dsn
- dbi_user
- dbi_passwd
- show_progress
- my_unac_filter
- )) {
- $self->{global_config}->{$var} = $self->{global_config_file}->val('global', $var);
- }
-
+ $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
return $self;
}
@@ -125,10 +126,16 @@
# create Text::Iconv object
my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
+ print STDERR "reading ISIS database '",$arg->{'filename'},"'\n" if ($self->{'debug'});
+
my $isis_db = OpenIsis::open($arg->{'filename'});
my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
+ $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});
+
+ print STDERR "processing $maxmfn records...\n" if ($self->{'debug'});
+
# read database
for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
@@ -156,28 +163,150 @@
}
# create lookup
+ my $rec = $self->{'data'}->{$mfn};
+ $self->create_lookup($rec, @{$arg->{'lookup'}});
+
+ }
+
+ $self->{'current_mfn'} = 1;
+
+ # store max mfn and return it.
+ return $self->{'max_mfn'} = $maxmfn;
+}
+
+=head2 fetch_rec
+
+Fetch next record from database. It will also display progress bar (once
+it's implemented, that is).
+
+ my $rec = $webpac->fetch_rec;
+
+=cut
+
+sub fetch_rec {
+ my $self = shift;
+
+ my $mfn = $self->{'current_mfn'}++ || confess "it seems that you didn't load database!";
+
+ if ($mfn > $self->{'max_mfn'}) {
+ $self->{'current_mfn'} = $self->{'max_mfn'};
+ return;
+ }
+
+ return $self->{'data'}->{$mfn};
+}
+
+=head2 open_import_xml
+
+Read file from C directory and parse it.
+
+ $webpac->open_import_xml(type => 'isis');
- foreach my $i (@{$arg->{lookup}}) {
- my $rec = $self->{'data'}->{$mfn};
- if ($i->{'eval'}) {
- my $eval = $self->fill_in($rec,$i->{'eval'});
- my $key = $self->fill_in($rec,$i->{'key'});
- my @val = $self->fill_in($rec,$i->{'val'});
- if ($key && @val && eval $eval) {
- push @{$self->{'lookup'}->{$key}}, @val;
+=cut
+
+sub open_import_xml {
+ my $self = shift;
+
+ my $arg = {@_};
+ confess "need type to load file from import_xml/" if (! $arg->{'type'});
+
+ $self->{'type'} = $arg->{'type'};
+
+ my $type_base = $arg->{'type'};
+ $type_base =~ s/_.*$//g;
+
+ $self->{'tag'} = $type2tag{$type_base};
+
+ print STDERR "using type ",$self->{'type'}," tag ",$self->{'tag'},"\n" if ($self->{'debug'});
+
+ my $f = "./import_xml/".$self->{'type'}.".xml";
+ confess "import_xml file '$f' doesn't exist!" if (! -e "$f");
+
+ print STDERR "reading '$f'\n" if ($self->{'debug'});
+
+ $self->{'import_xml'} = XMLin($f,
+ ForceArray => [ $self->{'tag'}, 'config', 'format' ],
+ ForceContent => 1
+ );
+
+ print Dumper($self->{'import_xml'});
+
+}
+
+=head2 create_lookup
+
+Create lookup from record using lookup definition.
+
+=cut
+
+sub create_lookup {
+ my $self = shift;
+
+ my $rec = shift || confess "need record to create lookup";
+ confess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+ foreach my $i (@_) {
+ if ($i->{'eval'}) {
+ my $eval = $self->fill_in($rec,$i->{'eval'});
+ my $key = $self->fill_in($rec,$i->{'key'});
+ my @val = $self->fill_in($rec,$i->{'val'});
+ if ($key && @val && eval $eval) {
+ push @{$self->{'lookup'}->{$key}}, @val;
+ }
+ } else {
+ my $key = $self->fill_in($rec,$i->{'key'});
+ my @val = $self->fill_in($rec,$i->{'val'});
+ if ($key && @val) {
+ push @{$self->{'lookup'}->{$key}}, @val;
+ }
+ }
+ }
+}
+
+=head2 get_data
+
+Returns value from record.
+
+ $self->get_data(\$rec,$f,$sf,$i,\$found);
+
+Arguments are:
+record reference C<$rec>,
+field C<$f>,
+optional subfiled C<$sf>,
+index for repeatable values C<$i>.
+
+Optinal variable C<$found> will be incremeted if thre
+is field.
+
+Returns value or empty string.
+
+=cut
+
+sub get_data {
+ my $self = shift;
+
+ my ($rec,$f,$sf,$i,$found) = @_;
+ if ($$rec->{$f}) {
+ if ($sf && $$rec->{$f}->[$i]->{$sf}) {
+ $$found++ if (defined($$found));
+ return $$rec->{$f}->[$i]->{$sf};
+ } elsif ($$rec->{$f}->[$i]) {
+ $$found++ if (defined($$found));
+ # it still might have subfield, just
+ # not specified, so we'll dump all
+ if ($$rec->{$f}->[$i] =~ /HASH/o) {
+ my $out;
+ foreach my $k (keys %{$$rec->{$f}->[$i]}) {
+ $out .= $$rec->{$f}->[$i]->{$k}." ";
}
+ return $out;
} else {
- my $key = $self->fill_in($rec,$i->{'key'});
- my @val = $self->fill_in($rec,$i->{'val'});
- if ($key && @val) {
- push @{$self->{'lookup'}->{$key}}, @val;
- }
+ return $$rec->{$f}->[$i];
}
}
+ } else {
+ return '';
}
-
- # store max mfn and return it.
- return $self->{'max_mfn'} = $maxmfn;
}
=head2 fill_in
@@ -209,39 +338,22 @@
my $i = shift || 0;
# FIXME remove for speedup?
- if ($rec !~ /HASH/o) {
- confess("need HASH as first argument!");
- }
+ confess("need HASH as first argument!") if ($rec !~ /HASH/o);
my $found = 0;
- # get field with subfield
- sub get_sf {
- my ($found,$rec,$f,$sf,$i) = @_;
- if ($$rec->{$f} && $$rec->{$f}->[$i]->{$sf}) {
- $$found++;
- return $$rec->{$f}->[$i]->{$sf};
- } else {
- return '';
- }
- }
-
- # get field (without subfield)
- sub get_nosf {
- my ($found,$rec,$f,$i) = @_;
- if ($$rec->{$f} && $$rec->{$f}->[$i]) {
- $$found++;
- return $$rec->{$f}->[$i];
- } else {
- return '';
- }
- }
+ my $eval_code;
+ # remove eval{...} from beginning
+ $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
# do actual replacement of placeholders
- $format =~ s/v(\d+)\^(\w)/get_sf(\$found,\$rec,$1,$2,$i)/ges;
- $format =~ s/v(\d+)/get_nosf(\$found,\$rec,$1,$i)/ges;
+ $format =~ s/v(\d+)(?:\^(\w))*/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
if ($found) {
+ if ($eval_code) {
+ my $eval = $self->fill_in($rec,$eval_code,$i);
+ return if (! eval $eval);
+ }
# do we have lookups?
if ($format =~ /\[[^\[\]]+\]/o) {
return $self->lookup($format);
@@ -255,10 +367,12 @@
=head2 lookup
-This function will perform lookups on format supplied to it.
+Perform lookups on format supplied to it.
my $txt = $self->lookup('[v900]');
+Lookups can be nested (like C<[d:[a:[v900]]]>).
+
=cut
sub lookup {
@@ -295,4 +409,138 @@
}
}
+=head2 parse
+
+Perform smart parsing of string, skipping delimiters for fields which aren't
+defined. It can also eval code in format starting with C and
+return output or nothing depending on eval code.
+
+ $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
+
+=cut
+
+sub parse {
+ my $self = shift;
+
+ my ($rec, $format_utf8, $i) = @_;
+
+ return if (! $format_utf8);
+
+ confess("need HASH as first argument!") if ($rec !~ /HASH/o);
+ confess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
+
+ $i = 0 if (! $i);
+
+ my $format = $self->{'utf2cp'}->convert($format_utf8) || confess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
+
+ my @out;
+
+ my $eval_code;
+ # remove eval{...} from beginning
+ $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+
+ my $prefix;
+ my $all_found=0;
+
+#print "## $format\n";
+ while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))*//s) {
+#print "## [ $1 | $2 | $3 ] $format\n";
+
+ my $del = $1 || '';
+ $prefix ||= $del if ($all_found == 0);
+
+ my $found = 0;
+ my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
+
+ if ($found) {
+ push @out, $del;
+ push @out, $tmp;
+ $all_found += $found;
+ }
+ }
+
+ return if (! $all_found);
+
+ my $out = join('',@out) . $format;
+
+ # add prefix if not there
+ $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
+
+ if ($eval_code) {
+ my $eval = $self->fill_in($rec,$eval_code,$i);
+ return if (! eval $eval);
+ }
+
+ return $out;
+}
+
+=head2 data_structure
+
+Create in-memory data structure which represents layout from C.
+It is used later to produce output.
+
+ my $ds = $webpac->data_structure($rec);
+
+=cut
+
+# private method _sort_by_order
+# sort subrouting using order="" attribute
+sub _sort_by_order {
+ my $self = shift;
+
+ my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
+ $self->{'import_xml'}->{'indexer'}->{$a};
+ my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
+ $self->{'import_xml'}->{'indexer'}->{$b};
+
+ return $va <=> $vb;
+}
+
+sub data_structure {
+ my $self = shift;
+
+ my $rec = shift;
+ confess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+ my @sorted_tags;
+ if ($self->{tags_by_order}) {
+ @sorted_tags = @{$self->{tags_by_order}};
+ } else {
+ @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
+ $self->{tags_by_order} = \@sorted_tags;
+ }
+
+ my $ds;
+
+ foreach my $field (@sorted_tags) {
+
+ my $row;
+ my $i = 0;
+
+#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
+
+ foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
+
+ my $v = $self->parse($rec,$tag->{'content'},$i);
+print "## $i:",$tag->{'content'}," = ",($v || 'null'),"\n";
+
+ next if (!$v || $v && $v eq '');
+
+ # does tag have type?
+ if ($tag->{'type'}) {
+ push @{$row->{$tag->{'type'}}}, $v;
+ } else {
+ push @{$row->{'display'}}, $v;
+ push @{$row->{'swish'}}, $v;
+ }
+ }
+
+ push @{$ds->{$field}}, $row if ($row);
+
+ }
+
+ print Dumper($ds);
+
+}
+
1;