--- trunk2/lib/WebPac.pm 2004/06/15 22:40:07 352 +++ trunk2/lib/WebPAC.pm 2004/06/20 18:39:30 376 @@ -1,52 +1,79 @@ -package WebPac; +package WebPAC; + +use warnings; +use strict; use Carp; +use Text::Iconv; +use Config::IniFiles; +use XML::Simple; +use Template; +use Log::Log4perl qw(get_logger :levels); + +use Data::Dumper; + +#my $LOOKUP_REGEX = '\[[^\[\]]+\]'; +#my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]'; +my $LOOKUP_REGEX = 'lookup{[^\{\}]+}'; +my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}'; =head1 NAME -WebPac - base class for WebPac +WebPAC - base class for WebPAC =head1 DESCRIPTION -This class does basic thing for WebPac. +This module implements methods used by WebPAC. =head1 METHODS =head2 new -This will create new instance of WebPac using configuration specified by C. +This will create new instance of WebPAC using configuration specified by C. - my $webpac = new WebPac( + my $webpac = new WebPAC( config_file => 'name.conf', [code_page => 'ISO-8859-2',] ); Default C is C. +It will also read configuration files +C (used by indexer and Web font-end) +and configuration file specified by C +which describes databases to be indexed. + =cut +# mapping between data type and tag which specify +# format in XML file +my %type2tag = ( + 'isis' => 'isis', +# 'excel' => 'column', +# 'marc' => 'marc', +# 'feed' => 'feed' +); + sub new { my $class = shift; my $self = {@_}; bless($self, $class); + my $log_file = $self->{'log'} || "log.conf"; + Log::Log4perl->init($log_file); + + my $log = $self->_get_logger(); + # fill in default values # output codepage $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'}); - return $self; -} - -=head2 read_global_config + # + # read global.conf + # + $log->debug("read 'global.conf'"); -Read global configuration (used by indexer and Web font-end) - -=cut - -sub read_global_config { - my $self = shift; - - $self->{global_config_file} = new Config::IniFiles( -file => 'global.conf' ) || croak "can't open 'global.conf'"; + my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'"); # read global config parametars foreach my $var (qw( @@ -56,35 +83,28 @@ dbi_passwd show_progress my_unac_filter + output_template )) { - $self->{global_config}->{$var} = $self->{global_config_file}->val('global', $var); + $self->{'global_config'}->{$var} = $config->val('global', $var); } - return $self; -} - -=head2 read_indexer_config - -Read indexer configuration (specify databases, types etc.) - -=cut - -sub read_indexer_config { - my $self = shift; - - $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || croak "can't open '$self->{config_file}'"; - - # read global config parametars - foreach my $var (qw( - dbi_dbd - dbi_dsn - dbi_user - dbi_passwd - show_progress - my_unac_filter - )) { - $self->{global_config}->{$var} = $self->{global_config_file}->val('global', $var); - } + # + # read indexer config file + # + + $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'"); + + # create UTF-8 convertor for import_xml files + $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'}); + + # create Template toolkit instance + $self->{'tt'} = Template->new( + INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'), +# FILTERS => { +# 'foo' => \&foo_filter, +# }, + EVAL_PERL => 1, + ); return $self; } @@ -102,10 +122,8 @@ By default, ISIS code page is assumed to be C<852>. -If C is set, it will read just 500 records from -database in example above. - -Returns number of last record read into memory (size of database, really). +If optional parametar C is set, it will read just 500 records +from database in example above. C argument is an array of lookups to create. Each lookup must have C and C. Optional parametar C is perl code to evaluate before storing @@ -118,24 +136,36 @@ 'val' => 'v900' }, ] +Returns number of last record read into memory (size of database, really). + =cut sub open_isis { my $self = shift; my $arg = {@_}; - croak "need filename" if (! $arg->{'filename'}); + my $log = $self->_get_logger(); + + $log->logcroak("need filename") if (! $arg->{'filename'}); my $code_page = $arg->{'code_page'} || '852'; + use OpenIsis; + #$self->{'isis_code_page'} = $code_page; # create Text::Iconv object my $cp = Text::Iconv->new($code_page,$self->{'code_page'}); + $log->info("reading ISIS database '",$arg->{'filename'},"'"); + my $isis_db = OpenIsis::open($arg->{'filename'}); my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1; + $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn}); + + $log->info("processing $maxmfn records..."); + # read database for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) { @@ -158,33 +188,173 @@ push @{$self->{'data'}->{$mfn}->{$k}}, $val; } + } else { + push @{$self->{'data'}->{$mfn}->{'000'}}, $mfn; } } # create lookup + my $rec = $self->{'data'}->{$mfn}; + $self->create_lookup($rec, @{$arg->{'lookup'}}); + + } + + $self->{'current_mfn'} = 1; + + # store max mfn and return it. + return $self->{'max_mfn'} = $maxmfn; +} + +=head2 fetch_rec + +Fetch next record from database. It will also display progress bar (once +it's implemented, that is). + + my $rec = $webpac->fetch_rec; + +=cut + +sub fetch_rec { + my $self = shift; + + my $log = $self->_get_logger(); + + my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!"); + + if ($mfn > $self->{'max_mfn'}) { + $self->{'current_mfn'} = $self->{'max_mfn'}; + $log->debug("at EOF"); + return; + } + + return $self->{'data'}->{$mfn}; +} + +=head2 open_import_xml + +Read file from C directory and parse it. + + $webpac->open_import_xml(type => 'isis'); + +=cut + +sub open_import_xml { + my $self = shift; + + my $log = $self->_get_logger(); + + my $arg = {@_}; + $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'}); + + $self->{'type'} = $arg->{'type'}; + + my $type_base = $arg->{'type'}; + $type_base =~ s/_.*$//g; + + $self->{'tag'} = $type2tag{$type_base}; + + $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">"); + + my $f = "./import_xml/".$self->{'type'}.".xml"; + $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f"); + + $log->info("reading '$f'"); + + $self->{'import_xml_file'} = $f; + + $self->{'import_xml'} = XMLin($f, + ForceArray => [ $self->{'tag'}, 'config', 'format' ], + ); + + $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) }); - foreach my $i (@{$arg->{lookup}}) { - my $rec = $self->{'data'}->{$mfn}; - if ($i->{'eval'}) { - my $eval = $self->fill_in($rec,$i->{'eval'}); - my $key = $self->fill_in($rec,$i->{'key'}); - my @val = $self->fill_in($rec,$i->{'val'}); - if ($key && @val && eval $eval) { - push @{$self->{'lookup'}->{$key}}, @val; +} + +=head2 create_lookup + +Create lookup from record using lookup definition. + + $self->create_lookup($rec, @lookups); + +Called internally by C methods. + +=cut + +sub create_lookup { + my $self = shift; + + my $log = $self->_get_logger(); + + my $rec = shift || $log->logconfess("need record to create lookup"); + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + + foreach my $i (@_) { + if ($i->{'eval'}) { + my $eval = $self->fill_in($rec,$i->{'eval'}); + my $key = $self->fill_in($rec,$i->{'key'}); + my @val = $self->fill_in($rec,$i->{'val'}); + if ($key && @val && eval $eval) { + $log->debug("stored $key = ",sub { join(" | ",@val) }); + push @{$self->{'lookup'}->{$key}}, @val; + } + } else { + my $key = $self->fill_in($rec,$i->{'key'}); + my @val = $self->fill_in($rec,$i->{'val'}); + if ($key && @val) { + $log->debug("stored $key = ",sub { join(" | ",@val) }); + push @{$self->{'lookup'}->{$key}}, @val; + } + } + } +} + +=head2 get_data + +Returns value from record. + + my $text = $self->get_data(\$rec,$f,$sf,$i,\$found); + +Arguments are: +record reference C<$rec>, +field C<$f>, +optional subfiled C<$sf>, +index for repeatable values C<$i>. + +Optinal variable C<$found> will be incremeted if there +is field. + +Returns value or empty string. + +=cut + +sub get_data { + my $self = shift; + + my ($rec,$f,$sf,$i,$found) = @_; + + if ($$rec->{$f}) { + return '' if (! $$rec->{$f}->[$i]); + if ($sf && $$rec->{$f}->[$i]->{$sf}) { + $$found++ if (defined($$found)); + return $$rec->{$f}->[$i]->{$sf}; + } elsif ($$rec->{$f}->[$i]) { + $$found++ if (defined($$found)); + # it still might have subfield, just + # not specified, so we'll dump all + if ($$rec->{$f}->[$i] =~ /HASH/o) { + my $out; + foreach my $k (keys %{$$rec->{$f}->[$i]}) { + $out .= $$rec->{$f}->[$i]->{$k}." "; } + return $out; } else { - my $key = $self->fill_in($rec,$i->{'key'}); - my @val = $self->fill_in($rec,$i->{'val'}); - if ($key && @val) { - push @{$self->{'lookup'}->{$key}}, @val; - } + return $$rec->{$f}->[$i]; } } + } else { + return ''; } - - # store max mfn and return it. - return $self->{'max_mfn'} = $maxmfn; } =head2 fill_in @@ -193,56 +363,62 @@ strings with placeholders and returns string or array of with substituted values from record. - $webpac->fill_in($rec,'v250^a'); + my $text = $webpac->fill_in($rec,'v250^a'); Optional argument is ordinal number for repeatable fields. By default, -it's assume to be first repeatable field. +it's assume to be first repeatable field (fields are perl array, so first +element is 0). +Following example will read second value from repeatable field. + + my $text = $webpac->fill_in($rec,'Title: v250^a',1); + +This function B perform parsing of format to inteligenty skip +delimiters before fields which aren't used. + +This method will automatically decode UTF-8 string to local code page +if needed. =cut sub fill_in { my $self = shift; - my $rec = shift || confess "need data record"; - my $format = shift || confess "need format to parse"; + my $log = $self->_get_logger(); + + my $rec = shift || $log->logconfess("need data record"); + my $format = shift || $log->logconfess("need format to parse"); # iteration (for repeatable fields) my $i = shift || 0; # FIXME remove for speedup? - if ($rec !~ /HASH/) { - confess("need HASH as first argument!"); + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + + if (utf8::is_utf8($format)) { + $format = $self->_x($format); } my $found = 0; - # get field with subfield - sub get_sf { - my ($found,$rec,$f,$sf,$i) = @_; - if ($$rec->{$f} && $$rec->{$f}->[$i]->{$sf}) { - $$found++; - return $$rec->{$f}->[$i]->{$sf}; - } else { - return ''; - } - } - - # get field (without subfield) - sub get_nosf { - my ($found,$rec,$f,$i) = @_; - if ($$rec->{$f} && $$rec->{$f}->[$i]) { - $$found++; - return $$rec->{$f}->[$i]; - } else { - return ''; - } - } + my $eval_code; + # remove eval{...} from beginning + $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); # do actual replacement of placeholders - $format =~ s/v(\d+)\^(\w)/get_sf(\$found,\$rec,$1,$2,$i)/ges; - $format =~ s/v(\d+)/get_nosf(\$found,\$rec,$1,$i)/ges; + $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges; - if ($found) { - return $format; + if ($found) { + $log->debug("format: $format"); + if ($eval_code) { + my $eval = $self->fill_in($rec,$eval_code,$i); + return if (! $self->_eval($eval)); + } + # do we have lookups? + if ($format =~ /$LOOKUP_REGEX/o) { + $log->debug("format '$format' has lookup"); + return $self->lookup($format); + } else { + return $format; + } } else { return; } @@ -250,44 +426,451 @@ =head2 lookup -This function will perform lookups on format supplied to it. +Perform lookups on format supplied to it. + + my $text = $self->lookup('[v900]'); - my $txt = $self->lookup('[v900]'); +Lookups can be nested (like C<[d:[a:[v900]]]>). =cut sub lookup { my $self = shift; - my $tmp = shift || confess "need format"; + my $log = $self->_get_logger(); - if ($tmp =~ /\[[^\[\]]+\]/) { + my $tmp = shift || $log->logconfess("need format"); + + if ($tmp =~ /$LOOKUP_REGEX/o) { my @in = ( $tmp ); -print "##lookup $tmp\n"; + + $log->debug("lookup for: ",$tmp); + my @out; while (my $f = shift @in) { - if ($f =~ /\[([^\[\]]+)\]/) { + if ($f =~ /$LOOKUP_REGEX_SAVE/o) { my $k = $1; if ($self->{'lookup'}->{$k}) { -print "## lookup key = $k\n"; foreach my $nv (@{$self->{'lookup'}->{$k}}) { my $tmp2 = $f; - $tmp2 =~ s/\[$k\]/$nv/g; + $tmp2 =~ s/lookup{$k}/$nv/g; push @in, $tmp2; -print "## lookup in => $tmp2\n"; } } else { undef $f; } } elsif ($f) { push @out, $f; -print "## lookup out => $f\n"; } } + $log->logconfess("return is array and it's not expected!") unless wantarray; return @out; } else { return $tmp; } } +=head2 parse + +Perform smart parsing of string, skipping delimiters for fields which aren't +defined. It can also eval code in format starting with C and +return output or nothing depending on eval code. + + my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i); + +=cut + +sub parse { + my $self = shift; + + my ($rec, $format_utf8, $i) = @_; + + return if (! $format_utf8); + + my $log = $self->_get_logger(); + + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'}); + + $i = 0 if (! $i); + + my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'}); + + my @out; + + $log->debug("format: $format"); + + my $eval_code; + # remove eval{...} from beginning + $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); + + my $prefix; + my $all_found=0; + + while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) { + + my $del = $1 || ''; + $prefix ||= $del if ($all_found == 0); + + my $found = 0; + my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found); + + if ($found) { + push @out, $del; + push @out, $tmp; + $all_found += $found; + } + } + + return if (! $all_found); + + my $out = join('',@out); + + if ($out) { + # add rest of format (suffix) + $out .= $format; + + # add prefix if not there + $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/); + + $log->debug("result: $out"); + } + + if ($eval_code) { + my $eval = $self->fill_in($rec,$eval_code,$i); + $log->debug("about to eval{",$eval,"} format: $out"); + return if (! $self->_eval($eval)); + } + + return $out; +} + +=head2 parse_to_arr + +Similar to C, but returns array of all repeatable fields + + my @arr = $webpac->parse_to_arr($rec,'v250^a'); + +=cut + +sub parse_to_arr { + my $self = shift; + + my ($rec, $format_utf8) = @_; + + my $log = $self->_get_logger(); + + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + return if (! $format_utf8); + + my $i = 0; + my @arr; + + while (my $v = $self->parse($rec,$format_utf8,$i++)) { + push @arr, $v; + } + + $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr); + + return @arr; +} + +=head2 fill_in_to_arr + +Similar to C, but returns array of all repeatable fields. Usable +for fields which have lookups, so they shouldn't be parsed but rather +Ced. + + my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]'); + +=cut + +sub fill_in_to_arr { + my $self = shift; + + my ($rec, $format_utf8) = @_; + + my $log = $self->_get_logger(); + + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + return if (! $format_utf8); + + my $i = 0; + my @arr; + + while (my @v = $self->fill_in($rec,$format_utf8,$i++)) { + push @arr, @v; + } + + $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr); + + return @arr; +} + + +=head2 data_structure + +Create in-memory data structure which represents layout from C. +It is used later to produce output. + + my @ds = $webpac->data_structure($rec); + +This method will also set C<$webpac->{'currnet_filename'}> if there is + tag in C. + +=cut + +sub data_structure { + my $self = shift; + + my $log = $self->_get_logger(); + + my $rec = shift; + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + + undef $self->{'currnet_filename'}; + + my @sorted_tags; + if ($self->{tags_by_order}) { + @sorted_tags = @{$self->{tags_by_order}}; + } else { + @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}}; + $self->{tags_by_order} = \@sorted_tags; + } + + my @ds; + + $log->debug("tags: ",sub { join(", ",@sorted_tags) }); + + foreach my $field (@sorted_tags) { + + my $row; + +#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}); + + foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) { + my $format = $tag->{'value'} || $tag->{'content'}; + + $log->debug("format: $format"); + + my @v; + if ($format =~ /$LOOKUP_REGEX/o) { + @v = $self->fill_in_to_arr($rec,$format); + } else { + @v = $self->parse_to_arr($rec,$format); + } + next if (! @v); + + # use format? + if ($tag->{'format_name'}) { + @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v; + } + + # does tag have type? + if ($tag->{'type'}) { + push @{$row->{$tag->{'type'}}}, @v; + } else { + push @{$row->{'display'}}, @v; + push @{$row->{'swish'}}, @v; + } + + if ($field eq 'filename') { + $self->{'current_filename'} = join('',@v); + $log->debug("filename: ",$self->{'current_filename'}); + } + + } + + if ($row) { + $row->{'tag'} = $field; + + # TODO: name_sigular, name_plural + my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'}; + $row->{'name'} = $name ? $self->_x($name) : $field; + + push @ds, $row; + + $log->debug("row $field: ",sub { Dumper($row) }); + } + + } + + return @ds; + +} + +=head2 output + +Create output from in-memory data structure using Template Toolkit template. + +my $text = $webpac->output( template => 'text.tt', data => @ds ); + +=cut + +sub output { + my $self = shift; + + my $args = {@_}; + + my $log = $self->_get_logger(); + + $log->logconfess("need template name") if (! $args->{'template'}); + $log->logconfess("need data array") if (! $args->{'data'}); + + my $out; + + $self->{'tt'}->process( + $args->{'template'}, + $args, + \$out + ) || confess $self->{'tt'}->error(); + + return $out; +} + +=head2 apply_format + +Apply format specified in tag with C and +C. + + my $text = $webpac->apply_format($format_name,$format_delimiter,$data); + +Formats can contain C if you need them. + +=cut + +sub apply_format { + my $self = shift; + + my ($name,$delimiter,$data) = @_; + + my $log = $self->_get_logger(); + + if (! $self->{'import_xml'}->{'format'}->{$name}) { + $log->warn(" is not defined in ",$self->{'import_xml_file'}); + return $data; + } + + $log->warn("no delimiter for format $name") if (! $delimiter); + + my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'"); + + my @data = split(/\Q$delimiter\E/, $data); + + my $out = sprintf($format, @data); + $log->debug("using format $name [$format] on $data to produce: $out"); + + if ($out =~ m/$LOOKUP_REGEX/o) { + return $self->lookup($out); + } else { + return $out; + } + +} + + +# +# +# + +=head1 INTERNAL METHODS + +Here is a quick list of internal methods, mostly useful to turn debugging +on them (see L below for explanation). + +=cut + +=head2 _eval + +Internal function to eval code without C. + +=cut + +sub _eval { + my $self = shift; + + my $code = shift || return; + + my $log = $self->_get_logger(); + + no strict 'subs'; + my $ret = eval $code; + if ($@) { + $log->error("problem with eval code [$code]: $@"); + } + + $log->debug("eval: ",$code," [",$ret,"]"); + + return $ret || 0; +} + +=head2 _sort_by_order + +Sort xml tags data structure accoding to C attribute. + +=cut + +sub _sort_by_order { + my $self = shift; + + my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} || + $self->{'import_xml'}->{'indexer'}->{$a}; + my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} || + $self->{'import_xml'}->{'indexer'}->{$b}; + + return $va <=> $vb; +} + +=head2 _get_logger + +Get C object with a twist: domains are defined for each +method + + my $log = $webpac->_get_logger(); + +=cut + +sub _get_logger { + my $self = shift; + + my $name = (caller(1))[3] || caller; + return get_logger($name); +} + +=head2 _x + +Convert string from UTF-8 to code page defined in C. + + my $text = $webpac->_x('utf8 text'); + +=cut + +sub _x { + my $self = shift; + my $utf8 = shift || return; + + return $self->{'utf2cp'}->convert($utf8) || + $self->_get_logger()->logwarn("can't convert '$utf8'"); +} + +# +# +# + +=head1 LOGGING + +Logging in WebPAC is performed by L with config file +C. + +Methods defined above have different levels of logging, so +it's descriptions will be useful to turn (mostry B logging) on +or off to see why WabPAC isn't perforing as you expect it (it might even +be a bug!). + +B. To repeat, you can +also use method names, and not only classes (which are just few) +to filter logging. + +=cut + 1;