--- trunk2/lib/WebPAC.pm 2004/06/16 20:05:19 363 +++ trunk2/lib/WebPAC.pm 2004/07/24 13:48:08 398 @@ -1,12 +1,22 @@ package WebPAC; +use warnings; +use strict; + use Carp; use Text::Iconv; use Config::IniFiles; use XML::Simple; +use Template; +use Log::Log4perl qw(get_logger :levels); use Data::Dumper; +#my $LOOKUP_REGEX = '\[[^\[\]]+\]'; +#my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]'; +my $LOOKUP_REGEX = 'lookup{[^\{\}]+}'; +my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}'; + =head1 NAME WebPAC - base class for WebPAC @@ -49,6 +59,11 @@ my $self = {@_}; bless($self, $class); + my $log_file = $self->{'log'} || "log.conf"; + Log::Log4perl->init($log_file); + + my $log = $self->_get_logger(); + # fill in default values # output codepage $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'}); @@ -56,8 +71,9 @@ # # read global.conf # + $log->debug("read 'global.conf'"); - $self->{global_config_file} = new Config::IniFiles( -file => 'global.conf' ) || croak "can't open 'global.conf'"; + my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'"); # read global config parametars foreach my $var (qw( @@ -67,15 +83,28 @@ dbi_passwd show_progress my_unac_filter + output_template )) { - $self->{global_config}->{$var} = $self->{global_config_file}->val('global', $var); + $self->{'global_config'}->{$var} = $config->val('global', $var); } # # read indexer config file # - $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || croak "can't open '$self->{config_file}'"; + $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'"); + + # create UTF-8 convertor for import_xml files + $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'}); + + # create Template toolkit instance + $self->{'tt'} = Template->new( + INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'), +# FILTERS => { +# 'foo' => \&foo_filter, +# }, + EVAL_PERL => 1, + ); return $self; } @@ -96,8 +125,6 @@ If optional parametar C is set, it will read just 500 records from database in example above. -Returns number of last record read into memory (size of database, really). - C argument is an array of lookups to create. Each lookup must have C and C. Optional parametar C is perl code to evaluate before storing value in index. @@ -109,13 +136,17 @@ 'val' => 'v900' }, ] +Returns number of last record read into memory (size of database, really). + =cut sub open_isis { my $self = shift; my $arg = {@_}; - croak "need filename" if (! $arg->{'filename'}); + my $log = $self->_get_logger(); + + $log->logcroak("need filename") if (! $arg->{'filename'}); my $code_page = $arg->{'code_page'} || '852'; use OpenIsis; @@ -125,7 +156,7 @@ # create Text::Iconv object my $cp = Text::Iconv->new($code_page,$self->{'code_page'}); - print STDERR "reading ISIS database '",$arg->{'filename'},"'\n" if ($self->{'debug'}); + $log->info("reading ISIS database '",$arg->{'filename'},"'"); my $isis_db = OpenIsis::open($arg->{'filename'}); @@ -133,7 +164,7 @@ $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn}); - print STDERR "processing $maxmfn records...\n" if ($self->{'debug'}); + $log->info("processing $maxmfn records..."); # read database for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) { @@ -157,6 +188,8 @@ push @{$self->{'data'}->{$mfn}->{$k}}, $val; } + } else { + push @{$self->{'data'}->{$mfn}->{'000'}}, $mfn; } } @@ -165,9 +198,12 @@ my $rec = $self->{'data'}->{$mfn}; $self->create_lookup($rec, @{$arg->{'lookup'}}); + $self->progress_bar($mfn,$maxmfn); + } $self->{'current_mfn'} = 1; + $self->{'last_pcnt'} = 0; # store max mfn and return it. return $self->{'max_mfn'} = $maxmfn; @@ -185,16 +221,55 @@ sub fetch_rec { my $self = shift; - my $mfn = $self->{'current_mfn'}++ || confess "it seems that you didn't load database!"; + my $log = $self->_get_logger(); + + my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!"); if ($mfn > $self->{'max_mfn'}) { $self->{'current_mfn'} = $self->{'max_mfn'}; + $log->debug("at EOF"); return; } + $self->progress_bar($mfn,$self->{'max_mfn'}); + return $self->{'data'}->{$mfn}; } +=head2 progress_bar + +Draw progress bar on STDERR. + + $webpac->progress_bar($current, $max); + +=cut + +sub progress_bar { + my $self = shift; + + my ($curr,$max) = @_; + + my $log = $self->_get_logger(); + + $log->logconfess("no current value!") if (! $curr); + $log->logconfess("no maximum value!") if (! $max); + + if ($curr > $max) { + $max = $curr; + $log->debug("overflow to $curr"); + } + + $self->{'last_pcnt'} ||= 1; + + $self->{'last_pcnt'} = $curr if ($curr < $self->{'last_pcnt'}); + + my $p = int($curr * 100 / $max); + if ($p != $self->{'last_pcnt'}) { + printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$curr,$max,"=" x ($p/2).">", $p ); + $self->{'last_pcnt'} = $p; + } +} + =head2 open_import_xml Read file from C directory and parse it. @@ -206,25 +281,32 @@ sub open_import_xml { my $self = shift; + my $log = $self->_get_logger(); + my $arg = {@_}; - confess "need type to load file from import_xml/" if (! $arg->{'type'}); + $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'}); - my $type = $arg->{'type'}; + $self->{'type'} = $arg->{'type'}; - my $type_base = $type; + my $type_base = $arg->{'type'}; $type_base =~ s/_.*$//g; - my $f = "./import_xml/$type.xml"; - confess "import_xml file '$f' doesn't exist!" if (! -e "$f"); + $self->{'tag'} = $type2tag{$type_base}; + + $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">"); + + my $f = "./import_xml/".$self->{'type'}.".xml"; + $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f"); - print STDERR "reading '$f'\n" if ($self->{'debug'}); + $log->info("reading '$f'"); + + $self->{'import_xml_file'} = $f; $self->{'import_xml'} = XMLin($f, - ForceArray => [ $type2tag{$type_base}, 'config', 'format' ], - ForceContent => 1 + ForceArray => [ $self->{'tag'}, 'config', 'format' ], ); - print Dumper($self->{'import_xml'}); + $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) }); } @@ -232,13 +314,19 @@ Create lookup from record using lookup definition. + $self->create_lookup($rec, @lookups); + +Called internally by C methods. + =cut sub create_lookup { my $self = shift; - my $rec = shift || confess "need record to create lookup"; - confess("need HASH as first argument!") if ($rec !~ /HASH/o); + my $log = $self->_get_logger(); + + my $rec = shift || $log->logconfess("need record to create lookup"); + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); foreach my $i (@_) { if ($i->{'eval'}) { @@ -246,12 +334,14 @@ my $key = $self->fill_in($rec,$i->{'key'}); my @val = $self->fill_in($rec,$i->{'val'}); if ($key && @val && eval $eval) { + $log->debug("stored $key = ",sub { join(" | ",@val) }); push @{$self->{'lookup'}->{$key}}, @val; } } else { my $key = $self->fill_in($rec,$i->{'key'}); my @val = $self->fill_in($rec,$i->{'val'}); if ($key && @val) { + $log->debug("stored $key = ",sub { join(" | ",@val) }); push @{$self->{'lookup'}->{$key}}, @val; } } @@ -262,7 +352,7 @@ Returns value from record. - $self->get_data(\$rec,$f,$sf,$i,\$found); + my $text = $self->get_data(\$rec,$f,$sf,$i,\$found); Arguments are: record reference C<$rec>, @@ -270,7 +360,7 @@ optional subfiled C<$sf>, index for repeatable values C<$i>. -Optinal variable C<$found> will be incremeted if thre +Optinal variable C<$found> will be incremeted if there is field. Returns value or empty string. @@ -281,13 +371,26 @@ my $self = shift; my ($rec,$f,$sf,$i,$found) = @_; + if ($$rec->{$f}) { + return '' if (! $$rec->{$f}->[$i]); + no strict 'refs'; if ($sf && $$rec->{$f}->[$i]->{$sf}) { $$found++ if (defined($$found)); return $$rec->{$f}->[$i]->{$sf}; } elsif ($$rec->{$f}->[$i]) { $$found++ if (defined($$found)); - return $$rec->{$f}->[$i]; + # it still might have subfield, just + # not specified, so we'll dump all + if ($$rec->{$f}->[$i] =~ /HASH/o) { + my $out; + foreach my $k (keys %{$$rec->{$f}->[$i]}) { + $out .= $$rec->{$f}->[$i]->{$k}." "; + } + return $out; + } else { + return $$rec->{$f}->[$i]; + } } } else { return ''; @@ -300,30 +403,39 @@ strings with placeholders and returns string or array of with substituted values from record. - $webpac->fill_in($rec,'v250^a'); + my $text = $webpac->fill_in($rec,'v250^a'); Optional argument is ordinal number for repeatable fields. By default, it's assume to be first repeatable field (fields are perl array, so first element is 0). Following example will read second value from repeatable field. - $webpac->fill_in($rec,'Title: v250^a',1); + my $text = $webpac->fill_in($rec,'Title: v250^a',1); This function B perform parsing of format to inteligenty skip delimiters before fields which aren't used. +This method will automatically decode UTF-8 string to local code page +if needed. + =cut sub fill_in { my $self = shift; - my $rec = shift || confess "need data record"; - my $format = shift || confess "need format to parse"; + my $log = $self->_get_logger(); + + my $rec = shift || $log->logconfess("need data record"); + my $format = shift || $log->logconfess("need format to parse"); # iteration (for repeatable fields) my $i = shift || 0; # FIXME remove for speedup? - confess("need HASH as first argument!") if ($rec !~ /HASH/o); + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + + if (utf8::is_utf8($format)) { + $format = $self->_x($format); + } my $found = 0; @@ -332,15 +444,17 @@ $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); # do actual replacement of placeholders - $format =~ s/v(\d+)(?:\^(\w))*/$self->get_data(\$rec,$1,$2,$i,\$found)/ges; + $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges; if ($found) { + $log->debug("format: $format"); if ($eval_code) { my $eval = $self->fill_in($rec,$eval_code,$i); - return if (! eval $eval); + return if (! $self->_eval($eval)); } # do we have lookups? - if ($format =~ /\[[^\[\]]+\]/o) { + if ($format =~ /$LOOKUP_REGEX/o) { + $log->debug("format '$format' has lookup"); return $self->lookup($format); } else { return $format; @@ -354,7 +468,7 @@ Perform lookups on format supplied to it. - my $txt = $self->lookup('[v900]'); + my $text = $self->lookup('[v900]'); Lookups can be nested (like C<[d:[a:[v900]]]>). @@ -363,31 +477,33 @@ sub lookup { my $self = shift; - my $tmp = shift || confess "need format"; + my $log = $self->_get_logger(); - if ($tmp =~ /\[[^\[\]]+\]/o) { + my $tmp = shift || $log->logconfess("need format"); + + if ($tmp =~ /$LOOKUP_REGEX/o) { my @in = ( $tmp ); -#print "##lookup $tmp\n"; + + $log->debug("lookup for: ",$tmp); + my @out; while (my $f = shift @in) { - if ($f =~ /\[([^\[\]]+)\]/) { + if ($f =~ /$LOOKUP_REGEX_SAVE/o) { my $k = $1; if ($self->{'lookup'}->{$k}) { -#print "## lookup key = $k\n"; foreach my $nv (@{$self->{'lookup'}->{$k}}) { my $tmp2 = $f; - $tmp2 =~ s/\[$k\]/$nv/g; + $tmp2 =~ s/lookup{$k}/$nv/g; push @in, $tmp2; -#print "## lookup in => $tmp2\n"; } } else { undef $f; } } elsif ($f) { push @out, $f; -#print "## lookup out => $f\n"; } } + $log->logconfess("return is array and it's not expected!") unless wantarray; return @out; } else { return $tmp; @@ -400,21 +516,30 @@ defined. It can also eval code in format starting with C and return output or nothing depending on eval code. - $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i); + my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i); =cut sub parse { my $self = shift; - my ($rec, $format, $i) = @_; + my ($rec, $format_utf8, $i) = @_; - confess("need HASH as first argument!") if ($rec !~ /HASH/o); + return if (! $format_utf8); + + my $log = $self->_get_logger(); + + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'}); $i = 0 if (! $i); + my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'}); + my @out; + $log->debug("format: $format"); + my $eval_code; # remove eval{...} from beginning $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); @@ -422,9 +547,7 @@ my $prefix; my $all_found=0; -#print "## $format\n"; - while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))*//s) { -#print "## [ $1 | $2 | $3 ] $format\n"; + while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) { my $del = $1 || ''; $prefix ||= $del if ($all_found == 0); @@ -441,17 +564,360 @@ return if (! $all_found); - my $out = join('',@out) . $format; + my $out = join('',@out); + + if ($out) { + # add rest of format (suffix) + $out .= $format; + + # add prefix if not there + $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/); + + $log->debug("result: $out"); + } - # add prefix if not there - $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/); - if ($eval_code) { my $eval = $self->fill_in($rec,$eval_code,$i); - return if (! eval $eval); + $log->debug("about to eval{",$eval,"} format: $out"); + return if (! $self->_eval($eval)); } return $out; } +=head2 parse_to_arr + +Similar to C, but returns array of all repeatable fields + + my @arr = $webpac->parse_to_arr($rec,'v250^a'); + +=cut + +sub parse_to_arr { + my $self = shift; + + my ($rec, $format_utf8) = @_; + + my $log = $self->_get_logger(); + + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + return if (! $format_utf8); + + my $i = 0; + my @arr; + + while (my $v = $self->parse($rec,$format_utf8,$i++)) { + push @arr, $v; + } + + $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr); + + return @arr; +} + +=head2 fill_in_to_arr + +Similar to C, but returns array of all repeatable fields. Usable +for fields which have lookups, so they shouldn't be parsed but rather +Ced. + + my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]'); + +=cut + +sub fill_in_to_arr { + my $self = shift; + + my ($rec, $format_utf8) = @_; + + my $log = $self->_get_logger(); + + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + return if (! $format_utf8); + + my $i = 0; + my @arr; + + while (my @v = $self->fill_in($rec,$format_utf8,$i++)) { + push @arr, @v; + } + + $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr); + + return @arr; +} + + +=head2 data_structure + +Create in-memory data structure which represents layout from C. +It is used later to produce output. + + my @ds = $webpac->data_structure($rec); + +This method will also set C<$webpac->{'currnet_filename'}> if there is + tag in C and C<$webpac->{'headline'}> if there is + tag. + +=cut + +sub data_structure { + my $self = shift; + + my $log = $self->_get_logger(); + + my $rec = shift; + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + + undef $self->{'currnet_filename'}; + undef $self->{'headline'}; + + my @sorted_tags; + if ($self->{tags_by_order}) { + @sorted_tags = @{$self->{tags_by_order}}; + } else { + @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}}; + $self->{tags_by_order} = \@sorted_tags; + } + + my @ds; + + $log->debug("tags: ",sub { join(", ",@sorted_tags) }); + + foreach my $field (@sorted_tags) { + + my $row; + +#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}); + + foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) { + my $format = $tag->{'value'} || $tag->{'content'}; + + $log->debug("format: $format"); + + my @v; + if ($format =~ /$LOOKUP_REGEX/o) { + @v = $self->fill_in_to_arr($rec,$format); + } else { + @v = $self->parse_to_arr($rec,$format); + } + next if (! @v); + + # use format? + if ($tag->{'format_name'}) { + @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v; + } + + if ($field eq 'filename') { + $self->{'current_filename'} = join('',@v); + $log->debug("filename: ",$self->{'current_filename'}); + } elsif ($field eq 'headline') { + $self->{'headline'} .= join('',@v); + $log->debug("headline: ",$self->{'headline'}); + next; # don't return headline in data_structure! + } + + # does tag have type? + if ($tag->{'type'}) { + push @{$row->{$tag->{'type'}}}, @v; + } else { + push @{$row->{'display'}}, @v; + push @{$row->{'swish'}}, @v; + } + + + } + + if ($row) { + $row->{'tag'} = $field; + + # TODO: name_sigular, name_plural + my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'}; + $row->{'name'} = $name ? $self->_x($name) : $field; + + push @ds, $row; + + $log->debug("row $field: ",sub { Dumper($row) }); + } + + } + + return @ds; + +} + +=head2 output + +Create output from in-memory data structure using Template Toolkit template. + +my $text = $webpac->output( template => 'text.tt', data => @ds ); + +=cut + +sub output { + my $self = shift; + + my $args = {@_}; + + my $log = $self->_get_logger(); + + $log->logconfess("need template name") if (! $args->{'template'}); + $log->logconfess("need data array") if (! $args->{'data'}); + + my $out; + + $self->{'tt'}->process( + $args->{'template'}, + $args, + \$out + ) || confess $self->{'tt'}->error(); + + return $out; +} + +=head2 apply_format + +Apply format specified in tag with C and +C. + + my $text = $webpac->apply_format($format_name,$format_delimiter,$data); + +Formats can contain C if you need them. + +=cut + +sub apply_format { + my $self = shift; + + my ($name,$delimiter,$data) = @_; + + my $log = $self->_get_logger(); + + if (! $self->{'import_xml'}->{'format'}->{$name}) { + $log->warn(" is not defined in ",$self->{'import_xml_file'}); + return $data; + } + + $log->warn("no delimiter for format $name") if (! $delimiter); + + my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'"); + + my @data = split(/\Q$delimiter\E/, $data); + + my $out = sprintf($format, @data); + $log->debug("using format $name [$format] on $data to produce: $out"); + + if ($out =~ m/$LOOKUP_REGEX/o) { + return $self->lookup($out); + } else { + return $out; + } + +} + + +# +# +# + +=head1 INTERNAL METHODS + +Here is a quick list of internal methods, mostly useful to turn debugging +on them (see L below for explanation). + +=cut + +=head2 _eval + +Internal function to eval code without C. + +=cut + +sub _eval { + my $self = shift; + + my $code = shift || return; + + my $log = $self->_get_logger(); + + no strict 'subs'; + my $ret = eval $code; + if ($@) { + $log->error("problem with eval code [$code]: $@"); + } + + $log->debug("eval: ",$code," [",$ret,"]"); + + return $ret || 0; +} + +=head2 _sort_by_order + +Sort xml tags data structure accoding to C attribute. + +=cut + +sub _sort_by_order { + my $self = shift; + + my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} || + $self->{'import_xml'}->{'indexer'}->{$a}; + my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} || + $self->{'import_xml'}->{'indexer'}->{$b}; + + return $va <=> $vb; +} + +=head2 _get_logger + +Get C object with a twist: domains are defined for each +method + + my $log = $webpac->_get_logger(); + +=cut + +sub _get_logger { + my $self = shift; + + my $name = (caller(1))[3] || caller; + return get_logger($name); +} + +=head2 _x + +Convert string from UTF-8 to code page defined in C. + + my $text = $webpac->_x('utf8 text'); + +=cut + +sub _x { + my $self = shift; + my $utf8 = shift || return; + + return $self->{'utf2cp'}->convert($utf8) || + $self->_get_logger()->logwarn("can't convert '$utf8'"); +} + +# +# +# + +=head1 LOGGING + +Logging in WebPAC is performed by L with config file +C. + +Methods defined above have different levels of logging, so +it's descriptions will be useful to turn (mostry B logging) on +or off to see why WabPAC isn't perforing as you expect it (it might even +be a bug!). + +B. To repeat, you can +also use method names, and not only classes (which are just few) +to filter logging. + +=cut + 1;