--- trunk2/lib/WebPAC.pm 2004/06/17 20:44:45 371 +++ trunk2/lib/WebPAC.pm 2004/09/10 22:24:42 421 @@ -8,9 +8,15 @@ use Config::IniFiles; use XML::Simple; use Template; +use Log::Log4perl qw(get_logger :levels); use Data::Dumper; +#my $LOOKUP_REGEX = '\[[^\[\]]+\]'; +#my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]'; +my $LOOKUP_REGEX = 'lookup{[^\{\}]+}'; +my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}'; + =head1 NAME WebPAC - base class for WebPAC @@ -23,20 +29,38 @@ =head2 new -This will create new instance of WebPAC using configuration specified by C. +Create new instance of WebPAC using configuration specified by C. my $webpac = new WebPAC( config_file => 'name.conf', [code_page => 'ISO-8859-2',] + [low_mem => 1,] ); Default C is C. -It will also read configuration files +This method will also read configuration files C (used by indexer and Web font-end) and configuration file specified by C which describes databases to be indexed. +C options is double-edged sword. If enabled, WebPAC +will run on memory constraint machines (which doesn't have enough +physical RAM to create memory structure for whole ISIS database). + +If your machine has 512Mb or more and database is around 10000 records, +memory shouldn't be an issue. If you don't have enough physical RAM, you +might consider using virtual memory (if your operating system is handling it +well, like on FreeBSD or Linux) instead of dropping to L to handle +parsed structure of ISIS database. + +However, when WebPAC is running on desktop machines (or laptops :-), it's +highly undesireable for system to start swapping. Using C option can +reduce WecPAC memory usage to 16Mb for same database with lookup fields and +sorted indexes which stay in RAM. Performance will suffer, but memory usage +will really be minimal. It might be also more confortable to run WebPAC reniced +on those machines. + =cut # mapping between data type and tag which specify @@ -53,6 +77,11 @@ my $self = {@_}; bless($self, $class); + my $log_file = $self->{'log'} || "log.conf"; + Log::Log4perl->init($log_file); + + my $log = $self->_get_logger(); + # fill in default values # output codepage $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'}); @@ -60,8 +89,9 @@ # # read global.conf # + $log->debug("read 'global.conf'"); - my $config = new Config::IniFiles( -file => 'global.conf' ) || croak "can't open 'global.conf'"; + my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'"); # read global config parametars foreach my $var (qw( @@ -80,7 +110,7 @@ # read indexer config file # - $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || croak "can't open '$self->{config_file}'"; + $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'"); # create UTF-8 convertor for import_xml files $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'}); @@ -94,6 +124,32 @@ EVAL_PERL => 1, ); + # running with low_mem flag? well, use DBM::Deep then. + if ($self->{'low_mem'}) { + $log->info("running with low_mem which impacts performance (<64 Mb memory usage)"); + + my $db_file = "data.db"; + + if (-e $db_file) { + unlink $db_file or $log->logdie("can't remove '$db_file' from last run"); + $log->debug("removed '$db_file' from last run"); + } + + use DBM::Deep; + + my $db = new DBM::Deep $db_file; + + $log->logdie("DBM::Deep error: $!") unless ($db); + + if ($db->error()) { + $log->logdie("can't open '$db_file' under low_mem: ",$db->error()); + } else { + $log->debug("using file $db_file for DBM::Deep"); + } + + $self->{'db'} = $db; + } + return $self; } @@ -132,9 +188,15 @@ my $self = shift; my $arg = {@_}; - croak "need filename" if (! $arg->{'filename'}); + my $log = $self->_get_logger(); + + $log->logcroak("need filename") if (! $arg->{'filename'}); my $code_page = $arg->{'code_page'} || '852'; + # store data in object + $self->{'isis_filename'} = $arg->{'filename'}; + $self->{'isis_code_page'} = $code_page; + use OpenIsis; #$self->{'isis_code_page'} = $code_page; @@ -142,7 +204,8 @@ # create Text::Iconv object my $cp = Text::Iconv->new($code_page,$self->{'code_page'}); - print STDERR "reading ISIS database '",$arg->{'filename'},"'\n" if ($self->{'debug'}); + $log->info("reading ISIS database '",$arg->{'filename'},"'"); + $log->debug("isis code page: $code_page"); my $isis_db = OpenIsis::open($arg->{'filename'}); @@ -150,11 +213,16 @@ $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn}); - print STDERR "processing $maxmfn records...\n" if ($self->{'debug'}); + $log->info("processing $maxmfn records..."); # read database for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) { + + $log->debug("mfn: $mfn\n"); + + my $rec; + # read record my $row = OpenIsis::read( $isis_db, $mfn ); foreach my $k (keys %{$row}) { @@ -172,19 +240,34 @@ $val = $l; } - push @{$self->{'data'}->{$mfn}->{$k}}, $val; + push @{$rec->{$k}}, $val; } + } else { + push @{$rec->{'000'}}, $mfn; } } + $log->confess("record $mfn empty?") unless ($rec); + + # store + if ($self->{'low_mem'}) { + $self->{'db'}->put($mfn, $rec); + } else { + $self->{'data'}->{$mfn} = $rec; + } + # create lookup - my $rec = $self->{'data'}->{$mfn}; $self->create_lookup($rec, @{$arg->{'lookup'}}); + $self->progress_bar($mfn,$maxmfn); + } $self->{'current_mfn'} = 1; + $self->{'last_pcnt'} = 0; + + $log->debug("max mfn: $maxmfn"); # store max mfn and return it. return $self->{'max_mfn'} = $maxmfn; @@ -202,14 +285,58 @@ sub fetch_rec { my $self = shift; - my $mfn = $self->{'current_mfn'}++ || confess "it seems that you didn't load database!"; + my $log = $self->_get_logger(); + + my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!"); if ($mfn > $self->{'max_mfn'}) { $self->{'current_mfn'} = $self->{'max_mfn'}; + $log->debug("at EOF"); return; } - return $self->{'data'}->{$mfn}; + $self->progress_bar($mfn,$self->{'max_mfn'}); + + if ($self->{'low_mem'}) { + return $self->{'db'}->get($mfn); + } else { + return $self->{'data'}->{$mfn}; + } +} + +=head2 progress_bar + +Draw progress bar on STDERR. + + $webpac->progress_bar($current, $max); + +=cut + +sub progress_bar { + my $self = shift; + + my ($curr,$max) = @_; + + my $log = $self->_get_logger(); + + $log->logconfess("no current value!") if (! $curr); + $log->logconfess("no maximum value!") if (! $max); + + if ($curr > $max) { + $max = $curr; + $log->debug("overflow to $curr"); + } + + $self->{'last_pcnt'} ||= 1; + + $self->{'last_pcnt'} = $curr if ($curr < $self->{'last_pcnt'}); + + my $p = int($curr * 100 / $max); + if ($p != $self->{'last_pcnt'}) { + printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$curr,$max,"=" x ($p/2).">", $p ); + $self->{'last_pcnt'} = $p; + } + print STDERR "\n" if ($p == 100); } =head2 open_import_xml @@ -223,8 +350,10 @@ sub open_import_xml { my $self = shift; + my $log = $self->_get_logger(); + my $arg = {@_}; - confess "need type to load file from import_xml/" if (! $arg->{'type'}); + $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'}); $self->{'type'} = $arg->{'type'}; @@ -233,18 +362,21 @@ $self->{'tag'} = $type2tag{$type_base}; - print STDERR "using type '",$self->{'type'},"' tag <",$self->{'tag'},">\n" if ($self->{'debug'}); + $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">"); my $f = "./import_xml/".$self->{'type'}.".xml"; - confess "import_xml file '$f' doesn't exist!" if (! -e "$f"); + $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f"); + + $log->info("reading '$f'"); - print STDERR "reading '$f'\n" if ($self->{'debug'}); + $self->{'import_xml_file'} = $f; $self->{'import_xml'} = XMLin($f, ForceArray => [ $self->{'tag'}, 'config', 'format' ], - ForceContent => 1 ); + $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) }); + } =head2 create_lookup @@ -260,23 +392,29 @@ sub create_lookup { my $self = shift; - my $rec = shift || confess "need record to create lookup"; - confess("need HASH as first argument!") if ($rec !~ /HASH/o); + my $log = $self->_get_logger(); + + my $rec = shift || $log->logconfess("need record to create lookup"); + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); foreach my $i (@_) { - if ($i->{'eval'}) { - my $eval = $self->fill_in($rec,$i->{'eval'}); - my $key = $self->fill_in($rec,$i->{'key'}); - my @val = $self->fill_in($rec,$i->{'val'}); - if ($key && @val && eval $eval) { + $log->logconfess("need key") unless defined($i->{'key'}); + $log->logconfess("need val") unless defined($i->{'val'}); + + if (defined($i->{'eval'})) { + # eval first, so we can skip fill_in for key and val + my $eval = $self->fill_in($rec,$i->{'eval'}) || next; + if ($self->_eval($eval)) { + my $key = $self->fill_in($rec,$i->{'key'}) || next; + my @val = $self->fill_in($rec,$i->{'val'}) || next; + $log->debug("stored $key = ",sub { join(" | ",@val) }); push @{$self->{'lookup'}->{$key}}, @val; } } else { - my $key = $self->fill_in($rec,$i->{'key'}); - my @val = $self->fill_in($rec,$i->{'val'}); - if ($key && @val) { - push @{$self->{'lookup'}->{$key}}, @val; - } + my $key = $self->fill_in($rec,$i->{'key'}) || next; + my @val = $self->fill_in($rec,$i->{'val'}) || next; + $log->debug("stored $key = ",sub { join(" | ",@val) }); + push @{$self->{'lookup'}->{$key}}, @val; } } } @@ -307,6 +445,7 @@ if ($$rec->{$f}) { return '' if (! $$rec->{$f}->[$i]); + no strict 'refs'; if ($sf && $$rec->{$f}->[$i]->{$sf}) { $$found++ if (defined($$found)); return $$rec->{$f}->[$i]->{$sf}; @@ -347,31 +486,27 @@ This function B perform parsing of format to inteligenty skip delimiters before fields which aren't used. -=cut +This method will automatically decode UTF-8 string to local code page +if needed. -# internal function to eval code -sub _eval { - my $self = shift; - - my $code = shift || return; - no strict 'subs'; - my $ret = eval $code; - if ($@) { - print STDERR "problem with eval code [$code]: $@\n"; - } - return $ret; -} +=cut sub fill_in { my $self = shift; - my $rec = shift || confess "need data record"; - my $format = shift || confess "need format to parse"; + my $log = $self->_get_logger(); + + my $rec = shift || $log->logconfess("need data record"); + my $format = shift || $log->logconfess("need format to parse"); # iteration (for repeatable fields) my $i = shift || 0; # FIXME remove for speedup? - confess("need HASH as first argument!") if ($rec !~ /HASH/o); + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + + if (utf8::is_utf8($format)) { + $format = $self->_x($format); + } my $found = 0; @@ -380,16 +515,17 @@ $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); # do actual replacement of placeholders - $format =~ s/v(\d+)(?:\^(\w))*/$self->get_data(\$rec,$1,$2,$i,\$found)/ges; + $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges; if ($found) { + $log->debug("format: $format"); if ($eval_code) { my $eval = $self->fill_in($rec,$eval_code,$i); return if (! $self->_eval($eval)); } # do we have lookups? - if ($format =~ /\[[^\[\]]+\]/o) { -print "## probable lookup: $format\n"; + if ($format =~ /$LOOKUP_REGEX/o) { + $log->debug("format '$format' has lookup"); return $self->lookup($format); } else { return $format; @@ -412,19 +548,23 @@ sub lookup { my $self = shift; - my $tmp = shift || confess "need format"; + my $log = $self->_get_logger(); - if ($tmp =~ /\[[^\[\]]+\]/o) { + my $tmp = shift || $log->logconfess("need format"); + + if ($tmp =~ /$LOOKUP_REGEX/o) { my @in = ( $tmp ); -print "## lookup $tmp\n"; + + $log->debug("lookup for: ",$tmp); + my @out; while (my $f = shift @in) { - if ($f =~ /\[([^\[\]]+)\]/) { + if ($f =~ /$LOOKUP_REGEX_SAVE/o) { my $k = $1; if ($self->{'lookup'}->{$k}) { foreach my $nv (@{$self->{'lookup'}->{$k}}) { my $tmp2 = $f; - $tmp2 =~ s/\[$k\]/$nv/g; + $tmp2 =~ s/lookup{$k}/$nv/g; push @in, $tmp2; } } else { @@ -434,6 +574,7 @@ push @out, $f; } } + $log->logconfess("return is array and it's not expected!") unless wantarray; return @out; } else { return $tmp; @@ -457,15 +598,19 @@ return if (! $format_utf8); - confess("need HASH as first argument!") if ($rec !~ /HASH/o); - confess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'}); + my $log = $self->_get_logger(); + + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'}); $i = 0 if (! $i); - my $format = $self->{'utf2cp'}->convert($format_utf8) || confess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'}); + my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'}); my @out; + $log->debug("format: $format"); + my $eval_code; # remove eval{...} from beginning $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); @@ -473,7 +618,7 @@ my $prefix; my $all_found=0; - while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))*//s) { + while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) { my $del = $1 || ''; $prefix ||= $del if ($all_found == 0); @@ -490,13 +635,21 @@ return if (! $all_found); - my $out = join('',@out) . $format; + my $out = join('',@out); - # add prefix if not there - $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/); + if ($out) { + # add rest of format (suffix) + $out .= $format; + + # add prefix if not there + $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/); + + $log->debug("result: $out"); + } if ($eval_code) { my $eval = $self->fill_in($rec,$eval_code,$i); + $log->debug("about to eval{",$eval,"} format: $out"); return if (! $self->_eval($eval)); } @@ -516,7 +669,9 @@ my ($rec, $format_utf8) = @_; - confess("need HASH as first argument!") if ($rec !~ /HASH/o); + my $log = $self->_get_logger(); + + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); return if (! $format_utf8); my $i = 0; @@ -526,36 +681,67 @@ push @arr, $v; } + $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr); + return @arr; } -=head2 data_structure +=head2 fill_in_to_arr -Create in-memory data structure which represents layout from C. -It is used later to produce output. +Similar to C, but returns array of all repeatable fields. Usable +for fields which have lookups, so they shouldn't be parsed but rather +Ced. - my @ds = $webpac->data_structure($rec); + my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]'); =cut -# private method _sort_by_order -# sort subrouting using order="" attribute -sub _sort_by_order { +sub fill_in_to_arr { my $self = shift; - my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} || - $self->{'import_xml'}->{'indexer'}->{$a}; - my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} || - $self->{'import_xml'}->{'indexer'}->{$b}; + my ($rec, $format_utf8) = @_; - return $va <=> $vb; + my $log = $self->_get_logger(); + + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + return if (! $format_utf8); + + my $i = 0; + my @arr; + + while (my @v = $self->fill_in($rec,$format_utf8,$i++)) { + push @arr, @v; + } + + $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr); + + return @arr; } + +=head2 data_structure + +Create in-memory data structure which represents layout from C. +It is used later to produce output. + + my @ds = $webpac->data_structure($rec); + +This method will also set C<$webpac->{'currnet_filename'}> if there is + tag in C and C<$webpac->{'headline'}> if there is + tag. + +=cut + sub data_structure { my $self = shift; + my $log = $self->_get_logger(); + my $rec = shift; - confess("need HASH as first argument!") if ($rec !~ /HASH/o); + $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); + + undef $self->{'currnet_filename'}; + undef $self->{'headline'}; my @sorted_tags; if ($self->{tags_by_order}) { @@ -567,6 +753,8 @@ my @ds; + $log->debug("tags: ",sub { join(", ",@sorted_tags) }); + foreach my $field (@sorted_tags) { my $row; @@ -574,10 +762,32 @@ #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}); foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) { - my @v = $self->parse_to_arr($rec,$tag->{'content'}); + my $format = $tag->{'value'} || $tag->{'content'}; + $log->debug("format: $format"); + + my @v; + if ($format =~ /$LOOKUP_REGEX/o) { + @v = $self->fill_in_to_arr($rec,$format); + } else { + @v = $self->parse_to_arr($rec,$format); + } next if (! @v); + # use format? + if ($tag->{'format_name'}) { + @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v; + } + + if ($field eq 'filename') { + $self->{'current_filename'} = join('',@v); + $log->debug("filename: ",$self->{'current_filename'}); + } elsif ($field eq 'headline') { + $self->{'headline'} .= join('',@v); + $log->debug("headline: ",$self->{'headline'}); + next; # don't return headline in data_structure! + } + # does tag have type? if ($tag->{'type'}) { push @{$row->{$tag->{'type'}}}, @v; @@ -585,11 +795,20 @@ push @{$row->{'display'}}, @v; push @{$row->{'swish'}}, @v; } + + } if ($row) { $row->{'tag'} = $field; + + # TODO: name_sigular, name_plural + my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'}; + $row->{'name'} = $name ? $self->_x($name) : $field; + push @ds, $row; + + $log->debug("row $field: ",sub { Dumper($row) }); } } @@ -611,8 +830,10 @@ my $args = {@_}; - confess("need template name") if (! $args->{'template'}); - confess("need data array") if (! $args->{'data'}); + my $log = $self->_get_logger(); + + $log->logconfess("need template name") if (! $args->{'template'}); + $log->logconfess("need data array") if (! $args->{'data'}); my $out; @@ -625,4 +846,181 @@ return $out; } +=head2 output_file + +Create output from in-memory data structure using Template Toolkit template +to a file. + + $webpac->output_file( + file => 'out.txt', + template => 'text.tt', + data => @ds + ); + +=cut + +sub output_file { + my $self = shift; + + my $args = {@_}; + + my $log = $self->_get_logger(); + + my $file = $args->{'file'} || $log->logconfess("need file name"); + + $log->debug("creating file ",$file); + + open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!"); + print $fh $self->output( + template => $args->{'template'}, + data => $args->{'data'}, + ) || $log->logdie("print: $!"); + close($fh) || $log->logdie("close: $!"); +} + +=head2 apply_format + +Apply format specified in tag with C and +C. + + my $text = $webpac->apply_format($format_name,$format_delimiter,$data); + +Formats can contain C if you need them. + +=cut + +sub apply_format { + my $self = shift; + + my ($name,$delimiter,$data) = @_; + + my $log = $self->_get_logger(); + + if (! $self->{'import_xml'}->{'format'}->{$name}) { + $log->warn(" is not defined in ",$self->{'import_xml_file'}); + return $data; + } + + $log->warn("no delimiter for format $name") if (! $delimiter); + + my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'"); + + my @data = split(/\Q$delimiter\E/, $data); + + my $out = sprintf($format, @data); + $log->debug("using format $name [$format] on $data to produce: $out"); + + if ($out =~ m/$LOOKUP_REGEX/o) { + return $self->lookup($out); + } else { + return $out; + } + +} + + +# +# +# + +=head1 INTERNAL METHODS + +Here is a quick list of internal methods, mostly useful to turn debugging +on them (see L below for explanation). + +=cut + +=head2 _eval + +Internal function to eval code without C. + +=cut + +sub _eval { + my $self = shift; + + my $code = shift || return; + + my $log = $self->_get_logger(); + + no strict 'subs'; + my $ret = eval $code; + if ($@) { + $log->error("problem with eval code [$code]: $@"); + } + + $log->debug("eval: ",$code," [",$ret,"]"); + + return $ret || 0; +} + +=head2 _sort_by_order + +Sort xml tags data structure accoding to C attribute. + +=cut + +sub _sort_by_order { + my $self = shift; + + my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} || + $self->{'import_xml'}->{'indexer'}->{$a}; + my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} || + $self->{'import_xml'}->{'indexer'}->{$b}; + + return $va <=> $vb; +} + +=head2 _get_logger + +Get C object with a twist: domains are defined for each +method + + my $log = $webpac->_get_logger(); + +=cut + +sub _get_logger { + my $self = shift; + + my $name = (caller(1))[3] || caller; + return get_logger($name); +} + +=head2 _x + +Convert string from UTF-8 to code page defined in C. + + my $text = $webpac->_x('utf8 text'); + +=cut + +sub _x { + my $self = shift; + my $utf8 = shift || return; + + return $self->{'utf2cp'}->convert($utf8) || + $self->_get_logger()->logwarn("can't convert '$utf8'"); +} + +# +# +# + +=head1 LOGGING + +Logging in WebPAC is performed by L with config file +C. + +Methods defined above have different levels of logging, so +it's descriptions will be useful to turn (mostry B logging) on +or off to see why WabPAC isn't perforing as you expect it (it might even +be a bug!). + +B. To repeat, you can +also use method names, and not only classes (which are just few) +to filter logging. + +=cut + 1;