12 |
|
|
13 |
use Data::Dumper; |
use Data::Dumper; |
14 |
|
|
15 |
|
#my $LOOKUP_REGEX = '\[[^\[\]]+\]'; |
16 |
|
#my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]'; |
17 |
|
my $LOOKUP_REGEX = 'lookup{[^\{\}]+}'; |
18 |
|
my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}'; |
19 |
|
|
20 |
=head1 NAME |
=head1 NAME |
21 |
|
|
22 |
WebPAC - base class for WebPAC |
WebPAC - base class for WebPAC |
29 |
|
|
30 |
=head2 new |
=head2 new |
31 |
|
|
32 |
This will create new instance of WebPAC using configuration specified by C<config_file>. |
Create new instance of WebPAC using configuration specified by C<config_file>. |
33 |
|
|
34 |
my $webpac = new WebPAC( |
my $webpac = new WebPAC( |
35 |
config_file => 'name.conf', |
config_file => 'name.conf', |
36 |
[code_page => 'ISO-8859-2',] |
[code_page => 'ISO-8859-2',] |
37 |
|
[low_mem => 1,] |
38 |
); |
); |
39 |
|
|
40 |
Default C<code_page> is C<ISO-8859-2>. |
Default C<code_page> is C<ISO-8859-2>. |
41 |
|
|
42 |
It will also read configuration files |
This method will also read configuration files |
43 |
C<global.conf> (used by indexer and Web font-end) |
C<global.conf> (used by indexer and Web font-end) |
44 |
and configuration file specified by C<config_file> |
and configuration file specified by C<config_file> |
45 |
which describes databases to be indexed. |
which describes databases to be indexed. |
46 |
|
|
47 |
|
C<low_mem> options is double-edged sword. If enabled, WebPAC |
48 |
|
will run on memory constraint machines (which doesn't have enough |
49 |
|
physical RAM to create memory structure for whole ISIS database). |
50 |
|
|
51 |
|
If your machine has 512Mb or more and database is around 10000 records, |
52 |
|
memory shouldn't be an issue. If you don't have enough physical RAM, you |
53 |
|
might consider using virtual memory (if your operating system is handling it |
54 |
|
well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle |
55 |
|
parsed structure of ISIS database. |
56 |
|
|
57 |
|
However, when WebPAC is running on desktop machines (or laptops :-), it's |
58 |
|
highly undesireable for system to start swapping. Using C<low_mem> option can |
59 |
|
reduce WecPAC memory usage to 16Mb for same database with lookup fields and |
60 |
|
sorted indexes which stay in RAM. Performance will suffer, but memory usage |
61 |
|
will really be minimal. It might be also more confortable to run WebPAC reniced |
62 |
|
on those machines. |
63 |
|
|
64 |
=cut |
=cut |
65 |
|
|
66 |
# mapping between data type and tag which specify |
# mapping between data type and tag which specify |
124 |
EVAL_PERL => 1, |
EVAL_PERL => 1, |
125 |
); |
); |
126 |
|
|
127 |
|
# running with low_mem flag? well, use DBM::Deep then. |
128 |
|
if ($self->{'low_mem'}) { |
129 |
|
$log->info("running with low_mem which impacts performance (<64 Mb memory usage)"); |
130 |
|
|
131 |
|
my $db_file = "data.db"; |
132 |
|
|
133 |
|
if (-e $db_file) { |
134 |
|
unlink $db_file or $log->logdie("can't remove '$db_file' from last run"); |
135 |
|
$log->debug("removed '$db_file' from last run"); |
136 |
|
} |
137 |
|
|
138 |
|
use DBM::Deep; |
139 |
|
|
140 |
|
my $db = new DBM::Deep $db_file; |
141 |
|
|
142 |
|
$log->logdie("DBM::Deep error: $!") unless ($db); |
143 |
|
|
144 |
|
if ($db->error()) { |
145 |
|
$log->logdie("can't open '$db_file' under low_mem: ",$db->error()); |
146 |
|
} else { |
147 |
|
$log->debug("using file $db_file for DBM::Deep"); |
148 |
|
} |
149 |
|
|
150 |
|
$self->{'db'} = $db; |
151 |
|
} |
152 |
|
|
153 |
return $self; |
return $self; |
154 |
} |
} |
155 |
|
|
193 |
$log->logcroak("need filename") if (! $arg->{'filename'}); |
$log->logcroak("need filename") if (! $arg->{'filename'}); |
194 |
my $code_page = $arg->{'code_page'} || '852'; |
my $code_page = $arg->{'code_page'} || '852'; |
195 |
|
|
196 |
|
# store data in object |
197 |
|
$self->{'isis_filename'} = $arg->{'filename'}; |
198 |
|
$self->{'isis_code_page'} = $code_page; |
199 |
|
|
200 |
use OpenIsis; |
use OpenIsis; |
201 |
|
|
202 |
#$self->{'isis_code_page'} = $code_page; |
#$self->{'isis_code_page'} = $code_page; |
205 |
my $cp = Text::Iconv->new($code_page,$self->{'code_page'}); |
my $cp = Text::Iconv->new($code_page,$self->{'code_page'}); |
206 |
|
|
207 |
$log->info("reading ISIS database '",$arg->{'filename'},"'"); |
$log->info("reading ISIS database '",$arg->{'filename'},"'"); |
208 |
|
$log->debug("isis code page: $code_page"); |
209 |
|
|
210 |
my $isis_db = OpenIsis::open($arg->{'filename'}); |
my $isis_db = OpenIsis::open($arg->{'filename'}); |
211 |
|
|
218 |
# read database |
# read database |
219 |
for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) { |
for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) { |
220 |
|
|
221 |
|
|
222 |
|
$log->debug("mfn: $mfn\n"); |
223 |
|
|
224 |
|
my $rec; |
225 |
|
|
226 |
# read record |
# read record |
227 |
my $row = OpenIsis::read( $isis_db, $mfn ); |
my $row = OpenIsis::read( $isis_db, $mfn ); |
228 |
foreach my $k (keys %{$row}) { |
foreach my $k (keys %{$row}) { |
240 |
$val = $l; |
$val = $l; |
241 |
} |
} |
242 |
|
|
243 |
push @{$self->{'data'}->{$mfn}->{$k}}, $val; |
push @{$rec->{$k}}, $val; |
244 |
} |
} |
245 |
|
} else { |
246 |
|
push @{$rec->{'000'}}, $mfn; |
247 |
} |
} |
248 |
|
|
249 |
} |
} |
250 |
|
|
251 |
|
$log->confess("record $mfn empty?") unless ($rec); |
252 |
|
|
253 |
|
# store |
254 |
|
if ($self->{'low_mem'}) { |
255 |
|
$self->{'db'}->put($mfn, $rec); |
256 |
|
} else { |
257 |
|
$self->{'data'}->{$mfn} = $rec; |
258 |
|
} |
259 |
|
|
260 |
# create lookup |
# create lookup |
|
my $rec = $self->{'data'}->{$mfn}; |
|
261 |
$self->create_lookup($rec, @{$arg->{'lookup'}}); |
$self->create_lookup($rec, @{$arg->{'lookup'}}); |
262 |
|
|
263 |
|
$self->progress_bar($mfn,$maxmfn); |
264 |
|
|
265 |
} |
} |
266 |
|
|
267 |
$self->{'current_mfn'} = 1; |
$self->{'current_mfn'} = 1; |
268 |
|
$self->{'last_pcnt'} = 0; |
269 |
|
|
270 |
|
$log->debug("max mfn: $maxmfn"); |
271 |
|
|
272 |
# store max mfn and return it. |
# store max mfn and return it. |
273 |
return $self->{'max_mfn'} = $maxmfn; |
return $self->{'max_mfn'} = $maxmfn; |
291 |
|
|
292 |
if ($mfn > $self->{'max_mfn'}) { |
if ($mfn > $self->{'max_mfn'}) { |
293 |
$self->{'current_mfn'} = $self->{'max_mfn'}; |
$self->{'current_mfn'} = $self->{'max_mfn'}; |
294 |
|
$log->debug("at EOF"); |
295 |
return; |
return; |
296 |
} |
} |
297 |
|
|
298 |
return $self->{'data'}->{$mfn}; |
$self->progress_bar($mfn,$self->{'max_mfn'}); |
299 |
|
|
300 |
|
if ($self->{'low_mem'}) { |
301 |
|
return $self->{'db'}->get($mfn); |
302 |
|
} else { |
303 |
|
return $self->{'data'}->{$mfn}; |
304 |
|
} |
305 |
|
} |
306 |
|
|
307 |
|
=head2 progress_bar |
308 |
|
|
309 |
|
Draw progress bar on STDERR. |
310 |
|
|
311 |
|
$webpac->progress_bar($current, $max); |
312 |
|
|
313 |
|
=cut |
314 |
|
|
315 |
|
sub progress_bar { |
316 |
|
my $self = shift; |
317 |
|
|
318 |
|
my ($curr,$max) = @_; |
319 |
|
|
320 |
|
my $log = $self->_get_logger(); |
321 |
|
|
322 |
|
$log->logconfess("no current value!") if (! $curr); |
323 |
|
$log->logconfess("no maximum value!") if (! $max); |
324 |
|
|
325 |
|
if ($curr > $max) { |
326 |
|
$max = $curr; |
327 |
|
$log->debug("overflow to $curr"); |
328 |
|
} |
329 |
|
|
330 |
|
$self->{'last_pcnt'} ||= 1; |
331 |
|
|
332 |
|
$self->{'last_pcnt'} = $curr if ($curr < $self->{'last_pcnt'}); |
333 |
|
|
334 |
|
my $p = int($curr * 100 / $max); |
335 |
|
if ($p != $self->{'last_pcnt'}) { |
336 |
|
printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$curr,$max,"=" x ($p/2).">", $p ); |
337 |
|
$self->{'last_pcnt'} = $p; |
338 |
|
} |
339 |
|
print STDERR "\n" if ($p == 100); |
340 |
} |
} |
341 |
|
|
342 |
=head2 open_import_xml |
=head2 open_import_xml |
362 |
|
|
363 |
$self->{'tag'} = $type2tag{$type_base}; |
$self->{'tag'} = $type2tag{$type_base}; |
364 |
|
|
365 |
$log->debug("using type '",$self->{'type'},"' tag <",$self->{'tag'},">") if ($self->{'debug'}); |
$log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">"); |
366 |
|
|
367 |
my $f = "./import_xml/".$self->{'type'}.".xml"; |
my $f = "./import_xml/".$self->{'type'}.".xml"; |
368 |
$log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f"); |
$log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f"); |
369 |
|
|
370 |
$log->debug("reading '$f'") if ($self->{'debug'}); |
$log->info("reading '$f'"); |
371 |
|
|
372 |
|
$self->{'import_xml_file'} = $f; |
373 |
|
|
374 |
$self->{'import_xml'} = XMLin($f, |
$self->{'import_xml'} = XMLin($f, |
375 |
ForceArray => [ $self->{'tag'}, 'config', 'format' ], |
ForceArray => [ $self->{'tag'}, 'config', 'format' ], |
|
ForceContent => 1 |
|
376 |
); |
); |
377 |
|
|
378 |
|
$log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) }); |
379 |
|
|
380 |
} |
} |
381 |
|
|
382 |
=head2 create_lookup |
=head2 create_lookup |
398 |
$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); |
$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); |
399 |
|
|
400 |
foreach my $i (@_) { |
foreach my $i (@_) { |
401 |
if ($i->{'eval'}) { |
$log->logconfess("need key") unless defined($i->{'key'}); |
402 |
my $eval = $self->fill_in($rec,$i->{'eval'}); |
$log->logconfess("need val") unless defined($i->{'val'}); |
403 |
my $key = $self->fill_in($rec,$i->{'key'}); |
|
404 |
my @val = $self->fill_in($rec,$i->{'val'}); |
if (defined($i->{'eval'})) { |
405 |
if ($key && @val && eval $eval) { |
# eval first, so we can skip fill_in for key and val |
406 |
|
my $eval = $self->fill_in($rec,$i->{'eval'}) || next; |
407 |
|
if ($self->_eval($eval)) { |
408 |
|
my $key = $self->fill_in($rec,$i->{'key'}) || next; |
409 |
|
my @val = $self->fill_in($rec,$i->{'val'}) || next; |
410 |
|
$log->debug("stored $key = ",sub { join(" | ",@val) }); |
411 |
push @{$self->{'lookup'}->{$key}}, @val; |
push @{$self->{'lookup'}->{$key}}, @val; |
412 |
} |
} |
413 |
} else { |
} else { |
414 |
my $key = $self->fill_in($rec,$i->{'key'}); |
my $key = $self->fill_in($rec,$i->{'key'}) || next; |
415 |
my @val = $self->fill_in($rec,$i->{'val'}); |
my @val = $self->fill_in($rec,$i->{'val'}) || next; |
416 |
if ($key && @val) { |
$log->debug("stored $key = ",sub { join(" | ",@val) }); |
417 |
push @{$self->{'lookup'}->{$key}}, @val; |
push @{$self->{'lookup'}->{$key}}, @val; |
|
} |
|
418 |
} |
} |
419 |
} |
} |
420 |
} |
} |
445 |
|
|
446 |
if ($$rec->{$f}) { |
if ($$rec->{$f}) { |
447 |
return '' if (! $$rec->{$f}->[$i]); |
return '' if (! $$rec->{$f}->[$i]); |
448 |
|
no strict 'refs'; |
449 |
if ($sf && $$rec->{$f}->[$i]->{$sf}) { |
if ($sf && $$rec->{$f}->[$i]->{$sf}) { |
450 |
$$found++ if (defined($$found)); |
$$found++ if (defined($$found)); |
451 |
return $$rec->{$f}->[$i]->{$sf}; |
return $$rec->{$f}->[$i]->{$sf}; |
486 |
This function B<does not> perform parsing of format to inteligenty skip |
This function B<does not> perform parsing of format to inteligenty skip |
487 |
delimiters before fields which aren't used. |
delimiters before fields which aren't used. |
488 |
|
|
489 |
|
This method will automatically decode UTF-8 string to local code page |
490 |
|
if needed. |
491 |
|
|
492 |
=cut |
=cut |
493 |
|
|
494 |
sub fill_in { |
sub fill_in { |
504 |
# FIXME remove for speedup? |
# FIXME remove for speedup? |
505 |
$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); |
$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); |
506 |
|
|
507 |
|
if (utf8::is_utf8($format)) { |
508 |
|
$format = $self->_x($format); |
509 |
|
} |
510 |
|
|
511 |
my $found = 0; |
my $found = 0; |
512 |
|
|
513 |
my $eval_code; |
my $eval_code; |
515 |
$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); |
$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); |
516 |
|
|
517 |
# do actual replacement of placeholders |
# do actual replacement of placeholders |
518 |
$format =~ s/v(\d+)(?:\^(\w))*/$self->get_data(\$rec,$1,$2,$i,\$found)/ges; |
$format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges; |
519 |
|
|
520 |
if ($found) { |
if ($found) { |
521 |
|
$log->debug("format: $format"); |
522 |
if ($eval_code) { |
if ($eval_code) { |
523 |
my $eval = $self->fill_in($rec,$eval_code,$i); |
my $eval = $self->fill_in($rec,$eval_code,$i); |
524 |
return if (! $self->_eval($eval)); |
return if (! $self->_eval($eval)); |
525 |
} |
} |
526 |
# do we have lookups? |
# do we have lookups? |
527 |
$log->debug("test format '$format' for lookups"); |
if ($format =~ /$LOOKUP_REGEX/o) { |
528 |
if ($format =~ /\[[^\[\]]+\]/o) { |
$log->debug("format '$format' has lookup"); |
529 |
return $self->lookup($format); |
return $self->lookup($format); |
530 |
} else { |
} else { |
531 |
return $format; |
return $format; |
552 |
|
|
553 |
my $tmp = shift || $log->logconfess("need format"); |
my $tmp = shift || $log->logconfess("need format"); |
554 |
|
|
555 |
if ($tmp =~ /\[[^\[\]]+\]/o) { |
if ($tmp =~ /$LOOKUP_REGEX/o) { |
556 |
my @in = ( $tmp ); |
my @in = ( $tmp ); |
557 |
|
|
558 |
$log->debug("lookup for: ",$tmp); |
$log->debug("lookup for: ",$tmp); |
559 |
|
|
560 |
my @out; |
my @out; |
561 |
while (my $f = shift @in) { |
while (my $f = shift @in) { |
562 |
if ($f =~ /\[([^\[\]]+)\]/) { |
if ($f =~ /$LOOKUP_REGEX_SAVE/o) { |
563 |
my $k = $1; |
my $k = $1; |
564 |
if ($self->{'lookup'}->{$k}) { |
if ($self->{'lookup'}->{$k}) { |
565 |
foreach my $nv (@{$self->{'lookup'}->{$k}}) { |
foreach my $nv (@{$self->{'lookup'}->{$k}}) { |
566 |
my $tmp2 = $f; |
my $tmp2 = $f; |
567 |
$tmp2 =~ s/\[$k\]/$nv/g; |
$tmp2 =~ s/lookup{$k}/$nv/g; |
568 |
push @in, $tmp2; |
push @in, $tmp2; |
569 |
} |
} |
570 |
} else { |
} else { |
574 |
push @out, $f; |
push @out, $f; |
575 |
} |
} |
576 |
} |
} |
577 |
|
$log->logconfess("return is array and it's not expected!") unless wantarray; |
578 |
return @out; |
return @out; |
579 |
} else { |
} else { |
580 |
return $tmp; |
return $tmp; |
605 |
|
|
606 |
$i = 0 if (! $i); |
$i = 0 if (! $i); |
607 |
|
|
608 |
my $format = $self->{'utf2cp'}->convert($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'}); |
my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'}); |
609 |
|
|
610 |
my @out; |
my @out; |
611 |
|
|
612 |
|
$log->debug("format: $format"); |
613 |
|
|
614 |
my $eval_code; |
my $eval_code; |
615 |
# remove eval{...} from beginning |
# remove eval{...} from beginning |
616 |
$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); |
$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s); |
618 |
my $prefix; |
my $prefix; |
619 |
my $all_found=0; |
my $all_found=0; |
620 |
|
|
621 |
while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))*//s) { |
while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) { |
622 |
|
|
623 |
my $del = $1 || ''; |
my $del = $1 || ''; |
624 |
$prefix ||= $del if ($all_found == 0); |
$prefix ||= $del if ($all_found == 0); |
635 |
|
|
636 |
return if (! $all_found); |
return if (! $all_found); |
637 |
|
|
638 |
my $out = join('',@out) . $format; |
my $out = join('',@out); |
639 |
|
|
640 |
# add prefix if not there |
if ($out) { |
641 |
$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/); |
# add rest of format (suffix) |
642 |
|
$out .= $format; |
643 |
|
|
644 |
|
# add prefix if not there |
645 |
|
$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/); |
646 |
|
|
647 |
|
$log->debug("result: $out"); |
648 |
|
} |
649 |
|
|
650 |
if ($eval_code) { |
if ($eval_code) { |
651 |
my $eval = $self->fill_in($rec,$eval_code,$i); |
my $eval = $self->fill_in($rec,$eval_code,$i); |
652 |
$log->debug("about to eval ",$eval," [$out]"); |
$log->debug("about to eval{",$eval,"} format: $out"); |
653 |
return if (! $self->_eval($eval)); |
return if (! $self->_eval($eval)); |
654 |
} |
} |
655 |
|
|
681 |
push @arr, $v; |
push @arr, $v; |
682 |
} |
} |
683 |
|
|
684 |
|
$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr); |
685 |
|
|
686 |
return @arr; |
return @arr; |
687 |
} |
} |
688 |
|
|
689 |
|
=head2 fill_in_to_arr |
690 |
|
|
691 |
|
Similar to C<fill_in>, but returns array of all repeatable fields. Usable |
692 |
|
for fields which have lookups, so they shouldn't be parsed but rather |
693 |
|
C<fill_id>ed. |
694 |
|
|
695 |
|
my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]'); |
696 |
|
|
697 |
|
=cut |
698 |
|
|
699 |
|
sub fill_in_to_arr { |
700 |
|
my $self = shift; |
701 |
|
|
702 |
|
my ($rec, $format_utf8) = @_; |
703 |
|
|
704 |
|
my $log = $self->_get_logger(); |
705 |
|
|
706 |
|
$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); |
707 |
|
return if (! $format_utf8); |
708 |
|
|
709 |
|
my $i = 0; |
710 |
|
my @arr; |
711 |
|
|
712 |
|
while (my @v = $self->fill_in($rec,$format_utf8,$i++)) { |
713 |
|
push @arr, @v; |
714 |
|
} |
715 |
|
|
716 |
|
$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr); |
717 |
|
|
718 |
|
return @arr; |
719 |
|
} |
720 |
|
|
721 |
|
|
722 |
=head2 data_structure |
=head2 data_structure |
723 |
|
|
724 |
Create in-memory data structure which represents layout from C<import_xml>. |
Create in-memory data structure which represents layout from C<import_xml>. |
726 |
|
|
727 |
my @ds = $webpac->data_structure($rec); |
my @ds = $webpac->data_structure($rec); |
728 |
|
|
729 |
|
This method will also set C<$webpac->{'currnet_filename'}> if there is |
730 |
|
<filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is |
731 |
|
<headline> tag. |
732 |
|
|
733 |
=cut |
=cut |
734 |
|
|
735 |
sub data_structure { |
sub data_structure { |
740 |
my $rec = shift; |
my $rec = shift; |
741 |
$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); |
$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o); |
742 |
|
|
743 |
|
undef $self->{'currnet_filename'}; |
744 |
|
undef $self->{'headline'}; |
745 |
|
|
746 |
my @sorted_tags; |
my @sorted_tags; |
747 |
if ($self->{tags_by_order}) { |
if ($self->{tags_by_order}) { |
748 |
@sorted_tags = @{$self->{tags_by_order}}; |
@sorted_tags = @{$self->{tags_by_order}}; |
753 |
|
|
754 |
my @ds; |
my @ds; |
755 |
|
|
756 |
|
$log->debug("tags: ",sub { join(", ",@sorted_tags) }); |
757 |
|
|
758 |
foreach my $field (@sorted_tags) { |
foreach my $field (@sorted_tags) { |
759 |
|
|
760 |
my $row; |
my $row; |
762 |
#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}); |
#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}); |
763 |
|
|
764 |
foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) { |
foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) { |
765 |
my @v = $self->parse_to_arr($rec,$tag->{'content'}); |
my $format = $tag->{'value'} || $tag->{'content'}; |
766 |
|
|
767 |
|
$log->debug("format: $format"); |
768 |
|
|
769 |
|
my @v; |
770 |
|
if ($format =~ /$LOOKUP_REGEX/o) { |
771 |
|
@v = $self->fill_in_to_arr($rec,$format); |
772 |
|
} else { |
773 |
|
@v = $self->parse_to_arr($rec,$format); |
774 |
|
} |
775 |
next if (! @v); |
next if (! @v); |
776 |
|
|
777 |
|
# use format? |
778 |
|
if ($tag->{'format_name'}) { |
779 |
|
@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v; |
780 |
|
} |
781 |
|
|
782 |
|
if ($field eq 'filename') { |
783 |
|
$self->{'current_filename'} = join('',@v); |
784 |
|
$log->debug("filename: ",$self->{'current_filename'}); |
785 |
|
} elsif ($field eq 'headline') { |
786 |
|
$self->{'headline'} .= join('',@v); |
787 |
|
$log->debug("headline: ",$self->{'headline'}); |
788 |
|
next; # don't return headline in data_structure! |
789 |
|
} |
790 |
|
|
791 |
# does tag have type? |
# does tag have type? |
792 |
if ($tag->{'type'}) { |
if ($tag->{'type'}) { |
793 |
push @{$row->{$tag->{'type'}}}, @v; |
push @{$row->{$tag->{'type'}}}, @v; |
795 |
push @{$row->{'display'}}, @v; |
push @{$row->{'display'}}, @v; |
796 |
push @{$row->{'swish'}}, @v; |
push @{$row->{'swish'}}, @v; |
797 |
} |
} |
798 |
|
|
799 |
|
|
800 |
} |
} |
801 |
|
|
802 |
if ($row) { |
if ($row) { |
803 |
$row->{'tag'} = $field; |
$row->{'tag'} = $field; |
804 |
|
|
805 |
|
# TODO: name_sigular, name_plural |
806 |
|
my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'}; |
807 |
|
$row->{'name'} = $name ? $self->_x($name) : $field; |
808 |
|
|
809 |
push @ds, $row; |
push @ds, $row; |
810 |
|
|
811 |
|
$log->debug("row $field: ",sub { Dumper($row) }); |
812 |
} |
} |
813 |
|
|
814 |
} |
} |
846 |
return $out; |
return $out; |
847 |
} |
} |
848 |
|
|
849 |
|
=head2 output_file |
850 |
|
|
851 |
|
Create output from in-memory data structure using Template Toolkit template |
852 |
|
to a file. |
853 |
|
|
854 |
|
$webpac->output_file( |
855 |
|
file => 'out.txt', |
856 |
|
template => 'text.tt', |
857 |
|
data => @ds |
858 |
|
); |
859 |
|
|
860 |
|
=cut |
861 |
|
|
862 |
|
sub output_file { |
863 |
|
my $self = shift; |
864 |
|
|
865 |
|
my $args = {@_}; |
866 |
|
|
867 |
|
my $log = $self->_get_logger(); |
868 |
|
|
869 |
|
my $file = $args->{'file'} || $log->logconfess("need file name"); |
870 |
|
|
871 |
|
$log->debug("creating file ",$file); |
872 |
|
|
873 |
|
open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!"); |
874 |
|
print $fh $self->output( |
875 |
|
template => $args->{'template'}, |
876 |
|
data => $args->{'data'}, |
877 |
|
) || $log->logdie("print: $!"); |
878 |
|
close($fh) || $log->logdie("close: $!"); |
879 |
|
} |
880 |
|
|
881 |
|
=head2 apply_format |
882 |
|
|
883 |
|
Apply format specified in tag with C<format_name="name"> and |
884 |
|
C<format_delimiter=";;">. |
885 |
|
|
886 |
|
my $text = $webpac->apply_format($format_name,$format_delimiter,$data); |
887 |
|
|
888 |
|
Formats can contain C<lookup{...}> if you need them. |
889 |
|
|
890 |
|
=cut |
891 |
|
|
892 |
|
sub apply_format { |
893 |
|
my $self = shift; |
894 |
|
|
895 |
|
my ($name,$delimiter,$data) = @_; |
896 |
|
|
897 |
|
my $log = $self->_get_logger(); |
898 |
|
|
899 |
|
if (! $self->{'import_xml'}->{'format'}->{$name}) { |
900 |
|
$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'}); |
901 |
|
return $data; |
902 |
|
} |
903 |
|
|
904 |
|
$log->warn("no delimiter for format $name") if (! $delimiter); |
905 |
|
|
906 |
|
my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'"); |
907 |
|
|
908 |
|
my @data = split(/\Q$delimiter\E/, $data); |
909 |
|
|
910 |
|
my $out = sprintf($format, @data); |
911 |
|
$log->debug("using format $name [$format] on $data to produce: $out"); |
912 |
|
|
913 |
|
if ($out =~ m/$LOOKUP_REGEX/o) { |
914 |
|
return $self->lookup($out); |
915 |
|
} else { |
916 |
|
return $out; |
917 |
|
} |
918 |
|
|
919 |
|
} |
920 |
|
|
921 |
|
|
922 |
# |
# |
923 |
# |
# |
924 |
# |
# |
971 |
return $va <=> $vb; |
return $va <=> $vb; |
972 |
} |
} |
973 |
|
|
974 |
|
=head2 _get_logger |
975 |
|
|
976 |
|
Get C<Log::Log4perl> object with a twist: domains are defined for each |
977 |
|
method |
978 |
|
|
979 |
|
my $log = $webpac->_get_logger(); |
980 |
|
|
981 |
|
=cut |
982 |
|
|
983 |
sub _get_logger { |
sub _get_logger { |
984 |
my $self = shift; |
my $self = shift; |
985 |
|
|
986 |
my @c = caller(1); |
my $name = (caller(1))[3] || caller; |
987 |
return get_logger($c[3]); |
return get_logger($name); |
988 |
|
} |
989 |
|
|
990 |
|
=head2 _x |
991 |
|
|
992 |
|
Convert string from UTF-8 to code page defined in C<import_xml>. |
993 |
|
|
994 |
|
my $text = $webpac->_x('utf8 text'); |
995 |
|
|
996 |
|
=cut |
997 |
|
|
998 |
|
sub _x { |
999 |
|
my $self = shift; |
1000 |
|
my $utf8 = shift || return; |
1001 |
|
|
1002 |
|
return $self->{'utf2cp'}->convert($utf8) || |
1003 |
|
$self->_get_logger()->logwarn("can't convert '$utf8'"); |
1004 |
} |
} |
1005 |
|
|
1006 |
# |
# |