/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 705 - (show annotations)
Wed Jul 13 22:34:52 2005 UTC (18 years, 8 months ago) by dpavlin
File size: 28410 byte(s)
ported code to use Biblio::Isis, newer version (released on CPAN of IsisDB)

1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10 use Template;
11 use Log::Log4perl qw(get_logger :levels);
12 use Time::HiRes qw(time);
13
14 use Data::Dumper;
15
16 my ($have_biblio_isis, $have_openisis) = (0,0);
17
18 eval "use Biblio::Isis 0.13;";
19 unless ($@) {
20 $have_biblio_isis = 1
21 } else {
22 eval "use OpenIsis;";
23 $have_openisis = 1 unless ($@);
24 }
25
26 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
27 #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
28 my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
29 my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
30
31 =head1 NAME
32
33 WebPAC - base class for WebPAC
34
35 =head1 DESCRIPTION
36
37 This module implements methods used by WebPAC.
38
39 =head1 METHODS
40
41 =head2 new
42
43 Create new instance of WebPAC using configuration specified by C<config_file>.
44
45 my $webpac = new WebPAC(
46 config_file => 'name.conf',
47 code_page => 'ISO-8859-2',
48 low_mem => 1,
49 filter => {
50 'lower' => sub { lc($_[0]) },
51 },
52 );
53
54 Default C<code_page> is C<ISO-8859-2>.
55
56 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
57
58 There is optinal parametar C<filter> which specify different filters which
59 can be applied using C<filter{name}> notation.
60 Same filters can be used in Template Toolkit files.
61
62 This method will also read configuration files
63 C<global.conf> (used by indexer and Web font-end)
64 and configuration file specified by C<config_file>
65 which describes databases to be indexed.
66
67 =cut
68
69 # mapping between data type and tag which specify
70 # format in XML file
71 my %type2tag = (
72 'isis' => 'isis',
73 # 'excel' => 'column',
74 # 'marc' => 'marc',
75 # 'feed' => 'feed'
76 );
77
78 sub new {
79 my $class = shift;
80 my $self = {@_};
81 bless($self, $class);
82
83 $self->{'start_t'} = time();
84
85 my $log_file = $self->{'log'} || "log.conf";
86 Log::Log4perl->init($log_file);
87
88 my $log = $self->_get_logger();
89
90 # fill in default values
91 # output codepage
92 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
93
94 #
95 # read global.conf
96 #
97 $log->debug("read 'global.conf'");
98
99 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
100
101 # read global config parametars
102 foreach my $var (qw(
103 dbi_dbd
104 dbi_dsn
105 dbi_user
106 dbi_passwd
107 show_progress
108 my_unac_filter
109 output_template
110 )) {
111 $self->{'global_config'}->{$var} = $config->val('global', $var);
112 }
113
114 #
115 # read indexer config file
116 #
117
118 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
119
120 # create UTF-8 convertor for import_xml files
121 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
122
123 # create Template toolkit instance
124 $self->{'tt'} = Template->new(
125 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
126 FILTERS => $self->{'filter'},
127 EVAL_PERL => 1,
128 );
129
130 # running with low_mem flag? well, use DBM::Deep then.
131 if ($self->{'low_mem'}) {
132 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
133
134 my $db_file = "data.db";
135
136 if (-e $db_file) {
137 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
138 $log->debug("removed '$db_file' from last run");
139 }
140
141 require DBM::Deep;
142
143 my $db = new DBM::Deep $db_file;
144
145 $log->logdie("DBM::Deep error: $!") unless ($db);
146
147 if ($db->error()) {
148 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
149 } else {
150 $log->debug("using file '$db_file' for DBM::Deep");
151 }
152
153 $self->{'db'} = $db;
154 }
155
156 $log->debug("filters defined: ",Dumper($self->{'filter'}));
157
158 return $self;
159 }
160
161 =head2 open_isis
162
163 Open CDS/ISIS, WinISIS or IsisMarc database using IsisDB or OpenIsis module
164 and read all records to memory.
165
166 $webpac->open_isis(
167 filename => '/data/ISIS/ISIS',
168 code_page => '852',
169 limit_mfn => 500,
170 start_mfn => 6000,
171 lookup => [ ... ],
172 );
173
174 By default, ISIS code page is assumed to be C<852>.
175
176 If optional parametar C<start_mfn> is set, this will be first MFN to read
177 from database (so you can skip beginning of your database if you need to).
178
179 If optional parametar C<limit_mfn> is set, it will read just 500 records
180 from database in example above.
181
182 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
183 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
184 value in index.
185
186 lookup => [
187 { 'key' => 'd:v900', 'val' => 'v250^a' },
188 { 'eval' => '"v901^a" eq "Podruèje"',
189 'key' => 'pa:v561^4:v562^4:v461^1',
190 'val' => 'v900' },
191 ]
192
193 Returns number of last record read into memory (size of database, really).
194
195 =cut
196
197 sub open_isis {
198 my $self = shift;
199 my $arg = {@_};
200
201 my $log = $self->_get_logger();
202
203 $log->logcroak("need filename") if (! $arg->{'filename'});
204 my $code_page = $arg->{'code_page'} || '852';
205
206 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
207
208 # store data in object
209 $self->{'isis_filename'} = $arg->{'filename'};
210 $self->{'isis_code_page'} = $code_page;
211
212 #$self->{'isis_code_page'} = $code_page;
213
214 # create Text::Iconv object
215 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
216
217 $log->info("reading ISIS database '",$arg->{'filename'},"'");
218 $log->debug("isis code page: $code_page");
219
220 my ($isis_db,$maxmfn);
221
222 if ($have_openisis) {
223 $log->debug("using OpenIsis perl bindings");
224 $isis_db = OpenIsis::open($arg->{'filename'});
225 $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
226 } elsif ($have_biblio_isis) {
227 $log->debug("using Biblio::Isis");
228 use Biblio::Isis;
229 $isis_db = new Biblio::Isis(
230 isisdb => $arg->{'filename'},
231 include_deleted => 1,
232 hash_filter => sub {
233 my $l = shift || return;
234 $l = $cp->convert($l);
235 return $l;
236 },
237 );
238 $maxmfn = $isis_db->{'maxmfn'};
239 } else {
240 $log->logdie("Can't find supported ISIS library for perl. I suggent that you install Bilbio::Isis from CPAN.");
241 }
242
243
244 my $startmfn = 1;
245
246 if (my $s = $self->{'start_mfn'}) {
247 $log->info("skipping to MFN $s");
248 $startmfn = $s;
249 } else {
250 $self->{'start_mfn'} = $startmfn;
251 }
252
253 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
254
255 $log->info("processing ",($maxmfn-$startmfn)." records using ",( $have_openisis ? 'OpenIsis' : 'Biblio::Isis'));
256
257
258 # read database
259 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
260
261
262 $log->debug("mfn: $mfn\n");
263
264 my $rec;
265
266 if ($have_openisis) {
267
268 # read record using OpenIsis
269 my $row = OpenIsis::read( $isis_db, $mfn );
270 foreach my $k (keys %{$row}) {
271 if ($k ne "mfn") {
272 foreach my $l (@{$row->{$k}}) {
273 $l = $cp->convert($l);
274 # has subfields?
275 my $val;
276 if ($l =~ m/\^/) {
277 foreach my $t (split(/\^/,$l)) {
278 next if (! $t);
279 $val->{substr($t,0,1)} = substr($t,1);
280 }
281 } else {
282 $val = $l;
283 }
284
285 push @{$rec->{$k}}, $val;
286 }
287 } else {
288 push @{$rec->{'000'}}, $mfn;
289 }
290 }
291
292 } elsif ($have_biblio_isis) {
293 $rec = $isis_db->to_hash($mfn);
294 } else {
295 $log->logdie("hum? implementation missing?");
296 }
297
298 $log->confess("record $mfn empty?") unless ($rec);
299
300 # store
301 if ($self->{'low_mem'}) {
302 $self->{'db'}->put($mfn, $rec);
303 } else {
304 $self->{'data'}->{$mfn} = $rec;
305 }
306
307 # create lookup
308 $self->create_lookup($rec, @{$arg->{'lookup'}});
309
310 $self->progress_bar($mfn,$maxmfn);
311
312 }
313
314 $self->{'current_mfn'} = -1;
315 $self->{'last_pcnt'} = 0;
316
317 $log->debug("max mfn: $maxmfn");
318
319 # store max mfn and return it.
320 return $self->{'max_mfn'} = $maxmfn;
321 }
322
323 =head2 fetch_rec
324
325 Fetch next record from database. It will also display progress bar (once
326 it's implemented, that is).
327
328 my $rec = $webpac->fetch_rec;
329
330 =cut
331
332 sub fetch_rec {
333 my $self = shift;
334
335 my $log = $self->_get_logger();
336
337 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
338
339 if ($self->{'current_mfn'} == -1) {
340 $self->{'current_mfn'} = $self->{'start_mfn'};
341 } else {
342 $self->{'current_mfn'}++;
343 }
344
345 my $mfn = $self->{'current_mfn'};
346
347 if ($mfn > $self->{'max_mfn'}) {
348 $self->{'current_mfn'} = $self->{'max_mfn'};
349 $log->debug("at EOF");
350 return;
351 }
352
353 $self->progress_bar($mfn,$self->{'max_mfn'});
354
355 if ($self->{'low_mem'}) {
356 return $self->{'db'}->get($mfn);
357 } else {
358 return $self->{'data'}->{$mfn};
359 }
360 }
361
362 =head2 mfn
363
364 Returns current record number (MFN).
365
366 print $webpac->mfn;
367
368 =cut
369
370 sub mfn {
371 my $self = shift;
372 return $self->{'current_mfn'};
373 }
374
375 =head2 progress_bar
376
377 Draw progress bar on STDERR.
378
379 $webpac->progress_bar($current, $max);
380
381 =cut
382
383 sub progress_bar {
384 my $self = shift;
385
386 my ($curr,$max) = @_;
387
388 my $log = $self->_get_logger();
389
390 $log->logconfess("no current value!") if (! $curr);
391 $log->logconfess("no maximum value!") if (! $max);
392
393 if ($curr > $max) {
394 $max = $curr;
395 $log->debug("overflow to $curr");
396 }
397
398 $self->{'last_pcnt'} ||= 1;
399
400 my $p = int($curr * 100 / $max) || 1;
401
402 # reset on re-run
403 if ($p < $self->{'last_pcnt'}) {
404 $self->{'last_pcnt'} = $p;
405 $self->{'start_t'} = time();
406 }
407
408 if ($p != $self->{'last_pcnt'}) {
409
410 my $t = time();
411 my $rate = ($curr / ($t - $self->{'start_t'} || 1));
412 my $eta = ($max-$curr) / ($rate || 1);
413 printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
414 $self->{'last_pcnt'} = $p;
415 $self->{'last_curr'} = $curr;
416 }
417 print STDERR "\n" if ($p == 100);
418 }
419
420 =head2 fmt_time
421
422 Format time (in seconds) for display.
423
424 print $webpac->fmt_time(time());
425
426 This method is called by L<progress_bar> to display remaining time.
427
428 =cut
429
430 sub fmt_time {
431 my $self = shift;
432
433 my $t = shift || 0;
434 my $out = "";
435
436 my ($ss,$mm,$hh) = gmtime($t);
437 $out .= "${hh}h" if ($hh);
438 $out .= sprintf("%02d:%02d", $mm,$ss);
439 $out .= " " if ($hh == 0);
440 return $out;
441 }
442
443 =head2 open_import_xml
444
445 Read file from C<import_xml/> directory and parse it.
446
447 $webpac->open_import_xml(type => 'isis');
448
449 =cut
450
451 sub open_import_xml {
452 my $self = shift;
453
454 my $log = $self->_get_logger();
455
456 my $arg = {@_};
457 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
458
459 $self->{'type'} = $arg->{'type'};
460
461 my $type_base = $arg->{'type'};
462 $type_base =~ s/_.*$//g;
463
464 $self->{'tag'} = $type2tag{$type_base};
465
466 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
467
468 my $f = "./import_xml/".$self->{'type'}.".xml";
469 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
470
471 $log->info("reading '$f'");
472
473 $self->{'import_xml_file'} = $f;
474
475 $self->{'import_xml'} = XMLin($f,
476 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
477 );
478
479 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
480
481 }
482
483 =head2 create_lookup
484
485 Create lookup from record using lookup definition.
486
487 $self->create_lookup($rec, @lookups);
488
489 Called internally by C<open_*> methods.
490
491 =cut
492
493 sub create_lookup {
494 my $self = shift;
495
496 my $log = $self->_get_logger();
497
498 my $rec = shift || $log->logconfess("need record to create lookup");
499 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
500
501 foreach my $i (@_) {
502 $log->logconfess("need key") unless defined($i->{'key'});
503 $log->logconfess("need val") unless defined($i->{'val'});
504
505 if (defined($i->{'eval'})) {
506 # eval first, so we can skip fill_in for key and val
507 my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
508 if ($self->_eval($eval)) {
509 my $key = $self->fill_in($rec,$i->{'key'}) || next;
510 my @val = $self->fill_in($rec,$i->{'val'}) || next;
511 $log->debug("stored $key = ",sub { join(" | ",@val) });
512 push @{$self->{'lookup'}->{$key}}, @val;
513 }
514 } else {
515 my $key = $self->fill_in($rec,$i->{'key'}) || next;
516 my @val = $self->fill_in($rec,$i->{'val'}) || next;
517 $log->debug("stored $key = ",sub { join(" | ",@val) });
518 push @{$self->{'lookup'}->{$key}}, @val;
519 }
520 }
521 }
522
523 =head2 get_data
524
525 Returns value from record.
526
527 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
528
529 Arguments are:
530 record reference C<$rec>,
531 field C<$f>,
532 optional subfiled C<$sf>,
533 index for repeatable values C<$i>.
534
535 Optinal variable C<$found> will be incremeted if there
536 is field.
537
538 Returns value or empty string.
539
540 =cut
541
542 sub get_data {
543 my $self = shift;
544
545 my ($rec,$f,$sf,$i,$found) = @_;
546
547 if ($$rec->{$f}) {
548 return '' if (! $$rec->{$f}->[$i]);
549 no strict 'refs';
550 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
551 $$found++ if (defined($$found));
552 return $$rec->{$f}->[$i]->{$sf};
553 } elsif ($$rec->{$f}->[$i]) {
554 $$found++ if (defined($$found));
555 # it still might have subfield, just
556 # not specified, so we'll dump all
557 if ($$rec->{$f}->[$i] =~ /HASH/o) {
558 my $out;
559 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
560 $out .= $$rec->{$f}->[$i]->{$k}." ";
561 }
562 return $out;
563 } else {
564 return $$rec->{$f}->[$i];
565 }
566 }
567 } else {
568 return '';
569 }
570 }
571
572 =head2 fill_in
573
574 Workhourse of all: takes record from in-memory structure of database and
575 strings with placeholders and returns string or array of with substituted
576 values from record.
577
578 my $text = $webpac->fill_in($rec,'v250^a');
579
580 Optional argument is ordinal number for repeatable fields. By default,
581 it's assume to be first repeatable field (fields are perl array, so first
582 element is 0).
583 Following example will read second value from repeatable field.
584
585 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
586
587 This function B<does not> perform parsing of format to inteligenty skip
588 delimiters before fields which aren't used.
589
590 This method will automatically decode UTF-8 string to local code page
591 if needed.
592
593 =cut
594
595 sub fill_in {
596 my $self = shift;
597
598 my $log = $self->_get_logger();
599
600 my $rec = shift || $log->logconfess("need data record");
601 my $format = shift || $log->logconfess("need format to parse");
602 # iteration (for repeatable fields)
603 my $i = shift || 0;
604
605 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
606
607 # FIXME remove for speedup?
608 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
609
610 if (utf8::is_utf8($format)) {
611 $format = $self->_x($format);
612 }
613
614 my $found = 0;
615
616 my $eval_code;
617 # remove eval{...} from beginning
618 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
619
620 my $filter_name;
621 # remove filter{...} from beginning
622 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
623
624 # do actual replacement of placeholders
625 # repeatable fields
626 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
627 # non-repeatable fields
628 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
629
630 if ($found) {
631 $log->debug("format: $format");
632 if ($eval_code) {
633 my $eval = $self->fill_in($rec,$eval_code,$i);
634 return if (! $self->_eval($eval));
635 }
636 if ($filter_name && $self->{'filter'}->{$filter_name}) {
637 $log->debug("filter '$filter_name' for $format");
638 $format = $self->{'filter'}->{$filter_name}->($format);
639 return unless(defined($format));
640 $log->debug("filter result: $format");
641 }
642 # do we have lookups?
643 if ($format =~ /$LOOKUP_REGEX/o) {
644 $log->debug("format '$format' has lookup");
645 return $self->lookup($format);
646 } else {
647 return $format;
648 }
649 } else {
650 return;
651 }
652 }
653
654 =head2 lookup
655
656 Perform lookups on format supplied to it.
657
658 my $text = $self->lookup('[v900]');
659
660 Lookups can be nested (like C<[d:[a:[v900]]]>).
661
662 =cut
663
664 sub lookup {
665 my $self = shift;
666
667 my $log = $self->_get_logger();
668
669 my $tmp = shift || $log->logconfess("need format");
670
671 if ($tmp =~ /$LOOKUP_REGEX/o) {
672 my @in = ( $tmp );
673
674 $log->debug("lookup for: ",$tmp);
675
676 my @out;
677 while (my $f = shift @in) {
678 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
679 my $k = $1;
680 if ($self->{'lookup'}->{$k}) {
681 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
682 my $tmp2 = $f;
683 $tmp2 =~ s/lookup{$k}/$nv/g;
684 push @in, $tmp2;
685 }
686 } else {
687 undef $f;
688 }
689 } elsif ($f) {
690 push @out, $f;
691 }
692 }
693 $log->logconfess("return is array and it's not expected!") unless wantarray;
694 return @out;
695 } else {
696 return $tmp;
697 }
698 }
699
700 =head2 parse
701
702 Perform smart parsing of string, skipping delimiters for fields which aren't
703 defined. It can also eval code in format starting with C<eval{...}> and
704 return output or nothing depending on eval code.
705
706 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
707
708 =cut
709
710 sub parse {
711 my $self = shift;
712
713 my ($rec, $format_utf8, $i) = @_;
714
715 return if (! $format_utf8);
716
717 my $log = $self->_get_logger();
718
719 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
720 $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
721
722 $i = 0 if (! $i);
723
724 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
725
726 my @out;
727
728 $log->debug("format: $format");
729
730 my $eval_code;
731 # remove eval{...} from beginning
732 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
733
734 my $filter_name;
735 # remove filter{...} from beginning
736 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
737
738 my $prefix;
739 my $all_found=0;
740
741 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
742
743 my $del = $1 || '';
744 $prefix ||= $del if ($all_found == 0);
745
746 # repeatable index
747 my $r = $i;
748 $r = 0 if (lc("$2") eq 's');
749
750 my $found = 0;
751 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
752
753 if ($found) {
754 push @out, $del;
755 push @out, $tmp;
756 $all_found += $found;
757 }
758 }
759
760 return if (! $all_found);
761
762 my $out = join('',@out);
763
764 if ($out) {
765 # add rest of format (suffix)
766 $out .= $format;
767
768 # add prefix if not there
769 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
770
771 $log->debug("result: $out");
772 }
773
774 if ($eval_code) {
775 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
776 $log->debug("about to eval{$eval} format: $out");
777 return if (! $self->_eval($eval));
778 }
779
780 if ($filter_name && $self->{'filter'}->{$filter_name}) {
781 $log->debug("about to filter{$filter_name} format: $out");
782 $out = $self->{'filter'}->{$filter_name}->($out);
783 return unless(defined($out));
784 $log->debug("filter result: $out");
785 }
786
787 return $out;
788 }
789
790 =head2 parse_to_arr
791
792 Similar to C<parse>, but returns array of all repeatable fields
793
794 my @arr = $webpac->parse_to_arr($rec,'v250^a');
795
796 =cut
797
798 sub parse_to_arr {
799 my $self = shift;
800
801 my ($rec, $format_utf8) = @_;
802
803 my $log = $self->_get_logger();
804
805 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
806 return if (! $format_utf8);
807
808 my $i = 0;
809 my @arr;
810
811 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
812 push @arr, $v;
813 }
814
815 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
816
817 return @arr;
818 }
819
820 =head2 fill_in_to_arr
821
822 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
823 for fields which have lookups, so they shouldn't be parsed but rather
824 C<fill_id>ed.
825
826 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
827
828 =cut
829
830 sub fill_in_to_arr {
831 my $self = shift;
832
833 my ($rec, $format_utf8) = @_;
834
835 my $log = $self->_get_logger();
836
837 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
838 return if (! $format_utf8);
839
840 my $i = 0;
841 my @arr;
842
843 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
844 push @arr, @v;
845 }
846
847 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
848
849 return @arr;
850 }
851
852 =head2 sort_arr
853
854 Sort array ignoring case and html in data
855
856 my @sorted = $webpac->sort_arr(@unsorted);
857
858 =cut
859
860 sub sort_arr {
861 my $self = shift;
862
863 my $log = $self->_get_logger();
864
865 # FIXME add Schwartzian Transformation?
866
867 my @sorted = sort {
868 $a =~ s#<[^>]+/*>##;
869 $b =~ s#<[^>]+/*>##;
870 lc($b) cmp lc($a)
871 } @_;
872 $log->debug("sorted values: ",sub { join(", ",@sorted) });
873
874 return @sorted;
875 }
876
877
878 =head2 data_structure
879
880 Create in-memory data structure which represents layout from C<import_xml>.
881 It is used later to produce output.
882
883 my @ds = $webpac->data_structure($rec);
884
885 This method will also set C<$webpac->{'currnet_filename'}> if there is
886 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
887 <headline> tag.
888
889 =cut
890
891 sub data_structure {
892 my $self = shift;
893
894 my $log = $self->_get_logger();
895
896 my $rec = shift;
897 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
898
899 undef $self->{'currnet_filename'};
900 undef $self->{'headline'};
901
902 my @sorted_tags;
903 if ($self->{tags_by_order}) {
904 @sorted_tags = @{$self->{tags_by_order}};
905 } else {
906 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
907 $self->{tags_by_order} = \@sorted_tags;
908 }
909
910 my @ds;
911
912 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
913
914 foreach my $field (@sorted_tags) {
915
916 my $row;
917
918 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
919
920 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
921 my $format = $tag->{'value'} || $tag->{'content'};
922
923 $log->debug("format: $format");
924
925 my @v;
926 if ($format =~ /$LOOKUP_REGEX/o) {
927 @v = $self->fill_in_to_arr($rec,$format);
928 } else {
929 @v = $self->parse_to_arr($rec,$format);
930 }
931 next if (! @v);
932
933 if ($tag->{'sort'}) {
934 @v = $self->sort_arr(@v);
935 }
936
937 # use format?
938 if ($tag->{'format_name'}) {
939 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
940 }
941
942 if ($field eq 'filename') {
943 $self->{'current_filename'} = join('',@v);
944 $log->debug("filename: ",$self->{'current_filename'});
945 } elsif ($field eq 'headline') {
946 $self->{'headline'} .= join('',@v);
947 $log->debug("headline: ",$self->{'headline'});
948 next; # don't return headline in data_structure!
949 }
950
951 # delimiter will join repeatable fields
952 if ($tag->{'delimiter'}) {
953 @v = ( join($tag->{'delimiter'}, @v) );
954 }
955
956 # default types
957 my @types = qw(display swish);
958 # override by type attribute
959 @types = ( $tag->{'type'} ) if ($tag->{'type'});
960
961 foreach my $type (@types) {
962 # append to previous line?
963 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
964 if ($tag->{'append'}) {
965
966 # I will delimit appended part with
967 # delimiter (or ,)
968 my $d = $tag->{'delimiter'};
969 # default delimiter
970 $d ||= " ";
971
972 my $last = pop @{$row->{$type}};
973 $d = "" if (! $last);
974 $last .= $d . join($d, @v);
975 push @{$row->{$type}}, $last;
976
977 } else {
978 push @{$row->{$type}}, @v;
979 }
980 }
981
982
983 }
984
985 if ($row) {
986 $row->{'tag'} = $field;
987
988 # TODO: name_sigular, name_plural
989 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
990 $row->{'name'} = $name ? $self->_x($name) : $field;
991
992 # post-sort all values in field
993 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
994 $log->warn("sort at field tag not implemented");
995 }
996
997 push @ds, $row;
998
999 $log->debug("row $field: ",sub { Dumper($row) });
1000 }
1001
1002 }
1003
1004 return @ds;
1005
1006 }
1007
1008 =head2 output
1009
1010 Create output from in-memory data structure using Template Toolkit template.
1011
1012 my $text = $webpac->output( template => 'text.tt', data => @ds );
1013
1014 =cut
1015
1016 sub output {
1017 my $self = shift;
1018
1019 my $args = {@_};
1020
1021 my $log = $self->_get_logger();
1022
1023 $log->logconfess("need template name") if (! $args->{'template'});
1024 $log->logconfess("need data array") if (! $args->{'data'});
1025
1026 my $out;
1027
1028 $self->{'tt'}->process(
1029 $args->{'template'},
1030 $args,
1031 \$out
1032 ) || confess $self->{'tt'}->error();
1033
1034 return $out;
1035 }
1036
1037 =head2 output_file
1038
1039 Create output from in-memory data structure using Template Toolkit template
1040 to a file.
1041
1042 $webpac->output_file(
1043 file => 'out.txt',
1044 template => 'text.tt',
1045 data => @ds
1046 );
1047
1048 =cut
1049
1050 sub output_file {
1051 my $self = shift;
1052
1053 my $args = {@_};
1054
1055 my $log = $self->_get_logger();
1056
1057 my $file = $args->{'file'} || $log->logconfess("need file name");
1058
1059 $log->debug("creating file ",$file);
1060
1061 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1062 print $fh $self->output(
1063 template => $args->{'template'},
1064 data => $args->{'data'},
1065 ) || $log->logdie("print: $!");
1066 close($fh) || $log->logdie("close: $!");
1067 }
1068
1069 =head2 apply_format
1070
1071 Apply format specified in tag with C<format_name="name"> and
1072 C<format_delimiter=";;">.
1073
1074 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1075
1076 Formats can contain C<lookup{...}> if you need them.
1077
1078 =cut
1079
1080 sub apply_format {
1081 my $self = shift;
1082
1083 my ($name,$delimiter,$data) = @_;
1084
1085 my $log = $self->_get_logger();
1086
1087 if (! $self->{'import_xml'}->{'format'}->{$name}) {
1088 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1089 return $data;
1090 }
1091
1092 $log->warn("no delimiter for format $name") if (! $delimiter);
1093
1094 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1095
1096 my @data = split(/\Q$delimiter\E/, $data);
1097
1098 my $out = sprintf($format, @data);
1099 $log->debug("using format $name [$format] on $data to produce: $out");
1100
1101 if ($out =~ m/$LOOKUP_REGEX/o) {
1102 return $self->lookup($out);
1103 } else {
1104 return $out;
1105 }
1106
1107 }
1108
1109
1110 #
1111 #
1112 #
1113
1114 =head1 INTERNAL METHODS
1115
1116 Here is a quick list of internal methods, mostly useful to turn debugging
1117 on them (see L<LOGGING> below for explanation).
1118
1119 =cut
1120
1121 =head2 _eval
1122
1123 Internal function to eval code without C<strict 'subs'>.
1124
1125 =cut
1126
1127 sub _eval {
1128 my $self = shift;
1129
1130 my $code = shift || return;
1131
1132 my $log = $self->_get_logger();
1133
1134 no strict 'subs';
1135 my $ret = eval $code;
1136 if ($@) {
1137 $log->error("problem with eval code [$code]: $@");
1138 }
1139
1140 $log->debug("eval: ",$code," [",$ret,"]");
1141
1142 return $ret || undef;
1143 }
1144
1145 =head2 _sort_by_order
1146
1147 Sort xml tags data structure accoding to C<order=""> attribute.
1148
1149 =cut
1150
1151 sub _sort_by_order {
1152 my $self = shift;
1153
1154 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1155 $self->{'import_xml'}->{'indexer'}->{$a};
1156 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1157 $self->{'import_xml'}->{'indexer'}->{$b};
1158
1159 return $va <=> $vb;
1160 }
1161
1162 =head2 _get_logger
1163
1164 Get C<Log::Log4perl> object with a twist: domains are defined for each
1165 method
1166
1167 my $log = $webpac->_get_logger();
1168
1169 =cut
1170
1171 sub _get_logger {
1172 my $self = shift;
1173
1174 my $name = (caller(1))[3] || caller;
1175 return get_logger($name);
1176 }
1177
1178 =head2 _x
1179
1180 Convert string from UTF-8 to code page defined in C<import_xml>.
1181
1182 my $text = $webpac->_x('utf8 text');
1183
1184 =cut
1185
1186 sub _x {
1187 my $self = shift;
1188 my $utf8 = shift || return;
1189
1190 return $self->{'utf2cp'}->convert($utf8) ||
1191 $self->_get_logger()->logwarn("can't convert '$utf8'");
1192 }
1193
1194 #
1195 #
1196 #
1197
1198 =head1 LOGGING
1199
1200 Logging in WebPAC is performed by L<Log::Log4perl> with config file
1201 C<log.conf>.
1202
1203 Methods defined above have different levels of logging, so
1204 it's descriptions will be useful to turn (mostry B<debug> logging) on
1205 or off to see why WabPAC isn't perforing as you expect it (it might even
1206 be a bug!).
1207
1208 B<This is different from normal Log4perl behaviour>. To repeat, you can
1209 also use method names, and not only classes (which are just few)
1210 to filter logging.
1211
1212
1213 =head1 MEMORY USAGE
1214
1215 C<low_mem> options is double-edged sword. If enabled, WebPAC
1216 will run on memory constraint machines (which doesn't have enough
1217 physical RAM to create memory structure for whole source database).
1218
1219 If your machine has 512Mb or more of RAM and database is around 10000 records,
1220 memory shouldn't be an issue. If you don't have enough physical RAM, you
1221 might consider using virtual memory (if your operating system is handling it
1222 well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1223 parsed structure of ISIS database (this is what C<low_mem> option does).
1224
1225 Hitting swap at end of reading source database is probably o.k. However,
1226 hitting swap before 90% will dramatically decrease performance and you will
1227 be better off with C<low_mem> and using rest of availble memory for
1228 operating system disk cache (Linux is particuallary good about this).
1229 However, every access to database record will require disk access, so
1230 generation phase will be slower 10-100 times.
1231
1232 Parsed structures are essential - you just have option to trade RAM memory
1233 (which is fast) for disk space (which is slow). Be sure to have planty of
1234 disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1235
1236 However, when WebPAC is running on desktop machines (or laptops :-), it's
1237 highly undesireable for system to start swapping. Using C<low_mem> option can
1238 reduce WecPAC memory usage to around 64Mb for same database with lookup
1239 fields and sorted indexes which stay in RAM. Performance will suffer, but
1240 memory usage will really be minimal. It might be also more confortable to
1241 run WebPAC reniced on those machines.
1242
1243 =cut
1244
1245 1;

  ViewVC Help
Powered by ViewVC 1.1.26