/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 373 - (show annotations)
Sun Jun 20 15:49:09 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 16513 byte(s)
a lot more logging, lookups are now working as expected (and documented)

1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10 use Template;
11 use Log::Log4perl qw(get_logger :levels);
12
13 use Data::Dumper;
14
15 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
16 #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
17 my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
18 my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
19
20 =head1 NAME
21
22 WebPAC - base class for WebPAC
23
24 =head1 DESCRIPTION
25
26 This module implements methods used by WebPAC.
27
28 =head1 METHODS
29
30 =head2 new
31
32 This will create new instance of WebPAC using configuration specified by C<config_file>.
33
34 my $webpac = new WebPAC(
35 config_file => 'name.conf',
36 [code_page => 'ISO-8859-2',]
37 );
38
39 Default C<code_page> is C<ISO-8859-2>.
40
41 It will also read configuration files
42 C<global.conf> (used by indexer and Web font-end)
43 and configuration file specified by C<config_file>
44 which describes databases to be indexed.
45
46 =cut
47
48 # mapping between data type and tag which specify
49 # format in XML file
50 my %type2tag = (
51 'isis' => 'isis',
52 # 'excel' => 'column',
53 # 'marc' => 'marc',
54 # 'feed' => 'feed'
55 );
56
57 sub new {
58 my $class = shift;
59 my $self = {@_};
60 bless($self, $class);
61
62 my $log_file = $self->{'log'} || "log.conf";
63 Log::Log4perl->init($log_file);
64
65 my $log = $self->_get_logger();
66
67 # fill in default values
68 # output codepage
69 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
70
71 #
72 # read global.conf
73 #
74 $log->debug("read 'global.conf'");
75
76 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
77
78 # read global config parametars
79 foreach my $var (qw(
80 dbi_dbd
81 dbi_dsn
82 dbi_user
83 dbi_passwd
84 show_progress
85 my_unac_filter
86 output_template
87 )) {
88 $self->{'global_config'}->{$var} = $config->val('global', $var);
89 }
90
91 #
92 # read indexer config file
93 #
94
95 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
96
97 # create UTF-8 convertor for import_xml files
98 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
99
100 # create Template toolkit instance
101 $self->{'tt'} = Template->new(
102 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
103 # FILTERS => {
104 # 'foo' => \&foo_filter,
105 # },
106 EVAL_PERL => 1,
107 );
108
109 return $self;
110 }
111
112 =head2 open_isis
113
114 Open CDS/ISIS database using OpenIsis module and read all records to memory.
115
116 $webpac->open_isis(
117 filename => '/data/ISIS/ISIS',
118 code_page => '852',
119 limit_mfn => '500',
120 lookup => [ ... ],
121 );
122
123 By default, ISIS code page is assumed to be C<852>.
124
125 If optional parametar C<limit_mfn> is set, it will read just 500 records
126 from database in example above.
127
128 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
129 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
130 value in index.
131
132 lookup => [
133 { 'key' => 'd:v900', 'val' => 'v250^a' },
134 { 'eval' => '"v901^a" eq "Podruèje"',
135 'key' => 'pa:v561^4:v562^4:v461^1',
136 'val' => 'v900' },
137 ]
138
139 Returns number of last record read into memory (size of database, really).
140
141 =cut
142
143 sub open_isis {
144 my $self = shift;
145 my $arg = {@_};
146
147 my $log = $self->_get_logger();
148
149 $log->logcroak("need filename") if (! $arg->{'filename'});
150 my $code_page = $arg->{'code_page'} || '852';
151
152 use OpenIsis;
153
154 #$self->{'isis_code_page'} = $code_page;
155
156 # create Text::Iconv object
157 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
158
159 $log->info("reading ISIS database '",$arg->{'filename'},"'");
160
161 my $isis_db = OpenIsis::open($arg->{'filename'});
162
163 my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
164
165 $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});
166
167 $log->info("processing $maxmfn records...");
168
169 # read database
170 for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
171
172 # read record
173 my $row = OpenIsis::read( $isis_db, $mfn );
174 foreach my $k (keys %{$row}) {
175 if ($k ne "mfn") {
176 foreach my $l (@{$row->{$k}}) {
177 $l = $cp->convert($l);
178 # has subfields?
179 my $val;
180 if ($l =~ m/\^/) {
181 foreach my $t (split(/\^/,$l)) {
182 next if (! $t);
183 $val->{substr($t,0,1)} = substr($t,1);
184 }
185 } else {
186 $val = $l;
187 }
188
189 push @{$self->{'data'}->{$mfn}->{$k}}, $val;
190 }
191 }
192
193 }
194
195 # create lookup
196 my $rec = $self->{'data'}->{$mfn};
197 $self->create_lookup($rec, @{$arg->{'lookup'}});
198
199 }
200
201 $self->{'current_mfn'} = 1;
202
203 # store max mfn and return it.
204 return $self->{'max_mfn'} = $maxmfn;
205 }
206
207 =head2 fetch_rec
208
209 Fetch next record from database. It will also display progress bar (once
210 it's implemented, that is).
211
212 my $rec = $webpac->fetch_rec;
213
214 =cut
215
216 sub fetch_rec {
217 my $self = shift;
218
219 my $log = $self->_get_logger();
220
221 my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");
222
223 if ($mfn > $self->{'max_mfn'}) {
224 $self->{'current_mfn'} = $self->{'max_mfn'};
225 $log->debug("at EOF");
226 return;
227 }
228
229 return $self->{'data'}->{$mfn};
230 }
231
232 =head2 open_import_xml
233
234 Read file from C<import_xml/> directory and parse it.
235
236 $webpac->open_import_xml(type => 'isis');
237
238 =cut
239
240 sub open_import_xml {
241 my $self = shift;
242
243 my $log = $self->_get_logger();
244
245 my $arg = {@_};
246 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
247
248 $self->{'type'} = $arg->{'type'};
249
250 my $type_base = $arg->{'type'};
251 $type_base =~ s/_.*$//g;
252
253 $self->{'tag'} = $type2tag{$type_base};
254
255 $log->debug("using type '",$self->{'type'},"' tag <",$self->{'tag'},">") if ($self->{'debug'});
256
257 my $f = "./import_xml/".$self->{'type'}.".xml";
258 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
259
260 $log->debug("reading '$f'") if ($self->{'debug'});
261
262 $self->{'import_xml'} = XMLin($f,
263 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
264 );
265
266 }
267
268 =head2 create_lookup
269
270 Create lookup from record using lookup definition.
271
272 $self->create_lookup($rec, @lookups);
273
274 Called internally by C<open_*> methods.
275
276 =cut
277
278 sub create_lookup {
279 my $self = shift;
280
281 my $log = $self->_get_logger();
282
283 my $rec = shift || $log->logconfess("need record to create lookup");
284 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
285
286 foreach my $i (@_) {
287 if ($i->{'eval'}) {
288 my $eval = $self->fill_in($rec,$i->{'eval'});
289 my $key = $self->fill_in($rec,$i->{'key'});
290 my @val = $self->fill_in($rec,$i->{'val'});
291 if ($key && @val && eval $eval) {
292 $log->debug("stored $key = ",sub { join(" | ",@val) });
293 push @{$self->{'lookup'}->{$key}}, @val;
294 }
295 } else {
296 my $key = $self->fill_in($rec,$i->{'key'});
297 my @val = $self->fill_in($rec,$i->{'val'});
298 if ($key && @val) {
299 $log->debug("stored $key = ",sub { join(" | ",@val) });
300 push @{$self->{'lookup'}->{$key}}, @val;
301 }
302 }
303 }
304 }
305
306 =head2 get_data
307
308 Returns value from record.
309
310 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
311
312 Arguments are:
313 record reference C<$rec>,
314 field C<$f>,
315 optional subfiled C<$sf>,
316 index for repeatable values C<$i>.
317
318 Optinal variable C<$found> will be incremeted if there
319 is field.
320
321 Returns value or empty string.
322
323 =cut
324
325 sub get_data {
326 my $self = shift;
327
328 my ($rec,$f,$sf,$i,$found) = @_;
329
330 if ($$rec->{$f}) {
331 return '' if (! $$rec->{$f}->[$i]);
332 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
333 $$found++ if (defined($$found));
334 return $$rec->{$f}->[$i]->{$sf};
335 } elsif ($$rec->{$f}->[$i]) {
336 $$found++ if (defined($$found));
337 # it still might have subfield, just
338 # not specified, so we'll dump all
339 if ($$rec->{$f}->[$i] =~ /HASH/o) {
340 my $out;
341 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
342 $out .= $$rec->{$f}->[$i]->{$k}." ";
343 }
344 return $out;
345 } else {
346 return $$rec->{$f}->[$i];
347 }
348 }
349 } else {
350 return '';
351 }
352 }
353
354 =head2 fill_in
355
356 Workhourse of all: takes record from in-memory structure of database and
357 strings with placeholders and returns string or array of with substituted
358 values from record.
359
360 my $text = $webpac->fill_in($rec,'v250^a');
361
362 Optional argument is ordinal number for repeatable fields. By default,
363 it's assume to be first repeatable field (fields are perl array, so first
364 element is 0).
365 Following example will read second value from repeatable field.
366
367 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
368
369 This function B<does not> perform parsing of format to inteligenty skip
370 delimiters before fields which aren't used.
371
372 =cut
373
374 sub fill_in {
375 my $self = shift;
376
377 my $log = $self->_get_logger();
378
379 my $rec = shift || $log->logconfess("need data record");
380 my $format = shift || $log->logconfess("need format to parse");
381 # iteration (for repeatable fields)
382 my $i = shift || 0;
383
384 # FIXME remove for speedup?
385 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
386
387 my $found = 0;
388
389 my $eval_code;
390 # remove eval{...} from beginning
391 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
392
393 # do actual replacement of placeholders
394 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
395
396 if ($found) {
397 $log->debug("format: $format");
398 if ($eval_code) {
399 my $eval = $self->fill_in($rec,$eval_code,$i);
400 return if (! $self->_eval($eval));
401 }
402 # do we have lookups?
403 if ($format =~ /$LOOKUP_REGEX/o) {
404 $log->debug("format '$format' has lookup");
405 return $self->lookup($format);
406 } else {
407 return $format;
408 }
409 } else {
410 return;
411 }
412 }
413
414 =head2 lookup
415
416 Perform lookups on format supplied to it.
417
418 my $text = $self->lookup('[v900]');
419
420 Lookups can be nested (like C<[d:[a:[v900]]]>).
421
422 =cut
423
424 sub lookup {
425 my $self = shift;
426
427 my $log = $self->_get_logger();
428
429 my $tmp = shift || $log->logconfess("need format");
430
431 if ($tmp =~ /$LOOKUP_REGEX/o) {
432 my @in = ( $tmp );
433
434 $log->debug("lookup for: ",$tmp);
435
436 my @out;
437 while (my $f = shift @in) {
438 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
439 my $k = $1;
440 if ($self->{'lookup'}->{$k}) {
441 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
442 my $tmp2 = $f;
443 $tmp2 =~ s/lookup{$k}/$nv/g;
444 push @in, $tmp2;
445 }
446 } else {
447 undef $f;
448 }
449 } elsif ($f) {
450 push @out, $f;
451 }
452 }
453 $log->logconfess("return is array and it's not expected!") unless wantarray;
454 return @out;
455 } else {
456 return $tmp;
457 }
458 }
459
460 =head2 parse
461
462 Perform smart parsing of string, skipping delimiters for fields which aren't
463 defined. It can also eval code in format starting with C<eval{...}> and
464 return output or nothing depending on eval code.
465
466 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
467
468 =cut
469
470 sub parse {
471 my $self = shift;
472
473 my ($rec, $format_utf8, $i) = @_;
474
475 return if (! $format_utf8);
476
477 my $log = $self->_get_logger();
478
479 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
480 $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
481
482 $i = 0 if (! $i);
483
484 my $format = $self->{'utf2cp'}->convert($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
485
486 my @out;
487
488 $log->debug("format: $format");
489
490 my $eval_code;
491 # remove eval{...} from beginning
492 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
493
494 my $prefix;
495 my $all_found=0;
496
497 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
498
499 my $del = $1 || '';
500 $prefix ||= $del if ($all_found == 0);
501
502 my $found = 0;
503 my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
504
505 if ($found) {
506 push @out, $del;
507 push @out, $tmp;
508 $all_found += $found;
509 }
510 }
511
512 return if (! $all_found);
513
514 my $out = join('',@out);
515
516 if ($out) {
517 # add rest of format (suffix)
518 $out .= $format;
519
520 # add prefix if not there
521 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
522
523 $log->debug("result: $out");
524 }
525
526 if ($eval_code) {
527 my $eval = $self->fill_in($rec,$eval_code,$i);
528 $log->debug("about to eval{",$eval,"} format: $out");
529 return if (! $self->_eval($eval));
530 }
531
532 return $out;
533 }
534
535 =head2 parse_to_arr
536
537 Similar to C<parse>, but returns array of all repeatable fields
538
539 my @arr = $webpac->parse_to_arr($rec,'v250^a');
540
541 =cut
542
543 sub parse_to_arr {
544 my $self = shift;
545
546 my ($rec, $format_utf8) = @_;
547
548 my $log = $self->_get_logger();
549
550 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
551 return if (! $format_utf8);
552
553 my $i = 0;
554 my @arr;
555
556 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
557 push @arr, $v;
558 }
559
560 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
561
562 return @arr;
563 }
564
565 =head2 fill_in_to_arr
566
567 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
568 for fields which have lookups, so they shouldn't be parsed but rather
569 C<fill_id>ed.
570
571 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
572
573 =cut
574
575 sub fill_in_to_arr {
576 my $self = shift;
577
578 my ($rec, $format_utf8) = @_;
579
580 my $log = $self->_get_logger();
581
582 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
583 return if (! $format_utf8);
584
585 my $i = 0;
586 my @arr;
587
588 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
589 push @arr, @v;
590 }
591
592 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
593
594 return @arr;
595 }
596
597
598 =head2 data_structure
599
600 Create in-memory data structure which represents layout from C<import_xml>.
601 It is used later to produce output.
602
603 my @ds = $webpac->data_structure($rec);
604
605 =cut
606
607 sub data_structure {
608 my $self = shift;
609
610 my $log = $self->_get_logger();
611
612 my $rec = shift;
613 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
614
615 my @sorted_tags;
616 if ($self->{tags_by_order}) {
617 @sorted_tags = @{$self->{tags_by_order}};
618 } else {
619 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
620 $self->{tags_by_order} = \@sorted_tags;
621 }
622
623 my @ds;
624
625 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
626
627 foreach my $field (@sorted_tags) {
628
629 my $row;
630
631 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
632
633 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
634 my $format = $tag->{'value'} || $tag->{'content'};
635
636 $log->debug("format: $format");
637
638 my @v;
639 if ($format =~ /$LOOKUP_REGEX/o) {
640 @v = $self->fill_in_to_arr($rec,$format);
641 } else {
642 @v = $self->parse_to_arr($rec,$format);
643 }
644 next if (! @v);
645
646 # does tag have type?
647 if ($tag->{'type'}) {
648 push @{$row->{$tag->{'type'}}}, @v;
649 } else {
650 push @{$row->{'display'}}, @v;
651 push @{$row->{'swish'}}, @v;
652 }
653
654 }
655
656 if ($row) {
657 $row->{'tag'} = $field;
658 push @ds, $row;
659 $log->debug("row $field: ",sub { Dumper($row) });
660 }
661
662 }
663
664 return @ds;
665
666 }
667
668 =head2 output
669
670 Create output from in-memory data structure using Template Toolkit template.
671
672 my $text = $webpac->output( template => 'text.tt', data => @ds );
673
674 =cut
675
676 sub output {
677 my $self = shift;
678
679 my $args = {@_};
680
681 my $log = $self->_get_logger();
682
683 $log->logconfess("need template name") if (! $args->{'template'});
684 $log->logconfess("need data array") if (! $args->{'data'});
685
686 my $out;
687
688 $self->{'tt'}->process(
689 $args->{'template'},
690 $args,
691 \$out
692 ) || confess $self->{'tt'}->error();
693
694 return $out;
695 }
696
697 #
698 #
699 #
700
701 =head1 INTERNAL METHODS
702
703 Here is a quick list of internal methods, mostly useful to turn debugging
704 on them (see L<LOGGING> below for explanation).
705
706 =cut
707
708 =head2 _eval
709
710 Internal function to eval code without C<strict 'subs'>.
711
712 =cut
713
714 sub _eval {
715 my $self = shift;
716
717 my $code = shift || return;
718
719 my $log = $self->_get_logger();
720
721 no strict 'subs';
722 my $ret = eval $code;
723 if ($@) {
724 $log->error("problem with eval code [$code]: $@");
725 }
726
727 $log->debug("eval: ",$code," [",$ret,"]");
728
729 return $ret || 0;
730 }
731
732 =head2 _sort_by_order
733
734 Sort xml tags data structure accoding to C<order=""> attribute.
735
736 =cut
737
738 sub _sort_by_order {
739 my $self = shift;
740
741 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
742 $self->{'import_xml'}->{'indexer'}->{$a};
743 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
744 $self->{'import_xml'}->{'indexer'}->{$b};
745
746 return $va <=> $vb;
747 }
748
749 sub _get_logger {
750 my $self = shift;
751
752 my @c = caller(1);
753 return get_logger($c[3]);
754 }
755
756 #
757 #
758 #
759
760 =head1 LOGGING
761
762 Logging in WebPAC is performed by L<Log::Log4perl> with config file
763 C<log.conf>.
764
765 Methods defined above have different levels of logging, so
766 it's descriptions will be useful to turn (mostry B<debug> logging) on
767 or off to see why WabPAC isn't perforing as you expect it (it might even
768 be a bug!).
769
770 B<This is different from normal Log4perl behaviour>. To repeat, you can
771 also use method names, and not only classes (which are just few)
772 to filter logging.
773
774 =cut
775
776 1;

  ViewVC Help
Powered by ViewVC 1.1.26