/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 374 - (show annotations)
Sun Jun 20 16:57:52 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 16881 byte(s)
save mfn as field v000, _get logger handles calls from main as it should,
support for <filename> tag

1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10 use Template;
11 use Log::Log4perl qw(get_logger :levels);
12
13 use Data::Dumper;
14
15 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
16 #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
17 my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
18 my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
19
20 =head1 NAME
21
22 WebPAC - base class for WebPAC
23
24 =head1 DESCRIPTION
25
26 This module implements methods used by WebPAC.
27
28 =head1 METHODS
29
30 =head2 new
31
32 This will create new instance of WebPAC using configuration specified by C<config_file>.
33
34 my $webpac = new WebPAC(
35 config_file => 'name.conf',
36 [code_page => 'ISO-8859-2',]
37 );
38
39 Default C<code_page> is C<ISO-8859-2>.
40
41 It will also read configuration files
42 C<global.conf> (used by indexer and Web font-end)
43 and configuration file specified by C<config_file>
44 which describes databases to be indexed.
45
46 =cut
47
48 # mapping between data type and tag which specify
49 # format in XML file
50 my %type2tag = (
51 'isis' => 'isis',
52 # 'excel' => 'column',
53 # 'marc' => 'marc',
54 # 'feed' => 'feed'
55 );
56
57 sub new {
58 my $class = shift;
59 my $self = {@_};
60 bless($self, $class);
61
62 my $log_file = $self->{'log'} || "log.conf";
63 Log::Log4perl->init($log_file);
64
65 my $log = $self->_get_logger();
66
67 # fill in default values
68 # output codepage
69 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
70
71 #
72 # read global.conf
73 #
74 $log->debug("read 'global.conf'");
75
76 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
77
78 # read global config parametars
79 foreach my $var (qw(
80 dbi_dbd
81 dbi_dsn
82 dbi_user
83 dbi_passwd
84 show_progress
85 my_unac_filter
86 output_template
87 )) {
88 $self->{'global_config'}->{$var} = $config->val('global', $var);
89 }
90
91 #
92 # read indexer config file
93 #
94
95 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
96
97 # create UTF-8 convertor for import_xml files
98 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
99
100 # create Template toolkit instance
101 $self->{'tt'} = Template->new(
102 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
103 # FILTERS => {
104 # 'foo' => \&foo_filter,
105 # },
106 EVAL_PERL => 1,
107 );
108
109 return $self;
110 }
111
112 =head2 open_isis
113
114 Open CDS/ISIS database using OpenIsis module and read all records to memory.
115
116 $webpac->open_isis(
117 filename => '/data/ISIS/ISIS',
118 code_page => '852',
119 limit_mfn => '500',
120 lookup => [ ... ],
121 );
122
123 By default, ISIS code page is assumed to be C<852>.
124
125 If optional parametar C<limit_mfn> is set, it will read just 500 records
126 from database in example above.
127
128 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
129 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
130 value in index.
131
132 lookup => [
133 { 'key' => 'd:v900', 'val' => 'v250^a' },
134 { 'eval' => '"v901^a" eq "Podruèje"',
135 'key' => 'pa:v561^4:v562^4:v461^1',
136 'val' => 'v900' },
137 ]
138
139 Returns number of last record read into memory (size of database, really).
140
141 =cut
142
143 sub open_isis {
144 my $self = shift;
145 my $arg = {@_};
146
147 my $log = $self->_get_logger();
148
149 $log->logcroak("need filename") if (! $arg->{'filename'});
150 my $code_page = $arg->{'code_page'} || '852';
151
152 use OpenIsis;
153
154 #$self->{'isis_code_page'} = $code_page;
155
156 # create Text::Iconv object
157 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
158
159 $log->info("reading ISIS database '",$arg->{'filename'},"'");
160
161 my $isis_db = OpenIsis::open($arg->{'filename'});
162
163 my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
164
165 $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});
166
167 $log->info("processing $maxmfn records...");
168
169 # read database
170 for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
171
172 # read record
173 my $row = OpenIsis::read( $isis_db, $mfn );
174 foreach my $k (keys %{$row}) {
175 if ($k ne "mfn") {
176 foreach my $l (@{$row->{$k}}) {
177 $l = $cp->convert($l);
178 # has subfields?
179 my $val;
180 if ($l =~ m/\^/) {
181 foreach my $t (split(/\^/,$l)) {
182 next if (! $t);
183 $val->{substr($t,0,1)} = substr($t,1);
184 }
185 } else {
186 $val = $l;
187 }
188
189 push @{$self->{'data'}->{$mfn}->{$k}}, $val;
190 }
191 } else {
192 push @{$self->{'data'}->{$mfn}->{'000'}}, $mfn;
193 }
194
195 }
196
197 # create lookup
198 my $rec = $self->{'data'}->{$mfn};
199 $self->create_lookup($rec, @{$arg->{'lookup'}});
200
201 }
202
203 $self->{'current_mfn'} = 1;
204
205 # store max mfn and return it.
206 return $self->{'max_mfn'} = $maxmfn;
207 }
208
209 =head2 fetch_rec
210
211 Fetch next record from database. It will also display progress bar (once
212 it's implemented, that is).
213
214 my $rec = $webpac->fetch_rec;
215
216 =cut
217
218 sub fetch_rec {
219 my $self = shift;
220
221 my $log = $self->_get_logger();
222
223 my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");
224
225 if ($mfn > $self->{'max_mfn'}) {
226 $self->{'current_mfn'} = $self->{'max_mfn'};
227 $log->debug("at EOF");
228 return;
229 }
230
231 return $self->{'data'}->{$mfn};
232 }
233
234 =head2 open_import_xml
235
236 Read file from C<import_xml/> directory and parse it.
237
238 $webpac->open_import_xml(type => 'isis');
239
240 =cut
241
242 sub open_import_xml {
243 my $self = shift;
244
245 my $log = $self->_get_logger();
246
247 my $arg = {@_};
248 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
249
250 $self->{'type'} = $arg->{'type'};
251
252 my $type_base = $arg->{'type'};
253 $type_base =~ s/_.*$//g;
254
255 $self->{'tag'} = $type2tag{$type_base};
256
257 $log->debug("using type '",$self->{'type'},"' tag <",$self->{'tag'},">") if ($self->{'debug'});
258
259 my $f = "./import_xml/".$self->{'type'}.".xml";
260 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
261
262 $log->debug("reading '$f'") if ($self->{'debug'});
263
264 $self->{'import_xml'} = XMLin($f,
265 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
266 );
267
268 }
269
270 =head2 create_lookup
271
272 Create lookup from record using lookup definition.
273
274 $self->create_lookup($rec, @lookups);
275
276 Called internally by C<open_*> methods.
277
278 =cut
279
280 sub create_lookup {
281 my $self = shift;
282
283 my $log = $self->_get_logger();
284
285 my $rec = shift || $log->logconfess("need record to create lookup");
286 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
287
288 foreach my $i (@_) {
289 if ($i->{'eval'}) {
290 my $eval = $self->fill_in($rec,$i->{'eval'});
291 my $key = $self->fill_in($rec,$i->{'key'});
292 my @val = $self->fill_in($rec,$i->{'val'});
293 if ($key && @val && eval $eval) {
294 $log->debug("stored $key = ",sub { join(" | ",@val) });
295 push @{$self->{'lookup'}->{$key}}, @val;
296 }
297 } else {
298 my $key = $self->fill_in($rec,$i->{'key'});
299 my @val = $self->fill_in($rec,$i->{'val'});
300 if ($key && @val) {
301 $log->debug("stored $key = ",sub { join(" | ",@val) });
302 push @{$self->{'lookup'}->{$key}}, @val;
303 }
304 }
305 }
306 }
307
308 =head2 get_data
309
310 Returns value from record.
311
312 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
313
314 Arguments are:
315 record reference C<$rec>,
316 field C<$f>,
317 optional subfiled C<$sf>,
318 index for repeatable values C<$i>.
319
320 Optinal variable C<$found> will be incremeted if there
321 is field.
322
323 Returns value or empty string.
324
325 =cut
326
327 sub get_data {
328 my $self = shift;
329
330 my ($rec,$f,$sf,$i,$found) = @_;
331
332 if ($$rec->{$f}) {
333 return '' if (! $$rec->{$f}->[$i]);
334 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
335 $$found++ if (defined($$found));
336 return $$rec->{$f}->[$i]->{$sf};
337 } elsif ($$rec->{$f}->[$i]) {
338 $$found++ if (defined($$found));
339 # it still might have subfield, just
340 # not specified, so we'll dump all
341 if ($$rec->{$f}->[$i] =~ /HASH/o) {
342 my $out;
343 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
344 $out .= $$rec->{$f}->[$i]->{$k}." ";
345 }
346 return $out;
347 } else {
348 return $$rec->{$f}->[$i];
349 }
350 }
351 } else {
352 return '';
353 }
354 }
355
356 =head2 fill_in
357
358 Workhourse of all: takes record from in-memory structure of database and
359 strings with placeholders and returns string or array of with substituted
360 values from record.
361
362 my $text = $webpac->fill_in($rec,'v250^a');
363
364 Optional argument is ordinal number for repeatable fields. By default,
365 it's assume to be first repeatable field (fields are perl array, so first
366 element is 0).
367 Following example will read second value from repeatable field.
368
369 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
370
371 This function B<does not> perform parsing of format to inteligenty skip
372 delimiters before fields which aren't used.
373
374 =cut
375
376 sub fill_in {
377 my $self = shift;
378
379 my $log = $self->_get_logger();
380
381 my $rec = shift || $log->logconfess("need data record");
382 my $format = shift || $log->logconfess("need format to parse");
383 # iteration (for repeatable fields)
384 my $i = shift || 0;
385
386 # FIXME remove for speedup?
387 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
388
389 my $found = 0;
390
391 my $eval_code;
392 # remove eval{...} from beginning
393 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
394
395 # do actual replacement of placeholders
396 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
397
398 if ($found) {
399 $log->debug("format: $format");
400 if ($eval_code) {
401 my $eval = $self->fill_in($rec,$eval_code,$i);
402 return if (! $self->_eval($eval));
403 }
404 # do we have lookups?
405 if ($format =~ /$LOOKUP_REGEX/o) {
406 $log->debug("format '$format' has lookup");
407 return $self->lookup($format);
408 } else {
409 return $format;
410 }
411 } else {
412 return;
413 }
414 }
415
416 =head2 lookup
417
418 Perform lookups on format supplied to it.
419
420 my $text = $self->lookup('[v900]');
421
422 Lookups can be nested (like C<[d:[a:[v900]]]>).
423
424 =cut
425
426 sub lookup {
427 my $self = shift;
428
429 my $log = $self->_get_logger();
430
431 my $tmp = shift || $log->logconfess("need format");
432
433 if ($tmp =~ /$LOOKUP_REGEX/o) {
434 my @in = ( $tmp );
435
436 $log->debug("lookup for: ",$tmp);
437
438 my @out;
439 while (my $f = shift @in) {
440 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
441 my $k = $1;
442 if ($self->{'lookup'}->{$k}) {
443 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
444 my $tmp2 = $f;
445 $tmp2 =~ s/lookup{$k}/$nv/g;
446 push @in, $tmp2;
447 }
448 } else {
449 undef $f;
450 }
451 } elsif ($f) {
452 push @out, $f;
453 }
454 }
455 $log->logconfess("return is array and it's not expected!") unless wantarray;
456 return @out;
457 } else {
458 return $tmp;
459 }
460 }
461
462 =head2 parse
463
464 Perform smart parsing of string, skipping delimiters for fields which aren't
465 defined. It can also eval code in format starting with C<eval{...}> and
466 return output or nothing depending on eval code.
467
468 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
469
470 =cut
471
472 sub parse {
473 my $self = shift;
474
475 my ($rec, $format_utf8, $i) = @_;
476
477 return if (! $format_utf8);
478
479 my $log = $self->_get_logger();
480
481 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
482 $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
483
484 $i = 0 if (! $i);
485
486 my $format = $self->{'utf2cp'}->convert($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
487
488 my @out;
489
490 $log->debug("format: $format");
491
492 my $eval_code;
493 # remove eval{...} from beginning
494 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
495
496 my $prefix;
497 my $all_found=0;
498
499 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
500
501 my $del = $1 || '';
502 $prefix ||= $del if ($all_found == 0);
503
504 my $found = 0;
505 my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
506
507 if ($found) {
508 push @out, $del;
509 push @out, $tmp;
510 $all_found += $found;
511 }
512 }
513
514 return if (! $all_found);
515
516 my $out = join('',@out);
517
518 if ($out) {
519 # add rest of format (suffix)
520 $out .= $format;
521
522 # add prefix if not there
523 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
524
525 $log->debug("result: $out");
526 }
527
528 if ($eval_code) {
529 my $eval = $self->fill_in($rec,$eval_code,$i);
530 $log->debug("about to eval{",$eval,"} format: $out");
531 return if (! $self->_eval($eval));
532 }
533
534 return $out;
535 }
536
537 =head2 parse_to_arr
538
539 Similar to C<parse>, but returns array of all repeatable fields
540
541 my @arr = $webpac->parse_to_arr($rec,'v250^a');
542
543 =cut
544
545 sub parse_to_arr {
546 my $self = shift;
547
548 my ($rec, $format_utf8) = @_;
549
550 my $log = $self->_get_logger();
551
552 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
553 return if (! $format_utf8);
554
555 my $i = 0;
556 my @arr;
557
558 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
559 push @arr, $v;
560 }
561
562 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
563
564 return @arr;
565 }
566
567 =head2 fill_in_to_arr
568
569 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
570 for fields which have lookups, so they shouldn't be parsed but rather
571 C<fill_id>ed.
572
573 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
574
575 =cut
576
577 sub fill_in_to_arr {
578 my $self = shift;
579
580 my ($rec, $format_utf8) = @_;
581
582 my $log = $self->_get_logger();
583
584 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
585 return if (! $format_utf8);
586
587 my $i = 0;
588 my @arr;
589
590 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
591 push @arr, @v;
592 }
593
594 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
595
596 return @arr;
597 }
598
599
600 =head2 data_structure
601
602 Create in-memory data structure which represents layout from C<import_xml>.
603 It is used later to produce output.
604
605 my @ds = $webpac->data_structure($rec);
606
607 This method will also set C<$webpac->{'currnet_filename'}> if there is
608 <filename> tag in C<import_xml>.
609
610 =cut
611
612 sub data_structure {
613 my $self = shift;
614
615 my $log = $self->_get_logger();
616
617 my $rec = shift;
618 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
619
620 undef $self->{'currnet_filename'};
621
622 my @sorted_tags;
623 if ($self->{tags_by_order}) {
624 @sorted_tags = @{$self->{tags_by_order}};
625 } else {
626 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
627 $self->{tags_by_order} = \@sorted_tags;
628 }
629
630 my @ds;
631
632 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
633
634 foreach my $field (@sorted_tags) {
635
636 my $row;
637
638 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
639
640 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
641 my $format = $tag->{'value'} || $tag->{'content'};
642
643 $log->debug("format: $format");
644
645 my @v;
646 if ($format =~ /$LOOKUP_REGEX/o) {
647 @v = $self->fill_in_to_arr($rec,$format);
648 } else {
649 @v = $self->parse_to_arr($rec,$format);
650 }
651 next if (! @v);
652
653 # does tag have type?
654 if ($tag->{'type'}) {
655 push @{$row->{$tag->{'type'}}}, @v;
656 } else {
657 push @{$row->{'display'}}, @v;
658 push @{$row->{'swish'}}, @v;
659 }
660
661 if ($field eq 'filename') {
662 $self->{'current_filename'} = join('',@v);
663 $log->debug("filename: ",$self->{'current_filename'});
664 }
665
666 }
667
668 if ($row) {
669 $row->{'tag'} = $field;
670 push @ds, $row;
671
672 $log->debug("row $field: ",sub { Dumper($row) });
673 }
674
675 }
676
677 return @ds;
678
679 }
680
681 =head2 output
682
683 Create output from in-memory data structure using Template Toolkit template.
684
685 my $text = $webpac->output( template => 'text.tt', data => @ds );
686
687 =cut
688
689 sub output {
690 my $self = shift;
691
692 my $args = {@_};
693
694 my $log = $self->_get_logger();
695
696 $log->logconfess("need template name") if (! $args->{'template'});
697 $log->logconfess("need data array") if (! $args->{'data'});
698
699 my $out;
700
701 $self->{'tt'}->process(
702 $args->{'template'},
703 $args,
704 \$out
705 ) || confess $self->{'tt'}->error();
706
707 return $out;
708 }
709
710 #
711 #
712 #
713
714 =head1 INTERNAL METHODS
715
716 Here is a quick list of internal methods, mostly useful to turn debugging
717 on them (see L<LOGGING> below for explanation).
718
719 =cut
720
721 =head2 _eval
722
723 Internal function to eval code without C<strict 'subs'>.
724
725 =cut
726
727 sub _eval {
728 my $self = shift;
729
730 my $code = shift || return;
731
732 my $log = $self->_get_logger();
733
734 no strict 'subs';
735 my $ret = eval $code;
736 if ($@) {
737 $log->error("problem with eval code [$code]: $@");
738 }
739
740 $log->debug("eval: ",$code," [",$ret,"]");
741
742 return $ret || 0;
743 }
744
745 =head2 _sort_by_order
746
747 Sort xml tags data structure accoding to C<order=""> attribute.
748
749 =cut
750
751 sub _sort_by_order {
752 my $self = shift;
753
754 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
755 $self->{'import_xml'}->{'indexer'}->{$a};
756 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
757 $self->{'import_xml'}->{'indexer'}->{$b};
758
759 return $va <=> $vb;
760 }
761
762 sub _get_logger {
763 my $self = shift;
764
765 my $name = (caller(1))[3] || caller;
766 return get_logger($name);
767 }
768
769 #
770 #
771 #
772
773 =head1 LOGGING
774
775 Logging in WebPAC is performed by L<Log::Log4perl> with config file
776 C<log.conf>.
777
778 Methods defined above have different levels of logging, so
779 it's descriptions will be useful to turn (mostry B<debug> logging) on
780 or off to see why WabPAC isn't perforing as you expect it (it might even
781 be a bug!).
782
783 B<This is different from normal Log4perl behaviour>. To repeat, you can
784 also use method names, and not only classes (which are just few)
785 to filter logging.
786
787 =cut
788
789 1;

  ViewVC Help
Powered by ViewVC 1.1.26