/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 372 - (show annotations)
Sat Jun 19 18:16:20 2004 UTC (15 years, 7 months ago) by dpavlin
File size: 14882 byte(s)
Log4perl implementation

1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10 use Template;
11 use Log::Log4perl qw(get_logger :levels);
12
13 use Data::Dumper;
14
15 =head1 NAME
16
17 WebPAC - base class for WebPAC
18
19 =head1 DESCRIPTION
20
21 This module implements methods used by WebPAC.
22
23 =head1 METHODS
24
25 =head2 new
26
27 This will create new instance of WebPAC using configuration specified by C<config_file>.
28
29 my $webpac = new WebPAC(
30 config_file => 'name.conf',
31 [code_page => 'ISO-8859-2',]
32 );
33
34 Default C<code_page> is C<ISO-8859-2>.
35
36 It will also read configuration files
37 C<global.conf> (used by indexer and Web font-end)
38 and configuration file specified by C<config_file>
39 which describes databases to be indexed.
40
41 =cut
42
43 # mapping between data type and tag which specify
44 # format in XML file
45 my %type2tag = (
46 'isis' => 'isis',
47 # 'excel' => 'column',
48 # 'marc' => 'marc',
49 # 'feed' => 'feed'
50 );
51
52 sub new {
53 my $class = shift;
54 my $self = {@_};
55 bless($self, $class);
56
57 my $log_file = $self->{'log'} || "log.conf";
58 Log::Log4perl->init($log_file);
59
60 my $log = $self->_get_logger();
61
62 # fill in default values
63 # output codepage
64 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
65
66 #
67 # read global.conf
68 #
69 $log->debug("read 'global.conf'");
70
71 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
72
73 # read global config parametars
74 foreach my $var (qw(
75 dbi_dbd
76 dbi_dsn
77 dbi_user
78 dbi_passwd
79 show_progress
80 my_unac_filter
81 output_template
82 )) {
83 $self->{'global_config'}->{$var} = $config->val('global', $var);
84 }
85
86 #
87 # read indexer config file
88 #
89
90 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
91
92 # create UTF-8 convertor for import_xml files
93 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
94
95 # create Template toolkit instance
96 $self->{'tt'} = Template->new(
97 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
98 # FILTERS => {
99 # 'foo' => \&foo_filter,
100 # },
101 EVAL_PERL => 1,
102 );
103
104 return $self;
105 }
106
107 =head2 open_isis
108
109 Open CDS/ISIS database using OpenIsis module and read all records to memory.
110
111 $webpac->open_isis(
112 filename => '/data/ISIS/ISIS',
113 code_page => '852',
114 limit_mfn => '500',
115 lookup => [ ... ],
116 );
117
118 By default, ISIS code page is assumed to be C<852>.
119
120 If optional parametar C<limit_mfn> is set, it will read just 500 records
121 from database in example above.
122
123 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
124 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
125 value in index.
126
127 lookup => [
128 { 'key' => 'd:v900', 'val' => 'v250^a' },
129 { 'eval' => '"v901^a" eq "Podruèje"',
130 'key' => 'pa:v561^4:v562^4:v461^1',
131 'val' => 'v900' },
132 ]
133
134 Returns number of last record read into memory (size of database, really).
135
136 =cut
137
138 sub open_isis {
139 my $self = shift;
140 my $arg = {@_};
141
142 my $log = $self->_get_logger();
143
144 $log->logcroak("need filename") if (! $arg->{'filename'});
145 my $code_page = $arg->{'code_page'} || '852';
146
147 use OpenIsis;
148
149 #$self->{'isis_code_page'} = $code_page;
150
151 # create Text::Iconv object
152 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
153
154 $log->info("reading ISIS database '",$arg->{'filename'},"'");
155
156 my $isis_db = OpenIsis::open($arg->{'filename'});
157
158 my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
159
160 $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});
161
162 $log->info("processing $maxmfn records...");
163
164 # read database
165 for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
166
167 # read record
168 my $row = OpenIsis::read( $isis_db, $mfn );
169 foreach my $k (keys %{$row}) {
170 if ($k ne "mfn") {
171 foreach my $l (@{$row->{$k}}) {
172 $l = $cp->convert($l);
173 # has subfields?
174 my $val;
175 if ($l =~ m/\^/) {
176 foreach my $t (split(/\^/,$l)) {
177 next if (! $t);
178 $val->{substr($t,0,1)} = substr($t,1);
179 }
180 } else {
181 $val = $l;
182 }
183
184 push @{$self->{'data'}->{$mfn}->{$k}}, $val;
185 }
186 }
187
188 }
189
190 # create lookup
191 my $rec = $self->{'data'}->{$mfn};
192 $self->create_lookup($rec, @{$arg->{'lookup'}});
193
194 }
195
196 $self->{'current_mfn'} = 1;
197
198 # store max mfn and return it.
199 return $self->{'max_mfn'} = $maxmfn;
200 }
201
202 =head2 fetch_rec
203
204 Fetch next record from database. It will also display progress bar (once
205 it's implemented, that is).
206
207 my $rec = $webpac->fetch_rec;
208
209 =cut
210
211 sub fetch_rec {
212 my $self = shift;
213
214 my $log = $self->_get_logger();
215
216 my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");
217
218 if ($mfn > $self->{'max_mfn'}) {
219 $self->{'current_mfn'} = $self->{'max_mfn'};
220 return;
221 }
222
223 return $self->{'data'}->{$mfn};
224 }
225
226 =head2 open_import_xml
227
228 Read file from C<import_xml/> directory and parse it.
229
230 $webpac->open_import_xml(type => 'isis');
231
232 =cut
233
234 sub open_import_xml {
235 my $self = shift;
236
237 my $log = $self->_get_logger();
238
239 my $arg = {@_};
240 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
241
242 $self->{'type'} = $arg->{'type'};
243
244 my $type_base = $arg->{'type'};
245 $type_base =~ s/_.*$//g;
246
247 $self->{'tag'} = $type2tag{$type_base};
248
249 $log->debug("using type '",$self->{'type'},"' tag <",$self->{'tag'},">") if ($self->{'debug'});
250
251 my $f = "./import_xml/".$self->{'type'}.".xml";
252 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
253
254 $log->debug("reading '$f'") if ($self->{'debug'});
255
256 $self->{'import_xml'} = XMLin($f,
257 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
258 ForceContent => 1
259 );
260
261 }
262
263 =head2 create_lookup
264
265 Create lookup from record using lookup definition.
266
267 $self->create_lookup($rec, @lookups);
268
269 Called internally by C<open_*> methods.
270
271 =cut
272
273 sub create_lookup {
274 my $self = shift;
275
276 my $log = $self->_get_logger();
277
278 my $rec = shift || $log->logconfess("need record to create lookup");
279 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
280
281 foreach my $i (@_) {
282 if ($i->{'eval'}) {
283 my $eval = $self->fill_in($rec,$i->{'eval'});
284 my $key = $self->fill_in($rec,$i->{'key'});
285 my @val = $self->fill_in($rec,$i->{'val'});
286 if ($key && @val && eval $eval) {
287 push @{$self->{'lookup'}->{$key}}, @val;
288 }
289 } else {
290 my $key = $self->fill_in($rec,$i->{'key'});
291 my @val = $self->fill_in($rec,$i->{'val'});
292 if ($key && @val) {
293 push @{$self->{'lookup'}->{$key}}, @val;
294 }
295 }
296 }
297 }
298
299 =head2 get_data
300
301 Returns value from record.
302
303 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
304
305 Arguments are:
306 record reference C<$rec>,
307 field C<$f>,
308 optional subfiled C<$sf>,
309 index for repeatable values C<$i>.
310
311 Optinal variable C<$found> will be incremeted if there
312 is field.
313
314 Returns value or empty string.
315
316 =cut
317
318 sub get_data {
319 my $self = shift;
320
321 my ($rec,$f,$sf,$i,$found) = @_;
322
323 if ($$rec->{$f}) {
324 return '' if (! $$rec->{$f}->[$i]);
325 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
326 $$found++ if (defined($$found));
327 return $$rec->{$f}->[$i]->{$sf};
328 } elsif ($$rec->{$f}->[$i]) {
329 $$found++ if (defined($$found));
330 # it still might have subfield, just
331 # not specified, so we'll dump all
332 if ($$rec->{$f}->[$i] =~ /HASH/o) {
333 my $out;
334 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
335 $out .= $$rec->{$f}->[$i]->{$k}." ";
336 }
337 return $out;
338 } else {
339 return $$rec->{$f}->[$i];
340 }
341 }
342 } else {
343 return '';
344 }
345 }
346
347 =head2 fill_in
348
349 Workhourse of all: takes record from in-memory structure of database and
350 strings with placeholders and returns string or array of with substituted
351 values from record.
352
353 my $text = $webpac->fill_in($rec,'v250^a');
354
355 Optional argument is ordinal number for repeatable fields. By default,
356 it's assume to be first repeatable field (fields are perl array, so first
357 element is 0).
358 Following example will read second value from repeatable field.
359
360 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
361
362 This function B<does not> perform parsing of format to inteligenty skip
363 delimiters before fields which aren't used.
364
365 =cut
366
367 sub fill_in {
368 my $self = shift;
369
370 my $log = $self->_get_logger();
371
372 my $rec = shift || $log->logconfess("need data record");
373 my $format = shift || $log->logconfess("need format to parse");
374 # iteration (for repeatable fields)
375 my $i = shift || 0;
376
377 # FIXME remove for speedup?
378 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
379
380 my $found = 0;
381
382 my $eval_code;
383 # remove eval{...} from beginning
384 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
385
386 # do actual replacement of placeholders
387 $format =~ s/v(\d+)(?:\^(\w))*/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
388
389 if ($found) {
390 if ($eval_code) {
391 my $eval = $self->fill_in($rec,$eval_code,$i);
392 return if (! $self->_eval($eval));
393 }
394 # do we have lookups?
395 $log->debug("test format '$format' for lookups");
396 if ($format =~ /\[[^\[\]]+\]/o) {
397 return $self->lookup($format);
398 } else {
399 return $format;
400 }
401 } else {
402 return;
403 }
404 }
405
406 =head2 lookup
407
408 Perform lookups on format supplied to it.
409
410 my $text = $self->lookup('[v900]');
411
412 Lookups can be nested (like C<[d:[a:[v900]]]>).
413
414 =cut
415
416 sub lookup {
417 my $self = shift;
418
419 my $log = $self->_get_logger();
420
421 my $tmp = shift || $log->logconfess("need format");
422
423 if ($tmp =~ /\[[^\[\]]+\]/o) {
424 my @in = ( $tmp );
425
426 $log->debug("lookup for: ",$tmp);
427
428 my @out;
429 while (my $f = shift @in) {
430 if ($f =~ /\[([^\[\]]+)\]/) {
431 my $k = $1;
432 if ($self->{'lookup'}->{$k}) {
433 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
434 my $tmp2 = $f;
435 $tmp2 =~ s/\[$k\]/$nv/g;
436 push @in, $tmp2;
437 }
438 } else {
439 undef $f;
440 }
441 } elsif ($f) {
442 push @out, $f;
443 }
444 }
445 return @out;
446 } else {
447 return $tmp;
448 }
449 }
450
451 =head2 parse
452
453 Perform smart parsing of string, skipping delimiters for fields which aren't
454 defined. It can also eval code in format starting with C<eval{...}> and
455 return output or nothing depending on eval code.
456
457 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
458
459 =cut
460
461 sub parse {
462 my $self = shift;
463
464 my ($rec, $format_utf8, $i) = @_;
465
466 return if (! $format_utf8);
467
468 my $log = $self->_get_logger();
469
470 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
471 $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
472
473 $i = 0 if (! $i);
474
475 my $format = $self->{'utf2cp'}->convert($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
476
477 my @out;
478
479 my $eval_code;
480 # remove eval{...} from beginning
481 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
482
483 my $prefix;
484 my $all_found=0;
485
486 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))*//s) {
487
488 my $del = $1 || '';
489 $prefix ||= $del if ($all_found == 0);
490
491 my $found = 0;
492 my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
493
494 if ($found) {
495 push @out, $del;
496 push @out, $tmp;
497 $all_found += $found;
498 }
499 }
500
501 return if (! $all_found);
502
503 my $out = join('',@out) . $format;
504
505 # add prefix if not there
506 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
507
508 if ($eval_code) {
509 my $eval = $self->fill_in($rec,$eval_code,$i);
510 $log->debug("about to eval ",$eval," [$out]");
511 return if (! $self->_eval($eval));
512 }
513
514 return $out;
515 }
516
517 =head2 parse_to_arr
518
519 Similar to C<parse>, but returns array of all repeatable fields
520
521 my @arr = $webpac->parse_to_arr($rec,'v250^a');
522
523 =cut
524
525 sub parse_to_arr {
526 my $self = shift;
527
528 my ($rec, $format_utf8) = @_;
529
530 my $log = $self->_get_logger();
531
532 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
533 return if (! $format_utf8);
534
535 my $i = 0;
536 my @arr;
537
538 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
539 push @arr, $v;
540 }
541
542 return @arr;
543 }
544
545 =head2 data_structure
546
547 Create in-memory data structure which represents layout from C<import_xml>.
548 It is used later to produce output.
549
550 my @ds = $webpac->data_structure($rec);
551
552 =cut
553
554 sub data_structure {
555 my $self = shift;
556
557 my $log = $self->_get_logger();
558
559 my $rec = shift;
560 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
561
562 my @sorted_tags;
563 if ($self->{tags_by_order}) {
564 @sorted_tags = @{$self->{tags_by_order}};
565 } else {
566 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
567 $self->{tags_by_order} = \@sorted_tags;
568 }
569
570 my @ds;
571
572 foreach my $field (@sorted_tags) {
573
574 my $row;
575
576 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
577
578 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
579 my @v = $self->parse_to_arr($rec,$tag->{'content'});
580
581 next if (! @v);
582
583 # does tag have type?
584 if ($tag->{'type'}) {
585 push @{$row->{$tag->{'type'}}}, @v;
586 } else {
587 push @{$row->{'display'}}, @v;
588 push @{$row->{'swish'}}, @v;
589 }
590 }
591
592 if ($row) {
593 $row->{'tag'} = $field;
594 push @ds, $row;
595 }
596
597 }
598
599 return @ds;
600
601 }
602
603 =head2 output
604
605 Create output from in-memory data structure using Template Toolkit template.
606
607 my $text = $webpac->output( template => 'text.tt', data => @ds );
608
609 =cut
610
611 sub output {
612 my $self = shift;
613
614 my $args = {@_};
615
616 my $log = $self->_get_logger();
617
618 $log->logconfess("need template name") if (! $args->{'template'});
619 $log->logconfess("need data array") if (! $args->{'data'});
620
621 my $out;
622
623 $self->{'tt'}->process(
624 $args->{'template'},
625 $args,
626 \$out
627 ) || confess $self->{'tt'}->error();
628
629 return $out;
630 }
631
632 #
633 #
634 #
635
636 =head1 INTERNAL METHODS
637
638 Here is a quick list of internal methods, mostly useful to turn debugging
639 on them (see L<LOGGING> below for explanation).
640
641 =cut
642
643 =head2 _eval
644
645 Internal function to eval code without C<strict 'subs'>.
646
647 =cut
648
649 sub _eval {
650 my $self = shift;
651
652 my $code = shift || return;
653
654 my $log = $self->_get_logger();
655
656 no strict 'subs';
657 my $ret = eval $code;
658 if ($@) {
659 $log->error("problem with eval code [$code]: $@");
660 }
661
662 $log->debug("eval: ",$code," [",$ret,"]");
663
664 return $ret || 0;
665 }
666
667 =head2 _sort_by_order
668
669 Sort xml tags data structure accoding to C<order=""> attribute.
670
671 =cut
672
673 sub _sort_by_order {
674 my $self = shift;
675
676 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
677 $self->{'import_xml'}->{'indexer'}->{$a};
678 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
679 $self->{'import_xml'}->{'indexer'}->{$b};
680
681 return $va <=> $vb;
682 }
683
684 sub _get_logger {
685 my $self = shift;
686
687 my @c = caller(1);
688 return get_logger($c[3]);
689 }
690
691 #
692 #
693 #
694
695 =head1 LOGGING
696
697 Logging in WebPAC is performed by L<Log::Log4perl> with config file
698 C<log.conf>.
699
700 Methods defined above have different levels of logging, so
701 it's descriptions will be useful to turn (mostry B<debug> logging) on
702 or off to see why WabPAC isn't perforing as you expect it (it might even
703 be a bug!).
704
705 B<This is different from normal Log4perl behaviour>. To repeat, you can
706 also use method names, and not only classes (which are just few)
707 to filter logging.
708
709 =cut
710
711 1;

  ViewVC Help
Powered by ViewVC 1.1.26