/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 371 - (hide annotations)
Thu Jun 17 20:44:45 2004 UTC (15 years, 7 months ago) by dpavlin
File size: 13265 byte(s)
use local (more relaxed) eval, report errors in eval

1 dpavlin 354 package WebPAC;
2 dpavlin 352
3 dpavlin 367 use warnings;
4     use strict;
5    
6 dpavlin 352 use Carp;
7 dpavlin 353 use Text::Iconv;
8     use Config::IniFiles;
9 dpavlin 363 use XML::Simple;
10 dpavlin 370 use Template;
11 dpavlin 352
12 dpavlin 358 use Data::Dumper;
13    
14 dpavlin 352 =head1 NAME
15    
16 dpavlin 354 WebPAC - base class for WebPAC
17 dpavlin 352
18     =head1 DESCRIPTION
19    
20 dpavlin 354 This module implements methods used by WebPAC.
21 dpavlin 352
22     =head1 METHODS
23    
24     =head2 new
25    
26 dpavlin 354 This will create new instance of WebPAC using configuration specified by C<config_file>.
27 dpavlin 352
28 dpavlin 354 my $webpac = new WebPAC(
29 dpavlin 352 config_file => 'name.conf',
30     [code_page => 'ISO-8859-2',]
31     );
32    
33     Default C<code_page> is C<ISO-8859-2>.
34    
35 dpavlin 353 It will also read configuration files
36     C<global.conf> (used by indexer and Web font-end)
37     and configuration file specified by C<config_file>
38     which describes databases to be indexed.
39    
40 dpavlin 352 =cut
41    
42 dpavlin 363 # mapping between data type and tag which specify
43     # format in XML file
44     my %type2tag = (
45     'isis' => 'isis',
46     # 'excel' => 'column',
47     # 'marc' => 'marc',
48     # 'feed' => 'feed'
49     );
50    
51 dpavlin 352 sub new {
52     my $class = shift;
53     my $self = {@_};
54     bless($self, $class);
55    
56     # fill in default values
57     # output codepage
58     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
59    
60 dpavlin 353 #
61     # read global.conf
62     #
63 dpavlin 352
64 dpavlin 370 my $config = new Config::IniFiles( -file => 'global.conf' ) || croak "can't open 'global.conf'";
65 dpavlin 352
66     # read global config parametars
67     foreach my $var (qw(
68     dbi_dbd
69     dbi_dsn
70     dbi_user
71     dbi_passwd
72     show_progress
73     my_unac_filter
74 dpavlin 370 output_template
75 dpavlin 352 )) {
76 dpavlin 370 $self->{'global_config'}->{$var} = $config->val('global', $var);
77 dpavlin 352 }
78    
79 dpavlin 353 #
80     # read indexer config file
81     #
82 dpavlin 352
83     $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || croak "can't open '$self->{config_file}'";
84    
85 dpavlin 370 # create UTF-8 convertor for import_xml files
86 dpavlin 366 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
87 dpavlin 370
88     # create Template toolkit instance
89     $self->{'tt'} = Template->new(
90     INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
91     # FILTERS => {
92     # 'foo' => \&foo_filter,
93     # },
94     EVAL_PERL => 1,
95     );
96    
97 dpavlin 352 return $self;
98     }
99    
100     =head2 open_isis
101    
102     Open CDS/ISIS database using OpenIsis module and read all records to memory.
103    
104     $webpac->open_isis(
105     filename => '/data/ISIS/ISIS',
106     code_page => '852',
107     limit_mfn => '500',
108     lookup => [ ... ],
109     );
110    
111     By default, ISIS code page is assumed to be C<852>.
112    
113 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
114     from database in example above.
115 dpavlin 352
116     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
117     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
118     value in index.
119    
120     lookup => [
121     { 'key' => 'd:v900', 'val' => 'v250^a' },
122     { 'eval' => '"v901^a" eq "Podruèje"',
123     'key' => 'pa:v561^4:v562^4:v461^1',
124     'val' => 'v900' },
125     ]
126    
127 dpavlin 367 Returns number of last record read into memory (size of database, really).
128    
129 dpavlin 352 =cut
130    
131     sub open_isis {
132     my $self = shift;
133     my $arg = {@_};
134    
135     croak "need filename" if (! $arg->{'filename'});
136     my $code_page = $arg->{'code_page'} || '852';
137    
138 dpavlin 353 use OpenIsis;
139    
140 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
141    
142     # create Text::Iconv object
143     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
144    
145 dpavlin 363 print STDERR "reading ISIS database '",$arg->{'filename'},"'\n" if ($self->{'debug'});
146    
147 dpavlin 352 my $isis_db = OpenIsis::open($arg->{'filename'});
148    
149     my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
150    
151 dpavlin 357 $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});
152    
153 dpavlin 363 print STDERR "processing $maxmfn records...\n" if ($self->{'debug'});
154    
155 dpavlin 352 # read database
156     for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
157    
158     # read record
159     my $row = OpenIsis::read( $isis_db, $mfn );
160     foreach my $k (keys %{$row}) {
161     if ($k ne "mfn") {
162     foreach my $l (@{$row->{$k}}) {
163     $l = $cp->convert($l);
164     # has subfields?
165     my $val;
166     if ($l =~ m/\^/) {
167     foreach my $t (split(/\^/,$l)) {
168     next if (! $t);
169     $val->{substr($t,0,1)} = substr($t,1);
170     }
171     } else {
172     $val = $l;
173     }
174    
175     push @{$self->{'data'}->{$mfn}->{$k}}, $val;
176     }
177     }
178    
179     }
180    
181     # create lookup
182 dpavlin 355 my $rec = $self->{'data'}->{$mfn};
183     $self->create_lookup($rec, @{$arg->{'lookup'}});
184 dpavlin 352
185     }
186    
187 dpavlin 362 $self->{'current_mfn'} = 1;
188    
189 dpavlin 352 # store max mfn and return it.
190     return $self->{'max_mfn'} = $maxmfn;
191     }
192    
193 dpavlin 362 =head2 fetch_rec
194    
195     Fetch next record from database. It will also display progress bar (once
196     it's implemented, that is).
197    
198     my $rec = $webpac->fetch_rec;
199    
200     =cut
201    
202     sub fetch_rec {
203     my $self = shift;
204    
205     my $mfn = $self->{'current_mfn'}++ || confess "it seems that you didn't load database!";
206    
207     if ($mfn > $self->{'max_mfn'}) {
208     $self->{'current_mfn'} = $self->{'max_mfn'};
209     return;
210     }
211    
212     return $self->{'data'}->{$mfn};
213     }
214    
215 dpavlin 363 =head2 open_import_xml
216    
217     Read file from C<import_xml/> directory and parse it.
218    
219     $webpac->open_import_xml(type => 'isis');
220    
221     =cut
222    
223     sub open_import_xml {
224     my $self = shift;
225    
226     my $arg = {@_};
227     confess "need type to load file from import_xml/" if (! $arg->{'type'});
228    
229 dpavlin 366 $self->{'type'} = $arg->{'type'};
230 dpavlin 363
231 dpavlin 366 my $type_base = $arg->{'type'};
232 dpavlin 363 $type_base =~ s/_.*$//g;
233    
234 dpavlin 366 $self->{'tag'} = $type2tag{$type_base};
235    
236 dpavlin 371 print STDERR "using type '",$self->{'type'},"' tag <",$self->{'tag'},">\n" if ($self->{'debug'});
237 dpavlin 366
238     my $f = "./import_xml/".$self->{'type'}.".xml";
239 dpavlin 363 confess "import_xml file '$f' doesn't exist!" if (! -e "$f");
240    
241     print STDERR "reading '$f'\n" if ($self->{'debug'});
242    
243     $self->{'import_xml'} = XMLin($f,
244 dpavlin 366 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
245 dpavlin 363 ForceContent => 1
246     );
247    
248     }
249    
250 dpavlin 355 =head2 create_lookup
251    
252     Create lookup from record using lookup definition.
253    
254 dpavlin 367 $self->create_lookup($rec, @lookups);
255    
256     Called internally by C<open_*> methods.
257    
258 dpavlin 355 =cut
259    
260     sub create_lookup {
261     my $self = shift;
262    
263     my $rec = shift || confess "need record to create lookup";
264     confess("need HASH as first argument!") if ($rec !~ /HASH/o);
265    
266     foreach my $i (@_) {
267     if ($i->{'eval'}) {
268     my $eval = $self->fill_in($rec,$i->{'eval'});
269     my $key = $self->fill_in($rec,$i->{'key'});
270     my @val = $self->fill_in($rec,$i->{'val'});
271     if ($key && @val && eval $eval) {
272     push @{$self->{'lookup'}->{$key}}, @val;
273     }
274     } else {
275     my $key = $self->fill_in($rec,$i->{'key'});
276     my @val = $self->fill_in($rec,$i->{'val'});
277     if ($key && @val) {
278     push @{$self->{'lookup'}->{$key}}, @val;
279     }
280     }
281     }
282     }
283    
284 dpavlin 356 =head2 get_data
285    
286     Returns value from record.
287    
288 dpavlin 367 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
289 dpavlin 356
290     Arguments are:
291     record reference C<$rec>,
292     field C<$f>,
293     optional subfiled C<$sf>,
294     index for repeatable values C<$i>.
295    
296 dpavlin 367 Optinal variable C<$found> will be incremeted if there
297 dpavlin 356 is field.
298    
299     Returns value or empty string.
300    
301     =cut
302    
303     sub get_data {
304     my $self = shift;
305    
306     my ($rec,$f,$sf,$i,$found) = @_;
307 dpavlin 367
308 dpavlin 356 if ($$rec->{$f}) {
309 dpavlin 367 return '' if (! $$rec->{$f}->[$i]);
310 dpavlin 356 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
311     $$found++ if (defined($$found));
312     return $$rec->{$f}->[$i]->{$sf};
313     } elsif ($$rec->{$f}->[$i]) {
314     $$found++ if (defined($$found));
315 dpavlin 366 # it still might have subfield, just
316     # not specified, so we'll dump all
317     if ($$rec->{$f}->[$i] =~ /HASH/o) {
318     my $out;
319     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
320     $out .= $$rec->{$f}->[$i]->{$k}." ";
321     }
322     return $out;
323     } else {
324     return $$rec->{$f}->[$i];
325     }
326 dpavlin 356 }
327     } else {
328     return '';
329     }
330     }
331    
332 dpavlin 352 =head2 fill_in
333    
334     Workhourse of all: takes record from in-memory structure of database and
335     strings with placeholders and returns string or array of with substituted
336     values from record.
337    
338 dpavlin 367 my $text = $webpac->fill_in($rec,'v250^a');
339 dpavlin 352
340     Optional argument is ordinal number for repeatable fields. By default,
341 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
342     element is 0).
343     Following example will read second value from repeatable field.
344 dpavlin 352
345 dpavlin 367 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
346 dpavlin 353
347     This function B<does not> perform parsing of format to inteligenty skip
348     delimiters before fields which aren't used.
349    
350 dpavlin 352 =cut
351    
352 dpavlin 371 # internal function to eval code
353     sub _eval {
354     my $self = shift;
355    
356     my $code = shift || return;
357     no strict 'subs';
358     my $ret = eval $code;
359     if ($@) {
360     print STDERR "problem with eval code [$code]: $@\n";
361     }
362     return $ret;
363     }
364    
365 dpavlin 352 sub fill_in {
366     my $self = shift;
367    
368     my $rec = shift || confess "need data record";
369     my $format = shift || confess "need format to parse";
370     # iteration (for repeatable fields)
371     my $i = shift || 0;
372    
373     # FIXME remove for speedup?
374 dpavlin 355 confess("need HASH as first argument!") if ($rec !~ /HASH/o);
375 dpavlin 352
376     my $found = 0;
377    
378 dpavlin 359 my $eval_code;
379     # remove eval{...} from beginning
380     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
381    
382 dpavlin 352 # do actual replacement of placeholders
383 dpavlin 356 $format =~ s/v(\d+)(?:\^(\w))*/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
384 dpavlin 352
385 dpavlin 353 if ($found) {
386 dpavlin 359 if ($eval_code) {
387     my $eval = $self->fill_in($rec,$eval_code,$i);
388 dpavlin 371 return if (! $self->_eval($eval));
389 dpavlin 359 }
390 dpavlin 353 # do we have lookups?
391     if ($format =~ /\[[^\[\]]+\]/o) {
392 dpavlin 371 print "## probable lookup: $format\n";
393 dpavlin 353 return $self->lookup($format);
394     } else {
395     return $format;
396     }
397 dpavlin 352 } else {
398     return;
399     }
400     }
401    
402     =head2 lookup
403    
404 dpavlin 355 Perform lookups on format supplied to it.
405 dpavlin 352
406 dpavlin 367 my $text = $self->lookup('[v900]');
407 dpavlin 352
408 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
409    
410 dpavlin 352 =cut
411    
412     sub lookup {
413     my $self = shift;
414    
415     my $tmp = shift || confess "need format";
416    
417 dpavlin 353 if ($tmp =~ /\[[^\[\]]+\]/o) {
418 dpavlin 352 my @in = ( $tmp );
419 dpavlin 371 print "## lookup $tmp\n";
420 dpavlin 352 my @out;
421     while (my $f = shift @in) {
422     if ($f =~ /\[([^\[\]]+)\]/) {
423     my $k = $1;
424     if ($self->{'lookup'}->{$k}) {
425     foreach my $nv (@{$self->{'lookup'}->{$k}}) {
426     my $tmp2 = $f;
427     $tmp2 =~ s/\[$k\]/$nv/g;
428     push @in, $tmp2;
429     }
430     } else {
431     undef $f;
432     }
433     } elsif ($f) {
434     push @out, $f;
435     }
436     }
437     return @out;
438     } else {
439     return $tmp;
440     }
441     }
442    
443 dpavlin 356 =head2 parse
444    
445     Perform smart parsing of string, skipping delimiters for fields which aren't
446     defined. It can also eval code in format starting with C<eval{...}> and
447     return output or nothing depending on eval code.
448    
449 dpavlin 367 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
450 dpavlin 356
451     =cut
452    
453     sub parse {
454     my $self = shift;
455    
456 dpavlin 366 my ($rec, $format_utf8, $i) = @_;
457 dpavlin 356
458 dpavlin 366 return if (! $format_utf8);
459    
460 dpavlin 358 confess("need HASH as first argument!") if ($rec !~ /HASH/o);
461 dpavlin 366 confess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
462 dpavlin 358
463     $i = 0 if (! $i);
464    
465 dpavlin 366 my $format = $self->{'utf2cp'}->convert($format_utf8) || confess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
466    
467 dpavlin 356 my @out;
468    
469     my $eval_code;
470     # remove eval{...} from beginning
471     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
472    
473 dpavlin 358 my $prefix;
474     my $all_found=0;
475 dpavlin 356
476 dpavlin 358 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))*//s) {
477 dpavlin 356
478 dpavlin 358 my $del = $1 || '';
479 dpavlin 359 $prefix ||= $del if ($all_found == 0);
480 dpavlin 358
481     my $found = 0;
482     my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
483    
484 dpavlin 356 if ($found) {
485 dpavlin 358 push @out, $del;
486     push @out, $tmp;
487     $all_found += $found;
488 dpavlin 356 }
489     }
490    
491 dpavlin 358 return if (! $all_found);
492 dpavlin 356
493 dpavlin 358 my $out = join('',@out) . $format;
494    
495     # add prefix if not there
496     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
497 dpavlin 367
498 dpavlin 359 if ($eval_code) {
499     my $eval = $self->fill_in($rec,$eval_code,$i);
500 dpavlin 371 return if (! $self->_eval($eval));
501 dpavlin 359 }
502    
503 dpavlin 358 return $out;
504 dpavlin 356 }
505    
506 dpavlin 367 =head2 parse_to_arr
507    
508     Similar to C<parse>, but returns array of all repeatable fields
509    
510     my @arr = $webpac->parse_to_arr($rec,'v250^a');
511    
512     =cut
513    
514     sub parse_to_arr {
515     my $self = shift;
516    
517     my ($rec, $format_utf8) = @_;
518    
519     confess("need HASH as first argument!") if ($rec !~ /HASH/o);
520     return if (! $format_utf8);
521    
522     my $i = 0;
523     my @arr;
524    
525     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
526     push @arr, $v;
527     }
528    
529     return @arr;
530     }
531    
532 dpavlin 366 =head2 data_structure
533    
534     Create in-memory data structure which represents layout from C<import_xml>.
535     It is used later to produce output.
536    
537 dpavlin 368 my @ds = $webpac->data_structure($rec);
538 dpavlin 366
539     =cut
540    
541     # private method _sort_by_order
542     # sort subrouting using order="" attribute
543     sub _sort_by_order {
544     my $self = shift;
545    
546     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
547     $self->{'import_xml'}->{'indexer'}->{$a};
548     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
549     $self->{'import_xml'}->{'indexer'}->{$b};
550    
551     return $va <=> $vb;
552     }
553    
554     sub data_structure {
555     my $self = shift;
556    
557     my $rec = shift;
558     confess("need HASH as first argument!") if ($rec !~ /HASH/o);
559    
560     my @sorted_tags;
561     if ($self->{tags_by_order}) {
562     @sorted_tags = @{$self->{tags_by_order}};
563     } else {
564     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
565     $self->{tags_by_order} = \@sorted_tags;
566     }
567    
568 dpavlin 368 my @ds;
569 dpavlin 366
570     foreach my $field (@sorted_tags) {
571    
572     my $row;
573    
574     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
575    
576     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
577 dpavlin 367 my @v = $self->parse_to_arr($rec,$tag->{'content'});
578 dpavlin 366
579 dpavlin 367 next if (! @v);
580 dpavlin 366
581     # does tag have type?
582     if ($tag->{'type'}) {
583 dpavlin 367 push @{$row->{$tag->{'type'}}}, @v;
584 dpavlin 366 } else {
585 dpavlin 367 push @{$row->{'display'}}, @v;
586     push @{$row->{'swish'}}, @v;
587 dpavlin 366 }
588     }
589    
590 dpavlin 368 if ($row) {
591     $row->{'tag'} = $field;
592     push @ds, $row;
593     }
594 dpavlin 366
595     }
596    
597 dpavlin 370 return @ds;
598 dpavlin 366
599     }
600    
601 dpavlin 370 =head2 output
602    
603     Create output from in-memory data structure using Template Toolkit template.
604    
605     my $text = $webpac->output( template => 'text.tt', data => @ds );
606    
607     =cut
608    
609     sub output {
610     my $self = shift;
611    
612     my $args = {@_};
613    
614     confess("need template name") if (! $args->{'template'});
615     confess("need data array") if (! $args->{'data'});
616    
617     my $out;
618    
619     $self->{'tt'}->process(
620     $args->{'template'},
621     $args,
622     \$out
623     ) || confess $self->{'tt'}->error();
624    
625     return $out;
626     }
627    
628 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26