/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 368 - (hide annotations)
Thu Jun 17 12:27:02 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 12281 byte(s)
data_structure now returns array to preserve order of tags

1 dpavlin 354 package WebPAC;
2 dpavlin 352
3 dpavlin 367 use warnings;
4     use strict;
5    
6 dpavlin 352 use Carp;
7 dpavlin 353 use Text::Iconv;
8     use Config::IniFiles;
9 dpavlin 363 use XML::Simple;
10 dpavlin 352
11 dpavlin 358 use Data::Dumper;
12    
13 dpavlin 352 =head1 NAME
14    
15 dpavlin 354 WebPAC - base class for WebPAC
16 dpavlin 352
17     =head1 DESCRIPTION
18    
19 dpavlin 354 This module implements methods used by WebPAC.
20 dpavlin 352
21     =head1 METHODS
22    
23     =head2 new
24    
25 dpavlin 354 This will create new instance of WebPAC using configuration specified by C<config_file>.
26 dpavlin 352
27 dpavlin 354 my $webpac = new WebPAC(
28 dpavlin 352 config_file => 'name.conf',
29     [code_page => 'ISO-8859-2',]
30     );
31    
32     Default C<code_page> is C<ISO-8859-2>.
33    
34 dpavlin 353 It will also read configuration files
35     C<global.conf> (used by indexer and Web font-end)
36     and configuration file specified by C<config_file>
37     which describes databases to be indexed.
38    
39 dpavlin 352 =cut
40    
41 dpavlin 363 # mapping between data type and tag which specify
42     # format in XML file
43     my %type2tag = (
44     'isis' => 'isis',
45     # 'excel' => 'column',
46     # 'marc' => 'marc',
47     # 'feed' => 'feed'
48     );
49    
50 dpavlin 352 sub new {
51     my $class = shift;
52     my $self = {@_};
53     bless($self, $class);
54    
55     # fill in default values
56     # output codepage
57     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
58    
59 dpavlin 353 #
60     # read global.conf
61     #
62 dpavlin 352
63     $self->{global_config_file} = new Config::IniFiles( -file => 'global.conf' ) || croak "can't open 'global.conf'";
64    
65     # read global config parametars
66     foreach my $var (qw(
67     dbi_dbd
68     dbi_dsn
69     dbi_user
70     dbi_passwd
71     show_progress
72     my_unac_filter
73     )) {
74     $self->{global_config}->{$var} = $self->{global_config_file}->val('global', $var);
75     }
76    
77 dpavlin 353 #
78     # read indexer config file
79     #
80 dpavlin 352
81     $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || croak "can't open '$self->{config_file}'";
82    
83 dpavlin 366 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
84 dpavlin 352 return $self;
85     }
86    
87     =head2 open_isis
88    
89     Open CDS/ISIS database using OpenIsis module and read all records to memory.
90    
91     $webpac->open_isis(
92     filename => '/data/ISIS/ISIS',
93     code_page => '852',
94     limit_mfn => '500',
95     lookup => [ ... ],
96     );
97    
98     By default, ISIS code page is assumed to be C<852>.
99    
100 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
101     from database in example above.
102 dpavlin 352
103     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
104     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
105     value in index.
106    
107     lookup => [
108     { 'key' => 'd:v900', 'val' => 'v250^a' },
109     { 'eval' => '"v901^a" eq "Podruèje"',
110     'key' => 'pa:v561^4:v562^4:v461^1',
111     'val' => 'v900' },
112     ]
113    
114 dpavlin 367 Returns number of last record read into memory (size of database, really).
115    
116 dpavlin 352 =cut
117    
118     sub open_isis {
119     my $self = shift;
120     my $arg = {@_};
121    
122     croak "need filename" if (! $arg->{'filename'});
123     my $code_page = $arg->{'code_page'} || '852';
124    
125 dpavlin 353 use OpenIsis;
126    
127 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
128    
129     # create Text::Iconv object
130     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
131    
132 dpavlin 363 print STDERR "reading ISIS database '",$arg->{'filename'},"'\n" if ($self->{'debug'});
133    
134 dpavlin 352 my $isis_db = OpenIsis::open($arg->{'filename'});
135    
136     my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
137    
138 dpavlin 357 $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});
139    
140 dpavlin 363 print STDERR "processing $maxmfn records...\n" if ($self->{'debug'});
141    
142 dpavlin 352 # read database
143     for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
144    
145     # read record
146     my $row = OpenIsis::read( $isis_db, $mfn );
147     foreach my $k (keys %{$row}) {
148     if ($k ne "mfn") {
149     foreach my $l (@{$row->{$k}}) {
150     $l = $cp->convert($l);
151     # has subfields?
152     my $val;
153     if ($l =~ m/\^/) {
154     foreach my $t (split(/\^/,$l)) {
155     next if (! $t);
156     $val->{substr($t,0,1)} = substr($t,1);
157     }
158     } else {
159     $val = $l;
160     }
161    
162     push @{$self->{'data'}->{$mfn}->{$k}}, $val;
163     }
164     }
165    
166     }
167    
168     # create lookup
169 dpavlin 355 my $rec = $self->{'data'}->{$mfn};
170     $self->create_lookup($rec, @{$arg->{'lookup'}});
171 dpavlin 352
172     }
173    
174 dpavlin 362 $self->{'current_mfn'} = 1;
175    
176 dpavlin 352 # store max mfn and return it.
177     return $self->{'max_mfn'} = $maxmfn;
178     }
179    
180 dpavlin 362 =head2 fetch_rec
181    
182     Fetch next record from database. It will also display progress bar (once
183     it's implemented, that is).
184    
185     my $rec = $webpac->fetch_rec;
186    
187     =cut
188    
189     sub fetch_rec {
190     my $self = shift;
191    
192     my $mfn = $self->{'current_mfn'}++ || confess "it seems that you didn't load database!";
193    
194     if ($mfn > $self->{'max_mfn'}) {
195     $self->{'current_mfn'} = $self->{'max_mfn'};
196     return;
197     }
198    
199     return $self->{'data'}->{$mfn};
200     }
201    
202 dpavlin 363 =head2 open_import_xml
203    
204     Read file from C<import_xml/> directory and parse it.
205    
206     $webpac->open_import_xml(type => 'isis');
207    
208     =cut
209    
210     sub open_import_xml {
211     my $self = shift;
212    
213     my $arg = {@_};
214     confess "need type to load file from import_xml/" if (! $arg->{'type'});
215    
216 dpavlin 366 $self->{'type'} = $arg->{'type'};
217 dpavlin 363
218 dpavlin 366 my $type_base = $arg->{'type'};
219 dpavlin 363 $type_base =~ s/_.*$//g;
220    
221 dpavlin 366 $self->{'tag'} = $type2tag{$type_base};
222    
223     print STDERR "using type ",$self->{'type'}," tag ",$self->{'tag'},"\n" if ($self->{'debug'});
224    
225     my $f = "./import_xml/".$self->{'type'}.".xml";
226 dpavlin 363 confess "import_xml file '$f' doesn't exist!" if (! -e "$f");
227    
228     print STDERR "reading '$f'\n" if ($self->{'debug'});
229    
230     $self->{'import_xml'} = XMLin($f,
231 dpavlin 366 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
232 dpavlin 363 ForceContent => 1
233     );
234    
235     print Dumper($self->{'import_xml'});
236    
237     }
238    
239 dpavlin 355 =head2 create_lookup
240    
241     Create lookup from record using lookup definition.
242    
243 dpavlin 367 $self->create_lookup($rec, @lookups);
244    
245     Called internally by C<open_*> methods.
246    
247 dpavlin 355 =cut
248    
249     sub create_lookup {
250     my $self = shift;
251    
252     my $rec = shift || confess "need record to create lookup";
253     confess("need HASH as first argument!") if ($rec !~ /HASH/o);
254    
255     foreach my $i (@_) {
256     if ($i->{'eval'}) {
257     my $eval = $self->fill_in($rec,$i->{'eval'});
258     my $key = $self->fill_in($rec,$i->{'key'});
259     my @val = $self->fill_in($rec,$i->{'val'});
260     if ($key && @val && eval $eval) {
261     push @{$self->{'lookup'}->{$key}}, @val;
262     }
263     } else {
264     my $key = $self->fill_in($rec,$i->{'key'});
265     my @val = $self->fill_in($rec,$i->{'val'});
266     if ($key && @val) {
267     push @{$self->{'lookup'}->{$key}}, @val;
268     }
269     }
270     }
271     }
272    
273 dpavlin 356 =head2 get_data
274    
275     Returns value from record.
276    
277 dpavlin 367 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
278 dpavlin 356
279     Arguments are:
280     record reference C<$rec>,
281     field C<$f>,
282     optional subfiled C<$sf>,
283     index for repeatable values C<$i>.
284    
285 dpavlin 367 Optinal variable C<$found> will be incremeted if there
286 dpavlin 356 is field.
287    
288     Returns value or empty string.
289    
290     =cut
291    
292     sub get_data {
293     my $self = shift;
294    
295     my ($rec,$f,$sf,$i,$found) = @_;
296 dpavlin 367
297 dpavlin 356 if ($$rec->{$f}) {
298 dpavlin 367 return '' if (! $$rec->{$f}->[$i]);
299 dpavlin 356 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
300     $$found++ if (defined($$found));
301     return $$rec->{$f}->[$i]->{$sf};
302     } elsif ($$rec->{$f}->[$i]) {
303     $$found++ if (defined($$found));
304 dpavlin 366 # it still might have subfield, just
305     # not specified, so we'll dump all
306     if ($$rec->{$f}->[$i] =~ /HASH/o) {
307     my $out;
308     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
309     $out .= $$rec->{$f}->[$i]->{$k}." ";
310     }
311     return $out;
312     } else {
313     return $$rec->{$f}->[$i];
314     }
315 dpavlin 356 }
316     } else {
317     return '';
318     }
319     }
320    
321 dpavlin 352 =head2 fill_in
322    
323     Workhourse of all: takes record from in-memory structure of database and
324     strings with placeholders and returns string or array of with substituted
325     values from record.
326    
327 dpavlin 367 my $text = $webpac->fill_in($rec,'v250^a');
328 dpavlin 352
329     Optional argument is ordinal number for repeatable fields. By default,
330 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
331     element is 0).
332     Following example will read second value from repeatable field.
333 dpavlin 352
334 dpavlin 367 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
335 dpavlin 353
336     This function B<does not> perform parsing of format to inteligenty skip
337     delimiters before fields which aren't used.
338    
339 dpavlin 352 =cut
340    
341     sub fill_in {
342     my $self = shift;
343    
344     my $rec = shift || confess "need data record";
345     my $format = shift || confess "need format to parse";
346     # iteration (for repeatable fields)
347     my $i = shift || 0;
348    
349     # FIXME remove for speedup?
350 dpavlin 355 confess("need HASH as first argument!") if ($rec !~ /HASH/o);
351 dpavlin 352
352     my $found = 0;
353    
354 dpavlin 359 my $eval_code;
355     # remove eval{...} from beginning
356     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
357    
358 dpavlin 352 # do actual replacement of placeholders
359 dpavlin 356 $format =~ s/v(\d+)(?:\^(\w))*/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
360 dpavlin 352
361 dpavlin 353 if ($found) {
362 dpavlin 359 if ($eval_code) {
363     my $eval = $self->fill_in($rec,$eval_code,$i);
364     return if (! eval $eval);
365     }
366 dpavlin 353 # do we have lookups?
367     if ($format =~ /\[[^\[\]]+\]/o) {
368     return $self->lookup($format);
369     } else {
370     return $format;
371     }
372 dpavlin 352 } else {
373     return;
374     }
375     }
376    
377     =head2 lookup
378    
379 dpavlin 355 Perform lookups on format supplied to it.
380 dpavlin 352
381 dpavlin 367 my $text = $self->lookup('[v900]');
382 dpavlin 352
383 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
384    
385 dpavlin 352 =cut
386    
387     sub lookup {
388     my $self = shift;
389    
390     my $tmp = shift || confess "need format";
391    
392 dpavlin 353 if ($tmp =~ /\[[^\[\]]+\]/o) {
393 dpavlin 352 my @in = ( $tmp );
394     my @out;
395     while (my $f = shift @in) {
396     if ($f =~ /\[([^\[\]]+)\]/) {
397     my $k = $1;
398     if ($self->{'lookup'}->{$k}) {
399     foreach my $nv (@{$self->{'lookup'}->{$k}}) {
400     my $tmp2 = $f;
401     $tmp2 =~ s/\[$k\]/$nv/g;
402     push @in, $tmp2;
403     }
404     } else {
405     undef $f;
406     }
407     } elsif ($f) {
408     push @out, $f;
409     }
410     }
411     return @out;
412     } else {
413     return $tmp;
414     }
415     }
416    
417 dpavlin 356 =head2 parse
418    
419     Perform smart parsing of string, skipping delimiters for fields which aren't
420     defined. It can also eval code in format starting with C<eval{...}> and
421     return output or nothing depending on eval code.
422    
423 dpavlin 367 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
424 dpavlin 356
425     =cut
426    
427     sub parse {
428     my $self = shift;
429    
430 dpavlin 366 my ($rec, $format_utf8, $i) = @_;
431 dpavlin 356
432 dpavlin 366 return if (! $format_utf8);
433    
434 dpavlin 358 confess("need HASH as first argument!") if ($rec !~ /HASH/o);
435 dpavlin 366 confess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
436 dpavlin 358
437     $i = 0 if (! $i);
438    
439 dpavlin 366 my $format = $self->{'utf2cp'}->convert($format_utf8) || confess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
440    
441 dpavlin 356 my @out;
442    
443     my $eval_code;
444     # remove eval{...} from beginning
445     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
446    
447 dpavlin 358 my $prefix;
448     my $all_found=0;
449 dpavlin 356
450 dpavlin 358 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))*//s) {
451 dpavlin 356
452 dpavlin 358 my $del = $1 || '';
453 dpavlin 359 $prefix ||= $del if ($all_found == 0);
454 dpavlin 358
455     my $found = 0;
456     my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
457    
458 dpavlin 356 if ($found) {
459 dpavlin 358 push @out, $del;
460     push @out, $tmp;
461     $all_found += $found;
462 dpavlin 356 }
463     }
464    
465 dpavlin 358 return if (! $all_found);
466 dpavlin 356
467 dpavlin 358 my $out = join('',@out) . $format;
468    
469     # add prefix if not there
470     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
471 dpavlin 367
472 dpavlin 359 if ($eval_code) {
473     my $eval = $self->fill_in($rec,$eval_code,$i);
474     return if (! eval $eval);
475     }
476    
477 dpavlin 358 return $out;
478 dpavlin 356 }
479    
480 dpavlin 367 =head2 parse_to_arr
481    
482     Similar to C<parse>, but returns array of all repeatable fields
483    
484     my @arr = $webpac->parse_to_arr($rec,'v250^a');
485    
486     =cut
487    
488     sub parse_to_arr {
489     my $self = shift;
490    
491     my ($rec, $format_utf8) = @_;
492    
493     confess("need HASH as first argument!") if ($rec !~ /HASH/o);
494     return if (! $format_utf8);
495    
496     my $i = 0;
497     my @arr;
498    
499     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
500     push @arr, $v;
501     }
502    
503     return @arr;
504     }
505    
506 dpavlin 366 =head2 data_structure
507    
508     Create in-memory data structure which represents layout from C<import_xml>.
509     It is used later to produce output.
510    
511 dpavlin 368 my @ds = $webpac->data_structure($rec);
512 dpavlin 366
513     =cut
514    
515     # private method _sort_by_order
516     # sort subrouting using order="" attribute
517     sub _sort_by_order {
518     my $self = shift;
519    
520     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
521     $self->{'import_xml'}->{'indexer'}->{$a};
522     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
523     $self->{'import_xml'}->{'indexer'}->{$b};
524    
525     return $va <=> $vb;
526     }
527    
528     sub data_structure {
529     my $self = shift;
530    
531     my $rec = shift;
532     confess("need HASH as first argument!") if ($rec !~ /HASH/o);
533    
534     my @sorted_tags;
535     if ($self->{tags_by_order}) {
536     @sorted_tags = @{$self->{tags_by_order}};
537     } else {
538     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
539     $self->{tags_by_order} = \@sorted_tags;
540     }
541    
542 dpavlin 368 my @ds;
543 dpavlin 366
544     foreach my $field (@sorted_tags) {
545    
546     my $row;
547    
548     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
549    
550     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
551 dpavlin 367 my @v = $self->parse_to_arr($rec,$tag->{'content'});
552 dpavlin 366
553 dpavlin 367 next if (! @v);
554 dpavlin 366
555     # does tag have type?
556     if ($tag->{'type'}) {
557 dpavlin 367 push @{$row->{$tag->{'type'}}}, @v;
558 dpavlin 366 } else {
559 dpavlin 367 push @{$row->{'display'}}, @v;
560     push @{$row->{'swish'}}, @v;
561 dpavlin 366 }
562     }
563    
564 dpavlin 368 if ($row) {
565     $row->{'tag'} = $field;
566     push @ds, $row;
567     }
568 dpavlin 366
569     }
570    
571 dpavlin 368 print "data_structure => ",Dumper(\@ds);
572 dpavlin 366
573     }
574    
575 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26