/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 366 - (hide annotations)
Thu Jun 17 01:44:25 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 11894 byte(s)
make in-memory data_structure

1 dpavlin 354 package WebPAC;
2 dpavlin 352
3     use Carp;
4 dpavlin 353 use Text::Iconv;
5     use Config::IniFiles;
6 dpavlin 363 use XML::Simple;
7 dpavlin 352
8 dpavlin 358 use Data::Dumper;
9    
10 dpavlin 352 =head1 NAME
11    
12 dpavlin 354 WebPAC - base class for WebPAC
13 dpavlin 352
14     =head1 DESCRIPTION
15    
16 dpavlin 354 This module implements methods used by WebPAC.
17 dpavlin 352
18     =head1 METHODS
19    
20     =head2 new
21    
22 dpavlin 354 This will create new instance of WebPAC using configuration specified by C<config_file>.
23 dpavlin 352
24 dpavlin 354 my $webpac = new WebPAC(
25 dpavlin 352 config_file => 'name.conf',
26     [code_page => 'ISO-8859-2',]
27     );
28    
29     Default C<code_page> is C<ISO-8859-2>.
30    
31 dpavlin 353 It will also read configuration files
32     C<global.conf> (used by indexer and Web font-end)
33     and configuration file specified by C<config_file>
34     which describes databases to be indexed.
35    
36 dpavlin 352 =cut
37    
38 dpavlin 363 # mapping between data type and tag which specify
39     # format in XML file
40     my %type2tag = (
41     'isis' => 'isis',
42     # 'excel' => 'column',
43     # 'marc' => 'marc',
44     # 'feed' => 'feed'
45     );
46    
47 dpavlin 352 sub new {
48     my $class = shift;
49     my $self = {@_};
50     bless($self, $class);
51    
52     # fill in default values
53     # output codepage
54     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
55    
56 dpavlin 353 #
57     # read global.conf
58     #
59 dpavlin 352
60     $self->{global_config_file} = new Config::IniFiles( -file => 'global.conf' ) || croak "can't open 'global.conf'";
61    
62     # read global config parametars
63     foreach my $var (qw(
64     dbi_dbd
65     dbi_dsn
66     dbi_user
67     dbi_passwd
68     show_progress
69     my_unac_filter
70     )) {
71     $self->{global_config}->{$var} = $self->{global_config_file}->val('global', $var);
72     }
73    
74 dpavlin 353 #
75     # read indexer config file
76     #
77 dpavlin 352
78     $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || croak "can't open '$self->{config_file}'";
79    
80 dpavlin 366 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
81 dpavlin 352 return $self;
82     }
83    
84     =head2 open_isis
85    
86     Open CDS/ISIS database using OpenIsis module and read all records to memory.
87    
88     $webpac->open_isis(
89     filename => '/data/ISIS/ISIS',
90     code_page => '852',
91     limit_mfn => '500',
92     lookup => [ ... ],
93     );
94    
95     By default, ISIS code page is assumed to be C<852>.
96    
97 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
98     from database in example above.
99 dpavlin 352
100     Returns number of last record read into memory (size of database, really).
101    
102     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
103     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
104     value in index.
105    
106     lookup => [
107     { 'key' => 'd:v900', 'val' => 'v250^a' },
108     { 'eval' => '"v901^a" eq "Podruèje"',
109     'key' => 'pa:v561^4:v562^4:v461^1',
110     'val' => 'v900' },
111     ]
112    
113     =cut
114    
115     sub open_isis {
116     my $self = shift;
117     my $arg = {@_};
118    
119     croak "need filename" if (! $arg->{'filename'});
120     my $code_page = $arg->{'code_page'} || '852';
121    
122 dpavlin 353 use OpenIsis;
123    
124 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
125    
126     # create Text::Iconv object
127     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
128    
129 dpavlin 363 print STDERR "reading ISIS database '",$arg->{'filename'},"'\n" if ($self->{'debug'});
130    
131 dpavlin 352 my $isis_db = OpenIsis::open($arg->{'filename'});
132    
133     my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
134    
135 dpavlin 357 $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});
136    
137 dpavlin 363 print STDERR "processing $maxmfn records...\n" if ($self->{'debug'});
138    
139 dpavlin 352 # read database
140     for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
141    
142     # read record
143     my $row = OpenIsis::read( $isis_db, $mfn );
144     foreach my $k (keys %{$row}) {
145     if ($k ne "mfn") {
146     foreach my $l (@{$row->{$k}}) {
147     $l = $cp->convert($l);
148     # has subfields?
149     my $val;
150     if ($l =~ m/\^/) {
151     foreach my $t (split(/\^/,$l)) {
152     next if (! $t);
153     $val->{substr($t,0,1)} = substr($t,1);
154     }
155     } else {
156     $val = $l;
157     }
158    
159     push @{$self->{'data'}->{$mfn}->{$k}}, $val;
160     }
161     }
162    
163     }
164    
165     # create lookup
166 dpavlin 355 my $rec = $self->{'data'}->{$mfn};
167     $self->create_lookup($rec, @{$arg->{'lookup'}});
168 dpavlin 352
169     }
170    
171 dpavlin 362 $self->{'current_mfn'} = 1;
172    
173 dpavlin 352 # store max mfn and return it.
174     return $self->{'max_mfn'} = $maxmfn;
175     }
176    
177 dpavlin 362 =head2 fetch_rec
178    
179     Fetch next record from database. It will also display progress bar (once
180     it's implemented, that is).
181    
182     my $rec = $webpac->fetch_rec;
183    
184     =cut
185    
186     sub fetch_rec {
187     my $self = shift;
188    
189     my $mfn = $self->{'current_mfn'}++ || confess "it seems that you didn't load database!";
190    
191     if ($mfn > $self->{'max_mfn'}) {
192     $self->{'current_mfn'} = $self->{'max_mfn'};
193     return;
194     }
195    
196     return $self->{'data'}->{$mfn};
197     }
198    
199 dpavlin 363 =head2 open_import_xml
200    
201     Read file from C<import_xml/> directory and parse it.
202    
203     $webpac->open_import_xml(type => 'isis');
204    
205     =cut
206    
207     sub open_import_xml {
208     my $self = shift;
209    
210     my $arg = {@_};
211     confess "need type to load file from import_xml/" if (! $arg->{'type'});
212    
213 dpavlin 366 $self->{'type'} = $arg->{'type'};
214 dpavlin 363
215 dpavlin 366 my $type_base = $arg->{'type'};
216 dpavlin 363 $type_base =~ s/_.*$//g;
217    
218 dpavlin 366 $self->{'tag'} = $type2tag{$type_base};
219    
220     print STDERR "using type ",$self->{'type'}," tag ",$self->{'tag'},"\n" if ($self->{'debug'});
221    
222     my $f = "./import_xml/".$self->{'type'}.".xml";
223 dpavlin 363 confess "import_xml file '$f' doesn't exist!" if (! -e "$f");
224    
225     print STDERR "reading '$f'\n" if ($self->{'debug'});
226    
227     $self->{'import_xml'} = XMLin($f,
228 dpavlin 366 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
229 dpavlin 363 ForceContent => 1
230     );
231    
232     print Dumper($self->{'import_xml'});
233    
234     }
235    
236 dpavlin 355 =head2 create_lookup
237    
238     Create lookup from record using lookup definition.
239    
240     =cut
241    
242     sub create_lookup {
243     my $self = shift;
244    
245     my $rec = shift || confess "need record to create lookup";
246     confess("need HASH as first argument!") if ($rec !~ /HASH/o);
247    
248     foreach my $i (@_) {
249     if ($i->{'eval'}) {
250     my $eval = $self->fill_in($rec,$i->{'eval'});
251     my $key = $self->fill_in($rec,$i->{'key'});
252     my @val = $self->fill_in($rec,$i->{'val'});
253     if ($key && @val && eval $eval) {
254     push @{$self->{'lookup'}->{$key}}, @val;
255     }
256     } else {
257     my $key = $self->fill_in($rec,$i->{'key'});
258     my @val = $self->fill_in($rec,$i->{'val'});
259     if ($key && @val) {
260     push @{$self->{'lookup'}->{$key}}, @val;
261     }
262     }
263     }
264     }
265    
266 dpavlin 356 =head2 get_data
267    
268     Returns value from record.
269    
270     $self->get_data(\$rec,$f,$sf,$i,\$found);
271    
272     Arguments are:
273     record reference C<$rec>,
274     field C<$f>,
275     optional subfiled C<$sf>,
276     index for repeatable values C<$i>.
277    
278     Optinal variable C<$found> will be incremeted if thre
279     is field.
280    
281     Returns value or empty string.
282    
283     =cut
284    
285     sub get_data {
286     my $self = shift;
287    
288     my ($rec,$f,$sf,$i,$found) = @_;
289     if ($$rec->{$f}) {
290     if ($sf && $$rec->{$f}->[$i]->{$sf}) {
291     $$found++ if (defined($$found));
292     return $$rec->{$f}->[$i]->{$sf};
293     } elsif ($$rec->{$f}->[$i]) {
294     $$found++ if (defined($$found));
295 dpavlin 366 # it still might have subfield, just
296     # not specified, so we'll dump all
297     if ($$rec->{$f}->[$i] =~ /HASH/o) {
298     my $out;
299     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
300     $out .= $$rec->{$f}->[$i]->{$k}." ";
301     }
302     return $out;
303     } else {
304     return $$rec->{$f}->[$i];
305     }
306 dpavlin 356 }
307     } else {
308     return '';
309     }
310     }
311    
312 dpavlin 352 =head2 fill_in
313    
314     Workhourse of all: takes record from in-memory structure of database and
315     strings with placeholders and returns string or array of with substituted
316     values from record.
317    
318     $webpac->fill_in($rec,'v250^a');
319    
320     Optional argument is ordinal number for repeatable fields. By default,
321 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
322     element is 0).
323     Following example will read second value from repeatable field.
324 dpavlin 352
325 dpavlin 353 $webpac->fill_in($rec,'Title: v250^a',1);
326    
327     This function B<does not> perform parsing of format to inteligenty skip
328     delimiters before fields which aren't used.
329    
330 dpavlin 352 =cut
331    
332     sub fill_in {
333     my $self = shift;
334    
335     my $rec = shift || confess "need data record";
336     my $format = shift || confess "need format to parse";
337     # iteration (for repeatable fields)
338     my $i = shift || 0;
339    
340     # FIXME remove for speedup?
341 dpavlin 355 confess("need HASH as first argument!") if ($rec !~ /HASH/o);
342 dpavlin 352
343     my $found = 0;
344    
345 dpavlin 359 my $eval_code;
346     # remove eval{...} from beginning
347     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
348    
349 dpavlin 352 # do actual replacement of placeholders
350 dpavlin 356 $format =~ s/v(\d+)(?:\^(\w))*/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
351 dpavlin 352
352 dpavlin 353 if ($found) {
353 dpavlin 359 if ($eval_code) {
354     my $eval = $self->fill_in($rec,$eval_code,$i);
355     return if (! eval $eval);
356     }
357 dpavlin 353 # do we have lookups?
358     if ($format =~ /\[[^\[\]]+\]/o) {
359     return $self->lookup($format);
360     } else {
361     return $format;
362     }
363 dpavlin 352 } else {
364     return;
365     }
366     }
367    
368     =head2 lookup
369    
370 dpavlin 355 Perform lookups on format supplied to it.
371 dpavlin 352
372     my $txt = $self->lookup('[v900]');
373    
374 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
375    
376 dpavlin 352 =cut
377    
378     sub lookup {
379     my $self = shift;
380    
381     my $tmp = shift || confess "need format";
382    
383 dpavlin 353 if ($tmp =~ /\[[^\[\]]+\]/o) {
384 dpavlin 352 my @in = ( $tmp );
385 dpavlin 353 #print "##lookup $tmp\n";
386 dpavlin 352 my @out;
387     while (my $f = shift @in) {
388     if ($f =~ /\[([^\[\]]+)\]/) {
389     my $k = $1;
390     if ($self->{'lookup'}->{$k}) {
391 dpavlin 353 #print "## lookup key = $k\n";
392 dpavlin 352 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
393     my $tmp2 = $f;
394     $tmp2 =~ s/\[$k\]/$nv/g;
395     push @in, $tmp2;
396 dpavlin 353 #print "## lookup in => $tmp2\n";
397 dpavlin 352 }
398     } else {
399     undef $f;
400     }
401     } elsif ($f) {
402     push @out, $f;
403 dpavlin 353 #print "## lookup out => $f\n";
404 dpavlin 352 }
405     }
406     return @out;
407     } else {
408     return $tmp;
409     }
410     }
411    
412 dpavlin 356 =head2 parse
413    
414     Perform smart parsing of string, skipping delimiters for fields which aren't
415     defined. It can also eval code in format starting with C<eval{...}> and
416     return output or nothing depending on eval code.
417    
418     $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
419    
420     =cut
421    
422     sub parse {
423     my $self = shift;
424    
425 dpavlin 366 my ($rec, $format_utf8, $i) = @_;
426 dpavlin 356
427 dpavlin 366 return if (! $format_utf8);
428    
429 dpavlin 358 confess("need HASH as first argument!") if ($rec !~ /HASH/o);
430 dpavlin 366 confess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
431 dpavlin 358
432     $i = 0 if (! $i);
433    
434 dpavlin 366 my $format = $self->{'utf2cp'}->convert($format_utf8) || confess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
435    
436 dpavlin 356 my @out;
437    
438     my $eval_code;
439     # remove eval{...} from beginning
440     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
441    
442 dpavlin 358 my $prefix;
443     my $all_found=0;
444 dpavlin 356
445 dpavlin 359 #print "## $format\n";
446 dpavlin 358 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))*//s) {
447 dpavlin 359 #print "## [ $1 | $2 | $3 ] $format\n";
448 dpavlin 356
449 dpavlin 358 my $del = $1 || '';
450 dpavlin 359 $prefix ||= $del if ($all_found == 0);
451 dpavlin 358
452     my $found = 0;
453     my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
454    
455 dpavlin 356 if ($found) {
456 dpavlin 358 push @out, $del;
457     push @out, $tmp;
458     $all_found += $found;
459 dpavlin 356 }
460     }
461    
462 dpavlin 358 return if (! $all_found);
463 dpavlin 356
464 dpavlin 358 my $out = join('',@out) . $format;
465    
466     # add prefix if not there
467     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
468    
469 dpavlin 359 if ($eval_code) {
470     my $eval = $self->fill_in($rec,$eval_code,$i);
471     return if (! eval $eval);
472     }
473    
474 dpavlin 358 return $out;
475 dpavlin 356 }
476    
477 dpavlin 366 =head2 data_structure
478    
479     Create in-memory data structure which represents layout from C<import_xml>.
480     It is used later to produce output.
481    
482     my $ds = $webpac->data_structure($rec);
483    
484     =cut
485    
486     # private method _sort_by_order
487     # sort subrouting using order="" attribute
488     sub _sort_by_order {
489     my $self = shift;
490    
491     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
492     $self->{'import_xml'}->{'indexer'}->{$a};
493     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
494     $self->{'import_xml'}->{'indexer'}->{$b};
495    
496     return $va <=> $vb;
497     }
498    
499     sub data_structure {
500     my $self = shift;
501    
502     my $rec = shift;
503     confess("need HASH as first argument!") if ($rec !~ /HASH/o);
504    
505     my @sorted_tags;
506     if ($self->{tags_by_order}) {
507     @sorted_tags = @{$self->{tags_by_order}};
508     } else {
509     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
510     $self->{tags_by_order} = \@sorted_tags;
511     }
512    
513     my $ds;
514    
515     foreach my $field (@sorted_tags) {
516    
517     my $row;
518     my $i = 0;
519    
520     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
521    
522     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
523    
524     my $v = $self->parse($rec,$tag->{'content'},$i);
525     print "## $i:",$tag->{'content'}," = ",($v || 'null'),"\n";
526    
527     next if (!$v || $v && $v eq '');
528    
529     # does tag have type?
530     if ($tag->{'type'}) {
531     push @{$row->{$tag->{'type'}}}, $v;
532     } else {
533     push @{$row->{'display'}}, $v;
534     push @{$row->{'swish'}}, $v;
535     }
536     }
537    
538     push @{$ds->{$field}}, $row if ($row);
539    
540     }
541    
542     print Dumper($ds);
543    
544     }
545    
546 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26