/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 367 - (show annotations)
Thu Jun 17 12:05:01 2004 UTC (15 years, 7 months ago) by dpavlin
File size: 12257 byte(s)
parse_to_arr, repeatable fieldns now work, documentation improvements

1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10
11 use Data::Dumper;
12
13 =head1 NAME
14
15 WebPAC - base class for WebPAC
16
17 =head1 DESCRIPTION
18
19 This module implements methods used by WebPAC.
20
21 =head1 METHODS
22
23 =head2 new
24
25 This will create new instance of WebPAC using configuration specified by C<config_file>.
26
27 my $webpac = new WebPAC(
28 config_file => 'name.conf',
29 [code_page => 'ISO-8859-2',]
30 );
31
32 Default C<code_page> is C<ISO-8859-2>.
33
34 It will also read configuration files
35 C<global.conf> (used by indexer and Web font-end)
36 and configuration file specified by C<config_file>
37 which describes databases to be indexed.
38
39 =cut
40
41 # mapping between data type and tag which specify
42 # format in XML file
43 my %type2tag = (
44 'isis' => 'isis',
45 # 'excel' => 'column',
46 # 'marc' => 'marc',
47 # 'feed' => 'feed'
48 );
49
50 sub new {
51 my $class = shift;
52 my $self = {@_};
53 bless($self, $class);
54
55 # fill in default values
56 # output codepage
57 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
58
59 #
60 # read global.conf
61 #
62
63 $self->{global_config_file} = new Config::IniFiles( -file => 'global.conf' ) || croak "can't open 'global.conf'";
64
65 # read global config parametars
66 foreach my $var (qw(
67 dbi_dbd
68 dbi_dsn
69 dbi_user
70 dbi_passwd
71 show_progress
72 my_unac_filter
73 )) {
74 $self->{global_config}->{$var} = $self->{global_config_file}->val('global', $var);
75 }
76
77 #
78 # read indexer config file
79 #
80
81 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || croak "can't open '$self->{config_file}'";
82
83 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
84 return $self;
85 }
86
87 =head2 open_isis
88
89 Open CDS/ISIS database using OpenIsis module and read all records to memory.
90
91 $webpac->open_isis(
92 filename => '/data/ISIS/ISIS',
93 code_page => '852',
94 limit_mfn => '500',
95 lookup => [ ... ],
96 );
97
98 By default, ISIS code page is assumed to be C<852>.
99
100 If optional parametar C<limit_mfn> is set, it will read just 500 records
101 from database in example above.
102
103 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
104 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
105 value in index.
106
107 lookup => [
108 { 'key' => 'd:v900', 'val' => 'v250^a' },
109 { 'eval' => '"v901^a" eq "Podruèje"',
110 'key' => 'pa:v561^4:v562^4:v461^1',
111 'val' => 'v900' },
112 ]
113
114 Returns number of last record read into memory (size of database, really).
115
116 =cut
117
118 sub open_isis {
119 my $self = shift;
120 my $arg = {@_};
121
122 croak "need filename" if (! $arg->{'filename'});
123 my $code_page = $arg->{'code_page'} || '852';
124
125 use OpenIsis;
126
127 #$self->{'isis_code_page'} = $code_page;
128
129 # create Text::Iconv object
130 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
131
132 print STDERR "reading ISIS database '",$arg->{'filename'},"'\n" if ($self->{'debug'});
133
134 my $isis_db = OpenIsis::open($arg->{'filename'});
135
136 my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
137
138 $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});
139
140 print STDERR "processing $maxmfn records...\n" if ($self->{'debug'});
141
142 # read database
143 for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
144
145 # read record
146 my $row = OpenIsis::read( $isis_db, $mfn );
147 foreach my $k (keys %{$row}) {
148 if ($k ne "mfn") {
149 foreach my $l (@{$row->{$k}}) {
150 $l = $cp->convert($l);
151 # has subfields?
152 my $val;
153 if ($l =~ m/\^/) {
154 foreach my $t (split(/\^/,$l)) {
155 next if (! $t);
156 $val->{substr($t,0,1)} = substr($t,1);
157 }
158 } else {
159 $val = $l;
160 }
161
162 push @{$self->{'data'}->{$mfn}->{$k}}, $val;
163 }
164 }
165
166 }
167
168 # create lookup
169 my $rec = $self->{'data'}->{$mfn};
170 $self->create_lookup($rec, @{$arg->{'lookup'}});
171
172 }
173
174 $self->{'current_mfn'} = 1;
175
176 # store max mfn and return it.
177 return $self->{'max_mfn'} = $maxmfn;
178 }
179
180 =head2 fetch_rec
181
182 Fetch next record from database. It will also display progress bar (once
183 it's implemented, that is).
184
185 my $rec = $webpac->fetch_rec;
186
187 =cut
188
189 sub fetch_rec {
190 my $self = shift;
191
192 my $mfn = $self->{'current_mfn'}++ || confess "it seems that you didn't load database!";
193
194 if ($mfn > $self->{'max_mfn'}) {
195 $self->{'current_mfn'} = $self->{'max_mfn'};
196 return;
197 }
198
199 return $self->{'data'}->{$mfn};
200 }
201
202 =head2 open_import_xml
203
204 Read file from C<import_xml/> directory and parse it.
205
206 $webpac->open_import_xml(type => 'isis');
207
208 =cut
209
210 sub open_import_xml {
211 my $self = shift;
212
213 my $arg = {@_};
214 confess "need type to load file from import_xml/" if (! $arg->{'type'});
215
216 $self->{'type'} = $arg->{'type'};
217
218 my $type_base = $arg->{'type'};
219 $type_base =~ s/_.*$//g;
220
221 $self->{'tag'} = $type2tag{$type_base};
222
223 print STDERR "using type ",$self->{'type'}," tag ",$self->{'tag'},"\n" if ($self->{'debug'});
224
225 my $f = "./import_xml/".$self->{'type'}.".xml";
226 confess "import_xml file '$f' doesn't exist!" if (! -e "$f");
227
228 print STDERR "reading '$f'\n" if ($self->{'debug'});
229
230 $self->{'import_xml'} = XMLin($f,
231 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
232 ForceContent => 1
233 );
234
235 print Dumper($self->{'import_xml'});
236
237 }
238
239 =head2 create_lookup
240
241 Create lookup from record using lookup definition.
242
243 $self->create_lookup($rec, @lookups);
244
245 Called internally by C<open_*> methods.
246
247 =cut
248
249 sub create_lookup {
250 my $self = shift;
251
252 my $rec = shift || confess "need record to create lookup";
253 confess("need HASH as first argument!") if ($rec !~ /HASH/o);
254
255 foreach my $i (@_) {
256 if ($i->{'eval'}) {
257 my $eval = $self->fill_in($rec,$i->{'eval'});
258 my $key = $self->fill_in($rec,$i->{'key'});
259 my @val = $self->fill_in($rec,$i->{'val'});
260 if ($key && @val && eval $eval) {
261 push @{$self->{'lookup'}->{$key}}, @val;
262 }
263 } else {
264 my $key = $self->fill_in($rec,$i->{'key'});
265 my @val = $self->fill_in($rec,$i->{'val'});
266 if ($key && @val) {
267 push @{$self->{'lookup'}->{$key}}, @val;
268 }
269 }
270 }
271 }
272
273 =head2 get_data
274
275 Returns value from record.
276
277 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
278
279 Arguments are:
280 record reference C<$rec>,
281 field C<$f>,
282 optional subfiled C<$sf>,
283 index for repeatable values C<$i>.
284
285 Optinal variable C<$found> will be incremeted if there
286 is field.
287
288 Returns value or empty string.
289
290 =cut
291
292 sub get_data {
293 my $self = shift;
294
295 my ($rec,$f,$sf,$i,$found) = @_;
296
297 if ($$rec->{$f}) {
298 return '' if (! $$rec->{$f}->[$i]);
299 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
300 $$found++ if (defined($$found));
301 return $$rec->{$f}->[$i]->{$sf};
302 } elsif ($$rec->{$f}->[$i]) {
303 $$found++ if (defined($$found));
304 # it still might have subfield, just
305 # not specified, so we'll dump all
306 if ($$rec->{$f}->[$i] =~ /HASH/o) {
307 my $out;
308 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
309 $out .= $$rec->{$f}->[$i]->{$k}." ";
310 }
311 return $out;
312 } else {
313 return $$rec->{$f}->[$i];
314 }
315 }
316 } else {
317 return '';
318 }
319 }
320
321 =head2 fill_in
322
323 Workhourse of all: takes record from in-memory structure of database and
324 strings with placeholders and returns string or array of with substituted
325 values from record.
326
327 my $text = $webpac->fill_in($rec,'v250^a');
328
329 Optional argument is ordinal number for repeatable fields. By default,
330 it's assume to be first repeatable field (fields are perl array, so first
331 element is 0).
332 Following example will read second value from repeatable field.
333
334 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
335
336 This function B<does not> perform parsing of format to inteligenty skip
337 delimiters before fields which aren't used.
338
339 =cut
340
341 sub fill_in {
342 my $self = shift;
343
344 my $rec = shift || confess "need data record";
345 my $format = shift || confess "need format to parse";
346 # iteration (for repeatable fields)
347 my $i = shift || 0;
348
349 # FIXME remove for speedup?
350 confess("need HASH as first argument!") if ($rec !~ /HASH/o);
351
352 my $found = 0;
353
354 my $eval_code;
355 # remove eval{...} from beginning
356 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
357
358 # do actual replacement of placeholders
359 $format =~ s/v(\d+)(?:\^(\w))*/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
360
361 if ($found) {
362 if ($eval_code) {
363 my $eval = $self->fill_in($rec,$eval_code,$i);
364 return if (! eval $eval);
365 }
366 # do we have lookups?
367 if ($format =~ /\[[^\[\]]+\]/o) {
368 return $self->lookup($format);
369 } else {
370 return $format;
371 }
372 } else {
373 return;
374 }
375 }
376
377 =head2 lookup
378
379 Perform lookups on format supplied to it.
380
381 my $text = $self->lookup('[v900]');
382
383 Lookups can be nested (like C<[d:[a:[v900]]]>).
384
385 =cut
386
387 sub lookup {
388 my $self = shift;
389
390 my $tmp = shift || confess "need format";
391
392 if ($tmp =~ /\[[^\[\]]+\]/o) {
393 my @in = ( $tmp );
394 my @out;
395 while (my $f = shift @in) {
396 if ($f =~ /\[([^\[\]]+)\]/) {
397 my $k = $1;
398 if ($self->{'lookup'}->{$k}) {
399 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
400 my $tmp2 = $f;
401 $tmp2 =~ s/\[$k\]/$nv/g;
402 push @in, $tmp2;
403 }
404 } else {
405 undef $f;
406 }
407 } elsif ($f) {
408 push @out, $f;
409 }
410 }
411 return @out;
412 } else {
413 return $tmp;
414 }
415 }
416
417 =head2 parse
418
419 Perform smart parsing of string, skipping delimiters for fields which aren't
420 defined. It can also eval code in format starting with C<eval{...}> and
421 return output or nothing depending on eval code.
422
423 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
424
425 =cut
426
427 sub parse {
428 my $self = shift;
429
430 my ($rec, $format_utf8, $i) = @_;
431
432 return if (! $format_utf8);
433
434 confess("need HASH as first argument!") if ($rec !~ /HASH/o);
435 confess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
436
437 $i = 0 if (! $i);
438
439 my $format = $self->{'utf2cp'}->convert($format_utf8) || confess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
440
441 my @out;
442
443 my $eval_code;
444 # remove eval{...} from beginning
445 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
446
447 my $prefix;
448 my $all_found=0;
449
450 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))*//s) {
451
452 my $del = $1 || '';
453 $prefix ||= $del if ($all_found == 0);
454
455 my $found = 0;
456 my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
457
458 if ($found) {
459 push @out, $del;
460 push @out, $tmp;
461 $all_found += $found;
462 }
463 }
464
465 return if (! $all_found);
466
467 my $out = join('',@out) . $format;
468
469 # add prefix if not there
470 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
471
472 if ($eval_code) {
473 my $eval = $self->fill_in($rec,$eval_code,$i);
474 return if (! eval $eval);
475 }
476
477 return $out;
478 }
479
480 =head2 parse_to_arr
481
482 Similar to C<parse>, but returns array of all repeatable fields
483
484 my @arr = $webpac->parse_to_arr($rec,'v250^a');
485
486 =cut
487
488 sub parse_to_arr {
489 my $self = shift;
490
491 my ($rec, $format_utf8) = @_;
492
493 confess("need HASH as first argument!") if ($rec !~ /HASH/o);
494 return if (! $format_utf8);
495
496 my $i = 0;
497 my @arr;
498
499 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
500 push @arr, $v;
501 }
502
503 return @arr;
504 }
505
506 =head2 data_structure
507
508 Create in-memory data structure which represents layout from C<import_xml>.
509 It is used later to produce output.
510
511 my $ds = $webpac->data_structure($rec);
512
513 =cut
514
515 # private method _sort_by_order
516 # sort subrouting using order="" attribute
517 sub _sort_by_order {
518 my $self = shift;
519
520 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
521 $self->{'import_xml'}->{'indexer'}->{$a};
522 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
523 $self->{'import_xml'}->{'indexer'}->{$b};
524
525 return $va <=> $vb;
526 }
527
528 sub data_structure {
529 my $self = shift;
530
531 my $rec = shift;
532 confess("need HASH as first argument!") if ($rec !~ /HASH/o);
533
534 my @sorted_tags;
535 if ($self->{tags_by_order}) {
536 @sorted_tags = @{$self->{tags_by_order}};
537 } else {
538 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
539 $self->{tags_by_order} = \@sorted_tags;
540 }
541
542 my $ds;
543
544 foreach my $field (@sorted_tags) {
545
546 my $row;
547
548 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
549
550 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
551 my @v = $self->parse_to_arr($rec,$tag->{'content'});
552
553 next if (! @v);
554
555 # does tag have type?
556 if ($tag->{'type'}) {
557 push @{$row->{$tag->{'type'}}}, @v;
558 } else {
559 push @{$row->{'display'}}, @v;
560 push @{$row->{'swish'}}, @v;
561 }
562 }
563
564 push @{$ds->{$field}}, $row if ($row);
565
566 }
567
568 print "data_structure => ",Dumper($ds);
569
570 }
571
572 1;

  ViewVC Help
Powered by ViewVC 1.1.26