/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 366 - (show annotations)
Thu Jun 17 01:44:25 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 11894 byte(s)
make in-memory data_structure

1 package WebPAC;
2
3 use Carp;
4 use Text::Iconv;
5 use Config::IniFiles;
6 use XML::Simple;
7
8 use Data::Dumper;
9
10 =head1 NAME
11
12 WebPAC - base class for WebPAC
13
14 =head1 DESCRIPTION
15
16 This module implements methods used by WebPAC.
17
18 =head1 METHODS
19
20 =head2 new
21
22 This will create new instance of WebPAC using configuration specified by C<config_file>.
23
24 my $webpac = new WebPAC(
25 config_file => 'name.conf',
26 [code_page => 'ISO-8859-2',]
27 );
28
29 Default C<code_page> is C<ISO-8859-2>.
30
31 It will also read configuration files
32 C<global.conf> (used by indexer and Web font-end)
33 and configuration file specified by C<config_file>
34 which describes databases to be indexed.
35
36 =cut
37
38 # mapping between data type and tag which specify
39 # format in XML file
40 my %type2tag = (
41 'isis' => 'isis',
42 # 'excel' => 'column',
43 # 'marc' => 'marc',
44 # 'feed' => 'feed'
45 );
46
47 sub new {
48 my $class = shift;
49 my $self = {@_};
50 bless($self, $class);
51
52 # fill in default values
53 # output codepage
54 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
55
56 #
57 # read global.conf
58 #
59
60 $self->{global_config_file} = new Config::IniFiles( -file => 'global.conf' ) || croak "can't open 'global.conf'";
61
62 # read global config parametars
63 foreach my $var (qw(
64 dbi_dbd
65 dbi_dsn
66 dbi_user
67 dbi_passwd
68 show_progress
69 my_unac_filter
70 )) {
71 $self->{global_config}->{$var} = $self->{global_config_file}->val('global', $var);
72 }
73
74 #
75 # read indexer config file
76 #
77
78 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || croak "can't open '$self->{config_file}'";
79
80 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
81 return $self;
82 }
83
84 =head2 open_isis
85
86 Open CDS/ISIS database using OpenIsis module and read all records to memory.
87
88 $webpac->open_isis(
89 filename => '/data/ISIS/ISIS',
90 code_page => '852',
91 limit_mfn => '500',
92 lookup => [ ... ],
93 );
94
95 By default, ISIS code page is assumed to be C<852>.
96
97 If optional parametar C<limit_mfn> is set, it will read just 500 records
98 from database in example above.
99
100 Returns number of last record read into memory (size of database, really).
101
102 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
103 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
104 value in index.
105
106 lookup => [
107 { 'key' => 'd:v900', 'val' => 'v250^a' },
108 { 'eval' => '"v901^a" eq "Podruèje"',
109 'key' => 'pa:v561^4:v562^4:v461^1',
110 'val' => 'v900' },
111 ]
112
113 =cut
114
115 sub open_isis {
116 my $self = shift;
117 my $arg = {@_};
118
119 croak "need filename" if (! $arg->{'filename'});
120 my $code_page = $arg->{'code_page'} || '852';
121
122 use OpenIsis;
123
124 #$self->{'isis_code_page'} = $code_page;
125
126 # create Text::Iconv object
127 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
128
129 print STDERR "reading ISIS database '",$arg->{'filename'},"'\n" if ($self->{'debug'});
130
131 my $isis_db = OpenIsis::open($arg->{'filename'});
132
133 my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
134
135 $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});
136
137 print STDERR "processing $maxmfn records...\n" if ($self->{'debug'});
138
139 # read database
140 for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
141
142 # read record
143 my $row = OpenIsis::read( $isis_db, $mfn );
144 foreach my $k (keys %{$row}) {
145 if ($k ne "mfn") {
146 foreach my $l (@{$row->{$k}}) {
147 $l = $cp->convert($l);
148 # has subfields?
149 my $val;
150 if ($l =~ m/\^/) {
151 foreach my $t (split(/\^/,$l)) {
152 next if (! $t);
153 $val->{substr($t,0,1)} = substr($t,1);
154 }
155 } else {
156 $val = $l;
157 }
158
159 push @{$self->{'data'}->{$mfn}->{$k}}, $val;
160 }
161 }
162
163 }
164
165 # create lookup
166 my $rec = $self->{'data'}->{$mfn};
167 $self->create_lookup($rec, @{$arg->{'lookup'}});
168
169 }
170
171 $self->{'current_mfn'} = 1;
172
173 # store max mfn and return it.
174 return $self->{'max_mfn'} = $maxmfn;
175 }
176
177 =head2 fetch_rec
178
179 Fetch next record from database. It will also display progress bar (once
180 it's implemented, that is).
181
182 my $rec = $webpac->fetch_rec;
183
184 =cut
185
186 sub fetch_rec {
187 my $self = shift;
188
189 my $mfn = $self->{'current_mfn'}++ || confess "it seems that you didn't load database!";
190
191 if ($mfn > $self->{'max_mfn'}) {
192 $self->{'current_mfn'} = $self->{'max_mfn'};
193 return;
194 }
195
196 return $self->{'data'}->{$mfn};
197 }
198
199 =head2 open_import_xml
200
201 Read file from C<import_xml/> directory and parse it.
202
203 $webpac->open_import_xml(type => 'isis');
204
205 =cut
206
207 sub open_import_xml {
208 my $self = shift;
209
210 my $arg = {@_};
211 confess "need type to load file from import_xml/" if (! $arg->{'type'});
212
213 $self->{'type'} = $arg->{'type'};
214
215 my $type_base = $arg->{'type'};
216 $type_base =~ s/_.*$//g;
217
218 $self->{'tag'} = $type2tag{$type_base};
219
220 print STDERR "using type ",$self->{'type'}," tag ",$self->{'tag'},"\n" if ($self->{'debug'});
221
222 my $f = "./import_xml/".$self->{'type'}.".xml";
223 confess "import_xml file '$f' doesn't exist!" if (! -e "$f");
224
225 print STDERR "reading '$f'\n" if ($self->{'debug'});
226
227 $self->{'import_xml'} = XMLin($f,
228 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
229 ForceContent => 1
230 );
231
232 print Dumper($self->{'import_xml'});
233
234 }
235
236 =head2 create_lookup
237
238 Create lookup from record using lookup definition.
239
240 =cut
241
242 sub create_lookup {
243 my $self = shift;
244
245 my $rec = shift || confess "need record to create lookup";
246 confess("need HASH as first argument!") if ($rec !~ /HASH/o);
247
248 foreach my $i (@_) {
249 if ($i->{'eval'}) {
250 my $eval = $self->fill_in($rec,$i->{'eval'});
251 my $key = $self->fill_in($rec,$i->{'key'});
252 my @val = $self->fill_in($rec,$i->{'val'});
253 if ($key && @val && eval $eval) {
254 push @{$self->{'lookup'}->{$key}}, @val;
255 }
256 } else {
257 my $key = $self->fill_in($rec,$i->{'key'});
258 my @val = $self->fill_in($rec,$i->{'val'});
259 if ($key && @val) {
260 push @{$self->{'lookup'}->{$key}}, @val;
261 }
262 }
263 }
264 }
265
266 =head2 get_data
267
268 Returns value from record.
269
270 $self->get_data(\$rec,$f,$sf,$i,\$found);
271
272 Arguments are:
273 record reference C<$rec>,
274 field C<$f>,
275 optional subfiled C<$sf>,
276 index for repeatable values C<$i>.
277
278 Optinal variable C<$found> will be incremeted if thre
279 is field.
280
281 Returns value or empty string.
282
283 =cut
284
285 sub get_data {
286 my $self = shift;
287
288 my ($rec,$f,$sf,$i,$found) = @_;
289 if ($$rec->{$f}) {
290 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
291 $$found++ if (defined($$found));
292 return $$rec->{$f}->[$i]->{$sf};
293 } elsif ($$rec->{$f}->[$i]) {
294 $$found++ if (defined($$found));
295 # it still might have subfield, just
296 # not specified, so we'll dump all
297 if ($$rec->{$f}->[$i] =~ /HASH/o) {
298 my $out;
299 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
300 $out .= $$rec->{$f}->[$i]->{$k}." ";
301 }
302 return $out;
303 } else {
304 return $$rec->{$f}->[$i];
305 }
306 }
307 } else {
308 return '';
309 }
310 }
311
312 =head2 fill_in
313
314 Workhourse of all: takes record from in-memory structure of database and
315 strings with placeholders and returns string or array of with substituted
316 values from record.
317
318 $webpac->fill_in($rec,'v250^a');
319
320 Optional argument is ordinal number for repeatable fields. By default,
321 it's assume to be first repeatable field (fields are perl array, so first
322 element is 0).
323 Following example will read second value from repeatable field.
324
325 $webpac->fill_in($rec,'Title: v250^a',1);
326
327 This function B<does not> perform parsing of format to inteligenty skip
328 delimiters before fields which aren't used.
329
330 =cut
331
332 sub fill_in {
333 my $self = shift;
334
335 my $rec = shift || confess "need data record";
336 my $format = shift || confess "need format to parse";
337 # iteration (for repeatable fields)
338 my $i = shift || 0;
339
340 # FIXME remove for speedup?
341 confess("need HASH as first argument!") if ($rec !~ /HASH/o);
342
343 my $found = 0;
344
345 my $eval_code;
346 # remove eval{...} from beginning
347 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
348
349 # do actual replacement of placeholders
350 $format =~ s/v(\d+)(?:\^(\w))*/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
351
352 if ($found) {
353 if ($eval_code) {
354 my $eval = $self->fill_in($rec,$eval_code,$i);
355 return if (! eval $eval);
356 }
357 # do we have lookups?
358 if ($format =~ /\[[^\[\]]+\]/o) {
359 return $self->lookup($format);
360 } else {
361 return $format;
362 }
363 } else {
364 return;
365 }
366 }
367
368 =head2 lookup
369
370 Perform lookups on format supplied to it.
371
372 my $txt = $self->lookup('[v900]');
373
374 Lookups can be nested (like C<[d:[a:[v900]]]>).
375
376 =cut
377
378 sub lookup {
379 my $self = shift;
380
381 my $tmp = shift || confess "need format";
382
383 if ($tmp =~ /\[[^\[\]]+\]/o) {
384 my @in = ( $tmp );
385 #print "##lookup $tmp\n";
386 my @out;
387 while (my $f = shift @in) {
388 if ($f =~ /\[([^\[\]]+)\]/) {
389 my $k = $1;
390 if ($self->{'lookup'}->{$k}) {
391 #print "## lookup key = $k\n";
392 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
393 my $tmp2 = $f;
394 $tmp2 =~ s/\[$k\]/$nv/g;
395 push @in, $tmp2;
396 #print "## lookup in => $tmp2\n";
397 }
398 } else {
399 undef $f;
400 }
401 } elsif ($f) {
402 push @out, $f;
403 #print "## lookup out => $f\n";
404 }
405 }
406 return @out;
407 } else {
408 return $tmp;
409 }
410 }
411
412 =head2 parse
413
414 Perform smart parsing of string, skipping delimiters for fields which aren't
415 defined. It can also eval code in format starting with C<eval{...}> and
416 return output or nothing depending on eval code.
417
418 $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
419
420 =cut
421
422 sub parse {
423 my $self = shift;
424
425 my ($rec, $format_utf8, $i) = @_;
426
427 return if (! $format_utf8);
428
429 confess("need HASH as first argument!") if ($rec !~ /HASH/o);
430 confess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
431
432 $i = 0 if (! $i);
433
434 my $format = $self->{'utf2cp'}->convert($format_utf8) || confess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
435
436 my @out;
437
438 my $eval_code;
439 # remove eval{...} from beginning
440 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
441
442 my $prefix;
443 my $all_found=0;
444
445 #print "## $format\n";
446 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))*//s) {
447 #print "## [ $1 | $2 | $3 ] $format\n";
448
449 my $del = $1 || '';
450 $prefix ||= $del if ($all_found == 0);
451
452 my $found = 0;
453 my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
454
455 if ($found) {
456 push @out, $del;
457 push @out, $tmp;
458 $all_found += $found;
459 }
460 }
461
462 return if (! $all_found);
463
464 my $out = join('',@out) . $format;
465
466 # add prefix if not there
467 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
468
469 if ($eval_code) {
470 my $eval = $self->fill_in($rec,$eval_code,$i);
471 return if (! eval $eval);
472 }
473
474 return $out;
475 }
476
477 =head2 data_structure
478
479 Create in-memory data structure which represents layout from C<import_xml>.
480 It is used later to produce output.
481
482 my $ds = $webpac->data_structure($rec);
483
484 =cut
485
486 # private method _sort_by_order
487 # sort subrouting using order="" attribute
488 sub _sort_by_order {
489 my $self = shift;
490
491 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
492 $self->{'import_xml'}->{'indexer'}->{$a};
493 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
494 $self->{'import_xml'}->{'indexer'}->{$b};
495
496 return $va <=> $vb;
497 }
498
499 sub data_structure {
500 my $self = shift;
501
502 my $rec = shift;
503 confess("need HASH as first argument!") if ($rec !~ /HASH/o);
504
505 my @sorted_tags;
506 if ($self->{tags_by_order}) {
507 @sorted_tags = @{$self->{tags_by_order}};
508 } else {
509 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
510 $self->{tags_by_order} = \@sorted_tags;
511 }
512
513 my $ds;
514
515 foreach my $field (@sorted_tags) {
516
517 my $row;
518 my $i = 0;
519
520 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
521
522 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
523
524 my $v = $self->parse($rec,$tag->{'content'},$i);
525 print "## $i:",$tag->{'content'}," = ",($v || 'null'),"\n";
526
527 next if (!$v || $v && $v eq '');
528
529 # does tag have type?
530 if ($tag->{'type'}) {
531 push @{$row->{$tag->{'type'}}}, $v;
532 } else {
533 push @{$row->{'display'}}, $v;
534 push @{$row->{'swish'}}, $v;
535 }
536 }
537
538 push @{$ds->{$field}}, $row if ($row);
539
540 }
541
542 print Dumper($ds);
543
544 }
545
546 1;

  ViewVC Help
Powered by ViewVC 1.1.26