/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 707 - (show annotations)
Wed Jul 13 23:36:53 2005 UTC (18 years, 9 months ago) by dpavlin
File size: 28525 byte(s)
small fix

1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10 use Template;
11 use Log::Log4perl qw(get_logger :levels);
12 use Time::HiRes qw(time);
13
14 use Data::Dumper;
15
16 my ($have_biblio_isis, $have_openisis) = (0,0);
17
18 eval "use Biblio::Isis 0.13;";
19 unless ($@) {
20 $have_biblio_isis = 1
21 } else {
22 eval "use OpenIsis;";
23 $have_openisis = 1 unless ($@);
24 }
25
26 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
27 #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
28 my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
29 my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
30
31 =head1 NAME
32
33 WebPAC - base class for WebPAC
34
35 =head1 DESCRIPTION
36
37 This module implements methods used by WebPAC.
38
39 =head1 METHODS
40
41 =head2 new
42
43 Create new instance of WebPAC using configuration specified by C<config_file>.
44
45 my $webpac = new WebPAC(
46 config_file => 'name.conf',
47 code_page => 'ISO-8859-2',
48 low_mem => 1,
49 filter => {
50 'lower' => sub { lc($_[0]) },
51 },
52 );
53
54 Default C<code_page> is C<ISO-8859-2>.
55
56 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
57
58 There is optinal parametar C<filter> which specify different filters which
59 can be applied using C<filter{name}> notation.
60 Same filters can be used in Template Toolkit files.
61
62 This method will also read configuration files
63 C<global.conf> (used by indexer and Web font-end)
64 and configuration file specified by C<config_file>
65 which describes databases to be indexed.
66
67 =cut
68
69 # mapping between data type and tag which specify
70 # format in XML file
71 my %type2tag = (
72 'isis' => 'isis',
73 # 'excel' => 'column',
74 # 'marc' => 'marc',
75 # 'feed' => 'feed'
76 );
77
78 sub new {
79 my $class = shift;
80 my $self = {@_};
81 bless($self, $class);
82
83 $self->{'start_t'} = time();
84
85 my $log_file = $self->{'log'} || "log.conf";
86 Log::Log4perl->init($log_file);
87
88 my $log = $self->_get_logger();
89
90 # fill in default values
91 # output codepage
92 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
93
94 #
95 # read global.conf
96 #
97 $log->debug("read 'global.conf'");
98
99 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
100
101 # read global config parametars
102 foreach my $var (qw(
103 dbi_dbd
104 dbi_dsn
105 dbi_user
106 dbi_passwd
107 show_progress
108 my_unac_filter
109 output_template
110 )) {
111 $self->{'global_config'}->{$var} = $config->val('global', $var);
112 }
113
114 #
115 # read indexer config file
116 #
117
118 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
119
120 # create UTF-8 convertor for import_xml files
121 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
122
123 # create Template toolkit instance
124 $self->{'tt'} = Template->new(
125 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
126 FILTERS => $self->{'filter'},
127 EVAL_PERL => 1,
128 );
129
130 # running with low_mem flag? well, use DBM::Deep then.
131 if ($self->{'low_mem'}) {
132 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
133
134 my $db_file = "data.db";
135
136 if (-e $db_file) {
137 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
138 $log->debug("removed '$db_file' from last run");
139 }
140
141 require DBM::Deep;
142
143 my $db = new DBM::Deep $db_file;
144
145 $log->logdie("DBM::Deep error: $!") unless ($db);
146
147 if ($db->error()) {
148 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
149 } else {
150 $log->debug("using file '$db_file' for DBM::Deep");
151 }
152
153 $self->{'db'} = $db;
154 }
155
156 $log->debug("filters defined: ",Dumper($self->{'filter'}));
157
158 return $self;
159 }
160
161 =head2 open_isis
162
163 Open CDS/ISIS, WinISIS or IsisMarc database using IsisDB or OpenIsis module
164 and read all records to memory.
165
166 $webpac->open_isis(
167 filename => '/data/ISIS/ISIS',
168 code_page => '852',
169 limit_mfn => 500,
170 start_mfn => 6000,
171 lookup => [ ... ],
172 );
173
174 By default, ISIS code page is assumed to be C<852>.
175
176 If optional parametar C<start_mfn> is set, this will be first MFN to read
177 from database (so you can skip beginning of your database if you need to).
178
179 If optional parametar C<limit_mfn> is set, it will read just 500 records
180 from database in example above.
181
182 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
183 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
184 value in index.
185
186 lookup => [
187 { 'key' => 'd:v900', 'val' => 'v250^a' },
188 { 'eval' => '"v901^a" eq "Podruèje"',
189 'key' => 'pa:v561^4:v562^4:v461^1',
190 'val' => 'v900' },
191 ]
192
193 Returns number of last record read into memory (size of database, really).
194
195 =cut
196
197 sub open_isis {
198 my $self = shift;
199 my $arg = {@_};
200
201 my $log = $self->_get_logger();
202
203 $log->logcroak("need filename") if (! $arg->{'filename'});
204 my $code_page = $arg->{'code_page'} || '852';
205
206 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
207
208 # store data in object
209 $self->{'isis_filename'} = $arg->{'filename'};
210 $self->{'isis_code_page'} = $code_page;
211
212 #$self->{'isis_code_page'} = $code_page;
213
214 # create Text::Iconv object
215 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
216
217 $log->info("reading ISIS database '",$arg->{'filename'},"'");
218 $log->debug("isis code page: $code_page");
219
220 my ($isis_db,$maxmfn);
221
222 if ($have_openisis) {
223 $log->debug("using OpenIsis perl bindings");
224 $isis_db = OpenIsis::open($arg->{'filename'});
225 $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
226 } elsif ($have_biblio_isis) {
227 $log->debug("using Biblio::Isis");
228 use Biblio::Isis;
229 $isis_db = new Biblio::Isis(
230 isisdb => $arg->{'filename'},
231 include_deleted => 1,
232 hash_filter => sub {
233 my $l = shift || return;
234 $l = $cp->convert($l);
235 return $l;
236 },
237 );
238 $maxmfn = $isis_db->count;
239
240 unless ($maxmfn) {
241 $log->logwarn("no records in database ", $arg->{'filename'}, ", skipping...");
242 return;
243 }
244
245 } else {
246 $log->logdie("Can't find supported ISIS library for perl. I suggent that you install Bilbio::Isis from CPAN.");
247 }
248
249
250 my $startmfn = 1;
251
252 if (my $s = $self->{'start_mfn'}) {
253 $log->info("skipping to MFN $s");
254 $startmfn = $s;
255 } else {
256 $self->{'start_mfn'} = $startmfn;
257 }
258
259 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
260
261 $log->info("processing ",($maxmfn-$startmfn)." records using ",( $have_openisis ? 'OpenIsis' : 'Biblio::Isis'));
262
263
264 # read database
265 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
266
267
268 $log->debug("mfn: $mfn\n");
269
270 my $rec;
271
272 if ($have_openisis) {
273
274 # read record using OpenIsis
275 my $row = OpenIsis::read( $isis_db, $mfn );
276 foreach my $k (keys %{$row}) {
277 if ($k ne "mfn") {
278 foreach my $l (@{$row->{$k}}) {
279 $l = $cp->convert($l);
280 # has subfields?
281 my $val;
282 if ($l =~ m/\^/) {
283 foreach my $t (split(/\^/,$l)) {
284 next if (! $t);
285 $val->{substr($t,0,1)} = substr($t,1);
286 }
287 } else {
288 $val = $l;
289 }
290
291 push @{$rec->{$k}}, $val;
292 }
293 } else {
294 push @{$rec->{'000'}}, $mfn;
295 }
296 }
297
298 } elsif ($have_biblio_isis) {
299 $rec = $isis_db->to_hash($mfn);
300 } else {
301 $log->logdie("hum? implementation missing?");
302 }
303
304 $log->confess("record $mfn empty?") unless ($rec);
305
306 # store
307 if ($self->{'low_mem'}) {
308 $self->{'db'}->put($mfn, $rec);
309 } else {
310 $self->{'data'}->{$mfn} = $rec;
311 }
312
313 # create lookup
314 $self->create_lookup($rec, @{$arg->{'lookup'}});
315
316 $self->progress_bar($mfn,$maxmfn);
317
318 }
319
320 $self->{'current_mfn'} = -1;
321 $self->{'last_pcnt'} = 0;
322
323 $log->debug("max mfn: $maxmfn");
324
325 # store max mfn and return it.
326 return $self->{'max_mfn'} = $maxmfn;
327 }
328
329 =head2 fetch_rec
330
331 Fetch next record from database. It will also display progress bar (once
332 it's implemented, that is).
333
334 my $rec = $webpac->fetch_rec;
335
336 =cut
337
338 sub fetch_rec {
339 my $self = shift;
340
341 my $log = $self->_get_logger();
342
343 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
344
345 if ($self->{'current_mfn'} == -1) {
346 $self->{'current_mfn'} = $self->{'start_mfn'};
347 } else {
348 $self->{'current_mfn'}++;
349 }
350
351 my $mfn = $self->{'current_mfn'};
352
353 if ($mfn > $self->{'max_mfn'}) {
354 $self->{'current_mfn'} = $self->{'max_mfn'};
355 $log->debug("at EOF");
356 return;
357 }
358
359 $self->progress_bar($mfn,$self->{'max_mfn'});
360
361 if ($self->{'low_mem'}) {
362 return $self->{'db'}->get($mfn);
363 } else {
364 return $self->{'data'}->{$mfn};
365 }
366 }
367
368 =head2 mfn
369
370 Returns current record number (MFN).
371
372 print $webpac->mfn;
373
374 =cut
375
376 sub mfn {
377 my $self = shift;
378 return $self->{'current_mfn'};
379 }
380
381 =head2 progress_bar
382
383 Draw progress bar on STDERR.
384
385 $webpac->progress_bar($current, $max);
386
387 =cut
388
389 sub progress_bar {
390 my $self = shift;
391
392 my ($curr,$max) = @_;
393
394 my $log = $self->_get_logger();
395
396 $log->logconfess("no current value!") if (! $curr);
397 $log->logconfess("no maximum value!") if (! $max);
398
399 if ($curr > $max) {
400 $max = $curr;
401 $log->debug("overflow to $curr");
402 }
403
404 $self->{'last_pcnt'} ||= 1;
405
406 my $p = int($curr * 100 / $max) || 1;
407
408 # reset on re-run
409 if ($p < $self->{'last_pcnt'}) {
410 $self->{'last_pcnt'} = $p;
411 $self->{'start_t'} = time();
412 }
413
414 if ($p != $self->{'last_pcnt'}) {
415
416 my $t = time();
417 my $rate = ($curr / ($t - $self->{'start_t'} || 1));
418 my $eta = ($max-$curr) / ($rate || 1);
419 printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
420 $self->{'last_pcnt'} = $p;
421 $self->{'last_curr'} = $curr;
422 }
423 print STDERR "\n" if ($p == 100);
424 }
425
426 =head2 fmt_time
427
428 Format time (in seconds) for display.
429
430 print $webpac->fmt_time(time());
431
432 This method is called by L<progress_bar> to display remaining time.
433
434 =cut
435
436 sub fmt_time {
437 my $self = shift;
438
439 my $t = shift || 0;
440 my $out = "";
441
442 my ($ss,$mm,$hh) = gmtime($t);
443 $out .= "${hh}h" if ($hh);
444 $out .= sprintf("%02d:%02d", $mm,$ss);
445 $out .= " " if ($hh == 0);
446 return $out;
447 }
448
449 =head2 open_import_xml
450
451 Read file from C<import_xml/> directory and parse it.
452
453 $webpac->open_import_xml(type => 'isis');
454
455 =cut
456
457 sub open_import_xml {
458 my $self = shift;
459
460 my $log = $self->_get_logger();
461
462 my $arg = {@_};
463 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
464
465 $self->{'type'} = $arg->{'type'};
466
467 my $type_base = $arg->{'type'};
468 $type_base =~ s/_.*$//g;
469
470 $self->{'tag'} = $type2tag{$type_base};
471
472 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
473
474 my $f = "./import_xml/".$self->{'type'}.".xml";
475 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
476
477 $log->info("reading '$f'");
478
479 $self->{'import_xml_file'} = $f;
480
481 $self->{'import_xml'} = XMLin($f,
482 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
483 );
484
485 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
486
487 }
488
489 =head2 create_lookup
490
491 Create lookup from record using lookup definition.
492
493 $self->create_lookup($rec, @lookups);
494
495 Called internally by C<open_*> methods.
496
497 =cut
498
499 sub create_lookup {
500 my $self = shift;
501
502 my $log = $self->_get_logger();
503
504 my $rec = shift || $log->logconfess("need record to create lookup");
505 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
506
507 foreach my $i (@_) {
508 $log->logconfess("need key") unless defined($i->{'key'});
509 $log->logconfess("need val") unless defined($i->{'val'});
510
511 if (defined($i->{'eval'})) {
512 # eval first, so we can skip fill_in for key and val
513 my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
514 if ($self->_eval($eval)) {
515 my $key = $self->fill_in($rec,$i->{'key'}) || next;
516 my @val = $self->fill_in($rec,$i->{'val'}) || next;
517 $log->debug("stored $key = ",sub { join(" | ",@val) });
518 push @{$self->{'lookup'}->{$key}}, @val;
519 }
520 } else {
521 my $key = $self->fill_in($rec,$i->{'key'}) || next;
522 my @val = $self->fill_in($rec,$i->{'val'}) || next;
523 $log->debug("stored $key = ",sub { join(" | ",@val) });
524 push @{$self->{'lookup'}->{$key}}, @val;
525 }
526 }
527 }
528
529 =head2 get_data
530
531 Returns value from record.
532
533 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
534
535 Arguments are:
536 record reference C<$rec>,
537 field C<$f>,
538 optional subfiled C<$sf>,
539 index for repeatable values C<$i>.
540
541 Optinal variable C<$found> will be incremeted if there
542 is field.
543
544 Returns value or empty string.
545
546 =cut
547
548 sub get_data {
549 my $self = shift;
550
551 my ($rec,$f,$sf,$i,$found) = @_;
552
553 if ($$rec->{$f}) {
554 return '' if (! $$rec->{$f}->[$i]);
555 no strict 'refs';
556 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
557 $$found++ if (defined($$found));
558 return $$rec->{$f}->[$i]->{$sf};
559 } elsif ($$rec->{$f}->[$i]) {
560 $$found++ if (defined($$found));
561 # it still might have subfield, just
562 # not specified, so we'll dump all
563 if ($$rec->{$f}->[$i] =~ /HASH/o) {
564 my $out;
565 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
566 $out .= $$rec->{$f}->[$i]->{$k}." ";
567 }
568 return $out;
569 } else {
570 return $$rec->{$f}->[$i];
571 }
572 }
573 } else {
574 return '';
575 }
576 }
577
578 =head2 fill_in
579
580 Workhourse of all: takes record from in-memory structure of database and
581 strings with placeholders and returns string or array of with substituted
582 values from record.
583
584 my $text = $webpac->fill_in($rec,'v250^a');
585
586 Optional argument is ordinal number for repeatable fields. By default,
587 it's assume to be first repeatable field (fields are perl array, so first
588 element is 0).
589 Following example will read second value from repeatable field.
590
591 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
592
593 This function B<does not> perform parsing of format to inteligenty skip
594 delimiters before fields which aren't used.
595
596 This method will automatically decode UTF-8 string to local code page
597 if needed.
598
599 =cut
600
601 sub fill_in {
602 my $self = shift;
603
604 my $log = $self->_get_logger();
605
606 my $rec = shift || $log->logconfess("need data record");
607 my $format = shift || $log->logconfess("need format to parse");
608 # iteration (for repeatable fields)
609 my $i = shift || 0;
610
611 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
612
613 # FIXME remove for speedup?
614 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
615
616 if (utf8::is_utf8($format)) {
617 $format = $self->_x($format);
618 }
619
620 my $found = 0;
621
622 my $eval_code;
623 # remove eval{...} from beginning
624 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
625
626 my $filter_name;
627 # remove filter{...} from beginning
628 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
629
630 # do actual replacement of placeholders
631 # repeatable fields
632 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
633 # non-repeatable fields
634 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
635
636 if ($found) {
637 $log->debug("format: $format");
638 if ($eval_code) {
639 my $eval = $self->fill_in($rec,$eval_code,$i);
640 return if (! $self->_eval($eval));
641 }
642 if ($filter_name && $self->{'filter'}->{$filter_name}) {
643 $log->debug("filter '$filter_name' for $format");
644 $format = $self->{'filter'}->{$filter_name}->($format);
645 return unless(defined($format));
646 $log->debug("filter result: $format");
647 }
648 # do we have lookups?
649 if ($format =~ /$LOOKUP_REGEX/o) {
650 $log->debug("format '$format' has lookup");
651 return $self->lookup($format);
652 } else {
653 return $format;
654 }
655 } else {
656 return;
657 }
658 }
659
660 =head2 lookup
661
662 Perform lookups on format supplied to it.
663
664 my $text = $self->lookup('[v900]');
665
666 Lookups can be nested (like C<[d:[a:[v900]]]>).
667
668 =cut
669
670 sub lookup {
671 my $self = shift;
672
673 my $log = $self->_get_logger();
674
675 my $tmp = shift || $log->logconfess("need format");
676
677 if ($tmp =~ /$LOOKUP_REGEX/o) {
678 my @in = ( $tmp );
679
680 $log->debug("lookup for: ",$tmp);
681
682 my @out;
683 while (my $f = shift @in) {
684 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
685 my $k = $1;
686 if ($self->{'lookup'}->{$k}) {
687 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
688 my $tmp2 = $f;
689 $tmp2 =~ s/lookup{$k}/$nv/g;
690 push @in, $tmp2;
691 }
692 } else {
693 undef $f;
694 }
695 } elsif ($f) {
696 push @out, $f;
697 }
698 }
699 $log->logconfess("return is array and it's not expected!") unless wantarray;
700 return @out;
701 } else {
702 return $tmp;
703 }
704 }
705
706 =head2 parse
707
708 Perform smart parsing of string, skipping delimiters for fields which aren't
709 defined. It can also eval code in format starting with C<eval{...}> and
710 return output or nothing depending on eval code.
711
712 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
713
714 =cut
715
716 sub parse {
717 my $self = shift;
718
719 my ($rec, $format_utf8, $i) = @_;
720
721 return if (! $format_utf8);
722
723 my $log = $self->_get_logger();
724
725 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
726 $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
727
728 $i = 0 if (! $i);
729
730 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
731
732 my @out;
733
734 $log->debug("format: $format");
735
736 my $eval_code;
737 # remove eval{...} from beginning
738 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
739
740 my $filter_name;
741 # remove filter{...} from beginning
742 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
743
744 my $prefix;
745 my $all_found=0;
746
747 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
748
749 my $del = $1 || '';
750 $prefix ||= $del if ($all_found == 0);
751
752 # repeatable index
753 my $r = $i;
754 $r = 0 if (lc("$2") eq 's');
755
756 my $found = 0;
757 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
758
759 if ($found) {
760 push @out, $del;
761 push @out, $tmp;
762 $all_found += $found;
763 }
764 }
765
766 return if (! $all_found);
767
768 my $out = join('',@out);
769
770 if ($out) {
771 # add rest of format (suffix)
772 $out .= $format;
773
774 # add prefix if not there
775 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
776
777 $log->debug("result: $out");
778 }
779
780 if ($eval_code) {
781 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
782 $log->debug("about to eval{$eval} format: $out");
783 return if (! $self->_eval($eval));
784 }
785
786 if ($filter_name && $self->{'filter'}->{$filter_name}) {
787 $log->debug("about to filter{$filter_name} format: $out");
788 $out = $self->{'filter'}->{$filter_name}->($out);
789 return unless(defined($out));
790 $log->debug("filter result: $out");
791 }
792
793 return $out;
794 }
795
796 =head2 parse_to_arr
797
798 Similar to C<parse>, but returns array of all repeatable fields
799
800 my @arr = $webpac->parse_to_arr($rec,'v250^a');
801
802 =cut
803
804 sub parse_to_arr {
805 my $self = shift;
806
807 my ($rec, $format_utf8) = @_;
808
809 my $log = $self->_get_logger();
810
811 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
812 return if (! $format_utf8);
813
814 my $i = 0;
815 my @arr;
816
817 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
818 push @arr, $v;
819 }
820
821 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
822
823 return @arr;
824 }
825
826 =head2 fill_in_to_arr
827
828 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
829 for fields which have lookups, so they shouldn't be parsed but rather
830 C<fill_id>ed.
831
832 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
833
834 =cut
835
836 sub fill_in_to_arr {
837 my $self = shift;
838
839 my ($rec, $format_utf8) = @_;
840
841 my $log = $self->_get_logger();
842
843 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
844 return if (! $format_utf8);
845
846 my $i = 0;
847 my @arr;
848
849 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
850 push @arr, @v;
851 }
852
853 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
854
855 return @arr;
856 }
857
858 =head2 sort_arr
859
860 Sort array ignoring case and html in data
861
862 my @sorted = $webpac->sort_arr(@unsorted);
863
864 =cut
865
866 sub sort_arr {
867 my $self = shift;
868
869 my $log = $self->_get_logger();
870
871 # FIXME add Schwartzian Transformation?
872
873 my @sorted = sort {
874 $a =~ s#<[^>]+/*>##;
875 $b =~ s#<[^>]+/*>##;
876 lc($b) cmp lc($a)
877 } @_;
878 $log->debug("sorted values: ",sub { join(", ",@sorted) });
879
880 return @sorted;
881 }
882
883
884 =head2 data_structure
885
886 Create in-memory data structure which represents layout from C<import_xml>.
887 It is used later to produce output.
888
889 my @ds = $webpac->data_structure($rec);
890
891 This method will also set C<$webpac->{'currnet_filename'}> if there is
892 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
893 <headline> tag.
894
895 =cut
896
897 sub data_structure {
898 my $self = shift;
899
900 my $log = $self->_get_logger();
901
902 my $rec = shift;
903 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
904
905 undef $self->{'currnet_filename'};
906 undef $self->{'headline'};
907
908 my @sorted_tags;
909 if ($self->{tags_by_order}) {
910 @sorted_tags = @{$self->{tags_by_order}};
911 } else {
912 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
913 $self->{tags_by_order} = \@sorted_tags;
914 }
915
916 my @ds;
917
918 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
919
920 foreach my $field (@sorted_tags) {
921
922 my $row;
923
924 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
925
926 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
927 my $format = $tag->{'value'} || $tag->{'content'};
928
929 $log->debug("format: $format");
930
931 my @v;
932 if ($format =~ /$LOOKUP_REGEX/o) {
933 @v = $self->fill_in_to_arr($rec,$format);
934 } else {
935 @v = $self->parse_to_arr($rec,$format);
936 }
937 next if (! @v);
938
939 if ($tag->{'sort'}) {
940 @v = $self->sort_arr(@v);
941 }
942
943 # use format?
944 if ($tag->{'format_name'}) {
945 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
946 }
947
948 if ($field eq 'filename') {
949 $self->{'current_filename'} = join('',@v);
950 $log->debug("filename: ",$self->{'current_filename'});
951 } elsif ($field eq 'headline') {
952 $self->{'headline'} .= join('',@v);
953 $log->debug("headline: ",$self->{'headline'});
954 next; # don't return headline in data_structure!
955 }
956
957 # delimiter will join repeatable fields
958 if ($tag->{'delimiter'}) {
959 @v = ( join($tag->{'delimiter'}, @v) );
960 }
961
962 # default types
963 my @types = qw(display swish);
964 # override by type attribute
965 @types = ( $tag->{'type'} ) if ($tag->{'type'});
966
967 foreach my $type (@types) {
968 # append to previous line?
969 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
970 if ($tag->{'append'}) {
971
972 # I will delimit appended part with
973 # delimiter (or ,)
974 my $d = $tag->{'delimiter'};
975 # default delimiter
976 $d ||= " ";
977
978 my $last = pop @{$row->{$type}};
979 $d = "" if (! $last);
980 $last .= $d . join($d, @v);
981 push @{$row->{$type}}, $last;
982
983 } else {
984 push @{$row->{$type}}, @v;
985 }
986 }
987
988
989 }
990
991 if ($row) {
992 $row->{'tag'} = $field;
993
994 # TODO: name_sigular, name_plural
995 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
996 $row->{'name'} = $name ? $self->_x($name) : $field;
997
998 # post-sort all values in field
999 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
1000 $log->warn("sort at field tag not implemented");
1001 }
1002
1003 push @ds, $row;
1004
1005 $log->debug("row $field: ",sub { Dumper($row) });
1006 }
1007
1008 }
1009
1010 return @ds;
1011
1012 }
1013
1014 =head2 output
1015
1016 Create output from in-memory data structure using Template Toolkit template.
1017
1018 my $text = $webpac->output( template => 'text.tt', data => @ds );
1019
1020 =cut
1021
1022 sub output {
1023 my $self = shift;
1024
1025 my $args = {@_};
1026
1027 my $log = $self->_get_logger();
1028
1029 $log->logconfess("need template name") if (! $args->{'template'});
1030 $log->logconfess("need data array") if (! $args->{'data'});
1031
1032 my $out;
1033
1034 $self->{'tt'}->process(
1035 $args->{'template'},
1036 $args,
1037 \$out
1038 ) || confess $self->{'tt'}->error();
1039
1040 return $out;
1041 }
1042
1043 =head2 output_file
1044
1045 Create output from in-memory data structure using Template Toolkit template
1046 to a file.
1047
1048 $webpac->output_file(
1049 file => 'out.txt',
1050 template => 'text.tt',
1051 data => @ds
1052 );
1053
1054 =cut
1055
1056 sub output_file {
1057 my $self = shift;
1058
1059 my $args = {@_};
1060
1061 my $log = $self->_get_logger();
1062
1063 my $file = $args->{'file'} || $log->logconfess("need file name");
1064
1065 $log->debug("creating file ",$file);
1066
1067 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1068 print $fh $self->output(
1069 template => $args->{'template'},
1070 data => $args->{'data'},
1071 ) || $log->logdie("print: $!");
1072 close($fh) || $log->logdie("close: $!");
1073 }
1074
1075 =head2 apply_format
1076
1077 Apply format specified in tag with C<format_name="name"> and
1078 C<format_delimiter=";;">.
1079
1080 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1081
1082 Formats can contain C<lookup{...}> if you need them.
1083
1084 =cut
1085
1086 sub apply_format {
1087 my $self = shift;
1088
1089 my ($name,$delimiter,$data) = @_;
1090
1091 my $log = $self->_get_logger();
1092
1093 if (! $self->{'import_xml'}->{'format'}->{$name}) {
1094 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1095 return $data;
1096 }
1097
1098 $log->warn("no delimiter for format $name") if (! $delimiter);
1099
1100 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1101
1102 my @data = split(/\Q$delimiter\E/, $data);
1103
1104 my $out = sprintf($format, @data);
1105 $log->debug("using format $name [$format] on $data to produce: $out");
1106
1107 if ($out =~ m/$LOOKUP_REGEX/o) {
1108 return $self->lookup($out);
1109 } else {
1110 return $out;
1111 }
1112
1113 }
1114
1115
1116 #
1117 #
1118 #
1119
1120 =head1 INTERNAL METHODS
1121
1122 Here is a quick list of internal methods, mostly useful to turn debugging
1123 on them (see L<LOGGING> below for explanation).
1124
1125 =cut
1126
1127 =head2 _eval
1128
1129 Internal function to eval code without C<strict 'subs'>.
1130
1131 =cut
1132
1133 sub _eval {
1134 my $self = shift;
1135
1136 my $code = shift || return;
1137
1138 my $log = $self->_get_logger();
1139
1140 no strict 'subs';
1141 my $ret = eval $code;
1142 if ($@) {
1143 $log->error("problem with eval code [$code]: $@");
1144 }
1145
1146 $log->debug("eval: ",$code," [",$ret,"]");
1147
1148 return $ret || undef;
1149 }
1150
1151 =head2 _sort_by_order
1152
1153 Sort xml tags data structure accoding to C<order=""> attribute.
1154
1155 =cut
1156
1157 sub _sort_by_order {
1158 my $self = shift;
1159
1160 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1161 $self->{'import_xml'}->{'indexer'}->{$a};
1162 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1163 $self->{'import_xml'}->{'indexer'}->{$b};
1164
1165 return $va <=> $vb;
1166 }
1167
1168 =head2 _get_logger
1169
1170 Get C<Log::Log4perl> object with a twist: domains are defined for each
1171 method
1172
1173 my $log = $webpac->_get_logger();
1174
1175 =cut
1176
1177 sub _get_logger {
1178 my $self = shift;
1179
1180 my $name = (caller(1))[3] || caller;
1181 return get_logger($name);
1182 }
1183
1184 =head2 _x
1185
1186 Convert string from UTF-8 to code page defined in C<import_xml>.
1187
1188 my $text = $webpac->_x('utf8 text');
1189
1190 =cut
1191
1192 sub _x {
1193 my $self = shift;
1194 my $utf8 = shift || return;
1195
1196 return $self->{'utf2cp'}->convert($utf8) ||
1197 $self->_get_logger()->logwarn("can't convert '$utf8'");
1198 }
1199
1200 #
1201 #
1202 #
1203
1204 =head1 LOGGING
1205
1206 Logging in WebPAC is performed by L<Log::Log4perl> with config file
1207 C<log.conf>.
1208
1209 Methods defined above have different levels of logging, so
1210 it's descriptions will be useful to turn (mostry B<debug> logging) on
1211 or off to see why WabPAC isn't perforing as you expect it (it might even
1212 be a bug!).
1213
1214 B<This is different from normal Log4perl behaviour>. To repeat, you can
1215 also use method names, and not only classes (which are just few)
1216 to filter logging.
1217
1218
1219 =head1 MEMORY USAGE
1220
1221 C<low_mem> options is double-edged sword. If enabled, WebPAC
1222 will run on memory constraint machines (which doesn't have enough
1223 physical RAM to create memory structure for whole source database).
1224
1225 If your machine has 512Mb or more of RAM and database is around 10000 records,
1226 memory shouldn't be an issue. If you don't have enough physical RAM, you
1227 might consider using virtual memory (if your operating system is handling it
1228 well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1229 parsed structure of ISIS database (this is what C<low_mem> option does).
1230
1231 Hitting swap at end of reading source database is probably o.k. However,
1232 hitting swap before 90% will dramatically decrease performance and you will
1233 be better off with C<low_mem> and using rest of availble memory for
1234 operating system disk cache (Linux is particuallary good about this).
1235 However, every access to database record will require disk access, so
1236 generation phase will be slower 10-100 times.
1237
1238 Parsed structures are essential - you just have option to trade RAM memory
1239 (which is fast) for disk space (which is slow). Be sure to have planty of
1240 disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1241
1242 However, when WebPAC is running on desktop machines (or laptops :-), it's
1243 highly undesireable for system to start swapping. Using C<low_mem> option can
1244 reduce WecPAC memory usage to around 64Mb for same database with lookup
1245 fields and sorted indexes which stay in RAM. Performance will suffer, but
1246 memory usage will really be minimal. It might be also more confortable to
1247 run WebPAC reniced on those machines.
1248
1249 =cut
1250
1251 1;

  ViewVC Help
Powered by ViewVC 1.1.26