/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 441 - (show annotations)
Tue Sep 14 17:07:59 2004 UTC (19 years, 6 months ago) by dpavlin
File size: 25980 byte(s)
refactore tree generation into WebPAC::Tree

1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10 use Template;
11 use Log::Log4perl qw(get_logger :levels);
12 use Time::HiRes qw(time);
13
14 use Data::Dumper;
15
16 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17 #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18 my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19 my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20
21 =head1 NAME
22
23 WebPAC - base class for WebPAC
24
25 =head1 DESCRIPTION
26
27 This module implements methods used by WebPAC.
28
29 =head1 METHODS
30
31 =head2 new
32
33 Create new instance of WebPAC using configuration specified by C<config_file>.
34
35 my $webpac = new WebPAC(
36 config_file => 'name.conf',
37 code_page => 'ISO-8859-2',
38 low_mem => 1,
39 );
40
41 Default C<code_page> is C<ISO-8859-2>.
42
43 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44
45 This method will also read configuration files
46 C<global.conf> (used by indexer and Web font-end)
47 and configuration file specified by C<config_file>
48 which describes databases to be indexed.
49
50 =cut
51
52 # mapping between data type and tag which specify
53 # format in XML file
54 my %type2tag = (
55 'isis' => 'isis',
56 # 'excel' => 'column',
57 # 'marc' => 'marc',
58 # 'feed' => 'feed'
59 );
60
61 sub new {
62 my $class = shift;
63 my $self = {@_};
64 bless($self, $class);
65
66 $self->{'start_t'} = time();
67
68 my $log_file = $self->{'log'} || "log.conf";
69 Log::Log4perl->init($log_file);
70
71 my $log = $self->_get_logger();
72
73 # fill in default values
74 # output codepage
75 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
76
77 #
78 # read global.conf
79 #
80 $log->debug("read 'global.conf'");
81
82 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
83
84 # read global config parametars
85 foreach my $var (qw(
86 dbi_dbd
87 dbi_dsn
88 dbi_user
89 dbi_passwd
90 show_progress
91 my_unac_filter
92 output_template
93 )) {
94 $self->{'global_config'}->{$var} = $config->val('global', $var);
95 }
96
97 #
98 # read indexer config file
99 #
100
101 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
102
103 # create UTF-8 convertor for import_xml files
104 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
105
106 # create Template toolkit instance
107 $self->{'tt'} = Template->new(
108 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
109 # FILTERS => {
110 # 'foo' => \&foo_filter,
111 # },
112 EVAL_PERL => 1,
113 );
114
115 # running with low_mem flag? well, use DBM::Deep then.
116 if ($self->{'low_mem'}) {
117 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118
119 my $db_file = "data.db";
120
121 if (-e $db_file) {
122 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123 $log->debug("removed '$db_file' from last run");
124 }
125
126 require DBM::Deep;
127
128 my $db = new DBM::Deep $db_file;
129
130 $log->logdie("DBM::Deep error: $!") unless ($db);
131
132 if ($db->error()) {
133 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134 } else {
135 $log->debug("using file '$db_file' for DBM::Deep");
136 }
137
138 $self->{'db'} = $db;
139 }
140
141 return $self;
142 }
143
144 =head2 open_isis
145
146 Open CDS/ISIS database using OpenIsis module and read all records to memory.
147
148 $webpac->open_isis(
149 filename => '/data/ISIS/ISIS',
150 code_page => '852',
151 limit_mfn => 500,
152 start_mfn => 6000,
153 lookup => [ ... ],
154 );
155
156 By default, ISIS code page is assumed to be C<852>.
157
158 If optional parametar C<start_mfn> is set, this will be first MFN to read
159 from database (so you can skip beginning of your database if you need to).
160
161 If optional parametar C<limit_mfn> is set, it will read just 500 records
162 from database in example above.
163
164 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
165 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
166 value in index.
167
168 lookup => [
169 { 'key' => 'd:v900', 'val' => 'v250^a' },
170 { 'eval' => '"v901^a" eq "Podruèje"',
171 'key' => 'pa:v561^4:v562^4:v461^1',
172 'val' => 'v900' },
173 ]
174
175 Returns number of last record read into memory (size of database, really).
176
177 =cut
178
179 sub open_isis {
180 my $self = shift;
181 my $arg = {@_};
182
183 my $log = $self->_get_logger();
184
185 $log->logcroak("need filename") if (! $arg->{'filename'});
186 my $code_page = $arg->{'code_page'} || '852';
187
188 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
189
190 # store data in object
191 $self->{'isis_filename'} = $arg->{'filename'};
192 $self->{'isis_code_page'} = $code_page;
193
194 use OpenIsis;
195
196 #$self->{'isis_code_page'} = $code_page;
197
198 # create Text::Iconv object
199 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
200
201 $log->info("reading ISIS database '",$arg->{'filename'},"'");
202 $log->debug("isis code page: $code_page");
203
204 my $isis_db = OpenIsis::open($arg->{'filename'});
205
206 my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
207 my $startmfn = 1;
208
209 if (my $s = $self->{'start_mfn'}) {
210 $log->info("skipping to MFN $s");
211 $startmfn = $s;
212 }
213
214 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
215
216 $log->info("processing ",($maxmfn-$startmfn)." records...");
217
218 # read database
219 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
220
221
222 $log->debug("mfn: $mfn\n");
223
224 my $rec;
225
226 # read record
227 my $row = OpenIsis::read( $isis_db, $mfn );
228 foreach my $k (keys %{$row}) {
229 if ($k ne "mfn") {
230 foreach my $l (@{$row->{$k}}) {
231 $l = $cp->convert($l);
232 # has subfields?
233 my $val;
234 if ($l =~ m/\^/) {
235 foreach my $t (split(/\^/,$l)) {
236 next if (! $t);
237 $val->{substr($t,0,1)} = substr($t,1);
238 }
239 } else {
240 $val = $l;
241 }
242
243 push @{$rec->{$k}}, $val;
244 }
245 } else {
246 push @{$rec->{'000'}}, $mfn;
247 }
248
249 }
250
251 $log->confess("record $mfn empty?") unless ($rec);
252
253 # store
254 if ($self->{'low_mem'}) {
255 $self->{'db'}->put($mfn, $rec);
256 } else {
257 $self->{'data'}->{$mfn} = $rec;
258 }
259
260 # create lookup
261 $self->create_lookup($rec, @{$arg->{'lookup'}});
262
263 $self->progress_bar($mfn,$maxmfn);
264
265 }
266
267 $self->{'current_mfn'} = $startmfn;
268 $self->{'last_pcnt'} = 0;
269
270 $log->debug("max mfn: $maxmfn");
271
272 # store max mfn and return it.
273 return $self->{'max_mfn'} = $maxmfn;
274 }
275
276 =head2 fetch_rec
277
278 Fetch next record from database. It will also display progress bar (once
279 it's implemented, that is).
280
281 my $rec = $webpac->fetch_rec;
282
283 =cut
284
285 sub fetch_rec {
286 my $self = shift;
287
288 my $log = $self->_get_logger();
289
290 my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");
291
292 if ($mfn > $self->{'max_mfn'}) {
293 $self->{'current_mfn'} = $self->{'max_mfn'};
294 $log->debug("at EOF");
295 return;
296 }
297
298 $self->progress_bar($mfn,$self->{'max_mfn'});
299
300 if ($self->{'low_mem'}) {
301 return $self->{'db'}->get($mfn);
302 } else {
303 return $self->{'data'}->{$mfn};
304 }
305 }
306
307 =head2 progress_bar
308
309 Draw progress bar on STDERR.
310
311 $webpac->progress_bar($current, $max);
312
313 =cut
314
315 sub progress_bar {
316 my $self = shift;
317
318 my ($curr,$max) = @_;
319
320 my $log = $self->_get_logger();
321
322 $log->logconfess("no current value!") if (! $curr);
323 $log->logconfess("no maximum value!") if (! $max);
324
325 if ($curr > $max) {
326 $max = $curr;
327 $log->debug("overflow to $curr");
328 }
329
330 $self->{'last_pcnt'} ||= 1;
331
332 my $p = int($curr * 100 / $max);
333
334 # reset on re-run
335 if ($p < $self->{'last_pcnt'}) {
336 $self->{'last_pcnt'} = $p;
337 $self->{'last_t'} = time();
338 $self->{'last_curr'} = 1;
339 }
340
341 if ($p != $self->{'last_pcnt'}) {
342
343 my $last_curr = $self->{'last_curr'} || $curr;
344 my $t = time();
345 my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
346 my $eta = ($max-$curr) / ($rate || 1);
347 printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
348 $self->{'last_pcnt'} = $p;
349 $self->{'last_t'} = time();
350 $self->{'last_curr'} = $curr;
351 }
352 print STDERR "\n" if ($p == 100);
353 }
354
355 =head2 fmt_time
356
357 Format time (in seconds) for display.
358
359 print $webpac->fmt_time(time());
360
361 This method is called by L<progress_bar> to display remaining time.
362
363 =cut
364
365 sub fmt_time {
366 my $self = shift;
367
368 my $t = shift || 0;
369 my $out = "";
370
371 my ($ss,$mm,$hh) = gmtime($t);
372 $out .= "${hh}h" if ($hh);
373 $out .= sprintf("%02d:%02d", $mm,$ss);
374 $out .= " " if ($hh == 0);
375 return $out;
376 }
377
378 =head2 open_import_xml
379
380 Read file from C<import_xml/> directory and parse it.
381
382 $webpac->open_import_xml(type => 'isis');
383
384 =cut
385
386 sub open_import_xml {
387 my $self = shift;
388
389 my $log = $self->_get_logger();
390
391 my $arg = {@_};
392 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
393
394 $self->{'type'} = $arg->{'type'};
395
396 my $type_base = $arg->{'type'};
397 $type_base =~ s/_.*$//g;
398
399 $self->{'tag'} = $type2tag{$type_base};
400
401 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
402
403 my $f = "./import_xml/".$self->{'type'}.".xml";
404 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
405
406 $log->info("reading '$f'");
407
408 $self->{'import_xml_file'} = $f;
409
410 $self->{'import_xml'} = XMLin($f,
411 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
412 );
413
414 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
415
416 }
417
418 =head2 create_lookup
419
420 Create lookup from record using lookup definition.
421
422 $self->create_lookup($rec, @lookups);
423
424 Called internally by C<open_*> methods.
425
426 =cut
427
428 sub create_lookup {
429 my $self = shift;
430
431 my $log = $self->_get_logger();
432
433 my $rec = shift || $log->logconfess("need record to create lookup");
434 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
435
436 foreach my $i (@_) {
437 $log->logconfess("need key") unless defined($i->{'key'});
438 $log->logconfess("need val") unless defined($i->{'val'});
439
440 if (defined($i->{'eval'})) {
441 # eval first, so we can skip fill_in for key and val
442 my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
443 if ($self->_eval($eval)) {
444 my $key = $self->fill_in($rec,$i->{'key'}) || next;
445 my @val = $self->fill_in($rec,$i->{'val'}) || next;
446 $log->debug("stored $key = ",sub { join(" | ",@val) });
447 push @{$self->{'lookup'}->{$key}}, @val;
448 }
449 } else {
450 my $key = $self->fill_in($rec,$i->{'key'}) || next;
451 my @val = $self->fill_in($rec,$i->{'val'}) || next;
452 $log->debug("stored $key = ",sub { join(" | ",@val) });
453 push @{$self->{'lookup'}->{$key}}, @val;
454 }
455 }
456 }
457
458 =head2 get_data
459
460 Returns value from record.
461
462 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
463
464 Arguments are:
465 record reference C<$rec>,
466 field C<$f>,
467 optional subfiled C<$sf>,
468 index for repeatable values C<$i>.
469
470 Optinal variable C<$found> will be incremeted if there
471 is field.
472
473 Returns value or empty string.
474
475 =cut
476
477 sub get_data {
478 my $self = shift;
479
480 my ($rec,$f,$sf,$i,$found) = @_;
481
482 if ($$rec->{$f}) {
483 return '' if (! $$rec->{$f}->[$i]);
484 no strict 'refs';
485 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
486 $$found++ if (defined($$found));
487 return $$rec->{$f}->[$i]->{$sf};
488 } elsif ($$rec->{$f}->[$i]) {
489 $$found++ if (defined($$found));
490 # it still might have subfield, just
491 # not specified, so we'll dump all
492 if ($$rec->{$f}->[$i] =~ /HASH/o) {
493 my $out;
494 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
495 $out .= $$rec->{$f}->[$i]->{$k}." ";
496 }
497 return $out;
498 } else {
499 return $$rec->{$f}->[$i];
500 }
501 }
502 } else {
503 return '';
504 }
505 }
506
507 =head2 fill_in
508
509 Workhourse of all: takes record from in-memory structure of database and
510 strings with placeholders and returns string or array of with substituted
511 values from record.
512
513 my $text = $webpac->fill_in($rec,'v250^a');
514
515 Optional argument is ordinal number for repeatable fields. By default,
516 it's assume to be first repeatable field (fields are perl array, so first
517 element is 0).
518 Following example will read second value from repeatable field.
519
520 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
521
522 This function B<does not> perform parsing of format to inteligenty skip
523 delimiters before fields which aren't used.
524
525 This method will automatically decode UTF-8 string to local code page
526 if needed.
527
528 =cut
529
530 sub fill_in {
531 my $self = shift;
532
533 my $log = $self->_get_logger();
534
535 my $rec = shift || $log->logconfess("need data record");
536 my $format = shift || $log->logconfess("need format to parse");
537 # iteration (for repeatable fields)
538 my $i = shift || 0;
539
540 # FIXME remove for speedup?
541 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
542
543 if (utf8::is_utf8($format)) {
544 $format = $self->_x($format);
545 }
546
547 my $found = 0;
548
549 my $eval_code;
550 # remove eval{...} from beginning
551 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
552
553 # do actual replacement of placeholders
554 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
555
556 if ($found) {
557 $log->debug("format: $format");
558 if ($eval_code) {
559 my $eval = $self->fill_in($rec,$eval_code,$i);
560 return if (! $self->_eval($eval));
561 }
562 # do we have lookups?
563 if ($format =~ /$LOOKUP_REGEX/o) {
564 $log->debug("format '$format' has lookup");
565 return $self->lookup($format);
566 } else {
567 return $format;
568 }
569 } else {
570 return;
571 }
572 }
573
574 =head2 lookup
575
576 Perform lookups on format supplied to it.
577
578 my $text = $self->lookup('[v900]');
579
580 Lookups can be nested (like C<[d:[a:[v900]]]>).
581
582 =cut
583
584 sub lookup {
585 my $self = shift;
586
587 my $log = $self->_get_logger();
588
589 my $tmp = shift || $log->logconfess("need format");
590
591 if ($tmp =~ /$LOOKUP_REGEX/o) {
592 my @in = ( $tmp );
593
594 $log->debug("lookup for: ",$tmp);
595
596 my @out;
597 while (my $f = shift @in) {
598 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
599 my $k = $1;
600 if ($self->{'lookup'}->{$k}) {
601 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
602 my $tmp2 = $f;
603 $tmp2 =~ s/lookup{$k}/$nv/g;
604 push @in, $tmp2;
605 }
606 } else {
607 undef $f;
608 }
609 } elsif ($f) {
610 push @out, $f;
611 }
612 }
613 $log->logconfess("return is array and it's not expected!") unless wantarray;
614 return @out;
615 } else {
616 return $tmp;
617 }
618 }
619
620 =head2 parse
621
622 Perform smart parsing of string, skipping delimiters for fields which aren't
623 defined. It can also eval code in format starting with C<eval{...}> and
624 return output or nothing depending on eval code.
625
626 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
627
628 =cut
629
630 sub parse {
631 my $self = shift;
632
633 my ($rec, $format_utf8, $i) = @_;
634
635 return if (! $format_utf8);
636
637 my $log = $self->_get_logger();
638
639 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
640 $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
641
642 $i = 0 if (! $i);
643
644 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
645
646 my @out;
647
648 $log->debug("format: $format");
649
650 my $eval_code;
651 # remove eval{...} from beginning
652 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
653
654 my $prefix;
655 my $all_found=0;
656
657 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
658
659 my $del = $1 || '';
660 $prefix ||= $del if ($all_found == 0);
661
662 my $found = 0;
663 my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
664
665 if ($found) {
666 push @out, $del;
667 push @out, $tmp;
668 $all_found += $found;
669 }
670 }
671
672 return if (! $all_found);
673
674 my $out = join('',@out);
675
676 if ($out) {
677 # add rest of format (suffix)
678 $out .= $format;
679
680 # add prefix if not there
681 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
682
683 $log->debug("result: $out");
684 }
685
686 if ($eval_code) {
687 my $eval = $self->fill_in($rec,$eval_code,$i);
688 $log->debug("about to eval{",$eval,"} format: $out");
689 return if (! $self->_eval($eval));
690 }
691
692 return $out;
693 }
694
695 =head2 parse_to_arr
696
697 Similar to C<parse>, but returns array of all repeatable fields
698
699 my @arr = $webpac->parse_to_arr($rec,'v250^a');
700
701 =cut
702
703 sub parse_to_arr {
704 my $self = shift;
705
706 my ($rec, $format_utf8) = @_;
707
708 my $log = $self->_get_logger();
709
710 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
711 return if (! $format_utf8);
712
713 my $i = 0;
714 my @arr;
715
716 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
717 push @arr, $v;
718 }
719
720 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
721
722 return @arr;
723 }
724
725 =head2 fill_in_to_arr
726
727 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
728 for fields which have lookups, so they shouldn't be parsed but rather
729 C<fill_id>ed.
730
731 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
732
733 =cut
734
735 sub fill_in_to_arr {
736 my $self = shift;
737
738 my ($rec, $format_utf8) = @_;
739
740 my $log = $self->_get_logger();
741
742 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
743 return if (! $format_utf8);
744
745 my $i = 0;
746 my @arr;
747
748 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
749 push @arr, @v;
750 }
751
752 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
753
754 return @arr;
755 }
756
757 =head2 sort_arr
758
759 Sort array ignoring case and html in data
760
761 my @sorted = $webpac->sort_arr(@unsorted);
762
763 =cut
764
765 sub sort_arr {
766 my $self = shift;
767
768 my $log = $self->_get_logger();
769
770 # FIXME add Schwartzian Transformation?
771
772 my @sorted = sort {
773 $a =~ s#<[^>]+/*>##;
774 $b =~ s#<[^>]+/*>##;
775 lc($b) cmp lc($a)
776 } @_;
777 $log->debug("sorted values: ",sub { join(", ",@sorted) });
778
779 return @sorted;
780 }
781
782
783 =head2 data_structure
784
785 Create in-memory data structure which represents layout from C<import_xml>.
786 It is used later to produce output.
787
788 my @ds = $webpac->data_structure($rec);
789
790 This method will also set C<$webpac->{'currnet_filename'}> if there is
791 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
792 <headline> tag.
793
794 =cut
795
796 sub data_structure {
797 my $self = shift;
798
799 my $log = $self->_get_logger();
800
801 my $rec = shift;
802 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
803
804 undef $self->{'currnet_filename'};
805 undef $self->{'headline'};
806
807 my @sorted_tags;
808 if ($self->{tags_by_order}) {
809 @sorted_tags = @{$self->{tags_by_order}};
810 } else {
811 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
812 $self->{tags_by_order} = \@sorted_tags;
813 }
814
815 my @ds;
816
817 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
818
819 foreach my $field (@sorted_tags) {
820
821 my $row;
822
823 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
824
825 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
826 my $format = $tag->{'value'} || $tag->{'content'};
827
828 $log->debug("format: $format");
829
830 my @v;
831 if ($format =~ /$LOOKUP_REGEX/o) {
832 @v = $self->fill_in_to_arr($rec,$format);
833 } else {
834 @v = $self->parse_to_arr($rec,$format);
835 }
836 next if (! @v);
837
838 if ($tag->{'sort'}) {
839 @v = $self->sort_arr(@v);
840 $log->warn("sort within tag is usually not what you want!");
841 }
842
843 # use format?
844 if ($tag->{'format_name'}) {
845 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
846 }
847
848 if ($field eq 'filename') {
849 $self->{'current_filename'} = join('',@v);
850 $log->debug("filename: ",$self->{'current_filename'});
851 } elsif ($field eq 'headline') {
852 $self->{'headline'} .= join('',@v);
853 $log->debug("headline: ",$self->{'headline'});
854 next; # don't return headline in data_structure!
855 }
856
857 # delimiter will join repeatable fields
858 if ($tag->{'delimiter'}) {
859 @v = ( join($tag->{'delimiter'}, @v) );
860 }
861
862 # default types
863 my @types = qw(display swish);
864 # override by type attribute
865 @types = ( $tag->{'type'} ) if ($tag->{'type'});
866
867 foreach my $type (@types) {
868 # append to previous line?
869 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
870 if ($tag->{'append'}) {
871
872 # I will delimit appended part with
873 # delimiter (or ,)
874 my $d = $tag->{'delimiter'};
875 # default delimiter
876 $d ||= ", ";
877
878 my $last = pop @{$row->{$type}};
879 $d = "" if (! $last);
880 $last .= $d . join($d, @v);
881 push @{$row->{$type}}, $last;
882
883 } else {
884 push @{$row->{$type}}, @v;
885 }
886 }
887
888
889 }
890
891 if ($row) {
892 $row->{'tag'} = $field;
893
894 # TODO: name_sigular, name_plural
895 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
896 $row->{'name'} = $name ? $self->_x($name) : $field;
897
898 # post-sort all values in field
899 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
900 $log->warn("sort at field tag not implemented");
901 }
902
903 push @ds, $row;
904
905 $log->debug("row $field: ",sub { Dumper($row) });
906 }
907
908 }
909
910 return @ds;
911
912 }
913
914 =head2 output
915
916 Create output from in-memory data structure using Template Toolkit template.
917
918 my $text = $webpac->output( template => 'text.tt', data => @ds );
919
920 =cut
921
922 sub output {
923 my $self = shift;
924
925 my $args = {@_};
926
927 my $log = $self->_get_logger();
928
929 $log->logconfess("need template name") if (! $args->{'template'});
930 $log->logconfess("need data array") if (! $args->{'data'});
931
932 my $out;
933
934 $self->{'tt'}->process(
935 $args->{'template'},
936 $args,
937 \$out
938 ) || confess $self->{'tt'}->error();
939
940 return $out;
941 }
942
943 =head2 output_file
944
945 Create output from in-memory data structure using Template Toolkit template
946 to a file.
947
948 $webpac->output_file(
949 file => 'out.txt',
950 template => 'text.tt',
951 data => @ds
952 );
953
954 =cut
955
956 sub output_file {
957 my $self = shift;
958
959 my $args = {@_};
960
961 my $log = $self->_get_logger();
962
963 my $file = $args->{'file'} || $log->logconfess("need file name");
964
965 $log->debug("creating file ",$file);
966
967 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
968 print $fh $self->output(
969 template => $args->{'template'},
970 data => $args->{'data'},
971 ) || $log->logdie("print: $!");
972 close($fh) || $log->logdie("close: $!");
973 }
974
975 =head2 apply_format
976
977 Apply format specified in tag with C<format_name="name"> and
978 C<format_delimiter=";;">.
979
980 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
981
982 Formats can contain C<lookup{...}> if you need them.
983
984 =cut
985
986 sub apply_format {
987 my $self = shift;
988
989 my ($name,$delimiter,$data) = @_;
990
991 my $log = $self->_get_logger();
992
993 if (! $self->{'import_xml'}->{'format'}->{$name}) {
994 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
995 return $data;
996 }
997
998 $log->warn("no delimiter for format $name") if (! $delimiter);
999
1000 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1001
1002 my @data = split(/\Q$delimiter\E/, $data);
1003
1004 my $out = sprintf($format, @data);
1005 $log->debug("using format $name [$format] on $data to produce: $out");
1006
1007 if ($out =~ m/$LOOKUP_REGEX/o) {
1008 return $self->lookup($out);
1009 } else {
1010 return $out;
1011 }
1012
1013 }
1014
1015
1016 #
1017 #
1018 #
1019
1020 =head1 INTERNAL METHODS
1021
1022 Here is a quick list of internal methods, mostly useful to turn debugging
1023 on them (see L<LOGGING> below for explanation).
1024
1025 =cut
1026
1027 =head2 _eval
1028
1029 Internal function to eval code without C<strict 'subs'>.
1030
1031 =cut
1032
1033 sub _eval {
1034 my $self = shift;
1035
1036 my $code = shift || return;
1037
1038 my $log = $self->_get_logger();
1039
1040 no strict 'subs';
1041 my $ret = eval $code;
1042 if ($@) {
1043 $log->error("problem with eval code [$code]: $@");
1044 }
1045
1046 $log->debug("eval: ",$code," [",$ret,"]");
1047
1048 return $ret || 0;
1049 }
1050
1051 =head2 _sort_by_order
1052
1053 Sort xml tags data structure accoding to C<order=""> attribute.
1054
1055 =cut
1056
1057 sub _sort_by_order {
1058 my $self = shift;
1059
1060 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1061 $self->{'import_xml'}->{'indexer'}->{$a};
1062 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1063 $self->{'import_xml'}->{'indexer'}->{$b};
1064
1065 return $va <=> $vb;
1066 }
1067
1068 =head2 _get_logger
1069
1070 Get C<Log::Log4perl> object with a twist: domains are defined for each
1071 method
1072
1073 my $log = $webpac->_get_logger();
1074
1075 =cut
1076
1077 sub _get_logger {
1078 my $self = shift;
1079
1080 my $name = (caller(1))[3] || caller;
1081 return get_logger($name);
1082 }
1083
1084 =head2 _x
1085
1086 Convert string from UTF-8 to code page defined in C<import_xml>.
1087
1088 my $text = $webpac->_x('utf8 text');
1089
1090 =cut
1091
1092 sub _x {
1093 my $self = shift;
1094 my $utf8 = shift || return;
1095
1096 return $self->{'utf2cp'}->convert($utf8) ||
1097 $self->_get_logger()->logwarn("can't convert '$utf8'");
1098 }
1099
1100 #
1101 #
1102 #
1103
1104 =head1 LOGGING
1105
1106 Logging in WebPAC is performed by L<Log::Log4perl> with config file
1107 C<log.conf>.
1108
1109 Methods defined above have different levels of logging, so
1110 it's descriptions will be useful to turn (mostry B<debug> logging) on
1111 or off to see why WabPAC isn't perforing as you expect it (it might even
1112 be a bug!).
1113
1114 B<This is different from normal Log4perl behaviour>. To repeat, you can
1115 also use method names, and not only classes (which are just few)
1116 to filter logging.
1117
1118
1119 =head1 MEMORY USAGE
1120
1121 C<low_mem> options is double-edged sword. If enabled, WebPAC
1122 will run on memory constraint machines (which doesn't have enough
1123 physical RAM to create memory structure for whole source database).
1124
1125 If your machine has 512Mb or more of RAM and database is around 10000 records,
1126 memory shouldn't be an issue. If you don't have enough physical RAM, you
1127 might consider using virtual memory (if your operating system is handling it
1128 well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1129 parsed structure of ISIS database (this is what C<low_mem> option does).
1130
1131 Hitting swap at end of reading source database is probably o.k. However,
1132 hitting swap before 90% will dramatically decrease performance and you will
1133 be better off with C<low_mem> and using rest of availble memory for
1134 operating system disk cache (Linux is particuallary good about this).
1135 However, every access to database record will require disk access, so
1136 generation phase will be slower 10-100 times.
1137
1138 Parsed structures are essential - you just have option to trade RAM memory
1139 (which is fast) for disk space (which is slow). Be sure to have planty of
1140 disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1141
1142 However, when WebPAC is running on desktop machines (or laptops :-), it's
1143 highly undesireable for system to start swapping. Using C<low_mem> option can
1144 reduce WecPAC memory usage to around 64Mb for same database with lookup
1145 fields and sorted indexes which stay in RAM. Performance will suffer, but
1146 memory usage will really be minimal. It might be also more confortable to
1147 run WebPAC reniced on those machines.
1148
1149 =cut
1150
1151 1;

  ViewVC Help
Powered by ViewVC 1.1.26