/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 448 - (show annotations)
Wed Sep 15 16:53:51 2004 UTC (19 years, 6 months ago) by dpavlin
File size: 26319 byte(s)
Major changes this time: updated for new bfilter (filtering on first
element), outline in sorted index when found in thesaurus.

1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10 use Template;
11 use Log::Log4perl qw(get_logger :levels);
12 use Time::HiRes qw(time);
13
14 use Data::Dumper;
15
16 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17 #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18 my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19 my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20
21 =head1 NAME
22
23 WebPAC - base class for WebPAC
24
25 =head1 DESCRIPTION
26
27 This module implements methods used by WebPAC.
28
29 =head1 METHODS
30
31 =head2 new
32
33 Create new instance of WebPAC using configuration specified by C<config_file>.
34
35 my $webpac = new WebPAC(
36 config_file => 'name.conf',
37 code_page => 'ISO-8859-2',
38 low_mem => 1,
39 );
40
41 Default C<code_page> is C<ISO-8859-2>.
42
43 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44
45 This method will also read configuration files
46 C<global.conf> (used by indexer and Web font-end)
47 and configuration file specified by C<config_file>
48 which describes databases to be indexed.
49
50 =cut
51
52 # mapping between data type and tag which specify
53 # format in XML file
54 my %type2tag = (
55 'isis' => 'isis',
56 # 'excel' => 'column',
57 # 'marc' => 'marc',
58 # 'feed' => 'feed'
59 );
60
61 sub new {
62 my $class = shift;
63 my $self = {@_};
64 bless($self, $class);
65
66 $self->{'start_t'} = time();
67
68 my $log_file = $self->{'log'} || "log.conf";
69 Log::Log4perl->init($log_file);
70
71 my $log = $self->_get_logger();
72
73 # fill in default values
74 # output codepage
75 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
76
77 #
78 # read global.conf
79 #
80 $log->debug("read 'global.conf'");
81
82 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
83
84 # read global config parametars
85 foreach my $var (qw(
86 dbi_dbd
87 dbi_dsn
88 dbi_user
89 dbi_passwd
90 show_progress
91 my_unac_filter
92 output_template
93 )) {
94 $self->{'global_config'}->{$var} = $config->val('global', $var);
95 }
96
97 #
98 # read indexer config file
99 #
100
101 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
102
103 # create UTF-8 convertor for import_xml files
104 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
105
106 # create Template toolkit instance
107 $self->{'tt'} = Template->new(
108 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
109 # FILTERS => {
110 # 'foo' => \&foo_filter,
111 # },
112 EVAL_PERL => 1,
113 );
114
115 # running with low_mem flag? well, use DBM::Deep then.
116 if ($self->{'low_mem'}) {
117 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118
119 my $db_file = "data.db";
120
121 if (-e $db_file) {
122 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123 $log->debug("removed '$db_file' from last run");
124 }
125
126 require DBM::Deep;
127
128 my $db = new DBM::Deep $db_file;
129
130 $log->logdie("DBM::Deep error: $!") unless ($db);
131
132 if ($db->error()) {
133 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134 } else {
135 $log->debug("using file '$db_file' for DBM::Deep");
136 }
137
138 $self->{'db'} = $db;
139 }
140
141 return $self;
142 }
143
144 =head2 open_isis
145
146 Open CDS/ISIS database using OpenIsis module and read all records to memory.
147
148 $webpac->open_isis(
149 filename => '/data/ISIS/ISIS',
150 code_page => '852',
151 limit_mfn => 500,
152 start_mfn => 6000,
153 lookup => [ ... ],
154 );
155
156 By default, ISIS code page is assumed to be C<852>.
157
158 If optional parametar C<start_mfn> is set, this will be first MFN to read
159 from database (so you can skip beginning of your database if you need to).
160
161 If optional parametar C<limit_mfn> is set, it will read just 500 records
162 from database in example above.
163
164 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
165 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
166 value in index.
167
168 lookup => [
169 { 'key' => 'd:v900', 'val' => 'v250^a' },
170 { 'eval' => '"v901^a" eq "Podruèje"',
171 'key' => 'pa:v561^4:v562^4:v461^1',
172 'val' => 'v900' },
173 ]
174
175 Returns number of last record read into memory (size of database, really).
176
177 =cut
178
179 sub open_isis {
180 my $self = shift;
181 my $arg = {@_};
182
183 my $log = $self->_get_logger();
184
185 $log->logcroak("need filename") if (! $arg->{'filename'});
186 my $code_page = $arg->{'code_page'} || '852';
187
188 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
189
190 # store data in object
191 $self->{'isis_filename'} = $arg->{'filename'};
192 $self->{'isis_code_page'} = $code_page;
193
194 use OpenIsis;
195
196 #$self->{'isis_code_page'} = $code_page;
197
198 # create Text::Iconv object
199 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
200
201 $log->info("reading ISIS database '",$arg->{'filename'},"'");
202 $log->debug("isis code page: $code_page");
203
204 my $isis_db = OpenIsis::open($arg->{'filename'});
205
206 my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
207 my $startmfn = 1;
208
209 if (my $s = $self->{'start_mfn'}) {
210 $log->info("skipping to MFN $s");
211 $startmfn = $s;
212 } else {
213 $self->{'start_mfn'} = $startmfn;
214 }
215
216 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
217
218 $log->info("processing ",($maxmfn-$startmfn)." records...");
219
220 # read database
221 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
222
223
224 $log->debug("mfn: $mfn\n");
225
226 my $rec;
227
228 # read record
229 my $row = OpenIsis::read( $isis_db, $mfn );
230 foreach my $k (keys %{$row}) {
231 if ($k ne "mfn") {
232 foreach my $l (@{$row->{$k}}) {
233 $l = $cp->convert($l);
234 # has subfields?
235 my $val;
236 if ($l =~ m/\^/) {
237 foreach my $t (split(/\^/,$l)) {
238 next if (! $t);
239 $val->{substr($t,0,1)} = substr($t,1);
240 }
241 } else {
242 $val = $l;
243 }
244
245 push @{$rec->{$k}}, $val;
246 }
247 } else {
248 push @{$rec->{'000'}}, $mfn;
249 }
250
251 }
252
253 $log->confess("record $mfn empty?") unless ($rec);
254
255 # store
256 if ($self->{'low_mem'}) {
257 $self->{'db'}->put($mfn, $rec);
258 } else {
259 $self->{'data'}->{$mfn} = $rec;
260 }
261
262 # create lookup
263 $self->create_lookup($rec, @{$arg->{'lookup'}});
264
265 $self->progress_bar($mfn,$maxmfn);
266
267 }
268
269 $self->{'current_mfn'} = -1;
270 $self->{'last_pcnt'} = 0;
271
272 $log->debug("max mfn: $maxmfn");
273
274 # store max mfn and return it.
275 return $self->{'max_mfn'} = $maxmfn;
276 }
277
278 =head2 fetch_rec
279
280 Fetch next record from database. It will also display progress bar (once
281 it's implemented, that is).
282
283 my $rec = $webpac->fetch_rec;
284
285 =cut
286
287 sub fetch_rec {
288 my $self = shift;
289
290 my $log = $self->_get_logger();
291
292 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
293
294 if ($self->{'current_mfn'} == -1) {
295 $self->{'current_mfn'} = $self->{'start_mfn'};
296 } else {
297 $self->{'current_mfn'}++;
298 }
299
300 my $mfn = $self->{'current_mfn'};
301
302 if ($mfn > $self->{'max_mfn'}) {
303 $self->{'current_mfn'} = $self->{'max_mfn'};
304 $log->debug("at EOF");
305 return;
306 }
307
308 $self->progress_bar($mfn,$self->{'max_mfn'});
309
310 if ($self->{'low_mem'}) {
311 return $self->{'db'}->get($mfn);
312 } else {
313 return $self->{'data'}->{$mfn};
314 }
315 }
316
317 =head2 mfn
318
319 Returns current record number (MFN).
320
321 print $webpac->mfn;
322
323 =cut
324
325 sub mfn {
326 my $self = shift;
327 return $self->{'current_mfn'};
328 }
329
330 =head2 progress_bar
331
332 Draw progress bar on STDERR.
333
334 $webpac->progress_bar($current, $max);
335
336 =cut
337
338 sub progress_bar {
339 my $self = shift;
340
341 my ($curr,$max) = @_;
342
343 my $log = $self->_get_logger();
344
345 $log->logconfess("no current value!") if (! $curr);
346 $log->logconfess("no maximum value!") if (! $max);
347
348 if ($curr > $max) {
349 $max = $curr;
350 $log->debug("overflow to $curr");
351 }
352
353 $self->{'last_pcnt'} ||= 1;
354
355 my $p = int($curr * 100 / $max);
356
357 # reset on re-run
358 if ($p < $self->{'last_pcnt'}) {
359 $self->{'last_pcnt'} = $p;
360 $self->{'last_t'} = time();
361 $self->{'last_curr'} = 1;
362 }
363
364 if ($p != $self->{'last_pcnt'}) {
365
366 my $last_curr = $self->{'last_curr'} || $curr;
367 my $t = time();
368 my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
369 my $eta = ($max-$curr) / ($rate || 1);
370 printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
371 $self->{'last_pcnt'} = $p;
372 $self->{'last_t'} = time();
373 $self->{'last_curr'} = $curr;
374 }
375 print STDERR "\n" if ($p == 100);
376 }
377
378 =head2 fmt_time
379
380 Format time (in seconds) for display.
381
382 print $webpac->fmt_time(time());
383
384 This method is called by L<progress_bar> to display remaining time.
385
386 =cut
387
388 sub fmt_time {
389 my $self = shift;
390
391 my $t = shift || 0;
392 my $out = "";
393
394 my ($ss,$mm,$hh) = gmtime($t);
395 $out .= "${hh}h" if ($hh);
396 $out .= sprintf("%02d:%02d", $mm,$ss);
397 $out .= " " if ($hh == 0);
398 return $out;
399 }
400
401 =head2 open_import_xml
402
403 Read file from C<import_xml/> directory and parse it.
404
405 $webpac->open_import_xml(type => 'isis');
406
407 =cut
408
409 sub open_import_xml {
410 my $self = shift;
411
412 my $log = $self->_get_logger();
413
414 my $arg = {@_};
415 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
416
417 $self->{'type'} = $arg->{'type'};
418
419 my $type_base = $arg->{'type'};
420 $type_base =~ s/_.*$//g;
421
422 $self->{'tag'} = $type2tag{$type_base};
423
424 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
425
426 my $f = "./import_xml/".$self->{'type'}.".xml";
427 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
428
429 $log->info("reading '$f'");
430
431 $self->{'import_xml_file'} = $f;
432
433 $self->{'import_xml'} = XMLin($f,
434 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
435 );
436
437 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
438
439 }
440
441 =head2 create_lookup
442
443 Create lookup from record using lookup definition.
444
445 $self->create_lookup($rec, @lookups);
446
447 Called internally by C<open_*> methods.
448
449 =cut
450
451 sub create_lookup {
452 my $self = shift;
453
454 my $log = $self->_get_logger();
455
456 my $rec = shift || $log->logconfess("need record to create lookup");
457 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
458
459 foreach my $i (@_) {
460 $log->logconfess("need key") unless defined($i->{'key'});
461 $log->logconfess("need val") unless defined($i->{'val'});
462
463 if (defined($i->{'eval'})) {
464 # eval first, so we can skip fill_in for key and val
465 my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
466 if ($self->_eval($eval)) {
467 my $key = $self->fill_in($rec,$i->{'key'}) || next;
468 my @val = $self->fill_in($rec,$i->{'val'}) || next;
469 $log->debug("stored $key = ",sub { join(" | ",@val) });
470 push @{$self->{'lookup'}->{$key}}, @val;
471 }
472 } else {
473 my $key = $self->fill_in($rec,$i->{'key'}) || next;
474 my @val = $self->fill_in($rec,$i->{'val'}) || next;
475 $log->debug("stored $key = ",sub { join(" | ",@val) });
476 push @{$self->{'lookup'}->{$key}}, @val;
477 }
478 }
479 }
480
481 =head2 get_data
482
483 Returns value from record.
484
485 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
486
487 Arguments are:
488 record reference C<$rec>,
489 field C<$f>,
490 optional subfiled C<$sf>,
491 index for repeatable values C<$i>.
492
493 Optinal variable C<$found> will be incremeted if there
494 is field.
495
496 Returns value or empty string.
497
498 =cut
499
500 sub get_data {
501 my $self = shift;
502
503 my ($rec,$f,$sf,$i,$found) = @_;
504
505 if ($$rec->{$f}) {
506 return '' if (! $$rec->{$f}->[$i]);
507 no strict 'refs';
508 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
509 $$found++ if (defined($$found));
510 return $$rec->{$f}->[$i]->{$sf};
511 } elsif ($$rec->{$f}->[$i]) {
512 $$found++ if (defined($$found));
513 # it still might have subfield, just
514 # not specified, so we'll dump all
515 if ($$rec->{$f}->[$i] =~ /HASH/o) {
516 my $out;
517 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
518 $out .= $$rec->{$f}->[$i]->{$k}." ";
519 }
520 return $out;
521 } else {
522 return $$rec->{$f}->[$i];
523 }
524 }
525 } else {
526 return '';
527 }
528 }
529
530 =head2 fill_in
531
532 Workhourse of all: takes record from in-memory structure of database and
533 strings with placeholders and returns string or array of with substituted
534 values from record.
535
536 my $text = $webpac->fill_in($rec,'v250^a');
537
538 Optional argument is ordinal number for repeatable fields. By default,
539 it's assume to be first repeatable field (fields are perl array, so first
540 element is 0).
541 Following example will read second value from repeatable field.
542
543 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
544
545 This function B<does not> perform parsing of format to inteligenty skip
546 delimiters before fields which aren't used.
547
548 This method will automatically decode UTF-8 string to local code page
549 if needed.
550
551 =cut
552
553 sub fill_in {
554 my $self = shift;
555
556 my $log = $self->_get_logger();
557
558 my $rec = shift || $log->logconfess("need data record");
559 my $format = shift || $log->logconfess("need format to parse");
560 # iteration (for repeatable fields)
561 my $i = shift || 0;
562
563 # FIXME remove for speedup?
564 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
565
566 if (utf8::is_utf8($format)) {
567 $format = $self->_x($format);
568 }
569
570 my $found = 0;
571
572 my $eval_code;
573 # remove eval{...} from beginning
574 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
575
576 # do actual replacement of placeholders
577 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
578
579 if ($found) {
580 $log->debug("format: $format");
581 if ($eval_code) {
582 my $eval = $self->fill_in($rec,$eval_code,$i);
583 return if (! $self->_eval($eval));
584 }
585 # do we have lookups?
586 if ($format =~ /$LOOKUP_REGEX/o) {
587 $log->debug("format '$format' has lookup");
588 return $self->lookup($format);
589 } else {
590 return $format;
591 }
592 } else {
593 return;
594 }
595 }
596
597 =head2 lookup
598
599 Perform lookups on format supplied to it.
600
601 my $text = $self->lookup('[v900]');
602
603 Lookups can be nested (like C<[d:[a:[v900]]]>).
604
605 =cut
606
607 sub lookup {
608 my $self = shift;
609
610 my $log = $self->_get_logger();
611
612 my $tmp = shift || $log->logconfess("need format");
613
614 if ($tmp =~ /$LOOKUP_REGEX/o) {
615 my @in = ( $tmp );
616
617 $log->debug("lookup for: ",$tmp);
618
619 my @out;
620 while (my $f = shift @in) {
621 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
622 my $k = $1;
623 if ($self->{'lookup'}->{$k}) {
624 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
625 my $tmp2 = $f;
626 $tmp2 =~ s/lookup{$k}/$nv/g;
627 push @in, $tmp2;
628 }
629 } else {
630 undef $f;
631 }
632 } elsif ($f) {
633 push @out, $f;
634 }
635 }
636 $log->logconfess("return is array and it's not expected!") unless wantarray;
637 return @out;
638 } else {
639 return $tmp;
640 }
641 }
642
643 =head2 parse
644
645 Perform smart parsing of string, skipping delimiters for fields which aren't
646 defined. It can also eval code in format starting with C<eval{...}> and
647 return output or nothing depending on eval code.
648
649 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
650
651 =cut
652
653 sub parse {
654 my $self = shift;
655
656 my ($rec, $format_utf8, $i) = @_;
657
658 return if (! $format_utf8);
659
660 my $log = $self->_get_logger();
661
662 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
663 $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
664
665 $i = 0 if (! $i);
666
667 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
668
669 my @out;
670
671 $log->debug("format: $format");
672
673 my $eval_code;
674 # remove eval{...} from beginning
675 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
676
677 my $prefix;
678 my $all_found=0;
679
680 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
681
682 my $del = $1 || '';
683 $prefix ||= $del if ($all_found == 0);
684
685 my $found = 0;
686 my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
687
688 if ($found) {
689 push @out, $del;
690 push @out, $tmp;
691 $all_found += $found;
692 }
693 }
694
695 return if (! $all_found);
696
697 my $out = join('',@out);
698
699 if ($out) {
700 # add rest of format (suffix)
701 $out .= $format;
702
703 # add prefix if not there
704 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
705
706 $log->debug("result: $out");
707 }
708
709 if ($eval_code) {
710 my $eval = $self->fill_in($rec,$eval_code,$i);
711 $log->debug("about to eval{",$eval,"} format: $out");
712 return if (! $self->_eval($eval));
713 }
714
715 return $out;
716 }
717
718 =head2 parse_to_arr
719
720 Similar to C<parse>, but returns array of all repeatable fields
721
722 my @arr = $webpac->parse_to_arr($rec,'v250^a');
723
724 =cut
725
726 sub parse_to_arr {
727 my $self = shift;
728
729 my ($rec, $format_utf8) = @_;
730
731 my $log = $self->_get_logger();
732
733 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
734 return if (! $format_utf8);
735
736 my $i = 0;
737 my @arr;
738
739 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
740 push @arr, $v;
741 }
742
743 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
744
745 return @arr;
746 }
747
748 =head2 fill_in_to_arr
749
750 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
751 for fields which have lookups, so they shouldn't be parsed but rather
752 C<fill_id>ed.
753
754 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
755
756 =cut
757
758 sub fill_in_to_arr {
759 my $self = shift;
760
761 my ($rec, $format_utf8) = @_;
762
763 my $log = $self->_get_logger();
764
765 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
766 return if (! $format_utf8);
767
768 my $i = 0;
769 my @arr;
770
771 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
772 push @arr, @v;
773 }
774
775 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
776
777 return @arr;
778 }
779
780 =head2 sort_arr
781
782 Sort array ignoring case and html in data
783
784 my @sorted = $webpac->sort_arr(@unsorted);
785
786 =cut
787
788 sub sort_arr {
789 my $self = shift;
790
791 my $log = $self->_get_logger();
792
793 # FIXME add Schwartzian Transformation?
794
795 my @sorted = sort {
796 $a =~ s#<[^>]+/*>##;
797 $b =~ s#<[^>]+/*>##;
798 lc($b) cmp lc($a)
799 } @_;
800 $log->debug("sorted values: ",sub { join(", ",@sorted) });
801
802 return @sorted;
803 }
804
805
806 =head2 data_structure
807
808 Create in-memory data structure which represents layout from C<import_xml>.
809 It is used later to produce output.
810
811 my @ds = $webpac->data_structure($rec);
812
813 This method will also set C<$webpac->{'currnet_filename'}> if there is
814 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
815 <headline> tag.
816
817 =cut
818
819 sub data_structure {
820 my $self = shift;
821
822 my $log = $self->_get_logger();
823
824 my $rec = shift;
825 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
826
827 undef $self->{'currnet_filename'};
828 undef $self->{'headline'};
829
830 my @sorted_tags;
831 if ($self->{tags_by_order}) {
832 @sorted_tags = @{$self->{tags_by_order}};
833 } else {
834 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
835 $self->{tags_by_order} = \@sorted_tags;
836 }
837
838 my @ds;
839
840 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
841
842 foreach my $field (@sorted_tags) {
843
844 my $row;
845
846 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
847
848 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
849 my $format = $tag->{'value'} || $tag->{'content'};
850
851 $log->debug("format: $format");
852
853 my @v;
854 if ($format =~ /$LOOKUP_REGEX/o) {
855 @v = $self->fill_in_to_arr($rec,$format);
856 } else {
857 @v = $self->parse_to_arr($rec,$format);
858 }
859 next if (! @v);
860
861 if ($tag->{'sort'}) {
862 @v = $self->sort_arr(@v);
863 $log->warn("sort within tag is usually not what you want!");
864 }
865
866 # use format?
867 if ($tag->{'format_name'}) {
868 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
869 }
870
871 if ($field eq 'filename') {
872 $self->{'current_filename'} = join('',@v);
873 $log->debug("filename: ",$self->{'current_filename'});
874 } elsif ($field eq 'headline') {
875 $self->{'headline'} .= join('',@v);
876 $log->debug("headline: ",$self->{'headline'});
877 next; # don't return headline in data_structure!
878 }
879
880 # delimiter will join repeatable fields
881 if ($tag->{'delimiter'}) {
882 @v = ( join($tag->{'delimiter'}, @v) );
883 }
884
885 # default types
886 my @types = qw(display swish);
887 # override by type attribute
888 @types = ( $tag->{'type'} ) if ($tag->{'type'});
889
890 foreach my $type (@types) {
891 # append to previous line?
892 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
893 if ($tag->{'append'}) {
894
895 # I will delimit appended part with
896 # delimiter (or ,)
897 my $d = $tag->{'delimiter'};
898 # default delimiter
899 $d ||= ", ";
900
901 my $last = pop @{$row->{$type}};
902 $d = "" if (! $last);
903 $last .= $d . join($d, @v);
904 push @{$row->{$type}}, $last;
905
906 } else {
907 push @{$row->{$type}}, @v;
908 }
909 }
910
911
912 }
913
914 if ($row) {
915 $row->{'tag'} = $field;
916
917 # TODO: name_sigular, name_plural
918 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
919 $row->{'name'} = $name ? $self->_x($name) : $field;
920
921 # post-sort all values in field
922 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
923 $log->warn("sort at field tag not implemented");
924 }
925
926 push @ds, $row;
927
928 $log->debug("row $field: ",sub { Dumper($row) });
929 }
930
931 }
932
933 return @ds;
934
935 }
936
937 =head2 output
938
939 Create output from in-memory data structure using Template Toolkit template.
940
941 my $text = $webpac->output( template => 'text.tt', data => @ds );
942
943 =cut
944
945 sub output {
946 my $self = shift;
947
948 my $args = {@_};
949
950 my $log = $self->_get_logger();
951
952 $log->logconfess("need template name") if (! $args->{'template'});
953 $log->logconfess("need data array") if (! $args->{'data'});
954
955 my $out;
956
957 $self->{'tt'}->process(
958 $args->{'template'},
959 $args,
960 \$out
961 ) || confess $self->{'tt'}->error();
962
963 return $out;
964 }
965
966 =head2 output_file
967
968 Create output from in-memory data structure using Template Toolkit template
969 to a file.
970
971 $webpac->output_file(
972 file => 'out.txt',
973 template => 'text.tt',
974 data => @ds
975 );
976
977 =cut
978
979 sub output_file {
980 my $self = shift;
981
982 my $args = {@_};
983
984 my $log = $self->_get_logger();
985
986 my $file = $args->{'file'} || $log->logconfess("need file name");
987
988 $log->debug("creating file ",$file);
989
990 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
991 print $fh $self->output(
992 template => $args->{'template'},
993 data => $args->{'data'},
994 ) || $log->logdie("print: $!");
995 close($fh) || $log->logdie("close: $!");
996 }
997
998 =head2 apply_format
999
1000 Apply format specified in tag with C<format_name="name"> and
1001 C<format_delimiter=";;">.
1002
1003 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1004
1005 Formats can contain C<lookup{...}> if you need them.
1006
1007 =cut
1008
1009 sub apply_format {
1010 my $self = shift;
1011
1012 my ($name,$delimiter,$data) = @_;
1013
1014 my $log = $self->_get_logger();
1015
1016 if (! $self->{'import_xml'}->{'format'}->{$name}) {
1017 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1018 return $data;
1019 }
1020
1021 $log->warn("no delimiter for format $name") if (! $delimiter);
1022
1023 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1024
1025 my @data = split(/\Q$delimiter\E/, $data);
1026
1027 my $out = sprintf($format, @data);
1028 $log->debug("using format $name [$format] on $data to produce: $out");
1029
1030 if ($out =~ m/$LOOKUP_REGEX/o) {
1031 return $self->lookup($out);
1032 } else {
1033 return $out;
1034 }
1035
1036 }
1037
1038
1039 #
1040 #
1041 #
1042
1043 =head1 INTERNAL METHODS
1044
1045 Here is a quick list of internal methods, mostly useful to turn debugging
1046 on them (see L<LOGGING> below for explanation).
1047
1048 =cut
1049
1050 =head2 _eval
1051
1052 Internal function to eval code without C<strict 'subs'>.
1053
1054 =cut
1055
1056 sub _eval {
1057 my $self = shift;
1058
1059 my $code = shift || return;
1060
1061 my $log = $self->_get_logger();
1062
1063 no strict 'subs';
1064 my $ret = eval $code;
1065 if ($@) {
1066 $log->error("problem with eval code [$code]: $@");
1067 }
1068
1069 $log->debug("eval: ",$code," [",$ret,"]");
1070
1071 return $ret || 0;
1072 }
1073
1074 =head2 _sort_by_order
1075
1076 Sort xml tags data structure accoding to C<order=""> attribute.
1077
1078 =cut
1079
1080 sub _sort_by_order {
1081 my $self = shift;
1082
1083 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1084 $self->{'import_xml'}->{'indexer'}->{$a};
1085 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1086 $self->{'import_xml'}->{'indexer'}->{$b};
1087
1088 return $va <=> $vb;
1089 }
1090
1091 =head2 _get_logger
1092
1093 Get C<Log::Log4perl> object with a twist: domains are defined for each
1094 method
1095
1096 my $log = $webpac->_get_logger();
1097
1098 =cut
1099
1100 sub _get_logger {
1101 my $self = shift;
1102
1103 my $name = (caller(1))[3] || caller;
1104 return get_logger($name);
1105 }
1106
1107 =head2 _x
1108
1109 Convert string from UTF-8 to code page defined in C<import_xml>.
1110
1111 my $text = $webpac->_x('utf8 text');
1112
1113 =cut
1114
1115 sub _x {
1116 my $self = shift;
1117 my $utf8 = shift || return;
1118
1119 return $self->{'utf2cp'}->convert($utf8) ||
1120 $self->_get_logger()->logwarn("can't convert '$utf8'");
1121 }
1122
1123 #
1124 #
1125 #
1126
1127 =head1 LOGGING
1128
1129 Logging in WebPAC is performed by L<Log::Log4perl> with config file
1130 C<log.conf>.
1131
1132 Methods defined above have different levels of logging, so
1133 it's descriptions will be useful to turn (mostry B<debug> logging) on
1134 or off to see why WabPAC isn't perforing as you expect it (it might even
1135 be a bug!).
1136
1137 B<This is different from normal Log4perl behaviour>. To repeat, you can
1138 also use method names, and not only classes (which are just few)
1139 to filter logging.
1140
1141
1142 =head1 MEMORY USAGE
1143
1144 C<low_mem> options is double-edged sword. If enabled, WebPAC
1145 will run on memory constraint machines (which doesn't have enough
1146 physical RAM to create memory structure for whole source database).
1147
1148 If your machine has 512Mb or more of RAM and database is around 10000 records,
1149 memory shouldn't be an issue. If you don't have enough physical RAM, you
1150 might consider using virtual memory (if your operating system is handling it
1151 well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1152 parsed structure of ISIS database (this is what C<low_mem> option does).
1153
1154 Hitting swap at end of reading source database is probably o.k. However,
1155 hitting swap before 90% will dramatically decrease performance and you will
1156 be better off with C<low_mem> and using rest of availble memory for
1157 operating system disk cache (Linux is particuallary good about this).
1158 However, every access to database record will require disk access, so
1159 generation phase will be slower 10-100 times.
1160
1161 Parsed structures are essential - you just have option to trade RAM memory
1162 (which is fast) for disk space (which is slow). Be sure to have planty of
1163 disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1164
1165 However, when WebPAC is running on desktop machines (or laptops :-), it's
1166 highly undesireable for system to start swapping. Using C<low_mem> option can
1167 reduce WecPAC memory usage to around 64Mb for same database with lookup
1168 fields and sorted indexes which stay in RAM. Performance will suffer, but
1169 memory usage will really be minimal. It might be also more confortable to
1170 run WebPAC reniced on those machines.
1171
1172 =cut
1173
1174 1;

  ViewVC Help
Powered by ViewVC 1.1.26