/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 439 - (show annotations)
Mon Sep 13 23:13:54 2004 UTC (19 years, 7 months ago) by dpavlin
File size: 25767 byte(s)
implemented append and delimiter, hooks for sort (within fields which are
repeatable and within tag)

1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10 use Template;
11 use Log::Log4perl qw(get_logger :levels);
12 use Time::HiRes qw(time);
13
14 use Data::Dumper;
15
16 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17 #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18 my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19 my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20
21 =head1 NAME
22
23 WebPAC - base class for WebPAC
24
25 =head1 DESCRIPTION
26
27 This module implements methods used by WebPAC.
28
29 =head1 METHODS
30
31 =head2 new
32
33 Create new instance of WebPAC using configuration specified by C<config_file>.
34
35 my $webpac = new WebPAC(
36 config_file => 'name.conf',
37 code_page => 'ISO-8859-2',
38 low_mem => 1,
39 );
40
41 Default C<code_page> is C<ISO-8859-2>.
42
43 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44
45 This method will also read configuration files
46 C<global.conf> (used by indexer and Web font-end)
47 and configuration file specified by C<config_file>
48 which describes databases to be indexed.
49
50 =cut
51
52 # mapping between data type and tag which specify
53 # format in XML file
54 my %type2tag = (
55 'isis' => 'isis',
56 # 'excel' => 'column',
57 # 'marc' => 'marc',
58 # 'feed' => 'feed'
59 );
60
61 sub new {
62 my $class = shift;
63 my $self = {@_};
64 bless($self, $class);
65
66 $self->{'start_t'} = time();
67
68 my $log_file = $self->{'log'} || "log.conf";
69 Log::Log4perl->init($log_file);
70
71 my $log = $self->_get_logger();
72
73 # fill in default values
74 # output codepage
75 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
76
77 #
78 # read global.conf
79 #
80 $log->debug("read 'global.conf'");
81
82 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
83
84 # read global config parametars
85 foreach my $var (qw(
86 dbi_dbd
87 dbi_dsn
88 dbi_user
89 dbi_passwd
90 show_progress
91 my_unac_filter
92 output_template
93 )) {
94 $self->{'global_config'}->{$var} = $config->val('global', $var);
95 }
96
97 #
98 # read indexer config file
99 #
100
101 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
102
103 # create UTF-8 convertor for import_xml files
104 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
105
106 # create Template toolkit instance
107 $self->{'tt'} = Template->new(
108 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
109 # FILTERS => {
110 # 'foo' => \&foo_filter,
111 # },
112 EVAL_PERL => 1,
113 );
114
115 # running with low_mem flag? well, use DBM::Deep then.
116 if ($self->{'low_mem'}) {
117 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118
119 my $db_file = "data.db";
120
121 if (-e $db_file) {
122 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123 $log->debug("removed '$db_file' from last run");
124 }
125
126 require DBM::Deep;
127
128 my $db = new DBM::Deep $db_file;
129
130 $log->logdie("DBM::Deep error: $!") unless ($db);
131
132 if ($db->error()) {
133 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134 } else {
135 $log->debug("using file '$db_file' for DBM::Deep");
136 }
137
138 $self->{'db'} = $db;
139 }
140
141 return $self;
142 }
143
144 =head2 open_isis
145
146 Open CDS/ISIS database using OpenIsis module and read all records to memory.
147
148 $webpac->open_isis(
149 filename => '/data/ISIS/ISIS',
150 code_page => '852',
151 limit_mfn => 500,
152 start_mfn => 6000,
153 lookup => [ ... ],
154 );
155
156 By default, ISIS code page is assumed to be C<852>.
157
158 If optional parametar C<start_mfn> is set, this will be first MFN to read
159 from database (so you can skip beginning of your database if you need to).
160
161 If optional parametar C<limit_mfn> is set, it will read just 500 records
162 from database in example above.
163
164 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
165 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
166 value in index.
167
168 lookup => [
169 { 'key' => 'd:v900', 'val' => 'v250^a' },
170 { 'eval' => '"v901^a" eq "Podruèje"',
171 'key' => 'pa:v561^4:v562^4:v461^1',
172 'val' => 'v900' },
173 ]
174
175 Returns number of last record read into memory (size of database, really).
176
177 =cut
178
179 sub open_isis {
180 my $self = shift;
181 my $arg = {@_};
182
183 my $log = $self->_get_logger();
184
185 $log->logcroak("need filename") if (! $arg->{'filename'});
186 my $code_page = $arg->{'code_page'} || '852';
187
188 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
189
190 # store data in object
191 $self->{'isis_filename'} = $arg->{'filename'};
192 $self->{'isis_code_page'} = $code_page;
193
194 use OpenIsis;
195
196 #$self->{'isis_code_page'} = $code_page;
197
198 # create Text::Iconv object
199 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
200
201 $log->info("reading ISIS database '",$arg->{'filename'},"'");
202 $log->debug("isis code page: $code_page");
203
204 my $isis_db = OpenIsis::open($arg->{'filename'});
205
206 my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
207 my $startmfn = 1;
208
209 if (my $s = $self->{'start_mfn'}) {
210 $log->info("skipping to MFN $s");
211 $startmfn = $s;
212 }
213
214 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
215
216 $log->info("processing ",($maxmfn-$startmfn)." records...");
217
218 # read database
219 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
220
221
222 $log->debug("mfn: $mfn\n");
223
224 my $rec;
225
226 # read record
227 my $row = OpenIsis::read( $isis_db, $mfn );
228 foreach my $k (keys %{$row}) {
229 if ($k ne "mfn") {
230 foreach my $l (@{$row->{$k}}) {
231 $l = $cp->convert($l);
232 # has subfields?
233 my $val;
234 if ($l =~ m/\^/) {
235 foreach my $t (split(/\^/,$l)) {
236 next if (! $t);
237 $val->{substr($t,0,1)} = substr($t,1);
238 }
239 } else {
240 $val = $l;
241 }
242
243 push @{$rec->{$k}}, $val;
244 }
245 } else {
246 push @{$rec->{'000'}}, $mfn;
247 }
248
249 }
250
251 $log->confess("record $mfn empty?") unless ($rec);
252
253 # store
254 if ($self->{'low_mem'}) {
255 $self->{'db'}->put($mfn, $rec);
256 } else {
257 $self->{'data'}->{$mfn} = $rec;
258 }
259
260 # create lookup
261 $self->create_lookup($rec, @{$arg->{'lookup'}});
262
263 $self->progress_bar($mfn,$maxmfn);
264
265 }
266
267 $self->{'current_mfn'} = $startmfn;
268 $self->{'last_pcnt'} = 0;
269
270 $log->debug("max mfn: $maxmfn");
271
272 # store max mfn and return it.
273 return $self->{'max_mfn'} = $maxmfn;
274 }
275
276 =head2 fetch_rec
277
278 Fetch next record from database. It will also display progress bar (once
279 it's implemented, that is).
280
281 my $rec = $webpac->fetch_rec;
282
283 =cut
284
285 sub fetch_rec {
286 my $self = shift;
287
288 my $log = $self->_get_logger();
289
290 my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");
291
292 if ($mfn > $self->{'max_mfn'}) {
293 $self->{'current_mfn'} = $self->{'max_mfn'};
294 $log->debug("at EOF");
295 return;
296 }
297
298 $self->progress_bar($mfn,$self->{'max_mfn'});
299
300 if ($self->{'low_mem'}) {
301 return $self->{'db'}->get($mfn);
302 } else {
303 return $self->{'data'}->{$mfn};
304 }
305 }
306
307 =head2 progress_bar
308
309 Draw progress bar on STDERR.
310
311 $webpac->progress_bar($current, $max);
312
313 =cut
314
315 sub progress_bar {
316 my $self = shift;
317
318 my ($curr,$max) = @_;
319
320 my $log = $self->_get_logger();
321
322 $log->logconfess("no current value!") if (! $curr);
323 $log->logconfess("no maximum value!") if (! $max);
324
325 if ($curr > $max) {
326 $max = $curr;
327 $log->debug("overflow to $curr");
328 }
329
330 $self->{'last_pcnt'} ||= 1;
331
332 my $p = int($curr * 100 / $max);
333
334 # reset on re-run
335 if ($p < $self->{'last_pcnt'}) {
336 $self->{'last_pcnt'} = $p;
337 $self->{'last_t'} = time();
338 $self->{'last_curr'} = 1;
339 }
340
341 if ($p != $self->{'last_pcnt'}) {
342
343 my $last_curr = $self->{'last_curr'} || $curr;
344 my $t = time();
345 my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
346 my $eta = ($max-$curr) / ($rate || 1);
347 printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
348 $self->{'last_pcnt'} = $p;
349 $self->{'last_t'} = time();
350 $self->{'last_curr'} = $curr;
351 }
352 print STDERR "\n" if ($p == 100);
353 }
354
355 =head2 fmt_time
356
357 Format time (in seconds) for display.
358
359 print $webpac->fmt_time(time());
360
361 This method is called by L<progress_bar> to display remaining time.
362
363 =cut
364
365 sub fmt_time {
366 my $self = shift;
367
368 my $t = shift || 0;
369 my $out = "";
370
371 my ($ss,$mm,$hh) = gmtime($t);
372 $out .= "${hh}h" if ($hh);
373 $out .= sprintf("%02d:%02d", $mm,$ss);
374 $out .= " " if ($hh == 0);
375 return $out;
376 }
377
378 =head2 open_import_xml
379
380 Read file from C<import_xml/> directory and parse it.
381
382 $webpac->open_import_xml(type => 'isis');
383
384 =cut
385
386 sub open_import_xml {
387 my $self = shift;
388
389 my $log = $self->_get_logger();
390
391 my $arg = {@_};
392 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
393
394 $self->{'type'} = $arg->{'type'};
395
396 my $type_base = $arg->{'type'};
397 $type_base =~ s/_.*$//g;
398
399 $self->{'tag'} = $type2tag{$type_base};
400
401 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
402
403 my $f = "./import_xml/".$self->{'type'}.".xml";
404 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
405
406 $log->info("reading '$f'");
407
408 $self->{'import_xml_file'} = $f;
409
410 $self->{'import_xml'} = XMLin($f,
411 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
412 );
413
414 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
415
416 }
417
418 =head2 create_lookup
419
420 Create lookup from record using lookup definition.
421
422 $self->create_lookup($rec, @lookups);
423
424 Called internally by C<open_*> methods.
425
426 =cut
427
428 sub create_lookup {
429 my $self = shift;
430
431 my $log = $self->_get_logger();
432
433 my $rec = shift || $log->logconfess("need record to create lookup");
434 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
435
436 foreach my $i (@_) {
437 $log->logconfess("need key") unless defined($i->{'key'});
438 $log->logconfess("need val") unless defined($i->{'val'});
439
440 if (defined($i->{'eval'})) {
441 # eval first, so we can skip fill_in for key and val
442 my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
443 if ($self->_eval($eval)) {
444 my $key = $self->fill_in($rec,$i->{'key'}) || next;
445 my @val = $self->fill_in($rec,$i->{'val'}) || next;
446 $log->debug("stored $key = ",sub { join(" | ",@val) });
447 push @{$self->{'lookup'}->{$key}}, @val;
448 }
449 } else {
450 my $key = $self->fill_in($rec,$i->{'key'}) || next;
451 my @val = $self->fill_in($rec,$i->{'val'}) || next;
452 $log->debug("stored $key = ",sub { join(" | ",@val) });
453 push @{$self->{'lookup'}->{$key}}, @val;
454 }
455 }
456 }
457
458 =head2 get_data
459
460 Returns value from record.
461
462 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
463
464 Arguments are:
465 record reference C<$rec>,
466 field C<$f>,
467 optional subfiled C<$sf>,
468 index for repeatable values C<$i>.
469
470 Optinal variable C<$found> will be incremeted if there
471 is field.
472
473 Returns value or empty string.
474
475 =cut
476
477 sub get_data {
478 my $self = shift;
479
480 my ($rec,$f,$sf,$i,$found) = @_;
481
482 if ($$rec->{$f}) {
483 return '' if (! $$rec->{$f}->[$i]);
484 no strict 'refs';
485 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
486 $$found++ if (defined($$found));
487 return $$rec->{$f}->[$i]->{$sf};
488 } elsif ($$rec->{$f}->[$i]) {
489 $$found++ if (defined($$found));
490 # it still might have subfield, just
491 # not specified, so we'll dump all
492 if ($$rec->{$f}->[$i] =~ /HASH/o) {
493 my $out;
494 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
495 $out .= $$rec->{$f}->[$i]->{$k}." ";
496 }
497 return $out;
498 } else {
499 return $$rec->{$f}->[$i];
500 }
501 }
502 } else {
503 return '';
504 }
505 }
506
507 =head2 fill_in
508
509 Workhourse of all: takes record from in-memory structure of database and
510 strings with placeholders and returns string or array of with substituted
511 values from record.
512
513 my $text = $webpac->fill_in($rec,'v250^a');
514
515 Optional argument is ordinal number for repeatable fields. By default,
516 it's assume to be first repeatable field (fields are perl array, so first
517 element is 0).
518 Following example will read second value from repeatable field.
519
520 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
521
522 This function B<does not> perform parsing of format to inteligenty skip
523 delimiters before fields which aren't used.
524
525 This method will automatically decode UTF-8 string to local code page
526 if needed.
527
528 =cut
529
530 sub fill_in {
531 my $self = shift;
532
533 my $log = $self->_get_logger();
534
535 my $rec = shift || $log->logconfess("need data record");
536 my $format = shift || $log->logconfess("need format to parse");
537 # iteration (for repeatable fields)
538 my $i = shift || 0;
539
540 # FIXME remove for speedup?
541 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
542
543 if (utf8::is_utf8($format)) {
544 $format = $self->_x($format);
545 }
546
547 my $found = 0;
548
549 my $eval_code;
550 # remove eval{...} from beginning
551 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
552
553 # do actual replacement of placeholders
554 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
555
556 if ($found) {
557 $log->debug("format: $format");
558 if ($eval_code) {
559 my $eval = $self->fill_in($rec,$eval_code,$i);
560 return if (! $self->_eval($eval));
561 }
562 # do we have lookups?
563 if ($format =~ /$LOOKUP_REGEX/o) {
564 $log->debug("format '$format' has lookup");
565 return $self->lookup($format);
566 } else {
567 return $format;
568 }
569 } else {
570 return;
571 }
572 }
573
574 =head2 lookup
575
576 Perform lookups on format supplied to it.
577
578 my $text = $self->lookup('[v900]');
579
580 Lookups can be nested (like C<[d:[a:[v900]]]>).
581
582 =cut
583
584 sub lookup {
585 my $self = shift;
586
587 my $log = $self->_get_logger();
588
589 my $tmp = shift || $log->logconfess("need format");
590
591 if ($tmp =~ /$LOOKUP_REGEX/o) {
592 my @in = ( $tmp );
593
594 $log->debug("lookup for: ",$tmp);
595
596 my @out;
597 while (my $f = shift @in) {
598 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
599 my $k = $1;
600 if ($self->{'lookup'}->{$k}) {
601 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
602 my $tmp2 = $f;
603 $tmp2 =~ s/lookup{$k}/$nv/g;
604 push @in, $tmp2;
605 }
606 } else {
607 undef $f;
608 }
609 } elsif ($f) {
610 push @out, $f;
611 }
612 }
613 $log->logconfess("return is array and it's not expected!") unless wantarray;
614 return @out;
615 } else {
616 return $tmp;
617 }
618 }
619
620 =head2 parse
621
622 Perform smart parsing of string, skipping delimiters for fields which aren't
623 defined. It can also eval code in format starting with C<eval{...}> and
624 return output or nothing depending on eval code.
625
626 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
627
628 =cut
629
630 sub parse {
631 my $self = shift;
632
633 my ($rec, $format_utf8, $i) = @_;
634
635 return if (! $format_utf8);
636
637 my $log = $self->_get_logger();
638
639 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
640 $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
641
642 $i = 0 if (! $i);
643
644 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
645
646 my @out;
647
648 $log->debug("format: $format");
649
650 my $eval_code;
651 # remove eval{...} from beginning
652 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
653
654 my $prefix;
655 my $all_found=0;
656
657 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
658
659 my $del = $1 || '';
660 $prefix ||= $del if ($all_found == 0);
661
662 my $found = 0;
663 my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
664
665 if ($found) {
666 push @out, $del;
667 push @out, $tmp;
668 $all_found += $found;
669 }
670 }
671
672 return if (! $all_found);
673
674 my $out = join('',@out);
675
676 if ($out) {
677 # add rest of format (suffix)
678 $out .= $format;
679
680 # add prefix if not there
681 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
682
683 $log->debug("result: $out");
684 }
685
686 if ($eval_code) {
687 my $eval = $self->fill_in($rec,$eval_code,$i);
688 $log->debug("about to eval{",$eval,"} format: $out");
689 return if (! $self->_eval($eval));
690 }
691
692 return $out;
693 }
694
695 =head2 parse_to_arr
696
697 Similar to C<parse>, but returns array of all repeatable fields
698
699 my @arr = $webpac->parse_to_arr($rec,'v250^a');
700
701 =cut
702
703 sub parse_to_arr {
704 my $self = shift;
705
706 my ($rec, $format_utf8) = @_;
707
708 my $log = $self->_get_logger();
709
710 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
711 return if (! $format_utf8);
712
713 my $i = 0;
714 my @arr;
715
716 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
717 push @arr, $v;
718 }
719
720 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
721
722 return @arr;
723 }
724
725 =head2 fill_in_to_arr
726
727 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
728 for fields which have lookups, so they shouldn't be parsed but rather
729 C<fill_id>ed.
730
731 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
732
733 =cut
734
735 sub fill_in_to_arr {
736 my $self = shift;
737
738 my ($rec, $format_utf8) = @_;
739
740 my $log = $self->_get_logger();
741
742 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
743 return if (! $format_utf8);
744
745 my $i = 0;
746 my @arr;
747
748 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
749 push @arr, @v;
750 }
751
752 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
753
754 return @arr;
755 }
756
757
758 =head2 data_structure
759
760 Create in-memory data structure which represents layout from C<import_xml>.
761 It is used later to produce output.
762
763 my @ds = $webpac->data_structure($rec);
764
765 This method will also set C<$webpac->{'currnet_filename'}> if there is
766 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
767 <headline> tag.
768
769 =cut
770
771 sub data_structure {
772 my $self = shift;
773
774 my $log = $self->_get_logger();
775
776 my $rec = shift;
777 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
778
779 undef $self->{'currnet_filename'};
780 undef $self->{'headline'};
781
782 my @sorted_tags;
783 if ($self->{tags_by_order}) {
784 @sorted_tags = @{$self->{tags_by_order}};
785 } else {
786 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
787 $self->{tags_by_order} = \@sorted_tags;
788 }
789
790 my @ds;
791
792 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
793
794 foreach my $field (@sorted_tags) {
795
796 my $row;
797
798 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
799
800 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
801 my $format = $tag->{'value'} || $tag->{'content'};
802
803 $log->debug("format: $format");
804
805 my @v;
806 if ($format =~ /$LOOKUP_REGEX/o) {
807 @v = $self->fill_in_to_arr($rec,$format);
808 } else {
809 @v = $self->parse_to_arr($rec,$format);
810 }
811 next if (! @v);
812
813 if ($tag->{'sort'}) {
814 # very special sort, ignoring case and
815 # html
816 @v = sort {
817 $a =~ s#<[^>]+/*>##;
818 $b =~ s#<[^>]+/*>##;
819 lc($b) cmp lc($a)
820 } @v;
821 $log->warn("sort within tag is usually not what you want!");
822 $log->debug("sorted values: ",sub { join(", ",@v) });
823 }
824
825 # use format?
826 if ($tag->{'format_name'}) {
827 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
828 }
829
830 if ($field eq 'filename') {
831 $self->{'current_filename'} = join('',@v);
832 $log->debug("filename: ",$self->{'current_filename'});
833 } elsif ($field eq 'headline') {
834 $self->{'headline'} .= join('',@v);
835 $log->debug("headline: ",$self->{'headline'});
836 next; # don't return headline in data_structure!
837 }
838
839 # delimiter will join repeatable fields
840 if ($tag->{'delimiter'}) {
841 @v = ( join($tag->{'delimiter'}, @v) );
842 }
843
844 # default types
845 my @types = qw(display swish);
846 # override by type attribute
847 @types = ( $tag->{'type'} ) if ($tag->{'type'});
848
849 foreach my $type (@types) {
850 # append to previous line?
851 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
852 if ($tag->{'append'}) {
853
854 # I will delimit appended part with
855 # delimiter (or ,)
856 my $d = $tag->{'delimiter'};
857 # default delimiter
858 $d ||= ", ";
859
860 my $last = pop @{$row->{$type}};
861 $d = "" if (! $last);
862 $last .= $d . join($d, @v);
863 push @{$row->{$type}}, $last;
864
865 } else {
866 push @{$row->{$type}}, @v;
867 }
868 }
869
870
871 }
872
873 if ($row) {
874 $row->{'tag'} = $field;
875
876 # TODO: name_sigular, name_plural
877 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
878 $row->{'name'} = $name ? $self->_x($name) : $field;
879
880 # post-sort all values in field
881 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
882 $log->warn("sort at field tag not implemented");
883
884 }
885
886 push @ds, $row;
887
888 $log->debug("row $field: ",sub { Dumper($row) });
889 }
890
891 }
892
893 return @ds;
894
895 }
896
897 =head2 output
898
899 Create output from in-memory data structure using Template Toolkit template.
900
901 my $text = $webpac->output( template => 'text.tt', data => @ds );
902
903 =cut
904
905 sub output {
906 my $self = shift;
907
908 my $args = {@_};
909
910 my $log = $self->_get_logger();
911
912 $log->logconfess("need template name") if (! $args->{'template'});
913 $log->logconfess("need data array") if (! $args->{'data'});
914
915 my $out;
916
917 $self->{'tt'}->process(
918 $args->{'template'},
919 $args,
920 \$out
921 ) || confess $self->{'tt'}->error();
922
923 return $out;
924 }
925
926 =head2 output_file
927
928 Create output from in-memory data structure using Template Toolkit template
929 to a file.
930
931 $webpac->output_file(
932 file => 'out.txt',
933 template => 'text.tt',
934 data => @ds
935 );
936
937 =cut
938
939 sub output_file {
940 my $self = shift;
941
942 my $args = {@_};
943
944 my $log = $self->_get_logger();
945
946 my $file = $args->{'file'} || $log->logconfess("need file name");
947
948 $log->debug("creating file ",$file);
949
950 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
951 print $fh $self->output(
952 template => $args->{'template'},
953 data => $args->{'data'},
954 ) || $log->logdie("print: $!");
955 close($fh) || $log->logdie("close: $!");
956 }
957
958 =head2 apply_format
959
960 Apply format specified in tag with C<format_name="name"> and
961 C<format_delimiter=";;">.
962
963 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
964
965 Formats can contain C<lookup{...}> if you need them.
966
967 =cut
968
969 sub apply_format {
970 my $self = shift;
971
972 my ($name,$delimiter,$data) = @_;
973
974 my $log = $self->_get_logger();
975
976 if (! $self->{'import_xml'}->{'format'}->{$name}) {
977 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
978 return $data;
979 }
980
981 $log->warn("no delimiter for format $name") if (! $delimiter);
982
983 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
984
985 my @data = split(/\Q$delimiter\E/, $data);
986
987 my $out = sprintf($format, @data);
988 $log->debug("using format $name [$format] on $data to produce: $out");
989
990 if ($out =~ m/$LOOKUP_REGEX/o) {
991 return $self->lookup($out);
992 } else {
993 return $out;
994 }
995
996 }
997
998
999 #
1000 #
1001 #
1002
1003 =head1 INTERNAL METHODS
1004
1005 Here is a quick list of internal methods, mostly useful to turn debugging
1006 on them (see L<LOGGING> below for explanation).
1007
1008 =cut
1009
1010 =head2 _eval
1011
1012 Internal function to eval code without C<strict 'subs'>.
1013
1014 =cut
1015
1016 sub _eval {
1017 my $self = shift;
1018
1019 my $code = shift || return;
1020
1021 my $log = $self->_get_logger();
1022
1023 no strict 'subs';
1024 my $ret = eval $code;
1025 if ($@) {
1026 $log->error("problem with eval code [$code]: $@");
1027 }
1028
1029 $log->debug("eval: ",$code," [",$ret,"]");
1030
1031 return $ret || 0;
1032 }
1033
1034 =head2 _sort_by_order
1035
1036 Sort xml tags data structure accoding to C<order=""> attribute.
1037
1038 =cut
1039
1040 sub _sort_by_order {
1041 my $self = shift;
1042
1043 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1044 $self->{'import_xml'}->{'indexer'}->{$a};
1045 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1046 $self->{'import_xml'}->{'indexer'}->{$b};
1047
1048 return $va <=> $vb;
1049 }
1050
1051 =head2 _get_logger
1052
1053 Get C<Log::Log4perl> object with a twist: domains are defined for each
1054 method
1055
1056 my $log = $webpac->_get_logger();
1057
1058 =cut
1059
1060 sub _get_logger {
1061 my $self = shift;
1062
1063 my $name = (caller(1))[3] || caller;
1064 return get_logger($name);
1065 }
1066
1067 =head2 _x
1068
1069 Convert string from UTF-8 to code page defined in C<import_xml>.
1070
1071 my $text = $webpac->_x('utf8 text');
1072
1073 =cut
1074
1075 sub _x {
1076 my $self = shift;
1077 my $utf8 = shift || return;
1078
1079 return $self->{'utf2cp'}->convert($utf8) ||
1080 $self->_get_logger()->logwarn("can't convert '$utf8'");
1081 }
1082
1083 #
1084 #
1085 #
1086
1087 =head1 LOGGING
1088
1089 Logging in WebPAC is performed by L<Log::Log4perl> with config file
1090 C<log.conf>.
1091
1092 Methods defined above have different levels of logging, so
1093 it's descriptions will be useful to turn (mostry B<debug> logging) on
1094 or off to see why WabPAC isn't perforing as you expect it (it might even
1095 be a bug!).
1096
1097 B<This is different from normal Log4perl behaviour>. To repeat, you can
1098 also use method names, and not only classes (which are just few)
1099 to filter logging.
1100
1101
1102 =head1 MEMORY USAGE
1103
1104 C<low_mem> options is double-edged sword. If enabled, WebPAC
1105 will run on memory constraint machines (which doesn't have enough
1106 physical RAM to create memory structure for whole source database).
1107
1108 If your machine has 512Mb or more of RAM and database is around 10000 records,
1109 memory shouldn't be an issue. If you don't have enough physical RAM, you
1110 might consider using virtual memory (if your operating system is handling it
1111 well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1112 parsed structure of ISIS database (this is what C<low_mem> option does).
1113
1114 Hitting swap at end of reading source database is probably o.k. However,
1115 hitting swap before 90% will dramatically decrease performance and you will
1116 be better off with C<low_mem> and using rest of availble memory for
1117 operating system disk cache (Linux is particuallary good about this).
1118 However, every access to database record will require disk access, so
1119 generation phase will be slower 10-100 times.
1120
1121 Parsed structures are essential - you just have option to trade RAM memory
1122 (which is fast) for disk space (which is slow). Be sure to have planty of
1123 disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1124
1125 However, when WebPAC is running on desktop machines (or laptops :-), it's
1126 highly undesireable for system to start swapping. Using C<low_mem> option can
1127 reduce WecPAC memory usage to around 64Mb for same database with lookup
1128 fields and sorted indexes which stay in RAM. Performance will suffer, but
1129 memory usage will really be minimal. It might be also more confortable to
1130 run WebPAC reniced on those machines.
1131
1132 =cut
1133
1134 1;

  ViewVC Help
Powered by ViewVC 1.1.26