/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 436 - (show annotations)
Mon Sep 13 14:55:13 2004 UTC (19 years, 6 months ago) by dpavlin
File size: 24704 byte(s)
die if database doesn't exist

1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10 use Template;
11 use Log::Log4perl qw(get_logger :levels);
12 use Time::HiRes qw(time);
13
14 use Data::Dumper;
15
16 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17 #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18 my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19 my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20
21 =head1 NAME
22
23 WebPAC - base class for WebPAC
24
25 =head1 DESCRIPTION
26
27 This module implements methods used by WebPAC.
28
29 =head1 METHODS
30
31 =head2 new
32
33 Create new instance of WebPAC using configuration specified by C<config_file>.
34
35 my $webpac = new WebPAC(
36 config_file => 'name.conf',
37 [code_page => 'ISO-8859-2',]
38 [low_mem => 1,]
39 );
40
41 Default C<code_page> is C<ISO-8859-2>.
42
43 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44
45 This method will also read configuration files
46 C<global.conf> (used by indexer and Web font-end)
47 and configuration file specified by C<config_file>
48 which describes databases to be indexed.
49
50 =cut
51
52 # mapping between data type and tag which specify
53 # format in XML file
54 my %type2tag = (
55 'isis' => 'isis',
56 # 'excel' => 'column',
57 # 'marc' => 'marc',
58 # 'feed' => 'feed'
59 );
60
61 sub new {
62 my $class = shift;
63 my $self = {@_};
64 bless($self, $class);
65
66 $self->{'start_t'} = time();
67
68 my $log_file = $self->{'log'} || "log.conf";
69 Log::Log4perl->init($log_file);
70
71 my $log = $self->_get_logger();
72
73 # fill in default values
74 # output codepage
75 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
76
77 #
78 # read global.conf
79 #
80 $log->debug("read 'global.conf'");
81
82 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
83
84 # read global config parametars
85 foreach my $var (qw(
86 dbi_dbd
87 dbi_dsn
88 dbi_user
89 dbi_passwd
90 show_progress
91 my_unac_filter
92 output_template
93 )) {
94 $self->{'global_config'}->{$var} = $config->val('global', $var);
95 }
96
97 #
98 # read indexer config file
99 #
100
101 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
102
103 # create UTF-8 convertor for import_xml files
104 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
105
106 # create Template toolkit instance
107 $self->{'tt'} = Template->new(
108 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
109 # FILTERS => {
110 # 'foo' => \&foo_filter,
111 # },
112 EVAL_PERL => 1,
113 );
114
115 # running with low_mem flag? well, use DBM::Deep then.
116 if ($self->{'low_mem'}) {
117 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118
119 my $db_file = "data.db";
120
121 if (-e $db_file) {
122 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123 $log->debug("removed '$db_file' from last run");
124 }
125
126 require DBM::Deep;
127
128 my $db = new DBM::Deep $db_file;
129
130 $log->logdie("DBM::Deep error: $!") unless ($db);
131
132 if ($db->error()) {
133 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134 } else {
135 $log->debug("using file '$db_file' for DBM::Deep");
136 }
137
138 $self->{'db'} = $db;
139 }
140
141 return $self;
142 }
143
144 =head2 open_isis
145
146 Open CDS/ISIS database using OpenIsis module and read all records to memory.
147
148 $webpac->open_isis(
149 filename => '/data/ISIS/ISIS',
150 code_page => '852',
151 limit_mfn => 500,
152 start_mfn => 6000,
153 lookup => [ ... ],
154 );
155
156 By default, ISIS code page is assumed to be C<852>.
157
158 If optional parametar C<start_mfn> is set, this will be first MFN to read
159 from database (so you can skip beginning of your database if you need to).
160
161 If optional parametar C<limit_mfn> is set, it will read just 500 records
162 from database in example above.
163
164 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
165 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
166 value in index.
167
168 lookup => [
169 { 'key' => 'd:v900', 'val' => 'v250^a' },
170 { 'eval' => '"v901^a" eq "Podruèje"',
171 'key' => 'pa:v561^4:v562^4:v461^1',
172 'val' => 'v900' },
173 ]
174
175 Returns number of last record read into memory (size of database, really).
176
177 =cut
178
179 sub open_isis {
180 my $self = shift;
181 my $arg = {@_};
182
183 my $log = $self->_get_logger();
184
185 $log->logcroak("need filename") if (! $arg->{'filename'});
186 my $code_page = $arg->{'code_page'} || '852';
187
188 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
189
190 # store data in object
191 $self->{'isis_filename'} = $arg->{'filename'};
192 $self->{'isis_code_page'} = $code_page;
193
194 use OpenIsis;
195
196 #$self->{'isis_code_page'} = $code_page;
197
198 # create Text::Iconv object
199 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
200
201 $log->info("reading ISIS database '",$arg->{'filename'},"'");
202 $log->debug("isis code page: $code_page");
203
204 my $isis_db = OpenIsis::open($arg->{'filename'});
205
206 my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
207 my $startmfn = 1;
208
209 if (my $s = $self->{'start_mfn'}) {
210 $log->info("skipping to MFN $s");
211 $startmfn = $s;
212 }
213
214 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
215
216 $log->info("processing ",($maxmfn-$startmfn)." records...");
217
218 # read database
219 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
220
221
222 $log->debug("mfn: $mfn\n");
223
224 my $rec;
225
226 # read record
227 my $row = OpenIsis::read( $isis_db, $mfn );
228 foreach my $k (keys %{$row}) {
229 if ($k ne "mfn") {
230 foreach my $l (@{$row->{$k}}) {
231 $l = $cp->convert($l);
232 # has subfields?
233 my $val;
234 if ($l =~ m/\^/) {
235 foreach my $t (split(/\^/,$l)) {
236 next if (! $t);
237 $val->{substr($t,0,1)} = substr($t,1);
238 }
239 } else {
240 $val = $l;
241 }
242
243 push @{$rec->{$k}}, $val;
244 }
245 } else {
246 push @{$rec->{'000'}}, $mfn;
247 }
248
249 }
250
251 $log->confess("record $mfn empty?") unless ($rec);
252
253 # store
254 if ($self->{'low_mem'}) {
255 $self->{'db'}->put($mfn, $rec);
256 } else {
257 $self->{'data'}->{$mfn} = $rec;
258 }
259
260 # create lookup
261 $self->create_lookup($rec, @{$arg->{'lookup'}});
262
263 $self->progress_bar($mfn,$maxmfn);
264
265 }
266
267 $self->{'current_mfn'} = $startmfn;
268 $self->{'last_pcnt'} = 0;
269
270 $log->debug("max mfn: $maxmfn");
271
272 # store max mfn and return it.
273 return $self->{'max_mfn'} = $maxmfn;
274 }
275
276 =head2 fetch_rec
277
278 Fetch next record from database. It will also display progress bar (once
279 it's implemented, that is).
280
281 my $rec = $webpac->fetch_rec;
282
283 =cut
284
285 sub fetch_rec {
286 my $self = shift;
287
288 my $log = $self->_get_logger();
289
290 my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");
291
292 if ($mfn > $self->{'max_mfn'}) {
293 $self->{'current_mfn'} = $self->{'max_mfn'};
294 $log->debug("at EOF");
295 return;
296 }
297
298 $self->progress_bar($mfn,$self->{'max_mfn'});
299
300 if ($self->{'low_mem'}) {
301 return $self->{'db'}->get($mfn);
302 } else {
303 return $self->{'data'}->{$mfn};
304 }
305 }
306
307 =head2 progress_bar
308
309 Draw progress bar on STDERR.
310
311 $webpac->progress_bar($current, $max);
312
313 =cut
314
315 sub progress_bar {
316 my $self = shift;
317
318 my ($curr,$max) = @_;
319
320 my $log = $self->_get_logger();
321
322 $log->logconfess("no current value!") if (! $curr);
323 $log->logconfess("no maximum value!") if (! $max);
324
325 if ($curr > $max) {
326 $max = $curr;
327 $log->debug("overflow to $curr");
328 }
329
330 $self->{'last_pcnt'} ||= 1;
331
332 my $p = int($curr * 100 / $max);
333
334 # reset on re-run
335 if ($p < $self->{'last_pcnt'}) {
336 $self->{'last_pcnt'} = $p;
337 $self->{'last_t'} = time();
338 $self->{'last_curr'} = 1;
339 }
340
341 if ($p != $self->{'last_pcnt'}) {
342
343 my $last_curr = $self->{'last_curr'} || $curr;
344 my $t = time();
345 my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
346 my $eta = ($max-$curr) / ($rate || 1);
347 printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
348 $self->{'last_pcnt'} = $p;
349 $self->{'last_t'} = time();
350 $self->{'last_curr'} = $curr;
351 }
352 print STDERR "\n" if ($p == 100);
353 }
354
355 =head2 fmt_time
356
357 Format time (in seconds) for display.
358
359 print $webpac->fmt_time(time());
360
361 This method is called by L<progress_bar> to display remaining time.
362
363 =cut
364
365 sub fmt_time {
366 my $self = shift;
367
368 my $t = shift || 0;
369 my $out = "";
370
371 my ($ss,$mm,$hh) = gmtime($t);
372 $out .= "${hh}h" if ($hh);
373 $out .= sprintf("%02d:%02d", $mm,$ss);
374 $out .= " " if ($hh == 0);
375 return $out;
376 }
377
378 =head2 open_import_xml
379
380 Read file from C<import_xml/> directory and parse it.
381
382 $webpac->open_import_xml(type => 'isis');
383
384 =cut
385
386 sub open_import_xml {
387 my $self = shift;
388
389 my $log = $self->_get_logger();
390
391 my $arg = {@_};
392 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
393
394 $self->{'type'} = $arg->{'type'};
395
396 my $type_base = $arg->{'type'};
397 $type_base =~ s/_.*$//g;
398
399 $self->{'tag'} = $type2tag{$type_base};
400
401 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
402
403 my $f = "./import_xml/".$self->{'type'}.".xml";
404 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
405
406 $log->info("reading '$f'");
407
408 $self->{'import_xml_file'} = $f;
409
410 $self->{'import_xml'} = XMLin($f,
411 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
412 );
413
414 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
415
416 }
417
418 =head2 create_lookup
419
420 Create lookup from record using lookup definition.
421
422 $self->create_lookup($rec, @lookups);
423
424 Called internally by C<open_*> methods.
425
426 =cut
427
428 sub create_lookup {
429 my $self = shift;
430
431 my $log = $self->_get_logger();
432
433 my $rec = shift || $log->logconfess("need record to create lookup");
434 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
435
436 foreach my $i (@_) {
437 $log->logconfess("need key") unless defined($i->{'key'});
438 $log->logconfess("need val") unless defined($i->{'val'});
439
440 if (defined($i->{'eval'})) {
441 # eval first, so we can skip fill_in for key and val
442 my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
443 if ($self->_eval($eval)) {
444 my $key = $self->fill_in($rec,$i->{'key'}) || next;
445 my @val = $self->fill_in($rec,$i->{'val'}) || next;
446 $log->debug("stored $key = ",sub { join(" | ",@val) });
447 push @{$self->{'lookup'}->{$key}}, @val;
448 }
449 } else {
450 my $key = $self->fill_in($rec,$i->{'key'}) || next;
451 my @val = $self->fill_in($rec,$i->{'val'}) || next;
452 $log->debug("stored $key = ",sub { join(" | ",@val) });
453 push @{$self->{'lookup'}->{$key}}, @val;
454 }
455 }
456 }
457
458 =head2 get_data
459
460 Returns value from record.
461
462 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
463
464 Arguments are:
465 record reference C<$rec>,
466 field C<$f>,
467 optional subfiled C<$sf>,
468 index for repeatable values C<$i>.
469
470 Optinal variable C<$found> will be incremeted if there
471 is field.
472
473 Returns value or empty string.
474
475 =cut
476
477 sub get_data {
478 my $self = shift;
479
480 my ($rec,$f,$sf,$i,$found) = @_;
481
482 if ($$rec->{$f}) {
483 return '' if (! $$rec->{$f}->[$i]);
484 no strict 'refs';
485 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
486 $$found++ if (defined($$found));
487 return $$rec->{$f}->[$i]->{$sf};
488 } elsif ($$rec->{$f}->[$i]) {
489 $$found++ if (defined($$found));
490 # it still might have subfield, just
491 # not specified, so we'll dump all
492 if ($$rec->{$f}->[$i] =~ /HASH/o) {
493 my $out;
494 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
495 $out .= $$rec->{$f}->[$i]->{$k}." ";
496 }
497 return $out;
498 } else {
499 return $$rec->{$f}->[$i];
500 }
501 }
502 } else {
503 return '';
504 }
505 }
506
507 =head2 fill_in
508
509 Workhourse of all: takes record from in-memory structure of database and
510 strings with placeholders and returns string or array of with substituted
511 values from record.
512
513 my $text = $webpac->fill_in($rec,'v250^a');
514
515 Optional argument is ordinal number for repeatable fields. By default,
516 it's assume to be first repeatable field (fields are perl array, so first
517 element is 0).
518 Following example will read second value from repeatable field.
519
520 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
521
522 This function B<does not> perform parsing of format to inteligenty skip
523 delimiters before fields which aren't used.
524
525 This method will automatically decode UTF-8 string to local code page
526 if needed.
527
528 =cut
529
530 sub fill_in {
531 my $self = shift;
532
533 my $log = $self->_get_logger();
534
535 my $rec = shift || $log->logconfess("need data record");
536 my $format = shift || $log->logconfess("need format to parse");
537 # iteration (for repeatable fields)
538 my $i = shift || 0;
539
540 # FIXME remove for speedup?
541 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
542
543 if (utf8::is_utf8($format)) {
544 $format = $self->_x($format);
545 }
546
547 my $found = 0;
548
549 my $eval_code;
550 # remove eval{...} from beginning
551 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
552
553 # do actual replacement of placeholders
554 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
555
556 if ($found) {
557 $log->debug("format: $format");
558 if ($eval_code) {
559 my $eval = $self->fill_in($rec,$eval_code,$i);
560 return if (! $self->_eval($eval));
561 }
562 # do we have lookups?
563 if ($format =~ /$LOOKUP_REGEX/o) {
564 $log->debug("format '$format' has lookup");
565 return $self->lookup($format);
566 } else {
567 return $format;
568 }
569 } else {
570 return;
571 }
572 }
573
574 =head2 lookup
575
576 Perform lookups on format supplied to it.
577
578 my $text = $self->lookup('[v900]');
579
580 Lookups can be nested (like C<[d:[a:[v900]]]>).
581
582 =cut
583
584 sub lookup {
585 my $self = shift;
586
587 my $log = $self->_get_logger();
588
589 my $tmp = shift || $log->logconfess("need format");
590
591 if ($tmp =~ /$LOOKUP_REGEX/o) {
592 my @in = ( $tmp );
593
594 $log->debug("lookup for: ",$tmp);
595
596 my @out;
597 while (my $f = shift @in) {
598 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
599 my $k = $1;
600 if ($self->{'lookup'}->{$k}) {
601 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
602 my $tmp2 = $f;
603 $tmp2 =~ s/lookup{$k}/$nv/g;
604 push @in, $tmp2;
605 }
606 } else {
607 undef $f;
608 }
609 } elsif ($f) {
610 push @out, $f;
611 }
612 }
613 $log->logconfess("return is array and it's not expected!") unless wantarray;
614 return @out;
615 } else {
616 return $tmp;
617 }
618 }
619
620 =head2 parse
621
622 Perform smart parsing of string, skipping delimiters for fields which aren't
623 defined. It can also eval code in format starting with C<eval{...}> and
624 return output or nothing depending on eval code.
625
626 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
627
628 =cut
629
630 sub parse {
631 my $self = shift;
632
633 my ($rec, $format_utf8, $i) = @_;
634
635 return if (! $format_utf8);
636
637 my $log = $self->_get_logger();
638
639 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
640 $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
641
642 $i = 0 if (! $i);
643
644 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
645
646 my @out;
647
648 $log->debug("format: $format");
649
650 my $eval_code;
651 # remove eval{...} from beginning
652 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
653
654 my $prefix;
655 my $all_found=0;
656
657 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
658
659 my $del = $1 || '';
660 $prefix ||= $del if ($all_found == 0);
661
662 my $found = 0;
663 my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
664
665 if ($found) {
666 push @out, $del;
667 push @out, $tmp;
668 $all_found += $found;
669 }
670 }
671
672 return if (! $all_found);
673
674 my $out = join('',@out);
675
676 if ($out) {
677 # add rest of format (suffix)
678 $out .= $format;
679
680 # add prefix if not there
681 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
682
683 $log->debug("result: $out");
684 }
685
686 if ($eval_code) {
687 my $eval = $self->fill_in($rec,$eval_code,$i);
688 $log->debug("about to eval{",$eval,"} format: $out");
689 return if (! $self->_eval($eval));
690 }
691
692 return $out;
693 }
694
695 =head2 parse_to_arr
696
697 Similar to C<parse>, but returns array of all repeatable fields
698
699 my @arr = $webpac->parse_to_arr($rec,'v250^a');
700
701 =cut
702
703 sub parse_to_arr {
704 my $self = shift;
705
706 my ($rec, $format_utf8) = @_;
707
708 my $log = $self->_get_logger();
709
710 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
711 return if (! $format_utf8);
712
713 my $i = 0;
714 my @arr;
715
716 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
717 push @arr, $v;
718 }
719
720 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
721
722 return @arr;
723 }
724
725 =head2 fill_in_to_arr
726
727 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
728 for fields which have lookups, so they shouldn't be parsed but rather
729 C<fill_id>ed.
730
731 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
732
733 =cut
734
735 sub fill_in_to_arr {
736 my $self = shift;
737
738 my ($rec, $format_utf8) = @_;
739
740 my $log = $self->_get_logger();
741
742 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
743 return if (! $format_utf8);
744
745 my $i = 0;
746 my @arr;
747
748 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
749 push @arr, @v;
750 }
751
752 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
753
754 return @arr;
755 }
756
757
758 =head2 data_structure
759
760 Create in-memory data structure which represents layout from C<import_xml>.
761 It is used later to produce output.
762
763 my @ds = $webpac->data_structure($rec);
764
765 This method will also set C<$webpac->{'currnet_filename'}> if there is
766 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
767 <headline> tag.
768
769 =cut
770
771 sub data_structure {
772 my $self = shift;
773
774 my $log = $self->_get_logger();
775
776 my $rec = shift;
777 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
778
779 undef $self->{'currnet_filename'};
780 undef $self->{'headline'};
781
782 my @sorted_tags;
783 if ($self->{tags_by_order}) {
784 @sorted_tags = @{$self->{tags_by_order}};
785 } else {
786 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
787 $self->{tags_by_order} = \@sorted_tags;
788 }
789
790 my @ds;
791
792 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
793
794 foreach my $field (@sorted_tags) {
795
796 my $row;
797
798 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
799
800 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
801 my $format = $tag->{'value'} || $tag->{'content'};
802
803 $log->debug("format: $format");
804
805 my @v;
806 if ($format =~ /$LOOKUP_REGEX/o) {
807 @v = $self->fill_in_to_arr($rec,$format);
808 } else {
809 @v = $self->parse_to_arr($rec,$format);
810 }
811 next if (! @v);
812
813 # use format?
814 if ($tag->{'format_name'}) {
815 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
816 }
817
818 if ($field eq 'filename') {
819 $self->{'current_filename'} = join('',@v);
820 $log->debug("filename: ",$self->{'current_filename'});
821 } elsif ($field eq 'headline') {
822 $self->{'headline'} .= join('',@v);
823 $log->debug("headline: ",$self->{'headline'});
824 next; # don't return headline in data_structure!
825 }
826
827 # does tag have type?
828 if ($tag->{'type'}) {
829 push @{$row->{$tag->{'type'}}}, @v;
830 } else {
831 push @{$row->{'display'}}, @v;
832 push @{$row->{'swish'}}, @v;
833 }
834
835
836 }
837
838 if ($row) {
839 $row->{'tag'} = $field;
840
841 # TODO: name_sigular, name_plural
842 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
843 $row->{'name'} = $name ? $self->_x($name) : $field;
844
845 push @ds, $row;
846
847 $log->debug("row $field: ",sub { Dumper($row) });
848 }
849
850 }
851
852 return @ds;
853
854 }
855
856 =head2 output
857
858 Create output from in-memory data structure using Template Toolkit template.
859
860 my $text = $webpac->output( template => 'text.tt', data => @ds );
861
862 =cut
863
864 sub output {
865 my $self = shift;
866
867 my $args = {@_};
868
869 my $log = $self->_get_logger();
870
871 $log->logconfess("need template name") if (! $args->{'template'});
872 $log->logconfess("need data array") if (! $args->{'data'});
873
874 my $out;
875
876 $self->{'tt'}->process(
877 $args->{'template'},
878 $args,
879 \$out
880 ) || confess $self->{'tt'}->error();
881
882 return $out;
883 }
884
885 =head2 output_file
886
887 Create output from in-memory data structure using Template Toolkit template
888 to a file.
889
890 $webpac->output_file(
891 file => 'out.txt',
892 template => 'text.tt',
893 data => @ds
894 );
895
896 =cut
897
898 sub output_file {
899 my $self = shift;
900
901 my $args = {@_};
902
903 my $log = $self->_get_logger();
904
905 my $file = $args->{'file'} || $log->logconfess("need file name");
906
907 $log->debug("creating file ",$file);
908
909 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
910 print $fh $self->output(
911 template => $args->{'template'},
912 data => $args->{'data'},
913 ) || $log->logdie("print: $!");
914 close($fh) || $log->logdie("close: $!");
915 }
916
917 =head2 apply_format
918
919 Apply format specified in tag with C<format_name="name"> and
920 C<format_delimiter=";;">.
921
922 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
923
924 Formats can contain C<lookup{...}> if you need them.
925
926 =cut
927
928 sub apply_format {
929 my $self = shift;
930
931 my ($name,$delimiter,$data) = @_;
932
933 my $log = $self->_get_logger();
934
935 if (! $self->{'import_xml'}->{'format'}->{$name}) {
936 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
937 return $data;
938 }
939
940 $log->warn("no delimiter for format $name") if (! $delimiter);
941
942 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
943
944 my @data = split(/\Q$delimiter\E/, $data);
945
946 my $out = sprintf($format, @data);
947 $log->debug("using format $name [$format] on $data to produce: $out");
948
949 if ($out =~ m/$LOOKUP_REGEX/o) {
950 return $self->lookup($out);
951 } else {
952 return $out;
953 }
954
955 }
956
957
958 #
959 #
960 #
961
962 =head1 INTERNAL METHODS
963
964 Here is a quick list of internal methods, mostly useful to turn debugging
965 on them (see L<LOGGING> below for explanation).
966
967 =cut
968
969 =head2 _eval
970
971 Internal function to eval code without C<strict 'subs'>.
972
973 =cut
974
975 sub _eval {
976 my $self = shift;
977
978 my $code = shift || return;
979
980 my $log = $self->_get_logger();
981
982 no strict 'subs';
983 my $ret = eval $code;
984 if ($@) {
985 $log->error("problem with eval code [$code]: $@");
986 }
987
988 $log->debug("eval: ",$code," [",$ret,"]");
989
990 return $ret || 0;
991 }
992
993 =head2 _sort_by_order
994
995 Sort xml tags data structure accoding to C<order=""> attribute.
996
997 =cut
998
999 sub _sort_by_order {
1000 my $self = shift;
1001
1002 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1003 $self->{'import_xml'}->{'indexer'}->{$a};
1004 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1005 $self->{'import_xml'}->{'indexer'}->{$b};
1006
1007 return $va <=> $vb;
1008 }
1009
1010 =head2 _get_logger
1011
1012 Get C<Log::Log4perl> object with a twist: domains are defined for each
1013 method
1014
1015 my $log = $webpac->_get_logger();
1016
1017 =cut
1018
1019 sub _get_logger {
1020 my $self = shift;
1021
1022 my $name = (caller(1))[3] || caller;
1023 return get_logger($name);
1024 }
1025
1026 =head2 _x
1027
1028 Convert string from UTF-8 to code page defined in C<import_xml>.
1029
1030 my $text = $webpac->_x('utf8 text');
1031
1032 =cut
1033
1034 sub _x {
1035 my $self = shift;
1036 my $utf8 = shift || return;
1037
1038 return $self->{'utf2cp'}->convert($utf8) ||
1039 $self->_get_logger()->logwarn("can't convert '$utf8'");
1040 }
1041
1042 #
1043 #
1044 #
1045
1046 =head1 LOGGING
1047
1048 Logging in WebPAC is performed by L<Log::Log4perl> with config file
1049 C<log.conf>.
1050
1051 Methods defined above have different levels of logging, so
1052 it's descriptions will be useful to turn (mostry B<debug> logging) on
1053 or off to see why WabPAC isn't perforing as you expect it (it might even
1054 be a bug!).
1055
1056 B<This is different from normal Log4perl behaviour>. To repeat, you can
1057 also use method names, and not only classes (which are just few)
1058 to filter logging.
1059
1060
1061 =head1 MEMORY USAGE
1062
1063 C<low_mem> options is double-edged sword. If enabled, WebPAC
1064 will run on memory constraint machines (which doesn't have enough
1065 physical RAM to create memory structure for whole source database).
1066
1067 If your machine has 512Mb or more of RAM and database is around 10000 records,
1068 memory shouldn't be an issue. If you don't have enough physical RAM, you
1069 might consider using virtual memory (if your operating system is handling it
1070 well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1071 parsed structure of ISIS database (this is what C<low_mem> option does).
1072
1073 Hitting swap at end of reading source database is probably o.k. However,
1074 hitting swap before 90% will dramatically decrease performance and you will
1075 be better off with C<low_mem> and using rest of availble memory for
1076 operating system disk cache (Linux is particuallary good about this).
1077 However, every access to database record will require disk access, so
1078 generation phase will be slower 10-100 times.
1079
1080 Parsed structures are essential - you just have option to trade RAM memory
1081 (which is fast) for disk space (which is slow). Be sure to have planty of
1082 disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1083
1084 However, when WebPAC is running on desktop machines (or laptops :-), it's
1085 highly undesireable for system to start swapping. Using C<low_mem> option can
1086 reduce WecPAC memory usage to around 64Mb for same database with lookup
1087 fields and sorted indexes which stay in RAM. Performance will suffer, but
1088 memory usage will really be minimal. It might be also more confortable to
1089 run WebPAC reniced on those machines.
1090
1091 =cut
1092
1093 1;

  ViewVC Help
Powered by ViewVC 1.1.26