/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 434 - (show annotations)
Mon Sep 13 14:39:16 2004 UTC (19 years, 6 months ago) by dpavlin
File size: 24606 byte(s)
require and not use DBM::Deep, so if you are not using low_mem option,
you don't need this module.

1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10 use Template;
11 use Log::Log4perl qw(get_logger :levels);
12 use Time::HiRes qw(time);
13
14 use Data::Dumper;
15
16 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17 #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18 my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19 my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20
21 =head1 NAME
22
23 WebPAC - base class for WebPAC
24
25 =head1 DESCRIPTION
26
27 This module implements methods used by WebPAC.
28
29 =head1 METHODS
30
31 =head2 new
32
33 Create new instance of WebPAC using configuration specified by C<config_file>.
34
35 my $webpac = new WebPAC(
36 config_file => 'name.conf',
37 [code_page => 'ISO-8859-2',]
38 [low_mem => 1,]
39 );
40
41 Default C<code_page> is C<ISO-8859-2>.
42
43 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44
45 This method will also read configuration files
46 C<global.conf> (used by indexer and Web font-end)
47 and configuration file specified by C<config_file>
48 which describes databases to be indexed.
49
50 =cut
51
52 # mapping between data type and tag which specify
53 # format in XML file
54 my %type2tag = (
55 'isis' => 'isis',
56 # 'excel' => 'column',
57 # 'marc' => 'marc',
58 # 'feed' => 'feed'
59 );
60
61 sub new {
62 my $class = shift;
63 my $self = {@_};
64 bless($self, $class);
65
66 $self->{'start_t'} = time();
67
68 my $log_file = $self->{'log'} || "log.conf";
69 Log::Log4perl->init($log_file);
70
71 my $log = $self->_get_logger();
72
73 # fill in default values
74 # output codepage
75 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
76
77 #
78 # read global.conf
79 #
80 $log->debug("read 'global.conf'");
81
82 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
83
84 # read global config parametars
85 foreach my $var (qw(
86 dbi_dbd
87 dbi_dsn
88 dbi_user
89 dbi_passwd
90 show_progress
91 my_unac_filter
92 output_template
93 )) {
94 $self->{'global_config'}->{$var} = $config->val('global', $var);
95 }
96
97 #
98 # read indexer config file
99 #
100
101 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
102
103 # create UTF-8 convertor for import_xml files
104 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
105
106 # create Template toolkit instance
107 $self->{'tt'} = Template->new(
108 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
109 # FILTERS => {
110 # 'foo' => \&foo_filter,
111 # },
112 EVAL_PERL => 1,
113 );
114
115 # running with low_mem flag? well, use DBM::Deep then.
116 if ($self->{'low_mem'}) {
117 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118
119 my $db_file = "data.db";
120
121 if (-e $db_file) {
122 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123 $log->debug("removed '$db_file' from last run");
124 }
125
126 require DBM::Deep;
127
128 my $db = new DBM::Deep $db_file;
129
130 $log->logdie("DBM::Deep error: $!") unless ($db);
131
132 if ($db->error()) {
133 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134 } else {
135 $log->debug("using file '$db_file' for DBM::Deep");
136 }
137
138 $self->{'db'} = $db;
139 }
140
141 return $self;
142 }
143
144 =head2 open_isis
145
146 Open CDS/ISIS database using OpenIsis module and read all records to memory.
147
148 $webpac->open_isis(
149 filename => '/data/ISIS/ISIS',
150 code_page => '852',
151 limit_mfn => 500,
152 start_mfn => 6000,
153 lookup => [ ... ],
154 );
155
156 By default, ISIS code page is assumed to be C<852>.
157
158 If optional parametar C<start_mfn> is set, this will be first MFN to read
159 from database (so you can skip beginning of your database if you need to).
160
161 If optional parametar C<limit_mfn> is set, it will read just 500 records
162 from database in example above.
163
164 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
165 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
166 value in index.
167
168 lookup => [
169 { 'key' => 'd:v900', 'val' => 'v250^a' },
170 { 'eval' => '"v901^a" eq "Podruèje"',
171 'key' => 'pa:v561^4:v562^4:v461^1',
172 'val' => 'v900' },
173 ]
174
175 Returns number of last record read into memory (size of database, really).
176
177 =cut
178
179 sub open_isis {
180 my $self = shift;
181 my $arg = {@_};
182
183 my $log = $self->_get_logger();
184
185 $log->logcroak("need filename") if (! $arg->{'filename'});
186 my $code_page = $arg->{'code_page'} || '852';
187
188 # store data in object
189 $self->{'isis_filename'} = $arg->{'filename'};
190 $self->{'isis_code_page'} = $code_page;
191
192 use OpenIsis;
193
194 #$self->{'isis_code_page'} = $code_page;
195
196 # create Text::Iconv object
197 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
198
199 $log->info("reading ISIS database '",$arg->{'filename'},"'");
200 $log->debug("isis code page: $code_page");
201
202 my $isis_db = OpenIsis::open($arg->{'filename'});
203
204 my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
205 my $startmfn = 1;
206
207 if (my $s = $self->{'start_mfn'}) {
208 $log->info("skipping to MFN $s");
209 $startmfn = $s;
210 }
211
212 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
213
214 $log->info("processing ",($maxmfn-$startmfn)." records...");
215
216 # read database
217 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
218
219
220 $log->debug("mfn: $mfn\n");
221
222 my $rec;
223
224 # read record
225 my $row = OpenIsis::read( $isis_db, $mfn );
226 foreach my $k (keys %{$row}) {
227 if ($k ne "mfn") {
228 foreach my $l (@{$row->{$k}}) {
229 $l = $cp->convert($l);
230 # has subfields?
231 my $val;
232 if ($l =~ m/\^/) {
233 foreach my $t (split(/\^/,$l)) {
234 next if (! $t);
235 $val->{substr($t,0,1)} = substr($t,1);
236 }
237 } else {
238 $val = $l;
239 }
240
241 push @{$rec->{$k}}, $val;
242 }
243 } else {
244 push @{$rec->{'000'}}, $mfn;
245 }
246
247 }
248
249 $log->confess("record $mfn empty?") unless ($rec);
250
251 # store
252 if ($self->{'low_mem'}) {
253 $self->{'db'}->put($mfn, $rec);
254 } else {
255 $self->{'data'}->{$mfn} = $rec;
256 }
257
258 # create lookup
259 $self->create_lookup($rec, @{$arg->{'lookup'}});
260
261 $self->progress_bar($mfn,$maxmfn);
262
263 }
264
265 $self->{'current_mfn'} = $startmfn;
266 $self->{'last_pcnt'} = 0;
267
268 $log->debug("max mfn: $maxmfn");
269
270 # store max mfn and return it.
271 return $self->{'max_mfn'} = $maxmfn;
272 }
273
274 =head2 fetch_rec
275
276 Fetch next record from database. It will also display progress bar (once
277 it's implemented, that is).
278
279 my $rec = $webpac->fetch_rec;
280
281 =cut
282
283 sub fetch_rec {
284 my $self = shift;
285
286 my $log = $self->_get_logger();
287
288 my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");
289
290 if ($mfn > $self->{'max_mfn'}) {
291 $self->{'current_mfn'} = $self->{'max_mfn'};
292 $log->debug("at EOF");
293 return;
294 }
295
296 $self->progress_bar($mfn,$self->{'max_mfn'});
297
298 if ($self->{'low_mem'}) {
299 return $self->{'db'}->get($mfn);
300 } else {
301 return $self->{'data'}->{$mfn};
302 }
303 }
304
305 =head2 progress_bar
306
307 Draw progress bar on STDERR.
308
309 $webpac->progress_bar($current, $max);
310
311 =cut
312
313 sub progress_bar {
314 my $self = shift;
315
316 my ($curr,$max) = @_;
317
318 my $log = $self->_get_logger();
319
320 $log->logconfess("no current value!") if (! $curr);
321 $log->logconfess("no maximum value!") if (! $max);
322
323 if ($curr > $max) {
324 $max = $curr;
325 $log->debug("overflow to $curr");
326 }
327
328 $self->{'last_pcnt'} ||= 1;
329
330 my $p = int($curr * 100 / $max);
331
332 # reset on re-run
333 if ($p < $self->{'last_pcnt'}) {
334 $self->{'last_pcnt'} = $p;
335 $self->{'last_t'} = time();
336 $self->{'last_curr'} = 1;
337 }
338
339 if ($p != $self->{'last_pcnt'}) {
340
341 my $last_curr = $self->{'last_curr'} || $curr;
342 my $t = time();
343 my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
344 my $eta = ($max-$curr) / ($rate || 1);
345 printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
346 $self->{'last_pcnt'} = $p;
347 $self->{'last_t'} = time();
348 $self->{'last_curr'} = $curr;
349 }
350 print STDERR "\n" if ($p == 100);
351 }
352
353 =head2 fmt_time
354
355 Format time (in seconds) for display.
356
357 print $webpac->fmt_time(time());
358
359 This method is called by L<progress_bar> to display remaining time.
360
361 =cut
362
363 sub fmt_time {
364 my $self = shift;
365
366 my $t = shift || 0;
367 my $out = "";
368
369 my ($ss,$mm,$hh) = gmtime($t);
370 $out .= "${hh}h" if ($hh);
371 $out .= sprintf("%02d:%02d", $mm,$ss);
372 $out .= " " if ($hh == 0);
373 return $out;
374 }
375
376 =head2 open_import_xml
377
378 Read file from C<import_xml/> directory and parse it.
379
380 $webpac->open_import_xml(type => 'isis');
381
382 =cut
383
384 sub open_import_xml {
385 my $self = shift;
386
387 my $log = $self->_get_logger();
388
389 my $arg = {@_};
390 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
391
392 $self->{'type'} = $arg->{'type'};
393
394 my $type_base = $arg->{'type'};
395 $type_base =~ s/_.*$//g;
396
397 $self->{'tag'} = $type2tag{$type_base};
398
399 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
400
401 my $f = "./import_xml/".$self->{'type'}.".xml";
402 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
403
404 $log->info("reading '$f'");
405
406 $self->{'import_xml_file'} = $f;
407
408 $self->{'import_xml'} = XMLin($f,
409 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
410 );
411
412 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
413
414 }
415
416 =head2 create_lookup
417
418 Create lookup from record using lookup definition.
419
420 $self->create_lookup($rec, @lookups);
421
422 Called internally by C<open_*> methods.
423
424 =cut
425
426 sub create_lookup {
427 my $self = shift;
428
429 my $log = $self->_get_logger();
430
431 my $rec = shift || $log->logconfess("need record to create lookup");
432 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
433
434 foreach my $i (@_) {
435 $log->logconfess("need key") unless defined($i->{'key'});
436 $log->logconfess("need val") unless defined($i->{'val'});
437
438 if (defined($i->{'eval'})) {
439 # eval first, so we can skip fill_in for key and val
440 my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
441 if ($self->_eval($eval)) {
442 my $key = $self->fill_in($rec,$i->{'key'}) || next;
443 my @val = $self->fill_in($rec,$i->{'val'}) || next;
444 $log->debug("stored $key = ",sub { join(" | ",@val) });
445 push @{$self->{'lookup'}->{$key}}, @val;
446 }
447 } else {
448 my $key = $self->fill_in($rec,$i->{'key'}) || next;
449 my @val = $self->fill_in($rec,$i->{'val'}) || next;
450 $log->debug("stored $key = ",sub { join(" | ",@val) });
451 push @{$self->{'lookup'}->{$key}}, @val;
452 }
453 }
454 }
455
456 =head2 get_data
457
458 Returns value from record.
459
460 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
461
462 Arguments are:
463 record reference C<$rec>,
464 field C<$f>,
465 optional subfiled C<$sf>,
466 index for repeatable values C<$i>.
467
468 Optinal variable C<$found> will be incremeted if there
469 is field.
470
471 Returns value or empty string.
472
473 =cut
474
475 sub get_data {
476 my $self = shift;
477
478 my ($rec,$f,$sf,$i,$found) = @_;
479
480 if ($$rec->{$f}) {
481 return '' if (! $$rec->{$f}->[$i]);
482 no strict 'refs';
483 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
484 $$found++ if (defined($$found));
485 return $$rec->{$f}->[$i]->{$sf};
486 } elsif ($$rec->{$f}->[$i]) {
487 $$found++ if (defined($$found));
488 # it still might have subfield, just
489 # not specified, so we'll dump all
490 if ($$rec->{$f}->[$i] =~ /HASH/o) {
491 my $out;
492 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
493 $out .= $$rec->{$f}->[$i]->{$k}." ";
494 }
495 return $out;
496 } else {
497 return $$rec->{$f}->[$i];
498 }
499 }
500 } else {
501 return '';
502 }
503 }
504
505 =head2 fill_in
506
507 Workhourse of all: takes record from in-memory structure of database and
508 strings with placeholders and returns string or array of with substituted
509 values from record.
510
511 my $text = $webpac->fill_in($rec,'v250^a');
512
513 Optional argument is ordinal number for repeatable fields. By default,
514 it's assume to be first repeatable field (fields are perl array, so first
515 element is 0).
516 Following example will read second value from repeatable field.
517
518 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
519
520 This function B<does not> perform parsing of format to inteligenty skip
521 delimiters before fields which aren't used.
522
523 This method will automatically decode UTF-8 string to local code page
524 if needed.
525
526 =cut
527
528 sub fill_in {
529 my $self = shift;
530
531 my $log = $self->_get_logger();
532
533 my $rec = shift || $log->logconfess("need data record");
534 my $format = shift || $log->logconfess("need format to parse");
535 # iteration (for repeatable fields)
536 my $i = shift || 0;
537
538 # FIXME remove for speedup?
539 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
540
541 if (utf8::is_utf8($format)) {
542 $format = $self->_x($format);
543 }
544
545 my $found = 0;
546
547 my $eval_code;
548 # remove eval{...} from beginning
549 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
550
551 # do actual replacement of placeholders
552 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
553
554 if ($found) {
555 $log->debug("format: $format");
556 if ($eval_code) {
557 my $eval = $self->fill_in($rec,$eval_code,$i);
558 return if (! $self->_eval($eval));
559 }
560 # do we have lookups?
561 if ($format =~ /$LOOKUP_REGEX/o) {
562 $log->debug("format '$format' has lookup");
563 return $self->lookup($format);
564 } else {
565 return $format;
566 }
567 } else {
568 return;
569 }
570 }
571
572 =head2 lookup
573
574 Perform lookups on format supplied to it.
575
576 my $text = $self->lookup('[v900]');
577
578 Lookups can be nested (like C<[d:[a:[v900]]]>).
579
580 =cut
581
582 sub lookup {
583 my $self = shift;
584
585 my $log = $self->_get_logger();
586
587 my $tmp = shift || $log->logconfess("need format");
588
589 if ($tmp =~ /$LOOKUP_REGEX/o) {
590 my @in = ( $tmp );
591
592 $log->debug("lookup for: ",$tmp);
593
594 my @out;
595 while (my $f = shift @in) {
596 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
597 my $k = $1;
598 if ($self->{'lookup'}->{$k}) {
599 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
600 my $tmp2 = $f;
601 $tmp2 =~ s/lookup{$k}/$nv/g;
602 push @in, $tmp2;
603 }
604 } else {
605 undef $f;
606 }
607 } elsif ($f) {
608 push @out, $f;
609 }
610 }
611 $log->logconfess("return is array and it's not expected!") unless wantarray;
612 return @out;
613 } else {
614 return $tmp;
615 }
616 }
617
618 =head2 parse
619
620 Perform smart parsing of string, skipping delimiters for fields which aren't
621 defined. It can also eval code in format starting with C<eval{...}> and
622 return output or nothing depending on eval code.
623
624 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
625
626 =cut
627
628 sub parse {
629 my $self = shift;
630
631 my ($rec, $format_utf8, $i) = @_;
632
633 return if (! $format_utf8);
634
635 my $log = $self->_get_logger();
636
637 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
638 $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
639
640 $i = 0 if (! $i);
641
642 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
643
644 my @out;
645
646 $log->debug("format: $format");
647
648 my $eval_code;
649 # remove eval{...} from beginning
650 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
651
652 my $prefix;
653 my $all_found=0;
654
655 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
656
657 my $del = $1 || '';
658 $prefix ||= $del if ($all_found == 0);
659
660 my $found = 0;
661 my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
662
663 if ($found) {
664 push @out, $del;
665 push @out, $tmp;
666 $all_found += $found;
667 }
668 }
669
670 return if (! $all_found);
671
672 my $out = join('',@out);
673
674 if ($out) {
675 # add rest of format (suffix)
676 $out .= $format;
677
678 # add prefix if not there
679 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
680
681 $log->debug("result: $out");
682 }
683
684 if ($eval_code) {
685 my $eval = $self->fill_in($rec,$eval_code,$i);
686 $log->debug("about to eval{",$eval,"} format: $out");
687 return if (! $self->_eval($eval));
688 }
689
690 return $out;
691 }
692
693 =head2 parse_to_arr
694
695 Similar to C<parse>, but returns array of all repeatable fields
696
697 my @arr = $webpac->parse_to_arr($rec,'v250^a');
698
699 =cut
700
701 sub parse_to_arr {
702 my $self = shift;
703
704 my ($rec, $format_utf8) = @_;
705
706 my $log = $self->_get_logger();
707
708 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
709 return if (! $format_utf8);
710
711 my $i = 0;
712 my @arr;
713
714 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
715 push @arr, $v;
716 }
717
718 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
719
720 return @arr;
721 }
722
723 =head2 fill_in_to_arr
724
725 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
726 for fields which have lookups, so they shouldn't be parsed but rather
727 C<fill_id>ed.
728
729 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
730
731 =cut
732
733 sub fill_in_to_arr {
734 my $self = shift;
735
736 my ($rec, $format_utf8) = @_;
737
738 my $log = $self->_get_logger();
739
740 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
741 return if (! $format_utf8);
742
743 my $i = 0;
744 my @arr;
745
746 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
747 push @arr, @v;
748 }
749
750 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
751
752 return @arr;
753 }
754
755
756 =head2 data_structure
757
758 Create in-memory data structure which represents layout from C<import_xml>.
759 It is used later to produce output.
760
761 my @ds = $webpac->data_structure($rec);
762
763 This method will also set C<$webpac->{'currnet_filename'}> if there is
764 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
765 <headline> tag.
766
767 =cut
768
769 sub data_structure {
770 my $self = shift;
771
772 my $log = $self->_get_logger();
773
774 my $rec = shift;
775 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
776
777 undef $self->{'currnet_filename'};
778 undef $self->{'headline'};
779
780 my @sorted_tags;
781 if ($self->{tags_by_order}) {
782 @sorted_tags = @{$self->{tags_by_order}};
783 } else {
784 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
785 $self->{tags_by_order} = \@sorted_tags;
786 }
787
788 my @ds;
789
790 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
791
792 foreach my $field (@sorted_tags) {
793
794 my $row;
795
796 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
797
798 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
799 my $format = $tag->{'value'} || $tag->{'content'};
800
801 $log->debug("format: $format");
802
803 my @v;
804 if ($format =~ /$LOOKUP_REGEX/o) {
805 @v = $self->fill_in_to_arr($rec,$format);
806 } else {
807 @v = $self->parse_to_arr($rec,$format);
808 }
809 next if (! @v);
810
811 # use format?
812 if ($tag->{'format_name'}) {
813 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
814 }
815
816 if ($field eq 'filename') {
817 $self->{'current_filename'} = join('',@v);
818 $log->debug("filename: ",$self->{'current_filename'});
819 } elsif ($field eq 'headline') {
820 $self->{'headline'} .= join('',@v);
821 $log->debug("headline: ",$self->{'headline'});
822 next; # don't return headline in data_structure!
823 }
824
825 # does tag have type?
826 if ($tag->{'type'}) {
827 push @{$row->{$tag->{'type'}}}, @v;
828 } else {
829 push @{$row->{'display'}}, @v;
830 push @{$row->{'swish'}}, @v;
831 }
832
833
834 }
835
836 if ($row) {
837 $row->{'tag'} = $field;
838
839 # TODO: name_sigular, name_plural
840 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
841 $row->{'name'} = $name ? $self->_x($name) : $field;
842
843 push @ds, $row;
844
845 $log->debug("row $field: ",sub { Dumper($row) });
846 }
847
848 }
849
850 return @ds;
851
852 }
853
854 =head2 output
855
856 Create output from in-memory data structure using Template Toolkit template.
857
858 my $text = $webpac->output( template => 'text.tt', data => @ds );
859
860 =cut
861
862 sub output {
863 my $self = shift;
864
865 my $args = {@_};
866
867 my $log = $self->_get_logger();
868
869 $log->logconfess("need template name") if (! $args->{'template'});
870 $log->logconfess("need data array") if (! $args->{'data'});
871
872 my $out;
873
874 $self->{'tt'}->process(
875 $args->{'template'},
876 $args,
877 \$out
878 ) || confess $self->{'tt'}->error();
879
880 return $out;
881 }
882
883 =head2 output_file
884
885 Create output from in-memory data structure using Template Toolkit template
886 to a file.
887
888 $webpac->output_file(
889 file => 'out.txt',
890 template => 'text.tt',
891 data => @ds
892 );
893
894 =cut
895
896 sub output_file {
897 my $self = shift;
898
899 my $args = {@_};
900
901 my $log = $self->_get_logger();
902
903 my $file = $args->{'file'} || $log->logconfess("need file name");
904
905 $log->debug("creating file ",$file);
906
907 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
908 print $fh $self->output(
909 template => $args->{'template'},
910 data => $args->{'data'},
911 ) || $log->logdie("print: $!");
912 close($fh) || $log->logdie("close: $!");
913 }
914
915 =head2 apply_format
916
917 Apply format specified in tag with C<format_name="name"> and
918 C<format_delimiter=";;">.
919
920 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
921
922 Formats can contain C<lookup{...}> if you need them.
923
924 =cut
925
926 sub apply_format {
927 my $self = shift;
928
929 my ($name,$delimiter,$data) = @_;
930
931 my $log = $self->_get_logger();
932
933 if (! $self->{'import_xml'}->{'format'}->{$name}) {
934 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
935 return $data;
936 }
937
938 $log->warn("no delimiter for format $name") if (! $delimiter);
939
940 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
941
942 my @data = split(/\Q$delimiter\E/, $data);
943
944 my $out = sprintf($format, @data);
945 $log->debug("using format $name [$format] on $data to produce: $out");
946
947 if ($out =~ m/$LOOKUP_REGEX/o) {
948 return $self->lookup($out);
949 } else {
950 return $out;
951 }
952
953 }
954
955
956 #
957 #
958 #
959
960 =head1 INTERNAL METHODS
961
962 Here is a quick list of internal methods, mostly useful to turn debugging
963 on them (see L<LOGGING> below for explanation).
964
965 =cut
966
967 =head2 _eval
968
969 Internal function to eval code without C<strict 'subs'>.
970
971 =cut
972
973 sub _eval {
974 my $self = shift;
975
976 my $code = shift || return;
977
978 my $log = $self->_get_logger();
979
980 no strict 'subs';
981 my $ret = eval $code;
982 if ($@) {
983 $log->error("problem with eval code [$code]: $@");
984 }
985
986 $log->debug("eval: ",$code," [",$ret,"]");
987
988 return $ret || 0;
989 }
990
991 =head2 _sort_by_order
992
993 Sort xml tags data structure accoding to C<order=""> attribute.
994
995 =cut
996
997 sub _sort_by_order {
998 my $self = shift;
999
1000 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1001 $self->{'import_xml'}->{'indexer'}->{$a};
1002 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1003 $self->{'import_xml'}->{'indexer'}->{$b};
1004
1005 return $va <=> $vb;
1006 }
1007
1008 =head2 _get_logger
1009
1010 Get C<Log::Log4perl> object with a twist: domains are defined for each
1011 method
1012
1013 my $log = $webpac->_get_logger();
1014
1015 =cut
1016
1017 sub _get_logger {
1018 my $self = shift;
1019
1020 my $name = (caller(1))[3] || caller;
1021 return get_logger($name);
1022 }
1023
1024 =head2 _x
1025
1026 Convert string from UTF-8 to code page defined in C<import_xml>.
1027
1028 my $text = $webpac->_x('utf8 text');
1029
1030 =cut
1031
1032 sub _x {
1033 my $self = shift;
1034 my $utf8 = shift || return;
1035
1036 return $self->{'utf2cp'}->convert($utf8) ||
1037 $self->_get_logger()->logwarn("can't convert '$utf8'");
1038 }
1039
1040 #
1041 #
1042 #
1043
1044 =head1 LOGGING
1045
1046 Logging in WebPAC is performed by L<Log::Log4perl> with config file
1047 C<log.conf>.
1048
1049 Methods defined above have different levels of logging, so
1050 it's descriptions will be useful to turn (mostry B<debug> logging) on
1051 or off to see why WabPAC isn't perforing as you expect it (it might even
1052 be a bug!).
1053
1054 B<This is different from normal Log4perl behaviour>. To repeat, you can
1055 also use method names, and not only classes (which are just few)
1056 to filter logging.
1057
1058
1059 =head1 MEMORY USAGE
1060
1061 C<low_mem> options is double-edged sword. If enabled, WebPAC
1062 will run on memory constraint machines (which doesn't have enough
1063 physical RAM to create memory structure for whole source database).
1064
1065 If your machine has 512Mb or more of RAM and database is around 10000 records,
1066 memory shouldn't be an issue. If you don't have enough physical RAM, you
1067 might consider using virtual memory (if your operating system is handling it
1068 well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1069 parsed structure of ISIS database (this is what C<low_mem> option does).
1070
1071 Hitting swap at end of reading source database is probably o.k. However,
1072 hitting swap before 90% will dramatically decrease performance and you will
1073 be better off with C<low_mem> and using rest of availble memory for
1074 operating system disk cache (Linux is particuallary good about this).
1075 However, every access to database record will require disk access, so
1076 generation phase will be slower 10-100 times.
1077
1078 Parsed structures are essential - you just have option to trade RAM memory
1079 (which is fast) for disk space (which is slow). Be sure to have planty of
1080 disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1081
1082 However, when WebPAC is running on desktop machines (or laptops :-), it's
1083 highly undesireable for system to start swapping. Using C<low_mem> option can
1084 reduce WecPAC memory usage to around 64Mb for same database with lookup
1085 fields and sorted indexes which stay in RAM. Performance will suffer, but
1086 memory usage will really be minimal. It might be also more confortable to
1087 run WebPAC reniced on those machines.
1088
1089 =cut
1090
1091 1;

  ViewVC Help
Powered by ViewVC 1.1.26