/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 459 - (show annotations)
Tue Sep 21 19:08:11 2004 UTC (16 years, 2 months ago) by dpavlin
File size: 26360 byte(s)
fix warning

1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10 use Template;
11 use Log::Log4perl qw(get_logger :levels);
12 use Time::HiRes qw(time);
13
14 use Data::Dumper;
15
16 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17 #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18 my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19 my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20
21 =head1 NAME
22
23 WebPAC - base class for WebPAC
24
25 =head1 DESCRIPTION
26
27 This module implements methods used by WebPAC.
28
29 =head1 METHODS
30
31 =head2 new
32
33 Create new instance of WebPAC using configuration specified by C<config_file>.
34
35 my $webpac = new WebPAC(
36 config_file => 'name.conf',
37 code_page => 'ISO-8859-2',
38 low_mem => 1,
39 );
40
41 Default C<code_page> is C<ISO-8859-2>.
42
43 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44
45 This method will also read configuration files
46 C<global.conf> (used by indexer and Web font-end)
47 and configuration file specified by C<config_file>
48 which describes databases to be indexed.
49
50 =cut
51
52 # mapping between data type and tag which specify
53 # format in XML file
54 my %type2tag = (
55 'isis' => 'isis',
56 # 'excel' => 'column',
57 # 'marc' => 'marc',
58 # 'feed' => 'feed'
59 );
60
61 sub new {
62 my $class = shift;
63 my $self = {@_};
64 bless($self, $class);
65
66 $self->{'start_t'} = time();
67
68 my $log_file = $self->{'log'} || "log.conf";
69 Log::Log4perl->init($log_file);
70
71 my $log = $self->_get_logger();
72
73 # fill in default values
74 # output codepage
75 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
76
77 #
78 # read global.conf
79 #
80 $log->debug("read 'global.conf'");
81
82 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
83
84 # read global config parametars
85 foreach my $var (qw(
86 dbi_dbd
87 dbi_dsn
88 dbi_user
89 dbi_passwd
90 show_progress
91 my_unac_filter
92 output_template
93 )) {
94 $self->{'global_config'}->{$var} = $config->val('global', $var);
95 }
96
97 #
98 # read indexer config file
99 #
100
101 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
102
103 # create UTF-8 convertor for import_xml files
104 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
105
106 # create Template toolkit instance
107 $self->{'tt'} = Template->new(
108 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
109 # FILTERS => {
110 # 'foo' => \&foo_filter,
111 # },
112 EVAL_PERL => 1,
113 );
114
115 # running with low_mem flag? well, use DBM::Deep then.
116 if ($self->{'low_mem'}) {
117 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118
119 my $db_file = "data.db";
120
121 if (-e $db_file) {
122 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123 $log->debug("removed '$db_file' from last run");
124 }
125
126 require DBM::Deep;
127
128 my $db = new DBM::Deep $db_file;
129
130 $log->logdie("DBM::Deep error: $!") unless ($db);
131
132 if ($db->error()) {
133 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134 } else {
135 $log->debug("using file '$db_file' for DBM::Deep");
136 }
137
138 $self->{'db'} = $db;
139 }
140
141 return $self;
142 }
143
144 =head2 open_isis
145
146 Open CDS/ISIS database using OpenIsis module and read all records to memory.
147
148 $webpac->open_isis(
149 filename => '/data/ISIS/ISIS',
150 code_page => '852',
151 limit_mfn => 500,
152 start_mfn => 6000,
153 lookup => [ ... ],
154 );
155
156 By default, ISIS code page is assumed to be C<852>.
157
158 If optional parametar C<start_mfn> is set, this will be first MFN to read
159 from database (so you can skip beginning of your database if you need to).
160
161 If optional parametar C<limit_mfn> is set, it will read just 500 records
162 from database in example above.
163
164 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
165 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
166 value in index.
167
168 lookup => [
169 { 'key' => 'd:v900', 'val' => 'v250^a' },
170 { 'eval' => '"v901^a" eq "Podruèje"',
171 'key' => 'pa:v561^4:v562^4:v461^1',
172 'val' => 'v900' },
173 ]
174
175 Returns number of last record read into memory (size of database, really).
176
177 =cut
178
179 sub open_isis {
180 my $self = shift;
181 my $arg = {@_};
182
183 my $log = $self->_get_logger();
184
185 $log->logcroak("need filename") if (! $arg->{'filename'});
186 my $code_page = $arg->{'code_page'} || '852';
187
188 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
189
190 # store data in object
191 $self->{'isis_filename'} = $arg->{'filename'};
192 $self->{'isis_code_page'} = $code_page;
193
194 use OpenIsis;
195
196 #$self->{'isis_code_page'} = $code_page;
197
198 # create Text::Iconv object
199 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
200
201 $log->info("reading ISIS database '",$arg->{'filename'},"'");
202 $log->debug("isis code page: $code_page");
203
204 my $isis_db = OpenIsis::open($arg->{'filename'});
205
206 my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
207 my $startmfn = 1;
208
209 if (my $s = $self->{'start_mfn'}) {
210 $log->info("skipping to MFN $s");
211 $startmfn = $s;
212 } else {
213 $self->{'start_mfn'} = $startmfn;
214 }
215
216 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
217
218 $log->info("processing ",($maxmfn-$startmfn)." records...");
219
220 # read database
221 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
222
223
224 $log->debug("mfn: $mfn\n");
225
226 my $rec;
227
228 # read record
229 my $row = OpenIsis::read( $isis_db, $mfn );
230 foreach my $k (keys %{$row}) {
231 if ($k ne "mfn") {
232 foreach my $l (@{$row->{$k}}) {
233 $l = $cp->convert($l);
234 # has subfields?
235 my $val;
236 if ($l =~ m/\^/) {
237 foreach my $t (split(/\^/,$l)) {
238 next if (! $t);
239 $val->{substr($t,0,1)} = substr($t,1);
240 }
241 } else {
242 $val = $l;
243 }
244
245 push @{$rec->{$k}}, $val;
246 }
247 } else {
248 push @{$rec->{'000'}}, $mfn;
249 }
250
251 }
252
253 $log->confess("record $mfn empty?") unless ($rec);
254
255 # store
256 if ($self->{'low_mem'}) {
257 $self->{'db'}->put($mfn, $rec);
258 } else {
259 $self->{'data'}->{$mfn} = $rec;
260 }
261
262 # create lookup
263 $self->create_lookup($rec, @{$arg->{'lookup'}});
264
265 $self->progress_bar($mfn,$maxmfn);
266
267 }
268
269 $self->{'current_mfn'} = -1;
270 $self->{'last_pcnt'} = 0;
271
272 $log->debug("max mfn: $maxmfn");
273
274 # store max mfn and return it.
275 return $self->{'max_mfn'} = $maxmfn;
276 }
277
278 =head2 fetch_rec
279
280 Fetch next record from database. It will also display progress bar (once
281 it's implemented, that is).
282
283 my $rec = $webpac->fetch_rec;
284
285 =cut
286
287 sub fetch_rec {
288 my $self = shift;
289
290 my $log = $self->_get_logger();
291
292 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
293
294 if ($self->{'current_mfn'} == -1) {
295 $self->{'current_mfn'} = $self->{'start_mfn'};
296 } else {
297 $self->{'current_mfn'}++;
298 }
299
300 my $mfn = $self->{'current_mfn'};
301
302 if ($mfn > $self->{'max_mfn'}) {
303 $self->{'current_mfn'} = $self->{'max_mfn'};
304 $log->debug("at EOF");
305 return;
306 }
307
308 $self->progress_bar($mfn,$self->{'max_mfn'});
309
310 if ($self->{'low_mem'}) {
311 return $self->{'db'}->get($mfn);
312 } else {
313 return $self->{'data'}->{$mfn};
314 }
315 }
316
317 =head2 mfn
318
319 Returns current record number (MFN).
320
321 print $webpac->mfn;
322
323 =cut
324
325 sub mfn {
326 my $self = shift;
327 return $self->{'current_mfn'};
328 }
329
330 =head2 progress_bar
331
332 Draw progress bar on STDERR.
333
334 $webpac->progress_bar($current, $max);
335
336 =cut
337
338 sub progress_bar {
339 my $self = shift;
340
341 my ($curr,$max) = @_;
342
343 my $log = $self->_get_logger();
344
345 $log->logconfess("no current value!") if (! $curr);
346 $log->logconfess("no maximum value!") if (! $max);
347
348 if ($curr > $max) {
349 $max = $curr;
350 $log->debug("overflow to $curr");
351 }
352
353 $self->{'last_pcnt'} ||= 1;
354
355 my $p = int($curr * 100 / $max) || 1;
356
357 # reset on re-run
358 if ($p < $self->{'last_pcnt'}) {
359 $self->{'last_pcnt'} = $p;
360 $self->{'last_t'} = time();
361 $self->{'last_curr'} = undef;
362 }
363
364 $self->{'last_t'} ||= time();
365
366 if ($p != $self->{'last_pcnt'}) {
367
368 my $last_curr = $self->{'last_curr'} || $curr;
369 my $t = time();
370 my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
371 my $eta = ($max-$curr) / ($rate || 1);
372 printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
373 $self->{'last_pcnt'} = $p;
374 $self->{'last_t'} = time();
375 $self->{'last_curr'} = $curr;
376 }
377 print STDERR "\n" if ($p == 100);
378 }
379
380 =head2 fmt_time
381
382 Format time (in seconds) for display.
383
384 print $webpac->fmt_time(time());
385
386 This method is called by L<progress_bar> to display remaining time.
387
388 =cut
389
390 sub fmt_time {
391 my $self = shift;
392
393 my $t = shift || 0;
394 my $out = "";
395
396 my ($ss,$mm,$hh) = gmtime($t);
397 $out .= "${hh}h" if ($hh);
398 $out .= sprintf("%02d:%02d", $mm,$ss);
399 $out .= " " if ($hh == 0);
400 return $out;
401 }
402
403 =head2 open_import_xml
404
405 Read file from C<import_xml/> directory and parse it.
406
407 $webpac->open_import_xml(type => 'isis');
408
409 =cut
410
411 sub open_import_xml {
412 my $self = shift;
413
414 my $log = $self->_get_logger();
415
416 my $arg = {@_};
417 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
418
419 $self->{'type'} = $arg->{'type'};
420
421 my $type_base = $arg->{'type'};
422 $type_base =~ s/_.*$//g;
423
424 $self->{'tag'} = $type2tag{$type_base};
425
426 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
427
428 my $f = "./import_xml/".$self->{'type'}.".xml";
429 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
430
431 $log->info("reading '$f'");
432
433 $self->{'import_xml_file'} = $f;
434
435 $self->{'import_xml'} = XMLin($f,
436 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
437 );
438
439 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
440
441 }
442
443 =head2 create_lookup
444
445 Create lookup from record using lookup definition.
446
447 $self->create_lookup($rec, @lookups);
448
449 Called internally by C<open_*> methods.
450
451 =cut
452
453 sub create_lookup {
454 my $self = shift;
455
456 my $log = $self->_get_logger();
457
458 my $rec = shift || $log->logconfess("need record to create lookup");
459 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
460
461 foreach my $i (@_) {
462 $log->logconfess("need key") unless defined($i->{'key'});
463 $log->logconfess("need val") unless defined($i->{'val'});
464
465 if (defined($i->{'eval'})) {
466 # eval first, so we can skip fill_in for key and val
467 my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
468 if ($self->_eval($eval)) {
469 my $key = $self->fill_in($rec,$i->{'key'}) || next;
470 my @val = $self->fill_in($rec,$i->{'val'}) || next;
471 $log->debug("stored $key = ",sub { join(" | ",@val) });
472 push @{$self->{'lookup'}->{$key}}, @val;
473 }
474 } else {
475 my $key = $self->fill_in($rec,$i->{'key'}) || next;
476 my @val = $self->fill_in($rec,$i->{'val'}) || next;
477 $log->debug("stored $key = ",sub { join(" | ",@val) });
478 push @{$self->{'lookup'}->{$key}}, @val;
479 }
480 }
481 }
482
483 =head2 get_data
484
485 Returns value from record.
486
487 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
488
489 Arguments are:
490 record reference C<$rec>,
491 field C<$f>,
492 optional subfiled C<$sf>,
493 index for repeatable values C<$i>.
494
495 Optinal variable C<$found> will be incremeted if there
496 is field.
497
498 Returns value or empty string.
499
500 =cut
501
502 sub get_data {
503 my $self = shift;
504
505 my ($rec,$f,$sf,$i,$found) = @_;
506
507 if ($$rec->{$f}) {
508 return '' if (! $$rec->{$f}->[$i]);
509 no strict 'refs';
510 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
511 $$found++ if (defined($$found));
512 return $$rec->{$f}->[$i]->{$sf};
513 } elsif ($$rec->{$f}->[$i]) {
514 $$found++ if (defined($$found));
515 # it still might have subfield, just
516 # not specified, so we'll dump all
517 if ($$rec->{$f}->[$i] =~ /HASH/o) {
518 my $out;
519 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
520 $out .= $$rec->{$f}->[$i]->{$k}." ";
521 }
522 return $out;
523 } else {
524 return $$rec->{$f}->[$i];
525 }
526 }
527 } else {
528 return '';
529 }
530 }
531
532 =head2 fill_in
533
534 Workhourse of all: takes record from in-memory structure of database and
535 strings with placeholders and returns string or array of with substituted
536 values from record.
537
538 my $text = $webpac->fill_in($rec,'v250^a');
539
540 Optional argument is ordinal number for repeatable fields. By default,
541 it's assume to be first repeatable field (fields are perl array, so first
542 element is 0).
543 Following example will read second value from repeatable field.
544
545 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
546
547 This function B<does not> perform parsing of format to inteligenty skip
548 delimiters before fields which aren't used.
549
550 This method will automatically decode UTF-8 string to local code page
551 if needed.
552
553 =cut
554
555 sub fill_in {
556 my $self = shift;
557
558 my $log = $self->_get_logger();
559
560 my $rec = shift || $log->logconfess("need data record");
561 my $format = shift || $log->logconfess("need format to parse");
562 # iteration (for repeatable fields)
563 my $i = shift || 0;
564
565 # FIXME remove for speedup?
566 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
567
568 if (utf8::is_utf8($format)) {
569 $format = $self->_x($format);
570 }
571
572 my $found = 0;
573
574 my $eval_code;
575 # remove eval{...} from beginning
576 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
577
578 # do actual replacement of placeholders
579 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
580
581 if ($found) {
582 $log->debug("format: $format");
583 if ($eval_code) {
584 my $eval = $self->fill_in($rec,$eval_code,$i);
585 return if (! $self->_eval($eval));
586 }
587 # do we have lookups?
588 if ($format =~ /$LOOKUP_REGEX/o) {
589 $log->debug("format '$format' has lookup");
590 return $self->lookup($format);
591 } else {
592 return $format;
593 }
594 } else {
595 return;
596 }
597 }
598
599 =head2 lookup
600
601 Perform lookups on format supplied to it.
602
603 my $text = $self->lookup('[v900]');
604
605 Lookups can be nested (like C<[d:[a:[v900]]]>).
606
607 =cut
608
609 sub lookup {
610 my $self = shift;
611
612 my $log = $self->_get_logger();
613
614 my $tmp = shift || $log->logconfess("need format");
615
616 if ($tmp =~ /$LOOKUP_REGEX/o) {
617 my @in = ( $tmp );
618
619 $log->debug("lookup for: ",$tmp);
620
621 my @out;
622 while (my $f = shift @in) {
623 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
624 my $k = $1;
625 if ($self->{'lookup'}->{$k}) {
626 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
627 my $tmp2 = $f;
628 $tmp2 =~ s/lookup{$k}/$nv/g;
629 push @in, $tmp2;
630 }
631 } else {
632 undef $f;
633 }
634 } elsif ($f) {
635 push @out, $f;
636 }
637 }
638 $log->logconfess("return is array and it's not expected!") unless wantarray;
639 return @out;
640 } else {
641 return $tmp;
642 }
643 }
644
645 =head2 parse
646
647 Perform smart parsing of string, skipping delimiters for fields which aren't
648 defined. It can also eval code in format starting with C<eval{...}> and
649 return output or nothing depending on eval code.
650
651 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
652
653 =cut
654
655 sub parse {
656 my $self = shift;
657
658 my ($rec, $format_utf8, $i) = @_;
659
660 return if (! $format_utf8);
661
662 my $log = $self->_get_logger();
663
664 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
665 $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
666
667 $i = 0 if (! $i);
668
669 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
670
671 my @out;
672
673 $log->debug("format: $format");
674
675 my $eval_code;
676 # remove eval{...} from beginning
677 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
678
679 my $prefix;
680 my $all_found=0;
681
682 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
683
684 my $del = $1 || '';
685 $prefix ||= $del if ($all_found == 0);
686
687 my $found = 0;
688 my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
689
690 if ($found) {
691 push @out, $del;
692 push @out, $tmp;
693 $all_found += $found;
694 }
695 }
696
697 return if (! $all_found);
698
699 my $out = join('',@out);
700
701 if ($out) {
702 # add rest of format (suffix)
703 $out .= $format;
704
705 # add prefix if not there
706 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
707
708 $log->debug("result: $out");
709 }
710
711 if ($eval_code) {
712 my $eval = $self->fill_in($rec,$eval_code,$i);
713 $log->debug("about to eval{",$eval,"} format: $out");
714 return if (! $self->_eval($eval));
715 }
716
717 return $out;
718 }
719
720 =head2 parse_to_arr
721
722 Similar to C<parse>, but returns array of all repeatable fields
723
724 my @arr = $webpac->parse_to_arr($rec,'v250^a');
725
726 =cut
727
728 sub parse_to_arr {
729 my $self = shift;
730
731 my ($rec, $format_utf8) = @_;
732
733 my $log = $self->_get_logger();
734
735 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
736 return if (! $format_utf8);
737
738 my $i = 0;
739 my @arr;
740
741 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
742 push @arr, $v;
743 }
744
745 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
746
747 return @arr;
748 }
749
750 =head2 fill_in_to_arr
751
752 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
753 for fields which have lookups, so they shouldn't be parsed but rather
754 C<fill_id>ed.
755
756 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
757
758 =cut
759
760 sub fill_in_to_arr {
761 my $self = shift;
762
763 my ($rec, $format_utf8) = @_;
764
765 my $log = $self->_get_logger();
766
767 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
768 return if (! $format_utf8);
769
770 my $i = 0;
771 my @arr;
772
773 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
774 push @arr, @v;
775 }
776
777 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
778
779 return @arr;
780 }
781
782 =head2 sort_arr
783
784 Sort array ignoring case and html in data
785
786 my @sorted = $webpac->sort_arr(@unsorted);
787
788 =cut
789
790 sub sort_arr {
791 my $self = shift;
792
793 my $log = $self->_get_logger();
794
795 # FIXME add Schwartzian Transformation?
796
797 my @sorted = sort {
798 $a =~ s#<[^>]+/*>##;
799 $b =~ s#<[^>]+/*>##;
800 lc($b) cmp lc($a)
801 } @_;
802 $log->debug("sorted values: ",sub { join(", ",@sorted) });
803
804 return @sorted;
805 }
806
807
808 =head2 data_structure
809
810 Create in-memory data structure which represents layout from C<import_xml>.
811 It is used later to produce output.
812
813 my @ds = $webpac->data_structure($rec);
814
815 This method will also set C<$webpac->{'currnet_filename'}> if there is
816 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
817 <headline> tag.
818
819 =cut
820
821 sub data_structure {
822 my $self = shift;
823
824 my $log = $self->_get_logger();
825
826 my $rec = shift;
827 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
828
829 undef $self->{'currnet_filename'};
830 undef $self->{'headline'};
831
832 my @sorted_tags;
833 if ($self->{tags_by_order}) {
834 @sorted_tags = @{$self->{tags_by_order}};
835 } else {
836 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
837 $self->{tags_by_order} = \@sorted_tags;
838 }
839
840 my @ds;
841
842 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
843
844 foreach my $field (@sorted_tags) {
845
846 my $row;
847
848 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
849
850 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
851 my $format = $tag->{'value'} || $tag->{'content'};
852
853 $log->debug("format: $format");
854
855 my @v;
856 if ($format =~ /$LOOKUP_REGEX/o) {
857 @v = $self->fill_in_to_arr($rec,$format);
858 } else {
859 @v = $self->parse_to_arr($rec,$format);
860 }
861 next if (! @v);
862
863 if ($tag->{'sort'}) {
864 @v = $self->sort_arr(@v);
865 $log->warn("sort within tag is usually not what you want!");
866 }
867
868 # use format?
869 if ($tag->{'format_name'}) {
870 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
871 }
872
873 if ($field eq 'filename') {
874 $self->{'current_filename'} = join('',@v);
875 $log->debug("filename: ",$self->{'current_filename'});
876 } elsif ($field eq 'headline') {
877 $self->{'headline'} .= join('',@v);
878 $log->debug("headline: ",$self->{'headline'});
879 next; # don't return headline in data_structure!
880 }
881
882 # delimiter will join repeatable fields
883 if ($tag->{'delimiter'}) {
884 @v = ( join($tag->{'delimiter'}, @v) );
885 }
886
887 # default types
888 my @types = qw(display swish);
889 # override by type attribute
890 @types = ( $tag->{'type'} ) if ($tag->{'type'});
891
892 foreach my $type (@types) {
893 # append to previous line?
894 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
895 if ($tag->{'append'}) {
896
897 # I will delimit appended part with
898 # delimiter (or ,)
899 my $d = $tag->{'delimiter'};
900 # default delimiter
901 $d ||= ", ";
902
903 my $last = pop @{$row->{$type}};
904 $d = "" if (! $last);
905 $last .= $d . join($d, @v);
906 push @{$row->{$type}}, $last;
907
908 } else {
909 push @{$row->{$type}}, @v;
910 }
911 }
912
913
914 }
915
916 if ($row) {
917 $row->{'tag'} = $field;
918
919 # TODO: name_sigular, name_plural
920 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
921 $row->{'name'} = $name ? $self->_x($name) : $field;
922
923 # post-sort all values in field
924 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
925 $log->warn("sort at field tag not implemented");
926 }
927
928 push @ds, $row;
929
930 $log->debug("row $field: ",sub { Dumper($row) });
931 }
932
933 }
934
935 return @ds;
936
937 }
938
939 =head2 output
940
941 Create output from in-memory data structure using Template Toolkit template.
942
943 my $text = $webpac->output( template => 'text.tt', data => @ds );
944
945 =cut
946
947 sub output {
948 my $self = shift;
949
950 my $args = {@_};
951
952 my $log = $self->_get_logger();
953
954 $log->logconfess("need template name") if (! $args->{'template'});
955 $log->logconfess("need data array") if (! $args->{'data'});
956
957 my $out;
958
959 $self->{'tt'}->process(
960 $args->{'template'},
961 $args,
962 \$out
963 ) || confess $self->{'tt'}->error();
964
965 return $out;
966 }
967
968 =head2 output_file
969
970 Create output from in-memory data structure using Template Toolkit template
971 to a file.
972
973 $webpac->output_file(
974 file => 'out.txt',
975 template => 'text.tt',
976 data => @ds
977 );
978
979 =cut
980
981 sub output_file {
982 my $self = shift;
983
984 my $args = {@_};
985
986 my $log = $self->_get_logger();
987
988 my $file = $args->{'file'} || $log->logconfess("need file name");
989
990 $log->debug("creating file ",$file);
991
992 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
993 print $fh $self->output(
994 template => $args->{'template'},
995 data => $args->{'data'},
996 ) || $log->logdie("print: $!");
997 close($fh) || $log->logdie("close: $!");
998 }
999
1000 =head2 apply_format
1001
1002 Apply format specified in tag with C<format_name="name"> and
1003 C<format_delimiter=";;">.
1004
1005 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1006
1007 Formats can contain C<lookup{...}> if you need them.
1008
1009 =cut
1010
1011 sub apply_format {
1012 my $self = shift;
1013
1014 my ($name,$delimiter,$data) = @_;
1015
1016 my $log = $self->_get_logger();
1017
1018 if (! $self->{'import_xml'}->{'format'}->{$name}) {
1019 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1020 return $data;
1021 }
1022
1023 $log->warn("no delimiter for format $name") if (! $delimiter);
1024
1025 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1026
1027 my @data = split(/\Q$delimiter\E/, $data);
1028
1029 my $out = sprintf($format, @data);
1030 $log->debug("using format $name [$format] on $data to produce: $out");
1031
1032 if ($out =~ m/$LOOKUP_REGEX/o) {
1033 return $self->lookup($out);
1034 } else {
1035 return $out;
1036 }
1037
1038 }
1039
1040
1041 #
1042 #
1043 #
1044
1045 =head1 INTERNAL METHODS
1046
1047 Here is a quick list of internal methods, mostly useful to turn debugging
1048 on them (see L<LOGGING> below for explanation).
1049
1050 =cut
1051
1052 =head2 _eval
1053
1054 Internal function to eval code without C<strict 'subs'>.
1055
1056 =cut
1057
1058 sub _eval {
1059 my $self = shift;
1060
1061 my $code = shift || return;
1062
1063 my $log = $self->_get_logger();
1064
1065 no strict 'subs';
1066 my $ret = eval $code;
1067 if ($@) {
1068 $log->error("problem with eval code [$code]: $@");
1069 }
1070
1071 $log->debug("eval: ",$code," [",$ret,"]");
1072
1073 return $ret || 0;
1074 }
1075
1076 =head2 _sort_by_order
1077
1078 Sort xml tags data structure accoding to C<order=""> attribute.
1079
1080 =cut
1081
1082 sub _sort_by_order {
1083 my $self = shift;
1084
1085 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1086 $self->{'import_xml'}->{'indexer'}->{$a};
1087 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1088 $self->{'import_xml'}->{'indexer'}->{$b};
1089
1090 return $va <=> $vb;
1091 }
1092
1093 =head2 _get_logger
1094
1095 Get C<Log::Log4perl> object with a twist: domains are defined for each
1096 method
1097
1098 my $log = $webpac->_get_logger();
1099
1100 =cut
1101
1102 sub _get_logger {
1103 my $self = shift;
1104
1105 my $name = (caller(1))[3] || caller;
1106 return get_logger($name);
1107 }
1108
1109 =head2 _x
1110
1111 Convert string from UTF-8 to code page defined in C<import_xml>.
1112
1113 my $text = $webpac->_x('utf8 text');
1114
1115 =cut
1116
1117 sub _x {
1118 my $self = shift;
1119 my $utf8 = shift || return;
1120
1121 return $self->{'utf2cp'}->convert($utf8) ||
1122 $self->_get_logger()->logwarn("can't convert '$utf8'");
1123 }
1124
1125 #
1126 #
1127 #
1128
1129 =head1 LOGGING
1130
1131 Logging in WebPAC is performed by L<Log::Log4perl> with config file
1132 C<log.conf>.
1133
1134 Methods defined above have different levels of logging, so
1135 it's descriptions will be useful to turn (mostry B<debug> logging) on
1136 or off to see why WabPAC isn't perforing as you expect it (it might even
1137 be a bug!).
1138
1139 B<This is different from normal Log4perl behaviour>. To repeat, you can
1140 also use method names, and not only classes (which are just few)
1141 to filter logging.
1142
1143
1144 =head1 MEMORY USAGE
1145
1146 C<low_mem> options is double-edged sword. If enabled, WebPAC
1147 will run on memory constraint machines (which doesn't have enough
1148 physical RAM to create memory structure for whole source database).
1149
1150 If your machine has 512Mb or more of RAM and database is around 10000 records,
1151 memory shouldn't be an issue. If you don't have enough physical RAM, you
1152 might consider using virtual memory (if your operating system is handling it
1153 well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1154 parsed structure of ISIS database (this is what C<low_mem> option does).
1155
1156 Hitting swap at end of reading source database is probably o.k. However,
1157 hitting swap before 90% will dramatically decrease performance and you will
1158 be better off with C<low_mem> and using rest of availble memory for
1159 operating system disk cache (Linux is particuallary good about this).
1160 However, every access to database record will require disk access, so
1161 generation phase will be slower 10-100 times.
1162
1163 Parsed structures are essential - you just have option to trade RAM memory
1164 (which is fast) for disk space (which is slow). Be sure to have planty of
1165 disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1166
1167 However, when WebPAC is running on desktop machines (or laptops :-), it's
1168 highly undesireable for system to start swapping. Using C<low_mem> option can
1169 reduce WecPAC memory usage to around 64Mb for same database with lookup
1170 fields and sorted indexes which stay in RAM. Performance will suffer, but
1171 memory usage will really be minimal. It might be also more confortable to
1172 run WebPAC reniced on those machines.
1173
1174 =cut
1175
1176 1;

  ViewVC Help
Powered by ViewVC 1.1.26