/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 563 - (show annotations)
Sat Oct 30 23:58:36 2004 UTC (16 years, 5 months ago) by dpavlin
File size: 27583 byte(s)
well, it seems it *IS* what I want. Removed that warning :-)

1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10 use Template;
11 use Log::Log4perl qw(get_logger :levels);
12 use Time::HiRes qw(time);
13
14 use Data::Dumper;
15
16 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17 #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18 my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19 my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20
21 =head1 NAME
22
23 WebPAC - base class for WebPAC
24
25 =head1 DESCRIPTION
26
27 This module implements methods used by WebPAC.
28
29 =head1 METHODS
30
31 =head2 new
32
33 Create new instance of WebPAC using configuration specified by C<config_file>.
34
35 my $webpac = new WebPAC(
36 config_file => 'name.conf',
37 code_page => 'ISO-8859-2',
38 low_mem => 1,
39 filter => {
40 'lower' => sub { lc($_[0]) },
41 },
42 );
43
44 Default C<code_page> is C<ISO-8859-2>.
45
46 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
47
48 There is optinal parametar C<filter> which specify different filters which
49 can be applied using C<filter{name}> notation.
50 Same filters can be used in Template Toolkit files.
51
52 This method will also read configuration files
53 C<global.conf> (used by indexer and Web font-end)
54 and configuration file specified by C<config_file>
55 which describes databases to be indexed.
56
57 =cut
58
59 # mapping between data type and tag which specify
60 # format in XML file
61 my %type2tag = (
62 'isis' => 'isis',
63 # 'excel' => 'column',
64 # 'marc' => 'marc',
65 # 'feed' => 'feed'
66 );
67
68 sub new {
69 my $class = shift;
70 my $self = {@_};
71 bless($self, $class);
72
73 $self->{'start_t'} = time();
74
75 my $log_file = $self->{'log'} || "log.conf";
76 Log::Log4perl->init($log_file);
77
78 my $log = $self->_get_logger();
79
80 # fill in default values
81 # output codepage
82 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
83
84 #
85 # read global.conf
86 #
87 $log->debug("read 'global.conf'");
88
89 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
90
91 # read global config parametars
92 foreach my $var (qw(
93 dbi_dbd
94 dbi_dsn
95 dbi_user
96 dbi_passwd
97 show_progress
98 my_unac_filter
99 output_template
100 )) {
101 $self->{'global_config'}->{$var} = $config->val('global', $var);
102 }
103
104 #
105 # read indexer config file
106 #
107
108 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
109
110 # create UTF-8 convertor for import_xml files
111 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
112
113 # create Template toolkit instance
114 $self->{'tt'} = Template->new(
115 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
116 FILTERS => $self->{'filter'},
117 EVAL_PERL => 1,
118 );
119
120 # running with low_mem flag? well, use DBM::Deep then.
121 if ($self->{'low_mem'}) {
122 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
123
124 my $db_file = "data.db";
125
126 if (-e $db_file) {
127 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
128 $log->debug("removed '$db_file' from last run");
129 }
130
131 require DBM::Deep;
132
133 my $db = new DBM::Deep $db_file;
134
135 $log->logdie("DBM::Deep error: $!") unless ($db);
136
137 if ($db->error()) {
138 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
139 } else {
140 $log->debug("using file '$db_file' for DBM::Deep");
141 }
142
143 $self->{'db'} = $db;
144 }
145
146 $log->debug("filters defined: ",Dumper($self->{'filter'}));
147
148 return $self;
149 }
150
151 =head2 open_isis
152
153 Open CDS/ISIS database using OpenIsis module and read all records to memory.
154
155 $webpac->open_isis(
156 filename => '/data/ISIS/ISIS',
157 code_page => '852',
158 limit_mfn => 500,
159 start_mfn => 6000,
160 lookup => [ ... ],
161 );
162
163 By default, ISIS code page is assumed to be C<852>.
164
165 If optional parametar C<start_mfn> is set, this will be first MFN to read
166 from database (so you can skip beginning of your database if you need to).
167
168 If optional parametar C<limit_mfn> is set, it will read just 500 records
169 from database in example above.
170
171 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
172 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
173 value in index.
174
175 lookup => [
176 { 'key' => 'd:v900', 'val' => 'v250^a' },
177 { 'eval' => '"v901^a" eq "Podruèje"',
178 'key' => 'pa:v561^4:v562^4:v461^1',
179 'val' => 'v900' },
180 ]
181
182 Returns number of last record read into memory (size of database, really).
183
184 =cut
185
186 sub open_isis {
187 my $self = shift;
188 my $arg = {@_};
189
190 my $log = $self->_get_logger();
191
192 $log->logcroak("need filename") if (! $arg->{'filename'});
193 my $code_page = $arg->{'code_page'} || '852';
194
195 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
196
197 # store data in object
198 $self->{'isis_filename'} = $arg->{'filename'};
199 $self->{'isis_code_page'} = $code_page;
200
201 use OpenIsis;
202
203 #$self->{'isis_code_page'} = $code_page;
204
205 # create Text::Iconv object
206 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
207
208 $log->info("reading ISIS database '",$arg->{'filename'},"'");
209 $log->debug("isis code page: $code_page");
210
211 my $isis_db = OpenIsis::open($arg->{'filename'});
212
213 my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
214 my $startmfn = 1;
215
216 if (my $s = $self->{'start_mfn'}) {
217 $log->info("skipping to MFN $s");
218 $startmfn = $s;
219 } else {
220 $self->{'start_mfn'} = $startmfn;
221 }
222
223 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
224
225 $log->info("processing ",($maxmfn-$startmfn)." records...");
226
227 # read database
228 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
229
230
231 $log->debug("mfn: $mfn\n");
232
233 my $rec;
234
235 # read record
236 my $row = OpenIsis::read( $isis_db, $mfn );
237 foreach my $k (keys %{$row}) {
238 if ($k ne "mfn") {
239 foreach my $l (@{$row->{$k}}) {
240 $l = $cp->convert($l);
241 # has subfields?
242 my $val;
243 if ($l =~ m/\^/) {
244 foreach my $t (split(/\^/,$l)) {
245 next if (! $t);
246 $val->{substr($t,0,1)} = substr($t,1);
247 }
248 } else {
249 $val = $l;
250 }
251
252 push @{$rec->{$k}}, $val;
253 }
254 } else {
255 push @{$rec->{'000'}}, $mfn;
256 }
257
258 }
259
260 $log->confess("record $mfn empty?") unless ($rec);
261
262 # store
263 if ($self->{'low_mem'}) {
264 $self->{'db'}->put($mfn, $rec);
265 } else {
266 $self->{'data'}->{$mfn} = $rec;
267 }
268
269 # create lookup
270 $self->create_lookup($rec, @{$arg->{'lookup'}});
271
272 $self->progress_bar($mfn,$maxmfn);
273
274 }
275
276 $self->{'current_mfn'} = -1;
277 $self->{'last_pcnt'} = 0;
278
279 $log->debug("max mfn: $maxmfn");
280
281 # store max mfn and return it.
282 return $self->{'max_mfn'} = $maxmfn;
283 }
284
285 =head2 fetch_rec
286
287 Fetch next record from database. It will also display progress bar (once
288 it's implemented, that is).
289
290 my $rec = $webpac->fetch_rec;
291
292 =cut
293
294 sub fetch_rec {
295 my $self = shift;
296
297 my $log = $self->_get_logger();
298
299 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
300
301 if ($self->{'current_mfn'} == -1) {
302 $self->{'current_mfn'} = $self->{'start_mfn'};
303 } else {
304 $self->{'current_mfn'}++;
305 }
306
307 my $mfn = $self->{'current_mfn'};
308
309 if ($mfn > $self->{'max_mfn'}) {
310 $self->{'current_mfn'} = $self->{'max_mfn'};
311 $log->debug("at EOF");
312 return;
313 }
314
315 $self->progress_bar($mfn,$self->{'max_mfn'});
316
317 if ($self->{'low_mem'}) {
318 return $self->{'db'}->get($mfn);
319 } else {
320 return $self->{'data'}->{$mfn};
321 }
322 }
323
324 =head2 mfn
325
326 Returns current record number (MFN).
327
328 print $webpac->mfn;
329
330 =cut
331
332 sub mfn {
333 my $self = shift;
334 return $self->{'current_mfn'};
335 }
336
337 =head2 progress_bar
338
339 Draw progress bar on STDERR.
340
341 $webpac->progress_bar($current, $max);
342
343 =cut
344
345 sub progress_bar {
346 my $self = shift;
347
348 my ($curr,$max) = @_;
349
350 my $log = $self->_get_logger();
351
352 $log->logconfess("no current value!") if (! $curr);
353 $log->logconfess("no maximum value!") if (! $max);
354
355 if ($curr > $max) {
356 $max = $curr;
357 $log->debug("overflow to $curr");
358 }
359
360 $self->{'last_pcnt'} ||= 1;
361
362 my $p = int($curr * 100 / $max) || 1;
363
364 # reset on re-run
365 if ($p < $self->{'last_pcnt'}) {
366 $self->{'last_pcnt'} = $p;
367 $self->{'last_t'} = time();
368 $self->{'last_curr'} = undef;
369 }
370
371 $self->{'last_t'} ||= time();
372
373 if ($p != $self->{'last_pcnt'}) {
374
375 my $last_curr = $self->{'last_curr'} || $curr;
376 my $t = time();
377 my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
378 my $eta = ($max-$curr) / ($rate || 1);
379 printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
380 $self->{'last_pcnt'} = $p;
381 $self->{'last_t'} = time();
382 $self->{'last_curr'} = $curr;
383 }
384 print STDERR "\n" if ($p == 100);
385 }
386
387 =head2 fmt_time
388
389 Format time (in seconds) for display.
390
391 print $webpac->fmt_time(time());
392
393 This method is called by L<progress_bar> to display remaining time.
394
395 =cut
396
397 sub fmt_time {
398 my $self = shift;
399
400 my $t = shift || 0;
401 my $out = "";
402
403 my ($ss,$mm,$hh) = gmtime($t);
404 $out .= "${hh}h" if ($hh);
405 $out .= sprintf("%02d:%02d", $mm,$ss);
406 $out .= " " if ($hh == 0);
407 return $out;
408 }
409
410 =head2 open_import_xml
411
412 Read file from C<import_xml/> directory and parse it.
413
414 $webpac->open_import_xml(type => 'isis');
415
416 =cut
417
418 sub open_import_xml {
419 my $self = shift;
420
421 my $log = $self->_get_logger();
422
423 my $arg = {@_};
424 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
425
426 $self->{'type'} = $arg->{'type'};
427
428 my $type_base = $arg->{'type'};
429 $type_base =~ s/_.*$//g;
430
431 $self->{'tag'} = $type2tag{$type_base};
432
433 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
434
435 my $f = "./import_xml/".$self->{'type'}.".xml";
436 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
437
438 $log->info("reading '$f'");
439
440 $self->{'import_xml_file'} = $f;
441
442 $self->{'import_xml'} = XMLin($f,
443 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
444 );
445
446 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
447
448 }
449
450 =head2 create_lookup
451
452 Create lookup from record using lookup definition.
453
454 $self->create_lookup($rec, @lookups);
455
456 Called internally by C<open_*> methods.
457
458 =cut
459
460 sub create_lookup {
461 my $self = shift;
462
463 my $log = $self->_get_logger();
464
465 my $rec = shift || $log->logconfess("need record to create lookup");
466 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
467
468 foreach my $i (@_) {
469 $log->logconfess("need key") unless defined($i->{'key'});
470 $log->logconfess("need val") unless defined($i->{'val'});
471
472 if (defined($i->{'eval'})) {
473 # eval first, so we can skip fill_in for key and val
474 my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
475 if ($self->_eval($eval)) {
476 my $key = $self->fill_in($rec,$i->{'key'}) || next;
477 my @val = $self->fill_in($rec,$i->{'val'}) || next;
478 $log->debug("stored $key = ",sub { join(" | ",@val) });
479 push @{$self->{'lookup'}->{$key}}, @val;
480 }
481 } else {
482 my $key = $self->fill_in($rec,$i->{'key'}) || next;
483 my @val = $self->fill_in($rec,$i->{'val'}) || next;
484 $log->debug("stored $key = ",sub { join(" | ",@val) });
485 push @{$self->{'lookup'}->{$key}}, @val;
486 }
487 }
488 }
489
490 =head2 get_data
491
492 Returns value from record.
493
494 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
495
496 Arguments are:
497 record reference C<$rec>,
498 field C<$f>,
499 optional subfiled C<$sf>,
500 index for repeatable values C<$i>.
501
502 Optinal variable C<$found> will be incremeted if there
503 is field.
504
505 Returns value or empty string.
506
507 =cut
508
509 sub get_data {
510 my $self = shift;
511
512 my ($rec,$f,$sf,$i,$found) = @_;
513
514 if ($$rec->{$f}) {
515 return '' if (! $$rec->{$f}->[$i]);
516 no strict 'refs';
517 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
518 $$found++ if (defined($$found));
519 return $$rec->{$f}->[$i]->{$sf};
520 } elsif ($$rec->{$f}->[$i]) {
521 $$found++ if (defined($$found));
522 # it still might have subfield, just
523 # not specified, so we'll dump all
524 if ($$rec->{$f}->[$i] =~ /HASH/o) {
525 my $out;
526 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
527 $out .= $$rec->{$f}->[$i]->{$k}." ";
528 }
529 return $out;
530 } else {
531 return $$rec->{$f}->[$i];
532 }
533 }
534 } else {
535 return '';
536 }
537 }
538
539 =head2 fill_in
540
541 Workhourse of all: takes record from in-memory structure of database and
542 strings with placeholders and returns string or array of with substituted
543 values from record.
544
545 my $text = $webpac->fill_in($rec,'v250^a');
546
547 Optional argument is ordinal number for repeatable fields. By default,
548 it's assume to be first repeatable field (fields are perl array, so first
549 element is 0).
550 Following example will read second value from repeatable field.
551
552 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
553
554 This function B<does not> perform parsing of format to inteligenty skip
555 delimiters before fields which aren't used.
556
557 This method will automatically decode UTF-8 string to local code page
558 if needed.
559
560 =cut
561
562 sub fill_in {
563 my $self = shift;
564
565 my $log = $self->_get_logger();
566
567 my $rec = shift || $log->logconfess("need data record");
568 my $format = shift || $log->logconfess("need format to parse");
569 # iteration (for repeatable fields)
570 my $i = shift || 0;
571
572 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
573
574 # FIXME remove for speedup?
575 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
576
577 if (utf8::is_utf8($format)) {
578 $format = $self->_x($format);
579 }
580
581 my $found = 0;
582
583 my $eval_code;
584 # remove eval{...} from beginning
585 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
586
587 my $filter_name;
588 # remove filter{...} from beginning
589 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
590
591 # do actual replacement of placeholders
592 # repeatable fields
593 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
594 # non-repeatable fields
595 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
596
597 if ($found) {
598 $log->debug("format: $format");
599 if ($eval_code) {
600 my $eval = $self->fill_in($rec,$eval_code,$i);
601 return if (! $self->_eval($eval));
602 }
603 if ($filter_name && $self->{'filter'}->{$filter_name}) {
604 $log->debug("filter '$filter_name' for $format");
605 $format = $self->{'filter'}->{$filter_name}->($format);
606 return unless(defined($format));
607 $log->debug("filter result: $format");
608 }
609 # do we have lookups?
610 if ($format =~ /$LOOKUP_REGEX/o) {
611 $log->debug("format '$format' has lookup");
612 return $self->lookup($format);
613 } else {
614 return $format;
615 }
616 } else {
617 return;
618 }
619 }
620
621 =head2 lookup
622
623 Perform lookups on format supplied to it.
624
625 my $text = $self->lookup('[v900]');
626
627 Lookups can be nested (like C<[d:[a:[v900]]]>).
628
629 =cut
630
631 sub lookup {
632 my $self = shift;
633
634 my $log = $self->_get_logger();
635
636 my $tmp = shift || $log->logconfess("need format");
637
638 if ($tmp =~ /$LOOKUP_REGEX/o) {
639 my @in = ( $tmp );
640
641 $log->debug("lookup for: ",$tmp);
642
643 my @out;
644 while (my $f = shift @in) {
645 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
646 my $k = $1;
647 if ($self->{'lookup'}->{$k}) {
648 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
649 my $tmp2 = $f;
650 $tmp2 =~ s/lookup{$k}/$nv/g;
651 push @in, $tmp2;
652 }
653 } else {
654 undef $f;
655 }
656 } elsif ($f) {
657 push @out, $f;
658 }
659 }
660 $log->logconfess("return is array and it's not expected!") unless wantarray;
661 return @out;
662 } else {
663 return $tmp;
664 }
665 }
666
667 =head2 parse
668
669 Perform smart parsing of string, skipping delimiters for fields which aren't
670 defined. It can also eval code in format starting with C<eval{...}> and
671 return output or nothing depending on eval code.
672
673 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
674
675 =cut
676
677 sub parse {
678 my $self = shift;
679
680 my ($rec, $format_utf8, $i) = @_;
681
682 return if (! $format_utf8);
683
684 my $log = $self->_get_logger();
685
686 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
687 $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
688
689 $i = 0 if (! $i);
690
691 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
692
693 my @out;
694
695 $log->debug("format: $format");
696
697 my $eval_code;
698 # remove eval{...} from beginning
699 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
700
701 my $filter_name;
702 # remove filter{...} from beginning
703 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
704
705 my $prefix;
706 my $all_found=0;
707
708 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
709
710 my $del = $1 || '';
711 $prefix ||= $del if ($all_found == 0);
712
713 # repeatable index
714 my $r = $i;
715 $r = 0 if (lc("$2") eq 's');
716
717 my $found = 0;
718 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
719
720 if ($found) {
721 push @out, $del;
722 push @out, $tmp;
723 $all_found += $found;
724 }
725 }
726
727 return if (! $all_found);
728
729 my $out = join('',@out);
730
731 if ($out) {
732 # add rest of format (suffix)
733 $out .= $format;
734
735 # add prefix if not there
736 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
737
738 $log->debug("result: $out");
739 }
740
741 if ($eval_code) {
742 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
743 $log->debug("about to eval{$eval} format: $out");
744 return if (! $self->_eval($eval));
745 }
746
747 if ($filter_name && $self->{'filter'}->{$filter_name}) {
748 $log->debug("about to filter{$filter_name} format: $out");
749 $out = $self->{'filter'}->{$filter_name}->($out);
750 return unless(defined($out));
751 $log->debug("filter result: $out");
752 }
753
754 return $out;
755 }
756
757 =head2 parse_to_arr
758
759 Similar to C<parse>, but returns array of all repeatable fields
760
761 my @arr = $webpac->parse_to_arr($rec,'v250^a');
762
763 =cut
764
765 sub parse_to_arr {
766 my $self = shift;
767
768 my ($rec, $format_utf8) = @_;
769
770 my $log = $self->_get_logger();
771
772 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
773 return if (! $format_utf8);
774
775 my $i = 0;
776 my @arr;
777
778 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
779 push @arr, $v;
780 }
781
782 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
783
784 return @arr;
785 }
786
787 =head2 fill_in_to_arr
788
789 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
790 for fields which have lookups, so they shouldn't be parsed but rather
791 C<fill_id>ed.
792
793 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
794
795 =cut
796
797 sub fill_in_to_arr {
798 my $self = shift;
799
800 my ($rec, $format_utf8) = @_;
801
802 my $log = $self->_get_logger();
803
804 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
805 return if (! $format_utf8);
806
807 my $i = 0;
808 my @arr;
809
810 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
811 push @arr, @v;
812 }
813
814 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
815
816 return @arr;
817 }
818
819 =head2 sort_arr
820
821 Sort array ignoring case and html in data
822
823 my @sorted = $webpac->sort_arr(@unsorted);
824
825 =cut
826
827 sub sort_arr {
828 my $self = shift;
829
830 my $log = $self->_get_logger();
831
832 # FIXME add Schwartzian Transformation?
833
834 my @sorted = sort {
835 $a =~ s#<[^>]+/*>##;
836 $b =~ s#<[^>]+/*>##;
837 lc($b) cmp lc($a)
838 } @_;
839 $log->debug("sorted values: ",sub { join(", ",@sorted) });
840
841 return @sorted;
842 }
843
844
845 =head2 data_structure
846
847 Create in-memory data structure which represents layout from C<import_xml>.
848 It is used later to produce output.
849
850 my @ds = $webpac->data_structure($rec);
851
852 This method will also set C<$webpac->{'currnet_filename'}> if there is
853 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
854 <headline> tag.
855
856 =cut
857
858 sub data_structure {
859 my $self = shift;
860
861 my $log = $self->_get_logger();
862
863 my $rec = shift;
864 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
865
866 undef $self->{'currnet_filename'};
867 undef $self->{'headline'};
868
869 my @sorted_tags;
870 if ($self->{tags_by_order}) {
871 @sorted_tags = @{$self->{tags_by_order}};
872 } else {
873 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
874 $self->{tags_by_order} = \@sorted_tags;
875 }
876
877 my @ds;
878
879 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
880
881 foreach my $field (@sorted_tags) {
882
883 my $row;
884
885 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
886
887 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
888 my $format = $tag->{'value'} || $tag->{'content'};
889
890 $log->debug("format: $format");
891
892 my @v;
893 if ($format =~ /$LOOKUP_REGEX/o) {
894 @v = $self->fill_in_to_arr($rec,$format);
895 } else {
896 @v = $self->parse_to_arr($rec,$format);
897 }
898 next if (! @v);
899
900 if ($tag->{'sort'}) {
901 @v = $self->sort_arr(@v);
902 }
903
904 # use format?
905 if ($tag->{'format_name'}) {
906 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
907 }
908
909 if ($field eq 'filename') {
910 $self->{'current_filename'} = join('',@v);
911 $log->debug("filename: ",$self->{'current_filename'});
912 } elsif ($field eq 'headline') {
913 $self->{'headline'} .= join('',@v);
914 $log->debug("headline: ",$self->{'headline'});
915 next; # don't return headline in data_structure!
916 }
917
918 # delimiter will join repeatable fields
919 if ($tag->{'delimiter'}) {
920 @v = ( join($tag->{'delimiter'}, @v) );
921 }
922
923 # default types
924 my @types = qw(display swish);
925 # override by type attribute
926 @types = ( $tag->{'type'} ) if ($tag->{'type'});
927
928 foreach my $type (@types) {
929 # append to previous line?
930 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
931 if ($tag->{'append'}) {
932
933 # I will delimit appended part with
934 # delimiter (or ,)
935 my $d = $tag->{'delimiter'};
936 # default delimiter
937 $d ||= " ";
938
939 my $last = pop @{$row->{$type}};
940 $d = "" if (! $last);
941 $last .= $d . join($d, @v);
942 push @{$row->{$type}}, $last;
943
944 } else {
945 push @{$row->{$type}}, @v;
946 }
947 }
948
949
950 }
951
952 if ($row) {
953 $row->{'tag'} = $field;
954
955 # TODO: name_sigular, name_plural
956 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
957 $row->{'name'} = $name ? $self->_x($name) : $field;
958
959 # post-sort all values in field
960 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
961 $log->warn("sort at field tag not implemented");
962 }
963
964 push @ds, $row;
965
966 $log->debug("row $field: ",sub { Dumper($row) });
967 }
968
969 }
970
971 return @ds;
972
973 }
974
975 =head2 output
976
977 Create output from in-memory data structure using Template Toolkit template.
978
979 my $text = $webpac->output( template => 'text.tt', data => @ds );
980
981 =cut
982
983 sub output {
984 my $self = shift;
985
986 my $args = {@_};
987
988 my $log = $self->_get_logger();
989
990 $log->logconfess("need template name") if (! $args->{'template'});
991 $log->logconfess("need data array") if (! $args->{'data'});
992
993 my $out;
994
995 $self->{'tt'}->process(
996 $args->{'template'},
997 $args,
998 \$out
999 ) || confess $self->{'tt'}->error();
1000
1001 return $out;
1002 }
1003
1004 =head2 output_file
1005
1006 Create output from in-memory data structure using Template Toolkit template
1007 to a file.
1008
1009 $webpac->output_file(
1010 file => 'out.txt',
1011 template => 'text.tt',
1012 data => @ds
1013 );
1014
1015 =cut
1016
1017 sub output_file {
1018 my $self = shift;
1019
1020 my $args = {@_};
1021
1022 my $log = $self->_get_logger();
1023
1024 my $file = $args->{'file'} || $log->logconfess("need file name");
1025
1026 $log->debug("creating file ",$file);
1027
1028 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1029 print $fh $self->output(
1030 template => $args->{'template'},
1031 data => $args->{'data'},
1032 ) || $log->logdie("print: $!");
1033 close($fh) || $log->logdie("close: $!");
1034 }
1035
1036 =head2 apply_format
1037
1038 Apply format specified in tag with C<format_name="name"> and
1039 C<format_delimiter=";;">.
1040
1041 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1042
1043 Formats can contain C<lookup{...}> if you need them.
1044
1045 =cut
1046
1047 sub apply_format {
1048 my $self = shift;
1049
1050 my ($name,$delimiter,$data) = @_;
1051
1052 my $log = $self->_get_logger();
1053
1054 if (! $self->{'import_xml'}->{'format'}->{$name}) {
1055 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1056 return $data;
1057 }
1058
1059 $log->warn("no delimiter for format $name") if (! $delimiter);
1060
1061 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1062
1063 my @data = split(/\Q$delimiter\E/, $data);
1064
1065 my $out = sprintf($format, @data);
1066 $log->debug("using format $name [$format] on $data to produce: $out");
1067
1068 if ($out =~ m/$LOOKUP_REGEX/o) {
1069 return $self->lookup($out);
1070 } else {
1071 return $out;
1072 }
1073
1074 }
1075
1076
1077 #
1078 #
1079 #
1080
1081 =head1 INTERNAL METHODS
1082
1083 Here is a quick list of internal methods, mostly useful to turn debugging
1084 on them (see L<LOGGING> below for explanation).
1085
1086 =cut
1087
1088 =head2 _eval
1089
1090 Internal function to eval code without C<strict 'subs'>.
1091
1092 =cut
1093
1094 sub _eval {
1095 my $self = shift;
1096
1097 my $code = shift || return;
1098
1099 my $log = $self->_get_logger();
1100
1101 no strict 'subs';
1102 my $ret = eval $code;
1103 if ($@) {
1104 $log->error("problem with eval code [$code]: $@");
1105 }
1106
1107 $log->debug("eval: ",$code," [",$ret,"]");
1108
1109 return $ret || undef;
1110 }
1111
1112 =head2 _sort_by_order
1113
1114 Sort xml tags data structure accoding to C<order=""> attribute.
1115
1116 =cut
1117
1118 sub _sort_by_order {
1119 my $self = shift;
1120
1121 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1122 $self->{'import_xml'}->{'indexer'}->{$a};
1123 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1124 $self->{'import_xml'}->{'indexer'}->{$b};
1125
1126 return $va <=> $vb;
1127 }
1128
1129 =head2 _get_logger
1130
1131 Get C<Log::Log4perl> object with a twist: domains are defined for each
1132 method
1133
1134 my $log = $webpac->_get_logger();
1135
1136 =cut
1137
1138 sub _get_logger {
1139 my $self = shift;
1140
1141 my $name = (caller(1))[3] || caller;
1142 return get_logger($name);
1143 }
1144
1145 =head2 _x
1146
1147 Convert string from UTF-8 to code page defined in C<import_xml>.
1148
1149 my $text = $webpac->_x('utf8 text');
1150
1151 =cut
1152
1153 sub _x {
1154 my $self = shift;
1155 my $utf8 = shift || return;
1156
1157 return $self->{'utf2cp'}->convert($utf8) ||
1158 $self->_get_logger()->logwarn("can't convert '$utf8'");
1159 }
1160
1161 #
1162 #
1163 #
1164
1165 =head1 LOGGING
1166
1167 Logging in WebPAC is performed by L<Log::Log4perl> with config file
1168 C<log.conf>.
1169
1170 Methods defined above have different levels of logging, so
1171 it's descriptions will be useful to turn (mostry B<debug> logging) on
1172 or off to see why WabPAC isn't perforing as you expect it (it might even
1173 be a bug!).
1174
1175 B<This is different from normal Log4perl behaviour>. To repeat, you can
1176 also use method names, and not only classes (which are just few)
1177 to filter logging.
1178
1179
1180 =head1 MEMORY USAGE
1181
1182 C<low_mem> options is double-edged sword. If enabled, WebPAC
1183 will run on memory constraint machines (which doesn't have enough
1184 physical RAM to create memory structure for whole source database).
1185
1186 If your machine has 512Mb or more of RAM and database is around 10000 records,
1187 memory shouldn't be an issue. If you don't have enough physical RAM, you
1188 might consider using virtual memory (if your operating system is handling it
1189 well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1190 parsed structure of ISIS database (this is what C<low_mem> option does).
1191
1192 Hitting swap at end of reading source database is probably o.k. However,
1193 hitting swap before 90% will dramatically decrease performance and you will
1194 be better off with C<low_mem> and using rest of availble memory for
1195 operating system disk cache (Linux is particuallary good about this).
1196 However, every access to database record will require disk access, so
1197 generation phase will be slower 10-100 times.
1198
1199 Parsed structures are essential - you just have option to trade RAM memory
1200 (which is fast) for disk space (which is slow). Be sure to have planty of
1201 disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1202
1203 However, when WebPAC is running on desktop machines (or laptops :-), it's
1204 highly undesireable for system to start swapping. Using C<low_mem> option can
1205 reduce WecPAC memory usage to around 64Mb for same database with lookup
1206 fields and sorted indexes which stay in RAM. Performance will suffer, but
1207 memory usage will really be minimal. It might be also more confortable to
1208 run WebPAC reniced on those machines.
1209
1210 =cut
1211
1212 1;

  ViewVC Help
Powered by ViewVC 1.1.26