/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 560 - (show annotations)
Sat Oct 30 23:04:37 2004 UTC (16 years, 6 months ago) by dpavlin
File size: 27648 byte(s)
removed upper case letters from encoding of index nodes (to support Windows
which is case-insensitive), added support for filters into TT templates,
changed crovoc symbol to #C# in data

1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10 use Template;
11 use Log::Log4perl qw(get_logger :levels);
12 use Time::HiRes qw(time);
13
14 use Data::Dumper;
15
16 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17 #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18 my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19 my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20
21 =head1 NAME
22
23 WebPAC - base class for WebPAC
24
25 =head1 DESCRIPTION
26
27 This module implements methods used by WebPAC.
28
29 =head1 METHODS
30
31 =head2 new
32
33 Create new instance of WebPAC using configuration specified by C<config_file>.
34
35 my $webpac = new WebPAC(
36 config_file => 'name.conf',
37 code_page => 'ISO-8859-2',
38 low_mem => 1,
39 filter => {
40 'lower' => sub { lc($_[0]) },
41 },
42 );
43
44 Default C<code_page> is C<ISO-8859-2>.
45
46 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
47
48 There is optinal parametar C<filter> which specify different filters which
49 can be applied using C<filter{name}> notation.
50 Same filters can be used in Template Toolkit files.
51
52 This method will also read configuration files
53 C<global.conf> (used by indexer and Web font-end)
54 and configuration file specified by C<config_file>
55 which describes databases to be indexed.
56
57 =cut
58
59 # mapping between data type and tag which specify
60 # format in XML file
61 my %type2tag = (
62 'isis' => 'isis',
63 # 'excel' => 'column',
64 # 'marc' => 'marc',
65 # 'feed' => 'feed'
66 );
67
68 sub new {
69 my $class = shift;
70 my $self = {@_};
71 bless($self, $class);
72
73 $self->{'start_t'} = time();
74
75 my $log_file = $self->{'log'} || "log.conf";
76 Log::Log4perl->init($log_file);
77
78 my $log = $self->_get_logger();
79
80 # fill in default values
81 # output codepage
82 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
83
84 #
85 # read global.conf
86 #
87 $log->debug("read 'global.conf'");
88
89 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
90
91 # read global config parametars
92 foreach my $var (qw(
93 dbi_dbd
94 dbi_dsn
95 dbi_user
96 dbi_passwd
97 show_progress
98 my_unac_filter
99 output_template
100 )) {
101 $self->{'global_config'}->{$var} = $config->val('global', $var);
102 }
103
104 #
105 # read indexer config file
106 #
107
108 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
109
110 # create UTF-8 convertor for import_xml files
111 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
112
113 # create Template toolkit instance
114 $self->{'tt'} = Template->new(
115 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
116 FILTERS => $self->{'filter'},
117 EVAL_PERL => 1,
118 );
119
120 # running with low_mem flag? well, use DBM::Deep then.
121 if ($self->{'low_mem'}) {
122 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
123
124 my $db_file = "data.db";
125
126 if (-e $db_file) {
127 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
128 $log->debug("removed '$db_file' from last run");
129 }
130
131 require DBM::Deep;
132
133 my $db = new DBM::Deep $db_file;
134
135 $log->logdie("DBM::Deep error: $!") unless ($db);
136
137 if ($db->error()) {
138 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
139 } else {
140 $log->debug("using file '$db_file' for DBM::Deep");
141 }
142
143 $self->{'db'} = $db;
144 }
145
146 $log->debug("filters defined: ",Dumper($self->{'filter'}));
147
148 return $self;
149 }
150
151 =head2 open_isis
152
153 Open CDS/ISIS database using OpenIsis module and read all records to memory.
154
155 $webpac->open_isis(
156 filename => '/data/ISIS/ISIS',
157 code_page => '852',
158 limit_mfn => 500,
159 start_mfn => 6000,
160 lookup => [ ... ],
161 );
162
163 By default, ISIS code page is assumed to be C<852>.
164
165 If optional parametar C<start_mfn> is set, this will be first MFN to read
166 from database (so you can skip beginning of your database if you need to).
167
168 If optional parametar C<limit_mfn> is set, it will read just 500 records
169 from database in example above.
170
171 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
172 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
173 value in index.
174
175 lookup => [
176 { 'key' => 'd:v900', 'val' => 'v250^a' },
177 { 'eval' => '"v901^a" eq "Podruèje"',
178 'key' => 'pa:v561^4:v562^4:v461^1',
179 'val' => 'v900' },
180 ]
181
182 Returns number of last record read into memory (size of database, really).
183
184 =cut
185
186 sub open_isis {
187 my $self = shift;
188 my $arg = {@_};
189
190 my $log = $self->_get_logger();
191
192 $log->logcroak("need filename") if (! $arg->{'filename'});
193 my $code_page = $arg->{'code_page'} || '852';
194
195 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
196
197 # store data in object
198 $self->{'isis_filename'} = $arg->{'filename'};
199 $self->{'isis_code_page'} = $code_page;
200
201 use OpenIsis;
202
203 #$self->{'isis_code_page'} = $code_page;
204
205 # create Text::Iconv object
206 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
207
208 $log->info("reading ISIS database '",$arg->{'filename'},"'");
209 $log->debug("isis code page: $code_page");
210
211 my $isis_db = OpenIsis::open($arg->{'filename'});
212
213 my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
214 my $startmfn = 1;
215
216 if (my $s = $self->{'start_mfn'}) {
217 $log->info("skipping to MFN $s");
218 $startmfn = $s;
219 } else {
220 $self->{'start_mfn'} = $startmfn;
221 }
222
223 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
224
225 $log->info("processing ",($maxmfn-$startmfn)." records...");
226
227 # read database
228 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
229
230
231 $log->debug("mfn: $mfn\n");
232
233 my $rec;
234
235 # read record
236 my $row = OpenIsis::read( $isis_db, $mfn );
237 foreach my $k (keys %{$row}) {
238 if ($k ne "mfn") {
239 foreach my $l (@{$row->{$k}}) {
240 $l = $cp->convert($l);
241 # has subfields?
242 my $val;
243 if ($l =~ m/\^/) {
244 foreach my $t (split(/\^/,$l)) {
245 next if (! $t);
246 $val->{substr($t,0,1)} = substr($t,1);
247 }
248 } else {
249 $val = $l;
250 }
251
252 push @{$rec->{$k}}, $val;
253 }
254 } else {
255 push @{$rec->{'000'}}, $mfn;
256 }
257
258 }
259
260 $log->confess("record $mfn empty?") unless ($rec);
261
262 # store
263 if ($self->{'low_mem'}) {
264 $self->{'db'}->put($mfn, $rec);
265 } else {
266 $self->{'data'}->{$mfn} = $rec;
267 }
268
269 # create lookup
270 $self->create_lookup($rec, @{$arg->{'lookup'}});
271
272 $self->progress_bar($mfn,$maxmfn);
273
274 }
275
276 $self->{'current_mfn'} = -1;
277 $self->{'last_pcnt'} = 0;
278
279 $log->debug("max mfn: $maxmfn");
280
281 # store max mfn and return it.
282 return $self->{'max_mfn'} = $maxmfn;
283 }
284
285 =head2 fetch_rec
286
287 Fetch next record from database. It will also display progress bar (once
288 it's implemented, that is).
289
290 my $rec = $webpac->fetch_rec;
291
292 =cut
293
294 sub fetch_rec {
295 my $self = shift;
296
297 my $log = $self->_get_logger();
298
299 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
300
301 if ($self->{'current_mfn'} == -1) {
302 $self->{'current_mfn'} = $self->{'start_mfn'};
303 } else {
304 $self->{'current_mfn'}++;
305 }
306
307 my $mfn = $self->{'current_mfn'};
308
309 if ($mfn > $self->{'max_mfn'}) {
310 $self->{'current_mfn'} = $self->{'max_mfn'};
311 $log->debug("at EOF");
312 return;
313 }
314
315 $self->progress_bar($mfn,$self->{'max_mfn'});
316
317 if ($self->{'low_mem'}) {
318 return $self->{'db'}->get($mfn);
319 } else {
320 return $self->{'data'}->{$mfn};
321 }
322 }
323
324 =head2 mfn
325
326 Returns current record number (MFN).
327
328 print $webpac->mfn;
329
330 =cut
331
332 sub mfn {
333 my $self = shift;
334 return $self->{'current_mfn'};
335 }
336
337 =head2 progress_bar
338
339 Draw progress bar on STDERR.
340
341 $webpac->progress_bar($current, $max);
342
343 =cut
344
345 sub progress_bar {
346 my $self = shift;
347
348 my ($curr,$max) = @_;
349
350 my $log = $self->_get_logger();
351
352 $log->logconfess("no current value!") if (! $curr);
353 $log->logconfess("no maximum value!") if (! $max);
354
355 if ($curr > $max) {
356 $max = $curr;
357 $log->debug("overflow to $curr");
358 }
359
360 $self->{'last_pcnt'} ||= 1;
361
362 my $p = int($curr * 100 / $max) || 1;
363
364 # reset on re-run
365 if ($p < $self->{'last_pcnt'}) {
366 $self->{'last_pcnt'} = $p;
367 $self->{'last_t'} = time();
368 $self->{'last_curr'} = undef;
369 }
370
371 $self->{'last_t'} ||= time();
372
373 if ($p != $self->{'last_pcnt'}) {
374
375 my $last_curr = $self->{'last_curr'} || $curr;
376 my $t = time();
377 my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
378 my $eta = ($max-$curr) / ($rate || 1);
379 printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
380 $self->{'last_pcnt'} = $p;
381 $self->{'last_t'} = time();
382 $self->{'last_curr'} = $curr;
383 }
384 print STDERR "\n" if ($p == 100);
385 }
386
387 =head2 fmt_time
388
389 Format time (in seconds) for display.
390
391 print $webpac->fmt_time(time());
392
393 This method is called by L<progress_bar> to display remaining time.
394
395 =cut
396
397 sub fmt_time {
398 my $self = shift;
399
400 my $t = shift || 0;
401 my $out = "";
402
403 my ($ss,$mm,$hh) = gmtime($t);
404 $out .= "${hh}h" if ($hh);
405 $out .= sprintf("%02d:%02d", $mm,$ss);
406 $out .= " " if ($hh == 0);
407 return $out;
408 }
409
410 =head2 open_import_xml
411
412 Read file from C<import_xml/> directory and parse it.
413
414 $webpac->open_import_xml(type => 'isis');
415
416 =cut
417
418 sub open_import_xml {
419 my $self = shift;
420
421 my $log = $self->_get_logger();
422
423 my $arg = {@_};
424 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
425
426 $self->{'type'} = $arg->{'type'};
427
428 my $type_base = $arg->{'type'};
429 $type_base =~ s/_.*$//g;
430
431 $self->{'tag'} = $type2tag{$type_base};
432
433 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
434
435 my $f = "./import_xml/".$self->{'type'}.".xml";
436 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
437
438 $log->info("reading '$f'");
439
440 $self->{'import_xml_file'} = $f;
441
442 $self->{'import_xml'} = XMLin($f,
443 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
444 );
445
446 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
447
448 }
449
450 =head2 create_lookup
451
452 Create lookup from record using lookup definition.
453
454 $self->create_lookup($rec, @lookups);
455
456 Called internally by C<open_*> methods.
457
458 =cut
459
460 sub create_lookup {
461 my $self = shift;
462
463 my $log = $self->_get_logger();
464
465 my $rec = shift || $log->logconfess("need record to create lookup");
466 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
467
468 foreach my $i (@_) {
469 $log->logconfess("need key") unless defined($i->{'key'});
470 $log->logconfess("need val") unless defined($i->{'val'});
471
472 if (defined($i->{'eval'})) {
473 # eval first, so we can skip fill_in for key and val
474 my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
475 if ($self->_eval($eval)) {
476 my $key = $self->fill_in($rec,$i->{'key'}) || next;
477 my @val = $self->fill_in($rec,$i->{'val'}) || next;
478 $log->debug("stored $key = ",sub { join(" | ",@val) });
479 push @{$self->{'lookup'}->{$key}}, @val;
480 }
481 } else {
482 my $key = $self->fill_in($rec,$i->{'key'}) || next;
483 my @val = $self->fill_in($rec,$i->{'val'}) || next;
484 $log->debug("stored $key = ",sub { join(" | ",@val) });
485 push @{$self->{'lookup'}->{$key}}, @val;
486 }
487 }
488 }
489
490 =head2 get_data
491
492 Returns value from record.
493
494 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
495
496 Arguments are:
497 record reference C<$rec>,
498 field C<$f>,
499 optional subfiled C<$sf>,
500 index for repeatable values C<$i>.
501
502 Optinal variable C<$found> will be incremeted if there
503 is field.
504
505 Returns value or empty string.
506
507 =cut
508
509 sub get_data {
510 my $self = shift;
511
512 my ($rec,$f,$sf,$i,$found) = @_;
513
514 if ($$rec->{$f}) {
515 return '' if (! $$rec->{$f}->[$i]);
516 no strict 'refs';
517 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
518 $$found++ if (defined($$found));
519 return $$rec->{$f}->[$i]->{$sf};
520 } elsif ($$rec->{$f}->[$i]) {
521 $$found++ if (defined($$found));
522 # it still might have subfield, just
523 # not specified, so we'll dump all
524 if ($$rec->{$f}->[$i] =~ /HASH/o) {
525 my $out;
526 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
527 $out .= $$rec->{$f}->[$i]->{$k}." ";
528 }
529 return $out;
530 } else {
531 return $$rec->{$f}->[$i];
532 }
533 }
534 } else {
535 return '';
536 }
537 }
538
539 =head2 fill_in
540
541 Workhourse of all: takes record from in-memory structure of database and
542 strings with placeholders and returns string or array of with substituted
543 values from record.
544
545 my $text = $webpac->fill_in($rec,'v250^a');
546
547 Optional argument is ordinal number for repeatable fields. By default,
548 it's assume to be first repeatable field (fields are perl array, so first
549 element is 0).
550 Following example will read second value from repeatable field.
551
552 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
553
554 This function B<does not> perform parsing of format to inteligenty skip
555 delimiters before fields which aren't used.
556
557 This method will automatically decode UTF-8 string to local code page
558 if needed.
559
560 =cut
561
562 sub fill_in {
563 my $self = shift;
564
565 my $log = $self->_get_logger();
566
567 my $rec = shift || $log->logconfess("need data record");
568 my $format = shift || $log->logconfess("need format to parse");
569 # iteration (for repeatable fields)
570 my $i = shift || 0;
571
572 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
573
574 # FIXME remove for speedup?
575 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
576
577 if (utf8::is_utf8($format)) {
578 $format = $self->_x($format);
579 }
580
581 my $found = 0;
582
583 my $eval_code;
584 # remove eval{...} from beginning
585 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
586
587 my $filter_name;
588 # remove filter{...} from beginning
589 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
590
591 # do actual replacement of placeholders
592 # repeatable fields
593 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
594 # non-repeatable fields
595 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
596
597 if ($found) {
598 $log->debug("format: $format");
599 if ($eval_code) {
600 my $eval = $self->fill_in($rec,$eval_code,$i);
601 return if (! $self->_eval($eval));
602 }
603 if ($filter_name && $self->{'filter'}->{$filter_name}) {
604 $log->debug("filter '$filter_name' for $format");
605 $format = $self->{'filter'}->{$filter_name}->($format);
606 return unless(defined($format));
607 $log->debug("filter result: $format");
608 }
609 # do we have lookups?
610 if ($format =~ /$LOOKUP_REGEX/o) {
611 $log->debug("format '$format' has lookup");
612 return $self->lookup($format);
613 } else {
614 return $format;
615 }
616 } else {
617 return;
618 }
619 }
620
621 =head2 lookup
622
623 Perform lookups on format supplied to it.
624
625 my $text = $self->lookup('[v900]');
626
627 Lookups can be nested (like C<[d:[a:[v900]]]>).
628
629 =cut
630
631 sub lookup {
632 my $self = shift;
633
634 my $log = $self->_get_logger();
635
636 my $tmp = shift || $log->logconfess("need format");
637
638 if ($tmp =~ /$LOOKUP_REGEX/o) {
639 my @in = ( $tmp );
640
641 $log->debug("lookup for: ",$tmp);
642
643 my @out;
644 while (my $f = shift @in) {
645 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
646 my $k = $1;
647 if ($self->{'lookup'}->{$k}) {
648 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
649 my $tmp2 = $f;
650 $tmp2 =~ s/lookup{$k}/$nv/g;
651 push @in, $tmp2;
652 }
653 } else {
654 undef $f;
655 }
656 } elsif ($f) {
657 push @out, $f;
658 }
659 }
660 $log->logconfess("return is array and it's not expected!") unless wantarray;
661 return @out;
662 } else {
663 return $tmp;
664 }
665 }
666
667 =head2 parse
668
669 Perform smart parsing of string, skipping delimiters for fields which aren't
670 defined. It can also eval code in format starting with C<eval{...}> and
671 return output or nothing depending on eval code.
672
673 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
674
675 =cut
676
677 sub parse {
678 my $self = shift;
679
680 my ($rec, $format_utf8, $i) = @_;
681
682 return if (! $format_utf8);
683
684 my $log = $self->_get_logger();
685
686 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
687 $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
688
689 $i = 0 if (! $i);
690
691 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
692
693 my @out;
694
695 $log->debug("format: $format");
696
697 my $eval_code;
698 # remove eval{...} from beginning
699 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
700
701 my $filter_name;
702 # remove filter{...} from beginning
703 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
704
705 my $prefix;
706 my $all_found=0;
707
708 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
709
710 my $del = $1 || '';
711 $prefix ||= $del if ($all_found == 0);
712
713 # repeatable index
714 my $r = $i;
715 $r = 0 if (lc("$2") eq 's');
716
717 my $found = 0;
718 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
719
720 if ($found) {
721 push @out, $del;
722 push @out, $tmp;
723 $all_found += $found;
724 }
725 }
726
727 return if (! $all_found);
728
729 my $out = join('',@out);
730
731 if ($out) {
732 # add rest of format (suffix)
733 $out .= $format;
734
735 # add prefix if not there
736 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
737
738 $log->debug("result: $out");
739 }
740
741 if ($eval_code) {
742 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
743 $log->debug("about to eval{$eval} format: $out");
744 return if (! $self->_eval($eval));
745 }
746
747 if ($filter_name && $self->{'filter'}->{$filter_name}) {
748 $log->debug("about to filter{$filter_name} format: $out");
749 $out = $self->{'filter'}->{$filter_name}->($out);
750 return unless(defined($out));
751 $log->debug("filter result: $out");
752 }
753
754 return $out;
755 }
756
757 =head2 parse_to_arr
758
759 Similar to C<parse>, but returns array of all repeatable fields
760
761 my @arr = $webpac->parse_to_arr($rec,'v250^a');
762
763 =cut
764
765 sub parse_to_arr {
766 my $self = shift;
767
768 my ($rec, $format_utf8) = @_;
769
770 my $log = $self->_get_logger();
771
772 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
773 return if (! $format_utf8);
774
775 my $i = 0;
776 my @arr;
777
778 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
779 push @arr, $v;
780 }
781
782 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
783
784 return @arr;
785 }
786
787 =head2 fill_in_to_arr
788
789 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
790 for fields which have lookups, so they shouldn't be parsed but rather
791 C<fill_id>ed.
792
793 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
794
795 =cut
796
797 sub fill_in_to_arr {
798 my $self = shift;
799
800 my ($rec, $format_utf8) = @_;
801
802 my $log = $self->_get_logger();
803
804 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
805 return if (! $format_utf8);
806
807 my $i = 0;
808 my @arr;
809
810 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
811 push @arr, @v;
812 }
813
814 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
815
816 return @arr;
817 }
818
819 =head2 sort_arr
820
821 Sort array ignoring case and html in data
822
823 my @sorted = $webpac->sort_arr(@unsorted);
824
825 =cut
826
827 sub sort_arr {
828 my $self = shift;
829
830 my $log = $self->_get_logger();
831
832 # FIXME add Schwartzian Transformation?
833
834 my @sorted = sort {
835 $a =~ s#<[^>]+/*>##;
836 $b =~ s#<[^>]+/*>##;
837 lc($b) cmp lc($a)
838 } @_;
839 $log->debug("sorted values: ",sub { join(", ",@sorted) });
840
841 return @sorted;
842 }
843
844
845 =head2 data_structure
846
847 Create in-memory data structure which represents layout from C<import_xml>.
848 It is used later to produce output.
849
850 my @ds = $webpac->data_structure($rec);
851
852 This method will also set C<$webpac->{'currnet_filename'}> if there is
853 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
854 <headline> tag.
855
856 =cut
857
858 sub data_structure {
859 my $self = shift;
860
861 my $log = $self->_get_logger();
862
863 my $rec = shift;
864 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
865
866 undef $self->{'currnet_filename'};
867 undef $self->{'headline'};
868
869 my @sorted_tags;
870 if ($self->{tags_by_order}) {
871 @sorted_tags = @{$self->{tags_by_order}};
872 } else {
873 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
874 $self->{tags_by_order} = \@sorted_tags;
875 }
876
877 my @ds;
878
879 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
880
881 foreach my $field (@sorted_tags) {
882
883 my $row;
884
885 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
886
887 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
888 my $format = $tag->{'value'} || $tag->{'content'};
889
890 $log->debug("format: $format");
891
892 my @v;
893 if ($format =~ /$LOOKUP_REGEX/o) {
894 @v = $self->fill_in_to_arr($rec,$format);
895 } else {
896 @v = $self->parse_to_arr($rec,$format);
897 }
898 next if (! @v);
899
900 if ($tag->{'sort'}) {
901 @v = $self->sort_arr(@v);
902 $log->warn("sort within tag is usually not what you want!");
903 }
904
905 # use format?
906 if ($tag->{'format_name'}) {
907 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
908 }
909
910 if ($field eq 'filename') {
911 $self->{'current_filename'} = join('',@v);
912 $log->debug("filename: ",$self->{'current_filename'});
913 } elsif ($field eq 'headline') {
914 $self->{'headline'} .= join('',@v);
915 $log->debug("headline: ",$self->{'headline'});
916 next; # don't return headline in data_structure!
917 }
918
919 # delimiter will join repeatable fields
920 if ($tag->{'delimiter'}) {
921 @v = ( join($tag->{'delimiter'}, @v) );
922 }
923
924 # default types
925 my @types = qw(display swish);
926 # override by type attribute
927 @types = ( $tag->{'type'} ) if ($tag->{'type'});
928
929 foreach my $type (@types) {
930 # append to previous line?
931 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
932 if ($tag->{'append'}) {
933
934 # I will delimit appended part with
935 # delimiter (or ,)
936 my $d = $tag->{'delimiter'};
937 # default delimiter
938 $d ||= " ";
939
940 my $last = pop @{$row->{$type}};
941 $d = "" if (! $last);
942 $last .= $d . join($d, @v);
943 push @{$row->{$type}}, $last;
944
945 } else {
946 push @{$row->{$type}}, @v;
947 }
948 }
949
950
951 }
952
953 if ($row) {
954 $row->{'tag'} = $field;
955
956 # TODO: name_sigular, name_plural
957 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
958 $row->{'name'} = $name ? $self->_x($name) : $field;
959
960 # post-sort all values in field
961 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
962 $log->warn("sort at field tag not implemented");
963 }
964
965 push @ds, $row;
966
967 $log->debug("row $field: ",sub { Dumper($row) });
968 }
969
970 }
971
972 return @ds;
973
974 }
975
976 =head2 output
977
978 Create output from in-memory data structure using Template Toolkit template.
979
980 my $text = $webpac->output( template => 'text.tt', data => @ds );
981
982 =cut
983
984 sub output {
985 my $self = shift;
986
987 my $args = {@_};
988
989 my $log = $self->_get_logger();
990
991 $log->logconfess("need template name") if (! $args->{'template'});
992 $log->logconfess("need data array") if (! $args->{'data'});
993
994 my $out;
995
996 $self->{'tt'}->process(
997 $args->{'template'},
998 $args,
999 \$out
1000 ) || confess $self->{'tt'}->error();
1001
1002 return $out;
1003 }
1004
1005 =head2 output_file
1006
1007 Create output from in-memory data structure using Template Toolkit template
1008 to a file.
1009
1010 $webpac->output_file(
1011 file => 'out.txt',
1012 template => 'text.tt',
1013 data => @ds
1014 );
1015
1016 =cut
1017
1018 sub output_file {
1019 my $self = shift;
1020
1021 my $args = {@_};
1022
1023 my $log = $self->_get_logger();
1024
1025 my $file = $args->{'file'} || $log->logconfess("need file name");
1026
1027 $log->debug("creating file ",$file);
1028
1029 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1030 print $fh $self->output(
1031 template => $args->{'template'},
1032 data => $args->{'data'},
1033 ) || $log->logdie("print: $!");
1034 close($fh) || $log->logdie("close: $!");
1035 }
1036
1037 =head2 apply_format
1038
1039 Apply format specified in tag with C<format_name="name"> and
1040 C<format_delimiter=";;">.
1041
1042 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1043
1044 Formats can contain C<lookup{...}> if you need them.
1045
1046 =cut
1047
1048 sub apply_format {
1049 my $self = shift;
1050
1051 my ($name,$delimiter,$data) = @_;
1052
1053 my $log = $self->_get_logger();
1054
1055 if (! $self->{'import_xml'}->{'format'}->{$name}) {
1056 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1057 return $data;
1058 }
1059
1060 $log->warn("no delimiter for format $name") if (! $delimiter);
1061
1062 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1063
1064 my @data = split(/\Q$delimiter\E/, $data);
1065
1066 my $out = sprintf($format, @data);
1067 $log->debug("using format $name [$format] on $data to produce: $out");
1068
1069 if ($out =~ m/$LOOKUP_REGEX/o) {
1070 return $self->lookup($out);
1071 } else {
1072 return $out;
1073 }
1074
1075 }
1076
1077
1078 #
1079 #
1080 #
1081
1082 =head1 INTERNAL METHODS
1083
1084 Here is a quick list of internal methods, mostly useful to turn debugging
1085 on them (see L<LOGGING> below for explanation).
1086
1087 =cut
1088
1089 =head2 _eval
1090
1091 Internal function to eval code without C<strict 'subs'>.
1092
1093 =cut
1094
1095 sub _eval {
1096 my $self = shift;
1097
1098 my $code = shift || return;
1099
1100 my $log = $self->_get_logger();
1101
1102 no strict 'subs';
1103 my $ret = eval $code;
1104 if ($@) {
1105 $log->error("problem with eval code [$code]: $@");
1106 }
1107
1108 $log->debug("eval: ",$code," [",$ret,"]");
1109
1110 return $ret || undef;
1111 }
1112
1113 =head2 _sort_by_order
1114
1115 Sort xml tags data structure accoding to C<order=""> attribute.
1116
1117 =cut
1118
1119 sub _sort_by_order {
1120 my $self = shift;
1121
1122 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1123 $self->{'import_xml'}->{'indexer'}->{$a};
1124 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1125 $self->{'import_xml'}->{'indexer'}->{$b};
1126
1127 return $va <=> $vb;
1128 }
1129
1130 =head2 _get_logger
1131
1132 Get C<Log::Log4perl> object with a twist: domains are defined for each
1133 method
1134
1135 my $log = $webpac->_get_logger();
1136
1137 =cut
1138
1139 sub _get_logger {
1140 my $self = shift;
1141
1142 my $name = (caller(1))[3] || caller;
1143 return get_logger($name);
1144 }
1145
1146 =head2 _x
1147
1148 Convert string from UTF-8 to code page defined in C<import_xml>.
1149
1150 my $text = $webpac->_x('utf8 text');
1151
1152 =cut
1153
1154 sub _x {
1155 my $self = shift;
1156 my $utf8 = shift || return;
1157
1158 return $self->{'utf2cp'}->convert($utf8) ||
1159 $self->_get_logger()->logwarn("can't convert '$utf8'");
1160 }
1161
1162 #
1163 #
1164 #
1165
1166 =head1 LOGGING
1167
1168 Logging in WebPAC is performed by L<Log::Log4perl> with config file
1169 C<log.conf>.
1170
1171 Methods defined above have different levels of logging, so
1172 it's descriptions will be useful to turn (mostry B<debug> logging) on
1173 or off to see why WabPAC isn't perforing as you expect it (it might even
1174 be a bug!).
1175
1176 B<This is different from normal Log4perl behaviour>. To repeat, you can
1177 also use method names, and not only classes (which are just few)
1178 to filter logging.
1179
1180
1181 =head1 MEMORY USAGE
1182
1183 C<low_mem> options is double-edged sword. If enabled, WebPAC
1184 will run on memory constraint machines (which doesn't have enough
1185 physical RAM to create memory structure for whole source database).
1186
1187 If your machine has 512Mb or more of RAM and database is around 10000 records,
1188 memory shouldn't be an issue. If you don't have enough physical RAM, you
1189 might consider using virtual memory (if your operating system is handling it
1190 well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1191 parsed structure of ISIS database (this is what C<low_mem> option does).
1192
1193 Hitting swap at end of reading source database is probably o.k. However,
1194 hitting swap before 90% will dramatically decrease performance and you will
1195 be better off with C<low_mem> and using rest of availble memory for
1196 operating system disk cache (Linux is particuallary good about this).
1197 However, every access to database record will require disk access, so
1198 generation phase will be slower 10-100 times.
1199
1200 Parsed structures are essential - you just have option to trade RAM memory
1201 (which is fast) for disk space (which is slow). Be sure to have planty of
1202 disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1203
1204 However, when WebPAC is running on desktop machines (or laptops :-), it's
1205 highly undesireable for system to start swapping. Using C<low_mem> option can
1206 reduce WecPAC memory usage to around 64Mb for same database with lookup
1207 fields and sorted indexes which stay in RAM. Performance will suffer, but
1208 memory usage will really be minimal. It might be also more confortable to
1209 run WebPAC reniced on those machines.
1210
1211 =cut
1212
1213 1;

  ViewVC Help
Powered by ViewVC 1.1.26