/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 609 - (show annotations)
Fri Dec 31 02:19:24 2004 UTC (19 years, 3 months ago) by dpavlin
File size: 28010 byte(s)
prefer IsisDB over OpenISIS, small improvement to progress bar

1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10 use Template;
11 use Log::Log4perl qw(get_logger :levels);
12 use Time::HiRes qw(time);
13
14 use Data::Dumper;
15
16 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17 #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18 my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19 my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20
21 =head1 NAME
22
23 WebPAC - base class for WebPAC
24
25 =head1 DESCRIPTION
26
27 This module implements methods used by WebPAC.
28
29 =head1 METHODS
30
31 =head2 new
32
33 Create new instance of WebPAC using configuration specified by C<config_file>.
34
35 my $webpac = new WebPAC(
36 config_file => 'name.conf',
37 code_page => 'ISO-8859-2',
38 low_mem => 1,
39 filter => {
40 'lower' => sub { lc($_[0]) },
41 },
42 );
43
44 Default C<code_page> is C<ISO-8859-2>.
45
46 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
47
48 There is optinal parametar C<filter> which specify different filters which
49 can be applied using C<filter{name}> notation.
50 Same filters can be used in Template Toolkit files.
51
52 This method will also read configuration files
53 C<global.conf> (used by indexer and Web font-end)
54 and configuration file specified by C<config_file>
55 which describes databases to be indexed.
56
57 =cut
58
59 # mapping between data type and tag which specify
60 # format in XML file
61 my %type2tag = (
62 'isis' => 'isis',
63 # 'excel' => 'column',
64 # 'marc' => 'marc',
65 # 'feed' => 'feed'
66 );
67
68 sub new {
69 my $class = shift;
70 my $self = {@_};
71 bless($self, $class);
72
73 $self->{'start_t'} = time();
74
75 my $log_file = $self->{'log'} || "log.conf";
76 Log::Log4perl->init($log_file);
77
78 my $log = $self->_get_logger();
79
80 # fill in default values
81 # output codepage
82 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
83
84 #
85 # read global.conf
86 #
87 $log->debug("read 'global.conf'");
88
89 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
90
91 # read global config parametars
92 foreach my $var (qw(
93 dbi_dbd
94 dbi_dsn
95 dbi_user
96 dbi_passwd
97 show_progress
98 my_unac_filter
99 output_template
100 )) {
101 $self->{'global_config'}->{$var} = $config->val('global', $var);
102 }
103
104 #
105 # read indexer config file
106 #
107
108 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
109
110 # create UTF-8 convertor for import_xml files
111 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
112
113 # create Template toolkit instance
114 $self->{'tt'} = Template->new(
115 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
116 FILTERS => $self->{'filter'},
117 EVAL_PERL => 1,
118 );
119
120 # running with low_mem flag? well, use DBM::Deep then.
121 if ($self->{'low_mem'}) {
122 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
123
124 my $db_file = "data.db";
125
126 if (-e $db_file) {
127 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
128 $log->debug("removed '$db_file' from last run");
129 }
130
131 require DBM::Deep;
132
133 my $db = new DBM::Deep $db_file;
134
135 $log->logdie("DBM::Deep error: $!") unless ($db);
136
137 if ($db->error()) {
138 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
139 } else {
140 $log->debug("using file '$db_file' for DBM::Deep");
141 }
142
143 $self->{'db'} = $db;
144 }
145
146 $log->debug("filters defined: ",Dumper($self->{'filter'}));
147
148 return $self;
149 }
150
151 =head2 open_isis
152
153 Open CDS/ISIS, WinISIS or IsisMarc database using IsisDB or OpenIsis module
154 and read all records to memory.
155
156 $webpac->open_isis(
157 filename => '/data/ISIS/ISIS',
158 code_page => '852',
159 limit_mfn => 500,
160 start_mfn => 6000,
161 lookup => [ ... ],
162 );
163
164 By default, ISIS code page is assumed to be C<852>.
165
166 If optional parametar C<start_mfn> is set, this will be first MFN to read
167 from database (so you can skip beginning of your database if you need to).
168
169 If optional parametar C<limit_mfn> is set, it will read just 500 records
170 from database in example above.
171
172 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
173 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
174 value in index.
175
176 lookup => [
177 { 'key' => 'd:v900', 'val' => 'v250^a' },
178 { 'eval' => '"v901^a" eq "Podruèje"',
179 'key' => 'pa:v561^4:v562^4:v461^1',
180 'val' => 'v900' },
181 ]
182
183 Returns number of last record read into memory (size of database, really).
184
185 =cut
186
187 sub open_isis {
188 my $self = shift;
189 my $arg = {@_};
190
191 my $log = $self->_get_logger();
192
193 $log->logcroak("need filename") if (! $arg->{'filename'});
194 my $code_page = $arg->{'code_page'} || '852';
195
196 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
197
198 # store data in object
199 $self->{'isis_filename'} = $arg->{'filename'};
200 $self->{'isis_code_page'} = $code_page;
201
202 #$self->{'isis_code_page'} = $code_page;
203
204 # create Text::Iconv object
205 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
206
207 $log->info("reading ISIS database '",$arg->{'filename'},"'");
208 $log->debug("isis code page: $code_page");
209
210 my $use_openisis = 1;
211
212 eval { use IsisDB 0.06; };
213 $use_openisis = 0 unless ($@);
214
215 my ($isis_db,$maxmfn);
216
217 if ($use_openisis) {
218 use OpenIsis;
219 $isis_db = OpenIsis::open($arg->{'filename'});
220 $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
221 } else {
222 $isis_db = new IsisDB(
223 isisdb => $arg->{'filename'},
224 include_deleted => 1,
225 hash_filter => sub {
226 my $l = shift || return;
227 $l = $cp->convert($l);
228 return $l;
229 },
230 );
231 $maxmfn = $isis_db->{'maxmfn'};
232 }
233
234
235 my $startmfn = 1;
236
237 if (my $s = $self->{'start_mfn'}) {
238 $log->info("skipping to MFN $s");
239 $startmfn = $s;
240 } else {
241 $self->{'start_mfn'} = $startmfn;
242 }
243
244 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
245
246 $log->info("processing ",($maxmfn-$startmfn)." records using ",( $use_openisis ? 'OpenIsis' : 'IsisDB'));
247
248
249 # read database
250 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
251
252
253 $log->debug("mfn: $mfn\n");
254
255 my $rec;
256
257 if ($use_openisis) {
258
259 # read record using OpenIsis
260 my $row = OpenIsis::read( $isis_db, $mfn );
261 foreach my $k (keys %{$row}) {
262 if ($k ne "mfn") {
263 foreach my $l (@{$row->{$k}}) {
264 $l = $cp->convert($l);
265 # has subfields?
266 my $val;
267 if ($l =~ m/\^/) {
268 foreach my $t (split(/\^/,$l)) {
269 next if (! $t);
270 $val->{substr($t,0,1)} = substr($t,1);
271 }
272 } else {
273 $val = $l;
274 }
275
276 push @{$rec->{$k}}, $val;
277 }
278 } else {
279 push @{$rec->{'000'}}, $mfn;
280 }
281 }
282
283 } else {
284 $rec = $isis_db->to_hash($mfn);
285 }
286
287 $log->confess("record $mfn empty?") unless ($rec);
288
289 # store
290 if ($self->{'low_mem'}) {
291 $self->{'db'}->put($mfn, $rec);
292 } else {
293 $self->{'data'}->{$mfn} = $rec;
294 }
295
296 # create lookup
297 $self->create_lookup($rec, @{$arg->{'lookup'}});
298
299 $self->progress_bar($mfn,$maxmfn);
300
301 }
302
303 $self->{'current_mfn'} = -1;
304 $self->{'last_pcnt'} = 0;
305
306 $log->debug("max mfn: $maxmfn");
307
308 # store max mfn and return it.
309 return $self->{'max_mfn'} = $maxmfn;
310 }
311
312 =head2 fetch_rec
313
314 Fetch next record from database. It will also display progress bar (once
315 it's implemented, that is).
316
317 my $rec = $webpac->fetch_rec;
318
319 =cut
320
321 sub fetch_rec {
322 my $self = shift;
323
324 my $log = $self->_get_logger();
325
326 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
327
328 if ($self->{'current_mfn'} == -1) {
329 $self->{'current_mfn'} = $self->{'start_mfn'};
330 } else {
331 $self->{'current_mfn'}++;
332 }
333
334 my $mfn = $self->{'current_mfn'};
335
336 if ($mfn > $self->{'max_mfn'}) {
337 $self->{'current_mfn'} = $self->{'max_mfn'};
338 $log->debug("at EOF");
339 return;
340 }
341
342 $self->progress_bar($mfn,$self->{'max_mfn'});
343
344 if ($self->{'low_mem'}) {
345 return $self->{'db'}->get($mfn);
346 } else {
347 return $self->{'data'}->{$mfn};
348 }
349 }
350
351 =head2 mfn
352
353 Returns current record number (MFN).
354
355 print $webpac->mfn;
356
357 =cut
358
359 sub mfn {
360 my $self = shift;
361 return $self->{'current_mfn'};
362 }
363
364 =head2 progress_bar
365
366 Draw progress bar on STDERR.
367
368 $webpac->progress_bar($current, $max);
369
370 =cut
371
372 sub progress_bar {
373 my $self = shift;
374
375 my ($curr,$max) = @_;
376
377 my $log = $self->_get_logger();
378
379 $log->logconfess("no current value!") if (! $curr);
380 $log->logconfess("no maximum value!") if (! $max);
381
382 if ($curr > $max) {
383 $max = $curr;
384 $log->debug("overflow to $curr");
385 }
386
387 $self->{'last_pcnt'} ||= 1;
388
389 my $p = int($curr * 100 / $max) || 1;
390
391 # reset on re-run
392 if ($p < $self->{'last_pcnt'}) {
393 $self->{'last_pcnt'} = $p;
394 $self->{'start_t'} = time();
395 }
396
397 if ($p != $self->{'last_pcnt'}) {
398
399 my $t = time();
400 my $rate = ($curr / ($t - $self->{'start_t'} || 1));
401 my $eta = ($max-$curr) / ($rate || 1);
402 printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
403 $self->{'last_pcnt'} = $p;
404 $self->{'last_t'} = time();
405 $self->{'last_curr'} = $curr;
406 }
407 print STDERR "\n" if ($p == 100);
408 }
409
410 =head2 fmt_time
411
412 Format time (in seconds) for display.
413
414 print $webpac->fmt_time(time());
415
416 This method is called by L<progress_bar> to display remaining time.
417
418 =cut
419
420 sub fmt_time {
421 my $self = shift;
422
423 my $t = shift || 0;
424 my $out = "";
425
426 my ($ss,$mm,$hh) = gmtime($t);
427 $out .= "${hh}h" if ($hh);
428 $out .= sprintf("%02d:%02d", $mm,$ss);
429 $out .= " " if ($hh == 0);
430 return $out;
431 }
432
433 =head2 open_import_xml
434
435 Read file from C<import_xml/> directory and parse it.
436
437 $webpac->open_import_xml(type => 'isis');
438
439 =cut
440
441 sub open_import_xml {
442 my $self = shift;
443
444 my $log = $self->_get_logger();
445
446 my $arg = {@_};
447 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
448
449 $self->{'type'} = $arg->{'type'};
450
451 my $type_base = $arg->{'type'};
452 $type_base =~ s/_.*$//g;
453
454 $self->{'tag'} = $type2tag{$type_base};
455
456 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
457
458 my $f = "./import_xml/".$self->{'type'}.".xml";
459 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
460
461 $log->info("reading '$f'");
462
463 $self->{'import_xml_file'} = $f;
464
465 $self->{'import_xml'} = XMLin($f,
466 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
467 );
468
469 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
470
471 }
472
473 =head2 create_lookup
474
475 Create lookup from record using lookup definition.
476
477 $self->create_lookup($rec, @lookups);
478
479 Called internally by C<open_*> methods.
480
481 =cut
482
483 sub create_lookup {
484 my $self = shift;
485
486 my $log = $self->_get_logger();
487
488 my $rec = shift || $log->logconfess("need record to create lookup");
489 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
490
491 foreach my $i (@_) {
492 $log->logconfess("need key") unless defined($i->{'key'});
493 $log->logconfess("need val") unless defined($i->{'val'});
494
495 if (defined($i->{'eval'})) {
496 # eval first, so we can skip fill_in for key and val
497 my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
498 if ($self->_eval($eval)) {
499 my $key = $self->fill_in($rec,$i->{'key'}) || next;
500 my @val = $self->fill_in($rec,$i->{'val'}) || next;
501 $log->debug("stored $key = ",sub { join(" | ",@val) });
502 push @{$self->{'lookup'}->{$key}}, @val;
503 }
504 } else {
505 my $key = $self->fill_in($rec,$i->{'key'}) || next;
506 my @val = $self->fill_in($rec,$i->{'val'}) || next;
507 $log->debug("stored $key = ",sub { join(" | ",@val) });
508 push @{$self->{'lookup'}->{$key}}, @val;
509 }
510 }
511 }
512
513 =head2 get_data
514
515 Returns value from record.
516
517 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
518
519 Arguments are:
520 record reference C<$rec>,
521 field C<$f>,
522 optional subfiled C<$sf>,
523 index for repeatable values C<$i>.
524
525 Optinal variable C<$found> will be incremeted if there
526 is field.
527
528 Returns value or empty string.
529
530 =cut
531
532 sub get_data {
533 my $self = shift;
534
535 my ($rec,$f,$sf,$i,$found) = @_;
536
537 if ($$rec->{$f}) {
538 return '' if (! $$rec->{$f}->[$i]);
539 no strict 'refs';
540 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
541 $$found++ if (defined($$found));
542 return $$rec->{$f}->[$i]->{$sf};
543 } elsif ($$rec->{$f}->[$i]) {
544 $$found++ if (defined($$found));
545 # it still might have subfield, just
546 # not specified, so we'll dump all
547 if ($$rec->{$f}->[$i] =~ /HASH/o) {
548 my $out;
549 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
550 $out .= $$rec->{$f}->[$i]->{$k}." ";
551 }
552 return $out;
553 } else {
554 return $$rec->{$f}->[$i];
555 }
556 }
557 } else {
558 return '';
559 }
560 }
561
562 =head2 fill_in
563
564 Workhourse of all: takes record from in-memory structure of database and
565 strings with placeholders and returns string or array of with substituted
566 values from record.
567
568 my $text = $webpac->fill_in($rec,'v250^a');
569
570 Optional argument is ordinal number for repeatable fields. By default,
571 it's assume to be first repeatable field (fields are perl array, so first
572 element is 0).
573 Following example will read second value from repeatable field.
574
575 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
576
577 This function B<does not> perform parsing of format to inteligenty skip
578 delimiters before fields which aren't used.
579
580 This method will automatically decode UTF-8 string to local code page
581 if needed.
582
583 =cut
584
585 sub fill_in {
586 my $self = shift;
587
588 my $log = $self->_get_logger();
589
590 my $rec = shift || $log->logconfess("need data record");
591 my $format = shift || $log->logconfess("need format to parse");
592 # iteration (for repeatable fields)
593 my $i = shift || 0;
594
595 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
596
597 # FIXME remove for speedup?
598 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
599
600 if (utf8::is_utf8($format)) {
601 $format = $self->_x($format);
602 }
603
604 my $found = 0;
605
606 my $eval_code;
607 # remove eval{...} from beginning
608 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
609
610 my $filter_name;
611 # remove filter{...} from beginning
612 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
613
614 # do actual replacement of placeholders
615 # repeatable fields
616 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
617 # non-repeatable fields
618 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
619
620 if ($found) {
621 $log->debug("format: $format");
622 if ($eval_code) {
623 my $eval = $self->fill_in($rec,$eval_code,$i);
624 return if (! $self->_eval($eval));
625 }
626 if ($filter_name && $self->{'filter'}->{$filter_name}) {
627 $log->debug("filter '$filter_name' for $format");
628 $format = $self->{'filter'}->{$filter_name}->($format);
629 return unless(defined($format));
630 $log->debug("filter result: $format");
631 }
632 # do we have lookups?
633 if ($format =~ /$LOOKUP_REGEX/o) {
634 $log->debug("format '$format' has lookup");
635 return $self->lookup($format);
636 } else {
637 return $format;
638 }
639 } else {
640 return;
641 }
642 }
643
644 =head2 lookup
645
646 Perform lookups on format supplied to it.
647
648 my $text = $self->lookup('[v900]');
649
650 Lookups can be nested (like C<[d:[a:[v900]]]>).
651
652 =cut
653
654 sub lookup {
655 my $self = shift;
656
657 my $log = $self->_get_logger();
658
659 my $tmp = shift || $log->logconfess("need format");
660
661 if ($tmp =~ /$LOOKUP_REGEX/o) {
662 my @in = ( $tmp );
663
664 $log->debug("lookup for: ",$tmp);
665
666 my @out;
667 while (my $f = shift @in) {
668 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
669 my $k = $1;
670 if ($self->{'lookup'}->{$k}) {
671 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
672 my $tmp2 = $f;
673 $tmp2 =~ s/lookup{$k}/$nv/g;
674 push @in, $tmp2;
675 }
676 } else {
677 undef $f;
678 }
679 } elsif ($f) {
680 push @out, $f;
681 }
682 }
683 $log->logconfess("return is array and it's not expected!") unless wantarray;
684 return @out;
685 } else {
686 return $tmp;
687 }
688 }
689
690 =head2 parse
691
692 Perform smart parsing of string, skipping delimiters for fields which aren't
693 defined. It can also eval code in format starting with C<eval{...}> and
694 return output or nothing depending on eval code.
695
696 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
697
698 =cut
699
700 sub parse {
701 my $self = shift;
702
703 my ($rec, $format_utf8, $i) = @_;
704
705 return if (! $format_utf8);
706
707 my $log = $self->_get_logger();
708
709 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
710 $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
711
712 $i = 0 if (! $i);
713
714 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
715
716 my @out;
717
718 $log->debug("format: $format");
719
720 my $eval_code;
721 # remove eval{...} from beginning
722 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
723
724 my $filter_name;
725 # remove filter{...} from beginning
726 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
727
728 my $prefix;
729 my $all_found=0;
730
731 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
732
733 my $del = $1 || '';
734 $prefix ||= $del if ($all_found == 0);
735
736 # repeatable index
737 my $r = $i;
738 $r = 0 if (lc("$2") eq 's');
739
740 my $found = 0;
741 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
742
743 if ($found) {
744 push @out, $del;
745 push @out, $tmp;
746 $all_found += $found;
747 }
748 }
749
750 return if (! $all_found);
751
752 my $out = join('',@out);
753
754 if ($out) {
755 # add rest of format (suffix)
756 $out .= $format;
757
758 # add prefix if not there
759 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
760
761 $log->debug("result: $out");
762 }
763
764 if ($eval_code) {
765 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
766 $log->debug("about to eval{$eval} format: $out");
767 return if (! $self->_eval($eval));
768 }
769
770 if ($filter_name && $self->{'filter'}->{$filter_name}) {
771 $log->debug("about to filter{$filter_name} format: $out");
772 $out = $self->{'filter'}->{$filter_name}->($out);
773 return unless(defined($out));
774 $log->debug("filter result: $out");
775 }
776
777 return $out;
778 }
779
780 =head2 parse_to_arr
781
782 Similar to C<parse>, but returns array of all repeatable fields
783
784 my @arr = $webpac->parse_to_arr($rec,'v250^a');
785
786 =cut
787
788 sub parse_to_arr {
789 my $self = shift;
790
791 my ($rec, $format_utf8) = @_;
792
793 my $log = $self->_get_logger();
794
795 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
796 return if (! $format_utf8);
797
798 my $i = 0;
799 my @arr;
800
801 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
802 push @arr, $v;
803 }
804
805 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
806
807 return @arr;
808 }
809
810 =head2 fill_in_to_arr
811
812 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
813 for fields which have lookups, so they shouldn't be parsed but rather
814 C<fill_id>ed.
815
816 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
817
818 =cut
819
820 sub fill_in_to_arr {
821 my $self = shift;
822
823 my ($rec, $format_utf8) = @_;
824
825 my $log = $self->_get_logger();
826
827 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
828 return if (! $format_utf8);
829
830 my $i = 0;
831 my @arr;
832
833 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
834 push @arr, @v;
835 }
836
837 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
838
839 return @arr;
840 }
841
842 =head2 sort_arr
843
844 Sort array ignoring case and html in data
845
846 my @sorted = $webpac->sort_arr(@unsorted);
847
848 =cut
849
850 sub sort_arr {
851 my $self = shift;
852
853 my $log = $self->_get_logger();
854
855 # FIXME add Schwartzian Transformation?
856
857 my @sorted = sort {
858 $a =~ s#<[^>]+/*>##;
859 $b =~ s#<[^>]+/*>##;
860 lc($b) cmp lc($a)
861 } @_;
862 $log->debug("sorted values: ",sub { join(", ",@sorted) });
863
864 return @sorted;
865 }
866
867
868 =head2 data_structure
869
870 Create in-memory data structure which represents layout from C<import_xml>.
871 It is used later to produce output.
872
873 my @ds = $webpac->data_structure($rec);
874
875 This method will also set C<$webpac->{'currnet_filename'}> if there is
876 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
877 <headline> tag.
878
879 =cut
880
881 sub data_structure {
882 my $self = shift;
883
884 my $log = $self->_get_logger();
885
886 my $rec = shift;
887 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
888
889 undef $self->{'currnet_filename'};
890 undef $self->{'headline'};
891
892 my @sorted_tags;
893 if ($self->{tags_by_order}) {
894 @sorted_tags = @{$self->{tags_by_order}};
895 } else {
896 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
897 $self->{tags_by_order} = \@sorted_tags;
898 }
899
900 my @ds;
901
902 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
903
904 foreach my $field (@sorted_tags) {
905
906 my $row;
907
908 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
909
910 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
911 my $format = $tag->{'value'} || $tag->{'content'};
912
913 $log->debug("format: $format");
914
915 my @v;
916 if ($format =~ /$LOOKUP_REGEX/o) {
917 @v = $self->fill_in_to_arr($rec,$format);
918 } else {
919 @v = $self->parse_to_arr($rec,$format);
920 }
921 next if (! @v);
922
923 if ($tag->{'sort'}) {
924 @v = $self->sort_arr(@v);
925 }
926
927 # use format?
928 if ($tag->{'format_name'}) {
929 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
930 }
931
932 if ($field eq 'filename') {
933 $self->{'current_filename'} = join('',@v);
934 $log->debug("filename: ",$self->{'current_filename'});
935 } elsif ($field eq 'headline') {
936 $self->{'headline'} .= join('',@v);
937 $log->debug("headline: ",$self->{'headline'});
938 next; # don't return headline in data_structure!
939 }
940
941 # delimiter will join repeatable fields
942 if ($tag->{'delimiter'}) {
943 @v = ( join($tag->{'delimiter'}, @v) );
944 }
945
946 # default types
947 my @types = qw(display swish);
948 # override by type attribute
949 @types = ( $tag->{'type'} ) if ($tag->{'type'});
950
951 foreach my $type (@types) {
952 # append to previous line?
953 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
954 if ($tag->{'append'}) {
955
956 # I will delimit appended part with
957 # delimiter (or ,)
958 my $d = $tag->{'delimiter'};
959 # default delimiter
960 $d ||= " ";
961
962 my $last = pop @{$row->{$type}};
963 $d = "" if (! $last);
964 $last .= $d . join($d, @v);
965 push @{$row->{$type}}, $last;
966
967 } else {
968 push @{$row->{$type}}, @v;
969 }
970 }
971
972
973 }
974
975 if ($row) {
976 $row->{'tag'} = $field;
977
978 # TODO: name_sigular, name_plural
979 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
980 $row->{'name'} = $name ? $self->_x($name) : $field;
981
982 # post-sort all values in field
983 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
984 $log->warn("sort at field tag not implemented");
985 }
986
987 push @ds, $row;
988
989 $log->debug("row $field: ",sub { Dumper($row) });
990 }
991
992 }
993
994 return @ds;
995
996 }
997
998 =head2 output
999
1000 Create output from in-memory data structure using Template Toolkit template.
1001
1002 my $text = $webpac->output( template => 'text.tt', data => @ds );
1003
1004 =cut
1005
1006 sub output {
1007 my $self = shift;
1008
1009 my $args = {@_};
1010
1011 my $log = $self->_get_logger();
1012
1013 $log->logconfess("need template name") if (! $args->{'template'});
1014 $log->logconfess("need data array") if (! $args->{'data'});
1015
1016 my $out;
1017
1018 $self->{'tt'}->process(
1019 $args->{'template'},
1020 $args,
1021 \$out
1022 ) || confess $self->{'tt'}->error();
1023
1024 return $out;
1025 }
1026
1027 =head2 output_file
1028
1029 Create output from in-memory data structure using Template Toolkit template
1030 to a file.
1031
1032 $webpac->output_file(
1033 file => 'out.txt',
1034 template => 'text.tt',
1035 data => @ds
1036 );
1037
1038 =cut
1039
1040 sub output_file {
1041 my $self = shift;
1042
1043 my $args = {@_};
1044
1045 my $log = $self->_get_logger();
1046
1047 my $file = $args->{'file'} || $log->logconfess("need file name");
1048
1049 $log->debug("creating file ",$file);
1050
1051 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1052 print $fh $self->output(
1053 template => $args->{'template'},
1054 data => $args->{'data'},
1055 ) || $log->logdie("print: $!");
1056 close($fh) || $log->logdie("close: $!");
1057 }
1058
1059 =head2 apply_format
1060
1061 Apply format specified in tag with C<format_name="name"> and
1062 C<format_delimiter=";;">.
1063
1064 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1065
1066 Formats can contain C<lookup{...}> if you need them.
1067
1068 =cut
1069
1070 sub apply_format {
1071 my $self = shift;
1072
1073 my ($name,$delimiter,$data) = @_;
1074
1075 my $log = $self->_get_logger();
1076
1077 if (! $self->{'import_xml'}->{'format'}->{$name}) {
1078 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1079 return $data;
1080 }
1081
1082 $log->warn("no delimiter for format $name") if (! $delimiter);
1083
1084 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1085
1086 my @data = split(/\Q$delimiter\E/, $data);
1087
1088 my $out = sprintf($format, @data);
1089 $log->debug("using format $name [$format] on $data to produce: $out");
1090
1091 if ($out =~ m/$LOOKUP_REGEX/o) {
1092 return $self->lookup($out);
1093 } else {
1094 return $out;
1095 }
1096
1097 }
1098
1099
1100 #
1101 #
1102 #
1103
1104 =head1 INTERNAL METHODS
1105
1106 Here is a quick list of internal methods, mostly useful to turn debugging
1107 on them (see L<LOGGING> below for explanation).
1108
1109 =cut
1110
1111 =head2 _eval
1112
1113 Internal function to eval code without C<strict 'subs'>.
1114
1115 =cut
1116
1117 sub _eval {
1118 my $self = shift;
1119
1120 my $code = shift || return;
1121
1122 my $log = $self->_get_logger();
1123
1124 no strict 'subs';
1125 my $ret = eval $code;
1126 if ($@) {
1127 $log->error("problem with eval code [$code]: $@");
1128 }
1129
1130 $log->debug("eval: ",$code," [",$ret,"]");
1131
1132 return $ret || undef;
1133 }
1134
1135 =head2 _sort_by_order
1136
1137 Sort xml tags data structure accoding to C<order=""> attribute.
1138
1139 =cut
1140
1141 sub _sort_by_order {
1142 my $self = shift;
1143
1144 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1145 $self->{'import_xml'}->{'indexer'}->{$a};
1146 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1147 $self->{'import_xml'}->{'indexer'}->{$b};
1148
1149 return $va <=> $vb;
1150 }
1151
1152 =head2 _get_logger
1153
1154 Get C<Log::Log4perl> object with a twist: domains are defined for each
1155 method
1156
1157 my $log = $webpac->_get_logger();
1158
1159 =cut
1160
1161 sub _get_logger {
1162 my $self = shift;
1163
1164 my $name = (caller(1))[3] || caller;
1165 return get_logger($name);
1166 }
1167
1168 =head2 _x
1169
1170 Convert string from UTF-8 to code page defined in C<import_xml>.
1171
1172 my $text = $webpac->_x('utf8 text');
1173
1174 =cut
1175
1176 sub _x {
1177 my $self = shift;
1178 my $utf8 = shift || return;
1179
1180 return $self->{'utf2cp'}->convert($utf8) ||
1181 $self->_get_logger()->logwarn("can't convert '$utf8'");
1182 }
1183
1184 #
1185 #
1186 #
1187
1188 =head1 LOGGING
1189
1190 Logging in WebPAC is performed by L<Log::Log4perl> with config file
1191 C<log.conf>.
1192
1193 Methods defined above have different levels of logging, so
1194 it's descriptions will be useful to turn (mostry B<debug> logging) on
1195 or off to see why WabPAC isn't perforing as you expect it (it might even
1196 be a bug!).
1197
1198 B<This is different from normal Log4perl behaviour>. To repeat, you can
1199 also use method names, and not only classes (which are just few)
1200 to filter logging.
1201
1202
1203 =head1 MEMORY USAGE
1204
1205 C<low_mem> options is double-edged sword. If enabled, WebPAC
1206 will run on memory constraint machines (which doesn't have enough
1207 physical RAM to create memory structure for whole source database).
1208
1209 If your machine has 512Mb or more of RAM and database is around 10000 records,
1210 memory shouldn't be an issue. If you don't have enough physical RAM, you
1211 might consider using virtual memory (if your operating system is handling it
1212 well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1213 parsed structure of ISIS database (this is what C<low_mem> option does).
1214
1215 Hitting swap at end of reading source database is probably o.k. However,
1216 hitting swap before 90% will dramatically decrease performance and you will
1217 be better off with C<low_mem> and using rest of availble memory for
1218 operating system disk cache (Linux is particuallary good about this).
1219 However, every access to database record will require disk access, so
1220 generation phase will be slower 10-100 times.
1221
1222 Parsed structures are essential - you just have option to trade RAM memory
1223 (which is fast) for disk space (which is slow). Be sure to have planty of
1224 disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1225
1226 However, when WebPAC is running on desktop machines (or laptops :-), it's
1227 highly undesireable for system to start swapping. Using C<low_mem> option can
1228 reduce WecPAC memory usage to around 64Mb for same database with lookup
1229 fields and sorted indexes which stay in RAM. Performance will suffer, but
1230 memory usage will really be minimal. It might be also more confortable to
1231 run WebPAC reniced on those machines.
1232
1233 =cut
1234
1235 1;

  ViewVC Help
Powered by ViewVC 1.1.26