/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 616 - (show annotations)
Fri Dec 31 03:34:33 2004 UTC (15 years, 6 months ago) by dpavlin
File size: 27980 byte(s)
cleanup

1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10 use Template;
11 use Log::Log4perl qw(get_logger :levels);
12 use Time::HiRes qw(time);
13
14 use Data::Dumper;
15
16 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17 #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18 my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19 my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20
21 =head1 NAME
22
23 WebPAC - base class for WebPAC
24
25 =head1 DESCRIPTION
26
27 This module implements methods used by WebPAC.
28
29 =head1 METHODS
30
31 =head2 new
32
33 Create new instance of WebPAC using configuration specified by C<config_file>.
34
35 my $webpac = new WebPAC(
36 config_file => 'name.conf',
37 code_page => 'ISO-8859-2',
38 low_mem => 1,
39 filter => {
40 'lower' => sub { lc($_[0]) },
41 },
42 );
43
44 Default C<code_page> is C<ISO-8859-2>.
45
46 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
47
48 There is optinal parametar C<filter> which specify different filters which
49 can be applied using C<filter{name}> notation.
50 Same filters can be used in Template Toolkit files.
51
52 This method will also read configuration files
53 C<global.conf> (used by indexer and Web font-end)
54 and configuration file specified by C<config_file>
55 which describes databases to be indexed.
56
57 =cut
58
59 # mapping between data type and tag which specify
60 # format in XML file
61 my %type2tag = (
62 'isis' => 'isis',
63 # 'excel' => 'column',
64 # 'marc' => 'marc',
65 # 'feed' => 'feed'
66 );
67
68 sub new {
69 my $class = shift;
70 my $self = {@_};
71 bless($self, $class);
72
73 $self->{'start_t'} = time();
74
75 my $log_file = $self->{'log'} || "log.conf";
76 Log::Log4perl->init($log_file);
77
78 my $log = $self->_get_logger();
79
80 # fill in default values
81 # output codepage
82 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
83
84 #
85 # read global.conf
86 #
87 $log->debug("read 'global.conf'");
88
89 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
90
91 # read global config parametars
92 foreach my $var (qw(
93 dbi_dbd
94 dbi_dsn
95 dbi_user
96 dbi_passwd
97 show_progress
98 my_unac_filter
99 output_template
100 )) {
101 $self->{'global_config'}->{$var} = $config->val('global', $var);
102 }
103
104 #
105 # read indexer config file
106 #
107
108 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
109
110 # create UTF-8 convertor for import_xml files
111 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
112
113 # create Template toolkit instance
114 $self->{'tt'} = Template->new(
115 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
116 FILTERS => $self->{'filter'},
117 EVAL_PERL => 1,
118 );
119
120 # running with low_mem flag? well, use DBM::Deep then.
121 if ($self->{'low_mem'}) {
122 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
123
124 my $db_file = "data.db";
125
126 if (-e $db_file) {
127 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
128 $log->debug("removed '$db_file' from last run");
129 }
130
131 require DBM::Deep;
132
133 my $db = new DBM::Deep $db_file;
134
135 $log->logdie("DBM::Deep error: $!") unless ($db);
136
137 if ($db->error()) {
138 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
139 } else {
140 $log->debug("using file '$db_file' for DBM::Deep");
141 }
142
143 $self->{'db'} = $db;
144 }
145
146 $log->debug("filters defined: ",Dumper($self->{'filter'}));
147
148 return $self;
149 }
150
151 =head2 open_isis
152
153 Open CDS/ISIS, WinISIS or IsisMarc database using IsisDB or OpenIsis module
154 and read all records to memory.
155
156 $webpac->open_isis(
157 filename => '/data/ISIS/ISIS',
158 code_page => '852',
159 limit_mfn => 500,
160 start_mfn => 6000,
161 lookup => [ ... ],
162 );
163
164 By default, ISIS code page is assumed to be C<852>.
165
166 If optional parametar C<start_mfn> is set, this will be first MFN to read
167 from database (so you can skip beginning of your database if you need to).
168
169 If optional parametar C<limit_mfn> is set, it will read just 500 records
170 from database in example above.
171
172 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
173 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
174 value in index.
175
176 lookup => [
177 { 'key' => 'd:v900', 'val' => 'v250^a' },
178 { 'eval' => '"v901^a" eq "Podruèje"',
179 'key' => 'pa:v561^4:v562^4:v461^1',
180 'val' => 'v900' },
181 ]
182
183 Returns number of last record read into memory (size of database, really).
184
185 =cut
186
187 sub open_isis {
188 my $self = shift;
189 my $arg = {@_};
190
191 my $log = $self->_get_logger();
192
193 $log->logcroak("need filename") if (! $arg->{'filename'});
194 my $code_page = $arg->{'code_page'} || '852';
195
196 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
197
198 # store data in object
199 $self->{'isis_filename'} = $arg->{'filename'};
200 $self->{'isis_code_page'} = $code_page;
201
202 #$self->{'isis_code_page'} = $code_page;
203
204 # create Text::Iconv object
205 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
206
207 $log->info("reading ISIS database '",$arg->{'filename'},"'");
208 $log->debug("isis code page: $code_page");
209
210 my $use_openisis = 1;
211
212 eval { use IsisDB 0.06; };
213 $use_openisis = 0 unless ($@);
214
215 my ($isis_db,$maxmfn);
216
217 if ($use_openisis) {
218 use OpenIsis;
219 $isis_db = OpenIsis::open($arg->{'filename'});
220 $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
221 } else {
222 $isis_db = new IsisDB(
223 isisdb => $arg->{'filename'},
224 include_deleted => 1,
225 hash_filter => sub {
226 my $l = shift || return;
227 $l = $cp->convert($l);
228 return $l;
229 },
230 );
231 $maxmfn = $isis_db->{'maxmfn'};
232 }
233
234
235 my $startmfn = 1;
236
237 if (my $s = $self->{'start_mfn'}) {
238 $log->info("skipping to MFN $s");
239 $startmfn = $s;
240 } else {
241 $self->{'start_mfn'} = $startmfn;
242 }
243
244 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
245
246 $log->info("processing ",($maxmfn-$startmfn)." records using ",( $use_openisis ? 'OpenIsis' : 'IsisDB'));
247
248
249 # read database
250 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
251
252
253 $log->debug("mfn: $mfn\n");
254
255 my $rec;
256
257 if ($use_openisis) {
258
259 # read record using OpenIsis
260 my $row = OpenIsis::read( $isis_db, $mfn );
261 foreach my $k (keys %{$row}) {
262 if ($k ne "mfn") {
263 foreach my $l (@{$row->{$k}}) {
264 $l = $cp->convert($l);
265 # has subfields?
266 my $val;
267 if ($l =~ m/\^/) {
268 foreach my $t (split(/\^/,$l)) {
269 next if (! $t);
270 $val->{substr($t,0,1)} = substr($t,1);
271 }
272 } else {
273 $val = $l;
274 }
275
276 push @{$rec->{$k}}, $val;
277 }
278 } else {
279 push @{$rec->{'000'}}, $mfn;
280 }
281 }
282
283 } else {
284 $rec = $isis_db->to_hash($mfn);
285 }
286
287 $log->confess("record $mfn empty?") unless ($rec);
288
289 # store
290 if ($self->{'low_mem'}) {
291 $self->{'db'}->put($mfn, $rec);
292 } else {
293 $self->{'data'}->{$mfn} = $rec;
294 }
295
296 # create lookup
297 $self->create_lookup($rec, @{$arg->{'lookup'}});
298
299 $self->progress_bar($mfn,$maxmfn);
300
301 }
302
303 $self->{'current_mfn'} = -1;
304 $self->{'last_pcnt'} = 0;
305
306 $log->debug("max mfn: $maxmfn");
307
308 # store max mfn and return it.
309 return $self->{'max_mfn'} = $maxmfn;
310 }
311
312 =head2 fetch_rec
313
314 Fetch next record from database. It will also display progress bar (once
315 it's implemented, that is).
316
317 my $rec = $webpac->fetch_rec;
318
319 =cut
320
321 sub fetch_rec {
322 my $self = shift;
323
324 my $log = $self->_get_logger();
325
326 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
327
328 if ($self->{'current_mfn'} == -1) {
329 $self->{'current_mfn'} = $self->{'start_mfn'};
330 } else {
331 $self->{'current_mfn'}++;
332 }
333
334 my $mfn = $self->{'current_mfn'};
335
336 if ($mfn > $self->{'max_mfn'}) {
337 $self->{'current_mfn'} = $self->{'max_mfn'};
338 $log->debug("at EOF");
339 return;
340 }
341
342 $self->progress_bar($mfn,$self->{'max_mfn'});
343
344 if ($self->{'low_mem'}) {
345 return $self->{'db'}->get($mfn);
346 } else {
347 return $self->{'data'}->{$mfn};
348 }
349 }
350
351 =head2 mfn
352
353 Returns current record number (MFN).
354
355 print $webpac->mfn;
356
357 =cut
358
359 sub mfn {
360 my $self = shift;
361 return $self->{'current_mfn'};
362 }
363
364 =head2 progress_bar
365
366 Draw progress bar on STDERR.
367
368 $webpac->progress_bar($current, $max);
369
370 =cut
371
372 sub progress_bar {
373 my $self = shift;
374
375 my ($curr,$max) = @_;
376
377 my $log = $self->_get_logger();
378
379 $log->logconfess("no current value!") if (! $curr);
380 $log->logconfess("no maximum value!") if (! $max);
381
382 if ($curr > $max) {
383 $max = $curr;
384 $log->debug("overflow to $curr");
385 }
386
387 $self->{'last_pcnt'} ||= 1;
388
389 my $p = int($curr * 100 / $max) || 1;
390
391 # reset on re-run
392 if ($p < $self->{'last_pcnt'}) {
393 $self->{'last_pcnt'} = $p;
394 $self->{'start_t'} = time();
395 }
396
397 if ($p != $self->{'last_pcnt'}) {
398
399 my $t = time();
400 my $rate = ($curr / ($t - $self->{'start_t'} || 1));
401 my $eta = ($max-$curr) / ($rate || 1);
402 printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
403 $self->{'last_pcnt'} = $p;
404 $self->{'last_curr'} = $curr;
405 }
406 print STDERR "\n" if ($p == 100);
407 }
408
409 =head2 fmt_time
410
411 Format time (in seconds) for display.
412
413 print $webpac->fmt_time(time());
414
415 This method is called by L<progress_bar> to display remaining time.
416
417 =cut
418
419 sub fmt_time {
420 my $self = shift;
421
422 my $t = shift || 0;
423 my $out = "";
424
425 my ($ss,$mm,$hh) = gmtime($t);
426 $out .= "${hh}h" if ($hh);
427 $out .= sprintf("%02d:%02d", $mm,$ss);
428 $out .= " " if ($hh == 0);
429 return $out;
430 }
431
432 =head2 open_import_xml
433
434 Read file from C<import_xml/> directory and parse it.
435
436 $webpac->open_import_xml(type => 'isis');
437
438 =cut
439
440 sub open_import_xml {
441 my $self = shift;
442
443 my $log = $self->_get_logger();
444
445 my $arg = {@_};
446 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
447
448 $self->{'type'} = $arg->{'type'};
449
450 my $type_base = $arg->{'type'};
451 $type_base =~ s/_.*$//g;
452
453 $self->{'tag'} = $type2tag{$type_base};
454
455 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
456
457 my $f = "./import_xml/".$self->{'type'}.".xml";
458 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
459
460 $log->info("reading '$f'");
461
462 $self->{'import_xml_file'} = $f;
463
464 $self->{'import_xml'} = XMLin($f,
465 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
466 );
467
468 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
469
470 }
471
472 =head2 create_lookup
473
474 Create lookup from record using lookup definition.
475
476 $self->create_lookup($rec, @lookups);
477
478 Called internally by C<open_*> methods.
479
480 =cut
481
482 sub create_lookup {
483 my $self = shift;
484
485 my $log = $self->_get_logger();
486
487 my $rec = shift || $log->logconfess("need record to create lookup");
488 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
489
490 foreach my $i (@_) {
491 $log->logconfess("need key") unless defined($i->{'key'});
492 $log->logconfess("need val") unless defined($i->{'val'});
493
494 if (defined($i->{'eval'})) {
495 # eval first, so we can skip fill_in for key and val
496 my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
497 if ($self->_eval($eval)) {
498 my $key = $self->fill_in($rec,$i->{'key'}) || next;
499 my @val = $self->fill_in($rec,$i->{'val'}) || next;
500 $log->debug("stored $key = ",sub { join(" | ",@val) });
501 push @{$self->{'lookup'}->{$key}}, @val;
502 }
503 } else {
504 my $key = $self->fill_in($rec,$i->{'key'}) || next;
505 my @val = $self->fill_in($rec,$i->{'val'}) || next;
506 $log->debug("stored $key = ",sub { join(" | ",@val) });
507 push @{$self->{'lookup'}->{$key}}, @val;
508 }
509 }
510 }
511
512 =head2 get_data
513
514 Returns value from record.
515
516 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
517
518 Arguments are:
519 record reference C<$rec>,
520 field C<$f>,
521 optional subfiled C<$sf>,
522 index for repeatable values C<$i>.
523
524 Optinal variable C<$found> will be incremeted if there
525 is field.
526
527 Returns value or empty string.
528
529 =cut
530
531 sub get_data {
532 my $self = shift;
533
534 my ($rec,$f,$sf,$i,$found) = @_;
535
536 if ($$rec->{$f}) {
537 return '' if (! $$rec->{$f}->[$i]);
538 no strict 'refs';
539 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
540 $$found++ if (defined($$found));
541 return $$rec->{$f}->[$i]->{$sf};
542 } elsif ($$rec->{$f}->[$i]) {
543 $$found++ if (defined($$found));
544 # it still might have subfield, just
545 # not specified, so we'll dump all
546 if ($$rec->{$f}->[$i] =~ /HASH/o) {
547 my $out;
548 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
549 $out .= $$rec->{$f}->[$i]->{$k}." ";
550 }
551 return $out;
552 } else {
553 return $$rec->{$f}->[$i];
554 }
555 }
556 } else {
557 return '';
558 }
559 }
560
561 =head2 fill_in
562
563 Workhourse of all: takes record from in-memory structure of database and
564 strings with placeholders and returns string or array of with substituted
565 values from record.
566
567 my $text = $webpac->fill_in($rec,'v250^a');
568
569 Optional argument is ordinal number for repeatable fields. By default,
570 it's assume to be first repeatable field (fields are perl array, so first
571 element is 0).
572 Following example will read second value from repeatable field.
573
574 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
575
576 This function B<does not> perform parsing of format to inteligenty skip
577 delimiters before fields which aren't used.
578
579 This method will automatically decode UTF-8 string to local code page
580 if needed.
581
582 =cut
583
584 sub fill_in {
585 my $self = shift;
586
587 my $log = $self->_get_logger();
588
589 my $rec = shift || $log->logconfess("need data record");
590 my $format = shift || $log->logconfess("need format to parse");
591 # iteration (for repeatable fields)
592 my $i = shift || 0;
593
594 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
595
596 # FIXME remove for speedup?
597 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
598
599 if (utf8::is_utf8($format)) {
600 $format = $self->_x($format);
601 }
602
603 my $found = 0;
604
605 my $eval_code;
606 # remove eval{...} from beginning
607 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
608
609 my $filter_name;
610 # remove filter{...} from beginning
611 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
612
613 # do actual replacement of placeholders
614 # repeatable fields
615 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
616 # non-repeatable fields
617 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
618
619 if ($found) {
620 $log->debug("format: $format");
621 if ($eval_code) {
622 my $eval = $self->fill_in($rec,$eval_code,$i);
623 return if (! $self->_eval($eval));
624 }
625 if ($filter_name && $self->{'filter'}->{$filter_name}) {
626 $log->debug("filter '$filter_name' for $format");
627 $format = $self->{'filter'}->{$filter_name}->($format);
628 return unless(defined($format));
629 $log->debug("filter result: $format");
630 }
631 # do we have lookups?
632 if ($format =~ /$LOOKUP_REGEX/o) {
633 $log->debug("format '$format' has lookup");
634 return $self->lookup($format);
635 } else {
636 return $format;
637 }
638 } else {
639 return;
640 }
641 }
642
643 =head2 lookup
644
645 Perform lookups on format supplied to it.
646
647 my $text = $self->lookup('[v900]');
648
649 Lookups can be nested (like C<[d:[a:[v900]]]>).
650
651 =cut
652
653 sub lookup {
654 my $self = shift;
655
656 my $log = $self->_get_logger();
657
658 my $tmp = shift || $log->logconfess("need format");
659
660 if ($tmp =~ /$LOOKUP_REGEX/o) {
661 my @in = ( $tmp );
662
663 $log->debug("lookup for: ",$tmp);
664
665 my @out;
666 while (my $f = shift @in) {
667 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
668 my $k = $1;
669 if ($self->{'lookup'}->{$k}) {
670 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
671 my $tmp2 = $f;
672 $tmp2 =~ s/lookup{$k}/$nv/g;
673 push @in, $tmp2;
674 }
675 } else {
676 undef $f;
677 }
678 } elsif ($f) {
679 push @out, $f;
680 }
681 }
682 $log->logconfess("return is array and it's not expected!") unless wantarray;
683 return @out;
684 } else {
685 return $tmp;
686 }
687 }
688
689 =head2 parse
690
691 Perform smart parsing of string, skipping delimiters for fields which aren't
692 defined. It can also eval code in format starting with C<eval{...}> and
693 return output or nothing depending on eval code.
694
695 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
696
697 =cut
698
699 sub parse {
700 my $self = shift;
701
702 my ($rec, $format_utf8, $i) = @_;
703
704 return if (! $format_utf8);
705
706 my $log = $self->_get_logger();
707
708 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
709 $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
710
711 $i = 0 if (! $i);
712
713 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
714
715 my @out;
716
717 $log->debug("format: $format");
718
719 my $eval_code;
720 # remove eval{...} from beginning
721 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
722
723 my $filter_name;
724 # remove filter{...} from beginning
725 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
726
727 my $prefix;
728 my $all_found=0;
729
730 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
731
732 my $del = $1 || '';
733 $prefix ||= $del if ($all_found == 0);
734
735 # repeatable index
736 my $r = $i;
737 $r = 0 if (lc("$2") eq 's');
738
739 my $found = 0;
740 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
741
742 if ($found) {
743 push @out, $del;
744 push @out, $tmp;
745 $all_found += $found;
746 }
747 }
748
749 return if (! $all_found);
750
751 my $out = join('',@out);
752
753 if ($out) {
754 # add rest of format (suffix)
755 $out .= $format;
756
757 # add prefix if not there
758 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
759
760 $log->debug("result: $out");
761 }
762
763 if ($eval_code) {
764 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
765 $log->debug("about to eval{$eval} format: $out");
766 return if (! $self->_eval($eval));
767 }
768
769 if ($filter_name && $self->{'filter'}->{$filter_name}) {
770 $log->debug("about to filter{$filter_name} format: $out");
771 $out = $self->{'filter'}->{$filter_name}->($out);
772 return unless(defined($out));
773 $log->debug("filter result: $out");
774 }
775
776 return $out;
777 }
778
779 =head2 parse_to_arr
780
781 Similar to C<parse>, but returns array of all repeatable fields
782
783 my @arr = $webpac->parse_to_arr($rec,'v250^a');
784
785 =cut
786
787 sub parse_to_arr {
788 my $self = shift;
789
790 my ($rec, $format_utf8) = @_;
791
792 my $log = $self->_get_logger();
793
794 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
795 return if (! $format_utf8);
796
797 my $i = 0;
798 my @arr;
799
800 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
801 push @arr, $v;
802 }
803
804 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
805
806 return @arr;
807 }
808
809 =head2 fill_in_to_arr
810
811 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
812 for fields which have lookups, so they shouldn't be parsed but rather
813 C<fill_id>ed.
814
815 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
816
817 =cut
818
819 sub fill_in_to_arr {
820 my $self = shift;
821
822 my ($rec, $format_utf8) = @_;
823
824 my $log = $self->_get_logger();
825
826 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
827 return if (! $format_utf8);
828
829 my $i = 0;
830 my @arr;
831
832 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
833 push @arr, @v;
834 }
835
836 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
837
838 return @arr;
839 }
840
841 =head2 sort_arr
842
843 Sort array ignoring case and html in data
844
845 my @sorted = $webpac->sort_arr(@unsorted);
846
847 =cut
848
849 sub sort_arr {
850 my $self = shift;
851
852 my $log = $self->_get_logger();
853
854 # FIXME add Schwartzian Transformation?
855
856 my @sorted = sort {
857 $a =~ s#<[^>]+/*>##;
858 $b =~ s#<[^>]+/*>##;
859 lc($b) cmp lc($a)
860 } @_;
861 $log->debug("sorted values: ",sub { join(", ",@sorted) });
862
863 return @sorted;
864 }
865
866
867 =head2 data_structure
868
869 Create in-memory data structure which represents layout from C<import_xml>.
870 It is used later to produce output.
871
872 my @ds = $webpac->data_structure($rec);
873
874 This method will also set C<$webpac->{'currnet_filename'}> if there is
875 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
876 <headline> tag.
877
878 =cut
879
880 sub data_structure {
881 my $self = shift;
882
883 my $log = $self->_get_logger();
884
885 my $rec = shift;
886 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
887
888 undef $self->{'currnet_filename'};
889 undef $self->{'headline'};
890
891 my @sorted_tags;
892 if ($self->{tags_by_order}) {
893 @sorted_tags = @{$self->{tags_by_order}};
894 } else {
895 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
896 $self->{tags_by_order} = \@sorted_tags;
897 }
898
899 my @ds;
900
901 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
902
903 foreach my $field (@sorted_tags) {
904
905 my $row;
906
907 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
908
909 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
910 my $format = $tag->{'value'} || $tag->{'content'};
911
912 $log->debug("format: $format");
913
914 my @v;
915 if ($format =~ /$LOOKUP_REGEX/o) {
916 @v = $self->fill_in_to_arr($rec,$format);
917 } else {
918 @v = $self->parse_to_arr($rec,$format);
919 }
920 next if (! @v);
921
922 if ($tag->{'sort'}) {
923 @v = $self->sort_arr(@v);
924 }
925
926 # use format?
927 if ($tag->{'format_name'}) {
928 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
929 }
930
931 if ($field eq 'filename') {
932 $self->{'current_filename'} = join('',@v);
933 $log->debug("filename: ",$self->{'current_filename'});
934 } elsif ($field eq 'headline') {
935 $self->{'headline'} .= join('',@v);
936 $log->debug("headline: ",$self->{'headline'});
937 next; # don't return headline in data_structure!
938 }
939
940 # delimiter will join repeatable fields
941 if ($tag->{'delimiter'}) {
942 @v = ( join($tag->{'delimiter'}, @v) );
943 }
944
945 # default types
946 my @types = qw(display swish);
947 # override by type attribute
948 @types = ( $tag->{'type'} ) if ($tag->{'type'});
949
950 foreach my $type (@types) {
951 # append to previous line?
952 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
953 if ($tag->{'append'}) {
954
955 # I will delimit appended part with
956 # delimiter (or ,)
957 my $d = $tag->{'delimiter'};
958 # default delimiter
959 $d ||= " ";
960
961 my $last = pop @{$row->{$type}};
962 $d = "" if (! $last);
963 $last .= $d . join($d, @v);
964 push @{$row->{$type}}, $last;
965
966 } else {
967 push @{$row->{$type}}, @v;
968 }
969 }
970
971
972 }
973
974 if ($row) {
975 $row->{'tag'} = $field;
976
977 # TODO: name_sigular, name_plural
978 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
979 $row->{'name'} = $name ? $self->_x($name) : $field;
980
981 # post-sort all values in field
982 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
983 $log->warn("sort at field tag not implemented");
984 }
985
986 push @ds, $row;
987
988 $log->debug("row $field: ",sub { Dumper($row) });
989 }
990
991 }
992
993 return @ds;
994
995 }
996
997 =head2 output
998
999 Create output from in-memory data structure using Template Toolkit template.
1000
1001 my $text = $webpac->output( template => 'text.tt', data => @ds );
1002
1003 =cut
1004
1005 sub output {
1006 my $self = shift;
1007
1008 my $args = {@_};
1009
1010 my $log = $self->_get_logger();
1011
1012 $log->logconfess("need template name") if (! $args->{'template'});
1013 $log->logconfess("need data array") if (! $args->{'data'});
1014
1015 my $out;
1016
1017 $self->{'tt'}->process(
1018 $args->{'template'},
1019 $args,
1020 \$out
1021 ) || confess $self->{'tt'}->error();
1022
1023 return $out;
1024 }
1025
1026 =head2 output_file
1027
1028 Create output from in-memory data structure using Template Toolkit template
1029 to a file.
1030
1031 $webpac->output_file(
1032 file => 'out.txt',
1033 template => 'text.tt',
1034 data => @ds
1035 );
1036
1037 =cut
1038
1039 sub output_file {
1040 my $self = shift;
1041
1042 my $args = {@_};
1043
1044 my $log = $self->_get_logger();
1045
1046 my $file = $args->{'file'} || $log->logconfess("need file name");
1047
1048 $log->debug("creating file ",$file);
1049
1050 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1051 print $fh $self->output(
1052 template => $args->{'template'},
1053 data => $args->{'data'},
1054 ) || $log->logdie("print: $!");
1055 close($fh) || $log->logdie("close: $!");
1056 }
1057
1058 =head2 apply_format
1059
1060 Apply format specified in tag with C<format_name="name"> and
1061 C<format_delimiter=";;">.
1062
1063 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1064
1065 Formats can contain C<lookup{...}> if you need them.
1066
1067 =cut
1068
1069 sub apply_format {
1070 my $self = shift;
1071
1072 my ($name,$delimiter,$data) = @_;
1073
1074 my $log = $self->_get_logger();
1075
1076 if (! $self->{'import_xml'}->{'format'}->{$name}) {
1077 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1078 return $data;
1079 }
1080
1081 $log->warn("no delimiter for format $name") if (! $delimiter);
1082
1083 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1084
1085 my @data = split(/\Q$delimiter\E/, $data);
1086
1087 my $out = sprintf($format, @data);
1088 $log->debug("using format $name [$format] on $data to produce: $out");
1089
1090 if ($out =~ m/$LOOKUP_REGEX/o) {
1091 return $self->lookup($out);
1092 } else {
1093 return $out;
1094 }
1095
1096 }
1097
1098
1099 #
1100 #
1101 #
1102
1103 =head1 INTERNAL METHODS
1104
1105 Here is a quick list of internal methods, mostly useful to turn debugging
1106 on them (see L<LOGGING> below for explanation).
1107
1108 =cut
1109
1110 =head2 _eval
1111
1112 Internal function to eval code without C<strict 'subs'>.
1113
1114 =cut
1115
1116 sub _eval {
1117 my $self = shift;
1118
1119 my $code = shift || return;
1120
1121 my $log = $self->_get_logger();
1122
1123 no strict 'subs';
1124 my $ret = eval $code;
1125 if ($@) {
1126 $log->error("problem with eval code [$code]: $@");
1127 }
1128
1129 $log->debug("eval: ",$code," [",$ret,"]");
1130
1131 return $ret || undef;
1132 }
1133
1134 =head2 _sort_by_order
1135
1136 Sort xml tags data structure accoding to C<order=""> attribute.
1137
1138 =cut
1139
1140 sub _sort_by_order {
1141 my $self = shift;
1142
1143 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1144 $self->{'import_xml'}->{'indexer'}->{$a};
1145 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1146 $self->{'import_xml'}->{'indexer'}->{$b};
1147
1148 return $va <=> $vb;
1149 }
1150
1151 =head2 _get_logger
1152
1153 Get C<Log::Log4perl> object with a twist: domains are defined for each
1154 method
1155
1156 my $log = $webpac->_get_logger();
1157
1158 =cut
1159
1160 sub _get_logger {
1161 my $self = shift;
1162
1163 my $name = (caller(1))[3] || caller;
1164 return get_logger($name);
1165 }
1166
1167 =head2 _x
1168
1169 Convert string from UTF-8 to code page defined in C<import_xml>.
1170
1171 my $text = $webpac->_x('utf8 text');
1172
1173 =cut
1174
1175 sub _x {
1176 my $self = shift;
1177 my $utf8 = shift || return;
1178
1179 return $self->{'utf2cp'}->convert($utf8) ||
1180 $self->_get_logger()->logwarn("can't convert '$utf8'");
1181 }
1182
1183 #
1184 #
1185 #
1186
1187 =head1 LOGGING
1188
1189 Logging in WebPAC is performed by L<Log::Log4perl> with config file
1190 C<log.conf>.
1191
1192 Methods defined above have different levels of logging, so
1193 it's descriptions will be useful to turn (mostry B<debug> logging) on
1194 or off to see why WabPAC isn't perforing as you expect it (it might even
1195 be a bug!).
1196
1197 B<This is different from normal Log4perl behaviour>. To repeat, you can
1198 also use method names, and not only classes (which are just few)
1199 to filter logging.
1200
1201
1202 =head1 MEMORY USAGE
1203
1204 C<low_mem> options is double-edged sword. If enabled, WebPAC
1205 will run on memory constraint machines (which doesn't have enough
1206 physical RAM to create memory structure for whole source database).
1207
1208 If your machine has 512Mb or more of RAM and database is around 10000 records,
1209 memory shouldn't be an issue. If you don't have enough physical RAM, you
1210 might consider using virtual memory (if your operating system is handling it
1211 well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1212 parsed structure of ISIS database (this is what C<low_mem> option does).
1213
1214 Hitting swap at end of reading source database is probably o.k. However,
1215 hitting swap before 90% will dramatically decrease performance and you will
1216 be better off with C<low_mem> and using rest of availble memory for
1217 operating system disk cache (Linux is particuallary good about this).
1218 However, every access to database record will require disk access, so
1219 generation phase will be slower 10-100 times.
1220
1221 Parsed structures are essential - you just have option to trade RAM memory
1222 (which is fast) for disk space (which is slow). Be sure to have planty of
1223 disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1224
1225 However, when WebPAC is running on desktop machines (or laptops :-), it's
1226 highly undesireable for system to start swapping. Using C<low_mem> option can
1227 reduce WecPAC memory usage to around 64Mb for same database with lookup
1228 fields and sorted indexes which stay in RAM. Performance will suffer, but
1229 memory usage will really be minimal. It might be also more confortable to
1230 run WebPAC reniced on those machines.
1231
1232 =cut
1233
1234 1;

  ViewVC Help
Powered by ViewVC 1.1.26