/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 555 - (show annotations)
Fri Oct 29 22:09:04 2004 UTC (19 years, 5 months ago) by dpavlin
File size: 27613 byte(s)
Special suport for non-repeatable fileds:

Now you can specify s900^a instead of v900^a to get one and singe value.
Have in mind that if you DON'T have any repeatable value in this filed, you
WILL create infinite loop. So, use some repeatable value.

This is however, very useful for situation where you want to reference one
non-repeatable value from field which is repeatable.

There is experimental code that checks for infitite loops, but it's not
working right (and I'm too tired to fix this now).


1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10 use Template;
11 use Log::Log4perl qw(get_logger :levels);
12 use Time::HiRes qw(time);
13
14 use Data::Dumper;
15
16 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17 #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18 my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19 my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20
21 =head1 NAME
22
23 WebPAC - base class for WebPAC
24
25 =head1 DESCRIPTION
26
27 This module implements methods used by WebPAC.
28
29 =head1 METHODS
30
31 =head2 new
32
33 Create new instance of WebPAC using configuration specified by C<config_file>.
34
35 my $webpac = new WebPAC(
36 config_file => 'name.conf',
37 code_page => 'ISO-8859-2',
38 low_mem => 1,
39 filter => {
40 'lower' => sub { lc($_[0]) },
41 },
42 );
43
44 Default C<code_page> is C<ISO-8859-2>.
45
46 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
47
48 There is optinal parametar C<filter> which specify different filters which
49 can be applied using C<filter{name}> notation.
50
51 This method will also read configuration files
52 C<global.conf> (used by indexer and Web font-end)
53 and configuration file specified by C<config_file>
54 which describes databases to be indexed.
55
56 =cut
57
58 # mapping between data type and tag which specify
59 # format in XML file
60 my %type2tag = (
61 'isis' => 'isis',
62 # 'excel' => 'column',
63 # 'marc' => 'marc',
64 # 'feed' => 'feed'
65 );
66
67 sub new {
68 my $class = shift;
69 my $self = {@_};
70 bless($self, $class);
71
72 $self->{'start_t'} = time();
73
74 my $log_file = $self->{'log'} || "log.conf";
75 Log::Log4perl->init($log_file);
76
77 my $log = $self->_get_logger();
78
79 # fill in default values
80 # output codepage
81 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
82
83 #
84 # read global.conf
85 #
86 $log->debug("read 'global.conf'");
87
88 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
89
90 # read global config parametars
91 foreach my $var (qw(
92 dbi_dbd
93 dbi_dsn
94 dbi_user
95 dbi_passwd
96 show_progress
97 my_unac_filter
98 output_template
99 )) {
100 $self->{'global_config'}->{$var} = $config->val('global', $var);
101 }
102
103 #
104 # read indexer config file
105 #
106
107 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
108
109 # create UTF-8 convertor for import_xml files
110 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
111
112 # create Template toolkit instance
113 $self->{'tt'} = Template->new(
114 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
115 # FILTERS => {
116 # 'foo' => \&foo_filter,
117 # },
118 EVAL_PERL => 1,
119 );
120
121 # running with low_mem flag? well, use DBM::Deep then.
122 if ($self->{'low_mem'}) {
123 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
124
125 my $db_file = "data.db";
126
127 if (-e $db_file) {
128 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
129 $log->debug("removed '$db_file' from last run");
130 }
131
132 require DBM::Deep;
133
134 my $db = new DBM::Deep $db_file;
135
136 $log->logdie("DBM::Deep error: $!") unless ($db);
137
138 if ($db->error()) {
139 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
140 } else {
141 $log->debug("using file '$db_file' for DBM::Deep");
142 }
143
144 $self->{'db'} = $db;
145 }
146
147 $log->debug("filters defined: ",Dumper($self->{'filter'}));
148
149 return $self;
150 }
151
152 =head2 open_isis
153
154 Open CDS/ISIS database using OpenIsis module and read all records to memory.
155
156 $webpac->open_isis(
157 filename => '/data/ISIS/ISIS',
158 code_page => '852',
159 limit_mfn => 500,
160 start_mfn => 6000,
161 lookup => [ ... ],
162 );
163
164 By default, ISIS code page is assumed to be C<852>.
165
166 If optional parametar C<start_mfn> is set, this will be first MFN to read
167 from database (so you can skip beginning of your database if you need to).
168
169 If optional parametar C<limit_mfn> is set, it will read just 500 records
170 from database in example above.
171
172 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
173 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
174 value in index.
175
176 lookup => [
177 { 'key' => 'd:v900', 'val' => 'v250^a' },
178 { 'eval' => '"v901^a" eq "Podruèje"',
179 'key' => 'pa:v561^4:v562^4:v461^1',
180 'val' => 'v900' },
181 ]
182
183 Returns number of last record read into memory (size of database, really).
184
185 =cut
186
187 sub open_isis {
188 my $self = shift;
189 my $arg = {@_};
190
191 my $log = $self->_get_logger();
192
193 $log->logcroak("need filename") if (! $arg->{'filename'});
194 my $code_page = $arg->{'code_page'} || '852';
195
196 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
197
198 # store data in object
199 $self->{'isis_filename'} = $arg->{'filename'};
200 $self->{'isis_code_page'} = $code_page;
201
202 use OpenIsis;
203
204 #$self->{'isis_code_page'} = $code_page;
205
206 # create Text::Iconv object
207 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
208
209 $log->info("reading ISIS database '",$arg->{'filename'},"'");
210 $log->debug("isis code page: $code_page");
211
212 my $isis_db = OpenIsis::open($arg->{'filename'});
213
214 my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
215 my $startmfn = 1;
216
217 if (my $s = $self->{'start_mfn'}) {
218 $log->info("skipping to MFN $s");
219 $startmfn = $s;
220 } else {
221 $self->{'start_mfn'} = $startmfn;
222 }
223
224 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
225
226 $log->info("processing ",($maxmfn-$startmfn)." records...");
227
228 # read database
229 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
230
231
232 $log->debug("mfn: $mfn\n");
233
234 my $rec;
235
236 # read record
237 my $row = OpenIsis::read( $isis_db, $mfn );
238 foreach my $k (keys %{$row}) {
239 if ($k ne "mfn") {
240 foreach my $l (@{$row->{$k}}) {
241 $l = $cp->convert($l);
242 # has subfields?
243 my $val;
244 if ($l =~ m/\^/) {
245 foreach my $t (split(/\^/,$l)) {
246 next if (! $t);
247 $val->{substr($t,0,1)} = substr($t,1);
248 }
249 } else {
250 $val = $l;
251 }
252
253 push @{$rec->{$k}}, $val;
254 }
255 } else {
256 push @{$rec->{'000'}}, $mfn;
257 }
258
259 }
260
261 $log->confess("record $mfn empty?") unless ($rec);
262
263 # store
264 if ($self->{'low_mem'}) {
265 $self->{'db'}->put($mfn, $rec);
266 } else {
267 $self->{'data'}->{$mfn} = $rec;
268 }
269
270 # create lookup
271 $self->create_lookup($rec, @{$arg->{'lookup'}});
272
273 $self->progress_bar($mfn,$maxmfn);
274
275 }
276
277 $self->{'current_mfn'} = -1;
278 $self->{'last_pcnt'} = 0;
279
280 $log->debug("max mfn: $maxmfn");
281
282 # store max mfn and return it.
283 return $self->{'max_mfn'} = $maxmfn;
284 }
285
286 =head2 fetch_rec
287
288 Fetch next record from database. It will also display progress bar (once
289 it's implemented, that is).
290
291 my $rec = $webpac->fetch_rec;
292
293 =cut
294
295 sub fetch_rec {
296 my $self = shift;
297
298 my $log = $self->_get_logger();
299
300 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
301
302 if ($self->{'current_mfn'} == -1) {
303 $self->{'current_mfn'} = $self->{'start_mfn'};
304 } else {
305 $self->{'current_mfn'}++;
306 }
307
308 my $mfn = $self->{'current_mfn'};
309
310 if ($mfn > $self->{'max_mfn'}) {
311 $self->{'current_mfn'} = $self->{'max_mfn'};
312 $log->debug("at EOF");
313 return;
314 }
315
316 $self->progress_bar($mfn,$self->{'max_mfn'});
317
318 if ($self->{'low_mem'}) {
319 return $self->{'db'}->get($mfn);
320 } else {
321 return $self->{'data'}->{$mfn};
322 }
323 }
324
325 =head2 mfn
326
327 Returns current record number (MFN).
328
329 print $webpac->mfn;
330
331 =cut
332
333 sub mfn {
334 my $self = shift;
335 return $self->{'current_mfn'};
336 }
337
338 =head2 progress_bar
339
340 Draw progress bar on STDERR.
341
342 $webpac->progress_bar($current, $max);
343
344 =cut
345
346 sub progress_bar {
347 my $self = shift;
348
349 my ($curr,$max) = @_;
350
351 my $log = $self->_get_logger();
352
353 $log->logconfess("no current value!") if (! $curr);
354 $log->logconfess("no maximum value!") if (! $max);
355
356 if ($curr > $max) {
357 $max = $curr;
358 $log->debug("overflow to $curr");
359 }
360
361 $self->{'last_pcnt'} ||= 1;
362
363 my $p = int($curr * 100 / $max) || 1;
364
365 # reset on re-run
366 if ($p < $self->{'last_pcnt'}) {
367 $self->{'last_pcnt'} = $p;
368 $self->{'last_t'} = time();
369 $self->{'last_curr'} = undef;
370 }
371
372 $self->{'last_t'} ||= time();
373
374 if ($p != $self->{'last_pcnt'}) {
375
376 my $last_curr = $self->{'last_curr'} || $curr;
377 my $t = time();
378 my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
379 my $eta = ($max-$curr) / ($rate || 1);
380 printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
381 $self->{'last_pcnt'} = $p;
382 $self->{'last_t'} = time();
383 $self->{'last_curr'} = $curr;
384 }
385 print STDERR "\n" if ($p == 100);
386 }
387
388 =head2 fmt_time
389
390 Format time (in seconds) for display.
391
392 print $webpac->fmt_time(time());
393
394 This method is called by L<progress_bar> to display remaining time.
395
396 =cut
397
398 sub fmt_time {
399 my $self = shift;
400
401 my $t = shift || 0;
402 my $out = "";
403
404 my ($ss,$mm,$hh) = gmtime($t);
405 $out .= "${hh}h" if ($hh);
406 $out .= sprintf("%02d:%02d", $mm,$ss);
407 $out .= " " if ($hh == 0);
408 return $out;
409 }
410
411 =head2 open_import_xml
412
413 Read file from C<import_xml/> directory and parse it.
414
415 $webpac->open_import_xml(type => 'isis');
416
417 =cut
418
419 sub open_import_xml {
420 my $self = shift;
421
422 my $log = $self->_get_logger();
423
424 my $arg = {@_};
425 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
426
427 $self->{'type'} = $arg->{'type'};
428
429 my $type_base = $arg->{'type'};
430 $type_base =~ s/_.*$//g;
431
432 $self->{'tag'} = $type2tag{$type_base};
433
434 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
435
436 my $f = "./import_xml/".$self->{'type'}.".xml";
437 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
438
439 $log->info("reading '$f'");
440
441 $self->{'import_xml_file'} = $f;
442
443 $self->{'import_xml'} = XMLin($f,
444 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
445 );
446
447 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
448
449 }
450
451 =head2 create_lookup
452
453 Create lookup from record using lookup definition.
454
455 $self->create_lookup($rec, @lookups);
456
457 Called internally by C<open_*> methods.
458
459 =cut
460
461 sub create_lookup {
462 my $self = shift;
463
464 my $log = $self->_get_logger();
465
466 my $rec = shift || $log->logconfess("need record to create lookup");
467 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
468
469 foreach my $i (@_) {
470 $log->logconfess("need key") unless defined($i->{'key'});
471 $log->logconfess("need val") unless defined($i->{'val'});
472
473 if (defined($i->{'eval'})) {
474 # eval first, so we can skip fill_in for key and val
475 my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
476 if ($self->_eval($eval)) {
477 my $key = $self->fill_in($rec,$i->{'key'}) || next;
478 my @val = $self->fill_in($rec,$i->{'val'}) || next;
479 $log->debug("stored $key = ",sub { join(" | ",@val) });
480 push @{$self->{'lookup'}->{$key}}, @val;
481 }
482 } else {
483 my $key = $self->fill_in($rec,$i->{'key'}) || next;
484 my @val = $self->fill_in($rec,$i->{'val'}) || next;
485 $log->debug("stored $key = ",sub { join(" | ",@val) });
486 push @{$self->{'lookup'}->{$key}}, @val;
487 }
488 }
489 }
490
491 =head2 get_data
492
493 Returns value from record.
494
495 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
496
497 Arguments are:
498 record reference C<$rec>,
499 field C<$f>,
500 optional subfiled C<$sf>,
501 index for repeatable values C<$i>.
502
503 Optinal variable C<$found> will be incremeted if there
504 is field.
505
506 Returns value or empty string.
507
508 =cut
509
510 sub get_data {
511 my $self = shift;
512
513 my ($rec,$f,$sf,$i,$found) = @_;
514
515 if ($$rec->{$f}) {
516 return '' if (! $$rec->{$f}->[$i]);
517 no strict 'refs';
518 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
519 $$found++ if (defined($$found));
520 return $$rec->{$f}->[$i]->{$sf};
521 } elsif ($$rec->{$f}->[$i]) {
522 $$found++ if (defined($$found));
523 # it still might have subfield, just
524 # not specified, so we'll dump all
525 if ($$rec->{$f}->[$i] =~ /HASH/o) {
526 my $out;
527 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
528 $out .= $$rec->{$f}->[$i]->{$k}." ";
529 }
530 return $out;
531 } else {
532 return $$rec->{$f}->[$i];
533 }
534 }
535 } else {
536 return '';
537 }
538 }
539
540 =head2 fill_in
541
542 Workhourse of all: takes record from in-memory structure of database and
543 strings with placeholders and returns string or array of with substituted
544 values from record.
545
546 my $text = $webpac->fill_in($rec,'v250^a');
547
548 Optional argument is ordinal number for repeatable fields. By default,
549 it's assume to be first repeatable field (fields are perl array, so first
550 element is 0).
551 Following example will read second value from repeatable field.
552
553 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
554
555 This function B<does not> perform parsing of format to inteligenty skip
556 delimiters before fields which aren't used.
557
558 This method will automatically decode UTF-8 string to local code page
559 if needed.
560
561 =cut
562
563 sub fill_in {
564 my $self = shift;
565
566 my $log = $self->_get_logger();
567
568 my $rec = shift || $log->logconfess("need data record");
569 my $format = shift || $log->logconfess("need format to parse");
570 # iteration (for repeatable fields)
571 my $i = shift || 0;
572
573 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
574
575 # FIXME remove for speedup?
576 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
577
578 if (utf8::is_utf8($format)) {
579 $format = $self->_x($format);
580 }
581
582 my $found = 0;
583
584 my $eval_code;
585 # remove eval{...} from beginning
586 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
587
588 my $filter_name;
589 # remove filter{...} from beginning
590 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
591
592 # do actual replacement of placeholders
593 # repeatable fields
594 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
595 # non-repeatable fields
596 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
597
598 if ($found) {
599 $log->debug("format: $format");
600 if ($eval_code) {
601 my $eval = $self->fill_in($rec,$eval_code,$i);
602 return if (! $self->_eval($eval));
603 }
604 if ($filter_name && $self->{'filter'}->{$filter_name}) {
605 $log->debug("filter '$filter_name' for $format");
606 $format = $self->{'filter'}->{$filter_name}->($format);
607 return unless(defined($format));
608 $log->debug("filter result: $format");
609 }
610 # do we have lookups?
611 if ($format =~ /$LOOKUP_REGEX/o) {
612 $log->debug("format '$format' has lookup");
613 return $self->lookup($format);
614 } else {
615 return $format;
616 }
617 } else {
618 return;
619 }
620 }
621
622 =head2 lookup
623
624 Perform lookups on format supplied to it.
625
626 my $text = $self->lookup('[v900]');
627
628 Lookups can be nested (like C<[d:[a:[v900]]]>).
629
630 =cut
631
632 sub lookup {
633 my $self = shift;
634
635 my $log = $self->_get_logger();
636
637 my $tmp = shift || $log->logconfess("need format");
638
639 if ($tmp =~ /$LOOKUP_REGEX/o) {
640 my @in = ( $tmp );
641
642 $log->debug("lookup for: ",$tmp);
643
644 my @out;
645 while (my $f = shift @in) {
646 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
647 my $k = $1;
648 if ($self->{'lookup'}->{$k}) {
649 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
650 my $tmp2 = $f;
651 $tmp2 =~ s/lookup{$k}/$nv/g;
652 push @in, $tmp2;
653 }
654 } else {
655 undef $f;
656 }
657 } elsif ($f) {
658 push @out, $f;
659 }
660 }
661 $log->logconfess("return is array and it's not expected!") unless wantarray;
662 return @out;
663 } else {
664 return $tmp;
665 }
666 }
667
668 =head2 parse
669
670 Perform smart parsing of string, skipping delimiters for fields which aren't
671 defined. It can also eval code in format starting with C<eval{...}> and
672 return output or nothing depending on eval code.
673
674 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
675
676 =cut
677
678 sub parse {
679 my $self = shift;
680
681 my ($rec, $format_utf8, $i) = @_;
682
683 return if (! $format_utf8);
684
685 my $log = $self->_get_logger();
686
687 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
688 $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
689
690 $i = 0 if (! $i);
691
692 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
693
694 my @out;
695
696 $log->debug("format: $format");
697
698 my $eval_code;
699 # remove eval{...} from beginning
700 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
701
702 my $filter_name;
703 # remove filter{...} from beginning
704 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
705
706 my $prefix;
707 my $all_found=0;
708
709 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
710
711 my $del = $1 || '';
712 $prefix ||= $del if ($all_found == 0);
713
714 # repeatable index
715 my $r = $i;
716 $r = 0 if (lc("$2") eq 's');
717
718 my $found = 0;
719 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
720
721 if ($found) {
722 push @out, $del;
723 push @out, $tmp;
724 $all_found += $found;
725 }
726 }
727
728 return if (! $all_found);
729
730 my $out = join('',@out);
731
732 if ($out) {
733 # add rest of format (suffix)
734 $out .= $format;
735
736 # add prefix if not there
737 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
738
739 $log->debug("result: $out");
740 }
741
742 if ($eval_code) {
743 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
744 $log->debug("about to eval{$eval} format: $out");
745 return if (! $self->_eval($eval));
746 }
747
748 if ($filter_name && $self->{'filter'}->{$filter_name}) {
749 $log->debug("about to filter{$filter_name} format: $out");
750 $out = $self->{'filter'}->{$filter_name}->($out);
751 return unless(defined($out));
752 $log->debug("filter result: $out");
753 }
754
755 return $out;
756 }
757
758 =head2 parse_to_arr
759
760 Similar to C<parse>, but returns array of all repeatable fields
761
762 my @arr = $webpac->parse_to_arr($rec,'v250^a');
763
764 =cut
765
766 sub parse_to_arr {
767 my $self = shift;
768
769 my ($rec, $format_utf8) = @_;
770
771 my $log = $self->_get_logger();
772
773 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
774 return if (! $format_utf8);
775
776 my $i = 0;
777 my @arr;
778
779 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
780 push @arr, $v;
781 }
782
783 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
784
785 return @arr;
786 }
787
788 =head2 fill_in_to_arr
789
790 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
791 for fields which have lookups, so they shouldn't be parsed but rather
792 C<fill_id>ed.
793
794 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
795
796 =cut
797
798 sub fill_in_to_arr {
799 my $self = shift;
800
801 my ($rec, $format_utf8) = @_;
802
803 my $log = $self->_get_logger();
804
805 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
806 return if (! $format_utf8);
807
808 my $i = 0;
809 my @arr;
810
811 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
812 push @arr, @v;
813 }
814
815 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
816
817 return @arr;
818 }
819
820 =head2 sort_arr
821
822 Sort array ignoring case and html in data
823
824 my @sorted = $webpac->sort_arr(@unsorted);
825
826 =cut
827
828 sub sort_arr {
829 my $self = shift;
830
831 my $log = $self->_get_logger();
832
833 # FIXME add Schwartzian Transformation?
834
835 my @sorted = sort {
836 $a =~ s#<[^>]+/*>##;
837 $b =~ s#<[^>]+/*>##;
838 lc($b) cmp lc($a)
839 } @_;
840 $log->debug("sorted values: ",sub { join(", ",@sorted) });
841
842 return @sorted;
843 }
844
845
846 =head2 data_structure
847
848 Create in-memory data structure which represents layout from C<import_xml>.
849 It is used later to produce output.
850
851 my @ds = $webpac->data_structure($rec);
852
853 This method will also set C<$webpac->{'currnet_filename'}> if there is
854 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
855 <headline> tag.
856
857 =cut
858
859 sub data_structure {
860 my $self = shift;
861
862 my $log = $self->_get_logger();
863
864 my $rec = shift;
865 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
866
867 undef $self->{'currnet_filename'};
868 undef $self->{'headline'};
869
870 my @sorted_tags;
871 if ($self->{tags_by_order}) {
872 @sorted_tags = @{$self->{tags_by_order}};
873 } else {
874 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
875 $self->{tags_by_order} = \@sorted_tags;
876 }
877
878 my @ds;
879
880 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
881
882 foreach my $field (@sorted_tags) {
883
884 my $row;
885
886 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
887
888 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
889 my $format = $tag->{'value'} || $tag->{'content'};
890
891 $log->debug("format: $format");
892
893 my @v;
894 if ($format =~ /$LOOKUP_REGEX/o) {
895 @v = $self->fill_in_to_arr($rec,$format);
896 } else {
897 @v = $self->parse_to_arr($rec,$format);
898 }
899 next if (! @v);
900
901 if ($tag->{'sort'}) {
902 @v = $self->sort_arr(@v);
903 $log->warn("sort within tag is usually not what you want!");
904 }
905
906 # use format?
907 if ($tag->{'format_name'}) {
908 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
909 }
910
911 if ($field eq 'filename') {
912 $self->{'current_filename'} = join('',@v);
913 $log->debug("filename: ",$self->{'current_filename'});
914 } elsif ($field eq 'headline') {
915 $self->{'headline'} .= join('',@v);
916 $log->debug("headline: ",$self->{'headline'});
917 next; # don't return headline in data_structure!
918 }
919
920 # delimiter will join repeatable fields
921 if ($tag->{'delimiter'}) {
922 @v = ( join($tag->{'delimiter'}, @v) );
923 }
924
925 # default types
926 my @types = qw(display swish);
927 # override by type attribute
928 @types = ( $tag->{'type'} ) if ($tag->{'type'});
929
930 foreach my $type (@types) {
931 # append to previous line?
932 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
933 if ($tag->{'append'}) {
934
935 # I will delimit appended part with
936 # delimiter (or ,)
937 my $d = $tag->{'delimiter'};
938 # default delimiter
939 $d ||= " ";
940
941 my $last = pop @{$row->{$type}};
942 $d = "" if (! $last);
943 $last .= $d . join($d, @v);
944 push @{$row->{$type}}, $last;
945
946 } else {
947 push @{$row->{$type}}, @v;
948 }
949 }
950
951
952 }
953
954 if ($row) {
955 $row->{'tag'} = $field;
956
957 # TODO: name_sigular, name_plural
958 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
959 $row->{'name'} = $name ? $self->_x($name) : $field;
960
961 # post-sort all values in field
962 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
963 $log->warn("sort at field tag not implemented");
964 }
965
966 push @ds, $row;
967
968 $log->debug("row $field: ",sub { Dumper($row) });
969 }
970
971 }
972
973 return @ds;
974
975 }
976
977 =head2 output
978
979 Create output from in-memory data structure using Template Toolkit template.
980
981 my $text = $webpac->output( template => 'text.tt', data => @ds );
982
983 =cut
984
985 sub output {
986 my $self = shift;
987
988 my $args = {@_};
989
990 my $log = $self->_get_logger();
991
992 $log->logconfess("need template name") if (! $args->{'template'});
993 $log->logconfess("need data array") if (! $args->{'data'});
994
995 my $out;
996
997 $self->{'tt'}->process(
998 $args->{'template'},
999 $args,
1000 \$out
1001 ) || confess $self->{'tt'}->error();
1002
1003 return $out;
1004 }
1005
1006 =head2 output_file
1007
1008 Create output from in-memory data structure using Template Toolkit template
1009 to a file.
1010
1011 $webpac->output_file(
1012 file => 'out.txt',
1013 template => 'text.tt',
1014 data => @ds
1015 );
1016
1017 =cut
1018
1019 sub output_file {
1020 my $self = shift;
1021
1022 my $args = {@_};
1023
1024 my $log = $self->_get_logger();
1025
1026 my $file = $args->{'file'} || $log->logconfess("need file name");
1027
1028 $log->debug("creating file ",$file);
1029
1030 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1031 print $fh $self->output(
1032 template => $args->{'template'},
1033 data => $args->{'data'},
1034 ) || $log->logdie("print: $!");
1035 close($fh) || $log->logdie("close: $!");
1036 }
1037
1038 =head2 apply_format
1039
1040 Apply format specified in tag with C<format_name="name"> and
1041 C<format_delimiter=";;">.
1042
1043 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1044
1045 Formats can contain C<lookup{...}> if you need them.
1046
1047 =cut
1048
1049 sub apply_format {
1050 my $self = shift;
1051
1052 my ($name,$delimiter,$data) = @_;
1053
1054 my $log = $self->_get_logger();
1055
1056 if (! $self->{'import_xml'}->{'format'}->{$name}) {
1057 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1058 return $data;
1059 }
1060
1061 $log->warn("no delimiter for format $name") if (! $delimiter);
1062
1063 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1064
1065 my @data = split(/\Q$delimiter\E/, $data);
1066
1067 my $out = sprintf($format, @data);
1068 $log->debug("using format $name [$format] on $data to produce: $out");
1069
1070 if ($out =~ m/$LOOKUP_REGEX/o) {
1071 return $self->lookup($out);
1072 } else {
1073 return $out;
1074 }
1075
1076 }
1077
1078
1079 #
1080 #
1081 #
1082
1083 =head1 INTERNAL METHODS
1084
1085 Here is a quick list of internal methods, mostly useful to turn debugging
1086 on them (see L<LOGGING> below for explanation).
1087
1088 =cut
1089
1090 =head2 _eval
1091
1092 Internal function to eval code without C<strict 'subs'>.
1093
1094 =cut
1095
1096 sub _eval {
1097 my $self = shift;
1098
1099 my $code = shift || return;
1100
1101 my $log = $self->_get_logger();
1102
1103 no strict 'subs';
1104 my $ret = eval $code;
1105 if ($@) {
1106 $log->error("problem with eval code [$code]: $@");
1107 }
1108
1109 $log->debug("eval: ",$code," [",$ret,"]");
1110
1111 return $ret || undef;
1112 }
1113
1114 =head2 _sort_by_order
1115
1116 Sort xml tags data structure accoding to C<order=""> attribute.
1117
1118 =cut
1119
1120 sub _sort_by_order {
1121 my $self = shift;
1122
1123 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1124 $self->{'import_xml'}->{'indexer'}->{$a};
1125 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1126 $self->{'import_xml'}->{'indexer'}->{$b};
1127
1128 return $va <=> $vb;
1129 }
1130
1131 =head2 _get_logger
1132
1133 Get C<Log::Log4perl> object with a twist: domains are defined for each
1134 method
1135
1136 my $log = $webpac->_get_logger();
1137
1138 =cut
1139
1140 sub _get_logger {
1141 my $self = shift;
1142
1143 my $name = (caller(1))[3] || caller;
1144 return get_logger($name);
1145 }
1146
1147 =head2 _x
1148
1149 Convert string from UTF-8 to code page defined in C<import_xml>.
1150
1151 my $text = $webpac->_x('utf8 text');
1152
1153 =cut
1154
1155 sub _x {
1156 my $self = shift;
1157 my $utf8 = shift || return;
1158
1159 return $self->{'utf2cp'}->convert($utf8) ||
1160 $self->_get_logger()->logwarn("can't convert '$utf8'");
1161 }
1162
1163 #
1164 #
1165 #
1166
1167 =head1 LOGGING
1168
1169 Logging in WebPAC is performed by L<Log::Log4perl> with config file
1170 C<log.conf>.
1171
1172 Methods defined above have different levels of logging, so
1173 it's descriptions will be useful to turn (mostry B<debug> logging) on
1174 or off to see why WabPAC isn't perforing as you expect it (it might even
1175 be a bug!).
1176
1177 B<This is different from normal Log4perl behaviour>. To repeat, you can
1178 also use method names, and not only classes (which are just few)
1179 to filter logging.
1180
1181
1182 =head1 MEMORY USAGE
1183
1184 C<low_mem> options is double-edged sword. If enabled, WebPAC
1185 will run on memory constraint machines (which doesn't have enough
1186 physical RAM to create memory structure for whole source database).
1187
1188 If your machine has 512Mb or more of RAM and database is around 10000 records,
1189 memory shouldn't be an issue. If you don't have enough physical RAM, you
1190 might consider using virtual memory (if your operating system is handling it
1191 well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1192 parsed structure of ISIS database (this is what C<low_mem> option does).
1193
1194 Hitting swap at end of reading source database is probably o.k. However,
1195 hitting swap before 90% will dramatically decrease performance and you will
1196 be better off with C<low_mem> and using rest of availble memory for
1197 operating system disk cache (Linux is particuallary good about this).
1198 However, every access to database record will require disk access, so
1199 generation phase will be slower 10-100 times.
1200
1201 Parsed structures are essential - you just have option to trade RAM memory
1202 (which is fast) for disk space (which is slow). Be sure to have planty of
1203 disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1204
1205 However, when WebPAC is running on desktop machines (or laptops :-), it's
1206 highly undesireable for system to start swapping. Using C<low_mem> option can
1207 reduce WecPAC memory usage to around 64Mb for same database with lookup
1208 fields and sorted indexes which stay in RAM. Performance will suffer, but
1209 memory usage will really be minimal. It might be also more confortable to
1210 run WebPAC reniced on those machines.
1211
1212 =cut
1213
1214 1;

  ViewVC Help
Powered by ViewVC 1.1.26