/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 500 - (show annotations)
Sun Oct 10 11:04:52 2004 UTC (19 years, 6 months ago) by dpavlin
File size: 27321 byte(s)
implement filter{name} which can be embedded inside filed, just like
eval{...}. Code for filters is not compatibile with old implementation, but
it should be easier.
If you mix eval{} and filter{}, eval has to come first.

1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10 use Template;
11 use Log::Log4perl qw(get_logger :levels);
12 use Time::HiRes qw(time);
13
14 use Data::Dumper;
15
16 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17 #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18 my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19 my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20
21 =head1 NAME
22
23 WebPAC - base class for WebPAC
24
25 =head1 DESCRIPTION
26
27 This module implements methods used by WebPAC.
28
29 =head1 METHODS
30
31 =head2 new
32
33 Create new instance of WebPAC using configuration specified by C<config_file>.
34
35 my $webpac = new WebPAC(
36 config_file => 'name.conf',
37 code_page => 'ISO-8859-2',
38 low_mem => 1,
39 filter => {
40 'lower' => sub { lc($_[0]) },
41 },
42 );
43
44 Default C<code_page> is C<ISO-8859-2>.
45
46 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
47
48 There is optinal parametar C<filter> which specify different filters which
49 can be applied using C<filter{name}> notation.
50
51 This method will also read configuration files
52 C<global.conf> (used by indexer and Web font-end)
53 and configuration file specified by C<config_file>
54 which describes databases to be indexed.
55
56 =cut
57
58 # mapping between data type and tag which specify
59 # format in XML file
60 my %type2tag = (
61 'isis' => 'isis',
62 # 'excel' => 'column',
63 # 'marc' => 'marc',
64 # 'feed' => 'feed'
65 );
66
67 sub new {
68 my $class = shift;
69 my $self = {@_};
70 bless($self, $class);
71
72 $self->{'start_t'} = time();
73
74 my $log_file = $self->{'log'} || "log.conf";
75 Log::Log4perl->init($log_file);
76
77 my $log = $self->_get_logger();
78
79 # fill in default values
80 # output codepage
81 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
82
83 #
84 # read global.conf
85 #
86 $log->debug("read 'global.conf'");
87
88 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
89
90 # read global config parametars
91 foreach my $var (qw(
92 dbi_dbd
93 dbi_dsn
94 dbi_user
95 dbi_passwd
96 show_progress
97 my_unac_filter
98 output_template
99 )) {
100 $self->{'global_config'}->{$var} = $config->val('global', $var);
101 }
102
103 #
104 # read indexer config file
105 #
106
107 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
108
109 # create UTF-8 convertor for import_xml files
110 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
111
112 # create Template toolkit instance
113 $self->{'tt'} = Template->new(
114 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
115 # FILTERS => {
116 # 'foo' => \&foo_filter,
117 # },
118 EVAL_PERL => 1,
119 );
120
121 # running with low_mem flag? well, use DBM::Deep then.
122 if ($self->{'low_mem'}) {
123 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
124
125 my $db_file = "data.db";
126
127 if (-e $db_file) {
128 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
129 $log->debug("removed '$db_file' from last run");
130 }
131
132 require DBM::Deep;
133
134 my $db = new DBM::Deep $db_file;
135
136 $log->logdie("DBM::Deep error: $!") unless ($db);
137
138 if ($db->error()) {
139 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
140 } else {
141 $log->debug("using file '$db_file' for DBM::Deep");
142 }
143
144 $self->{'db'} = $db;
145 }
146
147 $log->debug("filters defined: ",Dumper($self->{'filter'}));
148
149 return $self;
150 }
151
152 =head2 open_isis
153
154 Open CDS/ISIS database using OpenIsis module and read all records to memory.
155
156 $webpac->open_isis(
157 filename => '/data/ISIS/ISIS',
158 code_page => '852',
159 limit_mfn => 500,
160 start_mfn => 6000,
161 lookup => [ ... ],
162 );
163
164 By default, ISIS code page is assumed to be C<852>.
165
166 If optional parametar C<start_mfn> is set, this will be first MFN to read
167 from database (so you can skip beginning of your database if you need to).
168
169 If optional parametar C<limit_mfn> is set, it will read just 500 records
170 from database in example above.
171
172 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
173 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
174 value in index.
175
176 lookup => [
177 { 'key' => 'd:v900', 'val' => 'v250^a' },
178 { 'eval' => '"v901^a" eq "Podruèje"',
179 'key' => 'pa:v561^4:v562^4:v461^1',
180 'val' => 'v900' },
181 ]
182
183 Returns number of last record read into memory (size of database, really).
184
185 =cut
186
187 sub open_isis {
188 my $self = shift;
189 my $arg = {@_};
190
191 my $log = $self->_get_logger();
192
193 $log->logcroak("need filename") if (! $arg->{'filename'});
194 my $code_page = $arg->{'code_page'} || '852';
195
196 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
197
198 # store data in object
199 $self->{'isis_filename'} = $arg->{'filename'};
200 $self->{'isis_code_page'} = $code_page;
201
202 use OpenIsis;
203
204 #$self->{'isis_code_page'} = $code_page;
205
206 # create Text::Iconv object
207 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
208
209 $log->info("reading ISIS database '",$arg->{'filename'},"'");
210 $log->debug("isis code page: $code_page");
211
212 my $isis_db = OpenIsis::open($arg->{'filename'});
213
214 my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
215 my $startmfn = 1;
216
217 if (my $s = $self->{'start_mfn'}) {
218 $log->info("skipping to MFN $s");
219 $startmfn = $s;
220 } else {
221 $self->{'start_mfn'} = $startmfn;
222 }
223
224 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
225
226 $log->info("processing ",($maxmfn-$startmfn)." records...");
227
228 # read database
229 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
230
231
232 $log->debug("mfn: $mfn\n");
233
234 my $rec;
235
236 # read record
237 my $row = OpenIsis::read( $isis_db, $mfn );
238 foreach my $k (keys %{$row}) {
239 if ($k ne "mfn") {
240 foreach my $l (@{$row->{$k}}) {
241 $l = $cp->convert($l);
242 # has subfields?
243 my $val;
244 if ($l =~ m/\^/) {
245 foreach my $t (split(/\^/,$l)) {
246 next if (! $t);
247 $val->{substr($t,0,1)} = substr($t,1);
248 }
249 } else {
250 $val = $l;
251 }
252
253 push @{$rec->{$k}}, $val;
254 }
255 } else {
256 push @{$rec->{'000'}}, $mfn;
257 }
258
259 }
260
261 $log->confess("record $mfn empty?") unless ($rec);
262
263 # store
264 if ($self->{'low_mem'}) {
265 $self->{'db'}->put($mfn, $rec);
266 } else {
267 $self->{'data'}->{$mfn} = $rec;
268 }
269
270 # create lookup
271 $self->create_lookup($rec, @{$arg->{'lookup'}});
272
273 $self->progress_bar($mfn,$maxmfn);
274
275 }
276
277 $self->{'current_mfn'} = -1;
278 $self->{'last_pcnt'} = 0;
279
280 $log->debug("max mfn: $maxmfn");
281
282 # store max mfn and return it.
283 return $self->{'max_mfn'} = $maxmfn;
284 }
285
286 =head2 fetch_rec
287
288 Fetch next record from database. It will also display progress bar (once
289 it's implemented, that is).
290
291 my $rec = $webpac->fetch_rec;
292
293 =cut
294
295 sub fetch_rec {
296 my $self = shift;
297
298 my $log = $self->_get_logger();
299
300 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
301
302 if ($self->{'current_mfn'} == -1) {
303 $self->{'current_mfn'} = $self->{'start_mfn'};
304 } else {
305 $self->{'current_mfn'}++;
306 }
307
308 my $mfn = $self->{'current_mfn'};
309
310 if ($mfn > $self->{'max_mfn'}) {
311 $self->{'current_mfn'} = $self->{'max_mfn'};
312 $log->debug("at EOF");
313 return;
314 }
315
316 $self->progress_bar($mfn,$self->{'max_mfn'});
317
318 if ($self->{'low_mem'}) {
319 return $self->{'db'}->get($mfn);
320 } else {
321 return $self->{'data'}->{$mfn};
322 }
323 }
324
325 =head2 mfn
326
327 Returns current record number (MFN).
328
329 print $webpac->mfn;
330
331 =cut
332
333 sub mfn {
334 my $self = shift;
335 return $self->{'current_mfn'};
336 }
337
338 =head2 progress_bar
339
340 Draw progress bar on STDERR.
341
342 $webpac->progress_bar($current, $max);
343
344 =cut
345
346 sub progress_bar {
347 my $self = shift;
348
349 my ($curr,$max) = @_;
350
351 my $log = $self->_get_logger();
352
353 $log->logconfess("no current value!") if (! $curr);
354 $log->logconfess("no maximum value!") if (! $max);
355
356 if ($curr > $max) {
357 $max = $curr;
358 $log->debug("overflow to $curr");
359 }
360
361 $self->{'last_pcnt'} ||= 1;
362
363 my $p = int($curr * 100 / $max) || 1;
364
365 # reset on re-run
366 if ($p < $self->{'last_pcnt'}) {
367 $self->{'last_pcnt'} = $p;
368 $self->{'last_t'} = time();
369 $self->{'last_curr'} = undef;
370 }
371
372 $self->{'last_t'} ||= time();
373
374 if ($p != $self->{'last_pcnt'}) {
375
376 my $last_curr = $self->{'last_curr'} || $curr;
377 my $t = time();
378 my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
379 my $eta = ($max-$curr) / ($rate || 1);
380 printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
381 $self->{'last_pcnt'} = $p;
382 $self->{'last_t'} = time();
383 $self->{'last_curr'} = $curr;
384 }
385 print STDERR "\n" if ($p == 100);
386 }
387
388 =head2 fmt_time
389
390 Format time (in seconds) for display.
391
392 print $webpac->fmt_time(time());
393
394 This method is called by L<progress_bar> to display remaining time.
395
396 =cut
397
398 sub fmt_time {
399 my $self = shift;
400
401 my $t = shift || 0;
402 my $out = "";
403
404 my ($ss,$mm,$hh) = gmtime($t);
405 $out .= "${hh}h" if ($hh);
406 $out .= sprintf("%02d:%02d", $mm,$ss);
407 $out .= " " if ($hh == 0);
408 return $out;
409 }
410
411 =head2 open_import_xml
412
413 Read file from C<import_xml/> directory and parse it.
414
415 $webpac->open_import_xml(type => 'isis');
416
417 =cut
418
419 sub open_import_xml {
420 my $self = shift;
421
422 my $log = $self->_get_logger();
423
424 my $arg = {@_};
425 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
426
427 $self->{'type'} = $arg->{'type'};
428
429 my $type_base = $arg->{'type'};
430 $type_base =~ s/_.*$//g;
431
432 $self->{'tag'} = $type2tag{$type_base};
433
434 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
435
436 my $f = "./import_xml/".$self->{'type'}.".xml";
437 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
438
439 $log->info("reading '$f'");
440
441 $self->{'import_xml_file'} = $f;
442
443 $self->{'import_xml'} = XMLin($f,
444 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
445 );
446
447 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
448
449 }
450
451 =head2 create_lookup
452
453 Create lookup from record using lookup definition.
454
455 $self->create_lookup($rec, @lookups);
456
457 Called internally by C<open_*> methods.
458
459 =cut
460
461 sub create_lookup {
462 my $self = shift;
463
464 my $log = $self->_get_logger();
465
466 my $rec = shift || $log->logconfess("need record to create lookup");
467 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
468
469 foreach my $i (@_) {
470 $log->logconfess("need key") unless defined($i->{'key'});
471 $log->logconfess("need val") unless defined($i->{'val'});
472
473 if (defined($i->{'eval'})) {
474 # eval first, so we can skip fill_in for key and val
475 my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
476 if ($self->_eval($eval)) {
477 my $key = $self->fill_in($rec,$i->{'key'}) || next;
478 my @val = $self->fill_in($rec,$i->{'val'}) || next;
479 $log->debug("stored $key = ",sub { join(" | ",@val) });
480 push @{$self->{'lookup'}->{$key}}, @val;
481 }
482 } else {
483 my $key = $self->fill_in($rec,$i->{'key'}) || next;
484 my @val = $self->fill_in($rec,$i->{'val'}) || next;
485 $log->debug("stored $key = ",sub { join(" | ",@val) });
486 push @{$self->{'lookup'}->{$key}}, @val;
487 }
488 }
489 }
490
491 =head2 get_data
492
493 Returns value from record.
494
495 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
496
497 Arguments are:
498 record reference C<$rec>,
499 field C<$f>,
500 optional subfiled C<$sf>,
501 index for repeatable values C<$i>.
502
503 Optinal variable C<$found> will be incremeted if there
504 is field.
505
506 Returns value or empty string.
507
508 =cut
509
510 sub get_data {
511 my $self = shift;
512
513 my ($rec,$f,$sf,$i,$found) = @_;
514
515 if ($$rec->{$f}) {
516 return '' if (! $$rec->{$f}->[$i]);
517 no strict 'refs';
518 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
519 $$found++ if (defined($$found));
520 return $$rec->{$f}->[$i]->{$sf};
521 } elsif ($$rec->{$f}->[$i]) {
522 $$found++ if (defined($$found));
523 # it still might have subfield, just
524 # not specified, so we'll dump all
525 if ($$rec->{$f}->[$i] =~ /HASH/o) {
526 my $out;
527 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
528 $out .= $$rec->{$f}->[$i]->{$k}." ";
529 }
530 return $out;
531 } else {
532 return $$rec->{$f}->[$i];
533 }
534 }
535 } else {
536 return '';
537 }
538 }
539
540 =head2 fill_in
541
542 Workhourse of all: takes record from in-memory structure of database and
543 strings with placeholders and returns string or array of with substituted
544 values from record.
545
546 my $text = $webpac->fill_in($rec,'v250^a');
547
548 Optional argument is ordinal number for repeatable fields. By default,
549 it's assume to be first repeatable field (fields are perl array, so first
550 element is 0).
551 Following example will read second value from repeatable field.
552
553 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
554
555 This function B<does not> perform parsing of format to inteligenty skip
556 delimiters before fields which aren't used.
557
558 This method will automatically decode UTF-8 string to local code page
559 if needed.
560
561 =cut
562
563 sub fill_in {
564 my $self = shift;
565
566 my $log = $self->_get_logger();
567
568 my $rec = shift || $log->logconfess("need data record");
569 my $format = shift || $log->logconfess("need format to parse");
570 # iteration (for repeatable fields)
571 my $i = shift || 0;
572
573 # FIXME remove for speedup?
574 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
575
576 if (utf8::is_utf8($format)) {
577 $format = $self->_x($format);
578 }
579
580 my $found = 0;
581
582 my $eval_code;
583 # remove eval{...} from beginning
584 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
585
586 my $filter_name;
587 # remove filter{...} from beginning
588 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
589
590 # do actual replacement of placeholders
591 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
592
593 if ($found) {
594 $log->debug("format: $format");
595 if ($eval_code) {
596 my $eval = $self->fill_in($rec,$eval_code,$i);
597 return if (! $self->_eval($eval));
598 }
599 if ($filter_name && $self->{'filter'}->{$filter_name}) {
600 $log->debug("filter '$filter_name' for $format");
601 $format = $self->{'filter'}->{$filter_name}->($format);
602 return unless(defined($format));
603 $log->debug("filter result: $format");
604 }
605 # do we have lookups?
606 if ($format =~ /$LOOKUP_REGEX/o) {
607 $log->debug("format '$format' has lookup");
608 return $self->lookup($format);
609 } else {
610 return $format;
611 }
612 } else {
613 return;
614 }
615 }
616
617 =head2 lookup
618
619 Perform lookups on format supplied to it.
620
621 my $text = $self->lookup('[v900]');
622
623 Lookups can be nested (like C<[d:[a:[v900]]]>).
624
625 =cut
626
627 sub lookup {
628 my $self = shift;
629
630 my $log = $self->_get_logger();
631
632 my $tmp = shift || $log->logconfess("need format");
633
634 if ($tmp =~ /$LOOKUP_REGEX/o) {
635 my @in = ( $tmp );
636
637 $log->debug("lookup for: ",$tmp);
638
639 my @out;
640 while (my $f = shift @in) {
641 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
642 my $k = $1;
643 if ($self->{'lookup'}->{$k}) {
644 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
645 my $tmp2 = $f;
646 $tmp2 =~ s/lookup{$k}/$nv/g;
647 push @in, $tmp2;
648 }
649 } else {
650 undef $f;
651 }
652 } elsif ($f) {
653 push @out, $f;
654 }
655 }
656 $log->logconfess("return is array and it's not expected!") unless wantarray;
657 return @out;
658 } else {
659 return $tmp;
660 }
661 }
662
663 =head2 parse
664
665 Perform smart parsing of string, skipping delimiters for fields which aren't
666 defined. It can also eval code in format starting with C<eval{...}> and
667 return output or nothing depending on eval code.
668
669 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
670
671 =cut
672
673 sub parse {
674 my $self = shift;
675
676 my ($rec, $format_utf8, $i) = @_;
677
678 return if (! $format_utf8);
679
680 my $log = $self->_get_logger();
681
682 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
683 $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
684
685 $i = 0 if (! $i);
686
687 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
688
689 my @out;
690
691 $log->debug("format: $format");
692
693 my $eval_code;
694 # remove eval{...} from beginning
695 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
696
697 my $filter_name;
698 # remove filter{...} from beginning
699 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
700
701 my $prefix;
702 my $all_found=0;
703
704 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
705
706 my $del = $1 || '';
707 $prefix ||= $del if ($all_found == 0);
708
709 my $found = 0;
710 my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
711
712 if ($found) {
713 push @out, $del;
714 push @out, $tmp;
715 $all_found += $found;
716 }
717 }
718
719 return if (! $all_found);
720
721 my $out = join('',@out);
722
723 if ($out) {
724 # add rest of format (suffix)
725 $out .= $format;
726
727 # add prefix if not there
728 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
729
730 $log->debug("result: $out");
731 }
732
733 if ($eval_code) {
734 my $eval = $self->fill_in($rec,$eval_code,$i);
735 $log->debug("about to eval{$eval} format: $out");
736 return if (! $self->_eval($eval));
737 }
738
739 if ($filter_name && $self->{'filter'}->{$filter_name}) {
740 $log->debug("about to filter{$filter_name} format: $out");
741 $out = $self->{'filter'}->{$filter_name}->($out);
742 return unless(defined($out));
743 $log->debug("filter result: $out");
744 }
745
746 return $out;
747 }
748
749 =head2 parse_to_arr
750
751 Similar to C<parse>, but returns array of all repeatable fields
752
753 my @arr = $webpac->parse_to_arr($rec,'v250^a');
754
755 =cut
756
757 sub parse_to_arr {
758 my $self = shift;
759
760 my ($rec, $format_utf8) = @_;
761
762 my $log = $self->_get_logger();
763
764 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
765 return if (! $format_utf8);
766
767 my $i = 0;
768 my @arr;
769
770 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
771 push @arr, $v;
772 }
773
774 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
775
776 return @arr;
777 }
778
779 =head2 fill_in_to_arr
780
781 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
782 for fields which have lookups, so they shouldn't be parsed but rather
783 C<fill_id>ed.
784
785 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
786
787 =cut
788
789 sub fill_in_to_arr {
790 my $self = shift;
791
792 my ($rec, $format_utf8) = @_;
793
794 my $log = $self->_get_logger();
795
796 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
797 return if (! $format_utf8);
798
799 my $i = 0;
800 my @arr;
801
802 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
803 push @arr, @v;
804 }
805
806 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
807
808 return @arr;
809 }
810
811 =head2 sort_arr
812
813 Sort array ignoring case and html in data
814
815 my @sorted = $webpac->sort_arr(@unsorted);
816
817 =cut
818
819 sub sort_arr {
820 my $self = shift;
821
822 my $log = $self->_get_logger();
823
824 # FIXME add Schwartzian Transformation?
825
826 my @sorted = sort {
827 $a =~ s#<[^>]+/*>##;
828 $b =~ s#<[^>]+/*>##;
829 lc($b) cmp lc($a)
830 } @_;
831 $log->debug("sorted values: ",sub { join(", ",@sorted) });
832
833 return @sorted;
834 }
835
836
837 =head2 data_structure
838
839 Create in-memory data structure which represents layout from C<import_xml>.
840 It is used later to produce output.
841
842 my @ds = $webpac->data_structure($rec);
843
844 This method will also set C<$webpac->{'currnet_filename'}> if there is
845 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
846 <headline> tag.
847
848 =cut
849
850 sub data_structure {
851 my $self = shift;
852
853 my $log = $self->_get_logger();
854
855 my $rec = shift;
856 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
857
858 undef $self->{'currnet_filename'};
859 undef $self->{'headline'};
860
861 my @sorted_tags;
862 if ($self->{tags_by_order}) {
863 @sorted_tags = @{$self->{tags_by_order}};
864 } else {
865 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
866 $self->{tags_by_order} = \@sorted_tags;
867 }
868
869 my @ds;
870
871 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
872
873 foreach my $field (@sorted_tags) {
874
875 my $row;
876
877 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
878
879 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
880 my $format = $tag->{'value'} || $tag->{'content'};
881
882 $log->debug("format: $format");
883
884 my @v;
885 if ($format =~ /$LOOKUP_REGEX/o) {
886 @v = $self->fill_in_to_arr($rec,$format);
887 } else {
888 @v = $self->parse_to_arr($rec,$format);
889 }
890 next if (! @v);
891
892 if ($tag->{'sort'}) {
893 @v = $self->sort_arr(@v);
894 $log->warn("sort within tag is usually not what you want!");
895 }
896
897 # use format?
898 if ($tag->{'format_name'}) {
899 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
900 }
901
902 if ($field eq 'filename') {
903 $self->{'current_filename'} = join('',@v);
904 $log->debug("filename: ",$self->{'current_filename'});
905 } elsif ($field eq 'headline') {
906 $self->{'headline'} .= join('',@v);
907 $log->debug("headline: ",$self->{'headline'});
908 next; # don't return headline in data_structure!
909 }
910
911 # delimiter will join repeatable fields
912 if ($tag->{'delimiter'}) {
913 @v = ( join($tag->{'delimiter'}, @v) );
914 }
915
916 # default types
917 my @types = qw(display swish);
918 # override by type attribute
919 @types = ( $tag->{'type'} ) if ($tag->{'type'});
920
921 foreach my $type (@types) {
922 # append to previous line?
923 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
924 if ($tag->{'append'}) {
925
926 # I will delimit appended part with
927 # delimiter (or ,)
928 my $d = $tag->{'delimiter'};
929 # default delimiter
930 $d ||= ", ";
931
932 my $last = pop @{$row->{$type}};
933 $d = "" if (! $last);
934 $last .= $d . join($d, @v);
935 push @{$row->{$type}}, $last;
936
937 } else {
938 push @{$row->{$type}}, @v;
939 }
940 }
941
942
943 }
944
945 if ($row) {
946 $row->{'tag'} = $field;
947
948 # TODO: name_sigular, name_plural
949 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
950 $row->{'name'} = $name ? $self->_x($name) : $field;
951
952 # post-sort all values in field
953 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
954 $log->warn("sort at field tag not implemented");
955 }
956
957 push @ds, $row;
958
959 $log->debug("row $field: ",sub { Dumper($row) });
960 }
961
962 }
963
964 return @ds;
965
966 }
967
968 =head2 output
969
970 Create output from in-memory data structure using Template Toolkit template.
971
972 my $text = $webpac->output( template => 'text.tt', data => @ds );
973
974 =cut
975
976 sub output {
977 my $self = shift;
978
979 my $args = {@_};
980
981 my $log = $self->_get_logger();
982
983 $log->logconfess("need template name") if (! $args->{'template'});
984 $log->logconfess("need data array") if (! $args->{'data'});
985
986 my $out;
987
988 $self->{'tt'}->process(
989 $args->{'template'},
990 $args,
991 \$out
992 ) || confess $self->{'tt'}->error();
993
994 return $out;
995 }
996
997 =head2 output_file
998
999 Create output from in-memory data structure using Template Toolkit template
1000 to a file.
1001
1002 $webpac->output_file(
1003 file => 'out.txt',
1004 template => 'text.tt',
1005 data => @ds
1006 );
1007
1008 =cut
1009
1010 sub output_file {
1011 my $self = shift;
1012
1013 my $args = {@_};
1014
1015 my $log = $self->_get_logger();
1016
1017 my $file = $args->{'file'} || $log->logconfess("need file name");
1018
1019 $log->debug("creating file ",$file);
1020
1021 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1022 print $fh $self->output(
1023 template => $args->{'template'},
1024 data => $args->{'data'},
1025 ) || $log->logdie("print: $!");
1026 close($fh) || $log->logdie("close: $!");
1027 }
1028
1029 =head2 apply_format
1030
1031 Apply format specified in tag with C<format_name="name"> and
1032 C<format_delimiter=";;">.
1033
1034 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1035
1036 Formats can contain C<lookup{...}> if you need them.
1037
1038 =cut
1039
1040 sub apply_format {
1041 my $self = shift;
1042
1043 my ($name,$delimiter,$data) = @_;
1044
1045 my $log = $self->_get_logger();
1046
1047 if (! $self->{'import_xml'}->{'format'}->{$name}) {
1048 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1049 return $data;
1050 }
1051
1052 $log->warn("no delimiter for format $name") if (! $delimiter);
1053
1054 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1055
1056 my @data = split(/\Q$delimiter\E/, $data);
1057
1058 my $out = sprintf($format, @data);
1059 $log->debug("using format $name [$format] on $data to produce: $out");
1060
1061 if ($out =~ m/$LOOKUP_REGEX/o) {
1062 return $self->lookup($out);
1063 } else {
1064 return $out;
1065 }
1066
1067 }
1068
1069
1070 #
1071 #
1072 #
1073
1074 =head1 INTERNAL METHODS
1075
1076 Here is a quick list of internal methods, mostly useful to turn debugging
1077 on them (see L<LOGGING> below for explanation).
1078
1079 =cut
1080
1081 =head2 _eval
1082
1083 Internal function to eval code without C<strict 'subs'>.
1084
1085 =cut
1086
1087 sub _eval {
1088 my $self = shift;
1089
1090 my $code = shift || return;
1091
1092 my $log = $self->_get_logger();
1093
1094 no strict 'subs';
1095 my $ret = eval $code;
1096 if ($@) {
1097 $log->error("problem with eval code [$code]: $@");
1098 }
1099
1100 $log->debug("eval: ",$code," [",$ret,"]");
1101
1102 return $ret || undef;
1103 }
1104
1105 =head2 _sort_by_order
1106
1107 Sort xml tags data structure accoding to C<order=""> attribute.
1108
1109 =cut
1110
1111 sub _sort_by_order {
1112 my $self = shift;
1113
1114 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1115 $self->{'import_xml'}->{'indexer'}->{$a};
1116 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1117 $self->{'import_xml'}->{'indexer'}->{$b};
1118
1119 return $va <=> $vb;
1120 }
1121
1122 =head2 _get_logger
1123
1124 Get C<Log::Log4perl> object with a twist: domains are defined for each
1125 method
1126
1127 my $log = $webpac->_get_logger();
1128
1129 =cut
1130
1131 sub _get_logger {
1132 my $self = shift;
1133
1134 my $name = (caller(1))[3] || caller;
1135 return get_logger($name);
1136 }
1137
1138 =head2 _x
1139
1140 Convert string from UTF-8 to code page defined in C<import_xml>.
1141
1142 my $text = $webpac->_x('utf8 text');
1143
1144 =cut
1145
1146 sub _x {
1147 my $self = shift;
1148 my $utf8 = shift || return;
1149
1150 return $self->{'utf2cp'}->convert($utf8) ||
1151 $self->_get_logger()->logwarn("can't convert '$utf8'");
1152 }
1153
1154 #
1155 #
1156 #
1157
1158 =head1 LOGGING
1159
1160 Logging in WebPAC is performed by L<Log::Log4perl> with config file
1161 C<log.conf>.
1162
1163 Methods defined above have different levels of logging, so
1164 it's descriptions will be useful to turn (mostry B<debug> logging) on
1165 or off to see why WabPAC isn't perforing as you expect it (it might even
1166 be a bug!).
1167
1168 B<This is different from normal Log4perl behaviour>. To repeat, you can
1169 also use method names, and not only classes (which are just few)
1170 to filter logging.
1171
1172
1173 =head1 MEMORY USAGE
1174
1175 C<low_mem> options is double-edged sword. If enabled, WebPAC
1176 will run on memory constraint machines (which doesn't have enough
1177 physical RAM to create memory structure for whole source database).
1178
1179 If your machine has 512Mb or more of RAM and database is around 10000 records,
1180 memory shouldn't be an issue. If you don't have enough physical RAM, you
1181 might consider using virtual memory (if your operating system is handling it
1182 well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1183 parsed structure of ISIS database (this is what C<low_mem> option does).
1184
1185 Hitting swap at end of reading source database is probably o.k. However,
1186 hitting swap before 90% will dramatically decrease performance and you will
1187 be better off with C<low_mem> and using rest of availble memory for
1188 operating system disk cache (Linux is particuallary good about this).
1189 However, every access to database record will require disk access, so
1190 generation phase will be slower 10-100 times.
1191
1192 Parsed structures are essential - you just have option to trade RAM memory
1193 (which is fast) for disk space (which is slow). Be sure to have planty of
1194 disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1195
1196 However, when WebPAC is running on desktop machines (or laptops :-), it's
1197 highly undesireable for system to start swapping. Using C<low_mem> option can
1198 reduce WecPAC memory usage to around 64Mb for same database with lookup
1199 fields and sorted indexes which stay in RAM. Performance will suffer, but
1200 memory usage will really be minimal. It might be also more confortable to
1201 run WebPAC reniced on those machines.
1202
1203 =cut
1204
1205 1;

  ViewVC Help
Powered by ViewVC 1.1.26