/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 705 - (hide annotations)
Wed Jul 13 22:34:52 2005 UTC (18 years, 9 months ago) by dpavlin
File size: 28410 byte(s)
ported code to use Biblio::Isis, newer version (released on CPAN of IsisDB)

1 dpavlin 354 package WebPAC;
2 dpavlin 352
3 dpavlin 367 use warnings;
4     use strict;
5    
6 dpavlin 352 use Carp;
7 dpavlin 353 use Text::Iconv;
8     use Config::IniFiles;
9 dpavlin 363 use XML::Simple;
10 dpavlin 370 use Template;
11 dpavlin 372 use Log::Log4perl qw(get_logger :levels);
12 dpavlin 422 use Time::HiRes qw(time);
13 dpavlin 352
14 dpavlin 358 use Data::Dumper;
15    
16 dpavlin 705 my ($have_biblio_isis, $have_openisis) = (0,0);
17    
18     eval "use Biblio::Isis 0.13;";
19     unless ($@) {
20     $have_biblio_isis = 1
21     } else {
22     eval "use OpenIsis;";
23     $have_openisis = 1 unless ($@);
24     }
25    
26 dpavlin 373 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
27     #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
28     my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
29     my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
30    
31 dpavlin 352 =head1 NAME
32    
33 dpavlin 354 WebPAC - base class for WebPAC
34 dpavlin 352
35     =head1 DESCRIPTION
36    
37 dpavlin 354 This module implements methods used by WebPAC.
38 dpavlin 352
39     =head1 METHODS
40    
41     =head2 new
42    
43 dpavlin 421 Create new instance of WebPAC using configuration specified by C<config_file>.
44 dpavlin 352
45 dpavlin 354 my $webpac = new WebPAC(
46 dpavlin 352 config_file => 'name.conf',
47 dpavlin 439 code_page => 'ISO-8859-2',
48     low_mem => 1,
49 dpavlin 500 filter => {
50     'lower' => sub { lc($_[0]) },
51     },
52 dpavlin 352 );
53    
54     Default C<code_page> is C<ISO-8859-2>.
55    
56 dpavlin 422 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
57    
58 dpavlin 500 There is optinal parametar C<filter> which specify different filters which
59     can be applied using C<filter{name}> notation.
60 dpavlin 560 Same filters can be used in Template Toolkit files.
61 dpavlin 500
62 dpavlin 421 This method will also read configuration files
63 dpavlin 353 C<global.conf> (used by indexer and Web font-end)
64     and configuration file specified by C<config_file>
65     which describes databases to be indexed.
66    
67 dpavlin 352 =cut
68    
69 dpavlin 363 # mapping between data type and tag which specify
70     # format in XML file
71     my %type2tag = (
72     'isis' => 'isis',
73     # 'excel' => 'column',
74     # 'marc' => 'marc',
75     # 'feed' => 'feed'
76     );
77    
78 dpavlin 352 sub new {
79     my $class = shift;
80     my $self = {@_};
81     bless($self, $class);
82    
83 dpavlin 422 $self->{'start_t'} = time();
84    
85 dpavlin 372 my $log_file = $self->{'log'} || "log.conf";
86     Log::Log4perl->init($log_file);
87    
88     my $log = $self->_get_logger();
89    
90 dpavlin 352 # fill in default values
91     # output codepage
92     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
93    
94 dpavlin 353 #
95     # read global.conf
96     #
97 dpavlin 372 $log->debug("read 'global.conf'");
98 dpavlin 352
99 dpavlin 372 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
100 dpavlin 352
101     # read global config parametars
102     foreach my $var (qw(
103     dbi_dbd
104     dbi_dsn
105     dbi_user
106     dbi_passwd
107     show_progress
108     my_unac_filter
109 dpavlin 370 output_template
110 dpavlin 352 )) {
111 dpavlin 370 $self->{'global_config'}->{$var} = $config->val('global', $var);
112 dpavlin 352 }
113    
114 dpavlin 353 #
115     # read indexer config file
116     #
117 dpavlin 352
118 dpavlin 372 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
119 dpavlin 352
120 dpavlin 370 # create UTF-8 convertor for import_xml files
121 dpavlin 366 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
122 dpavlin 370
123     # create Template toolkit instance
124     $self->{'tt'} = Template->new(
125     INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
126 dpavlin 560 FILTERS => $self->{'filter'},
127 dpavlin 370 EVAL_PERL => 1,
128     );
129    
130 dpavlin 421 # running with low_mem flag? well, use DBM::Deep then.
131     if ($self->{'low_mem'}) {
132 dpavlin 422 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
133 dpavlin 421
134     my $db_file = "data.db";
135    
136     if (-e $db_file) {
137     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
138     $log->debug("removed '$db_file' from last run");
139     }
140    
141 dpavlin 434 require DBM::Deep;
142 dpavlin 421
143     my $db = new DBM::Deep $db_file;
144    
145     $log->logdie("DBM::Deep error: $!") unless ($db);
146    
147     if ($db->error()) {
148     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
149     } else {
150 dpavlin 422 $log->debug("using file '$db_file' for DBM::Deep");
151 dpavlin 421 }
152    
153     $self->{'db'} = $db;
154     }
155    
156 dpavlin 500 $log->debug("filters defined: ",Dumper($self->{'filter'}));
157    
158 dpavlin 352 return $self;
159     }
160    
161     =head2 open_isis
162    
163 dpavlin 609 Open CDS/ISIS, WinISIS or IsisMarc database using IsisDB or OpenIsis module
164     and read all records to memory.
165 dpavlin 352
166     $webpac->open_isis(
167     filename => '/data/ISIS/ISIS',
168     code_page => '852',
169 dpavlin 431 limit_mfn => 500,
170     start_mfn => 6000,
171 dpavlin 352 lookup => [ ... ],
172     );
173    
174     By default, ISIS code page is assumed to be C<852>.
175    
176 dpavlin 431 If optional parametar C<start_mfn> is set, this will be first MFN to read
177     from database (so you can skip beginning of your database if you need to).
178    
179 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
180     from database in example above.
181 dpavlin 352
182     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
183     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
184     value in index.
185    
186     lookup => [
187     { 'key' => 'd:v900', 'val' => 'v250^a' },
188     { 'eval' => '"v901^a" eq "Podruèje"',
189     'key' => 'pa:v561^4:v562^4:v461^1',
190     'val' => 'v900' },
191     ]
192    
193 dpavlin 367 Returns number of last record read into memory (size of database, really).
194    
195 dpavlin 352 =cut
196    
197     sub open_isis {
198     my $self = shift;
199     my $arg = {@_};
200    
201 dpavlin 372 my $log = $self->_get_logger();
202    
203     $log->logcroak("need filename") if (! $arg->{'filename'});
204 dpavlin 352 my $code_page = $arg->{'code_page'} || '852';
205    
206 dpavlin 436 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
207    
208 dpavlin 421 # store data in object
209     $self->{'isis_filename'} = $arg->{'filename'};
210     $self->{'isis_code_page'} = $code_page;
211    
212 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
213    
214     # create Text::Iconv object
215     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
216    
217 dpavlin 372 $log->info("reading ISIS database '",$arg->{'filename'},"'");
218 dpavlin 421 $log->debug("isis code page: $code_page");
219 dpavlin 363
220 dpavlin 609 my ($isis_db,$maxmfn);
221    
222 dpavlin 705 if ($have_openisis) {
223     $log->debug("using OpenIsis perl bindings");
224 dpavlin 609 $isis_db = OpenIsis::open($arg->{'filename'});
225     $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
226 dpavlin 705 } elsif ($have_biblio_isis) {
227     $log->debug("using Biblio::Isis");
228     use Biblio::Isis;
229     $isis_db = new Biblio::Isis(
230 dpavlin 609 isisdb => $arg->{'filename'},
231     include_deleted => 1,
232     hash_filter => sub {
233     my $l = shift || return;
234     $l = $cp->convert($l);
235     return $l;
236     },
237     );
238     $maxmfn = $isis_db->{'maxmfn'};
239 dpavlin 705 } else {
240     $log->logdie("Can't find supported ISIS library for perl. I suggent that you install Bilbio::Isis from CPAN.");
241 dpavlin 609 }
242    
243    
244 dpavlin 431 my $startmfn = 1;
245 dpavlin 352
246 dpavlin 431 if (my $s = $self->{'start_mfn'}) {
247     $log->info("skipping to MFN $s");
248     $startmfn = $s;
249 dpavlin 448 } else {
250     $self->{'start_mfn'} = $startmfn;
251 dpavlin 431 }
252 dpavlin 357
253 dpavlin 431 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
254 dpavlin 363
255 dpavlin 705 $log->info("processing ",($maxmfn-$startmfn)." records using ",( $have_openisis ? 'OpenIsis' : 'Biblio::Isis'));
256 dpavlin 431
257 dpavlin 609
258 dpavlin 352 # read database
259 dpavlin 431 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
260 dpavlin 352
261 dpavlin 418
262     $log->debug("mfn: $mfn\n");
263    
264 dpavlin 421 my $rec;
265    
266 dpavlin 705 if ($have_openisis) {
267 dpavlin 609
268     # read record using OpenIsis
269     my $row = OpenIsis::read( $isis_db, $mfn );
270     foreach my $k (keys %{$row}) {
271     if ($k ne "mfn") {
272     foreach my $l (@{$row->{$k}}) {
273     $l = $cp->convert($l);
274     # has subfields?
275     my $val;
276     if ($l =~ m/\^/) {
277     foreach my $t (split(/\^/,$l)) {
278     next if (! $t);
279     $val->{substr($t,0,1)} = substr($t,1);
280     }
281     } else {
282     $val = $l;
283 dpavlin 352 }
284 dpavlin 609
285     push @{$rec->{$k}}, $val;
286 dpavlin 352 }
287 dpavlin 609 } else {
288     push @{$rec->{'000'}}, $mfn;
289 dpavlin 352 }
290     }
291    
292 dpavlin 705 } elsif ($have_biblio_isis) {
293     $rec = $isis_db->to_hash($mfn);
294 dpavlin 609 } else {
295 dpavlin 705 $log->logdie("hum? implementation missing?");
296 dpavlin 352 }
297    
298 dpavlin 421 $log->confess("record $mfn empty?") unless ($rec);
299    
300     # store
301     if ($self->{'low_mem'}) {
302     $self->{'db'}->put($mfn, $rec);
303     } else {
304     $self->{'data'}->{$mfn} = $rec;
305     }
306    
307 dpavlin 352 # create lookup
308 dpavlin 355 $self->create_lookup($rec, @{$arg->{'lookup'}});
309 dpavlin 352
310 dpavlin 389 $self->progress_bar($mfn,$maxmfn);
311    
312 dpavlin 352 }
313    
314 dpavlin 448 $self->{'current_mfn'} = -1;
315 dpavlin 389 $self->{'last_pcnt'} = 0;
316 dpavlin 362
317 dpavlin 418 $log->debug("max mfn: $maxmfn");
318    
319 dpavlin 352 # store max mfn and return it.
320     return $self->{'max_mfn'} = $maxmfn;
321     }
322    
323 dpavlin 362 =head2 fetch_rec
324    
325     Fetch next record from database. It will also display progress bar (once
326     it's implemented, that is).
327    
328     my $rec = $webpac->fetch_rec;
329    
330     =cut
331    
332     sub fetch_rec {
333     my $self = shift;
334    
335 dpavlin 372 my $log = $self->_get_logger();
336 dpavlin 362
337 dpavlin 448 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
338 dpavlin 372
339 dpavlin 448 if ($self->{'current_mfn'} == -1) {
340     $self->{'current_mfn'} = $self->{'start_mfn'};
341     } else {
342     $self->{'current_mfn'}++;
343     }
344    
345     my $mfn = $self->{'current_mfn'};
346    
347 dpavlin 362 if ($mfn > $self->{'max_mfn'}) {
348     $self->{'current_mfn'} = $self->{'max_mfn'};
349 dpavlin 373 $log->debug("at EOF");
350 dpavlin 362 return;
351     }
352    
353 dpavlin 389 $self->progress_bar($mfn,$self->{'max_mfn'});
354    
355 dpavlin 421 if ($self->{'low_mem'}) {
356     return $self->{'db'}->get($mfn);
357     } else {
358     return $self->{'data'}->{$mfn};
359     }
360 dpavlin 362 }
361    
362 dpavlin 448 =head2 mfn
363    
364     Returns current record number (MFN).
365    
366     print $webpac->mfn;
367    
368     =cut
369    
370     sub mfn {
371     my $self = shift;
372     return $self->{'current_mfn'};
373     }
374    
375 dpavlin 389 =head2 progress_bar
376    
377     Draw progress bar on STDERR.
378    
379     $webpac->progress_bar($current, $max);
380    
381     =cut
382    
383     sub progress_bar {
384     my $self = shift;
385    
386     my ($curr,$max) = @_;
387    
388     my $log = $self->_get_logger();
389    
390     $log->logconfess("no current value!") if (! $curr);
391     $log->logconfess("no maximum value!") if (! $max);
392    
393     if ($curr > $max) {
394     $max = $curr;
395     $log->debug("overflow to $curr");
396     }
397    
398     $self->{'last_pcnt'} ||= 1;
399    
400 dpavlin 454 my $p = int($curr * 100 / $max) || 1;
401 dpavlin 389
402 dpavlin 422 # reset on re-run
403     if ($p < $self->{'last_pcnt'}) {
404     $self->{'last_pcnt'} = $p;
405 dpavlin 609 $self->{'start_t'} = time();
406 dpavlin 422 }
407    
408 dpavlin 389 if ($p != $self->{'last_pcnt'}) {
409 dpavlin 422
410     my $t = time();
411 dpavlin 609 my $rate = ($curr / ($t - $self->{'start_t'} || 1));
412 dpavlin 422 my $eta = ($max-$curr) / ($rate || 1);
413     printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
414 dpavlin 389 $self->{'last_pcnt'} = $p;
415 dpavlin 422 $self->{'last_curr'} = $curr;
416 dpavlin 389 }
417 dpavlin 412 print STDERR "\n" if ($p == 100);
418 dpavlin 389 }
419    
420 dpavlin 422 =head2 fmt_time
421    
422     Format time (in seconds) for display.
423    
424     print $webpac->fmt_time(time());
425    
426     This method is called by L<progress_bar> to display remaining time.
427    
428     =cut
429    
430     sub fmt_time {
431     my $self = shift;
432    
433     my $t = shift || 0;
434     my $out = "";
435    
436     my ($ss,$mm,$hh) = gmtime($t);
437     $out .= "${hh}h" if ($hh);
438     $out .= sprintf("%02d:%02d", $mm,$ss);
439     $out .= " " if ($hh == 0);
440     return $out;
441     }
442    
443 dpavlin 363 =head2 open_import_xml
444    
445     Read file from C<import_xml/> directory and parse it.
446    
447     $webpac->open_import_xml(type => 'isis');
448    
449     =cut
450    
451     sub open_import_xml {
452     my $self = shift;
453    
454 dpavlin 372 my $log = $self->_get_logger();
455    
456 dpavlin 363 my $arg = {@_};
457 dpavlin 372 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
458 dpavlin 363
459 dpavlin 366 $self->{'type'} = $arg->{'type'};
460 dpavlin 363
461 dpavlin 366 my $type_base = $arg->{'type'};
462 dpavlin 363 $type_base =~ s/_.*$//g;
463    
464 dpavlin 366 $self->{'tag'} = $type2tag{$type_base};
465    
466 dpavlin 375 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
467 dpavlin 366
468     my $f = "./import_xml/".$self->{'type'}.".xml";
469 dpavlin 372 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
470 dpavlin 363
471 dpavlin 375 $log->info("reading '$f'");
472 dpavlin 363
473 dpavlin 375 $self->{'import_xml_file'} = $f;
474    
475 dpavlin 363 $self->{'import_xml'} = XMLin($f,
476 dpavlin 366 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
477 dpavlin 363 );
478    
479 dpavlin 375 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
480    
481 dpavlin 363 }
482    
483 dpavlin 355 =head2 create_lookup
484    
485     Create lookup from record using lookup definition.
486    
487 dpavlin 367 $self->create_lookup($rec, @lookups);
488    
489     Called internally by C<open_*> methods.
490    
491 dpavlin 355 =cut
492    
493     sub create_lookup {
494     my $self = shift;
495    
496 dpavlin 372 my $log = $self->_get_logger();
497 dpavlin 355
498 dpavlin 372 my $rec = shift || $log->logconfess("need record to create lookup");
499     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
500    
501 dpavlin 355 foreach my $i (@_) {
502 dpavlin 418 $log->logconfess("need key") unless defined($i->{'key'});
503     $log->logconfess("need val") unless defined($i->{'val'});
504    
505     if (defined($i->{'eval'})) {
506     # eval first, so we can skip fill_in for key and val
507     my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
508     if ($self->_eval($eval)) {
509     my $key = $self->fill_in($rec,$i->{'key'}) || next;
510     my @val = $self->fill_in($rec,$i->{'val'}) || next;
511 dpavlin 373 $log->debug("stored $key = ",sub { join(" | ",@val) });
512 dpavlin 355 push @{$self->{'lookup'}->{$key}}, @val;
513     }
514     } else {
515 dpavlin 418 my $key = $self->fill_in($rec,$i->{'key'}) || next;
516     my @val = $self->fill_in($rec,$i->{'val'}) || next;
517     $log->debug("stored $key = ",sub { join(" | ",@val) });
518     push @{$self->{'lookup'}->{$key}}, @val;
519 dpavlin 355 }
520     }
521     }
522    
523 dpavlin 356 =head2 get_data
524    
525     Returns value from record.
526    
527 dpavlin 367 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
528 dpavlin 356
529     Arguments are:
530     record reference C<$rec>,
531     field C<$f>,
532     optional subfiled C<$sf>,
533     index for repeatable values C<$i>.
534    
535 dpavlin 367 Optinal variable C<$found> will be incremeted if there
536 dpavlin 356 is field.
537    
538     Returns value or empty string.
539    
540     =cut
541    
542     sub get_data {
543     my $self = shift;
544    
545     my ($rec,$f,$sf,$i,$found) = @_;
546 dpavlin 367
547 dpavlin 356 if ($$rec->{$f}) {
548 dpavlin 367 return '' if (! $$rec->{$f}->[$i]);
549 dpavlin 389 no strict 'refs';
550 dpavlin 356 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
551     $$found++ if (defined($$found));
552     return $$rec->{$f}->[$i]->{$sf};
553     } elsif ($$rec->{$f}->[$i]) {
554     $$found++ if (defined($$found));
555 dpavlin 366 # it still might have subfield, just
556     # not specified, so we'll dump all
557     if ($$rec->{$f}->[$i] =~ /HASH/o) {
558     my $out;
559     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
560     $out .= $$rec->{$f}->[$i]->{$k}." ";
561     }
562     return $out;
563     } else {
564     return $$rec->{$f}->[$i];
565     }
566 dpavlin 356 }
567     } else {
568     return '';
569     }
570     }
571    
572 dpavlin 352 =head2 fill_in
573    
574     Workhourse of all: takes record from in-memory structure of database and
575     strings with placeholders and returns string or array of with substituted
576     values from record.
577    
578 dpavlin 367 my $text = $webpac->fill_in($rec,'v250^a');
579 dpavlin 352
580     Optional argument is ordinal number for repeatable fields. By default,
581 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
582     element is 0).
583     Following example will read second value from repeatable field.
584 dpavlin 352
585 dpavlin 367 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
586 dpavlin 353
587     This function B<does not> perform parsing of format to inteligenty skip
588     delimiters before fields which aren't used.
589    
590 dpavlin 376 This method will automatically decode UTF-8 string to local code page
591     if needed.
592    
593 dpavlin 352 =cut
594    
595 dpavlin 372 sub fill_in {
596 dpavlin 371 my $self = shift;
597    
598 dpavlin 372 my $log = $self->_get_logger();
599 dpavlin 371
600 dpavlin 372 my $rec = shift || $log->logconfess("need data record");
601     my $format = shift || $log->logconfess("need format to parse");
602 dpavlin 352 # iteration (for repeatable fields)
603     my $i = shift || 0;
604    
605 dpavlin 555 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
606    
607 dpavlin 352 # FIXME remove for speedup?
608 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
609 dpavlin 352
610 dpavlin 376 if (utf8::is_utf8($format)) {
611     $format = $self->_x($format);
612     }
613    
614 dpavlin 352 my $found = 0;
615    
616 dpavlin 359 my $eval_code;
617     # remove eval{...} from beginning
618     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
619    
620 dpavlin 500 my $filter_name;
621     # remove filter{...} from beginning
622     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
623    
624 dpavlin 352 # do actual replacement of placeholders
625 dpavlin 555 # repeatable fields
626 dpavlin 373 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
627 dpavlin 555 # non-repeatable fields
628     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
629 dpavlin 352
630 dpavlin 353 if ($found) {
631 dpavlin 373 $log->debug("format: $format");
632 dpavlin 359 if ($eval_code) {
633     my $eval = $self->fill_in($rec,$eval_code,$i);
634 dpavlin 371 return if (! $self->_eval($eval));
635 dpavlin 359 }
636 dpavlin 500 if ($filter_name && $self->{'filter'}->{$filter_name}) {
637     $log->debug("filter '$filter_name' for $format");
638     $format = $self->{'filter'}->{$filter_name}->($format);
639     return unless(defined($format));
640     $log->debug("filter result: $format");
641     }
642 dpavlin 353 # do we have lookups?
643 dpavlin 373 if ($format =~ /$LOOKUP_REGEX/o) {
644     $log->debug("format '$format' has lookup");
645 dpavlin 353 return $self->lookup($format);
646     } else {
647     return $format;
648     }
649 dpavlin 352 } else {
650     return;
651     }
652     }
653    
654     =head2 lookup
655    
656 dpavlin 355 Perform lookups on format supplied to it.
657 dpavlin 352
658 dpavlin 367 my $text = $self->lookup('[v900]');
659 dpavlin 352
660 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
661    
662 dpavlin 352 =cut
663    
664     sub lookup {
665     my $self = shift;
666    
667 dpavlin 372 my $log = $self->_get_logger();
668 dpavlin 352
669 dpavlin 372 my $tmp = shift || $log->logconfess("need format");
670    
671 dpavlin 373 if ($tmp =~ /$LOOKUP_REGEX/o) {
672 dpavlin 352 my @in = ( $tmp );
673 dpavlin 372
674     $log->debug("lookup for: ",$tmp);
675    
676 dpavlin 352 my @out;
677     while (my $f = shift @in) {
678 dpavlin 373 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
679 dpavlin 352 my $k = $1;
680     if ($self->{'lookup'}->{$k}) {
681     foreach my $nv (@{$self->{'lookup'}->{$k}}) {
682     my $tmp2 = $f;
683 dpavlin 373 $tmp2 =~ s/lookup{$k}/$nv/g;
684 dpavlin 352 push @in, $tmp2;
685     }
686     } else {
687     undef $f;
688     }
689     } elsif ($f) {
690     push @out, $f;
691     }
692     }
693 dpavlin 373 $log->logconfess("return is array and it's not expected!") unless wantarray;
694 dpavlin 352 return @out;
695     } else {
696     return $tmp;
697     }
698     }
699    
700 dpavlin 356 =head2 parse
701    
702     Perform smart parsing of string, skipping delimiters for fields which aren't
703     defined. It can also eval code in format starting with C<eval{...}> and
704     return output or nothing depending on eval code.
705    
706 dpavlin 367 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
707 dpavlin 356
708     =cut
709    
710     sub parse {
711     my $self = shift;
712    
713 dpavlin 366 my ($rec, $format_utf8, $i) = @_;
714 dpavlin 356
715 dpavlin 366 return if (! $format_utf8);
716    
717 dpavlin 372 my $log = $self->_get_logger();
718 dpavlin 358
719 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
720     $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
721    
722 dpavlin 358 $i = 0 if (! $i);
723    
724 dpavlin 375 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
725 dpavlin 366
726 dpavlin 356 my @out;
727    
728 dpavlin 373 $log->debug("format: $format");
729    
730 dpavlin 356 my $eval_code;
731     # remove eval{...} from beginning
732     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
733    
734 dpavlin 500 my $filter_name;
735     # remove filter{...} from beginning
736     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
737    
738 dpavlin 358 my $prefix;
739     my $all_found=0;
740 dpavlin 356
741 dpavlin 555 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
742 dpavlin 356
743 dpavlin 358 my $del = $1 || '';
744 dpavlin 359 $prefix ||= $del if ($all_found == 0);
745 dpavlin 358
746 dpavlin 555 # repeatable index
747     my $r = $i;
748     $r = 0 if (lc("$2") eq 's');
749    
750 dpavlin 358 my $found = 0;
751 dpavlin 555 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
752 dpavlin 358
753 dpavlin 356 if ($found) {
754 dpavlin 358 push @out, $del;
755     push @out, $tmp;
756     $all_found += $found;
757 dpavlin 356 }
758     }
759    
760 dpavlin 358 return if (! $all_found);
761 dpavlin 356
762 dpavlin 373 my $out = join('',@out);
763 dpavlin 358
764 dpavlin 373 if ($out) {
765     # add rest of format (suffix)
766     $out .= $format;
767 dpavlin 367
768 dpavlin 373 # add prefix if not there
769     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
770    
771     $log->debug("result: $out");
772     }
773    
774 dpavlin 359 if ($eval_code) {
775 dpavlin 501 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
776 dpavlin 500 $log->debug("about to eval{$eval} format: $out");
777 dpavlin 371 return if (! $self->_eval($eval));
778 dpavlin 359 }
779 dpavlin 500
780     if ($filter_name && $self->{'filter'}->{$filter_name}) {
781     $log->debug("about to filter{$filter_name} format: $out");
782     $out = $self->{'filter'}->{$filter_name}->($out);
783     return unless(defined($out));
784     $log->debug("filter result: $out");
785     }
786 dpavlin 359
787 dpavlin 358 return $out;
788 dpavlin 356 }
789    
790 dpavlin 367 =head2 parse_to_arr
791    
792     Similar to C<parse>, but returns array of all repeatable fields
793    
794     my @arr = $webpac->parse_to_arr($rec,'v250^a');
795    
796     =cut
797    
798     sub parse_to_arr {
799     my $self = shift;
800    
801     my ($rec, $format_utf8) = @_;
802    
803 dpavlin 372 my $log = $self->_get_logger();
804    
805     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
806 dpavlin 367 return if (! $format_utf8);
807    
808     my $i = 0;
809     my @arr;
810    
811     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
812     push @arr, $v;
813     }
814    
815 dpavlin 373 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
816    
817 dpavlin 367 return @arr;
818     }
819    
820 dpavlin 373 =head2 fill_in_to_arr
821    
822     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
823     for fields which have lookups, so they shouldn't be parsed but rather
824     C<fill_id>ed.
825    
826     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
827    
828     =cut
829    
830     sub fill_in_to_arr {
831     my $self = shift;
832    
833     my ($rec, $format_utf8) = @_;
834    
835     my $log = $self->_get_logger();
836    
837     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
838     return if (! $format_utf8);
839    
840     my $i = 0;
841     my @arr;
842    
843     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
844     push @arr, @v;
845     }
846    
847     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
848    
849     return @arr;
850     }
851    
852 dpavlin 441 =head2 sort_arr
853 dpavlin 373
854 dpavlin 441 Sort array ignoring case and html in data
855    
856     my @sorted = $webpac->sort_arr(@unsorted);
857    
858     =cut
859    
860     sub sort_arr {
861     my $self = shift;
862    
863     my $log = $self->_get_logger();
864    
865     # FIXME add Schwartzian Transformation?
866    
867     my @sorted = sort {
868     $a =~ s#<[^>]+/*>##;
869     $b =~ s#<[^>]+/*>##;
870     lc($b) cmp lc($a)
871     } @_;
872     $log->debug("sorted values: ",sub { join(", ",@sorted) });
873    
874     return @sorted;
875     }
876    
877    
878 dpavlin 366 =head2 data_structure
879    
880     Create in-memory data structure which represents layout from C<import_xml>.
881     It is used later to produce output.
882    
883 dpavlin 368 my @ds = $webpac->data_structure($rec);
884 dpavlin 366
885 dpavlin 374 This method will also set C<$webpac->{'currnet_filename'}> if there is
886 dpavlin 398 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
887     <headline> tag.
888 dpavlin 374
889 dpavlin 366 =cut
890    
891 dpavlin 372 sub data_structure {
892 dpavlin 366 my $self = shift;
893    
894 dpavlin 372 my $log = $self->_get_logger();
895 dpavlin 366
896     my $rec = shift;
897 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
898 dpavlin 366
899 dpavlin 374 undef $self->{'currnet_filename'};
900 dpavlin 398 undef $self->{'headline'};
901 dpavlin 374
902 dpavlin 366 my @sorted_tags;
903     if ($self->{tags_by_order}) {
904     @sorted_tags = @{$self->{tags_by_order}};
905     } else {
906     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
907     $self->{tags_by_order} = \@sorted_tags;
908     }
909    
910 dpavlin 368 my @ds;
911 dpavlin 366
912 dpavlin 373 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
913    
914 dpavlin 366 foreach my $field (@sorted_tags) {
915    
916     my $row;
917    
918     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
919    
920     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
921 dpavlin 373 my $format = $tag->{'value'} || $tag->{'content'};
922 dpavlin 366
923 dpavlin 373 $log->debug("format: $format");
924    
925     my @v;
926     if ($format =~ /$LOOKUP_REGEX/o) {
927     @v = $self->fill_in_to_arr($rec,$format);
928     } else {
929     @v = $self->parse_to_arr($rec,$format);
930     }
931 dpavlin 367 next if (! @v);
932 dpavlin 366
933 dpavlin 439 if ($tag->{'sort'}) {
934 dpavlin 441 @v = $self->sort_arr(@v);
935 dpavlin 439 }
936    
937 dpavlin 375 # use format?
938     if ($tag->{'format_name'}) {
939     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
940     }
941    
942 dpavlin 398 if ($field eq 'filename') {
943     $self->{'current_filename'} = join('',@v);
944     $log->debug("filename: ",$self->{'current_filename'});
945     } elsif ($field eq 'headline') {
946     $self->{'headline'} .= join('',@v);
947     $log->debug("headline: ",$self->{'headline'});
948     next; # don't return headline in data_structure!
949     }
950    
951 dpavlin 439 # delimiter will join repeatable fields
952     if ($tag->{'delimiter'}) {
953     @v = ( join($tag->{'delimiter'}, @v) );
954 dpavlin 366 }
955 dpavlin 373
956 dpavlin 439 # default types
957     my @types = qw(display swish);
958     # override by type attribute
959     @types = ( $tag->{'type'} ) if ($tag->{'type'});
960 dpavlin 374
961 dpavlin 439 foreach my $type (@types) {
962     # append to previous line?
963     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
964     if ($tag->{'append'}) {
965    
966     # I will delimit appended part with
967     # delimiter (or ,)
968     my $d = $tag->{'delimiter'};
969     # default delimiter
970 dpavlin 501 $d ||= " ";
971 dpavlin 439
972     my $last = pop @{$row->{$type}};
973     $d = "" if (! $last);
974     $last .= $d . join($d, @v);
975     push @{$row->{$type}}, $last;
976    
977     } else {
978     push @{$row->{$type}}, @v;
979     }
980     }
981    
982    
983 dpavlin 366 }
984    
985 dpavlin 368 if ($row) {
986     $row->{'tag'} = $field;
987 dpavlin 375
988     # TODO: name_sigular, name_plural
989     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
990     $row->{'name'} = $name ? $self->_x($name) : $field;
991    
992 dpavlin 439 # post-sort all values in field
993     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
994     $log->warn("sort at field tag not implemented");
995     }
996    
997 dpavlin 368 push @ds, $row;
998 dpavlin 374
999 dpavlin 373 $log->debug("row $field: ",sub { Dumper($row) });
1000 dpavlin 368 }
1001 dpavlin 366
1002     }
1003    
1004 dpavlin 370 return @ds;
1005 dpavlin 366
1006     }
1007    
1008 dpavlin 370 =head2 output
1009    
1010     Create output from in-memory data structure using Template Toolkit template.
1011    
1012     my $text = $webpac->output( template => 'text.tt', data => @ds );
1013    
1014     =cut
1015    
1016     sub output {
1017     my $self = shift;
1018    
1019     my $args = {@_};
1020    
1021 dpavlin 372 my $log = $self->_get_logger();
1022 dpavlin 370
1023 dpavlin 372 $log->logconfess("need template name") if (! $args->{'template'});
1024     $log->logconfess("need data array") if (! $args->{'data'});
1025    
1026 dpavlin 370 my $out;
1027    
1028     $self->{'tt'}->process(
1029     $args->{'template'},
1030     $args,
1031     \$out
1032     ) || confess $self->{'tt'}->error();
1033    
1034     return $out;
1035     }
1036    
1037 dpavlin 411 =head2 output_file
1038    
1039     Create output from in-memory data structure using Template Toolkit template
1040     to a file.
1041    
1042     $webpac->output_file(
1043     file => 'out.txt',
1044     template => 'text.tt',
1045     data => @ds
1046     );
1047    
1048     =cut
1049    
1050     sub output_file {
1051     my $self = shift;
1052    
1053     my $args = {@_};
1054    
1055     my $log = $self->_get_logger();
1056    
1057 dpavlin 421 my $file = $args->{'file'} || $log->logconfess("need file name");
1058 dpavlin 411
1059 dpavlin 421 $log->debug("creating file ",$file);
1060 dpavlin 411
1061 dpavlin 421 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1062 dpavlin 411 print $fh $self->output(
1063     template => $args->{'template'},
1064     data => $args->{'data'},
1065     ) || $log->logdie("print: $!");
1066     close($fh) || $log->logdie("close: $!");
1067     }
1068    
1069 dpavlin 375 =head2 apply_format
1070    
1071     Apply format specified in tag with C<format_name="name"> and
1072     C<format_delimiter=";;">.
1073    
1074     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1075    
1076     Formats can contain C<lookup{...}> if you need them.
1077    
1078     =cut
1079    
1080     sub apply_format {
1081     my $self = shift;
1082    
1083     my ($name,$delimiter,$data) = @_;
1084    
1085     my $log = $self->_get_logger();
1086    
1087     if (! $self->{'import_xml'}->{'format'}->{$name}) {
1088     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1089     return $data;
1090     }
1091    
1092     $log->warn("no delimiter for format $name") if (! $delimiter);
1093    
1094     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1095    
1096     my @data = split(/\Q$delimiter\E/, $data);
1097    
1098     my $out = sprintf($format, @data);
1099     $log->debug("using format $name [$format] on $data to produce: $out");
1100    
1101     if ($out =~ m/$LOOKUP_REGEX/o) {
1102     return $self->lookup($out);
1103     } else {
1104     return $out;
1105     }
1106    
1107     }
1108    
1109    
1110 dpavlin 372 #
1111     #
1112     #
1113    
1114     =head1 INTERNAL METHODS
1115    
1116     Here is a quick list of internal methods, mostly useful to turn debugging
1117     on them (see L<LOGGING> below for explanation).
1118    
1119     =cut
1120    
1121     =head2 _eval
1122    
1123     Internal function to eval code without C<strict 'subs'>.
1124    
1125     =cut
1126    
1127     sub _eval {
1128     my $self = shift;
1129    
1130     my $code = shift || return;
1131    
1132     my $log = $self->_get_logger();
1133    
1134     no strict 'subs';
1135     my $ret = eval $code;
1136     if ($@) {
1137     $log->error("problem with eval code [$code]: $@");
1138     }
1139    
1140     $log->debug("eval: ",$code," [",$ret,"]");
1141    
1142 dpavlin 500 return $ret || undef;
1143 dpavlin 372 }
1144    
1145     =head2 _sort_by_order
1146    
1147     Sort xml tags data structure accoding to C<order=""> attribute.
1148    
1149     =cut
1150    
1151     sub _sort_by_order {
1152     my $self = shift;
1153    
1154     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1155     $self->{'import_xml'}->{'indexer'}->{$a};
1156     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1157     $self->{'import_xml'}->{'indexer'}->{$b};
1158    
1159     return $va <=> $vb;
1160     }
1161    
1162 dpavlin 375 =head2 _get_logger
1163    
1164     Get C<Log::Log4perl> object with a twist: domains are defined for each
1165     method
1166    
1167     my $log = $webpac->_get_logger();
1168    
1169     =cut
1170    
1171 dpavlin 372 sub _get_logger {
1172     my $self = shift;
1173    
1174 dpavlin 374 my $name = (caller(1))[3] || caller;
1175     return get_logger($name);
1176 dpavlin 372 }
1177    
1178 dpavlin 375 =head2 _x
1179    
1180     Convert string from UTF-8 to code page defined in C<import_xml>.
1181    
1182     my $text = $webpac->_x('utf8 text');
1183    
1184     =cut
1185    
1186     sub _x {
1187     my $self = shift;
1188     my $utf8 = shift || return;
1189    
1190     return $self->{'utf2cp'}->convert($utf8) ||
1191     $self->_get_logger()->logwarn("can't convert '$utf8'");
1192     }
1193    
1194 dpavlin 372 #
1195     #
1196     #
1197    
1198     =head1 LOGGING
1199    
1200     Logging in WebPAC is performed by L<Log::Log4perl> with config file
1201     C<log.conf>.
1202    
1203     Methods defined above have different levels of logging, so
1204     it's descriptions will be useful to turn (mostry B<debug> logging) on
1205     or off to see why WabPAC isn't perforing as you expect it (it might even
1206     be a bug!).
1207    
1208     B<This is different from normal Log4perl behaviour>. To repeat, you can
1209     also use method names, and not only classes (which are just few)
1210     to filter logging.
1211    
1212 dpavlin 422
1213     =head1 MEMORY USAGE
1214    
1215     C<low_mem> options is double-edged sword. If enabled, WebPAC
1216     will run on memory constraint machines (which doesn't have enough
1217     physical RAM to create memory structure for whole source database).
1218    
1219     If your machine has 512Mb or more of RAM and database is around 10000 records,
1220     memory shouldn't be an issue. If you don't have enough physical RAM, you
1221     might consider using virtual memory (if your operating system is handling it
1222     well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1223     parsed structure of ISIS database (this is what C<low_mem> option does).
1224    
1225     Hitting swap at end of reading source database is probably o.k. However,
1226     hitting swap before 90% will dramatically decrease performance and you will
1227     be better off with C<low_mem> and using rest of availble memory for
1228     operating system disk cache (Linux is particuallary good about this).
1229     However, every access to database record will require disk access, so
1230     generation phase will be slower 10-100 times.
1231    
1232     Parsed structures are essential - you just have option to trade RAM memory
1233     (which is fast) for disk space (which is slow). Be sure to have planty of
1234     disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1235    
1236     However, when WebPAC is running on desktop machines (or laptops :-), it's
1237     highly undesireable for system to start swapping. Using C<low_mem> option can
1238     reduce WecPAC memory usage to around 64Mb for same database with lookup
1239     fields and sorted indexes which stay in RAM. Performance will suffer, but
1240     memory usage will really be minimal. It might be also more confortable to
1241     run WebPAC reniced on those machines.
1242    
1243 dpavlin 372 =cut
1244    
1245 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26