/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 707 - (hide annotations)
Wed Jul 13 23:36:53 2005 UTC (14 years, 6 months ago) by dpavlin
File size: 28525 byte(s)
small fix

1 dpavlin 354 package WebPAC;
2 dpavlin 352
3 dpavlin 367 use warnings;
4     use strict;
5    
6 dpavlin 352 use Carp;
7 dpavlin 353 use Text::Iconv;
8     use Config::IniFiles;
9 dpavlin 363 use XML::Simple;
10 dpavlin 370 use Template;
11 dpavlin 372 use Log::Log4perl qw(get_logger :levels);
12 dpavlin 422 use Time::HiRes qw(time);
13 dpavlin 352
14 dpavlin 358 use Data::Dumper;
15    
16 dpavlin 705 my ($have_biblio_isis, $have_openisis) = (0,0);
17    
18     eval "use Biblio::Isis 0.13;";
19     unless ($@) {
20     $have_biblio_isis = 1
21     } else {
22     eval "use OpenIsis;";
23     $have_openisis = 1 unless ($@);
24     }
25    
26 dpavlin 373 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
27     #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
28     my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
29     my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
30    
31 dpavlin 352 =head1 NAME
32    
33 dpavlin 354 WebPAC - base class for WebPAC
34 dpavlin 352
35     =head1 DESCRIPTION
36    
37 dpavlin 354 This module implements methods used by WebPAC.
38 dpavlin 352
39     =head1 METHODS
40    
41     =head2 new
42    
43 dpavlin 421 Create new instance of WebPAC using configuration specified by C<config_file>.
44 dpavlin 352
45 dpavlin 354 my $webpac = new WebPAC(
46 dpavlin 352 config_file => 'name.conf',
47 dpavlin 439 code_page => 'ISO-8859-2',
48     low_mem => 1,
49 dpavlin 500 filter => {
50     'lower' => sub { lc($_[0]) },
51     },
52 dpavlin 352 );
53    
54     Default C<code_page> is C<ISO-8859-2>.
55    
56 dpavlin 422 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
57    
58 dpavlin 500 There is optinal parametar C<filter> which specify different filters which
59     can be applied using C<filter{name}> notation.
60 dpavlin 560 Same filters can be used in Template Toolkit files.
61 dpavlin 500
62 dpavlin 421 This method will also read configuration files
63 dpavlin 353 C<global.conf> (used by indexer and Web font-end)
64     and configuration file specified by C<config_file>
65     which describes databases to be indexed.
66    
67 dpavlin 352 =cut
68    
69 dpavlin 363 # mapping between data type and tag which specify
70     # format in XML file
71     my %type2tag = (
72     'isis' => 'isis',
73     # 'excel' => 'column',
74     # 'marc' => 'marc',
75     # 'feed' => 'feed'
76     );
77    
78 dpavlin 352 sub new {
79     my $class = shift;
80     my $self = {@_};
81     bless($self, $class);
82    
83 dpavlin 422 $self->{'start_t'} = time();
84    
85 dpavlin 372 my $log_file = $self->{'log'} || "log.conf";
86     Log::Log4perl->init($log_file);
87    
88     my $log = $self->_get_logger();
89    
90 dpavlin 352 # fill in default values
91     # output codepage
92     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
93    
94 dpavlin 353 #
95     # read global.conf
96     #
97 dpavlin 372 $log->debug("read 'global.conf'");
98 dpavlin 352
99 dpavlin 372 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
100 dpavlin 352
101     # read global config parametars
102     foreach my $var (qw(
103     dbi_dbd
104     dbi_dsn
105     dbi_user
106     dbi_passwd
107     show_progress
108     my_unac_filter
109 dpavlin 370 output_template
110 dpavlin 352 )) {
111 dpavlin 370 $self->{'global_config'}->{$var} = $config->val('global', $var);
112 dpavlin 352 }
113    
114 dpavlin 353 #
115     # read indexer config file
116     #
117 dpavlin 352
118 dpavlin 372 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
119 dpavlin 352
120 dpavlin 370 # create UTF-8 convertor for import_xml files
121 dpavlin 366 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
122 dpavlin 370
123     # create Template toolkit instance
124     $self->{'tt'} = Template->new(
125     INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
126 dpavlin 560 FILTERS => $self->{'filter'},
127 dpavlin 370 EVAL_PERL => 1,
128     );
129    
130 dpavlin 421 # running with low_mem flag? well, use DBM::Deep then.
131     if ($self->{'low_mem'}) {
132 dpavlin 422 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
133 dpavlin 421
134     my $db_file = "data.db";
135    
136     if (-e $db_file) {
137     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
138     $log->debug("removed '$db_file' from last run");
139     }
140    
141 dpavlin 434 require DBM::Deep;
142 dpavlin 421
143     my $db = new DBM::Deep $db_file;
144    
145     $log->logdie("DBM::Deep error: $!") unless ($db);
146    
147     if ($db->error()) {
148     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
149     } else {
150 dpavlin 422 $log->debug("using file '$db_file' for DBM::Deep");
151 dpavlin 421 }
152    
153     $self->{'db'} = $db;
154     }
155    
156 dpavlin 500 $log->debug("filters defined: ",Dumper($self->{'filter'}));
157    
158 dpavlin 352 return $self;
159     }
160    
161     =head2 open_isis
162    
163 dpavlin 609 Open CDS/ISIS, WinISIS or IsisMarc database using IsisDB or OpenIsis module
164     and read all records to memory.
165 dpavlin 352
166     $webpac->open_isis(
167     filename => '/data/ISIS/ISIS',
168     code_page => '852',
169 dpavlin 431 limit_mfn => 500,
170     start_mfn => 6000,
171 dpavlin 352 lookup => [ ... ],
172     );
173    
174     By default, ISIS code page is assumed to be C<852>.
175    
176 dpavlin 431 If optional parametar C<start_mfn> is set, this will be first MFN to read
177     from database (so you can skip beginning of your database if you need to).
178    
179 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
180     from database in example above.
181 dpavlin 352
182     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
183     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
184     value in index.
185    
186     lookup => [
187     { 'key' => 'd:v900', 'val' => 'v250^a' },
188     { 'eval' => '"v901^a" eq "Podruèje"',
189     'key' => 'pa:v561^4:v562^4:v461^1',
190     'val' => 'v900' },
191     ]
192    
193 dpavlin 367 Returns number of last record read into memory (size of database, really).
194    
195 dpavlin 352 =cut
196    
197     sub open_isis {
198     my $self = shift;
199     my $arg = {@_};
200    
201 dpavlin 372 my $log = $self->_get_logger();
202    
203     $log->logcroak("need filename") if (! $arg->{'filename'});
204 dpavlin 352 my $code_page = $arg->{'code_page'} || '852';
205    
206 dpavlin 436 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
207    
208 dpavlin 421 # store data in object
209     $self->{'isis_filename'} = $arg->{'filename'};
210     $self->{'isis_code_page'} = $code_page;
211    
212 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
213    
214     # create Text::Iconv object
215     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
216    
217 dpavlin 372 $log->info("reading ISIS database '",$arg->{'filename'},"'");
218 dpavlin 421 $log->debug("isis code page: $code_page");
219 dpavlin 363
220 dpavlin 609 my ($isis_db,$maxmfn);
221    
222 dpavlin 705 if ($have_openisis) {
223     $log->debug("using OpenIsis perl bindings");
224 dpavlin 609 $isis_db = OpenIsis::open($arg->{'filename'});
225     $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
226 dpavlin 705 } elsif ($have_biblio_isis) {
227     $log->debug("using Biblio::Isis");
228     use Biblio::Isis;
229     $isis_db = new Biblio::Isis(
230 dpavlin 609 isisdb => $arg->{'filename'},
231     include_deleted => 1,
232     hash_filter => sub {
233     my $l = shift || return;
234     $l = $cp->convert($l);
235     return $l;
236     },
237     );
238 dpavlin 707 $maxmfn = $isis_db->count;
239    
240     unless ($maxmfn) {
241     $log->logwarn("no records in database ", $arg->{'filename'}, ", skipping...");
242     return;
243     }
244    
245 dpavlin 705 } else {
246     $log->logdie("Can't find supported ISIS library for perl. I suggent that you install Bilbio::Isis from CPAN.");
247 dpavlin 609 }
248    
249    
250 dpavlin 431 my $startmfn = 1;
251 dpavlin 352
252 dpavlin 431 if (my $s = $self->{'start_mfn'}) {
253     $log->info("skipping to MFN $s");
254     $startmfn = $s;
255 dpavlin 448 } else {
256     $self->{'start_mfn'} = $startmfn;
257 dpavlin 431 }
258 dpavlin 357
259 dpavlin 431 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
260 dpavlin 363
261 dpavlin 705 $log->info("processing ",($maxmfn-$startmfn)." records using ",( $have_openisis ? 'OpenIsis' : 'Biblio::Isis'));
262 dpavlin 431
263 dpavlin 609
264 dpavlin 352 # read database
265 dpavlin 431 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
266 dpavlin 352
267 dpavlin 418
268     $log->debug("mfn: $mfn\n");
269    
270 dpavlin 421 my $rec;
271    
272 dpavlin 705 if ($have_openisis) {
273 dpavlin 609
274     # read record using OpenIsis
275     my $row = OpenIsis::read( $isis_db, $mfn );
276     foreach my $k (keys %{$row}) {
277     if ($k ne "mfn") {
278     foreach my $l (@{$row->{$k}}) {
279     $l = $cp->convert($l);
280     # has subfields?
281     my $val;
282     if ($l =~ m/\^/) {
283     foreach my $t (split(/\^/,$l)) {
284     next if (! $t);
285     $val->{substr($t,0,1)} = substr($t,1);
286     }
287     } else {
288     $val = $l;
289 dpavlin 352 }
290 dpavlin 609
291     push @{$rec->{$k}}, $val;
292 dpavlin 352 }
293 dpavlin 609 } else {
294     push @{$rec->{'000'}}, $mfn;
295 dpavlin 352 }
296     }
297    
298 dpavlin 705 } elsif ($have_biblio_isis) {
299     $rec = $isis_db->to_hash($mfn);
300 dpavlin 609 } else {
301 dpavlin 705 $log->logdie("hum? implementation missing?");
302 dpavlin 352 }
303    
304 dpavlin 421 $log->confess("record $mfn empty?") unless ($rec);
305    
306     # store
307     if ($self->{'low_mem'}) {
308     $self->{'db'}->put($mfn, $rec);
309     } else {
310     $self->{'data'}->{$mfn} = $rec;
311     }
312    
313 dpavlin 352 # create lookup
314 dpavlin 355 $self->create_lookup($rec, @{$arg->{'lookup'}});
315 dpavlin 352
316 dpavlin 389 $self->progress_bar($mfn,$maxmfn);
317    
318 dpavlin 352 }
319    
320 dpavlin 448 $self->{'current_mfn'} = -1;
321 dpavlin 389 $self->{'last_pcnt'} = 0;
322 dpavlin 362
323 dpavlin 418 $log->debug("max mfn: $maxmfn");
324    
325 dpavlin 352 # store max mfn and return it.
326     return $self->{'max_mfn'} = $maxmfn;
327     }
328    
329 dpavlin 362 =head2 fetch_rec
330    
331     Fetch next record from database. It will also display progress bar (once
332     it's implemented, that is).
333    
334     my $rec = $webpac->fetch_rec;
335    
336     =cut
337    
338     sub fetch_rec {
339     my $self = shift;
340    
341 dpavlin 372 my $log = $self->_get_logger();
342 dpavlin 362
343 dpavlin 448 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
344 dpavlin 372
345 dpavlin 448 if ($self->{'current_mfn'} == -1) {
346     $self->{'current_mfn'} = $self->{'start_mfn'};
347     } else {
348     $self->{'current_mfn'}++;
349     }
350    
351     my $mfn = $self->{'current_mfn'};
352    
353 dpavlin 362 if ($mfn > $self->{'max_mfn'}) {
354     $self->{'current_mfn'} = $self->{'max_mfn'};
355 dpavlin 373 $log->debug("at EOF");
356 dpavlin 362 return;
357     }
358    
359 dpavlin 389 $self->progress_bar($mfn,$self->{'max_mfn'});
360    
361 dpavlin 421 if ($self->{'low_mem'}) {
362     return $self->{'db'}->get($mfn);
363     } else {
364     return $self->{'data'}->{$mfn};
365     }
366 dpavlin 362 }
367    
368 dpavlin 448 =head2 mfn
369    
370     Returns current record number (MFN).
371    
372     print $webpac->mfn;
373    
374     =cut
375    
376     sub mfn {
377     my $self = shift;
378     return $self->{'current_mfn'};
379     }
380    
381 dpavlin 389 =head2 progress_bar
382    
383     Draw progress bar on STDERR.
384    
385     $webpac->progress_bar($current, $max);
386    
387     =cut
388    
389     sub progress_bar {
390     my $self = shift;
391    
392     my ($curr,$max) = @_;
393    
394     my $log = $self->_get_logger();
395    
396     $log->logconfess("no current value!") if (! $curr);
397     $log->logconfess("no maximum value!") if (! $max);
398    
399     if ($curr > $max) {
400     $max = $curr;
401     $log->debug("overflow to $curr");
402     }
403    
404     $self->{'last_pcnt'} ||= 1;
405    
406 dpavlin 454 my $p = int($curr * 100 / $max) || 1;
407 dpavlin 389
408 dpavlin 422 # reset on re-run
409     if ($p < $self->{'last_pcnt'}) {
410     $self->{'last_pcnt'} = $p;
411 dpavlin 609 $self->{'start_t'} = time();
412 dpavlin 422 }
413    
414 dpavlin 389 if ($p != $self->{'last_pcnt'}) {
415 dpavlin 422
416     my $t = time();
417 dpavlin 609 my $rate = ($curr / ($t - $self->{'start_t'} || 1));
418 dpavlin 422 my $eta = ($max-$curr) / ($rate || 1);
419     printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
420 dpavlin 389 $self->{'last_pcnt'} = $p;
421 dpavlin 422 $self->{'last_curr'} = $curr;
422 dpavlin 389 }
423 dpavlin 412 print STDERR "\n" if ($p == 100);
424 dpavlin 389 }
425    
426 dpavlin 422 =head2 fmt_time
427    
428     Format time (in seconds) for display.
429    
430     print $webpac->fmt_time(time());
431    
432     This method is called by L<progress_bar> to display remaining time.
433    
434     =cut
435    
436     sub fmt_time {
437     my $self = shift;
438    
439     my $t = shift || 0;
440     my $out = "";
441    
442     my ($ss,$mm,$hh) = gmtime($t);
443     $out .= "${hh}h" if ($hh);
444     $out .= sprintf("%02d:%02d", $mm,$ss);
445     $out .= " " if ($hh == 0);
446     return $out;
447     }
448    
449 dpavlin 363 =head2 open_import_xml
450    
451     Read file from C<import_xml/> directory and parse it.
452    
453     $webpac->open_import_xml(type => 'isis');
454    
455     =cut
456    
457     sub open_import_xml {
458     my $self = shift;
459    
460 dpavlin 372 my $log = $self->_get_logger();
461    
462 dpavlin 363 my $arg = {@_};
463 dpavlin 372 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
464 dpavlin 363
465 dpavlin 366 $self->{'type'} = $arg->{'type'};
466 dpavlin 363
467 dpavlin 366 my $type_base = $arg->{'type'};
468 dpavlin 363 $type_base =~ s/_.*$//g;
469    
470 dpavlin 366 $self->{'tag'} = $type2tag{$type_base};
471    
472 dpavlin 375 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
473 dpavlin 366
474     my $f = "./import_xml/".$self->{'type'}.".xml";
475 dpavlin 372 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
476 dpavlin 363
477 dpavlin 375 $log->info("reading '$f'");
478 dpavlin 363
479 dpavlin 375 $self->{'import_xml_file'} = $f;
480    
481 dpavlin 363 $self->{'import_xml'} = XMLin($f,
482 dpavlin 366 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
483 dpavlin 363 );
484    
485 dpavlin 375 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
486    
487 dpavlin 363 }
488    
489 dpavlin 355 =head2 create_lookup
490    
491     Create lookup from record using lookup definition.
492    
493 dpavlin 367 $self->create_lookup($rec, @lookups);
494    
495     Called internally by C<open_*> methods.
496    
497 dpavlin 355 =cut
498    
499     sub create_lookup {
500     my $self = shift;
501    
502 dpavlin 372 my $log = $self->_get_logger();
503 dpavlin 355
504 dpavlin 372 my $rec = shift || $log->logconfess("need record to create lookup");
505     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
506    
507 dpavlin 355 foreach my $i (@_) {
508 dpavlin 418 $log->logconfess("need key") unless defined($i->{'key'});
509     $log->logconfess("need val") unless defined($i->{'val'});
510    
511     if (defined($i->{'eval'})) {
512     # eval first, so we can skip fill_in for key and val
513     my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
514     if ($self->_eval($eval)) {
515     my $key = $self->fill_in($rec,$i->{'key'}) || next;
516     my @val = $self->fill_in($rec,$i->{'val'}) || next;
517 dpavlin 373 $log->debug("stored $key = ",sub { join(" | ",@val) });
518 dpavlin 355 push @{$self->{'lookup'}->{$key}}, @val;
519     }
520     } else {
521 dpavlin 418 my $key = $self->fill_in($rec,$i->{'key'}) || next;
522     my @val = $self->fill_in($rec,$i->{'val'}) || next;
523     $log->debug("stored $key = ",sub { join(" | ",@val) });
524     push @{$self->{'lookup'}->{$key}}, @val;
525 dpavlin 355 }
526     }
527     }
528    
529 dpavlin 356 =head2 get_data
530    
531     Returns value from record.
532    
533 dpavlin 367 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
534 dpavlin 356
535     Arguments are:
536     record reference C<$rec>,
537     field C<$f>,
538     optional subfiled C<$sf>,
539     index for repeatable values C<$i>.
540    
541 dpavlin 367 Optinal variable C<$found> will be incremeted if there
542 dpavlin 356 is field.
543    
544     Returns value or empty string.
545    
546     =cut
547    
548     sub get_data {
549     my $self = shift;
550    
551     my ($rec,$f,$sf,$i,$found) = @_;
552 dpavlin 367
553 dpavlin 356 if ($$rec->{$f}) {
554 dpavlin 367 return '' if (! $$rec->{$f}->[$i]);
555 dpavlin 389 no strict 'refs';
556 dpavlin 356 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
557     $$found++ if (defined($$found));
558     return $$rec->{$f}->[$i]->{$sf};
559     } elsif ($$rec->{$f}->[$i]) {
560     $$found++ if (defined($$found));
561 dpavlin 366 # it still might have subfield, just
562     # not specified, so we'll dump all
563     if ($$rec->{$f}->[$i] =~ /HASH/o) {
564     my $out;
565     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
566     $out .= $$rec->{$f}->[$i]->{$k}." ";
567     }
568     return $out;
569     } else {
570     return $$rec->{$f}->[$i];
571     }
572 dpavlin 356 }
573     } else {
574     return '';
575     }
576     }
577    
578 dpavlin 352 =head2 fill_in
579    
580     Workhourse of all: takes record from in-memory structure of database and
581     strings with placeholders and returns string or array of with substituted
582     values from record.
583    
584 dpavlin 367 my $text = $webpac->fill_in($rec,'v250^a');
585 dpavlin 352
586     Optional argument is ordinal number for repeatable fields. By default,
587 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
588     element is 0).
589     Following example will read second value from repeatable field.
590 dpavlin 352
591 dpavlin 367 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
592 dpavlin 353
593     This function B<does not> perform parsing of format to inteligenty skip
594     delimiters before fields which aren't used.
595    
596 dpavlin 376 This method will automatically decode UTF-8 string to local code page
597     if needed.
598    
599 dpavlin 352 =cut
600    
601 dpavlin 372 sub fill_in {
602 dpavlin 371 my $self = shift;
603    
604 dpavlin 372 my $log = $self->_get_logger();
605 dpavlin 371
606 dpavlin 372 my $rec = shift || $log->logconfess("need data record");
607     my $format = shift || $log->logconfess("need format to parse");
608 dpavlin 352 # iteration (for repeatable fields)
609     my $i = shift || 0;
610    
611 dpavlin 555 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
612    
613 dpavlin 352 # FIXME remove for speedup?
614 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
615 dpavlin 352
616 dpavlin 376 if (utf8::is_utf8($format)) {
617     $format = $self->_x($format);
618     }
619    
620 dpavlin 352 my $found = 0;
621    
622 dpavlin 359 my $eval_code;
623     # remove eval{...} from beginning
624     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
625    
626 dpavlin 500 my $filter_name;
627     # remove filter{...} from beginning
628     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
629    
630 dpavlin 352 # do actual replacement of placeholders
631 dpavlin 555 # repeatable fields
632 dpavlin 373 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
633 dpavlin 555 # non-repeatable fields
634     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
635 dpavlin 352
636 dpavlin 353 if ($found) {
637 dpavlin 373 $log->debug("format: $format");
638 dpavlin 359 if ($eval_code) {
639     my $eval = $self->fill_in($rec,$eval_code,$i);
640 dpavlin 371 return if (! $self->_eval($eval));
641 dpavlin 359 }
642 dpavlin 500 if ($filter_name && $self->{'filter'}->{$filter_name}) {
643     $log->debug("filter '$filter_name' for $format");
644     $format = $self->{'filter'}->{$filter_name}->($format);
645     return unless(defined($format));
646     $log->debug("filter result: $format");
647     }
648 dpavlin 353 # do we have lookups?
649 dpavlin 373 if ($format =~ /$LOOKUP_REGEX/o) {
650     $log->debug("format '$format' has lookup");
651 dpavlin 353 return $self->lookup($format);
652     } else {
653     return $format;
654     }
655 dpavlin 352 } else {
656     return;
657     }
658     }
659    
660     =head2 lookup
661    
662 dpavlin 355 Perform lookups on format supplied to it.
663 dpavlin 352
664 dpavlin 367 my $text = $self->lookup('[v900]');
665 dpavlin 352
666 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
667    
668 dpavlin 352 =cut
669    
670     sub lookup {
671     my $self = shift;
672    
673 dpavlin 372 my $log = $self->_get_logger();
674 dpavlin 352
675 dpavlin 372 my $tmp = shift || $log->logconfess("need format");
676    
677 dpavlin 373 if ($tmp =~ /$LOOKUP_REGEX/o) {
678 dpavlin 352 my @in = ( $tmp );
679 dpavlin 372
680     $log->debug("lookup for: ",$tmp);
681    
682 dpavlin 352 my @out;
683     while (my $f = shift @in) {
684 dpavlin 373 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
685 dpavlin 352 my $k = $1;
686     if ($self->{'lookup'}->{$k}) {
687     foreach my $nv (@{$self->{'lookup'}->{$k}}) {
688     my $tmp2 = $f;
689 dpavlin 373 $tmp2 =~ s/lookup{$k}/$nv/g;
690 dpavlin 352 push @in, $tmp2;
691     }
692     } else {
693     undef $f;
694     }
695     } elsif ($f) {
696     push @out, $f;
697     }
698     }
699 dpavlin 373 $log->logconfess("return is array and it's not expected!") unless wantarray;
700 dpavlin 352 return @out;
701     } else {
702     return $tmp;
703     }
704     }
705    
706 dpavlin 356 =head2 parse
707    
708     Perform smart parsing of string, skipping delimiters for fields which aren't
709     defined. It can also eval code in format starting with C<eval{...}> and
710     return output or nothing depending on eval code.
711    
712 dpavlin 367 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
713 dpavlin 356
714     =cut
715    
716     sub parse {
717     my $self = shift;
718    
719 dpavlin 366 my ($rec, $format_utf8, $i) = @_;
720 dpavlin 356
721 dpavlin 366 return if (! $format_utf8);
722    
723 dpavlin 372 my $log = $self->_get_logger();
724 dpavlin 358
725 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
726     $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
727    
728 dpavlin 358 $i = 0 if (! $i);
729    
730 dpavlin 375 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
731 dpavlin 366
732 dpavlin 356 my @out;
733    
734 dpavlin 373 $log->debug("format: $format");
735    
736 dpavlin 356 my $eval_code;
737     # remove eval{...} from beginning
738     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
739    
740 dpavlin 500 my $filter_name;
741     # remove filter{...} from beginning
742     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
743    
744 dpavlin 358 my $prefix;
745     my $all_found=0;
746 dpavlin 356
747 dpavlin 555 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
748 dpavlin 356
749 dpavlin 358 my $del = $1 || '';
750 dpavlin 359 $prefix ||= $del if ($all_found == 0);
751 dpavlin 358
752 dpavlin 555 # repeatable index
753     my $r = $i;
754     $r = 0 if (lc("$2") eq 's');
755    
756 dpavlin 358 my $found = 0;
757 dpavlin 555 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
758 dpavlin 358
759 dpavlin 356 if ($found) {
760 dpavlin 358 push @out, $del;
761     push @out, $tmp;
762     $all_found += $found;
763 dpavlin 356 }
764     }
765    
766 dpavlin 358 return if (! $all_found);
767 dpavlin 356
768 dpavlin 373 my $out = join('',@out);
769 dpavlin 358
770 dpavlin 373 if ($out) {
771     # add rest of format (suffix)
772     $out .= $format;
773 dpavlin 367
774 dpavlin 373 # add prefix if not there
775     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
776    
777     $log->debug("result: $out");
778     }
779    
780 dpavlin 359 if ($eval_code) {
781 dpavlin 501 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
782 dpavlin 500 $log->debug("about to eval{$eval} format: $out");
783 dpavlin 371 return if (! $self->_eval($eval));
784 dpavlin 359 }
785 dpavlin 500
786     if ($filter_name && $self->{'filter'}->{$filter_name}) {
787     $log->debug("about to filter{$filter_name} format: $out");
788     $out = $self->{'filter'}->{$filter_name}->($out);
789     return unless(defined($out));
790     $log->debug("filter result: $out");
791     }
792 dpavlin 359
793 dpavlin 358 return $out;
794 dpavlin 356 }
795    
796 dpavlin 367 =head2 parse_to_arr
797    
798     Similar to C<parse>, but returns array of all repeatable fields
799    
800     my @arr = $webpac->parse_to_arr($rec,'v250^a');
801    
802     =cut
803    
804     sub parse_to_arr {
805     my $self = shift;
806    
807     my ($rec, $format_utf8) = @_;
808    
809 dpavlin 372 my $log = $self->_get_logger();
810    
811     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
812 dpavlin 367 return if (! $format_utf8);
813    
814     my $i = 0;
815     my @arr;
816    
817     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
818     push @arr, $v;
819     }
820    
821 dpavlin 373 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
822    
823 dpavlin 367 return @arr;
824     }
825    
826 dpavlin 373 =head2 fill_in_to_arr
827    
828     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
829     for fields which have lookups, so they shouldn't be parsed but rather
830     C<fill_id>ed.
831    
832     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
833    
834     =cut
835    
836     sub fill_in_to_arr {
837     my $self = shift;
838    
839     my ($rec, $format_utf8) = @_;
840    
841     my $log = $self->_get_logger();
842    
843     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
844     return if (! $format_utf8);
845    
846     my $i = 0;
847     my @arr;
848    
849     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
850     push @arr, @v;
851     }
852    
853     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
854    
855     return @arr;
856     }
857    
858 dpavlin 441 =head2 sort_arr
859 dpavlin 373
860 dpavlin 441 Sort array ignoring case and html in data
861    
862     my @sorted = $webpac->sort_arr(@unsorted);
863    
864     =cut
865    
866     sub sort_arr {
867     my $self = shift;
868    
869     my $log = $self->_get_logger();
870    
871     # FIXME add Schwartzian Transformation?
872    
873     my @sorted = sort {
874     $a =~ s#<[^>]+/*>##;
875     $b =~ s#<[^>]+/*>##;
876     lc($b) cmp lc($a)
877     } @_;
878     $log->debug("sorted values: ",sub { join(", ",@sorted) });
879    
880     return @sorted;
881     }
882    
883    
884 dpavlin 366 =head2 data_structure
885    
886     Create in-memory data structure which represents layout from C<import_xml>.
887     It is used later to produce output.
888    
889 dpavlin 368 my @ds = $webpac->data_structure($rec);
890 dpavlin 366
891 dpavlin 374 This method will also set C<$webpac->{'currnet_filename'}> if there is
892 dpavlin 398 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
893     <headline> tag.
894 dpavlin 374
895 dpavlin 366 =cut
896    
897 dpavlin 372 sub data_structure {
898 dpavlin 366 my $self = shift;
899    
900 dpavlin 372 my $log = $self->_get_logger();
901 dpavlin 366
902     my $rec = shift;
903 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
904 dpavlin 366
905 dpavlin 374 undef $self->{'currnet_filename'};
906 dpavlin 398 undef $self->{'headline'};
907 dpavlin 374
908 dpavlin 366 my @sorted_tags;
909     if ($self->{tags_by_order}) {
910     @sorted_tags = @{$self->{tags_by_order}};
911     } else {
912     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
913     $self->{tags_by_order} = \@sorted_tags;
914     }
915    
916 dpavlin 368 my @ds;
917 dpavlin 366
918 dpavlin 373 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
919    
920 dpavlin 366 foreach my $field (@sorted_tags) {
921    
922     my $row;
923    
924     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
925    
926     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
927 dpavlin 373 my $format = $tag->{'value'} || $tag->{'content'};
928 dpavlin 366
929 dpavlin 373 $log->debug("format: $format");
930    
931     my @v;
932     if ($format =~ /$LOOKUP_REGEX/o) {
933     @v = $self->fill_in_to_arr($rec,$format);
934     } else {
935     @v = $self->parse_to_arr($rec,$format);
936     }
937 dpavlin 367 next if (! @v);
938 dpavlin 366
939 dpavlin 439 if ($tag->{'sort'}) {
940 dpavlin 441 @v = $self->sort_arr(@v);
941 dpavlin 439 }
942    
943 dpavlin 375 # use format?
944     if ($tag->{'format_name'}) {
945     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
946     }
947    
948 dpavlin 398 if ($field eq 'filename') {
949     $self->{'current_filename'} = join('',@v);
950     $log->debug("filename: ",$self->{'current_filename'});
951     } elsif ($field eq 'headline') {
952     $self->{'headline'} .= join('',@v);
953     $log->debug("headline: ",$self->{'headline'});
954     next; # don't return headline in data_structure!
955     }
956    
957 dpavlin 439 # delimiter will join repeatable fields
958     if ($tag->{'delimiter'}) {
959     @v = ( join($tag->{'delimiter'}, @v) );
960 dpavlin 366 }
961 dpavlin 373
962 dpavlin 439 # default types
963     my @types = qw(display swish);
964     # override by type attribute
965     @types = ( $tag->{'type'} ) if ($tag->{'type'});
966 dpavlin 374
967 dpavlin 439 foreach my $type (@types) {
968     # append to previous line?
969     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
970     if ($tag->{'append'}) {
971    
972     # I will delimit appended part with
973     # delimiter (or ,)
974     my $d = $tag->{'delimiter'};
975     # default delimiter
976 dpavlin 501 $d ||= " ";
977 dpavlin 439
978     my $last = pop @{$row->{$type}};
979     $d = "" if (! $last);
980     $last .= $d . join($d, @v);
981     push @{$row->{$type}}, $last;
982    
983     } else {
984     push @{$row->{$type}}, @v;
985     }
986     }
987    
988    
989 dpavlin 366 }
990    
991 dpavlin 368 if ($row) {
992     $row->{'tag'} = $field;
993 dpavlin 375
994     # TODO: name_sigular, name_plural
995     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
996     $row->{'name'} = $name ? $self->_x($name) : $field;
997    
998 dpavlin 439 # post-sort all values in field
999     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
1000     $log->warn("sort at field tag not implemented");
1001     }
1002    
1003 dpavlin 368 push @ds, $row;
1004 dpavlin 374
1005 dpavlin 373 $log->debug("row $field: ",sub { Dumper($row) });
1006 dpavlin 368 }
1007 dpavlin 366
1008     }
1009    
1010 dpavlin 370 return @ds;
1011 dpavlin 366
1012     }
1013    
1014 dpavlin 370 =head2 output
1015    
1016     Create output from in-memory data structure using Template Toolkit template.
1017    
1018     my $text = $webpac->output( template => 'text.tt', data => @ds );
1019    
1020     =cut
1021    
1022     sub output {
1023     my $self = shift;
1024    
1025     my $args = {@_};
1026    
1027 dpavlin 372 my $log = $self->_get_logger();
1028 dpavlin 370
1029 dpavlin 372 $log->logconfess("need template name") if (! $args->{'template'});
1030     $log->logconfess("need data array") if (! $args->{'data'});
1031    
1032 dpavlin 370 my $out;
1033    
1034     $self->{'tt'}->process(
1035     $args->{'template'},
1036     $args,
1037     \$out
1038     ) || confess $self->{'tt'}->error();
1039    
1040     return $out;
1041     }
1042    
1043 dpavlin 411 =head2 output_file
1044    
1045     Create output from in-memory data structure using Template Toolkit template
1046     to a file.
1047    
1048     $webpac->output_file(
1049     file => 'out.txt',
1050     template => 'text.tt',
1051     data => @ds
1052     );
1053    
1054     =cut
1055    
1056     sub output_file {
1057     my $self = shift;
1058    
1059     my $args = {@_};
1060    
1061     my $log = $self->_get_logger();
1062    
1063 dpavlin 421 my $file = $args->{'file'} || $log->logconfess("need file name");
1064 dpavlin 411
1065 dpavlin 421 $log->debug("creating file ",$file);
1066 dpavlin 411
1067 dpavlin 421 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1068 dpavlin 411 print $fh $self->output(
1069     template => $args->{'template'},
1070     data => $args->{'data'},
1071     ) || $log->logdie("print: $!");
1072     close($fh) || $log->logdie("close: $!");
1073     }
1074    
1075 dpavlin 375 =head2 apply_format
1076    
1077     Apply format specified in tag with C<format_name="name"> and
1078     C<format_delimiter=";;">.
1079    
1080     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1081    
1082     Formats can contain C<lookup{...}> if you need them.
1083    
1084     =cut
1085    
1086     sub apply_format {
1087     my $self = shift;
1088    
1089     my ($name,$delimiter,$data) = @_;
1090    
1091     my $log = $self->_get_logger();
1092    
1093     if (! $self->{'import_xml'}->{'format'}->{$name}) {
1094     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1095     return $data;
1096     }
1097    
1098     $log->warn("no delimiter for format $name") if (! $delimiter);
1099    
1100     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1101    
1102     my @data = split(/\Q$delimiter\E/, $data);
1103    
1104     my $out = sprintf($format, @data);
1105     $log->debug("using format $name [$format] on $data to produce: $out");
1106    
1107     if ($out =~ m/$LOOKUP_REGEX/o) {
1108     return $self->lookup($out);
1109     } else {
1110     return $out;
1111     }
1112    
1113     }
1114    
1115    
1116 dpavlin 372 #
1117     #
1118     #
1119    
1120     =head1 INTERNAL METHODS
1121    
1122     Here is a quick list of internal methods, mostly useful to turn debugging
1123     on them (see L<LOGGING> below for explanation).
1124    
1125     =cut
1126    
1127     =head2 _eval
1128    
1129     Internal function to eval code without C<strict 'subs'>.
1130    
1131     =cut
1132    
1133     sub _eval {
1134     my $self = shift;
1135    
1136     my $code = shift || return;
1137    
1138     my $log = $self->_get_logger();
1139    
1140     no strict 'subs';
1141     my $ret = eval $code;
1142     if ($@) {
1143     $log->error("problem with eval code [$code]: $@");
1144     }
1145    
1146     $log->debug("eval: ",$code," [",$ret,"]");
1147    
1148 dpavlin 500 return $ret || undef;
1149 dpavlin 372 }
1150    
1151     =head2 _sort_by_order
1152    
1153     Sort xml tags data structure accoding to C<order=""> attribute.
1154    
1155     =cut
1156    
1157     sub _sort_by_order {
1158     my $self = shift;
1159    
1160     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1161     $self->{'import_xml'}->{'indexer'}->{$a};
1162     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1163     $self->{'import_xml'}->{'indexer'}->{$b};
1164    
1165     return $va <=> $vb;
1166     }
1167    
1168 dpavlin 375 =head2 _get_logger
1169    
1170     Get C<Log::Log4perl> object with a twist: domains are defined for each
1171     method
1172    
1173     my $log = $webpac->_get_logger();
1174    
1175     =cut
1176    
1177 dpavlin 372 sub _get_logger {
1178     my $self = shift;
1179    
1180 dpavlin 374 my $name = (caller(1))[3] || caller;
1181     return get_logger($name);
1182 dpavlin 372 }
1183    
1184 dpavlin 375 =head2 _x
1185    
1186     Convert string from UTF-8 to code page defined in C<import_xml>.
1187    
1188     my $text = $webpac->_x('utf8 text');
1189    
1190     =cut
1191    
1192     sub _x {
1193     my $self = shift;
1194     my $utf8 = shift || return;
1195    
1196     return $self->{'utf2cp'}->convert($utf8) ||
1197     $self->_get_logger()->logwarn("can't convert '$utf8'");
1198     }
1199    
1200 dpavlin 372 #
1201     #
1202     #
1203    
1204     =head1 LOGGING
1205    
1206     Logging in WebPAC is performed by L<Log::Log4perl> with config file
1207     C<log.conf>.
1208    
1209     Methods defined above have different levels of logging, so
1210     it's descriptions will be useful to turn (mostry B<debug> logging) on
1211     or off to see why WabPAC isn't perforing as you expect it (it might even
1212     be a bug!).
1213    
1214     B<This is different from normal Log4perl behaviour>. To repeat, you can
1215     also use method names, and not only classes (which are just few)
1216     to filter logging.
1217    
1218 dpavlin 422
1219     =head1 MEMORY USAGE
1220    
1221     C<low_mem> options is double-edged sword. If enabled, WebPAC
1222     will run on memory constraint machines (which doesn't have enough
1223     physical RAM to create memory structure for whole source database).
1224    
1225     If your machine has 512Mb or more of RAM and database is around 10000 records,
1226     memory shouldn't be an issue. If you don't have enough physical RAM, you
1227     might consider using virtual memory (if your operating system is handling it
1228     well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1229     parsed structure of ISIS database (this is what C<low_mem> option does).
1230    
1231     Hitting swap at end of reading source database is probably o.k. However,
1232     hitting swap before 90% will dramatically decrease performance and you will
1233     be better off with C<low_mem> and using rest of availble memory for
1234     operating system disk cache (Linux is particuallary good about this).
1235     However, every access to database record will require disk access, so
1236     generation phase will be slower 10-100 times.
1237    
1238     Parsed structures are essential - you just have option to trade RAM memory
1239     (which is fast) for disk space (which is slow). Be sure to have planty of
1240     disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1241    
1242     However, when WebPAC is running on desktop machines (or laptops :-), it's
1243     highly undesireable for system to start swapping. Using C<low_mem> option can
1244     reduce WecPAC memory usage to around 64Mb for same database with lookup
1245     fields and sorted indexes which stay in RAM. Performance will suffer, but
1246     memory usage will really be minimal. It might be also more confortable to
1247     run WebPAC reniced on those machines.
1248    
1249 dpavlin 372 =cut
1250    
1251 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26