/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 609 - (hide annotations)
Fri Dec 31 02:19:24 2004 UTC (19 years, 3 months ago) by dpavlin
File size: 28010 byte(s)
prefer IsisDB over OpenISIS, small improvement to progress bar

1 dpavlin 354 package WebPAC;
2 dpavlin 352
3 dpavlin 367 use warnings;
4     use strict;
5    
6 dpavlin 352 use Carp;
7 dpavlin 353 use Text::Iconv;
8     use Config::IniFiles;
9 dpavlin 363 use XML::Simple;
10 dpavlin 370 use Template;
11 dpavlin 372 use Log::Log4perl qw(get_logger :levels);
12 dpavlin 422 use Time::HiRes qw(time);
13 dpavlin 352
14 dpavlin 358 use Data::Dumper;
15    
16 dpavlin 373 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17     #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18     my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19     my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20    
21 dpavlin 352 =head1 NAME
22    
23 dpavlin 354 WebPAC - base class for WebPAC
24 dpavlin 352
25     =head1 DESCRIPTION
26    
27 dpavlin 354 This module implements methods used by WebPAC.
28 dpavlin 352
29     =head1 METHODS
30    
31     =head2 new
32    
33 dpavlin 421 Create new instance of WebPAC using configuration specified by C<config_file>.
34 dpavlin 352
35 dpavlin 354 my $webpac = new WebPAC(
36 dpavlin 352 config_file => 'name.conf',
37 dpavlin 439 code_page => 'ISO-8859-2',
38     low_mem => 1,
39 dpavlin 500 filter => {
40     'lower' => sub { lc($_[0]) },
41     },
42 dpavlin 352 );
43    
44     Default C<code_page> is C<ISO-8859-2>.
45    
46 dpavlin 422 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
47    
48 dpavlin 500 There is optinal parametar C<filter> which specify different filters which
49     can be applied using C<filter{name}> notation.
50 dpavlin 560 Same filters can be used in Template Toolkit files.
51 dpavlin 500
52 dpavlin 421 This method will also read configuration files
53 dpavlin 353 C<global.conf> (used by indexer and Web font-end)
54     and configuration file specified by C<config_file>
55     which describes databases to be indexed.
56    
57 dpavlin 352 =cut
58    
59 dpavlin 363 # mapping between data type and tag which specify
60     # format in XML file
61     my %type2tag = (
62     'isis' => 'isis',
63     # 'excel' => 'column',
64     # 'marc' => 'marc',
65     # 'feed' => 'feed'
66     );
67    
68 dpavlin 352 sub new {
69     my $class = shift;
70     my $self = {@_};
71     bless($self, $class);
72    
73 dpavlin 422 $self->{'start_t'} = time();
74    
75 dpavlin 372 my $log_file = $self->{'log'} || "log.conf";
76     Log::Log4perl->init($log_file);
77    
78     my $log = $self->_get_logger();
79    
80 dpavlin 352 # fill in default values
81     # output codepage
82     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
83    
84 dpavlin 353 #
85     # read global.conf
86     #
87 dpavlin 372 $log->debug("read 'global.conf'");
88 dpavlin 352
89 dpavlin 372 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
90 dpavlin 352
91     # read global config parametars
92     foreach my $var (qw(
93     dbi_dbd
94     dbi_dsn
95     dbi_user
96     dbi_passwd
97     show_progress
98     my_unac_filter
99 dpavlin 370 output_template
100 dpavlin 352 )) {
101 dpavlin 370 $self->{'global_config'}->{$var} = $config->val('global', $var);
102 dpavlin 352 }
103    
104 dpavlin 353 #
105     # read indexer config file
106     #
107 dpavlin 352
108 dpavlin 372 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
109 dpavlin 352
110 dpavlin 370 # create UTF-8 convertor for import_xml files
111 dpavlin 366 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
112 dpavlin 370
113     # create Template toolkit instance
114     $self->{'tt'} = Template->new(
115     INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
116 dpavlin 560 FILTERS => $self->{'filter'},
117 dpavlin 370 EVAL_PERL => 1,
118     );
119    
120 dpavlin 421 # running with low_mem flag? well, use DBM::Deep then.
121     if ($self->{'low_mem'}) {
122 dpavlin 422 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
123 dpavlin 421
124     my $db_file = "data.db";
125    
126     if (-e $db_file) {
127     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
128     $log->debug("removed '$db_file' from last run");
129     }
130    
131 dpavlin 434 require DBM::Deep;
132 dpavlin 421
133     my $db = new DBM::Deep $db_file;
134    
135     $log->logdie("DBM::Deep error: $!") unless ($db);
136    
137     if ($db->error()) {
138     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
139     } else {
140 dpavlin 422 $log->debug("using file '$db_file' for DBM::Deep");
141 dpavlin 421 }
142    
143     $self->{'db'} = $db;
144     }
145    
146 dpavlin 500 $log->debug("filters defined: ",Dumper($self->{'filter'}));
147    
148 dpavlin 352 return $self;
149     }
150    
151     =head2 open_isis
152    
153 dpavlin 609 Open CDS/ISIS, WinISIS or IsisMarc database using IsisDB or OpenIsis module
154     and read all records to memory.
155 dpavlin 352
156     $webpac->open_isis(
157     filename => '/data/ISIS/ISIS',
158     code_page => '852',
159 dpavlin 431 limit_mfn => 500,
160     start_mfn => 6000,
161 dpavlin 352 lookup => [ ... ],
162     );
163    
164     By default, ISIS code page is assumed to be C<852>.
165    
166 dpavlin 431 If optional parametar C<start_mfn> is set, this will be first MFN to read
167     from database (so you can skip beginning of your database if you need to).
168    
169 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
170     from database in example above.
171 dpavlin 352
172     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
173     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
174     value in index.
175    
176     lookup => [
177     { 'key' => 'd:v900', 'val' => 'v250^a' },
178     { 'eval' => '"v901^a" eq "Podruèje"',
179     'key' => 'pa:v561^4:v562^4:v461^1',
180     'val' => 'v900' },
181     ]
182    
183 dpavlin 367 Returns number of last record read into memory (size of database, really).
184    
185 dpavlin 352 =cut
186    
187     sub open_isis {
188     my $self = shift;
189     my $arg = {@_};
190    
191 dpavlin 372 my $log = $self->_get_logger();
192    
193     $log->logcroak("need filename") if (! $arg->{'filename'});
194 dpavlin 352 my $code_page = $arg->{'code_page'} || '852';
195    
196 dpavlin 436 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
197    
198 dpavlin 421 # store data in object
199     $self->{'isis_filename'} = $arg->{'filename'};
200     $self->{'isis_code_page'} = $code_page;
201    
202 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
203    
204     # create Text::Iconv object
205     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
206    
207 dpavlin 372 $log->info("reading ISIS database '",$arg->{'filename'},"'");
208 dpavlin 421 $log->debug("isis code page: $code_page");
209 dpavlin 363
210 dpavlin 609 my $use_openisis = 1;
211 dpavlin 352
212 dpavlin 609 eval { use IsisDB 0.06; };
213     $use_openisis = 0 unless ($@);
214    
215     my ($isis_db,$maxmfn);
216    
217     if ($use_openisis) {
218     use OpenIsis;
219     $isis_db = OpenIsis::open($arg->{'filename'});
220     $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
221     } else {
222     $isis_db = new IsisDB(
223     isisdb => $arg->{'filename'},
224     include_deleted => 1,
225     hash_filter => sub {
226     my $l = shift || return;
227     $l = $cp->convert($l);
228     return $l;
229     },
230     );
231     $maxmfn = $isis_db->{'maxmfn'};
232     }
233    
234    
235 dpavlin 431 my $startmfn = 1;
236 dpavlin 352
237 dpavlin 431 if (my $s = $self->{'start_mfn'}) {
238     $log->info("skipping to MFN $s");
239     $startmfn = $s;
240 dpavlin 448 } else {
241     $self->{'start_mfn'} = $startmfn;
242 dpavlin 431 }
243 dpavlin 357
244 dpavlin 431 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
245 dpavlin 363
246 dpavlin 609 $log->info("processing ",($maxmfn-$startmfn)." records using ",( $use_openisis ? 'OpenIsis' : 'IsisDB'));
247 dpavlin 431
248 dpavlin 609
249 dpavlin 352 # read database
250 dpavlin 431 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
251 dpavlin 352
252 dpavlin 418
253     $log->debug("mfn: $mfn\n");
254    
255 dpavlin 421 my $rec;
256    
257 dpavlin 609 if ($use_openisis) {
258    
259     # read record using OpenIsis
260     my $row = OpenIsis::read( $isis_db, $mfn );
261     foreach my $k (keys %{$row}) {
262     if ($k ne "mfn") {
263     foreach my $l (@{$row->{$k}}) {
264     $l = $cp->convert($l);
265     # has subfields?
266     my $val;
267     if ($l =~ m/\^/) {
268     foreach my $t (split(/\^/,$l)) {
269     next if (! $t);
270     $val->{substr($t,0,1)} = substr($t,1);
271     }
272     } else {
273     $val = $l;
274 dpavlin 352 }
275 dpavlin 609
276     push @{$rec->{$k}}, $val;
277 dpavlin 352 }
278 dpavlin 609 } else {
279     push @{$rec->{'000'}}, $mfn;
280 dpavlin 352 }
281     }
282    
283 dpavlin 609 } else {
284     $rec = $isis_db->to_hash($mfn);
285 dpavlin 352 }
286    
287 dpavlin 421 $log->confess("record $mfn empty?") unless ($rec);
288    
289     # store
290     if ($self->{'low_mem'}) {
291     $self->{'db'}->put($mfn, $rec);
292     } else {
293     $self->{'data'}->{$mfn} = $rec;
294     }
295    
296 dpavlin 352 # create lookup
297 dpavlin 355 $self->create_lookup($rec, @{$arg->{'lookup'}});
298 dpavlin 352
299 dpavlin 389 $self->progress_bar($mfn,$maxmfn);
300    
301 dpavlin 352 }
302    
303 dpavlin 448 $self->{'current_mfn'} = -1;
304 dpavlin 389 $self->{'last_pcnt'} = 0;
305 dpavlin 362
306 dpavlin 418 $log->debug("max mfn: $maxmfn");
307    
308 dpavlin 352 # store max mfn and return it.
309     return $self->{'max_mfn'} = $maxmfn;
310     }
311    
312 dpavlin 362 =head2 fetch_rec
313    
314     Fetch next record from database. It will also display progress bar (once
315     it's implemented, that is).
316    
317     my $rec = $webpac->fetch_rec;
318    
319     =cut
320    
321     sub fetch_rec {
322     my $self = shift;
323    
324 dpavlin 372 my $log = $self->_get_logger();
325 dpavlin 362
326 dpavlin 448 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
327 dpavlin 372
328 dpavlin 448 if ($self->{'current_mfn'} == -1) {
329     $self->{'current_mfn'} = $self->{'start_mfn'};
330     } else {
331     $self->{'current_mfn'}++;
332     }
333    
334     my $mfn = $self->{'current_mfn'};
335    
336 dpavlin 362 if ($mfn > $self->{'max_mfn'}) {
337     $self->{'current_mfn'} = $self->{'max_mfn'};
338 dpavlin 373 $log->debug("at EOF");
339 dpavlin 362 return;
340     }
341    
342 dpavlin 389 $self->progress_bar($mfn,$self->{'max_mfn'});
343    
344 dpavlin 421 if ($self->{'low_mem'}) {
345     return $self->{'db'}->get($mfn);
346     } else {
347     return $self->{'data'}->{$mfn};
348     }
349 dpavlin 362 }
350    
351 dpavlin 448 =head2 mfn
352    
353     Returns current record number (MFN).
354    
355     print $webpac->mfn;
356    
357     =cut
358    
359     sub mfn {
360     my $self = shift;
361     return $self->{'current_mfn'};
362     }
363    
364 dpavlin 389 =head2 progress_bar
365    
366     Draw progress bar on STDERR.
367    
368     $webpac->progress_bar($current, $max);
369    
370     =cut
371    
372     sub progress_bar {
373     my $self = shift;
374    
375     my ($curr,$max) = @_;
376    
377     my $log = $self->_get_logger();
378    
379     $log->logconfess("no current value!") if (! $curr);
380     $log->logconfess("no maximum value!") if (! $max);
381    
382     if ($curr > $max) {
383     $max = $curr;
384     $log->debug("overflow to $curr");
385     }
386    
387     $self->{'last_pcnt'} ||= 1;
388    
389 dpavlin 454 my $p = int($curr * 100 / $max) || 1;
390 dpavlin 389
391 dpavlin 422 # reset on re-run
392     if ($p < $self->{'last_pcnt'}) {
393     $self->{'last_pcnt'} = $p;
394 dpavlin 609 $self->{'start_t'} = time();
395 dpavlin 422 }
396    
397 dpavlin 389 if ($p != $self->{'last_pcnt'}) {
398 dpavlin 422
399     my $t = time();
400 dpavlin 609 my $rate = ($curr / ($t - $self->{'start_t'} || 1));
401 dpavlin 422 my $eta = ($max-$curr) / ($rate || 1);
402     printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
403 dpavlin 389 $self->{'last_pcnt'} = $p;
404 dpavlin 422 $self->{'last_t'} = time();
405     $self->{'last_curr'} = $curr;
406 dpavlin 389 }
407 dpavlin 412 print STDERR "\n" if ($p == 100);
408 dpavlin 389 }
409    
410 dpavlin 422 =head2 fmt_time
411    
412     Format time (in seconds) for display.
413    
414     print $webpac->fmt_time(time());
415    
416     This method is called by L<progress_bar> to display remaining time.
417    
418     =cut
419    
420     sub fmt_time {
421     my $self = shift;
422    
423     my $t = shift || 0;
424     my $out = "";
425    
426     my ($ss,$mm,$hh) = gmtime($t);
427     $out .= "${hh}h" if ($hh);
428     $out .= sprintf("%02d:%02d", $mm,$ss);
429     $out .= " " if ($hh == 0);
430     return $out;
431     }
432    
433 dpavlin 363 =head2 open_import_xml
434    
435     Read file from C<import_xml/> directory and parse it.
436    
437     $webpac->open_import_xml(type => 'isis');
438    
439     =cut
440    
441     sub open_import_xml {
442     my $self = shift;
443    
444 dpavlin 372 my $log = $self->_get_logger();
445    
446 dpavlin 363 my $arg = {@_};
447 dpavlin 372 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
448 dpavlin 363
449 dpavlin 366 $self->{'type'} = $arg->{'type'};
450 dpavlin 363
451 dpavlin 366 my $type_base = $arg->{'type'};
452 dpavlin 363 $type_base =~ s/_.*$//g;
453    
454 dpavlin 366 $self->{'tag'} = $type2tag{$type_base};
455    
456 dpavlin 375 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
457 dpavlin 366
458     my $f = "./import_xml/".$self->{'type'}.".xml";
459 dpavlin 372 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
460 dpavlin 363
461 dpavlin 375 $log->info("reading '$f'");
462 dpavlin 363
463 dpavlin 375 $self->{'import_xml_file'} = $f;
464    
465 dpavlin 363 $self->{'import_xml'} = XMLin($f,
466 dpavlin 366 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
467 dpavlin 363 );
468    
469 dpavlin 375 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
470    
471 dpavlin 363 }
472    
473 dpavlin 355 =head2 create_lookup
474    
475     Create lookup from record using lookup definition.
476    
477 dpavlin 367 $self->create_lookup($rec, @lookups);
478    
479     Called internally by C<open_*> methods.
480    
481 dpavlin 355 =cut
482    
483     sub create_lookup {
484     my $self = shift;
485    
486 dpavlin 372 my $log = $self->_get_logger();
487 dpavlin 355
488 dpavlin 372 my $rec = shift || $log->logconfess("need record to create lookup");
489     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
490    
491 dpavlin 355 foreach my $i (@_) {
492 dpavlin 418 $log->logconfess("need key") unless defined($i->{'key'});
493     $log->logconfess("need val") unless defined($i->{'val'});
494    
495     if (defined($i->{'eval'})) {
496     # eval first, so we can skip fill_in for key and val
497     my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
498     if ($self->_eval($eval)) {
499     my $key = $self->fill_in($rec,$i->{'key'}) || next;
500     my @val = $self->fill_in($rec,$i->{'val'}) || next;
501 dpavlin 373 $log->debug("stored $key = ",sub { join(" | ",@val) });
502 dpavlin 355 push @{$self->{'lookup'}->{$key}}, @val;
503     }
504     } else {
505 dpavlin 418 my $key = $self->fill_in($rec,$i->{'key'}) || next;
506     my @val = $self->fill_in($rec,$i->{'val'}) || next;
507     $log->debug("stored $key = ",sub { join(" | ",@val) });
508     push @{$self->{'lookup'}->{$key}}, @val;
509 dpavlin 355 }
510     }
511     }
512    
513 dpavlin 356 =head2 get_data
514    
515     Returns value from record.
516    
517 dpavlin 367 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
518 dpavlin 356
519     Arguments are:
520     record reference C<$rec>,
521     field C<$f>,
522     optional subfiled C<$sf>,
523     index for repeatable values C<$i>.
524    
525 dpavlin 367 Optinal variable C<$found> will be incremeted if there
526 dpavlin 356 is field.
527    
528     Returns value or empty string.
529    
530     =cut
531    
532     sub get_data {
533     my $self = shift;
534    
535     my ($rec,$f,$sf,$i,$found) = @_;
536 dpavlin 367
537 dpavlin 356 if ($$rec->{$f}) {
538 dpavlin 367 return '' if (! $$rec->{$f}->[$i]);
539 dpavlin 389 no strict 'refs';
540 dpavlin 356 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
541     $$found++ if (defined($$found));
542     return $$rec->{$f}->[$i]->{$sf};
543     } elsif ($$rec->{$f}->[$i]) {
544     $$found++ if (defined($$found));
545 dpavlin 366 # it still might have subfield, just
546     # not specified, so we'll dump all
547     if ($$rec->{$f}->[$i] =~ /HASH/o) {
548     my $out;
549     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
550     $out .= $$rec->{$f}->[$i]->{$k}." ";
551     }
552     return $out;
553     } else {
554     return $$rec->{$f}->[$i];
555     }
556 dpavlin 356 }
557     } else {
558     return '';
559     }
560     }
561    
562 dpavlin 352 =head2 fill_in
563    
564     Workhourse of all: takes record from in-memory structure of database and
565     strings with placeholders and returns string or array of with substituted
566     values from record.
567    
568 dpavlin 367 my $text = $webpac->fill_in($rec,'v250^a');
569 dpavlin 352
570     Optional argument is ordinal number for repeatable fields. By default,
571 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
572     element is 0).
573     Following example will read second value from repeatable field.
574 dpavlin 352
575 dpavlin 367 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
576 dpavlin 353
577     This function B<does not> perform parsing of format to inteligenty skip
578     delimiters before fields which aren't used.
579    
580 dpavlin 376 This method will automatically decode UTF-8 string to local code page
581     if needed.
582    
583 dpavlin 352 =cut
584    
585 dpavlin 372 sub fill_in {
586 dpavlin 371 my $self = shift;
587    
588 dpavlin 372 my $log = $self->_get_logger();
589 dpavlin 371
590 dpavlin 372 my $rec = shift || $log->logconfess("need data record");
591     my $format = shift || $log->logconfess("need format to parse");
592 dpavlin 352 # iteration (for repeatable fields)
593     my $i = shift || 0;
594    
595 dpavlin 555 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
596    
597 dpavlin 352 # FIXME remove for speedup?
598 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
599 dpavlin 352
600 dpavlin 376 if (utf8::is_utf8($format)) {
601     $format = $self->_x($format);
602     }
603    
604 dpavlin 352 my $found = 0;
605    
606 dpavlin 359 my $eval_code;
607     # remove eval{...} from beginning
608     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
609    
610 dpavlin 500 my $filter_name;
611     # remove filter{...} from beginning
612     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
613    
614 dpavlin 352 # do actual replacement of placeholders
615 dpavlin 555 # repeatable fields
616 dpavlin 373 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
617 dpavlin 555 # non-repeatable fields
618     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
619 dpavlin 352
620 dpavlin 353 if ($found) {
621 dpavlin 373 $log->debug("format: $format");
622 dpavlin 359 if ($eval_code) {
623     my $eval = $self->fill_in($rec,$eval_code,$i);
624 dpavlin 371 return if (! $self->_eval($eval));
625 dpavlin 359 }
626 dpavlin 500 if ($filter_name && $self->{'filter'}->{$filter_name}) {
627     $log->debug("filter '$filter_name' for $format");
628     $format = $self->{'filter'}->{$filter_name}->($format);
629     return unless(defined($format));
630     $log->debug("filter result: $format");
631     }
632 dpavlin 353 # do we have lookups?
633 dpavlin 373 if ($format =~ /$LOOKUP_REGEX/o) {
634     $log->debug("format '$format' has lookup");
635 dpavlin 353 return $self->lookup($format);
636     } else {
637     return $format;
638     }
639 dpavlin 352 } else {
640     return;
641     }
642     }
643    
644     =head2 lookup
645    
646 dpavlin 355 Perform lookups on format supplied to it.
647 dpavlin 352
648 dpavlin 367 my $text = $self->lookup('[v900]');
649 dpavlin 352
650 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
651    
652 dpavlin 352 =cut
653    
654     sub lookup {
655     my $self = shift;
656    
657 dpavlin 372 my $log = $self->_get_logger();
658 dpavlin 352
659 dpavlin 372 my $tmp = shift || $log->logconfess("need format");
660    
661 dpavlin 373 if ($tmp =~ /$LOOKUP_REGEX/o) {
662 dpavlin 352 my @in = ( $tmp );
663 dpavlin 372
664     $log->debug("lookup for: ",$tmp);
665    
666 dpavlin 352 my @out;
667     while (my $f = shift @in) {
668 dpavlin 373 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
669 dpavlin 352 my $k = $1;
670     if ($self->{'lookup'}->{$k}) {
671     foreach my $nv (@{$self->{'lookup'}->{$k}}) {
672     my $tmp2 = $f;
673 dpavlin 373 $tmp2 =~ s/lookup{$k}/$nv/g;
674 dpavlin 352 push @in, $tmp2;
675     }
676     } else {
677     undef $f;
678     }
679     } elsif ($f) {
680     push @out, $f;
681     }
682     }
683 dpavlin 373 $log->logconfess("return is array and it's not expected!") unless wantarray;
684 dpavlin 352 return @out;
685     } else {
686     return $tmp;
687     }
688     }
689    
690 dpavlin 356 =head2 parse
691    
692     Perform smart parsing of string, skipping delimiters for fields which aren't
693     defined. It can also eval code in format starting with C<eval{...}> and
694     return output or nothing depending on eval code.
695    
696 dpavlin 367 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
697 dpavlin 356
698     =cut
699    
700     sub parse {
701     my $self = shift;
702    
703 dpavlin 366 my ($rec, $format_utf8, $i) = @_;
704 dpavlin 356
705 dpavlin 366 return if (! $format_utf8);
706    
707 dpavlin 372 my $log = $self->_get_logger();
708 dpavlin 358
709 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
710     $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
711    
712 dpavlin 358 $i = 0 if (! $i);
713    
714 dpavlin 375 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
715 dpavlin 366
716 dpavlin 356 my @out;
717    
718 dpavlin 373 $log->debug("format: $format");
719    
720 dpavlin 356 my $eval_code;
721     # remove eval{...} from beginning
722     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
723    
724 dpavlin 500 my $filter_name;
725     # remove filter{...} from beginning
726     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
727    
728 dpavlin 358 my $prefix;
729     my $all_found=0;
730 dpavlin 356
731 dpavlin 555 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
732 dpavlin 356
733 dpavlin 358 my $del = $1 || '';
734 dpavlin 359 $prefix ||= $del if ($all_found == 0);
735 dpavlin 358
736 dpavlin 555 # repeatable index
737     my $r = $i;
738     $r = 0 if (lc("$2") eq 's');
739    
740 dpavlin 358 my $found = 0;
741 dpavlin 555 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
742 dpavlin 358
743 dpavlin 356 if ($found) {
744 dpavlin 358 push @out, $del;
745     push @out, $tmp;
746     $all_found += $found;
747 dpavlin 356 }
748     }
749    
750 dpavlin 358 return if (! $all_found);
751 dpavlin 356
752 dpavlin 373 my $out = join('',@out);
753 dpavlin 358
754 dpavlin 373 if ($out) {
755     # add rest of format (suffix)
756     $out .= $format;
757 dpavlin 367
758 dpavlin 373 # add prefix if not there
759     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
760    
761     $log->debug("result: $out");
762     }
763    
764 dpavlin 359 if ($eval_code) {
765 dpavlin 501 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
766 dpavlin 500 $log->debug("about to eval{$eval} format: $out");
767 dpavlin 371 return if (! $self->_eval($eval));
768 dpavlin 359 }
769 dpavlin 500
770     if ($filter_name && $self->{'filter'}->{$filter_name}) {
771     $log->debug("about to filter{$filter_name} format: $out");
772     $out = $self->{'filter'}->{$filter_name}->($out);
773     return unless(defined($out));
774     $log->debug("filter result: $out");
775     }
776 dpavlin 359
777 dpavlin 358 return $out;
778 dpavlin 356 }
779    
780 dpavlin 367 =head2 parse_to_arr
781    
782     Similar to C<parse>, but returns array of all repeatable fields
783    
784     my @arr = $webpac->parse_to_arr($rec,'v250^a');
785    
786     =cut
787    
788     sub parse_to_arr {
789     my $self = shift;
790    
791     my ($rec, $format_utf8) = @_;
792    
793 dpavlin 372 my $log = $self->_get_logger();
794    
795     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
796 dpavlin 367 return if (! $format_utf8);
797    
798     my $i = 0;
799     my @arr;
800    
801     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
802     push @arr, $v;
803     }
804    
805 dpavlin 373 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
806    
807 dpavlin 367 return @arr;
808     }
809    
810 dpavlin 373 =head2 fill_in_to_arr
811    
812     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
813     for fields which have lookups, so they shouldn't be parsed but rather
814     C<fill_id>ed.
815    
816     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
817    
818     =cut
819    
820     sub fill_in_to_arr {
821     my $self = shift;
822    
823     my ($rec, $format_utf8) = @_;
824    
825     my $log = $self->_get_logger();
826    
827     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
828     return if (! $format_utf8);
829    
830     my $i = 0;
831     my @arr;
832    
833     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
834     push @arr, @v;
835     }
836    
837     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
838    
839     return @arr;
840     }
841    
842 dpavlin 441 =head2 sort_arr
843 dpavlin 373
844 dpavlin 441 Sort array ignoring case and html in data
845    
846     my @sorted = $webpac->sort_arr(@unsorted);
847    
848     =cut
849    
850     sub sort_arr {
851     my $self = shift;
852    
853     my $log = $self->_get_logger();
854    
855     # FIXME add Schwartzian Transformation?
856    
857     my @sorted = sort {
858     $a =~ s#<[^>]+/*>##;
859     $b =~ s#<[^>]+/*>##;
860     lc($b) cmp lc($a)
861     } @_;
862     $log->debug("sorted values: ",sub { join(", ",@sorted) });
863    
864     return @sorted;
865     }
866    
867    
868 dpavlin 366 =head2 data_structure
869    
870     Create in-memory data structure which represents layout from C<import_xml>.
871     It is used later to produce output.
872    
873 dpavlin 368 my @ds = $webpac->data_structure($rec);
874 dpavlin 366
875 dpavlin 374 This method will also set C<$webpac->{'currnet_filename'}> if there is
876 dpavlin 398 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
877     <headline> tag.
878 dpavlin 374
879 dpavlin 366 =cut
880    
881 dpavlin 372 sub data_structure {
882 dpavlin 366 my $self = shift;
883    
884 dpavlin 372 my $log = $self->_get_logger();
885 dpavlin 366
886     my $rec = shift;
887 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
888 dpavlin 366
889 dpavlin 374 undef $self->{'currnet_filename'};
890 dpavlin 398 undef $self->{'headline'};
891 dpavlin 374
892 dpavlin 366 my @sorted_tags;
893     if ($self->{tags_by_order}) {
894     @sorted_tags = @{$self->{tags_by_order}};
895     } else {
896     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
897     $self->{tags_by_order} = \@sorted_tags;
898     }
899    
900 dpavlin 368 my @ds;
901 dpavlin 366
902 dpavlin 373 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
903    
904 dpavlin 366 foreach my $field (@sorted_tags) {
905    
906     my $row;
907    
908     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
909    
910     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
911 dpavlin 373 my $format = $tag->{'value'} || $tag->{'content'};
912 dpavlin 366
913 dpavlin 373 $log->debug("format: $format");
914    
915     my @v;
916     if ($format =~ /$LOOKUP_REGEX/o) {
917     @v = $self->fill_in_to_arr($rec,$format);
918     } else {
919     @v = $self->parse_to_arr($rec,$format);
920     }
921 dpavlin 367 next if (! @v);
922 dpavlin 366
923 dpavlin 439 if ($tag->{'sort'}) {
924 dpavlin 441 @v = $self->sort_arr(@v);
925 dpavlin 439 }
926    
927 dpavlin 375 # use format?
928     if ($tag->{'format_name'}) {
929     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
930     }
931    
932 dpavlin 398 if ($field eq 'filename') {
933     $self->{'current_filename'} = join('',@v);
934     $log->debug("filename: ",$self->{'current_filename'});
935     } elsif ($field eq 'headline') {
936     $self->{'headline'} .= join('',@v);
937     $log->debug("headline: ",$self->{'headline'});
938     next; # don't return headline in data_structure!
939     }
940    
941 dpavlin 439 # delimiter will join repeatable fields
942     if ($tag->{'delimiter'}) {
943     @v = ( join($tag->{'delimiter'}, @v) );
944 dpavlin 366 }
945 dpavlin 373
946 dpavlin 439 # default types
947     my @types = qw(display swish);
948     # override by type attribute
949     @types = ( $tag->{'type'} ) if ($tag->{'type'});
950 dpavlin 374
951 dpavlin 439 foreach my $type (@types) {
952     # append to previous line?
953     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
954     if ($tag->{'append'}) {
955    
956     # I will delimit appended part with
957     # delimiter (or ,)
958     my $d = $tag->{'delimiter'};
959     # default delimiter
960 dpavlin 501 $d ||= " ";
961 dpavlin 439
962     my $last = pop @{$row->{$type}};
963     $d = "" if (! $last);
964     $last .= $d . join($d, @v);
965     push @{$row->{$type}}, $last;
966    
967     } else {
968     push @{$row->{$type}}, @v;
969     }
970     }
971    
972    
973 dpavlin 366 }
974    
975 dpavlin 368 if ($row) {
976     $row->{'tag'} = $field;
977 dpavlin 375
978     # TODO: name_sigular, name_plural
979     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
980     $row->{'name'} = $name ? $self->_x($name) : $field;
981    
982 dpavlin 439 # post-sort all values in field
983     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
984     $log->warn("sort at field tag not implemented");
985     }
986    
987 dpavlin 368 push @ds, $row;
988 dpavlin 374
989 dpavlin 373 $log->debug("row $field: ",sub { Dumper($row) });
990 dpavlin 368 }
991 dpavlin 366
992     }
993    
994 dpavlin 370 return @ds;
995 dpavlin 366
996     }
997    
998 dpavlin 370 =head2 output
999    
1000     Create output from in-memory data structure using Template Toolkit template.
1001    
1002     my $text = $webpac->output( template => 'text.tt', data => @ds );
1003    
1004     =cut
1005    
1006     sub output {
1007     my $self = shift;
1008    
1009     my $args = {@_};
1010    
1011 dpavlin 372 my $log = $self->_get_logger();
1012 dpavlin 370
1013 dpavlin 372 $log->logconfess("need template name") if (! $args->{'template'});
1014     $log->logconfess("need data array") if (! $args->{'data'});
1015    
1016 dpavlin 370 my $out;
1017    
1018     $self->{'tt'}->process(
1019     $args->{'template'},
1020     $args,
1021     \$out
1022     ) || confess $self->{'tt'}->error();
1023    
1024     return $out;
1025     }
1026    
1027 dpavlin 411 =head2 output_file
1028    
1029     Create output from in-memory data structure using Template Toolkit template
1030     to a file.
1031    
1032     $webpac->output_file(
1033     file => 'out.txt',
1034     template => 'text.tt',
1035     data => @ds
1036     );
1037    
1038     =cut
1039    
1040     sub output_file {
1041     my $self = shift;
1042    
1043     my $args = {@_};
1044    
1045     my $log = $self->_get_logger();
1046    
1047 dpavlin 421 my $file = $args->{'file'} || $log->logconfess("need file name");
1048 dpavlin 411
1049 dpavlin 421 $log->debug("creating file ",$file);
1050 dpavlin 411
1051 dpavlin 421 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1052 dpavlin 411 print $fh $self->output(
1053     template => $args->{'template'},
1054     data => $args->{'data'},
1055     ) || $log->logdie("print: $!");
1056     close($fh) || $log->logdie("close: $!");
1057     }
1058    
1059 dpavlin 375 =head2 apply_format
1060    
1061     Apply format specified in tag with C<format_name="name"> and
1062     C<format_delimiter=";;">.
1063    
1064     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1065    
1066     Formats can contain C<lookup{...}> if you need them.
1067    
1068     =cut
1069    
1070     sub apply_format {
1071     my $self = shift;
1072    
1073     my ($name,$delimiter,$data) = @_;
1074    
1075     my $log = $self->_get_logger();
1076    
1077     if (! $self->{'import_xml'}->{'format'}->{$name}) {
1078     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1079     return $data;
1080     }
1081    
1082     $log->warn("no delimiter for format $name") if (! $delimiter);
1083    
1084     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1085    
1086     my @data = split(/\Q$delimiter\E/, $data);
1087    
1088     my $out = sprintf($format, @data);
1089     $log->debug("using format $name [$format] on $data to produce: $out");
1090    
1091     if ($out =~ m/$LOOKUP_REGEX/o) {
1092     return $self->lookup($out);
1093     } else {
1094     return $out;
1095     }
1096    
1097     }
1098    
1099    
1100 dpavlin 372 #
1101     #
1102     #
1103    
1104     =head1 INTERNAL METHODS
1105    
1106     Here is a quick list of internal methods, mostly useful to turn debugging
1107     on them (see L<LOGGING> below for explanation).
1108    
1109     =cut
1110    
1111     =head2 _eval
1112    
1113     Internal function to eval code without C<strict 'subs'>.
1114    
1115     =cut
1116    
1117     sub _eval {
1118     my $self = shift;
1119    
1120     my $code = shift || return;
1121    
1122     my $log = $self->_get_logger();
1123    
1124     no strict 'subs';
1125     my $ret = eval $code;
1126     if ($@) {
1127     $log->error("problem with eval code [$code]: $@");
1128     }
1129    
1130     $log->debug("eval: ",$code," [",$ret,"]");
1131    
1132 dpavlin 500 return $ret || undef;
1133 dpavlin 372 }
1134    
1135     =head2 _sort_by_order
1136    
1137     Sort xml tags data structure accoding to C<order=""> attribute.
1138    
1139     =cut
1140    
1141     sub _sort_by_order {
1142     my $self = shift;
1143    
1144     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1145     $self->{'import_xml'}->{'indexer'}->{$a};
1146     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1147     $self->{'import_xml'}->{'indexer'}->{$b};
1148    
1149     return $va <=> $vb;
1150     }
1151    
1152 dpavlin 375 =head2 _get_logger
1153    
1154     Get C<Log::Log4perl> object with a twist: domains are defined for each
1155     method
1156    
1157     my $log = $webpac->_get_logger();
1158    
1159     =cut
1160    
1161 dpavlin 372 sub _get_logger {
1162     my $self = shift;
1163    
1164 dpavlin 374 my $name = (caller(1))[3] || caller;
1165     return get_logger($name);
1166 dpavlin 372 }
1167    
1168 dpavlin 375 =head2 _x
1169    
1170     Convert string from UTF-8 to code page defined in C<import_xml>.
1171    
1172     my $text = $webpac->_x('utf8 text');
1173    
1174     =cut
1175    
1176     sub _x {
1177     my $self = shift;
1178     my $utf8 = shift || return;
1179    
1180     return $self->{'utf2cp'}->convert($utf8) ||
1181     $self->_get_logger()->logwarn("can't convert '$utf8'");
1182     }
1183    
1184 dpavlin 372 #
1185     #
1186     #
1187    
1188     =head1 LOGGING
1189    
1190     Logging in WebPAC is performed by L<Log::Log4perl> with config file
1191     C<log.conf>.
1192    
1193     Methods defined above have different levels of logging, so
1194     it's descriptions will be useful to turn (mostry B<debug> logging) on
1195     or off to see why WabPAC isn't perforing as you expect it (it might even
1196     be a bug!).
1197    
1198     B<This is different from normal Log4perl behaviour>. To repeat, you can
1199     also use method names, and not only classes (which are just few)
1200     to filter logging.
1201    
1202 dpavlin 422
1203     =head1 MEMORY USAGE
1204    
1205     C<low_mem> options is double-edged sword. If enabled, WebPAC
1206     will run on memory constraint machines (which doesn't have enough
1207     physical RAM to create memory structure for whole source database).
1208    
1209     If your machine has 512Mb or more of RAM and database is around 10000 records,
1210     memory shouldn't be an issue. If you don't have enough physical RAM, you
1211     might consider using virtual memory (if your operating system is handling it
1212     well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1213     parsed structure of ISIS database (this is what C<low_mem> option does).
1214    
1215     Hitting swap at end of reading source database is probably o.k. However,
1216     hitting swap before 90% will dramatically decrease performance and you will
1217     be better off with C<low_mem> and using rest of availble memory for
1218     operating system disk cache (Linux is particuallary good about this).
1219     However, every access to database record will require disk access, so
1220     generation phase will be slower 10-100 times.
1221    
1222     Parsed structures are essential - you just have option to trade RAM memory
1223     (which is fast) for disk space (which is slow). Be sure to have planty of
1224     disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1225    
1226     However, when WebPAC is running on desktop machines (or laptops :-), it's
1227     highly undesireable for system to start swapping. Using C<low_mem> option can
1228     reduce WecPAC memory usage to around 64Mb for same database with lookup
1229     fields and sorted indexes which stay in RAM. Performance will suffer, but
1230     memory usage will really be minimal. It might be also more confortable to
1231     run WebPAC reniced on those machines.
1232    
1233 dpavlin 372 =cut
1234    
1235 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26