/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 616 - (hide annotations)
Fri Dec 31 03:34:33 2004 UTC (19 years, 4 months ago) by dpavlin
File size: 27980 byte(s)
cleanup

1 dpavlin 354 package WebPAC;
2 dpavlin 352
3 dpavlin 367 use warnings;
4     use strict;
5    
6 dpavlin 352 use Carp;
7 dpavlin 353 use Text::Iconv;
8     use Config::IniFiles;
9 dpavlin 363 use XML::Simple;
10 dpavlin 370 use Template;
11 dpavlin 372 use Log::Log4perl qw(get_logger :levels);
12 dpavlin 422 use Time::HiRes qw(time);
13 dpavlin 352
14 dpavlin 358 use Data::Dumper;
15    
16 dpavlin 373 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17     #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18     my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19     my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20    
21 dpavlin 352 =head1 NAME
22    
23 dpavlin 354 WebPAC - base class for WebPAC
24 dpavlin 352
25     =head1 DESCRIPTION
26    
27 dpavlin 354 This module implements methods used by WebPAC.
28 dpavlin 352
29     =head1 METHODS
30    
31     =head2 new
32    
33 dpavlin 421 Create new instance of WebPAC using configuration specified by C<config_file>.
34 dpavlin 352
35 dpavlin 354 my $webpac = new WebPAC(
36 dpavlin 352 config_file => 'name.conf',
37 dpavlin 439 code_page => 'ISO-8859-2',
38     low_mem => 1,
39 dpavlin 500 filter => {
40     'lower' => sub { lc($_[0]) },
41     },
42 dpavlin 352 );
43    
44     Default C<code_page> is C<ISO-8859-2>.
45    
46 dpavlin 422 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
47    
48 dpavlin 500 There is optinal parametar C<filter> which specify different filters which
49     can be applied using C<filter{name}> notation.
50 dpavlin 560 Same filters can be used in Template Toolkit files.
51 dpavlin 500
52 dpavlin 421 This method will also read configuration files
53 dpavlin 353 C<global.conf> (used by indexer and Web font-end)
54     and configuration file specified by C<config_file>
55     which describes databases to be indexed.
56    
57 dpavlin 352 =cut
58    
59 dpavlin 363 # mapping between data type and tag which specify
60     # format in XML file
61     my %type2tag = (
62     'isis' => 'isis',
63     # 'excel' => 'column',
64     # 'marc' => 'marc',
65     # 'feed' => 'feed'
66     );
67    
68 dpavlin 352 sub new {
69     my $class = shift;
70     my $self = {@_};
71     bless($self, $class);
72    
73 dpavlin 422 $self->{'start_t'} = time();
74    
75 dpavlin 372 my $log_file = $self->{'log'} || "log.conf";
76     Log::Log4perl->init($log_file);
77    
78     my $log = $self->_get_logger();
79    
80 dpavlin 352 # fill in default values
81     # output codepage
82     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
83    
84 dpavlin 353 #
85     # read global.conf
86     #
87 dpavlin 372 $log->debug("read 'global.conf'");
88 dpavlin 352
89 dpavlin 372 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
90 dpavlin 352
91     # read global config parametars
92     foreach my $var (qw(
93     dbi_dbd
94     dbi_dsn
95     dbi_user
96     dbi_passwd
97     show_progress
98     my_unac_filter
99 dpavlin 370 output_template
100 dpavlin 352 )) {
101 dpavlin 370 $self->{'global_config'}->{$var} = $config->val('global', $var);
102 dpavlin 352 }
103    
104 dpavlin 353 #
105     # read indexer config file
106     #
107 dpavlin 352
108 dpavlin 372 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
109 dpavlin 352
110 dpavlin 370 # create UTF-8 convertor for import_xml files
111 dpavlin 366 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
112 dpavlin 370
113     # create Template toolkit instance
114     $self->{'tt'} = Template->new(
115     INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
116 dpavlin 560 FILTERS => $self->{'filter'},
117 dpavlin 370 EVAL_PERL => 1,
118     );
119    
120 dpavlin 421 # running with low_mem flag? well, use DBM::Deep then.
121     if ($self->{'low_mem'}) {
122 dpavlin 422 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
123 dpavlin 421
124     my $db_file = "data.db";
125    
126     if (-e $db_file) {
127     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
128     $log->debug("removed '$db_file' from last run");
129     }
130    
131 dpavlin 434 require DBM::Deep;
132 dpavlin 421
133     my $db = new DBM::Deep $db_file;
134    
135     $log->logdie("DBM::Deep error: $!") unless ($db);
136    
137     if ($db->error()) {
138     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
139     } else {
140 dpavlin 422 $log->debug("using file '$db_file' for DBM::Deep");
141 dpavlin 421 }
142    
143     $self->{'db'} = $db;
144     }
145    
146 dpavlin 500 $log->debug("filters defined: ",Dumper($self->{'filter'}));
147    
148 dpavlin 352 return $self;
149     }
150    
151     =head2 open_isis
152    
153 dpavlin 609 Open CDS/ISIS, WinISIS or IsisMarc database using IsisDB or OpenIsis module
154     and read all records to memory.
155 dpavlin 352
156     $webpac->open_isis(
157     filename => '/data/ISIS/ISIS',
158     code_page => '852',
159 dpavlin 431 limit_mfn => 500,
160     start_mfn => 6000,
161 dpavlin 352 lookup => [ ... ],
162     );
163    
164     By default, ISIS code page is assumed to be C<852>.
165    
166 dpavlin 431 If optional parametar C<start_mfn> is set, this will be first MFN to read
167     from database (so you can skip beginning of your database if you need to).
168    
169 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
170     from database in example above.
171 dpavlin 352
172     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
173     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
174     value in index.
175    
176     lookup => [
177     { 'key' => 'd:v900', 'val' => 'v250^a' },
178     { 'eval' => '"v901^a" eq "Podruèje"',
179     'key' => 'pa:v561^4:v562^4:v461^1',
180     'val' => 'v900' },
181     ]
182    
183 dpavlin 367 Returns number of last record read into memory (size of database, really).
184    
185 dpavlin 352 =cut
186    
187     sub open_isis {
188     my $self = shift;
189     my $arg = {@_};
190    
191 dpavlin 372 my $log = $self->_get_logger();
192    
193     $log->logcroak("need filename") if (! $arg->{'filename'});
194 dpavlin 352 my $code_page = $arg->{'code_page'} || '852';
195    
196 dpavlin 436 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
197    
198 dpavlin 421 # store data in object
199     $self->{'isis_filename'} = $arg->{'filename'};
200     $self->{'isis_code_page'} = $code_page;
201    
202 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
203    
204     # create Text::Iconv object
205     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
206    
207 dpavlin 372 $log->info("reading ISIS database '",$arg->{'filename'},"'");
208 dpavlin 421 $log->debug("isis code page: $code_page");
209 dpavlin 363
210 dpavlin 609 my $use_openisis = 1;
211 dpavlin 352
212 dpavlin 609 eval { use IsisDB 0.06; };
213     $use_openisis = 0 unless ($@);
214    
215     my ($isis_db,$maxmfn);
216    
217     if ($use_openisis) {
218     use OpenIsis;
219     $isis_db = OpenIsis::open($arg->{'filename'});
220     $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
221     } else {
222     $isis_db = new IsisDB(
223     isisdb => $arg->{'filename'},
224     include_deleted => 1,
225     hash_filter => sub {
226     my $l = shift || return;
227     $l = $cp->convert($l);
228     return $l;
229     },
230     );
231     $maxmfn = $isis_db->{'maxmfn'};
232     }
233    
234    
235 dpavlin 431 my $startmfn = 1;
236 dpavlin 352
237 dpavlin 431 if (my $s = $self->{'start_mfn'}) {
238     $log->info("skipping to MFN $s");
239     $startmfn = $s;
240 dpavlin 448 } else {
241     $self->{'start_mfn'} = $startmfn;
242 dpavlin 431 }
243 dpavlin 357
244 dpavlin 431 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
245 dpavlin 363
246 dpavlin 609 $log->info("processing ",($maxmfn-$startmfn)." records using ",( $use_openisis ? 'OpenIsis' : 'IsisDB'));
247 dpavlin 431
248 dpavlin 609
249 dpavlin 352 # read database
250 dpavlin 431 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
251 dpavlin 352
252 dpavlin 418
253     $log->debug("mfn: $mfn\n");
254    
255 dpavlin 421 my $rec;
256    
257 dpavlin 609 if ($use_openisis) {
258    
259     # read record using OpenIsis
260     my $row = OpenIsis::read( $isis_db, $mfn );
261     foreach my $k (keys %{$row}) {
262     if ($k ne "mfn") {
263     foreach my $l (@{$row->{$k}}) {
264     $l = $cp->convert($l);
265     # has subfields?
266     my $val;
267     if ($l =~ m/\^/) {
268     foreach my $t (split(/\^/,$l)) {
269     next if (! $t);
270     $val->{substr($t,0,1)} = substr($t,1);
271     }
272     } else {
273     $val = $l;
274 dpavlin 352 }
275 dpavlin 609
276     push @{$rec->{$k}}, $val;
277 dpavlin 352 }
278 dpavlin 609 } else {
279     push @{$rec->{'000'}}, $mfn;
280 dpavlin 352 }
281     }
282    
283 dpavlin 609 } else {
284     $rec = $isis_db->to_hash($mfn);
285 dpavlin 352 }
286    
287 dpavlin 421 $log->confess("record $mfn empty?") unless ($rec);
288    
289     # store
290     if ($self->{'low_mem'}) {
291     $self->{'db'}->put($mfn, $rec);
292     } else {
293     $self->{'data'}->{$mfn} = $rec;
294     }
295    
296 dpavlin 352 # create lookup
297 dpavlin 355 $self->create_lookup($rec, @{$arg->{'lookup'}});
298 dpavlin 352
299 dpavlin 389 $self->progress_bar($mfn,$maxmfn);
300    
301 dpavlin 352 }
302    
303 dpavlin 448 $self->{'current_mfn'} = -1;
304 dpavlin 389 $self->{'last_pcnt'} = 0;
305 dpavlin 362
306 dpavlin 418 $log->debug("max mfn: $maxmfn");
307    
308 dpavlin 352 # store max mfn and return it.
309     return $self->{'max_mfn'} = $maxmfn;
310     }
311    
312 dpavlin 362 =head2 fetch_rec
313    
314     Fetch next record from database. It will also display progress bar (once
315     it's implemented, that is).
316    
317     my $rec = $webpac->fetch_rec;
318    
319     =cut
320    
321     sub fetch_rec {
322     my $self = shift;
323    
324 dpavlin 372 my $log = $self->_get_logger();
325 dpavlin 362
326 dpavlin 448 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
327 dpavlin 372
328 dpavlin 448 if ($self->{'current_mfn'} == -1) {
329     $self->{'current_mfn'} = $self->{'start_mfn'};
330     } else {
331     $self->{'current_mfn'}++;
332     }
333    
334     my $mfn = $self->{'current_mfn'};
335    
336 dpavlin 362 if ($mfn > $self->{'max_mfn'}) {
337     $self->{'current_mfn'} = $self->{'max_mfn'};
338 dpavlin 373 $log->debug("at EOF");
339 dpavlin 362 return;
340     }
341    
342 dpavlin 389 $self->progress_bar($mfn,$self->{'max_mfn'});
343    
344 dpavlin 421 if ($self->{'low_mem'}) {
345     return $self->{'db'}->get($mfn);
346     } else {
347     return $self->{'data'}->{$mfn};
348     }
349 dpavlin 362 }
350    
351 dpavlin 448 =head2 mfn
352    
353     Returns current record number (MFN).
354    
355     print $webpac->mfn;
356    
357     =cut
358    
359     sub mfn {
360     my $self = shift;
361     return $self->{'current_mfn'};
362     }
363    
364 dpavlin 389 =head2 progress_bar
365    
366     Draw progress bar on STDERR.
367    
368     $webpac->progress_bar($current, $max);
369    
370     =cut
371    
372     sub progress_bar {
373     my $self = shift;
374    
375     my ($curr,$max) = @_;
376    
377     my $log = $self->_get_logger();
378    
379     $log->logconfess("no current value!") if (! $curr);
380     $log->logconfess("no maximum value!") if (! $max);
381    
382     if ($curr > $max) {
383     $max = $curr;
384     $log->debug("overflow to $curr");
385     }
386    
387     $self->{'last_pcnt'} ||= 1;
388    
389 dpavlin 454 my $p = int($curr * 100 / $max) || 1;
390 dpavlin 389
391 dpavlin 422 # reset on re-run
392     if ($p < $self->{'last_pcnt'}) {
393     $self->{'last_pcnt'} = $p;
394 dpavlin 609 $self->{'start_t'} = time();
395 dpavlin 422 }
396    
397 dpavlin 389 if ($p != $self->{'last_pcnt'}) {
398 dpavlin 422
399     my $t = time();
400 dpavlin 609 my $rate = ($curr / ($t - $self->{'start_t'} || 1));
401 dpavlin 422 my $eta = ($max-$curr) / ($rate || 1);
402     printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
403 dpavlin 389 $self->{'last_pcnt'} = $p;
404 dpavlin 422 $self->{'last_curr'} = $curr;
405 dpavlin 389 }
406 dpavlin 412 print STDERR "\n" if ($p == 100);
407 dpavlin 389 }
408    
409 dpavlin 422 =head2 fmt_time
410    
411     Format time (in seconds) for display.
412    
413     print $webpac->fmt_time(time());
414    
415     This method is called by L<progress_bar> to display remaining time.
416    
417     =cut
418    
419     sub fmt_time {
420     my $self = shift;
421    
422     my $t = shift || 0;
423     my $out = "";
424    
425     my ($ss,$mm,$hh) = gmtime($t);
426     $out .= "${hh}h" if ($hh);
427     $out .= sprintf("%02d:%02d", $mm,$ss);
428     $out .= " " if ($hh == 0);
429     return $out;
430     }
431    
432 dpavlin 363 =head2 open_import_xml
433    
434     Read file from C<import_xml/> directory and parse it.
435    
436     $webpac->open_import_xml(type => 'isis');
437    
438     =cut
439    
440     sub open_import_xml {
441     my $self = shift;
442    
443 dpavlin 372 my $log = $self->_get_logger();
444    
445 dpavlin 363 my $arg = {@_};
446 dpavlin 372 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
447 dpavlin 363
448 dpavlin 366 $self->{'type'} = $arg->{'type'};
449 dpavlin 363
450 dpavlin 366 my $type_base = $arg->{'type'};
451 dpavlin 363 $type_base =~ s/_.*$//g;
452    
453 dpavlin 366 $self->{'tag'} = $type2tag{$type_base};
454    
455 dpavlin 375 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
456 dpavlin 366
457     my $f = "./import_xml/".$self->{'type'}.".xml";
458 dpavlin 372 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
459 dpavlin 363
460 dpavlin 375 $log->info("reading '$f'");
461 dpavlin 363
462 dpavlin 375 $self->{'import_xml_file'} = $f;
463    
464 dpavlin 363 $self->{'import_xml'} = XMLin($f,
465 dpavlin 366 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
466 dpavlin 363 );
467    
468 dpavlin 375 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
469    
470 dpavlin 363 }
471    
472 dpavlin 355 =head2 create_lookup
473    
474     Create lookup from record using lookup definition.
475    
476 dpavlin 367 $self->create_lookup($rec, @lookups);
477    
478     Called internally by C<open_*> methods.
479    
480 dpavlin 355 =cut
481    
482     sub create_lookup {
483     my $self = shift;
484    
485 dpavlin 372 my $log = $self->_get_logger();
486 dpavlin 355
487 dpavlin 372 my $rec = shift || $log->logconfess("need record to create lookup");
488     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
489    
490 dpavlin 355 foreach my $i (@_) {
491 dpavlin 418 $log->logconfess("need key") unless defined($i->{'key'});
492     $log->logconfess("need val") unless defined($i->{'val'});
493    
494     if (defined($i->{'eval'})) {
495     # eval first, so we can skip fill_in for key and val
496     my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
497     if ($self->_eval($eval)) {
498     my $key = $self->fill_in($rec,$i->{'key'}) || next;
499     my @val = $self->fill_in($rec,$i->{'val'}) || next;
500 dpavlin 373 $log->debug("stored $key = ",sub { join(" | ",@val) });
501 dpavlin 355 push @{$self->{'lookup'}->{$key}}, @val;
502     }
503     } else {
504 dpavlin 418 my $key = $self->fill_in($rec,$i->{'key'}) || next;
505     my @val = $self->fill_in($rec,$i->{'val'}) || next;
506     $log->debug("stored $key = ",sub { join(" | ",@val) });
507     push @{$self->{'lookup'}->{$key}}, @val;
508 dpavlin 355 }
509     }
510     }
511    
512 dpavlin 356 =head2 get_data
513    
514     Returns value from record.
515    
516 dpavlin 367 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
517 dpavlin 356
518     Arguments are:
519     record reference C<$rec>,
520     field C<$f>,
521     optional subfiled C<$sf>,
522     index for repeatable values C<$i>.
523    
524 dpavlin 367 Optinal variable C<$found> will be incremeted if there
525 dpavlin 356 is field.
526    
527     Returns value or empty string.
528    
529     =cut
530    
531     sub get_data {
532     my $self = shift;
533    
534     my ($rec,$f,$sf,$i,$found) = @_;
535 dpavlin 367
536 dpavlin 356 if ($$rec->{$f}) {
537 dpavlin 367 return '' if (! $$rec->{$f}->[$i]);
538 dpavlin 389 no strict 'refs';
539 dpavlin 356 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
540     $$found++ if (defined($$found));
541     return $$rec->{$f}->[$i]->{$sf};
542     } elsif ($$rec->{$f}->[$i]) {
543     $$found++ if (defined($$found));
544 dpavlin 366 # it still might have subfield, just
545     # not specified, so we'll dump all
546     if ($$rec->{$f}->[$i] =~ /HASH/o) {
547     my $out;
548     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
549     $out .= $$rec->{$f}->[$i]->{$k}." ";
550     }
551     return $out;
552     } else {
553     return $$rec->{$f}->[$i];
554     }
555 dpavlin 356 }
556     } else {
557     return '';
558     }
559     }
560    
561 dpavlin 352 =head2 fill_in
562    
563     Workhourse of all: takes record from in-memory structure of database and
564     strings with placeholders and returns string or array of with substituted
565     values from record.
566    
567 dpavlin 367 my $text = $webpac->fill_in($rec,'v250^a');
568 dpavlin 352
569     Optional argument is ordinal number for repeatable fields. By default,
570 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
571     element is 0).
572     Following example will read second value from repeatable field.
573 dpavlin 352
574 dpavlin 367 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
575 dpavlin 353
576     This function B<does not> perform parsing of format to inteligenty skip
577     delimiters before fields which aren't used.
578    
579 dpavlin 376 This method will automatically decode UTF-8 string to local code page
580     if needed.
581    
582 dpavlin 352 =cut
583    
584 dpavlin 372 sub fill_in {
585 dpavlin 371 my $self = shift;
586    
587 dpavlin 372 my $log = $self->_get_logger();
588 dpavlin 371
589 dpavlin 372 my $rec = shift || $log->logconfess("need data record");
590     my $format = shift || $log->logconfess("need format to parse");
591 dpavlin 352 # iteration (for repeatable fields)
592     my $i = shift || 0;
593    
594 dpavlin 555 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
595    
596 dpavlin 352 # FIXME remove for speedup?
597 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
598 dpavlin 352
599 dpavlin 376 if (utf8::is_utf8($format)) {
600     $format = $self->_x($format);
601     }
602    
603 dpavlin 352 my $found = 0;
604    
605 dpavlin 359 my $eval_code;
606     # remove eval{...} from beginning
607     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
608    
609 dpavlin 500 my $filter_name;
610     # remove filter{...} from beginning
611     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
612    
613 dpavlin 352 # do actual replacement of placeholders
614 dpavlin 555 # repeatable fields
615 dpavlin 373 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
616 dpavlin 555 # non-repeatable fields
617     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
618 dpavlin 352
619 dpavlin 353 if ($found) {
620 dpavlin 373 $log->debug("format: $format");
621 dpavlin 359 if ($eval_code) {
622     my $eval = $self->fill_in($rec,$eval_code,$i);
623 dpavlin 371 return if (! $self->_eval($eval));
624 dpavlin 359 }
625 dpavlin 500 if ($filter_name && $self->{'filter'}->{$filter_name}) {
626     $log->debug("filter '$filter_name' for $format");
627     $format = $self->{'filter'}->{$filter_name}->($format);
628     return unless(defined($format));
629     $log->debug("filter result: $format");
630     }
631 dpavlin 353 # do we have lookups?
632 dpavlin 373 if ($format =~ /$LOOKUP_REGEX/o) {
633     $log->debug("format '$format' has lookup");
634 dpavlin 353 return $self->lookup($format);
635     } else {
636     return $format;
637     }
638 dpavlin 352 } else {
639     return;
640     }
641     }
642    
643     =head2 lookup
644    
645 dpavlin 355 Perform lookups on format supplied to it.
646 dpavlin 352
647 dpavlin 367 my $text = $self->lookup('[v900]');
648 dpavlin 352
649 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
650    
651 dpavlin 352 =cut
652    
653     sub lookup {
654     my $self = shift;
655    
656 dpavlin 372 my $log = $self->_get_logger();
657 dpavlin 352
658 dpavlin 372 my $tmp = shift || $log->logconfess("need format");
659    
660 dpavlin 373 if ($tmp =~ /$LOOKUP_REGEX/o) {
661 dpavlin 352 my @in = ( $tmp );
662 dpavlin 372
663     $log->debug("lookup for: ",$tmp);
664    
665 dpavlin 352 my @out;
666     while (my $f = shift @in) {
667 dpavlin 373 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
668 dpavlin 352 my $k = $1;
669     if ($self->{'lookup'}->{$k}) {
670     foreach my $nv (@{$self->{'lookup'}->{$k}}) {
671     my $tmp2 = $f;
672 dpavlin 373 $tmp2 =~ s/lookup{$k}/$nv/g;
673 dpavlin 352 push @in, $tmp2;
674     }
675     } else {
676     undef $f;
677     }
678     } elsif ($f) {
679     push @out, $f;
680     }
681     }
682 dpavlin 373 $log->logconfess("return is array and it's not expected!") unless wantarray;
683 dpavlin 352 return @out;
684     } else {
685     return $tmp;
686     }
687     }
688    
689 dpavlin 356 =head2 parse
690    
691     Perform smart parsing of string, skipping delimiters for fields which aren't
692     defined. It can also eval code in format starting with C<eval{...}> and
693     return output or nothing depending on eval code.
694    
695 dpavlin 367 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
696 dpavlin 356
697     =cut
698    
699     sub parse {
700     my $self = shift;
701    
702 dpavlin 366 my ($rec, $format_utf8, $i) = @_;
703 dpavlin 356
704 dpavlin 366 return if (! $format_utf8);
705    
706 dpavlin 372 my $log = $self->_get_logger();
707 dpavlin 358
708 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
709     $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
710    
711 dpavlin 358 $i = 0 if (! $i);
712    
713 dpavlin 375 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
714 dpavlin 366
715 dpavlin 356 my @out;
716    
717 dpavlin 373 $log->debug("format: $format");
718    
719 dpavlin 356 my $eval_code;
720     # remove eval{...} from beginning
721     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
722    
723 dpavlin 500 my $filter_name;
724     # remove filter{...} from beginning
725     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
726    
727 dpavlin 358 my $prefix;
728     my $all_found=0;
729 dpavlin 356
730 dpavlin 555 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
731 dpavlin 356
732 dpavlin 358 my $del = $1 || '';
733 dpavlin 359 $prefix ||= $del if ($all_found == 0);
734 dpavlin 358
735 dpavlin 555 # repeatable index
736     my $r = $i;
737     $r = 0 if (lc("$2") eq 's');
738    
739 dpavlin 358 my $found = 0;
740 dpavlin 555 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
741 dpavlin 358
742 dpavlin 356 if ($found) {
743 dpavlin 358 push @out, $del;
744     push @out, $tmp;
745     $all_found += $found;
746 dpavlin 356 }
747     }
748    
749 dpavlin 358 return if (! $all_found);
750 dpavlin 356
751 dpavlin 373 my $out = join('',@out);
752 dpavlin 358
753 dpavlin 373 if ($out) {
754     # add rest of format (suffix)
755     $out .= $format;
756 dpavlin 367
757 dpavlin 373 # add prefix if not there
758     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
759    
760     $log->debug("result: $out");
761     }
762    
763 dpavlin 359 if ($eval_code) {
764 dpavlin 501 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
765 dpavlin 500 $log->debug("about to eval{$eval} format: $out");
766 dpavlin 371 return if (! $self->_eval($eval));
767 dpavlin 359 }
768 dpavlin 500
769     if ($filter_name && $self->{'filter'}->{$filter_name}) {
770     $log->debug("about to filter{$filter_name} format: $out");
771     $out = $self->{'filter'}->{$filter_name}->($out);
772     return unless(defined($out));
773     $log->debug("filter result: $out");
774     }
775 dpavlin 359
776 dpavlin 358 return $out;
777 dpavlin 356 }
778    
779 dpavlin 367 =head2 parse_to_arr
780    
781     Similar to C<parse>, but returns array of all repeatable fields
782    
783     my @arr = $webpac->parse_to_arr($rec,'v250^a');
784    
785     =cut
786    
787     sub parse_to_arr {
788     my $self = shift;
789    
790     my ($rec, $format_utf8) = @_;
791    
792 dpavlin 372 my $log = $self->_get_logger();
793    
794     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
795 dpavlin 367 return if (! $format_utf8);
796    
797     my $i = 0;
798     my @arr;
799    
800     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
801     push @arr, $v;
802     }
803    
804 dpavlin 373 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
805    
806 dpavlin 367 return @arr;
807     }
808    
809 dpavlin 373 =head2 fill_in_to_arr
810    
811     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
812     for fields which have lookups, so they shouldn't be parsed but rather
813     C<fill_id>ed.
814    
815     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
816    
817     =cut
818    
819     sub fill_in_to_arr {
820     my $self = shift;
821    
822     my ($rec, $format_utf8) = @_;
823    
824     my $log = $self->_get_logger();
825    
826     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
827     return if (! $format_utf8);
828    
829     my $i = 0;
830     my @arr;
831    
832     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
833     push @arr, @v;
834     }
835    
836     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
837    
838     return @arr;
839     }
840    
841 dpavlin 441 =head2 sort_arr
842 dpavlin 373
843 dpavlin 441 Sort array ignoring case and html in data
844    
845     my @sorted = $webpac->sort_arr(@unsorted);
846    
847     =cut
848    
849     sub sort_arr {
850     my $self = shift;
851    
852     my $log = $self->_get_logger();
853    
854     # FIXME add Schwartzian Transformation?
855    
856     my @sorted = sort {
857     $a =~ s#<[^>]+/*>##;
858     $b =~ s#<[^>]+/*>##;
859     lc($b) cmp lc($a)
860     } @_;
861     $log->debug("sorted values: ",sub { join(", ",@sorted) });
862    
863     return @sorted;
864     }
865    
866    
867 dpavlin 366 =head2 data_structure
868    
869     Create in-memory data structure which represents layout from C<import_xml>.
870     It is used later to produce output.
871    
872 dpavlin 368 my @ds = $webpac->data_structure($rec);
873 dpavlin 366
874 dpavlin 374 This method will also set C<$webpac->{'currnet_filename'}> if there is
875 dpavlin 398 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
876     <headline> tag.
877 dpavlin 374
878 dpavlin 366 =cut
879    
880 dpavlin 372 sub data_structure {
881 dpavlin 366 my $self = shift;
882    
883 dpavlin 372 my $log = $self->_get_logger();
884 dpavlin 366
885     my $rec = shift;
886 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
887 dpavlin 366
888 dpavlin 374 undef $self->{'currnet_filename'};
889 dpavlin 398 undef $self->{'headline'};
890 dpavlin 374
891 dpavlin 366 my @sorted_tags;
892     if ($self->{tags_by_order}) {
893     @sorted_tags = @{$self->{tags_by_order}};
894     } else {
895     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
896     $self->{tags_by_order} = \@sorted_tags;
897     }
898    
899 dpavlin 368 my @ds;
900 dpavlin 366
901 dpavlin 373 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
902    
903 dpavlin 366 foreach my $field (@sorted_tags) {
904    
905     my $row;
906    
907     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
908    
909     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
910 dpavlin 373 my $format = $tag->{'value'} || $tag->{'content'};
911 dpavlin 366
912 dpavlin 373 $log->debug("format: $format");
913    
914     my @v;
915     if ($format =~ /$LOOKUP_REGEX/o) {
916     @v = $self->fill_in_to_arr($rec,$format);
917     } else {
918     @v = $self->parse_to_arr($rec,$format);
919     }
920 dpavlin 367 next if (! @v);
921 dpavlin 366
922 dpavlin 439 if ($tag->{'sort'}) {
923 dpavlin 441 @v = $self->sort_arr(@v);
924 dpavlin 439 }
925    
926 dpavlin 375 # use format?
927     if ($tag->{'format_name'}) {
928     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
929     }
930    
931 dpavlin 398 if ($field eq 'filename') {
932     $self->{'current_filename'} = join('',@v);
933     $log->debug("filename: ",$self->{'current_filename'});
934     } elsif ($field eq 'headline') {
935     $self->{'headline'} .= join('',@v);
936     $log->debug("headline: ",$self->{'headline'});
937     next; # don't return headline in data_structure!
938     }
939    
940 dpavlin 439 # delimiter will join repeatable fields
941     if ($tag->{'delimiter'}) {
942     @v = ( join($tag->{'delimiter'}, @v) );
943 dpavlin 366 }
944 dpavlin 373
945 dpavlin 439 # default types
946     my @types = qw(display swish);
947     # override by type attribute
948     @types = ( $tag->{'type'} ) if ($tag->{'type'});
949 dpavlin 374
950 dpavlin 439 foreach my $type (@types) {
951     # append to previous line?
952     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
953     if ($tag->{'append'}) {
954    
955     # I will delimit appended part with
956     # delimiter (or ,)
957     my $d = $tag->{'delimiter'};
958     # default delimiter
959 dpavlin 501 $d ||= " ";
960 dpavlin 439
961     my $last = pop @{$row->{$type}};
962     $d = "" if (! $last);
963     $last .= $d . join($d, @v);
964     push @{$row->{$type}}, $last;
965    
966     } else {
967     push @{$row->{$type}}, @v;
968     }
969     }
970    
971    
972 dpavlin 366 }
973    
974 dpavlin 368 if ($row) {
975     $row->{'tag'} = $field;
976 dpavlin 375
977     # TODO: name_sigular, name_plural
978     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
979     $row->{'name'} = $name ? $self->_x($name) : $field;
980    
981 dpavlin 439 # post-sort all values in field
982     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
983     $log->warn("sort at field tag not implemented");
984     }
985    
986 dpavlin 368 push @ds, $row;
987 dpavlin 374
988 dpavlin 373 $log->debug("row $field: ",sub { Dumper($row) });
989 dpavlin 368 }
990 dpavlin 366
991     }
992    
993 dpavlin 370 return @ds;
994 dpavlin 366
995     }
996    
997 dpavlin 370 =head2 output
998    
999     Create output from in-memory data structure using Template Toolkit template.
1000    
1001     my $text = $webpac->output( template => 'text.tt', data => @ds );
1002    
1003     =cut
1004    
1005     sub output {
1006     my $self = shift;
1007    
1008     my $args = {@_};
1009    
1010 dpavlin 372 my $log = $self->_get_logger();
1011 dpavlin 370
1012 dpavlin 372 $log->logconfess("need template name") if (! $args->{'template'});
1013     $log->logconfess("need data array") if (! $args->{'data'});
1014    
1015 dpavlin 370 my $out;
1016    
1017     $self->{'tt'}->process(
1018     $args->{'template'},
1019     $args,
1020     \$out
1021     ) || confess $self->{'tt'}->error();
1022    
1023     return $out;
1024     }
1025    
1026 dpavlin 411 =head2 output_file
1027    
1028     Create output from in-memory data structure using Template Toolkit template
1029     to a file.
1030    
1031     $webpac->output_file(
1032     file => 'out.txt',
1033     template => 'text.tt',
1034     data => @ds
1035     );
1036    
1037     =cut
1038    
1039     sub output_file {
1040     my $self = shift;
1041    
1042     my $args = {@_};
1043    
1044     my $log = $self->_get_logger();
1045    
1046 dpavlin 421 my $file = $args->{'file'} || $log->logconfess("need file name");
1047 dpavlin 411
1048 dpavlin 421 $log->debug("creating file ",$file);
1049 dpavlin 411
1050 dpavlin 421 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1051 dpavlin 411 print $fh $self->output(
1052     template => $args->{'template'},
1053     data => $args->{'data'},
1054     ) || $log->logdie("print: $!");
1055     close($fh) || $log->logdie("close: $!");
1056     }
1057    
1058 dpavlin 375 =head2 apply_format
1059    
1060     Apply format specified in tag with C<format_name="name"> and
1061     C<format_delimiter=";;">.
1062    
1063     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1064    
1065     Formats can contain C<lookup{...}> if you need them.
1066    
1067     =cut
1068    
1069     sub apply_format {
1070     my $self = shift;
1071    
1072     my ($name,$delimiter,$data) = @_;
1073    
1074     my $log = $self->_get_logger();
1075    
1076     if (! $self->{'import_xml'}->{'format'}->{$name}) {
1077     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1078     return $data;
1079     }
1080    
1081     $log->warn("no delimiter for format $name") if (! $delimiter);
1082    
1083     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1084    
1085     my @data = split(/\Q$delimiter\E/, $data);
1086    
1087     my $out = sprintf($format, @data);
1088     $log->debug("using format $name [$format] on $data to produce: $out");
1089    
1090     if ($out =~ m/$LOOKUP_REGEX/o) {
1091     return $self->lookup($out);
1092     } else {
1093     return $out;
1094     }
1095    
1096     }
1097    
1098    
1099 dpavlin 372 #
1100     #
1101     #
1102    
1103     =head1 INTERNAL METHODS
1104    
1105     Here is a quick list of internal methods, mostly useful to turn debugging
1106     on them (see L<LOGGING> below for explanation).
1107    
1108     =cut
1109    
1110     =head2 _eval
1111    
1112     Internal function to eval code without C<strict 'subs'>.
1113    
1114     =cut
1115    
1116     sub _eval {
1117     my $self = shift;
1118    
1119     my $code = shift || return;
1120    
1121     my $log = $self->_get_logger();
1122    
1123     no strict 'subs';
1124     my $ret = eval $code;
1125     if ($@) {
1126     $log->error("problem with eval code [$code]: $@");
1127     }
1128    
1129     $log->debug("eval: ",$code," [",$ret,"]");
1130    
1131 dpavlin 500 return $ret || undef;
1132 dpavlin 372 }
1133    
1134     =head2 _sort_by_order
1135    
1136     Sort xml tags data structure accoding to C<order=""> attribute.
1137    
1138     =cut
1139    
1140     sub _sort_by_order {
1141     my $self = shift;
1142    
1143     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1144     $self->{'import_xml'}->{'indexer'}->{$a};
1145     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1146     $self->{'import_xml'}->{'indexer'}->{$b};
1147    
1148     return $va <=> $vb;
1149     }
1150    
1151 dpavlin 375 =head2 _get_logger
1152    
1153     Get C<Log::Log4perl> object with a twist: domains are defined for each
1154     method
1155    
1156     my $log = $webpac->_get_logger();
1157    
1158     =cut
1159    
1160 dpavlin 372 sub _get_logger {
1161     my $self = shift;
1162    
1163 dpavlin 374 my $name = (caller(1))[3] || caller;
1164     return get_logger($name);
1165 dpavlin 372 }
1166    
1167 dpavlin 375 =head2 _x
1168    
1169     Convert string from UTF-8 to code page defined in C<import_xml>.
1170    
1171     my $text = $webpac->_x('utf8 text');
1172    
1173     =cut
1174    
1175     sub _x {
1176     my $self = shift;
1177     my $utf8 = shift || return;
1178    
1179     return $self->{'utf2cp'}->convert($utf8) ||
1180     $self->_get_logger()->logwarn("can't convert '$utf8'");
1181     }
1182    
1183 dpavlin 372 #
1184     #
1185     #
1186    
1187     =head1 LOGGING
1188    
1189     Logging in WebPAC is performed by L<Log::Log4perl> with config file
1190     C<log.conf>.
1191    
1192     Methods defined above have different levels of logging, so
1193     it's descriptions will be useful to turn (mostry B<debug> logging) on
1194     or off to see why WabPAC isn't perforing as you expect it (it might even
1195     be a bug!).
1196    
1197     B<This is different from normal Log4perl behaviour>. To repeat, you can
1198     also use method names, and not only classes (which are just few)
1199     to filter logging.
1200    
1201 dpavlin 422
1202     =head1 MEMORY USAGE
1203    
1204     C<low_mem> options is double-edged sword. If enabled, WebPAC
1205     will run on memory constraint machines (which doesn't have enough
1206     physical RAM to create memory structure for whole source database).
1207    
1208     If your machine has 512Mb or more of RAM and database is around 10000 records,
1209     memory shouldn't be an issue. If you don't have enough physical RAM, you
1210     might consider using virtual memory (if your operating system is handling it
1211     well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1212     parsed structure of ISIS database (this is what C<low_mem> option does).
1213    
1214     Hitting swap at end of reading source database is probably o.k. However,
1215     hitting swap before 90% will dramatically decrease performance and you will
1216     be better off with C<low_mem> and using rest of availble memory for
1217     operating system disk cache (Linux is particuallary good about this).
1218     However, every access to database record will require disk access, so
1219     generation phase will be slower 10-100 times.
1220    
1221     Parsed structures are essential - you just have option to trade RAM memory
1222     (which is fast) for disk space (which is slow). Be sure to have planty of
1223     disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1224    
1225     However, when WebPAC is running on desktop machines (or laptops :-), it's
1226     highly undesireable for system to start swapping. Using C<low_mem> option can
1227     reduce WecPAC memory usage to around 64Mb for same database with lookup
1228     fields and sorted indexes which stay in RAM. Performance will suffer, but
1229     memory usage will really be minimal. It might be also more confortable to
1230     run WebPAC reniced on those machines.
1231    
1232 dpavlin 372 =cut
1233    
1234 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26