/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 560 - (hide annotations)
Sat Oct 30 23:04:37 2004 UTC (19 years, 5 months ago) by dpavlin
File size: 27648 byte(s)
removed upper case letters from encoding of index nodes (to support Windows
which is case-insensitive), added support for filters into TT templates,
changed crovoc symbol to #C# in data

1 dpavlin 354 package WebPAC;
2 dpavlin 352
3 dpavlin 367 use warnings;
4     use strict;
5    
6 dpavlin 352 use Carp;
7 dpavlin 353 use Text::Iconv;
8     use Config::IniFiles;
9 dpavlin 363 use XML::Simple;
10 dpavlin 370 use Template;
11 dpavlin 372 use Log::Log4perl qw(get_logger :levels);
12 dpavlin 422 use Time::HiRes qw(time);
13 dpavlin 352
14 dpavlin 358 use Data::Dumper;
15    
16 dpavlin 373 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17     #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18     my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19     my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20    
21 dpavlin 352 =head1 NAME
22    
23 dpavlin 354 WebPAC - base class for WebPAC
24 dpavlin 352
25     =head1 DESCRIPTION
26    
27 dpavlin 354 This module implements methods used by WebPAC.
28 dpavlin 352
29     =head1 METHODS
30    
31     =head2 new
32    
33 dpavlin 421 Create new instance of WebPAC using configuration specified by C<config_file>.
34 dpavlin 352
35 dpavlin 354 my $webpac = new WebPAC(
36 dpavlin 352 config_file => 'name.conf',
37 dpavlin 439 code_page => 'ISO-8859-2',
38     low_mem => 1,
39 dpavlin 500 filter => {
40     'lower' => sub { lc($_[0]) },
41     },
42 dpavlin 352 );
43    
44     Default C<code_page> is C<ISO-8859-2>.
45    
46 dpavlin 422 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
47    
48 dpavlin 500 There is optinal parametar C<filter> which specify different filters which
49     can be applied using C<filter{name}> notation.
50 dpavlin 560 Same filters can be used in Template Toolkit files.
51 dpavlin 500
52 dpavlin 421 This method will also read configuration files
53 dpavlin 353 C<global.conf> (used by indexer and Web font-end)
54     and configuration file specified by C<config_file>
55     which describes databases to be indexed.
56    
57 dpavlin 352 =cut
58    
59 dpavlin 363 # mapping between data type and tag which specify
60     # format in XML file
61     my %type2tag = (
62     'isis' => 'isis',
63     # 'excel' => 'column',
64     # 'marc' => 'marc',
65     # 'feed' => 'feed'
66     );
67    
68 dpavlin 352 sub new {
69     my $class = shift;
70     my $self = {@_};
71     bless($self, $class);
72    
73 dpavlin 422 $self->{'start_t'} = time();
74    
75 dpavlin 372 my $log_file = $self->{'log'} || "log.conf";
76     Log::Log4perl->init($log_file);
77    
78     my $log = $self->_get_logger();
79    
80 dpavlin 352 # fill in default values
81     # output codepage
82     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
83    
84 dpavlin 353 #
85     # read global.conf
86     #
87 dpavlin 372 $log->debug("read 'global.conf'");
88 dpavlin 352
89 dpavlin 372 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
90 dpavlin 352
91     # read global config parametars
92     foreach my $var (qw(
93     dbi_dbd
94     dbi_dsn
95     dbi_user
96     dbi_passwd
97     show_progress
98     my_unac_filter
99 dpavlin 370 output_template
100 dpavlin 352 )) {
101 dpavlin 370 $self->{'global_config'}->{$var} = $config->val('global', $var);
102 dpavlin 352 }
103    
104 dpavlin 353 #
105     # read indexer config file
106     #
107 dpavlin 352
108 dpavlin 372 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
109 dpavlin 352
110 dpavlin 370 # create UTF-8 convertor for import_xml files
111 dpavlin 366 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
112 dpavlin 370
113     # create Template toolkit instance
114     $self->{'tt'} = Template->new(
115     INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
116 dpavlin 560 FILTERS => $self->{'filter'},
117 dpavlin 370 EVAL_PERL => 1,
118     );
119    
120 dpavlin 421 # running with low_mem flag? well, use DBM::Deep then.
121     if ($self->{'low_mem'}) {
122 dpavlin 422 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
123 dpavlin 421
124     my $db_file = "data.db";
125    
126     if (-e $db_file) {
127     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
128     $log->debug("removed '$db_file' from last run");
129     }
130    
131 dpavlin 434 require DBM::Deep;
132 dpavlin 421
133     my $db = new DBM::Deep $db_file;
134    
135     $log->logdie("DBM::Deep error: $!") unless ($db);
136    
137     if ($db->error()) {
138     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
139     } else {
140 dpavlin 422 $log->debug("using file '$db_file' for DBM::Deep");
141 dpavlin 421 }
142    
143     $self->{'db'} = $db;
144     }
145    
146 dpavlin 500 $log->debug("filters defined: ",Dumper($self->{'filter'}));
147    
148 dpavlin 352 return $self;
149     }
150    
151     =head2 open_isis
152    
153     Open CDS/ISIS database using OpenIsis module and read all records to memory.
154    
155     $webpac->open_isis(
156     filename => '/data/ISIS/ISIS',
157     code_page => '852',
158 dpavlin 431 limit_mfn => 500,
159     start_mfn => 6000,
160 dpavlin 352 lookup => [ ... ],
161     );
162    
163     By default, ISIS code page is assumed to be C<852>.
164    
165 dpavlin 431 If optional parametar C<start_mfn> is set, this will be first MFN to read
166     from database (so you can skip beginning of your database if you need to).
167    
168 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
169     from database in example above.
170 dpavlin 352
171     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
172     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
173     value in index.
174    
175     lookup => [
176     { 'key' => 'd:v900', 'val' => 'v250^a' },
177     { 'eval' => '"v901^a" eq "Podruèje"',
178     'key' => 'pa:v561^4:v562^4:v461^1',
179     'val' => 'v900' },
180     ]
181    
182 dpavlin 367 Returns number of last record read into memory (size of database, really).
183    
184 dpavlin 352 =cut
185    
186     sub open_isis {
187     my $self = shift;
188     my $arg = {@_};
189    
190 dpavlin 372 my $log = $self->_get_logger();
191    
192     $log->logcroak("need filename") if (! $arg->{'filename'});
193 dpavlin 352 my $code_page = $arg->{'code_page'} || '852';
194    
195 dpavlin 436 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
196    
197 dpavlin 421 # store data in object
198     $self->{'isis_filename'} = $arg->{'filename'};
199     $self->{'isis_code_page'} = $code_page;
200    
201 dpavlin 353 use OpenIsis;
202    
203 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
204    
205     # create Text::Iconv object
206     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
207    
208 dpavlin 372 $log->info("reading ISIS database '",$arg->{'filename'},"'");
209 dpavlin 421 $log->debug("isis code page: $code_page");
210 dpavlin 363
211 dpavlin 352 my $isis_db = OpenIsis::open($arg->{'filename'});
212    
213     my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
214 dpavlin 431 my $startmfn = 1;
215 dpavlin 352
216 dpavlin 431 if (my $s = $self->{'start_mfn'}) {
217     $log->info("skipping to MFN $s");
218     $startmfn = $s;
219 dpavlin 448 } else {
220     $self->{'start_mfn'} = $startmfn;
221 dpavlin 431 }
222 dpavlin 357
223 dpavlin 431 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
224 dpavlin 363
225 dpavlin 431 $log->info("processing ",($maxmfn-$startmfn)." records...");
226    
227 dpavlin 352 # read database
228 dpavlin 431 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
229 dpavlin 352
230 dpavlin 418
231     $log->debug("mfn: $mfn\n");
232    
233 dpavlin 421 my $rec;
234    
235 dpavlin 352 # read record
236     my $row = OpenIsis::read( $isis_db, $mfn );
237     foreach my $k (keys %{$row}) {
238     if ($k ne "mfn") {
239     foreach my $l (@{$row->{$k}}) {
240     $l = $cp->convert($l);
241     # has subfields?
242     my $val;
243     if ($l =~ m/\^/) {
244     foreach my $t (split(/\^/,$l)) {
245     next if (! $t);
246     $val->{substr($t,0,1)} = substr($t,1);
247     }
248     } else {
249     $val = $l;
250     }
251    
252 dpavlin 421 push @{$rec->{$k}}, $val;
253 dpavlin 352 }
254 dpavlin 374 } else {
255 dpavlin 421 push @{$rec->{'000'}}, $mfn;
256 dpavlin 352 }
257    
258     }
259    
260 dpavlin 421 $log->confess("record $mfn empty?") unless ($rec);
261    
262     # store
263     if ($self->{'low_mem'}) {
264     $self->{'db'}->put($mfn, $rec);
265     } else {
266     $self->{'data'}->{$mfn} = $rec;
267     }
268    
269 dpavlin 352 # create lookup
270 dpavlin 355 $self->create_lookup($rec, @{$arg->{'lookup'}});
271 dpavlin 352
272 dpavlin 389 $self->progress_bar($mfn,$maxmfn);
273    
274 dpavlin 352 }
275    
276 dpavlin 448 $self->{'current_mfn'} = -1;
277 dpavlin 389 $self->{'last_pcnt'} = 0;
278 dpavlin 362
279 dpavlin 418 $log->debug("max mfn: $maxmfn");
280    
281 dpavlin 352 # store max mfn and return it.
282     return $self->{'max_mfn'} = $maxmfn;
283     }
284    
285 dpavlin 362 =head2 fetch_rec
286    
287     Fetch next record from database. It will also display progress bar (once
288     it's implemented, that is).
289    
290     my $rec = $webpac->fetch_rec;
291    
292     =cut
293    
294     sub fetch_rec {
295     my $self = shift;
296    
297 dpavlin 372 my $log = $self->_get_logger();
298 dpavlin 362
299 dpavlin 448 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
300 dpavlin 372
301 dpavlin 448 if ($self->{'current_mfn'} == -1) {
302     $self->{'current_mfn'} = $self->{'start_mfn'};
303     } else {
304     $self->{'current_mfn'}++;
305     }
306    
307     my $mfn = $self->{'current_mfn'};
308    
309 dpavlin 362 if ($mfn > $self->{'max_mfn'}) {
310     $self->{'current_mfn'} = $self->{'max_mfn'};
311 dpavlin 373 $log->debug("at EOF");
312 dpavlin 362 return;
313     }
314    
315 dpavlin 389 $self->progress_bar($mfn,$self->{'max_mfn'});
316    
317 dpavlin 421 if ($self->{'low_mem'}) {
318     return $self->{'db'}->get($mfn);
319     } else {
320     return $self->{'data'}->{$mfn};
321     }
322 dpavlin 362 }
323    
324 dpavlin 448 =head2 mfn
325    
326     Returns current record number (MFN).
327    
328     print $webpac->mfn;
329    
330     =cut
331    
332     sub mfn {
333     my $self = shift;
334     return $self->{'current_mfn'};
335     }
336    
337 dpavlin 389 =head2 progress_bar
338    
339     Draw progress bar on STDERR.
340    
341     $webpac->progress_bar($current, $max);
342    
343     =cut
344    
345     sub progress_bar {
346     my $self = shift;
347    
348     my ($curr,$max) = @_;
349    
350     my $log = $self->_get_logger();
351    
352     $log->logconfess("no current value!") if (! $curr);
353     $log->logconfess("no maximum value!") if (! $max);
354    
355     if ($curr > $max) {
356     $max = $curr;
357     $log->debug("overflow to $curr");
358     }
359    
360     $self->{'last_pcnt'} ||= 1;
361    
362 dpavlin 454 my $p = int($curr * 100 / $max) || 1;
363 dpavlin 389
364 dpavlin 422 # reset on re-run
365     if ($p < $self->{'last_pcnt'}) {
366     $self->{'last_pcnt'} = $p;
367     $self->{'last_t'} = time();
368 dpavlin 453 $self->{'last_curr'} = undef;
369 dpavlin 422 }
370    
371 dpavlin 459 $self->{'last_t'} ||= time();
372    
373 dpavlin 389 if ($p != $self->{'last_pcnt'}) {
374 dpavlin 422
375     my $last_curr = $self->{'last_curr'} || $curr;
376     my $t = time();
377     my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
378     my $eta = ($max-$curr) / ($rate || 1);
379     printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
380 dpavlin 389 $self->{'last_pcnt'} = $p;
381 dpavlin 422 $self->{'last_t'} = time();
382     $self->{'last_curr'} = $curr;
383 dpavlin 389 }
384 dpavlin 412 print STDERR "\n" if ($p == 100);
385 dpavlin 389 }
386    
387 dpavlin 422 =head2 fmt_time
388    
389     Format time (in seconds) for display.
390    
391     print $webpac->fmt_time(time());
392    
393     This method is called by L<progress_bar> to display remaining time.
394    
395     =cut
396    
397     sub fmt_time {
398     my $self = shift;
399    
400     my $t = shift || 0;
401     my $out = "";
402    
403     my ($ss,$mm,$hh) = gmtime($t);
404     $out .= "${hh}h" if ($hh);
405     $out .= sprintf("%02d:%02d", $mm,$ss);
406     $out .= " " if ($hh == 0);
407     return $out;
408     }
409    
410 dpavlin 363 =head2 open_import_xml
411    
412     Read file from C<import_xml/> directory and parse it.
413    
414     $webpac->open_import_xml(type => 'isis');
415    
416     =cut
417    
418     sub open_import_xml {
419     my $self = shift;
420    
421 dpavlin 372 my $log = $self->_get_logger();
422    
423 dpavlin 363 my $arg = {@_};
424 dpavlin 372 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
425 dpavlin 363
426 dpavlin 366 $self->{'type'} = $arg->{'type'};
427 dpavlin 363
428 dpavlin 366 my $type_base = $arg->{'type'};
429 dpavlin 363 $type_base =~ s/_.*$//g;
430    
431 dpavlin 366 $self->{'tag'} = $type2tag{$type_base};
432    
433 dpavlin 375 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
434 dpavlin 366
435     my $f = "./import_xml/".$self->{'type'}.".xml";
436 dpavlin 372 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
437 dpavlin 363
438 dpavlin 375 $log->info("reading '$f'");
439 dpavlin 363
440 dpavlin 375 $self->{'import_xml_file'} = $f;
441    
442 dpavlin 363 $self->{'import_xml'} = XMLin($f,
443 dpavlin 366 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
444 dpavlin 363 );
445    
446 dpavlin 375 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
447    
448 dpavlin 363 }
449    
450 dpavlin 355 =head2 create_lookup
451    
452     Create lookup from record using lookup definition.
453    
454 dpavlin 367 $self->create_lookup($rec, @lookups);
455    
456     Called internally by C<open_*> methods.
457    
458 dpavlin 355 =cut
459    
460     sub create_lookup {
461     my $self = shift;
462    
463 dpavlin 372 my $log = $self->_get_logger();
464 dpavlin 355
465 dpavlin 372 my $rec = shift || $log->logconfess("need record to create lookup");
466     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
467    
468 dpavlin 355 foreach my $i (@_) {
469 dpavlin 418 $log->logconfess("need key") unless defined($i->{'key'});
470     $log->logconfess("need val") unless defined($i->{'val'});
471    
472     if (defined($i->{'eval'})) {
473     # eval first, so we can skip fill_in for key and val
474     my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
475     if ($self->_eval($eval)) {
476     my $key = $self->fill_in($rec,$i->{'key'}) || next;
477     my @val = $self->fill_in($rec,$i->{'val'}) || next;
478 dpavlin 373 $log->debug("stored $key = ",sub { join(" | ",@val) });
479 dpavlin 355 push @{$self->{'lookup'}->{$key}}, @val;
480     }
481     } else {
482 dpavlin 418 my $key = $self->fill_in($rec,$i->{'key'}) || next;
483     my @val = $self->fill_in($rec,$i->{'val'}) || next;
484     $log->debug("stored $key = ",sub { join(" | ",@val) });
485     push @{$self->{'lookup'}->{$key}}, @val;
486 dpavlin 355 }
487     }
488     }
489    
490 dpavlin 356 =head2 get_data
491    
492     Returns value from record.
493    
494 dpavlin 367 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
495 dpavlin 356
496     Arguments are:
497     record reference C<$rec>,
498     field C<$f>,
499     optional subfiled C<$sf>,
500     index for repeatable values C<$i>.
501    
502 dpavlin 367 Optinal variable C<$found> will be incremeted if there
503 dpavlin 356 is field.
504    
505     Returns value or empty string.
506    
507     =cut
508    
509     sub get_data {
510     my $self = shift;
511    
512     my ($rec,$f,$sf,$i,$found) = @_;
513 dpavlin 367
514 dpavlin 356 if ($$rec->{$f}) {
515 dpavlin 367 return '' if (! $$rec->{$f}->[$i]);
516 dpavlin 389 no strict 'refs';
517 dpavlin 356 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
518     $$found++ if (defined($$found));
519     return $$rec->{$f}->[$i]->{$sf};
520     } elsif ($$rec->{$f}->[$i]) {
521     $$found++ if (defined($$found));
522 dpavlin 366 # it still might have subfield, just
523     # not specified, so we'll dump all
524     if ($$rec->{$f}->[$i] =~ /HASH/o) {
525     my $out;
526     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
527     $out .= $$rec->{$f}->[$i]->{$k}." ";
528     }
529     return $out;
530     } else {
531     return $$rec->{$f}->[$i];
532     }
533 dpavlin 356 }
534     } else {
535     return '';
536     }
537     }
538    
539 dpavlin 352 =head2 fill_in
540    
541     Workhourse of all: takes record from in-memory structure of database and
542     strings with placeholders and returns string or array of with substituted
543     values from record.
544    
545 dpavlin 367 my $text = $webpac->fill_in($rec,'v250^a');
546 dpavlin 352
547     Optional argument is ordinal number for repeatable fields. By default,
548 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
549     element is 0).
550     Following example will read second value from repeatable field.
551 dpavlin 352
552 dpavlin 367 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
553 dpavlin 353
554     This function B<does not> perform parsing of format to inteligenty skip
555     delimiters before fields which aren't used.
556    
557 dpavlin 376 This method will automatically decode UTF-8 string to local code page
558     if needed.
559    
560 dpavlin 352 =cut
561    
562 dpavlin 372 sub fill_in {
563 dpavlin 371 my $self = shift;
564    
565 dpavlin 372 my $log = $self->_get_logger();
566 dpavlin 371
567 dpavlin 372 my $rec = shift || $log->logconfess("need data record");
568     my $format = shift || $log->logconfess("need format to parse");
569 dpavlin 352 # iteration (for repeatable fields)
570     my $i = shift || 0;
571    
572 dpavlin 555 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
573    
574 dpavlin 352 # FIXME remove for speedup?
575 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
576 dpavlin 352
577 dpavlin 376 if (utf8::is_utf8($format)) {
578     $format = $self->_x($format);
579     }
580    
581 dpavlin 352 my $found = 0;
582    
583 dpavlin 359 my $eval_code;
584     # remove eval{...} from beginning
585     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
586    
587 dpavlin 500 my $filter_name;
588     # remove filter{...} from beginning
589     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
590    
591 dpavlin 352 # do actual replacement of placeholders
592 dpavlin 555 # repeatable fields
593 dpavlin 373 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
594 dpavlin 555 # non-repeatable fields
595     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
596 dpavlin 352
597 dpavlin 353 if ($found) {
598 dpavlin 373 $log->debug("format: $format");
599 dpavlin 359 if ($eval_code) {
600     my $eval = $self->fill_in($rec,$eval_code,$i);
601 dpavlin 371 return if (! $self->_eval($eval));
602 dpavlin 359 }
603 dpavlin 500 if ($filter_name && $self->{'filter'}->{$filter_name}) {
604     $log->debug("filter '$filter_name' for $format");
605     $format = $self->{'filter'}->{$filter_name}->($format);
606     return unless(defined($format));
607     $log->debug("filter result: $format");
608     }
609 dpavlin 353 # do we have lookups?
610 dpavlin 373 if ($format =~ /$LOOKUP_REGEX/o) {
611     $log->debug("format '$format' has lookup");
612 dpavlin 353 return $self->lookup($format);
613     } else {
614     return $format;
615     }
616 dpavlin 352 } else {
617     return;
618     }
619     }
620    
621     =head2 lookup
622    
623 dpavlin 355 Perform lookups on format supplied to it.
624 dpavlin 352
625 dpavlin 367 my $text = $self->lookup('[v900]');
626 dpavlin 352
627 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
628    
629 dpavlin 352 =cut
630    
631     sub lookup {
632     my $self = shift;
633    
634 dpavlin 372 my $log = $self->_get_logger();
635 dpavlin 352
636 dpavlin 372 my $tmp = shift || $log->logconfess("need format");
637    
638 dpavlin 373 if ($tmp =~ /$LOOKUP_REGEX/o) {
639 dpavlin 352 my @in = ( $tmp );
640 dpavlin 372
641     $log->debug("lookup for: ",$tmp);
642    
643 dpavlin 352 my @out;
644     while (my $f = shift @in) {
645 dpavlin 373 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
646 dpavlin 352 my $k = $1;
647     if ($self->{'lookup'}->{$k}) {
648     foreach my $nv (@{$self->{'lookup'}->{$k}}) {
649     my $tmp2 = $f;
650 dpavlin 373 $tmp2 =~ s/lookup{$k}/$nv/g;
651 dpavlin 352 push @in, $tmp2;
652     }
653     } else {
654     undef $f;
655     }
656     } elsif ($f) {
657     push @out, $f;
658     }
659     }
660 dpavlin 373 $log->logconfess("return is array and it's not expected!") unless wantarray;
661 dpavlin 352 return @out;
662     } else {
663     return $tmp;
664     }
665     }
666    
667 dpavlin 356 =head2 parse
668    
669     Perform smart parsing of string, skipping delimiters for fields which aren't
670     defined. It can also eval code in format starting with C<eval{...}> and
671     return output or nothing depending on eval code.
672    
673 dpavlin 367 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
674 dpavlin 356
675     =cut
676    
677     sub parse {
678     my $self = shift;
679    
680 dpavlin 366 my ($rec, $format_utf8, $i) = @_;
681 dpavlin 356
682 dpavlin 366 return if (! $format_utf8);
683    
684 dpavlin 372 my $log = $self->_get_logger();
685 dpavlin 358
686 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
687     $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
688    
689 dpavlin 358 $i = 0 if (! $i);
690    
691 dpavlin 375 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
692 dpavlin 366
693 dpavlin 356 my @out;
694    
695 dpavlin 373 $log->debug("format: $format");
696    
697 dpavlin 356 my $eval_code;
698     # remove eval{...} from beginning
699     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
700    
701 dpavlin 500 my $filter_name;
702     # remove filter{...} from beginning
703     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
704    
705 dpavlin 358 my $prefix;
706     my $all_found=0;
707 dpavlin 356
708 dpavlin 555 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
709 dpavlin 356
710 dpavlin 358 my $del = $1 || '';
711 dpavlin 359 $prefix ||= $del if ($all_found == 0);
712 dpavlin 358
713 dpavlin 555 # repeatable index
714     my $r = $i;
715     $r = 0 if (lc("$2") eq 's');
716    
717 dpavlin 358 my $found = 0;
718 dpavlin 555 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
719 dpavlin 358
720 dpavlin 356 if ($found) {
721 dpavlin 358 push @out, $del;
722     push @out, $tmp;
723     $all_found += $found;
724 dpavlin 356 }
725     }
726    
727 dpavlin 358 return if (! $all_found);
728 dpavlin 356
729 dpavlin 373 my $out = join('',@out);
730 dpavlin 358
731 dpavlin 373 if ($out) {
732     # add rest of format (suffix)
733     $out .= $format;
734 dpavlin 367
735 dpavlin 373 # add prefix if not there
736     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
737    
738     $log->debug("result: $out");
739     }
740    
741 dpavlin 359 if ($eval_code) {
742 dpavlin 501 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
743 dpavlin 500 $log->debug("about to eval{$eval} format: $out");
744 dpavlin 371 return if (! $self->_eval($eval));
745 dpavlin 359 }
746 dpavlin 500
747     if ($filter_name && $self->{'filter'}->{$filter_name}) {
748     $log->debug("about to filter{$filter_name} format: $out");
749     $out = $self->{'filter'}->{$filter_name}->($out);
750     return unless(defined($out));
751     $log->debug("filter result: $out");
752     }
753 dpavlin 359
754 dpavlin 358 return $out;
755 dpavlin 356 }
756    
757 dpavlin 367 =head2 parse_to_arr
758    
759     Similar to C<parse>, but returns array of all repeatable fields
760    
761     my @arr = $webpac->parse_to_arr($rec,'v250^a');
762    
763     =cut
764    
765     sub parse_to_arr {
766     my $self = shift;
767    
768     my ($rec, $format_utf8) = @_;
769    
770 dpavlin 372 my $log = $self->_get_logger();
771    
772     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
773 dpavlin 367 return if (! $format_utf8);
774    
775     my $i = 0;
776     my @arr;
777    
778     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
779     push @arr, $v;
780     }
781    
782 dpavlin 373 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
783    
784 dpavlin 367 return @arr;
785     }
786    
787 dpavlin 373 =head2 fill_in_to_arr
788    
789     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
790     for fields which have lookups, so they shouldn't be parsed but rather
791     C<fill_id>ed.
792    
793     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
794    
795     =cut
796    
797     sub fill_in_to_arr {
798     my $self = shift;
799    
800     my ($rec, $format_utf8) = @_;
801    
802     my $log = $self->_get_logger();
803    
804     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
805     return if (! $format_utf8);
806    
807     my $i = 0;
808     my @arr;
809    
810     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
811     push @arr, @v;
812     }
813    
814     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
815    
816     return @arr;
817     }
818    
819 dpavlin 441 =head2 sort_arr
820 dpavlin 373
821 dpavlin 441 Sort array ignoring case and html in data
822    
823     my @sorted = $webpac->sort_arr(@unsorted);
824    
825     =cut
826    
827     sub sort_arr {
828     my $self = shift;
829    
830     my $log = $self->_get_logger();
831    
832     # FIXME add Schwartzian Transformation?
833    
834     my @sorted = sort {
835     $a =~ s#<[^>]+/*>##;
836     $b =~ s#<[^>]+/*>##;
837     lc($b) cmp lc($a)
838     } @_;
839     $log->debug("sorted values: ",sub { join(", ",@sorted) });
840    
841     return @sorted;
842     }
843    
844    
845 dpavlin 366 =head2 data_structure
846    
847     Create in-memory data structure which represents layout from C<import_xml>.
848     It is used later to produce output.
849    
850 dpavlin 368 my @ds = $webpac->data_structure($rec);
851 dpavlin 366
852 dpavlin 374 This method will also set C<$webpac->{'currnet_filename'}> if there is
853 dpavlin 398 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
854     <headline> tag.
855 dpavlin 374
856 dpavlin 366 =cut
857    
858 dpavlin 372 sub data_structure {
859 dpavlin 366 my $self = shift;
860    
861 dpavlin 372 my $log = $self->_get_logger();
862 dpavlin 366
863     my $rec = shift;
864 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
865 dpavlin 366
866 dpavlin 374 undef $self->{'currnet_filename'};
867 dpavlin 398 undef $self->{'headline'};
868 dpavlin 374
869 dpavlin 366 my @sorted_tags;
870     if ($self->{tags_by_order}) {
871     @sorted_tags = @{$self->{tags_by_order}};
872     } else {
873     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
874     $self->{tags_by_order} = \@sorted_tags;
875     }
876    
877 dpavlin 368 my @ds;
878 dpavlin 366
879 dpavlin 373 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
880    
881 dpavlin 366 foreach my $field (@sorted_tags) {
882    
883     my $row;
884    
885     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
886    
887     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
888 dpavlin 373 my $format = $tag->{'value'} || $tag->{'content'};
889 dpavlin 366
890 dpavlin 373 $log->debug("format: $format");
891    
892     my @v;
893     if ($format =~ /$LOOKUP_REGEX/o) {
894     @v = $self->fill_in_to_arr($rec,$format);
895     } else {
896     @v = $self->parse_to_arr($rec,$format);
897     }
898 dpavlin 367 next if (! @v);
899 dpavlin 366
900 dpavlin 439 if ($tag->{'sort'}) {
901 dpavlin 441 @v = $self->sort_arr(@v);
902 dpavlin 439 $log->warn("sort within tag is usually not what you want!");
903     }
904    
905 dpavlin 375 # use format?
906     if ($tag->{'format_name'}) {
907     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
908     }
909    
910 dpavlin 398 if ($field eq 'filename') {
911     $self->{'current_filename'} = join('',@v);
912     $log->debug("filename: ",$self->{'current_filename'});
913     } elsif ($field eq 'headline') {
914     $self->{'headline'} .= join('',@v);
915     $log->debug("headline: ",$self->{'headline'});
916     next; # don't return headline in data_structure!
917     }
918    
919 dpavlin 439 # delimiter will join repeatable fields
920     if ($tag->{'delimiter'}) {
921     @v = ( join($tag->{'delimiter'}, @v) );
922 dpavlin 366 }
923 dpavlin 373
924 dpavlin 439 # default types
925     my @types = qw(display swish);
926     # override by type attribute
927     @types = ( $tag->{'type'} ) if ($tag->{'type'});
928 dpavlin 374
929 dpavlin 439 foreach my $type (@types) {
930     # append to previous line?
931     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
932     if ($tag->{'append'}) {
933    
934     # I will delimit appended part with
935     # delimiter (or ,)
936     my $d = $tag->{'delimiter'};
937     # default delimiter
938 dpavlin 501 $d ||= " ";
939 dpavlin 439
940     my $last = pop @{$row->{$type}};
941     $d = "" if (! $last);
942     $last .= $d . join($d, @v);
943     push @{$row->{$type}}, $last;
944    
945     } else {
946     push @{$row->{$type}}, @v;
947     }
948     }
949    
950    
951 dpavlin 366 }
952    
953 dpavlin 368 if ($row) {
954     $row->{'tag'} = $field;
955 dpavlin 375
956     # TODO: name_sigular, name_plural
957     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
958     $row->{'name'} = $name ? $self->_x($name) : $field;
959    
960 dpavlin 439 # post-sort all values in field
961     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
962     $log->warn("sort at field tag not implemented");
963     }
964    
965 dpavlin 368 push @ds, $row;
966 dpavlin 374
967 dpavlin 373 $log->debug("row $field: ",sub { Dumper($row) });
968 dpavlin 368 }
969 dpavlin 366
970     }
971    
972 dpavlin 370 return @ds;
973 dpavlin 366
974     }
975    
976 dpavlin 370 =head2 output
977    
978     Create output from in-memory data structure using Template Toolkit template.
979    
980     my $text = $webpac->output( template => 'text.tt', data => @ds );
981    
982     =cut
983    
984     sub output {
985     my $self = shift;
986    
987     my $args = {@_};
988    
989 dpavlin 372 my $log = $self->_get_logger();
990 dpavlin 370
991 dpavlin 372 $log->logconfess("need template name") if (! $args->{'template'});
992     $log->logconfess("need data array") if (! $args->{'data'});
993    
994 dpavlin 370 my $out;
995    
996     $self->{'tt'}->process(
997     $args->{'template'},
998     $args,
999     \$out
1000     ) || confess $self->{'tt'}->error();
1001    
1002     return $out;
1003     }
1004    
1005 dpavlin 411 =head2 output_file
1006    
1007     Create output from in-memory data structure using Template Toolkit template
1008     to a file.
1009    
1010     $webpac->output_file(
1011     file => 'out.txt',
1012     template => 'text.tt',
1013     data => @ds
1014     );
1015    
1016     =cut
1017    
1018     sub output_file {
1019     my $self = shift;
1020    
1021     my $args = {@_};
1022    
1023     my $log = $self->_get_logger();
1024    
1025 dpavlin 421 my $file = $args->{'file'} || $log->logconfess("need file name");
1026 dpavlin 411
1027 dpavlin 421 $log->debug("creating file ",$file);
1028 dpavlin 411
1029 dpavlin 421 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1030 dpavlin 411 print $fh $self->output(
1031     template => $args->{'template'},
1032     data => $args->{'data'},
1033     ) || $log->logdie("print: $!");
1034     close($fh) || $log->logdie("close: $!");
1035     }
1036    
1037 dpavlin 375 =head2 apply_format
1038    
1039     Apply format specified in tag with C<format_name="name"> and
1040     C<format_delimiter=";;">.
1041    
1042     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1043    
1044     Formats can contain C<lookup{...}> if you need them.
1045    
1046     =cut
1047    
1048     sub apply_format {
1049     my $self = shift;
1050    
1051     my ($name,$delimiter,$data) = @_;
1052    
1053     my $log = $self->_get_logger();
1054    
1055     if (! $self->{'import_xml'}->{'format'}->{$name}) {
1056     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1057     return $data;
1058     }
1059    
1060     $log->warn("no delimiter for format $name") if (! $delimiter);
1061    
1062     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1063    
1064     my @data = split(/\Q$delimiter\E/, $data);
1065    
1066     my $out = sprintf($format, @data);
1067     $log->debug("using format $name [$format] on $data to produce: $out");
1068    
1069     if ($out =~ m/$LOOKUP_REGEX/o) {
1070     return $self->lookup($out);
1071     } else {
1072     return $out;
1073     }
1074    
1075     }
1076    
1077    
1078 dpavlin 372 #
1079     #
1080     #
1081    
1082     =head1 INTERNAL METHODS
1083    
1084     Here is a quick list of internal methods, mostly useful to turn debugging
1085     on them (see L<LOGGING> below for explanation).
1086    
1087     =cut
1088    
1089     =head2 _eval
1090    
1091     Internal function to eval code without C<strict 'subs'>.
1092    
1093     =cut
1094    
1095     sub _eval {
1096     my $self = shift;
1097    
1098     my $code = shift || return;
1099    
1100     my $log = $self->_get_logger();
1101    
1102     no strict 'subs';
1103     my $ret = eval $code;
1104     if ($@) {
1105     $log->error("problem with eval code [$code]: $@");
1106     }
1107    
1108     $log->debug("eval: ",$code," [",$ret,"]");
1109    
1110 dpavlin 500 return $ret || undef;
1111 dpavlin 372 }
1112    
1113     =head2 _sort_by_order
1114    
1115     Sort xml tags data structure accoding to C<order=""> attribute.
1116    
1117     =cut
1118    
1119     sub _sort_by_order {
1120     my $self = shift;
1121    
1122     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1123     $self->{'import_xml'}->{'indexer'}->{$a};
1124     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1125     $self->{'import_xml'}->{'indexer'}->{$b};
1126    
1127     return $va <=> $vb;
1128     }
1129    
1130 dpavlin 375 =head2 _get_logger
1131    
1132     Get C<Log::Log4perl> object with a twist: domains are defined for each
1133     method
1134    
1135     my $log = $webpac->_get_logger();
1136    
1137     =cut
1138    
1139 dpavlin 372 sub _get_logger {
1140     my $self = shift;
1141    
1142 dpavlin 374 my $name = (caller(1))[3] || caller;
1143     return get_logger($name);
1144 dpavlin 372 }
1145    
1146 dpavlin 375 =head2 _x
1147    
1148     Convert string from UTF-8 to code page defined in C<import_xml>.
1149    
1150     my $text = $webpac->_x('utf8 text');
1151    
1152     =cut
1153    
1154     sub _x {
1155     my $self = shift;
1156     my $utf8 = shift || return;
1157    
1158     return $self->{'utf2cp'}->convert($utf8) ||
1159     $self->_get_logger()->logwarn("can't convert '$utf8'");
1160     }
1161    
1162 dpavlin 372 #
1163     #
1164     #
1165    
1166     =head1 LOGGING
1167    
1168     Logging in WebPAC is performed by L<Log::Log4perl> with config file
1169     C<log.conf>.
1170    
1171     Methods defined above have different levels of logging, so
1172     it's descriptions will be useful to turn (mostry B<debug> logging) on
1173     or off to see why WabPAC isn't perforing as you expect it (it might even
1174     be a bug!).
1175    
1176     B<This is different from normal Log4perl behaviour>. To repeat, you can
1177     also use method names, and not only classes (which are just few)
1178     to filter logging.
1179    
1180 dpavlin 422
1181     =head1 MEMORY USAGE
1182    
1183     C<low_mem> options is double-edged sword. If enabled, WebPAC
1184     will run on memory constraint machines (which doesn't have enough
1185     physical RAM to create memory structure for whole source database).
1186    
1187     If your machine has 512Mb or more of RAM and database is around 10000 records,
1188     memory shouldn't be an issue. If you don't have enough physical RAM, you
1189     might consider using virtual memory (if your operating system is handling it
1190     well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1191     parsed structure of ISIS database (this is what C<low_mem> option does).
1192    
1193     Hitting swap at end of reading source database is probably o.k. However,
1194     hitting swap before 90% will dramatically decrease performance and you will
1195     be better off with C<low_mem> and using rest of availble memory for
1196     operating system disk cache (Linux is particuallary good about this).
1197     However, every access to database record will require disk access, so
1198     generation phase will be slower 10-100 times.
1199    
1200     Parsed structures are essential - you just have option to trade RAM memory
1201     (which is fast) for disk space (which is slow). Be sure to have planty of
1202     disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1203    
1204     However, when WebPAC is running on desktop machines (or laptops :-), it's
1205     highly undesireable for system to start swapping. Using C<low_mem> option can
1206     reduce WecPAC memory usage to around 64Mb for same database with lookup
1207     fields and sorted indexes which stay in RAM. Performance will suffer, but
1208     memory usage will really be minimal. It might be also more confortable to
1209     run WebPAC reniced on those machines.
1210    
1211 dpavlin 372 =cut
1212    
1213 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26