/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 563 - (hide annotations)
Sat Oct 30 23:58:36 2004 UTC (16 years, 6 months ago) by dpavlin
File size: 27583 byte(s)
well, it seems it *IS* what I want. Removed that warning :-)

1 dpavlin 354 package WebPAC;
2 dpavlin 352
3 dpavlin 367 use warnings;
4     use strict;
5    
6 dpavlin 352 use Carp;
7 dpavlin 353 use Text::Iconv;
8     use Config::IniFiles;
9 dpavlin 363 use XML::Simple;
10 dpavlin 370 use Template;
11 dpavlin 372 use Log::Log4perl qw(get_logger :levels);
12 dpavlin 422 use Time::HiRes qw(time);
13 dpavlin 352
14 dpavlin 358 use Data::Dumper;
15    
16 dpavlin 373 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17     #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18     my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19     my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20    
21 dpavlin 352 =head1 NAME
22    
23 dpavlin 354 WebPAC - base class for WebPAC
24 dpavlin 352
25     =head1 DESCRIPTION
26    
27 dpavlin 354 This module implements methods used by WebPAC.
28 dpavlin 352
29     =head1 METHODS
30    
31     =head2 new
32    
33 dpavlin 421 Create new instance of WebPAC using configuration specified by C<config_file>.
34 dpavlin 352
35 dpavlin 354 my $webpac = new WebPAC(
36 dpavlin 352 config_file => 'name.conf',
37 dpavlin 439 code_page => 'ISO-8859-2',
38     low_mem => 1,
39 dpavlin 500 filter => {
40     'lower' => sub { lc($_[0]) },
41     },
42 dpavlin 352 );
43    
44     Default C<code_page> is C<ISO-8859-2>.
45    
46 dpavlin 422 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
47    
48 dpavlin 500 There is optinal parametar C<filter> which specify different filters which
49     can be applied using C<filter{name}> notation.
50 dpavlin 560 Same filters can be used in Template Toolkit files.
51 dpavlin 500
52 dpavlin 421 This method will also read configuration files
53 dpavlin 353 C<global.conf> (used by indexer and Web font-end)
54     and configuration file specified by C<config_file>
55     which describes databases to be indexed.
56    
57 dpavlin 352 =cut
58    
59 dpavlin 363 # mapping between data type and tag which specify
60     # format in XML file
61     my %type2tag = (
62     'isis' => 'isis',
63     # 'excel' => 'column',
64     # 'marc' => 'marc',
65     # 'feed' => 'feed'
66     );
67    
68 dpavlin 352 sub new {
69     my $class = shift;
70     my $self = {@_};
71     bless($self, $class);
72    
73 dpavlin 422 $self->{'start_t'} = time();
74    
75 dpavlin 372 my $log_file = $self->{'log'} || "log.conf";
76     Log::Log4perl->init($log_file);
77    
78     my $log = $self->_get_logger();
79    
80 dpavlin 352 # fill in default values
81     # output codepage
82     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
83    
84 dpavlin 353 #
85     # read global.conf
86     #
87 dpavlin 372 $log->debug("read 'global.conf'");
88 dpavlin 352
89 dpavlin 372 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
90 dpavlin 352
91     # read global config parametars
92     foreach my $var (qw(
93     dbi_dbd
94     dbi_dsn
95     dbi_user
96     dbi_passwd
97     show_progress
98     my_unac_filter
99 dpavlin 370 output_template
100 dpavlin 352 )) {
101 dpavlin 370 $self->{'global_config'}->{$var} = $config->val('global', $var);
102 dpavlin 352 }
103    
104 dpavlin 353 #
105     # read indexer config file
106     #
107 dpavlin 352
108 dpavlin 372 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
109 dpavlin 352
110 dpavlin 370 # create UTF-8 convertor for import_xml files
111 dpavlin 366 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
112 dpavlin 370
113     # create Template toolkit instance
114     $self->{'tt'} = Template->new(
115     INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
116 dpavlin 560 FILTERS => $self->{'filter'},
117 dpavlin 370 EVAL_PERL => 1,
118     );
119    
120 dpavlin 421 # running with low_mem flag? well, use DBM::Deep then.
121     if ($self->{'low_mem'}) {
122 dpavlin 422 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
123 dpavlin 421
124     my $db_file = "data.db";
125    
126     if (-e $db_file) {
127     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
128     $log->debug("removed '$db_file' from last run");
129     }
130    
131 dpavlin 434 require DBM::Deep;
132 dpavlin 421
133     my $db = new DBM::Deep $db_file;
134    
135     $log->logdie("DBM::Deep error: $!") unless ($db);
136    
137     if ($db->error()) {
138     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
139     } else {
140 dpavlin 422 $log->debug("using file '$db_file' for DBM::Deep");
141 dpavlin 421 }
142    
143     $self->{'db'} = $db;
144     }
145    
146 dpavlin 500 $log->debug("filters defined: ",Dumper($self->{'filter'}));
147    
148 dpavlin 352 return $self;
149     }
150    
151     =head2 open_isis
152    
153     Open CDS/ISIS database using OpenIsis module and read all records to memory.
154    
155     $webpac->open_isis(
156     filename => '/data/ISIS/ISIS',
157     code_page => '852',
158 dpavlin 431 limit_mfn => 500,
159     start_mfn => 6000,
160 dpavlin 352 lookup => [ ... ],
161     );
162    
163     By default, ISIS code page is assumed to be C<852>.
164    
165 dpavlin 431 If optional parametar C<start_mfn> is set, this will be first MFN to read
166     from database (so you can skip beginning of your database if you need to).
167    
168 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
169     from database in example above.
170 dpavlin 352
171     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
172     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
173     value in index.
174    
175     lookup => [
176     { 'key' => 'd:v900', 'val' => 'v250^a' },
177     { 'eval' => '"v901^a" eq "Podruèje"',
178     'key' => 'pa:v561^4:v562^4:v461^1',
179     'val' => 'v900' },
180     ]
181    
182 dpavlin 367 Returns number of last record read into memory (size of database, really).
183    
184 dpavlin 352 =cut
185    
186     sub open_isis {
187     my $self = shift;
188     my $arg = {@_};
189    
190 dpavlin 372 my $log = $self->_get_logger();
191    
192     $log->logcroak("need filename") if (! $arg->{'filename'});
193 dpavlin 352 my $code_page = $arg->{'code_page'} || '852';
194    
195 dpavlin 436 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
196    
197 dpavlin 421 # store data in object
198     $self->{'isis_filename'} = $arg->{'filename'};
199     $self->{'isis_code_page'} = $code_page;
200    
201 dpavlin 353 use OpenIsis;
202    
203 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
204    
205     # create Text::Iconv object
206     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
207    
208 dpavlin 372 $log->info("reading ISIS database '",$arg->{'filename'},"'");
209 dpavlin 421 $log->debug("isis code page: $code_page");
210 dpavlin 363
211 dpavlin 352 my $isis_db = OpenIsis::open($arg->{'filename'});
212    
213     my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
214 dpavlin 431 my $startmfn = 1;
215 dpavlin 352
216 dpavlin 431 if (my $s = $self->{'start_mfn'}) {
217     $log->info("skipping to MFN $s");
218     $startmfn = $s;
219 dpavlin 448 } else {
220     $self->{'start_mfn'} = $startmfn;
221 dpavlin 431 }
222 dpavlin 357
223 dpavlin 431 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
224 dpavlin 363
225 dpavlin 431 $log->info("processing ",($maxmfn-$startmfn)." records...");
226    
227 dpavlin 352 # read database
228 dpavlin 431 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
229 dpavlin 352
230 dpavlin 418
231     $log->debug("mfn: $mfn\n");
232    
233 dpavlin 421 my $rec;
234    
235 dpavlin 352 # read record
236     my $row = OpenIsis::read( $isis_db, $mfn );
237     foreach my $k (keys %{$row}) {
238     if ($k ne "mfn") {
239     foreach my $l (@{$row->{$k}}) {
240     $l = $cp->convert($l);
241     # has subfields?
242     my $val;
243     if ($l =~ m/\^/) {
244     foreach my $t (split(/\^/,$l)) {
245     next if (! $t);
246     $val->{substr($t,0,1)} = substr($t,1);
247     }
248     } else {
249     $val = $l;
250     }
251    
252 dpavlin 421 push @{$rec->{$k}}, $val;
253 dpavlin 352 }
254 dpavlin 374 } else {
255 dpavlin 421 push @{$rec->{'000'}}, $mfn;
256 dpavlin 352 }
257    
258     }
259    
260 dpavlin 421 $log->confess("record $mfn empty?") unless ($rec);
261    
262     # store
263     if ($self->{'low_mem'}) {
264     $self->{'db'}->put($mfn, $rec);
265     } else {
266     $self->{'data'}->{$mfn} = $rec;
267     }
268    
269 dpavlin 352 # create lookup
270 dpavlin 355 $self->create_lookup($rec, @{$arg->{'lookup'}});
271 dpavlin 352
272 dpavlin 389 $self->progress_bar($mfn,$maxmfn);
273    
274 dpavlin 352 }
275    
276 dpavlin 448 $self->{'current_mfn'} = -1;
277 dpavlin 389 $self->{'last_pcnt'} = 0;
278 dpavlin 362
279 dpavlin 418 $log->debug("max mfn: $maxmfn");
280    
281 dpavlin 352 # store max mfn and return it.
282     return $self->{'max_mfn'} = $maxmfn;
283     }
284    
285 dpavlin 362 =head2 fetch_rec
286    
287     Fetch next record from database. It will also display progress bar (once
288     it's implemented, that is).
289    
290     my $rec = $webpac->fetch_rec;
291    
292     =cut
293    
294     sub fetch_rec {
295     my $self = shift;
296    
297 dpavlin 372 my $log = $self->_get_logger();
298 dpavlin 362
299 dpavlin 448 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
300 dpavlin 372
301 dpavlin 448 if ($self->{'current_mfn'} == -1) {
302     $self->{'current_mfn'} = $self->{'start_mfn'};
303     } else {
304     $self->{'current_mfn'}++;
305     }
306    
307     my $mfn = $self->{'current_mfn'};
308    
309 dpavlin 362 if ($mfn > $self->{'max_mfn'}) {
310     $self->{'current_mfn'} = $self->{'max_mfn'};
311 dpavlin 373 $log->debug("at EOF");
312 dpavlin 362 return;
313     }
314    
315 dpavlin 389 $self->progress_bar($mfn,$self->{'max_mfn'});
316    
317 dpavlin 421 if ($self->{'low_mem'}) {
318     return $self->{'db'}->get($mfn);
319     } else {
320     return $self->{'data'}->{$mfn};
321     }
322 dpavlin 362 }
323    
324 dpavlin 448 =head2 mfn
325    
326     Returns current record number (MFN).
327    
328     print $webpac->mfn;
329    
330     =cut
331    
332     sub mfn {
333     my $self = shift;
334     return $self->{'current_mfn'};
335     }
336    
337 dpavlin 389 =head2 progress_bar
338    
339     Draw progress bar on STDERR.
340    
341     $webpac->progress_bar($current, $max);
342    
343     =cut
344    
345     sub progress_bar {
346     my $self = shift;
347    
348     my ($curr,$max) = @_;
349    
350     my $log = $self->_get_logger();
351    
352     $log->logconfess("no current value!") if (! $curr);
353     $log->logconfess("no maximum value!") if (! $max);
354    
355     if ($curr > $max) {
356     $max = $curr;
357     $log->debug("overflow to $curr");
358     }
359    
360     $self->{'last_pcnt'} ||= 1;
361    
362 dpavlin 454 my $p = int($curr * 100 / $max) || 1;
363 dpavlin 389
364 dpavlin 422 # reset on re-run
365     if ($p < $self->{'last_pcnt'}) {
366     $self->{'last_pcnt'} = $p;
367     $self->{'last_t'} = time();
368 dpavlin 453 $self->{'last_curr'} = undef;
369 dpavlin 422 }
370    
371 dpavlin 459 $self->{'last_t'} ||= time();
372    
373 dpavlin 389 if ($p != $self->{'last_pcnt'}) {
374 dpavlin 422
375     my $last_curr = $self->{'last_curr'} || $curr;
376     my $t = time();
377     my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
378     my $eta = ($max-$curr) / ($rate || 1);
379     printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
380 dpavlin 389 $self->{'last_pcnt'} = $p;
381 dpavlin 422 $self->{'last_t'} = time();
382     $self->{'last_curr'} = $curr;
383 dpavlin 389 }
384 dpavlin 412 print STDERR "\n" if ($p == 100);
385 dpavlin 389 }
386    
387 dpavlin 422 =head2 fmt_time
388    
389     Format time (in seconds) for display.
390    
391     print $webpac->fmt_time(time());
392    
393     This method is called by L<progress_bar> to display remaining time.
394    
395     =cut
396    
397     sub fmt_time {
398     my $self = shift;
399    
400     my $t = shift || 0;
401     my $out = "";
402    
403     my ($ss,$mm,$hh) = gmtime($t);
404     $out .= "${hh}h" if ($hh);
405     $out .= sprintf("%02d:%02d", $mm,$ss);
406     $out .= " " if ($hh == 0);
407     return $out;
408     }
409    
410 dpavlin 363 =head2 open_import_xml
411    
412     Read file from C<import_xml/> directory and parse it.
413    
414     $webpac->open_import_xml(type => 'isis');
415    
416     =cut
417    
418     sub open_import_xml {
419     my $self = shift;
420    
421 dpavlin 372 my $log = $self->_get_logger();
422    
423 dpavlin 363 my $arg = {@_};
424 dpavlin 372 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
425 dpavlin 363
426 dpavlin 366 $self->{'type'} = $arg->{'type'};
427 dpavlin 363
428 dpavlin 366 my $type_base = $arg->{'type'};
429 dpavlin 363 $type_base =~ s/_.*$//g;
430    
431 dpavlin 366 $self->{'tag'} = $type2tag{$type_base};
432    
433 dpavlin 375 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
434 dpavlin 366
435     my $f = "./import_xml/".$self->{'type'}.".xml";
436 dpavlin 372 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
437 dpavlin 363
438 dpavlin 375 $log->info("reading '$f'");
439 dpavlin 363
440 dpavlin 375 $self->{'import_xml_file'} = $f;
441    
442 dpavlin 363 $self->{'import_xml'} = XMLin($f,
443 dpavlin 366 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
444 dpavlin 363 );
445    
446 dpavlin 375 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
447    
448 dpavlin 363 }
449    
450 dpavlin 355 =head2 create_lookup
451    
452     Create lookup from record using lookup definition.
453    
454 dpavlin 367 $self->create_lookup($rec, @lookups);
455    
456     Called internally by C<open_*> methods.
457    
458 dpavlin 355 =cut
459    
460     sub create_lookup {
461     my $self = shift;
462    
463 dpavlin 372 my $log = $self->_get_logger();
464 dpavlin 355
465 dpavlin 372 my $rec = shift || $log->logconfess("need record to create lookup");
466     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
467    
468 dpavlin 355 foreach my $i (@_) {
469 dpavlin 418 $log->logconfess("need key") unless defined($i->{'key'});
470     $log->logconfess("need val") unless defined($i->{'val'});
471    
472     if (defined($i->{'eval'})) {
473     # eval first, so we can skip fill_in for key and val
474     my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
475     if ($self->_eval($eval)) {
476     my $key = $self->fill_in($rec,$i->{'key'}) || next;
477     my @val = $self->fill_in($rec,$i->{'val'}) || next;
478 dpavlin 373 $log->debug("stored $key = ",sub { join(" | ",@val) });
479 dpavlin 355 push @{$self->{'lookup'}->{$key}}, @val;
480     }
481     } else {
482 dpavlin 418 my $key = $self->fill_in($rec,$i->{'key'}) || next;
483     my @val = $self->fill_in($rec,$i->{'val'}) || next;
484     $log->debug("stored $key = ",sub { join(" | ",@val) });
485     push @{$self->{'lookup'}->{$key}}, @val;
486 dpavlin 355 }
487     }
488     }
489    
490 dpavlin 356 =head2 get_data
491    
492     Returns value from record.
493    
494 dpavlin 367 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
495 dpavlin 356
496     Arguments are:
497     record reference C<$rec>,
498     field C<$f>,
499     optional subfiled C<$sf>,
500     index for repeatable values C<$i>.
501    
502 dpavlin 367 Optinal variable C<$found> will be incremeted if there
503 dpavlin 356 is field.
504    
505     Returns value or empty string.
506    
507     =cut
508    
509     sub get_data {
510     my $self = shift;
511    
512     my ($rec,$f,$sf,$i,$found) = @_;
513 dpavlin 367
514 dpavlin 356 if ($$rec->{$f}) {
515 dpavlin 367 return '' if (! $$rec->{$f}->[$i]);
516 dpavlin 389 no strict 'refs';
517 dpavlin 356 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
518     $$found++ if (defined($$found));
519     return $$rec->{$f}->[$i]->{$sf};
520     } elsif ($$rec->{$f}->[$i]) {
521     $$found++ if (defined($$found));
522 dpavlin 366 # it still might have subfield, just
523     # not specified, so we'll dump all
524     if ($$rec->{$f}->[$i] =~ /HASH/o) {
525     my $out;
526     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
527     $out .= $$rec->{$f}->[$i]->{$k}." ";
528     }
529     return $out;
530     } else {
531     return $$rec->{$f}->[$i];
532     }
533 dpavlin 356 }
534     } else {
535     return '';
536     }
537     }
538    
539 dpavlin 352 =head2 fill_in
540    
541     Workhourse of all: takes record from in-memory structure of database and
542     strings with placeholders and returns string or array of with substituted
543     values from record.
544    
545 dpavlin 367 my $text = $webpac->fill_in($rec,'v250^a');
546 dpavlin 352
547     Optional argument is ordinal number for repeatable fields. By default,
548 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
549     element is 0).
550     Following example will read second value from repeatable field.
551 dpavlin 352
552 dpavlin 367 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
553 dpavlin 353
554     This function B<does not> perform parsing of format to inteligenty skip
555     delimiters before fields which aren't used.
556    
557 dpavlin 376 This method will automatically decode UTF-8 string to local code page
558     if needed.
559    
560 dpavlin 352 =cut
561    
562 dpavlin 372 sub fill_in {
563 dpavlin 371 my $self = shift;
564    
565 dpavlin 372 my $log = $self->_get_logger();
566 dpavlin 371
567 dpavlin 372 my $rec = shift || $log->logconfess("need data record");
568     my $format = shift || $log->logconfess("need format to parse");
569 dpavlin 352 # iteration (for repeatable fields)
570     my $i = shift || 0;
571    
572 dpavlin 555 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
573    
574 dpavlin 352 # FIXME remove for speedup?
575 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
576 dpavlin 352
577 dpavlin 376 if (utf8::is_utf8($format)) {
578     $format = $self->_x($format);
579     }
580    
581 dpavlin 352 my $found = 0;
582    
583 dpavlin 359 my $eval_code;
584     # remove eval{...} from beginning
585     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
586    
587 dpavlin 500 my $filter_name;
588     # remove filter{...} from beginning
589     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
590    
591 dpavlin 352 # do actual replacement of placeholders
592 dpavlin 555 # repeatable fields
593 dpavlin 373 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
594 dpavlin 555 # non-repeatable fields
595     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
596 dpavlin 352
597 dpavlin 353 if ($found) {
598 dpavlin 373 $log->debug("format: $format");
599 dpavlin 359 if ($eval_code) {
600     my $eval = $self->fill_in($rec,$eval_code,$i);
601 dpavlin 371 return if (! $self->_eval($eval));
602 dpavlin 359 }
603 dpavlin 500 if ($filter_name && $self->{'filter'}->{$filter_name}) {
604     $log->debug("filter '$filter_name' for $format");
605     $format = $self->{'filter'}->{$filter_name}->($format);
606     return unless(defined($format));
607     $log->debug("filter result: $format");
608     }
609 dpavlin 353 # do we have lookups?
610 dpavlin 373 if ($format =~ /$LOOKUP_REGEX/o) {
611     $log->debug("format '$format' has lookup");
612 dpavlin 353 return $self->lookup($format);
613     } else {
614     return $format;
615     }
616 dpavlin 352 } else {
617     return;
618     }
619     }
620    
621     =head2 lookup
622    
623 dpavlin 355 Perform lookups on format supplied to it.
624 dpavlin 352
625 dpavlin 367 my $text = $self->lookup('[v900]');
626 dpavlin 352
627 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
628    
629 dpavlin 352 =cut
630    
631     sub lookup {
632     my $self = shift;
633    
634 dpavlin 372 my $log = $self->_get_logger();
635 dpavlin 352
636 dpavlin 372 my $tmp = shift || $log->logconfess("need format");
637    
638 dpavlin 373 if ($tmp =~ /$LOOKUP_REGEX/o) {
639 dpavlin 352 my @in = ( $tmp );
640 dpavlin 372
641     $log->debug("lookup for: ",$tmp);
642    
643 dpavlin 352 my @out;
644     while (my $f = shift @in) {
645 dpavlin 373 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
646 dpavlin 352 my $k = $1;
647     if ($self->{'lookup'}->{$k}) {
648     foreach my $nv (@{$self->{'lookup'}->{$k}}) {
649     my $tmp2 = $f;
650 dpavlin 373 $tmp2 =~ s/lookup{$k}/$nv/g;
651 dpavlin 352 push @in, $tmp2;
652     }
653     } else {
654     undef $f;
655     }
656     } elsif ($f) {
657     push @out, $f;
658     }
659     }
660 dpavlin 373 $log->logconfess("return is array and it's not expected!") unless wantarray;
661 dpavlin 352 return @out;
662     } else {
663     return $tmp;
664     }
665     }
666    
667 dpavlin 356 =head2 parse
668    
669     Perform smart parsing of string, skipping delimiters for fields which aren't
670     defined. It can also eval code in format starting with C<eval{...}> and
671     return output or nothing depending on eval code.
672    
673 dpavlin 367 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
674 dpavlin 356
675     =cut
676    
677     sub parse {
678     my $self = shift;
679    
680 dpavlin 366 my ($rec, $format_utf8, $i) = @_;
681 dpavlin 356
682 dpavlin 366 return if (! $format_utf8);
683    
684 dpavlin 372 my $log = $self->_get_logger();
685 dpavlin 358
686 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
687     $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
688    
689 dpavlin 358 $i = 0 if (! $i);
690    
691 dpavlin 375 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
692 dpavlin 366
693 dpavlin 356 my @out;
694    
695 dpavlin 373 $log->debug("format: $format");
696    
697 dpavlin 356 my $eval_code;
698     # remove eval{...} from beginning
699     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
700    
701 dpavlin 500 my $filter_name;
702     # remove filter{...} from beginning
703     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
704    
705 dpavlin 358 my $prefix;
706     my $all_found=0;
707 dpavlin 356
708 dpavlin 555 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
709 dpavlin 356
710 dpavlin 358 my $del = $1 || '';
711 dpavlin 359 $prefix ||= $del if ($all_found == 0);
712 dpavlin 358
713 dpavlin 555 # repeatable index
714     my $r = $i;
715     $r = 0 if (lc("$2") eq 's');
716    
717 dpavlin 358 my $found = 0;
718 dpavlin 555 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
719 dpavlin 358
720 dpavlin 356 if ($found) {
721 dpavlin 358 push @out, $del;
722     push @out, $tmp;
723     $all_found += $found;
724 dpavlin 356 }
725     }
726    
727 dpavlin 358 return if (! $all_found);
728 dpavlin 356
729 dpavlin 373 my $out = join('',@out);
730 dpavlin 358
731 dpavlin 373 if ($out) {
732     # add rest of format (suffix)
733     $out .= $format;
734 dpavlin 367
735 dpavlin 373 # add prefix if not there
736     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
737    
738     $log->debug("result: $out");
739     }
740    
741 dpavlin 359 if ($eval_code) {
742 dpavlin 501 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
743 dpavlin 500 $log->debug("about to eval{$eval} format: $out");
744 dpavlin 371 return if (! $self->_eval($eval));
745 dpavlin 359 }
746 dpavlin 500
747     if ($filter_name && $self->{'filter'}->{$filter_name}) {
748     $log->debug("about to filter{$filter_name} format: $out");
749     $out = $self->{'filter'}->{$filter_name}->($out);
750     return unless(defined($out));
751     $log->debug("filter result: $out");
752     }
753 dpavlin 359
754 dpavlin 358 return $out;
755 dpavlin 356 }
756    
757 dpavlin 367 =head2 parse_to_arr
758    
759     Similar to C<parse>, but returns array of all repeatable fields
760    
761     my @arr = $webpac->parse_to_arr($rec,'v250^a');
762    
763     =cut
764    
765     sub parse_to_arr {
766     my $self = shift;
767    
768     my ($rec, $format_utf8) = @_;
769    
770 dpavlin 372 my $log = $self->_get_logger();
771    
772     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
773 dpavlin 367 return if (! $format_utf8);
774    
775     my $i = 0;
776     my @arr;
777    
778     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
779     push @arr, $v;
780     }
781    
782 dpavlin 373 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
783    
784 dpavlin 367 return @arr;
785     }
786    
787 dpavlin 373 =head2 fill_in_to_arr
788    
789     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
790     for fields which have lookups, so they shouldn't be parsed but rather
791     C<fill_id>ed.
792    
793     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
794    
795     =cut
796    
797     sub fill_in_to_arr {
798     my $self = shift;
799    
800     my ($rec, $format_utf8) = @_;
801    
802     my $log = $self->_get_logger();
803    
804     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
805     return if (! $format_utf8);
806    
807     my $i = 0;
808     my @arr;
809    
810     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
811     push @arr, @v;
812     }
813    
814     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
815    
816     return @arr;
817     }
818    
819 dpavlin 441 =head2 sort_arr
820 dpavlin 373
821 dpavlin 441 Sort array ignoring case and html in data
822    
823     my @sorted = $webpac->sort_arr(@unsorted);
824    
825     =cut
826    
827     sub sort_arr {
828     my $self = shift;
829    
830     my $log = $self->_get_logger();
831    
832     # FIXME add Schwartzian Transformation?
833    
834     my @sorted = sort {
835     $a =~ s#<[^>]+/*>##;
836     $b =~ s#<[^>]+/*>##;
837     lc($b) cmp lc($a)
838     } @_;
839     $log->debug("sorted values: ",sub { join(", ",@sorted) });
840    
841     return @sorted;
842     }
843    
844    
845 dpavlin 366 =head2 data_structure
846    
847     Create in-memory data structure which represents layout from C<import_xml>.
848     It is used later to produce output.
849    
850 dpavlin 368 my @ds = $webpac->data_structure($rec);
851 dpavlin 366
852 dpavlin 374 This method will also set C<$webpac->{'currnet_filename'}> if there is
853 dpavlin 398 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
854     <headline> tag.
855 dpavlin 374
856 dpavlin 366 =cut
857    
858 dpavlin 372 sub data_structure {
859 dpavlin 366 my $self = shift;
860    
861 dpavlin 372 my $log = $self->_get_logger();
862 dpavlin 366
863     my $rec = shift;
864 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
865 dpavlin 366
866 dpavlin 374 undef $self->{'currnet_filename'};
867 dpavlin 398 undef $self->{'headline'};
868 dpavlin 374
869 dpavlin 366 my @sorted_tags;
870     if ($self->{tags_by_order}) {
871     @sorted_tags = @{$self->{tags_by_order}};
872     } else {
873     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
874     $self->{tags_by_order} = \@sorted_tags;
875     }
876    
877 dpavlin 368 my @ds;
878 dpavlin 366
879 dpavlin 373 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
880    
881 dpavlin 366 foreach my $field (@sorted_tags) {
882    
883     my $row;
884    
885     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
886    
887     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
888 dpavlin 373 my $format = $tag->{'value'} || $tag->{'content'};
889 dpavlin 366
890 dpavlin 373 $log->debug("format: $format");
891    
892     my @v;
893     if ($format =~ /$LOOKUP_REGEX/o) {
894     @v = $self->fill_in_to_arr($rec,$format);
895     } else {
896     @v = $self->parse_to_arr($rec,$format);
897     }
898 dpavlin 367 next if (! @v);
899 dpavlin 366
900 dpavlin 439 if ($tag->{'sort'}) {
901 dpavlin 441 @v = $self->sort_arr(@v);
902 dpavlin 439 }
903    
904 dpavlin 375 # use format?
905     if ($tag->{'format_name'}) {
906     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
907     }
908    
909 dpavlin 398 if ($field eq 'filename') {
910     $self->{'current_filename'} = join('',@v);
911     $log->debug("filename: ",$self->{'current_filename'});
912     } elsif ($field eq 'headline') {
913     $self->{'headline'} .= join('',@v);
914     $log->debug("headline: ",$self->{'headline'});
915     next; # don't return headline in data_structure!
916     }
917    
918 dpavlin 439 # delimiter will join repeatable fields
919     if ($tag->{'delimiter'}) {
920     @v = ( join($tag->{'delimiter'}, @v) );
921 dpavlin 366 }
922 dpavlin 373
923 dpavlin 439 # default types
924     my @types = qw(display swish);
925     # override by type attribute
926     @types = ( $tag->{'type'} ) if ($tag->{'type'});
927 dpavlin 374
928 dpavlin 439 foreach my $type (@types) {
929     # append to previous line?
930     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
931     if ($tag->{'append'}) {
932    
933     # I will delimit appended part with
934     # delimiter (or ,)
935     my $d = $tag->{'delimiter'};
936     # default delimiter
937 dpavlin 501 $d ||= " ";
938 dpavlin 439
939     my $last = pop @{$row->{$type}};
940     $d = "" if (! $last);
941     $last .= $d . join($d, @v);
942     push @{$row->{$type}}, $last;
943    
944     } else {
945     push @{$row->{$type}}, @v;
946     }
947     }
948    
949    
950 dpavlin 366 }
951    
952 dpavlin 368 if ($row) {
953     $row->{'tag'} = $field;
954 dpavlin 375
955     # TODO: name_sigular, name_plural
956     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
957     $row->{'name'} = $name ? $self->_x($name) : $field;
958    
959 dpavlin 439 # post-sort all values in field
960     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
961     $log->warn("sort at field tag not implemented");
962     }
963    
964 dpavlin 368 push @ds, $row;
965 dpavlin 374
966 dpavlin 373 $log->debug("row $field: ",sub { Dumper($row) });
967 dpavlin 368 }
968 dpavlin 366
969     }
970    
971 dpavlin 370 return @ds;
972 dpavlin 366
973     }
974    
975 dpavlin 370 =head2 output
976    
977     Create output from in-memory data structure using Template Toolkit template.
978    
979     my $text = $webpac->output( template => 'text.tt', data => @ds );
980    
981     =cut
982    
983     sub output {
984     my $self = shift;
985    
986     my $args = {@_};
987    
988 dpavlin 372 my $log = $self->_get_logger();
989 dpavlin 370
990 dpavlin 372 $log->logconfess("need template name") if (! $args->{'template'});
991     $log->logconfess("need data array") if (! $args->{'data'});
992    
993 dpavlin 370 my $out;
994    
995     $self->{'tt'}->process(
996     $args->{'template'},
997     $args,
998     \$out
999     ) || confess $self->{'tt'}->error();
1000    
1001     return $out;
1002     }
1003    
1004 dpavlin 411 =head2 output_file
1005    
1006     Create output from in-memory data structure using Template Toolkit template
1007     to a file.
1008    
1009     $webpac->output_file(
1010     file => 'out.txt',
1011     template => 'text.tt',
1012     data => @ds
1013     );
1014    
1015     =cut
1016    
1017     sub output_file {
1018     my $self = shift;
1019    
1020     my $args = {@_};
1021    
1022     my $log = $self->_get_logger();
1023    
1024 dpavlin 421 my $file = $args->{'file'} || $log->logconfess("need file name");
1025 dpavlin 411
1026 dpavlin 421 $log->debug("creating file ",$file);
1027 dpavlin 411
1028 dpavlin 421 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1029 dpavlin 411 print $fh $self->output(
1030     template => $args->{'template'},
1031     data => $args->{'data'},
1032     ) || $log->logdie("print: $!");
1033     close($fh) || $log->logdie("close: $!");
1034     }
1035    
1036 dpavlin 375 =head2 apply_format
1037    
1038     Apply format specified in tag with C<format_name="name"> and
1039     C<format_delimiter=";;">.
1040    
1041     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1042    
1043     Formats can contain C<lookup{...}> if you need them.
1044    
1045     =cut
1046    
1047     sub apply_format {
1048     my $self = shift;
1049    
1050     my ($name,$delimiter,$data) = @_;
1051    
1052     my $log = $self->_get_logger();
1053    
1054     if (! $self->{'import_xml'}->{'format'}->{$name}) {
1055     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1056     return $data;
1057     }
1058    
1059     $log->warn("no delimiter for format $name") if (! $delimiter);
1060    
1061     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1062    
1063     my @data = split(/\Q$delimiter\E/, $data);
1064    
1065     my $out = sprintf($format, @data);
1066     $log->debug("using format $name [$format] on $data to produce: $out");
1067    
1068     if ($out =~ m/$LOOKUP_REGEX/o) {
1069     return $self->lookup($out);
1070     } else {
1071     return $out;
1072     }
1073    
1074     }
1075    
1076    
1077 dpavlin 372 #
1078     #
1079     #
1080    
1081     =head1 INTERNAL METHODS
1082    
1083     Here is a quick list of internal methods, mostly useful to turn debugging
1084     on them (see L<LOGGING> below for explanation).
1085    
1086     =cut
1087    
1088     =head2 _eval
1089    
1090     Internal function to eval code without C<strict 'subs'>.
1091    
1092     =cut
1093    
1094     sub _eval {
1095     my $self = shift;
1096    
1097     my $code = shift || return;
1098    
1099     my $log = $self->_get_logger();
1100    
1101     no strict 'subs';
1102     my $ret = eval $code;
1103     if ($@) {
1104     $log->error("problem with eval code [$code]: $@");
1105     }
1106    
1107     $log->debug("eval: ",$code," [",$ret,"]");
1108    
1109 dpavlin 500 return $ret || undef;
1110 dpavlin 372 }
1111    
1112     =head2 _sort_by_order
1113    
1114     Sort xml tags data structure accoding to C<order=""> attribute.
1115    
1116     =cut
1117    
1118     sub _sort_by_order {
1119     my $self = shift;
1120    
1121     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1122     $self->{'import_xml'}->{'indexer'}->{$a};
1123     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1124     $self->{'import_xml'}->{'indexer'}->{$b};
1125    
1126     return $va <=> $vb;
1127     }
1128    
1129 dpavlin 375 =head2 _get_logger
1130    
1131     Get C<Log::Log4perl> object with a twist: domains are defined for each
1132     method
1133    
1134     my $log = $webpac->_get_logger();
1135    
1136     =cut
1137    
1138 dpavlin 372 sub _get_logger {
1139     my $self = shift;
1140    
1141 dpavlin 374 my $name = (caller(1))[3] || caller;
1142     return get_logger($name);
1143 dpavlin 372 }
1144    
1145 dpavlin 375 =head2 _x
1146    
1147     Convert string from UTF-8 to code page defined in C<import_xml>.
1148    
1149     my $text = $webpac->_x('utf8 text');
1150    
1151     =cut
1152    
1153     sub _x {
1154     my $self = shift;
1155     my $utf8 = shift || return;
1156    
1157     return $self->{'utf2cp'}->convert($utf8) ||
1158     $self->_get_logger()->logwarn("can't convert '$utf8'");
1159     }
1160    
1161 dpavlin 372 #
1162     #
1163     #
1164    
1165     =head1 LOGGING
1166    
1167     Logging in WebPAC is performed by L<Log::Log4perl> with config file
1168     C<log.conf>.
1169    
1170     Methods defined above have different levels of logging, so
1171     it's descriptions will be useful to turn (mostry B<debug> logging) on
1172     or off to see why WabPAC isn't perforing as you expect it (it might even
1173     be a bug!).
1174    
1175     B<This is different from normal Log4perl behaviour>. To repeat, you can
1176     also use method names, and not only classes (which are just few)
1177     to filter logging.
1178    
1179 dpavlin 422
1180     =head1 MEMORY USAGE
1181    
1182     C<low_mem> options is double-edged sword. If enabled, WebPAC
1183     will run on memory constraint machines (which doesn't have enough
1184     physical RAM to create memory structure for whole source database).
1185    
1186     If your machine has 512Mb or more of RAM and database is around 10000 records,
1187     memory shouldn't be an issue. If you don't have enough physical RAM, you
1188     might consider using virtual memory (if your operating system is handling it
1189     well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1190     parsed structure of ISIS database (this is what C<low_mem> option does).
1191    
1192     Hitting swap at end of reading source database is probably o.k. However,
1193     hitting swap before 90% will dramatically decrease performance and you will
1194     be better off with C<low_mem> and using rest of availble memory for
1195     operating system disk cache (Linux is particuallary good about this).
1196     However, every access to database record will require disk access, so
1197     generation phase will be slower 10-100 times.
1198    
1199     Parsed structures are essential - you just have option to trade RAM memory
1200     (which is fast) for disk space (which is slow). Be sure to have planty of
1201     disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1202    
1203     However, when WebPAC is running on desktop machines (or laptops :-), it's
1204     highly undesireable for system to start swapping. Using C<low_mem> option can
1205     reduce WecPAC memory usage to around 64Mb for same database with lookup
1206     fields and sorted indexes which stay in RAM. Performance will suffer, but
1207     memory usage will really be minimal. It might be also more confortable to
1208     run WebPAC reniced on those machines.
1209    
1210 dpavlin 372 =cut
1211    
1212 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26