/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 501 - (hide annotations)
Sun Oct 10 11:25:10 2004 UTC (19 years, 5 months ago) by dpavlin
File size: 27330 byte(s)
implement CROVOC marker for tree

1 dpavlin 354 package WebPAC;
2 dpavlin 352
3 dpavlin 367 use warnings;
4     use strict;
5    
6 dpavlin 352 use Carp;
7 dpavlin 353 use Text::Iconv;
8     use Config::IniFiles;
9 dpavlin 363 use XML::Simple;
10 dpavlin 370 use Template;
11 dpavlin 372 use Log::Log4perl qw(get_logger :levels);
12 dpavlin 422 use Time::HiRes qw(time);
13 dpavlin 352
14 dpavlin 358 use Data::Dumper;
15    
16 dpavlin 373 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17     #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18     my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19     my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20    
21 dpavlin 352 =head1 NAME
22    
23 dpavlin 354 WebPAC - base class for WebPAC
24 dpavlin 352
25     =head1 DESCRIPTION
26    
27 dpavlin 354 This module implements methods used by WebPAC.
28 dpavlin 352
29     =head1 METHODS
30    
31     =head2 new
32    
33 dpavlin 421 Create new instance of WebPAC using configuration specified by C<config_file>.
34 dpavlin 352
35 dpavlin 354 my $webpac = new WebPAC(
36 dpavlin 352 config_file => 'name.conf',
37 dpavlin 439 code_page => 'ISO-8859-2',
38     low_mem => 1,
39 dpavlin 500 filter => {
40     'lower' => sub { lc($_[0]) },
41     },
42 dpavlin 352 );
43    
44     Default C<code_page> is C<ISO-8859-2>.
45    
46 dpavlin 422 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
47    
48 dpavlin 500 There is optinal parametar C<filter> which specify different filters which
49     can be applied using C<filter{name}> notation.
50    
51 dpavlin 421 This method will also read configuration files
52 dpavlin 353 C<global.conf> (used by indexer and Web font-end)
53     and configuration file specified by C<config_file>
54     which describes databases to be indexed.
55    
56 dpavlin 352 =cut
57    
58 dpavlin 363 # mapping between data type and tag which specify
59     # format in XML file
60     my %type2tag = (
61     'isis' => 'isis',
62     # 'excel' => 'column',
63     # 'marc' => 'marc',
64     # 'feed' => 'feed'
65     );
66    
67 dpavlin 352 sub new {
68     my $class = shift;
69     my $self = {@_};
70     bless($self, $class);
71    
72 dpavlin 422 $self->{'start_t'} = time();
73    
74 dpavlin 372 my $log_file = $self->{'log'} || "log.conf";
75     Log::Log4perl->init($log_file);
76    
77     my $log = $self->_get_logger();
78    
79 dpavlin 352 # fill in default values
80     # output codepage
81     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
82    
83 dpavlin 353 #
84     # read global.conf
85     #
86 dpavlin 372 $log->debug("read 'global.conf'");
87 dpavlin 352
88 dpavlin 372 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
89 dpavlin 352
90     # read global config parametars
91     foreach my $var (qw(
92     dbi_dbd
93     dbi_dsn
94     dbi_user
95     dbi_passwd
96     show_progress
97     my_unac_filter
98 dpavlin 370 output_template
99 dpavlin 352 )) {
100 dpavlin 370 $self->{'global_config'}->{$var} = $config->val('global', $var);
101 dpavlin 352 }
102    
103 dpavlin 353 #
104     # read indexer config file
105     #
106 dpavlin 352
107 dpavlin 372 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
108 dpavlin 352
109 dpavlin 370 # create UTF-8 convertor for import_xml files
110 dpavlin 366 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
111 dpavlin 370
112     # create Template toolkit instance
113     $self->{'tt'} = Template->new(
114     INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
115     # FILTERS => {
116     # 'foo' => \&foo_filter,
117     # },
118     EVAL_PERL => 1,
119     );
120    
121 dpavlin 421 # running with low_mem flag? well, use DBM::Deep then.
122     if ($self->{'low_mem'}) {
123 dpavlin 422 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
124 dpavlin 421
125     my $db_file = "data.db";
126    
127     if (-e $db_file) {
128     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
129     $log->debug("removed '$db_file' from last run");
130     }
131    
132 dpavlin 434 require DBM::Deep;
133 dpavlin 421
134     my $db = new DBM::Deep $db_file;
135    
136     $log->logdie("DBM::Deep error: $!") unless ($db);
137    
138     if ($db->error()) {
139     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
140     } else {
141 dpavlin 422 $log->debug("using file '$db_file' for DBM::Deep");
142 dpavlin 421 }
143    
144     $self->{'db'} = $db;
145     }
146    
147 dpavlin 500 $log->debug("filters defined: ",Dumper($self->{'filter'}));
148    
149 dpavlin 352 return $self;
150     }
151    
152     =head2 open_isis
153    
154     Open CDS/ISIS database using OpenIsis module and read all records to memory.
155    
156     $webpac->open_isis(
157     filename => '/data/ISIS/ISIS',
158     code_page => '852',
159 dpavlin 431 limit_mfn => 500,
160     start_mfn => 6000,
161 dpavlin 352 lookup => [ ... ],
162     );
163    
164     By default, ISIS code page is assumed to be C<852>.
165    
166 dpavlin 431 If optional parametar C<start_mfn> is set, this will be first MFN to read
167     from database (so you can skip beginning of your database if you need to).
168    
169 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
170     from database in example above.
171 dpavlin 352
172     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
173     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
174     value in index.
175    
176     lookup => [
177     { 'key' => 'd:v900', 'val' => 'v250^a' },
178     { 'eval' => '"v901^a" eq "Podruèje"',
179     'key' => 'pa:v561^4:v562^4:v461^1',
180     'val' => 'v900' },
181     ]
182    
183 dpavlin 367 Returns number of last record read into memory (size of database, really).
184    
185 dpavlin 352 =cut
186    
187     sub open_isis {
188     my $self = shift;
189     my $arg = {@_};
190    
191 dpavlin 372 my $log = $self->_get_logger();
192    
193     $log->logcroak("need filename") if (! $arg->{'filename'});
194 dpavlin 352 my $code_page = $arg->{'code_page'} || '852';
195    
196 dpavlin 436 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
197    
198 dpavlin 421 # store data in object
199     $self->{'isis_filename'} = $arg->{'filename'};
200     $self->{'isis_code_page'} = $code_page;
201    
202 dpavlin 353 use OpenIsis;
203    
204 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
205    
206     # create Text::Iconv object
207     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
208    
209 dpavlin 372 $log->info("reading ISIS database '",$arg->{'filename'},"'");
210 dpavlin 421 $log->debug("isis code page: $code_page");
211 dpavlin 363
212 dpavlin 352 my $isis_db = OpenIsis::open($arg->{'filename'});
213    
214     my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
215 dpavlin 431 my $startmfn = 1;
216 dpavlin 352
217 dpavlin 431 if (my $s = $self->{'start_mfn'}) {
218     $log->info("skipping to MFN $s");
219     $startmfn = $s;
220 dpavlin 448 } else {
221     $self->{'start_mfn'} = $startmfn;
222 dpavlin 431 }
223 dpavlin 357
224 dpavlin 431 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
225 dpavlin 363
226 dpavlin 431 $log->info("processing ",($maxmfn-$startmfn)." records...");
227    
228 dpavlin 352 # read database
229 dpavlin 431 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
230 dpavlin 352
231 dpavlin 418
232     $log->debug("mfn: $mfn\n");
233    
234 dpavlin 421 my $rec;
235    
236 dpavlin 352 # read record
237     my $row = OpenIsis::read( $isis_db, $mfn );
238     foreach my $k (keys %{$row}) {
239     if ($k ne "mfn") {
240     foreach my $l (@{$row->{$k}}) {
241     $l = $cp->convert($l);
242     # has subfields?
243     my $val;
244     if ($l =~ m/\^/) {
245     foreach my $t (split(/\^/,$l)) {
246     next if (! $t);
247     $val->{substr($t,0,1)} = substr($t,1);
248     }
249     } else {
250     $val = $l;
251     }
252    
253 dpavlin 421 push @{$rec->{$k}}, $val;
254 dpavlin 352 }
255 dpavlin 374 } else {
256 dpavlin 421 push @{$rec->{'000'}}, $mfn;
257 dpavlin 352 }
258    
259     }
260    
261 dpavlin 421 $log->confess("record $mfn empty?") unless ($rec);
262    
263     # store
264     if ($self->{'low_mem'}) {
265     $self->{'db'}->put($mfn, $rec);
266     } else {
267     $self->{'data'}->{$mfn} = $rec;
268     }
269    
270 dpavlin 352 # create lookup
271 dpavlin 355 $self->create_lookup($rec, @{$arg->{'lookup'}});
272 dpavlin 352
273 dpavlin 389 $self->progress_bar($mfn,$maxmfn);
274    
275 dpavlin 352 }
276    
277 dpavlin 448 $self->{'current_mfn'} = -1;
278 dpavlin 389 $self->{'last_pcnt'} = 0;
279 dpavlin 362
280 dpavlin 418 $log->debug("max mfn: $maxmfn");
281    
282 dpavlin 352 # store max mfn and return it.
283     return $self->{'max_mfn'} = $maxmfn;
284     }
285    
286 dpavlin 362 =head2 fetch_rec
287    
288     Fetch next record from database. It will also display progress bar (once
289     it's implemented, that is).
290    
291     my $rec = $webpac->fetch_rec;
292    
293     =cut
294    
295     sub fetch_rec {
296     my $self = shift;
297    
298 dpavlin 372 my $log = $self->_get_logger();
299 dpavlin 362
300 dpavlin 448 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
301 dpavlin 372
302 dpavlin 448 if ($self->{'current_mfn'} == -1) {
303     $self->{'current_mfn'} = $self->{'start_mfn'};
304     } else {
305     $self->{'current_mfn'}++;
306     }
307    
308     my $mfn = $self->{'current_mfn'};
309    
310 dpavlin 362 if ($mfn > $self->{'max_mfn'}) {
311     $self->{'current_mfn'} = $self->{'max_mfn'};
312 dpavlin 373 $log->debug("at EOF");
313 dpavlin 362 return;
314     }
315    
316 dpavlin 389 $self->progress_bar($mfn,$self->{'max_mfn'});
317    
318 dpavlin 421 if ($self->{'low_mem'}) {
319     return $self->{'db'}->get($mfn);
320     } else {
321     return $self->{'data'}->{$mfn};
322     }
323 dpavlin 362 }
324    
325 dpavlin 448 =head2 mfn
326    
327     Returns current record number (MFN).
328    
329     print $webpac->mfn;
330    
331     =cut
332    
333     sub mfn {
334     my $self = shift;
335     return $self->{'current_mfn'};
336     }
337    
338 dpavlin 389 =head2 progress_bar
339    
340     Draw progress bar on STDERR.
341    
342     $webpac->progress_bar($current, $max);
343    
344     =cut
345    
346     sub progress_bar {
347     my $self = shift;
348    
349     my ($curr,$max) = @_;
350    
351     my $log = $self->_get_logger();
352    
353     $log->logconfess("no current value!") if (! $curr);
354     $log->logconfess("no maximum value!") if (! $max);
355    
356     if ($curr > $max) {
357     $max = $curr;
358     $log->debug("overflow to $curr");
359     }
360    
361     $self->{'last_pcnt'} ||= 1;
362    
363 dpavlin 454 my $p = int($curr * 100 / $max) || 1;
364 dpavlin 389
365 dpavlin 422 # reset on re-run
366     if ($p < $self->{'last_pcnt'}) {
367     $self->{'last_pcnt'} = $p;
368     $self->{'last_t'} = time();
369 dpavlin 453 $self->{'last_curr'} = undef;
370 dpavlin 422 }
371    
372 dpavlin 459 $self->{'last_t'} ||= time();
373    
374 dpavlin 389 if ($p != $self->{'last_pcnt'}) {
375 dpavlin 422
376     my $last_curr = $self->{'last_curr'} || $curr;
377     my $t = time();
378     my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
379     my $eta = ($max-$curr) / ($rate || 1);
380     printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
381 dpavlin 389 $self->{'last_pcnt'} = $p;
382 dpavlin 422 $self->{'last_t'} = time();
383     $self->{'last_curr'} = $curr;
384 dpavlin 389 }
385 dpavlin 412 print STDERR "\n" if ($p == 100);
386 dpavlin 389 }
387    
388 dpavlin 422 =head2 fmt_time
389    
390     Format time (in seconds) for display.
391    
392     print $webpac->fmt_time(time());
393    
394     This method is called by L<progress_bar> to display remaining time.
395    
396     =cut
397    
398     sub fmt_time {
399     my $self = shift;
400    
401     my $t = shift || 0;
402     my $out = "";
403    
404     my ($ss,$mm,$hh) = gmtime($t);
405     $out .= "${hh}h" if ($hh);
406     $out .= sprintf("%02d:%02d", $mm,$ss);
407     $out .= " " if ($hh == 0);
408     return $out;
409     }
410    
411 dpavlin 363 =head2 open_import_xml
412    
413     Read file from C<import_xml/> directory and parse it.
414    
415     $webpac->open_import_xml(type => 'isis');
416    
417     =cut
418    
419     sub open_import_xml {
420     my $self = shift;
421    
422 dpavlin 372 my $log = $self->_get_logger();
423    
424 dpavlin 363 my $arg = {@_};
425 dpavlin 372 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
426 dpavlin 363
427 dpavlin 366 $self->{'type'} = $arg->{'type'};
428 dpavlin 363
429 dpavlin 366 my $type_base = $arg->{'type'};
430 dpavlin 363 $type_base =~ s/_.*$//g;
431    
432 dpavlin 366 $self->{'tag'} = $type2tag{$type_base};
433    
434 dpavlin 375 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
435 dpavlin 366
436     my $f = "./import_xml/".$self->{'type'}.".xml";
437 dpavlin 372 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
438 dpavlin 363
439 dpavlin 375 $log->info("reading '$f'");
440 dpavlin 363
441 dpavlin 375 $self->{'import_xml_file'} = $f;
442    
443 dpavlin 363 $self->{'import_xml'} = XMLin($f,
444 dpavlin 366 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
445 dpavlin 363 );
446    
447 dpavlin 375 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
448    
449 dpavlin 363 }
450    
451 dpavlin 355 =head2 create_lookup
452    
453     Create lookup from record using lookup definition.
454    
455 dpavlin 367 $self->create_lookup($rec, @lookups);
456    
457     Called internally by C<open_*> methods.
458    
459 dpavlin 355 =cut
460    
461     sub create_lookup {
462     my $self = shift;
463    
464 dpavlin 372 my $log = $self->_get_logger();
465 dpavlin 355
466 dpavlin 372 my $rec = shift || $log->logconfess("need record to create lookup");
467     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
468    
469 dpavlin 355 foreach my $i (@_) {
470 dpavlin 418 $log->logconfess("need key") unless defined($i->{'key'});
471     $log->logconfess("need val") unless defined($i->{'val'});
472    
473     if (defined($i->{'eval'})) {
474     # eval first, so we can skip fill_in for key and val
475     my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
476     if ($self->_eval($eval)) {
477     my $key = $self->fill_in($rec,$i->{'key'}) || next;
478     my @val = $self->fill_in($rec,$i->{'val'}) || next;
479 dpavlin 373 $log->debug("stored $key = ",sub { join(" | ",@val) });
480 dpavlin 355 push @{$self->{'lookup'}->{$key}}, @val;
481     }
482     } else {
483 dpavlin 418 my $key = $self->fill_in($rec,$i->{'key'}) || next;
484     my @val = $self->fill_in($rec,$i->{'val'}) || next;
485     $log->debug("stored $key = ",sub { join(" | ",@val) });
486     push @{$self->{'lookup'}->{$key}}, @val;
487 dpavlin 355 }
488     }
489     }
490    
491 dpavlin 356 =head2 get_data
492    
493     Returns value from record.
494    
495 dpavlin 367 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
496 dpavlin 356
497     Arguments are:
498     record reference C<$rec>,
499     field C<$f>,
500     optional subfiled C<$sf>,
501     index for repeatable values C<$i>.
502    
503 dpavlin 367 Optinal variable C<$found> will be incremeted if there
504 dpavlin 356 is field.
505    
506     Returns value or empty string.
507    
508     =cut
509    
510     sub get_data {
511     my $self = shift;
512    
513     my ($rec,$f,$sf,$i,$found) = @_;
514 dpavlin 367
515 dpavlin 356 if ($$rec->{$f}) {
516 dpavlin 367 return '' if (! $$rec->{$f}->[$i]);
517 dpavlin 389 no strict 'refs';
518 dpavlin 356 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
519     $$found++ if (defined($$found));
520     return $$rec->{$f}->[$i]->{$sf};
521     } elsif ($$rec->{$f}->[$i]) {
522     $$found++ if (defined($$found));
523 dpavlin 366 # it still might have subfield, just
524     # not specified, so we'll dump all
525     if ($$rec->{$f}->[$i] =~ /HASH/o) {
526     my $out;
527     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
528     $out .= $$rec->{$f}->[$i]->{$k}." ";
529     }
530     return $out;
531     } else {
532     return $$rec->{$f}->[$i];
533     }
534 dpavlin 356 }
535     } else {
536     return '';
537     }
538     }
539    
540 dpavlin 352 =head2 fill_in
541    
542     Workhourse of all: takes record from in-memory structure of database and
543     strings with placeholders and returns string or array of with substituted
544     values from record.
545    
546 dpavlin 367 my $text = $webpac->fill_in($rec,'v250^a');
547 dpavlin 352
548     Optional argument is ordinal number for repeatable fields. By default,
549 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
550     element is 0).
551     Following example will read second value from repeatable field.
552 dpavlin 352
553 dpavlin 367 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
554 dpavlin 353
555     This function B<does not> perform parsing of format to inteligenty skip
556     delimiters before fields which aren't used.
557    
558 dpavlin 376 This method will automatically decode UTF-8 string to local code page
559     if needed.
560    
561 dpavlin 352 =cut
562    
563 dpavlin 372 sub fill_in {
564 dpavlin 371 my $self = shift;
565    
566 dpavlin 372 my $log = $self->_get_logger();
567 dpavlin 371
568 dpavlin 372 my $rec = shift || $log->logconfess("need data record");
569     my $format = shift || $log->logconfess("need format to parse");
570 dpavlin 352 # iteration (for repeatable fields)
571     my $i = shift || 0;
572    
573     # FIXME remove for speedup?
574 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
575 dpavlin 352
576 dpavlin 376 if (utf8::is_utf8($format)) {
577     $format = $self->_x($format);
578     }
579    
580 dpavlin 352 my $found = 0;
581    
582 dpavlin 359 my $eval_code;
583     # remove eval{...} from beginning
584     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
585    
586 dpavlin 500 my $filter_name;
587     # remove filter{...} from beginning
588     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
589    
590 dpavlin 352 # do actual replacement of placeholders
591 dpavlin 373 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
592 dpavlin 352
593 dpavlin 353 if ($found) {
594 dpavlin 373 $log->debug("format: $format");
595 dpavlin 359 if ($eval_code) {
596     my $eval = $self->fill_in($rec,$eval_code,$i);
597 dpavlin 371 return if (! $self->_eval($eval));
598 dpavlin 359 }
599 dpavlin 500 if ($filter_name && $self->{'filter'}->{$filter_name}) {
600     $log->debug("filter '$filter_name' for $format");
601     $format = $self->{'filter'}->{$filter_name}->($format);
602     return unless(defined($format));
603     $log->debug("filter result: $format");
604     }
605 dpavlin 353 # do we have lookups?
606 dpavlin 373 if ($format =~ /$LOOKUP_REGEX/o) {
607     $log->debug("format '$format' has lookup");
608 dpavlin 353 return $self->lookup($format);
609     } else {
610     return $format;
611     }
612 dpavlin 352 } else {
613     return;
614     }
615     }
616    
617     =head2 lookup
618    
619 dpavlin 355 Perform lookups on format supplied to it.
620 dpavlin 352
621 dpavlin 367 my $text = $self->lookup('[v900]');
622 dpavlin 352
623 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
624    
625 dpavlin 352 =cut
626    
627     sub lookup {
628     my $self = shift;
629    
630 dpavlin 372 my $log = $self->_get_logger();
631 dpavlin 352
632 dpavlin 372 my $tmp = shift || $log->logconfess("need format");
633    
634 dpavlin 373 if ($tmp =~ /$LOOKUP_REGEX/o) {
635 dpavlin 352 my @in = ( $tmp );
636 dpavlin 372
637     $log->debug("lookup for: ",$tmp);
638    
639 dpavlin 352 my @out;
640     while (my $f = shift @in) {
641 dpavlin 373 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
642 dpavlin 352 my $k = $1;
643     if ($self->{'lookup'}->{$k}) {
644     foreach my $nv (@{$self->{'lookup'}->{$k}}) {
645     my $tmp2 = $f;
646 dpavlin 373 $tmp2 =~ s/lookup{$k}/$nv/g;
647 dpavlin 352 push @in, $tmp2;
648     }
649     } else {
650     undef $f;
651     }
652     } elsif ($f) {
653     push @out, $f;
654     }
655     }
656 dpavlin 373 $log->logconfess("return is array and it's not expected!") unless wantarray;
657 dpavlin 352 return @out;
658     } else {
659     return $tmp;
660     }
661     }
662    
663 dpavlin 356 =head2 parse
664    
665     Perform smart parsing of string, skipping delimiters for fields which aren't
666     defined. It can also eval code in format starting with C<eval{...}> and
667     return output or nothing depending on eval code.
668    
669 dpavlin 367 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
670 dpavlin 356
671     =cut
672    
673     sub parse {
674     my $self = shift;
675    
676 dpavlin 366 my ($rec, $format_utf8, $i) = @_;
677 dpavlin 356
678 dpavlin 366 return if (! $format_utf8);
679    
680 dpavlin 372 my $log = $self->_get_logger();
681 dpavlin 358
682 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
683     $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
684    
685 dpavlin 358 $i = 0 if (! $i);
686    
687 dpavlin 375 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
688 dpavlin 366
689 dpavlin 356 my @out;
690    
691 dpavlin 373 $log->debug("format: $format");
692    
693 dpavlin 356 my $eval_code;
694     # remove eval{...} from beginning
695     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
696    
697 dpavlin 500 my $filter_name;
698     # remove filter{...} from beginning
699     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
700    
701 dpavlin 358 my $prefix;
702     my $all_found=0;
703 dpavlin 356
704 dpavlin 373 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
705 dpavlin 356
706 dpavlin 358 my $del = $1 || '';
707 dpavlin 359 $prefix ||= $del if ($all_found == 0);
708 dpavlin 358
709     my $found = 0;
710     my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
711    
712 dpavlin 356 if ($found) {
713 dpavlin 358 push @out, $del;
714     push @out, $tmp;
715     $all_found += $found;
716 dpavlin 356 }
717     }
718    
719 dpavlin 358 return if (! $all_found);
720 dpavlin 356
721 dpavlin 373 my $out = join('',@out);
722 dpavlin 358
723 dpavlin 373 if ($out) {
724     # add rest of format (suffix)
725     $out .= $format;
726 dpavlin 367
727 dpavlin 373 # add prefix if not there
728     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
729    
730     $log->debug("result: $out");
731     }
732    
733 dpavlin 359 if ($eval_code) {
734 dpavlin 501 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
735 dpavlin 500 $log->debug("about to eval{$eval} format: $out");
736 dpavlin 371 return if (! $self->_eval($eval));
737 dpavlin 359 }
738 dpavlin 500
739     if ($filter_name && $self->{'filter'}->{$filter_name}) {
740     $log->debug("about to filter{$filter_name} format: $out");
741     $out = $self->{'filter'}->{$filter_name}->($out);
742     return unless(defined($out));
743     $log->debug("filter result: $out");
744     }
745 dpavlin 359
746 dpavlin 358 return $out;
747 dpavlin 356 }
748    
749 dpavlin 367 =head2 parse_to_arr
750    
751     Similar to C<parse>, but returns array of all repeatable fields
752    
753     my @arr = $webpac->parse_to_arr($rec,'v250^a');
754    
755     =cut
756    
757     sub parse_to_arr {
758     my $self = shift;
759    
760     my ($rec, $format_utf8) = @_;
761    
762 dpavlin 372 my $log = $self->_get_logger();
763    
764     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
765 dpavlin 367 return if (! $format_utf8);
766    
767     my $i = 0;
768     my @arr;
769    
770     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
771     push @arr, $v;
772     }
773    
774 dpavlin 373 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
775    
776 dpavlin 367 return @arr;
777     }
778    
779 dpavlin 373 =head2 fill_in_to_arr
780    
781     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
782     for fields which have lookups, so they shouldn't be parsed but rather
783     C<fill_id>ed.
784    
785     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
786    
787     =cut
788    
789     sub fill_in_to_arr {
790     my $self = shift;
791    
792     my ($rec, $format_utf8) = @_;
793    
794     my $log = $self->_get_logger();
795    
796     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
797     return if (! $format_utf8);
798    
799     my $i = 0;
800     my @arr;
801    
802     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
803     push @arr, @v;
804     }
805    
806     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
807    
808     return @arr;
809     }
810    
811 dpavlin 441 =head2 sort_arr
812 dpavlin 373
813 dpavlin 441 Sort array ignoring case and html in data
814    
815     my @sorted = $webpac->sort_arr(@unsorted);
816    
817     =cut
818    
819     sub sort_arr {
820     my $self = shift;
821    
822     my $log = $self->_get_logger();
823    
824     # FIXME add Schwartzian Transformation?
825    
826     my @sorted = sort {
827     $a =~ s#<[^>]+/*>##;
828     $b =~ s#<[^>]+/*>##;
829     lc($b) cmp lc($a)
830     } @_;
831     $log->debug("sorted values: ",sub { join(", ",@sorted) });
832    
833     return @sorted;
834     }
835    
836    
837 dpavlin 366 =head2 data_structure
838    
839     Create in-memory data structure which represents layout from C<import_xml>.
840     It is used later to produce output.
841    
842 dpavlin 368 my @ds = $webpac->data_structure($rec);
843 dpavlin 366
844 dpavlin 374 This method will also set C<$webpac->{'currnet_filename'}> if there is
845 dpavlin 398 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
846     <headline> tag.
847 dpavlin 374
848 dpavlin 366 =cut
849    
850 dpavlin 372 sub data_structure {
851 dpavlin 366 my $self = shift;
852    
853 dpavlin 372 my $log = $self->_get_logger();
854 dpavlin 366
855     my $rec = shift;
856 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
857 dpavlin 366
858 dpavlin 374 undef $self->{'currnet_filename'};
859 dpavlin 398 undef $self->{'headline'};
860 dpavlin 374
861 dpavlin 366 my @sorted_tags;
862     if ($self->{tags_by_order}) {
863     @sorted_tags = @{$self->{tags_by_order}};
864     } else {
865     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
866     $self->{tags_by_order} = \@sorted_tags;
867     }
868    
869 dpavlin 368 my @ds;
870 dpavlin 366
871 dpavlin 373 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
872    
873 dpavlin 366 foreach my $field (@sorted_tags) {
874    
875     my $row;
876    
877     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
878    
879     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
880 dpavlin 373 my $format = $tag->{'value'} || $tag->{'content'};
881 dpavlin 366
882 dpavlin 373 $log->debug("format: $format");
883    
884     my @v;
885     if ($format =~ /$LOOKUP_REGEX/o) {
886     @v = $self->fill_in_to_arr($rec,$format);
887     } else {
888     @v = $self->parse_to_arr($rec,$format);
889     }
890 dpavlin 367 next if (! @v);
891 dpavlin 366
892 dpavlin 439 if ($tag->{'sort'}) {
893 dpavlin 441 @v = $self->sort_arr(@v);
894 dpavlin 439 $log->warn("sort within tag is usually not what you want!");
895     }
896    
897 dpavlin 375 # use format?
898     if ($tag->{'format_name'}) {
899     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
900     }
901    
902 dpavlin 398 if ($field eq 'filename') {
903     $self->{'current_filename'} = join('',@v);
904     $log->debug("filename: ",$self->{'current_filename'});
905     } elsif ($field eq 'headline') {
906     $self->{'headline'} .= join('',@v);
907     $log->debug("headline: ",$self->{'headline'});
908     next; # don't return headline in data_structure!
909     }
910    
911 dpavlin 439 # delimiter will join repeatable fields
912     if ($tag->{'delimiter'}) {
913     @v = ( join($tag->{'delimiter'}, @v) );
914 dpavlin 366 }
915 dpavlin 373
916 dpavlin 439 # default types
917     my @types = qw(display swish);
918     # override by type attribute
919     @types = ( $tag->{'type'} ) if ($tag->{'type'});
920 dpavlin 374
921 dpavlin 439 foreach my $type (@types) {
922     # append to previous line?
923     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
924     if ($tag->{'append'}) {
925    
926     # I will delimit appended part with
927     # delimiter (or ,)
928     my $d = $tag->{'delimiter'};
929     # default delimiter
930 dpavlin 501 $d ||= " ";
931 dpavlin 439
932     my $last = pop @{$row->{$type}};
933     $d = "" if (! $last);
934     $last .= $d . join($d, @v);
935     push @{$row->{$type}}, $last;
936    
937     } else {
938     push @{$row->{$type}}, @v;
939     }
940     }
941    
942    
943 dpavlin 366 }
944    
945 dpavlin 368 if ($row) {
946     $row->{'tag'} = $field;
947 dpavlin 375
948     # TODO: name_sigular, name_plural
949     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
950     $row->{'name'} = $name ? $self->_x($name) : $field;
951    
952 dpavlin 439 # post-sort all values in field
953     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
954     $log->warn("sort at field tag not implemented");
955     }
956    
957 dpavlin 368 push @ds, $row;
958 dpavlin 374
959 dpavlin 373 $log->debug("row $field: ",sub { Dumper($row) });
960 dpavlin 368 }
961 dpavlin 366
962     }
963    
964 dpavlin 370 return @ds;
965 dpavlin 366
966     }
967    
968 dpavlin 370 =head2 output
969    
970     Create output from in-memory data structure using Template Toolkit template.
971    
972     my $text = $webpac->output( template => 'text.tt', data => @ds );
973    
974     =cut
975    
976     sub output {
977     my $self = shift;
978    
979     my $args = {@_};
980    
981 dpavlin 372 my $log = $self->_get_logger();
982 dpavlin 370
983 dpavlin 372 $log->logconfess("need template name") if (! $args->{'template'});
984     $log->logconfess("need data array") if (! $args->{'data'});
985    
986 dpavlin 370 my $out;
987    
988     $self->{'tt'}->process(
989     $args->{'template'},
990     $args,
991     \$out
992     ) || confess $self->{'tt'}->error();
993    
994     return $out;
995     }
996    
997 dpavlin 411 =head2 output_file
998    
999     Create output from in-memory data structure using Template Toolkit template
1000     to a file.
1001    
1002     $webpac->output_file(
1003     file => 'out.txt',
1004     template => 'text.tt',
1005     data => @ds
1006     );
1007    
1008     =cut
1009    
1010     sub output_file {
1011     my $self = shift;
1012    
1013     my $args = {@_};
1014    
1015     my $log = $self->_get_logger();
1016    
1017 dpavlin 421 my $file = $args->{'file'} || $log->logconfess("need file name");
1018 dpavlin 411
1019 dpavlin 421 $log->debug("creating file ",$file);
1020 dpavlin 411
1021 dpavlin 421 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1022 dpavlin 411 print $fh $self->output(
1023     template => $args->{'template'},
1024     data => $args->{'data'},
1025     ) || $log->logdie("print: $!");
1026     close($fh) || $log->logdie("close: $!");
1027     }
1028    
1029 dpavlin 375 =head2 apply_format
1030    
1031     Apply format specified in tag with C<format_name="name"> and
1032     C<format_delimiter=";;">.
1033    
1034     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1035    
1036     Formats can contain C<lookup{...}> if you need them.
1037    
1038     =cut
1039    
1040     sub apply_format {
1041     my $self = shift;
1042    
1043     my ($name,$delimiter,$data) = @_;
1044    
1045     my $log = $self->_get_logger();
1046    
1047     if (! $self->{'import_xml'}->{'format'}->{$name}) {
1048     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1049     return $data;
1050     }
1051    
1052     $log->warn("no delimiter for format $name") if (! $delimiter);
1053    
1054     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1055    
1056     my @data = split(/\Q$delimiter\E/, $data);
1057    
1058     my $out = sprintf($format, @data);
1059     $log->debug("using format $name [$format] on $data to produce: $out");
1060    
1061     if ($out =~ m/$LOOKUP_REGEX/o) {
1062     return $self->lookup($out);
1063     } else {
1064     return $out;
1065     }
1066    
1067     }
1068    
1069    
1070 dpavlin 372 #
1071     #
1072     #
1073    
1074     =head1 INTERNAL METHODS
1075    
1076     Here is a quick list of internal methods, mostly useful to turn debugging
1077     on them (see L<LOGGING> below for explanation).
1078    
1079     =cut
1080    
1081     =head2 _eval
1082    
1083     Internal function to eval code without C<strict 'subs'>.
1084    
1085     =cut
1086    
1087     sub _eval {
1088     my $self = shift;
1089    
1090     my $code = shift || return;
1091    
1092     my $log = $self->_get_logger();
1093    
1094     no strict 'subs';
1095     my $ret = eval $code;
1096     if ($@) {
1097     $log->error("problem with eval code [$code]: $@");
1098     }
1099    
1100     $log->debug("eval: ",$code," [",$ret,"]");
1101    
1102 dpavlin 500 return $ret || undef;
1103 dpavlin 372 }
1104    
1105     =head2 _sort_by_order
1106    
1107     Sort xml tags data structure accoding to C<order=""> attribute.
1108    
1109     =cut
1110    
1111     sub _sort_by_order {
1112     my $self = shift;
1113    
1114     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1115     $self->{'import_xml'}->{'indexer'}->{$a};
1116     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1117     $self->{'import_xml'}->{'indexer'}->{$b};
1118    
1119     return $va <=> $vb;
1120     }
1121    
1122 dpavlin 375 =head2 _get_logger
1123    
1124     Get C<Log::Log4perl> object with a twist: domains are defined for each
1125     method
1126    
1127     my $log = $webpac->_get_logger();
1128    
1129     =cut
1130    
1131 dpavlin 372 sub _get_logger {
1132     my $self = shift;
1133    
1134 dpavlin 374 my $name = (caller(1))[3] || caller;
1135     return get_logger($name);
1136 dpavlin 372 }
1137    
1138 dpavlin 375 =head2 _x
1139    
1140     Convert string from UTF-8 to code page defined in C<import_xml>.
1141    
1142     my $text = $webpac->_x('utf8 text');
1143    
1144     =cut
1145    
1146     sub _x {
1147     my $self = shift;
1148     my $utf8 = shift || return;
1149    
1150     return $self->{'utf2cp'}->convert($utf8) ||
1151     $self->_get_logger()->logwarn("can't convert '$utf8'");
1152     }
1153    
1154 dpavlin 372 #
1155     #
1156     #
1157    
1158     =head1 LOGGING
1159    
1160     Logging in WebPAC is performed by L<Log::Log4perl> with config file
1161     C<log.conf>.
1162    
1163     Methods defined above have different levels of logging, so
1164     it's descriptions will be useful to turn (mostry B<debug> logging) on
1165     or off to see why WabPAC isn't perforing as you expect it (it might even
1166     be a bug!).
1167    
1168     B<This is different from normal Log4perl behaviour>. To repeat, you can
1169     also use method names, and not only classes (which are just few)
1170     to filter logging.
1171    
1172 dpavlin 422
1173     =head1 MEMORY USAGE
1174    
1175     C<low_mem> options is double-edged sword. If enabled, WebPAC
1176     will run on memory constraint machines (which doesn't have enough
1177     physical RAM to create memory structure for whole source database).
1178    
1179     If your machine has 512Mb or more of RAM and database is around 10000 records,
1180     memory shouldn't be an issue. If you don't have enough physical RAM, you
1181     might consider using virtual memory (if your operating system is handling it
1182     well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1183     parsed structure of ISIS database (this is what C<low_mem> option does).
1184    
1185     Hitting swap at end of reading source database is probably o.k. However,
1186     hitting swap before 90% will dramatically decrease performance and you will
1187     be better off with C<low_mem> and using rest of availble memory for
1188     operating system disk cache (Linux is particuallary good about this).
1189     However, every access to database record will require disk access, so
1190     generation phase will be slower 10-100 times.
1191    
1192     Parsed structures are essential - you just have option to trade RAM memory
1193     (which is fast) for disk space (which is slow). Be sure to have planty of
1194     disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1195    
1196     However, when WebPAC is running on desktop machines (or laptops :-), it's
1197     highly undesireable for system to start swapping. Using C<low_mem> option can
1198     reduce WecPAC memory usage to around 64Mb for same database with lookup
1199     fields and sorted indexes which stay in RAM. Performance will suffer, but
1200     memory usage will really be minimal. It might be also more confortable to
1201     run WebPAC reniced on those machines.
1202    
1203 dpavlin 372 =cut
1204    
1205 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26