/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 436 - (hide annotations)
Mon Sep 13 14:55:13 2004 UTC (19 years, 6 months ago) by dpavlin
File size: 24704 byte(s)
die if database doesn't exist

1 dpavlin 354 package WebPAC;
2 dpavlin 352
3 dpavlin 367 use warnings;
4     use strict;
5    
6 dpavlin 352 use Carp;
7 dpavlin 353 use Text::Iconv;
8     use Config::IniFiles;
9 dpavlin 363 use XML::Simple;
10 dpavlin 370 use Template;
11 dpavlin 372 use Log::Log4perl qw(get_logger :levels);
12 dpavlin 422 use Time::HiRes qw(time);
13 dpavlin 352
14 dpavlin 358 use Data::Dumper;
15    
16 dpavlin 373 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17     #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18     my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19     my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20    
21 dpavlin 352 =head1 NAME
22    
23 dpavlin 354 WebPAC - base class for WebPAC
24 dpavlin 352
25     =head1 DESCRIPTION
26    
27 dpavlin 354 This module implements methods used by WebPAC.
28 dpavlin 352
29     =head1 METHODS
30    
31     =head2 new
32    
33 dpavlin 421 Create new instance of WebPAC using configuration specified by C<config_file>.
34 dpavlin 352
35 dpavlin 354 my $webpac = new WebPAC(
36 dpavlin 352 config_file => 'name.conf',
37     [code_page => 'ISO-8859-2',]
38 dpavlin 421 [low_mem => 1,]
39 dpavlin 352 );
40    
41     Default C<code_page> is C<ISO-8859-2>.
42    
43 dpavlin 422 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44    
45 dpavlin 421 This method will also read configuration files
46 dpavlin 353 C<global.conf> (used by indexer and Web font-end)
47     and configuration file specified by C<config_file>
48     which describes databases to be indexed.
49    
50 dpavlin 352 =cut
51    
52 dpavlin 363 # mapping between data type and tag which specify
53     # format in XML file
54     my %type2tag = (
55     'isis' => 'isis',
56     # 'excel' => 'column',
57     # 'marc' => 'marc',
58     # 'feed' => 'feed'
59     );
60    
61 dpavlin 352 sub new {
62     my $class = shift;
63     my $self = {@_};
64     bless($self, $class);
65    
66 dpavlin 422 $self->{'start_t'} = time();
67    
68 dpavlin 372 my $log_file = $self->{'log'} || "log.conf";
69     Log::Log4perl->init($log_file);
70    
71     my $log = $self->_get_logger();
72    
73 dpavlin 352 # fill in default values
74     # output codepage
75     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
76    
77 dpavlin 353 #
78     # read global.conf
79     #
80 dpavlin 372 $log->debug("read 'global.conf'");
81 dpavlin 352
82 dpavlin 372 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
83 dpavlin 352
84     # read global config parametars
85     foreach my $var (qw(
86     dbi_dbd
87     dbi_dsn
88     dbi_user
89     dbi_passwd
90     show_progress
91     my_unac_filter
92 dpavlin 370 output_template
93 dpavlin 352 )) {
94 dpavlin 370 $self->{'global_config'}->{$var} = $config->val('global', $var);
95 dpavlin 352 }
96    
97 dpavlin 353 #
98     # read indexer config file
99     #
100 dpavlin 352
101 dpavlin 372 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
102 dpavlin 352
103 dpavlin 370 # create UTF-8 convertor for import_xml files
104 dpavlin 366 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
105 dpavlin 370
106     # create Template toolkit instance
107     $self->{'tt'} = Template->new(
108     INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
109     # FILTERS => {
110     # 'foo' => \&foo_filter,
111     # },
112     EVAL_PERL => 1,
113     );
114    
115 dpavlin 421 # running with low_mem flag? well, use DBM::Deep then.
116     if ($self->{'low_mem'}) {
117 dpavlin 422 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118 dpavlin 421
119     my $db_file = "data.db";
120    
121     if (-e $db_file) {
122     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123     $log->debug("removed '$db_file' from last run");
124     }
125    
126 dpavlin 434 require DBM::Deep;
127 dpavlin 421
128     my $db = new DBM::Deep $db_file;
129    
130     $log->logdie("DBM::Deep error: $!") unless ($db);
131    
132     if ($db->error()) {
133     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134     } else {
135 dpavlin 422 $log->debug("using file '$db_file' for DBM::Deep");
136 dpavlin 421 }
137    
138     $self->{'db'} = $db;
139     }
140    
141 dpavlin 352 return $self;
142     }
143    
144     =head2 open_isis
145    
146     Open CDS/ISIS database using OpenIsis module and read all records to memory.
147    
148     $webpac->open_isis(
149     filename => '/data/ISIS/ISIS',
150     code_page => '852',
151 dpavlin 431 limit_mfn => 500,
152     start_mfn => 6000,
153 dpavlin 352 lookup => [ ... ],
154     );
155    
156     By default, ISIS code page is assumed to be C<852>.
157    
158 dpavlin 431 If optional parametar C<start_mfn> is set, this will be first MFN to read
159     from database (so you can skip beginning of your database if you need to).
160    
161 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
162     from database in example above.
163 dpavlin 352
164     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
165     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
166     value in index.
167    
168     lookup => [
169     { 'key' => 'd:v900', 'val' => 'v250^a' },
170     { 'eval' => '"v901^a" eq "Podruèje"',
171     'key' => 'pa:v561^4:v562^4:v461^1',
172     'val' => 'v900' },
173     ]
174    
175 dpavlin 367 Returns number of last record read into memory (size of database, really).
176    
177 dpavlin 352 =cut
178    
179     sub open_isis {
180     my $self = shift;
181     my $arg = {@_};
182    
183 dpavlin 372 my $log = $self->_get_logger();
184    
185     $log->logcroak("need filename") if (! $arg->{'filename'});
186 dpavlin 352 my $code_page = $arg->{'code_page'} || '852';
187    
188 dpavlin 436 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
189    
190 dpavlin 421 # store data in object
191     $self->{'isis_filename'} = $arg->{'filename'};
192     $self->{'isis_code_page'} = $code_page;
193    
194 dpavlin 353 use OpenIsis;
195    
196 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
197    
198     # create Text::Iconv object
199     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
200    
201 dpavlin 372 $log->info("reading ISIS database '",$arg->{'filename'},"'");
202 dpavlin 421 $log->debug("isis code page: $code_page");
203 dpavlin 363
204 dpavlin 352 my $isis_db = OpenIsis::open($arg->{'filename'});
205    
206     my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
207 dpavlin 431 my $startmfn = 1;
208 dpavlin 352
209 dpavlin 431 if (my $s = $self->{'start_mfn'}) {
210     $log->info("skipping to MFN $s");
211     $startmfn = $s;
212     }
213 dpavlin 357
214 dpavlin 431 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
215 dpavlin 363
216 dpavlin 431 $log->info("processing ",($maxmfn-$startmfn)." records...");
217    
218 dpavlin 352 # read database
219 dpavlin 431 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
220 dpavlin 352
221 dpavlin 418
222     $log->debug("mfn: $mfn\n");
223    
224 dpavlin 421 my $rec;
225    
226 dpavlin 352 # read record
227     my $row = OpenIsis::read( $isis_db, $mfn );
228     foreach my $k (keys %{$row}) {
229     if ($k ne "mfn") {
230     foreach my $l (@{$row->{$k}}) {
231     $l = $cp->convert($l);
232     # has subfields?
233     my $val;
234     if ($l =~ m/\^/) {
235     foreach my $t (split(/\^/,$l)) {
236     next if (! $t);
237     $val->{substr($t,0,1)} = substr($t,1);
238     }
239     } else {
240     $val = $l;
241     }
242    
243 dpavlin 421 push @{$rec->{$k}}, $val;
244 dpavlin 352 }
245 dpavlin 374 } else {
246 dpavlin 421 push @{$rec->{'000'}}, $mfn;
247 dpavlin 352 }
248    
249     }
250    
251 dpavlin 421 $log->confess("record $mfn empty?") unless ($rec);
252    
253     # store
254     if ($self->{'low_mem'}) {
255     $self->{'db'}->put($mfn, $rec);
256     } else {
257     $self->{'data'}->{$mfn} = $rec;
258     }
259    
260 dpavlin 352 # create lookup
261 dpavlin 355 $self->create_lookup($rec, @{$arg->{'lookup'}});
262 dpavlin 352
263 dpavlin 389 $self->progress_bar($mfn,$maxmfn);
264    
265 dpavlin 352 }
266    
267 dpavlin 431 $self->{'current_mfn'} = $startmfn;
268 dpavlin 389 $self->{'last_pcnt'} = 0;
269 dpavlin 362
270 dpavlin 418 $log->debug("max mfn: $maxmfn");
271    
272 dpavlin 352 # store max mfn and return it.
273     return $self->{'max_mfn'} = $maxmfn;
274     }
275    
276 dpavlin 362 =head2 fetch_rec
277    
278     Fetch next record from database. It will also display progress bar (once
279     it's implemented, that is).
280    
281     my $rec = $webpac->fetch_rec;
282    
283     =cut
284    
285     sub fetch_rec {
286     my $self = shift;
287    
288 dpavlin 372 my $log = $self->_get_logger();
289 dpavlin 362
290 dpavlin 372 my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");
291    
292 dpavlin 362 if ($mfn > $self->{'max_mfn'}) {
293     $self->{'current_mfn'} = $self->{'max_mfn'};
294 dpavlin 373 $log->debug("at EOF");
295 dpavlin 362 return;
296     }
297    
298 dpavlin 389 $self->progress_bar($mfn,$self->{'max_mfn'});
299    
300 dpavlin 421 if ($self->{'low_mem'}) {
301     return $self->{'db'}->get($mfn);
302     } else {
303     return $self->{'data'}->{$mfn};
304     }
305 dpavlin 362 }
306    
307 dpavlin 389 =head2 progress_bar
308    
309     Draw progress bar on STDERR.
310    
311     $webpac->progress_bar($current, $max);
312    
313     =cut
314    
315     sub progress_bar {
316     my $self = shift;
317    
318     my ($curr,$max) = @_;
319    
320     my $log = $self->_get_logger();
321    
322     $log->logconfess("no current value!") if (! $curr);
323     $log->logconfess("no maximum value!") if (! $max);
324    
325     if ($curr > $max) {
326     $max = $curr;
327     $log->debug("overflow to $curr");
328     }
329    
330     $self->{'last_pcnt'} ||= 1;
331    
332 dpavlin 422 my $p = int($curr * 100 / $max);
333 dpavlin 389
334 dpavlin 422 # reset on re-run
335     if ($p < $self->{'last_pcnt'}) {
336     $self->{'last_pcnt'} = $p;
337     $self->{'last_t'} = time();
338     $self->{'last_curr'} = 1;
339     }
340    
341 dpavlin 389 if ($p != $self->{'last_pcnt'}) {
342 dpavlin 422
343     my $last_curr = $self->{'last_curr'} || $curr;
344     my $t = time();
345     my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
346     my $eta = ($max-$curr) / ($rate || 1);
347     printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
348 dpavlin 389 $self->{'last_pcnt'} = $p;
349 dpavlin 422 $self->{'last_t'} = time();
350     $self->{'last_curr'} = $curr;
351 dpavlin 389 }
352 dpavlin 412 print STDERR "\n" if ($p == 100);
353 dpavlin 389 }
354    
355 dpavlin 422 =head2 fmt_time
356    
357     Format time (in seconds) for display.
358    
359     print $webpac->fmt_time(time());
360    
361     This method is called by L<progress_bar> to display remaining time.
362    
363     =cut
364    
365     sub fmt_time {
366     my $self = shift;
367    
368     my $t = shift || 0;
369     my $out = "";
370    
371     my ($ss,$mm,$hh) = gmtime($t);
372     $out .= "${hh}h" if ($hh);
373     $out .= sprintf("%02d:%02d", $mm,$ss);
374     $out .= " " if ($hh == 0);
375     return $out;
376     }
377    
378 dpavlin 363 =head2 open_import_xml
379    
380     Read file from C<import_xml/> directory and parse it.
381    
382     $webpac->open_import_xml(type => 'isis');
383    
384     =cut
385    
386     sub open_import_xml {
387     my $self = shift;
388    
389 dpavlin 372 my $log = $self->_get_logger();
390    
391 dpavlin 363 my $arg = {@_};
392 dpavlin 372 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
393 dpavlin 363
394 dpavlin 366 $self->{'type'} = $arg->{'type'};
395 dpavlin 363
396 dpavlin 366 my $type_base = $arg->{'type'};
397 dpavlin 363 $type_base =~ s/_.*$//g;
398    
399 dpavlin 366 $self->{'tag'} = $type2tag{$type_base};
400    
401 dpavlin 375 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
402 dpavlin 366
403     my $f = "./import_xml/".$self->{'type'}.".xml";
404 dpavlin 372 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
405 dpavlin 363
406 dpavlin 375 $log->info("reading '$f'");
407 dpavlin 363
408 dpavlin 375 $self->{'import_xml_file'} = $f;
409    
410 dpavlin 363 $self->{'import_xml'} = XMLin($f,
411 dpavlin 366 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
412 dpavlin 363 );
413    
414 dpavlin 375 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
415    
416 dpavlin 363 }
417    
418 dpavlin 355 =head2 create_lookup
419    
420     Create lookup from record using lookup definition.
421    
422 dpavlin 367 $self->create_lookup($rec, @lookups);
423    
424     Called internally by C<open_*> methods.
425    
426 dpavlin 355 =cut
427    
428     sub create_lookup {
429     my $self = shift;
430    
431 dpavlin 372 my $log = $self->_get_logger();
432 dpavlin 355
433 dpavlin 372 my $rec = shift || $log->logconfess("need record to create lookup");
434     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
435    
436 dpavlin 355 foreach my $i (@_) {
437 dpavlin 418 $log->logconfess("need key") unless defined($i->{'key'});
438     $log->logconfess("need val") unless defined($i->{'val'});
439    
440     if (defined($i->{'eval'})) {
441     # eval first, so we can skip fill_in for key and val
442     my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
443     if ($self->_eval($eval)) {
444     my $key = $self->fill_in($rec,$i->{'key'}) || next;
445     my @val = $self->fill_in($rec,$i->{'val'}) || next;
446 dpavlin 373 $log->debug("stored $key = ",sub { join(" | ",@val) });
447 dpavlin 355 push @{$self->{'lookup'}->{$key}}, @val;
448     }
449     } else {
450 dpavlin 418 my $key = $self->fill_in($rec,$i->{'key'}) || next;
451     my @val = $self->fill_in($rec,$i->{'val'}) || next;
452     $log->debug("stored $key = ",sub { join(" | ",@val) });
453     push @{$self->{'lookup'}->{$key}}, @val;
454 dpavlin 355 }
455     }
456     }
457    
458 dpavlin 356 =head2 get_data
459    
460     Returns value from record.
461    
462 dpavlin 367 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
463 dpavlin 356
464     Arguments are:
465     record reference C<$rec>,
466     field C<$f>,
467     optional subfiled C<$sf>,
468     index for repeatable values C<$i>.
469    
470 dpavlin 367 Optinal variable C<$found> will be incremeted if there
471 dpavlin 356 is field.
472    
473     Returns value or empty string.
474    
475     =cut
476    
477     sub get_data {
478     my $self = shift;
479    
480     my ($rec,$f,$sf,$i,$found) = @_;
481 dpavlin 367
482 dpavlin 356 if ($$rec->{$f}) {
483 dpavlin 367 return '' if (! $$rec->{$f}->[$i]);
484 dpavlin 389 no strict 'refs';
485 dpavlin 356 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
486     $$found++ if (defined($$found));
487     return $$rec->{$f}->[$i]->{$sf};
488     } elsif ($$rec->{$f}->[$i]) {
489     $$found++ if (defined($$found));
490 dpavlin 366 # it still might have subfield, just
491     # not specified, so we'll dump all
492     if ($$rec->{$f}->[$i] =~ /HASH/o) {
493     my $out;
494     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
495     $out .= $$rec->{$f}->[$i]->{$k}." ";
496     }
497     return $out;
498     } else {
499     return $$rec->{$f}->[$i];
500     }
501 dpavlin 356 }
502     } else {
503     return '';
504     }
505     }
506    
507 dpavlin 352 =head2 fill_in
508    
509     Workhourse of all: takes record from in-memory structure of database and
510     strings with placeholders and returns string or array of with substituted
511     values from record.
512    
513 dpavlin 367 my $text = $webpac->fill_in($rec,'v250^a');
514 dpavlin 352
515     Optional argument is ordinal number for repeatable fields. By default,
516 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
517     element is 0).
518     Following example will read second value from repeatable field.
519 dpavlin 352
520 dpavlin 367 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
521 dpavlin 353
522     This function B<does not> perform parsing of format to inteligenty skip
523     delimiters before fields which aren't used.
524    
525 dpavlin 376 This method will automatically decode UTF-8 string to local code page
526     if needed.
527    
528 dpavlin 352 =cut
529    
530 dpavlin 372 sub fill_in {
531 dpavlin 371 my $self = shift;
532    
533 dpavlin 372 my $log = $self->_get_logger();
534 dpavlin 371
535 dpavlin 372 my $rec = shift || $log->logconfess("need data record");
536     my $format = shift || $log->logconfess("need format to parse");
537 dpavlin 352 # iteration (for repeatable fields)
538     my $i = shift || 0;
539    
540     # FIXME remove for speedup?
541 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
542 dpavlin 352
543 dpavlin 376 if (utf8::is_utf8($format)) {
544     $format = $self->_x($format);
545     }
546    
547 dpavlin 352 my $found = 0;
548    
549 dpavlin 359 my $eval_code;
550     # remove eval{...} from beginning
551     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
552    
553 dpavlin 352 # do actual replacement of placeholders
554 dpavlin 373 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
555 dpavlin 352
556 dpavlin 353 if ($found) {
557 dpavlin 373 $log->debug("format: $format");
558 dpavlin 359 if ($eval_code) {
559     my $eval = $self->fill_in($rec,$eval_code,$i);
560 dpavlin 371 return if (! $self->_eval($eval));
561 dpavlin 359 }
562 dpavlin 353 # do we have lookups?
563 dpavlin 373 if ($format =~ /$LOOKUP_REGEX/o) {
564     $log->debug("format '$format' has lookup");
565 dpavlin 353 return $self->lookup($format);
566     } else {
567     return $format;
568     }
569 dpavlin 352 } else {
570     return;
571     }
572     }
573    
574     =head2 lookup
575    
576 dpavlin 355 Perform lookups on format supplied to it.
577 dpavlin 352
578 dpavlin 367 my $text = $self->lookup('[v900]');
579 dpavlin 352
580 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
581    
582 dpavlin 352 =cut
583    
584     sub lookup {
585     my $self = shift;
586    
587 dpavlin 372 my $log = $self->_get_logger();
588 dpavlin 352
589 dpavlin 372 my $tmp = shift || $log->logconfess("need format");
590    
591 dpavlin 373 if ($tmp =~ /$LOOKUP_REGEX/o) {
592 dpavlin 352 my @in = ( $tmp );
593 dpavlin 372
594     $log->debug("lookup for: ",$tmp);
595    
596 dpavlin 352 my @out;
597     while (my $f = shift @in) {
598 dpavlin 373 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
599 dpavlin 352 my $k = $1;
600     if ($self->{'lookup'}->{$k}) {
601     foreach my $nv (@{$self->{'lookup'}->{$k}}) {
602     my $tmp2 = $f;
603 dpavlin 373 $tmp2 =~ s/lookup{$k}/$nv/g;
604 dpavlin 352 push @in, $tmp2;
605     }
606     } else {
607     undef $f;
608     }
609     } elsif ($f) {
610     push @out, $f;
611     }
612     }
613 dpavlin 373 $log->logconfess("return is array and it's not expected!") unless wantarray;
614 dpavlin 352 return @out;
615     } else {
616     return $tmp;
617     }
618     }
619    
620 dpavlin 356 =head2 parse
621    
622     Perform smart parsing of string, skipping delimiters for fields which aren't
623     defined. It can also eval code in format starting with C<eval{...}> and
624     return output or nothing depending on eval code.
625    
626 dpavlin 367 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
627 dpavlin 356
628     =cut
629    
630     sub parse {
631     my $self = shift;
632    
633 dpavlin 366 my ($rec, $format_utf8, $i) = @_;
634 dpavlin 356
635 dpavlin 366 return if (! $format_utf8);
636    
637 dpavlin 372 my $log = $self->_get_logger();
638 dpavlin 358
639 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
640     $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
641    
642 dpavlin 358 $i = 0 if (! $i);
643    
644 dpavlin 375 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
645 dpavlin 366
646 dpavlin 356 my @out;
647    
648 dpavlin 373 $log->debug("format: $format");
649    
650 dpavlin 356 my $eval_code;
651     # remove eval{...} from beginning
652     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
653    
654 dpavlin 358 my $prefix;
655     my $all_found=0;
656 dpavlin 356
657 dpavlin 373 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
658 dpavlin 356
659 dpavlin 358 my $del = $1 || '';
660 dpavlin 359 $prefix ||= $del if ($all_found == 0);
661 dpavlin 358
662     my $found = 0;
663     my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
664    
665 dpavlin 356 if ($found) {
666 dpavlin 358 push @out, $del;
667     push @out, $tmp;
668     $all_found += $found;
669 dpavlin 356 }
670     }
671    
672 dpavlin 358 return if (! $all_found);
673 dpavlin 356
674 dpavlin 373 my $out = join('',@out);
675 dpavlin 358
676 dpavlin 373 if ($out) {
677     # add rest of format (suffix)
678     $out .= $format;
679 dpavlin 367
680 dpavlin 373 # add prefix if not there
681     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
682    
683     $log->debug("result: $out");
684     }
685    
686 dpavlin 359 if ($eval_code) {
687     my $eval = $self->fill_in($rec,$eval_code,$i);
688 dpavlin 373 $log->debug("about to eval{",$eval,"} format: $out");
689 dpavlin 371 return if (! $self->_eval($eval));
690 dpavlin 359 }
691    
692 dpavlin 358 return $out;
693 dpavlin 356 }
694    
695 dpavlin 367 =head2 parse_to_arr
696    
697     Similar to C<parse>, but returns array of all repeatable fields
698    
699     my @arr = $webpac->parse_to_arr($rec,'v250^a');
700    
701     =cut
702    
703     sub parse_to_arr {
704     my $self = shift;
705    
706     my ($rec, $format_utf8) = @_;
707    
708 dpavlin 372 my $log = $self->_get_logger();
709    
710     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
711 dpavlin 367 return if (! $format_utf8);
712    
713     my $i = 0;
714     my @arr;
715    
716     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
717     push @arr, $v;
718     }
719    
720 dpavlin 373 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
721    
722 dpavlin 367 return @arr;
723     }
724    
725 dpavlin 373 =head2 fill_in_to_arr
726    
727     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
728     for fields which have lookups, so they shouldn't be parsed but rather
729     C<fill_id>ed.
730    
731     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
732    
733     =cut
734    
735     sub fill_in_to_arr {
736     my $self = shift;
737    
738     my ($rec, $format_utf8) = @_;
739    
740     my $log = $self->_get_logger();
741    
742     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
743     return if (! $format_utf8);
744    
745     my $i = 0;
746     my @arr;
747    
748     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
749     push @arr, @v;
750     }
751    
752     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
753    
754     return @arr;
755     }
756    
757    
758 dpavlin 366 =head2 data_structure
759    
760     Create in-memory data structure which represents layout from C<import_xml>.
761     It is used later to produce output.
762    
763 dpavlin 368 my @ds = $webpac->data_structure($rec);
764 dpavlin 366
765 dpavlin 374 This method will also set C<$webpac->{'currnet_filename'}> if there is
766 dpavlin 398 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
767     <headline> tag.
768 dpavlin 374
769 dpavlin 366 =cut
770    
771 dpavlin 372 sub data_structure {
772 dpavlin 366 my $self = shift;
773    
774 dpavlin 372 my $log = $self->_get_logger();
775 dpavlin 366
776     my $rec = shift;
777 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
778 dpavlin 366
779 dpavlin 374 undef $self->{'currnet_filename'};
780 dpavlin 398 undef $self->{'headline'};
781 dpavlin 374
782 dpavlin 366 my @sorted_tags;
783     if ($self->{tags_by_order}) {
784     @sorted_tags = @{$self->{tags_by_order}};
785     } else {
786     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
787     $self->{tags_by_order} = \@sorted_tags;
788     }
789    
790 dpavlin 368 my @ds;
791 dpavlin 366
792 dpavlin 373 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
793    
794 dpavlin 366 foreach my $field (@sorted_tags) {
795    
796     my $row;
797    
798     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
799    
800     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
801 dpavlin 373 my $format = $tag->{'value'} || $tag->{'content'};
802 dpavlin 366
803 dpavlin 373 $log->debug("format: $format");
804    
805     my @v;
806     if ($format =~ /$LOOKUP_REGEX/o) {
807     @v = $self->fill_in_to_arr($rec,$format);
808     } else {
809     @v = $self->parse_to_arr($rec,$format);
810     }
811 dpavlin 367 next if (! @v);
812 dpavlin 366
813 dpavlin 375 # use format?
814     if ($tag->{'format_name'}) {
815     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
816     }
817    
818 dpavlin 398 if ($field eq 'filename') {
819     $self->{'current_filename'} = join('',@v);
820     $log->debug("filename: ",$self->{'current_filename'});
821     } elsif ($field eq 'headline') {
822     $self->{'headline'} .= join('',@v);
823     $log->debug("headline: ",$self->{'headline'});
824     next; # don't return headline in data_structure!
825     }
826    
827 dpavlin 366 # does tag have type?
828     if ($tag->{'type'}) {
829 dpavlin 367 push @{$row->{$tag->{'type'}}}, @v;
830 dpavlin 366 } else {
831 dpavlin 367 push @{$row->{'display'}}, @v;
832     push @{$row->{'swish'}}, @v;
833 dpavlin 366 }
834 dpavlin 373
835 dpavlin 374
836 dpavlin 366 }
837    
838 dpavlin 368 if ($row) {
839     $row->{'tag'} = $field;
840 dpavlin 375
841     # TODO: name_sigular, name_plural
842     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
843     $row->{'name'} = $name ? $self->_x($name) : $field;
844    
845 dpavlin 368 push @ds, $row;
846 dpavlin 374
847 dpavlin 373 $log->debug("row $field: ",sub { Dumper($row) });
848 dpavlin 368 }
849 dpavlin 366
850     }
851    
852 dpavlin 370 return @ds;
853 dpavlin 366
854     }
855    
856 dpavlin 370 =head2 output
857    
858     Create output from in-memory data structure using Template Toolkit template.
859    
860     my $text = $webpac->output( template => 'text.tt', data => @ds );
861    
862     =cut
863    
864     sub output {
865     my $self = shift;
866    
867     my $args = {@_};
868    
869 dpavlin 372 my $log = $self->_get_logger();
870 dpavlin 370
871 dpavlin 372 $log->logconfess("need template name") if (! $args->{'template'});
872     $log->logconfess("need data array") if (! $args->{'data'});
873    
874 dpavlin 370 my $out;
875    
876     $self->{'tt'}->process(
877     $args->{'template'},
878     $args,
879     \$out
880     ) || confess $self->{'tt'}->error();
881    
882     return $out;
883     }
884    
885 dpavlin 411 =head2 output_file
886    
887     Create output from in-memory data structure using Template Toolkit template
888     to a file.
889    
890     $webpac->output_file(
891     file => 'out.txt',
892     template => 'text.tt',
893     data => @ds
894     );
895    
896     =cut
897    
898     sub output_file {
899     my $self = shift;
900    
901     my $args = {@_};
902    
903     my $log = $self->_get_logger();
904    
905 dpavlin 421 my $file = $args->{'file'} || $log->logconfess("need file name");
906 dpavlin 411
907 dpavlin 421 $log->debug("creating file ",$file);
908 dpavlin 411
909 dpavlin 421 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
910 dpavlin 411 print $fh $self->output(
911     template => $args->{'template'},
912     data => $args->{'data'},
913     ) || $log->logdie("print: $!");
914     close($fh) || $log->logdie("close: $!");
915     }
916    
917 dpavlin 375 =head2 apply_format
918    
919     Apply format specified in tag with C<format_name="name"> and
920     C<format_delimiter=";;">.
921    
922     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
923    
924     Formats can contain C<lookup{...}> if you need them.
925    
926     =cut
927    
928     sub apply_format {
929     my $self = shift;
930    
931     my ($name,$delimiter,$data) = @_;
932    
933     my $log = $self->_get_logger();
934    
935     if (! $self->{'import_xml'}->{'format'}->{$name}) {
936     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
937     return $data;
938     }
939    
940     $log->warn("no delimiter for format $name") if (! $delimiter);
941    
942     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
943    
944     my @data = split(/\Q$delimiter\E/, $data);
945    
946     my $out = sprintf($format, @data);
947     $log->debug("using format $name [$format] on $data to produce: $out");
948    
949     if ($out =~ m/$LOOKUP_REGEX/o) {
950     return $self->lookup($out);
951     } else {
952     return $out;
953     }
954    
955     }
956    
957    
958 dpavlin 372 #
959     #
960     #
961    
962     =head1 INTERNAL METHODS
963    
964     Here is a quick list of internal methods, mostly useful to turn debugging
965     on them (see L<LOGGING> below for explanation).
966    
967     =cut
968    
969     =head2 _eval
970    
971     Internal function to eval code without C<strict 'subs'>.
972    
973     =cut
974    
975     sub _eval {
976     my $self = shift;
977    
978     my $code = shift || return;
979    
980     my $log = $self->_get_logger();
981    
982     no strict 'subs';
983     my $ret = eval $code;
984     if ($@) {
985     $log->error("problem with eval code [$code]: $@");
986     }
987    
988     $log->debug("eval: ",$code," [",$ret,"]");
989    
990     return $ret || 0;
991     }
992    
993     =head2 _sort_by_order
994    
995     Sort xml tags data structure accoding to C<order=""> attribute.
996    
997     =cut
998    
999     sub _sort_by_order {
1000     my $self = shift;
1001    
1002     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1003     $self->{'import_xml'}->{'indexer'}->{$a};
1004     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1005     $self->{'import_xml'}->{'indexer'}->{$b};
1006    
1007     return $va <=> $vb;
1008     }
1009    
1010 dpavlin 375 =head2 _get_logger
1011    
1012     Get C<Log::Log4perl> object with a twist: domains are defined for each
1013     method
1014    
1015     my $log = $webpac->_get_logger();
1016    
1017     =cut
1018    
1019 dpavlin 372 sub _get_logger {
1020     my $self = shift;
1021    
1022 dpavlin 374 my $name = (caller(1))[3] || caller;
1023     return get_logger($name);
1024 dpavlin 372 }
1025    
1026 dpavlin 375 =head2 _x
1027    
1028     Convert string from UTF-8 to code page defined in C<import_xml>.
1029    
1030     my $text = $webpac->_x('utf8 text');
1031    
1032     =cut
1033    
1034     sub _x {
1035     my $self = shift;
1036     my $utf8 = shift || return;
1037    
1038     return $self->{'utf2cp'}->convert($utf8) ||
1039     $self->_get_logger()->logwarn("can't convert '$utf8'");
1040     }
1041    
1042 dpavlin 372 #
1043     #
1044     #
1045    
1046     =head1 LOGGING
1047    
1048     Logging in WebPAC is performed by L<Log::Log4perl> with config file
1049     C<log.conf>.
1050    
1051     Methods defined above have different levels of logging, so
1052     it's descriptions will be useful to turn (mostry B<debug> logging) on
1053     or off to see why WabPAC isn't perforing as you expect it (it might even
1054     be a bug!).
1055    
1056     B<This is different from normal Log4perl behaviour>. To repeat, you can
1057     also use method names, and not only classes (which are just few)
1058     to filter logging.
1059    
1060 dpavlin 422
1061     =head1 MEMORY USAGE
1062    
1063     C<low_mem> options is double-edged sword. If enabled, WebPAC
1064     will run on memory constraint machines (which doesn't have enough
1065     physical RAM to create memory structure for whole source database).
1066    
1067     If your machine has 512Mb or more of RAM and database is around 10000 records,
1068     memory shouldn't be an issue. If you don't have enough physical RAM, you
1069     might consider using virtual memory (if your operating system is handling it
1070     well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1071     parsed structure of ISIS database (this is what C<low_mem> option does).
1072    
1073     Hitting swap at end of reading source database is probably o.k. However,
1074     hitting swap before 90% will dramatically decrease performance and you will
1075     be better off with C<low_mem> and using rest of availble memory for
1076     operating system disk cache (Linux is particuallary good about this).
1077     However, every access to database record will require disk access, so
1078     generation phase will be slower 10-100 times.
1079    
1080     Parsed structures are essential - you just have option to trade RAM memory
1081     (which is fast) for disk space (which is slow). Be sure to have planty of
1082     disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1083    
1084     However, when WebPAC is running on desktop machines (or laptops :-), it's
1085     highly undesireable for system to start swapping. Using C<low_mem> option can
1086     reduce WecPAC memory usage to around 64Mb for same database with lookup
1087     fields and sorted indexes which stay in RAM. Performance will suffer, but
1088     memory usage will really be minimal. It might be also more confortable to
1089     run WebPAC reniced on those machines.
1090    
1091 dpavlin 372 =cut
1092    
1093 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26