/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 441 - (hide annotations)
Tue Sep 14 17:07:59 2004 UTC (19 years, 6 months ago) by dpavlin
File size: 25980 byte(s)
refactore tree generation into WebPAC::Tree

1 dpavlin 354 package WebPAC;
2 dpavlin 352
3 dpavlin 367 use warnings;
4     use strict;
5    
6 dpavlin 352 use Carp;
7 dpavlin 353 use Text::Iconv;
8     use Config::IniFiles;
9 dpavlin 363 use XML::Simple;
10 dpavlin 370 use Template;
11 dpavlin 372 use Log::Log4perl qw(get_logger :levels);
12 dpavlin 422 use Time::HiRes qw(time);
13 dpavlin 352
14 dpavlin 358 use Data::Dumper;
15    
16 dpavlin 373 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17     #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18     my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19     my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20    
21 dpavlin 352 =head1 NAME
22    
23 dpavlin 354 WebPAC - base class for WebPAC
24 dpavlin 352
25     =head1 DESCRIPTION
26    
27 dpavlin 354 This module implements methods used by WebPAC.
28 dpavlin 352
29     =head1 METHODS
30    
31     =head2 new
32    
33 dpavlin 421 Create new instance of WebPAC using configuration specified by C<config_file>.
34 dpavlin 352
35 dpavlin 354 my $webpac = new WebPAC(
36 dpavlin 352 config_file => 'name.conf',
37 dpavlin 439 code_page => 'ISO-8859-2',
38     low_mem => 1,
39 dpavlin 352 );
40    
41     Default C<code_page> is C<ISO-8859-2>.
42    
43 dpavlin 422 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44    
45 dpavlin 421 This method will also read configuration files
46 dpavlin 353 C<global.conf> (used by indexer and Web font-end)
47     and configuration file specified by C<config_file>
48     which describes databases to be indexed.
49    
50 dpavlin 352 =cut
51    
52 dpavlin 363 # mapping between data type and tag which specify
53     # format in XML file
54     my %type2tag = (
55     'isis' => 'isis',
56     # 'excel' => 'column',
57     # 'marc' => 'marc',
58     # 'feed' => 'feed'
59     );
60    
61 dpavlin 352 sub new {
62     my $class = shift;
63     my $self = {@_};
64     bless($self, $class);
65    
66 dpavlin 422 $self->{'start_t'} = time();
67    
68 dpavlin 372 my $log_file = $self->{'log'} || "log.conf";
69     Log::Log4perl->init($log_file);
70    
71     my $log = $self->_get_logger();
72    
73 dpavlin 352 # fill in default values
74     # output codepage
75     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
76    
77 dpavlin 353 #
78     # read global.conf
79     #
80 dpavlin 372 $log->debug("read 'global.conf'");
81 dpavlin 352
82 dpavlin 372 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
83 dpavlin 352
84     # read global config parametars
85     foreach my $var (qw(
86     dbi_dbd
87     dbi_dsn
88     dbi_user
89     dbi_passwd
90     show_progress
91     my_unac_filter
92 dpavlin 370 output_template
93 dpavlin 352 )) {
94 dpavlin 370 $self->{'global_config'}->{$var} = $config->val('global', $var);
95 dpavlin 352 }
96    
97 dpavlin 353 #
98     # read indexer config file
99     #
100 dpavlin 352
101 dpavlin 372 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
102 dpavlin 352
103 dpavlin 370 # create UTF-8 convertor for import_xml files
104 dpavlin 366 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
105 dpavlin 370
106     # create Template toolkit instance
107     $self->{'tt'} = Template->new(
108     INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
109     # FILTERS => {
110     # 'foo' => \&foo_filter,
111     # },
112     EVAL_PERL => 1,
113     );
114    
115 dpavlin 421 # running with low_mem flag? well, use DBM::Deep then.
116     if ($self->{'low_mem'}) {
117 dpavlin 422 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118 dpavlin 421
119     my $db_file = "data.db";
120    
121     if (-e $db_file) {
122     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123     $log->debug("removed '$db_file' from last run");
124     }
125    
126 dpavlin 434 require DBM::Deep;
127 dpavlin 421
128     my $db = new DBM::Deep $db_file;
129    
130     $log->logdie("DBM::Deep error: $!") unless ($db);
131    
132     if ($db->error()) {
133     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134     } else {
135 dpavlin 422 $log->debug("using file '$db_file' for DBM::Deep");
136 dpavlin 421 }
137    
138     $self->{'db'} = $db;
139     }
140    
141 dpavlin 352 return $self;
142     }
143    
144     =head2 open_isis
145    
146     Open CDS/ISIS database using OpenIsis module and read all records to memory.
147    
148     $webpac->open_isis(
149     filename => '/data/ISIS/ISIS',
150     code_page => '852',
151 dpavlin 431 limit_mfn => 500,
152     start_mfn => 6000,
153 dpavlin 352 lookup => [ ... ],
154     );
155    
156     By default, ISIS code page is assumed to be C<852>.
157    
158 dpavlin 431 If optional parametar C<start_mfn> is set, this will be first MFN to read
159     from database (so you can skip beginning of your database if you need to).
160    
161 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
162     from database in example above.
163 dpavlin 352
164     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
165     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
166     value in index.
167    
168     lookup => [
169     { 'key' => 'd:v900', 'val' => 'v250^a' },
170     { 'eval' => '"v901^a" eq "Podruèje"',
171     'key' => 'pa:v561^4:v562^4:v461^1',
172     'val' => 'v900' },
173     ]
174    
175 dpavlin 367 Returns number of last record read into memory (size of database, really).
176    
177 dpavlin 352 =cut
178    
179     sub open_isis {
180     my $self = shift;
181     my $arg = {@_};
182    
183 dpavlin 372 my $log = $self->_get_logger();
184    
185     $log->logcroak("need filename") if (! $arg->{'filename'});
186 dpavlin 352 my $code_page = $arg->{'code_page'} || '852';
187    
188 dpavlin 436 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
189    
190 dpavlin 421 # store data in object
191     $self->{'isis_filename'} = $arg->{'filename'};
192     $self->{'isis_code_page'} = $code_page;
193    
194 dpavlin 353 use OpenIsis;
195    
196 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
197    
198     # create Text::Iconv object
199     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
200    
201 dpavlin 372 $log->info("reading ISIS database '",$arg->{'filename'},"'");
202 dpavlin 421 $log->debug("isis code page: $code_page");
203 dpavlin 363
204 dpavlin 352 my $isis_db = OpenIsis::open($arg->{'filename'});
205    
206     my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
207 dpavlin 431 my $startmfn = 1;
208 dpavlin 352
209 dpavlin 431 if (my $s = $self->{'start_mfn'}) {
210     $log->info("skipping to MFN $s");
211     $startmfn = $s;
212     }
213 dpavlin 357
214 dpavlin 431 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
215 dpavlin 363
216 dpavlin 431 $log->info("processing ",($maxmfn-$startmfn)." records...");
217    
218 dpavlin 352 # read database
219 dpavlin 431 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
220 dpavlin 352
221 dpavlin 418
222     $log->debug("mfn: $mfn\n");
223    
224 dpavlin 421 my $rec;
225    
226 dpavlin 352 # read record
227     my $row = OpenIsis::read( $isis_db, $mfn );
228     foreach my $k (keys %{$row}) {
229     if ($k ne "mfn") {
230     foreach my $l (@{$row->{$k}}) {
231     $l = $cp->convert($l);
232     # has subfields?
233     my $val;
234     if ($l =~ m/\^/) {
235     foreach my $t (split(/\^/,$l)) {
236     next if (! $t);
237     $val->{substr($t,0,1)} = substr($t,1);
238     }
239     } else {
240     $val = $l;
241     }
242    
243 dpavlin 421 push @{$rec->{$k}}, $val;
244 dpavlin 352 }
245 dpavlin 374 } else {
246 dpavlin 421 push @{$rec->{'000'}}, $mfn;
247 dpavlin 352 }
248    
249     }
250    
251 dpavlin 421 $log->confess("record $mfn empty?") unless ($rec);
252    
253     # store
254     if ($self->{'low_mem'}) {
255     $self->{'db'}->put($mfn, $rec);
256     } else {
257     $self->{'data'}->{$mfn} = $rec;
258     }
259    
260 dpavlin 352 # create lookup
261 dpavlin 355 $self->create_lookup($rec, @{$arg->{'lookup'}});
262 dpavlin 352
263 dpavlin 389 $self->progress_bar($mfn,$maxmfn);
264    
265 dpavlin 352 }
266    
267 dpavlin 431 $self->{'current_mfn'} = $startmfn;
268 dpavlin 389 $self->{'last_pcnt'} = 0;
269 dpavlin 362
270 dpavlin 418 $log->debug("max mfn: $maxmfn");
271    
272 dpavlin 352 # store max mfn and return it.
273     return $self->{'max_mfn'} = $maxmfn;
274     }
275    
276 dpavlin 362 =head2 fetch_rec
277    
278     Fetch next record from database. It will also display progress bar (once
279     it's implemented, that is).
280    
281     my $rec = $webpac->fetch_rec;
282    
283     =cut
284    
285     sub fetch_rec {
286     my $self = shift;
287    
288 dpavlin 372 my $log = $self->_get_logger();
289 dpavlin 362
290 dpavlin 372 my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");
291    
292 dpavlin 362 if ($mfn > $self->{'max_mfn'}) {
293     $self->{'current_mfn'} = $self->{'max_mfn'};
294 dpavlin 373 $log->debug("at EOF");
295 dpavlin 362 return;
296     }
297    
298 dpavlin 389 $self->progress_bar($mfn,$self->{'max_mfn'});
299    
300 dpavlin 421 if ($self->{'low_mem'}) {
301     return $self->{'db'}->get($mfn);
302     } else {
303     return $self->{'data'}->{$mfn};
304     }
305 dpavlin 362 }
306    
307 dpavlin 389 =head2 progress_bar
308    
309     Draw progress bar on STDERR.
310    
311     $webpac->progress_bar($current, $max);
312    
313     =cut
314    
315     sub progress_bar {
316     my $self = shift;
317    
318     my ($curr,$max) = @_;
319    
320     my $log = $self->_get_logger();
321    
322     $log->logconfess("no current value!") if (! $curr);
323     $log->logconfess("no maximum value!") if (! $max);
324    
325     if ($curr > $max) {
326     $max = $curr;
327     $log->debug("overflow to $curr");
328     }
329    
330     $self->{'last_pcnt'} ||= 1;
331    
332 dpavlin 422 my $p = int($curr * 100 / $max);
333 dpavlin 389
334 dpavlin 422 # reset on re-run
335     if ($p < $self->{'last_pcnt'}) {
336     $self->{'last_pcnt'} = $p;
337     $self->{'last_t'} = time();
338     $self->{'last_curr'} = 1;
339     }
340    
341 dpavlin 389 if ($p != $self->{'last_pcnt'}) {
342 dpavlin 422
343     my $last_curr = $self->{'last_curr'} || $curr;
344     my $t = time();
345     my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
346     my $eta = ($max-$curr) / ($rate || 1);
347     printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
348 dpavlin 389 $self->{'last_pcnt'} = $p;
349 dpavlin 422 $self->{'last_t'} = time();
350     $self->{'last_curr'} = $curr;
351 dpavlin 389 }
352 dpavlin 412 print STDERR "\n" if ($p == 100);
353 dpavlin 389 }
354    
355 dpavlin 422 =head2 fmt_time
356    
357     Format time (in seconds) for display.
358    
359     print $webpac->fmt_time(time());
360    
361     This method is called by L<progress_bar> to display remaining time.
362    
363     =cut
364    
365     sub fmt_time {
366     my $self = shift;
367    
368     my $t = shift || 0;
369     my $out = "";
370    
371     my ($ss,$mm,$hh) = gmtime($t);
372     $out .= "${hh}h" if ($hh);
373     $out .= sprintf("%02d:%02d", $mm,$ss);
374     $out .= " " if ($hh == 0);
375     return $out;
376     }
377    
378 dpavlin 363 =head2 open_import_xml
379    
380     Read file from C<import_xml/> directory and parse it.
381    
382     $webpac->open_import_xml(type => 'isis');
383    
384     =cut
385    
386     sub open_import_xml {
387     my $self = shift;
388    
389 dpavlin 372 my $log = $self->_get_logger();
390    
391 dpavlin 363 my $arg = {@_};
392 dpavlin 372 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
393 dpavlin 363
394 dpavlin 366 $self->{'type'} = $arg->{'type'};
395 dpavlin 363
396 dpavlin 366 my $type_base = $arg->{'type'};
397 dpavlin 363 $type_base =~ s/_.*$//g;
398    
399 dpavlin 366 $self->{'tag'} = $type2tag{$type_base};
400    
401 dpavlin 375 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
402 dpavlin 366
403     my $f = "./import_xml/".$self->{'type'}.".xml";
404 dpavlin 372 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
405 dpavlin 363
406 dpavlin 375 $log->info("reading '$f'");
407 dpavlin 363
408 dpavlin 375 $self->{'import_xml_file'} = $f;
409    
410 dpavlin 363 $self->{'import_xml'} = XMLin($f,
411 dpavlin 366 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
412 dpavlin 363 );
413    
414 dpavlin 375 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
415    
416 dpavlin 363 }
417    
418 dpavlin 355 =head2 create_lookup
419    
420     Create lookup from record using lookup definition.
421    
422 dpavlin 367 $self->create_lookup($rec, @lookups);
423    
424     Called internally by C<open_*> methods.
425    
426 dpavlin 355 =cut
427    
428     sub create_lookup {
429     my $self = shift;
430    
431 dpavlin 372 my $log = $self->_get_logger();
432 dpavlin 355
433 dpavlin 372 my $rec = shift || $log->logconfess("need record to create lookup");
434     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
435    
436 dpavlin 355 foreach my $i (@_) {
437 dpavlin 418 $log->logconfess("need key") unless defined($i->{'key'});
438     $log->logconfess("need val") unless defined($i->{'val'});
439    
440     if (defined($i->{'eval'})) {
441     # eval first, so we can skip fill_in for key and val
442     my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
443     if ($self->_eval($eval)) {
444     my $key = $self->fill_in($rec,$i->{'key'}) || next;
445     my @val = $self->fill_in($rec,$i->{'val'}) || next;
446 dpavlin 373 $log->debug("stored $key = ",sub { join(" | ",@val) });
447 dpavlin 355 push @{$self->{'lookup'}->{$key}}, @val;
448     }
449     } else {
450 dpavlin 418 my $key = $self->fill_in($rec,$i->{'key'}) || next;
451     my @val = $self->fill_in($rec,$i->{'val'}) || next;
452     $log->debug("stored $key = ",sub { join(" | ",@val) });
453     push @{$self->{'lookup'}->{$key}}, @val;
454 dpavlin 355 }
455     }
456     }
457    
458 dpavlin 356 =head2 get_data
459    
460     Returns value from record.
461    
462 dpavlin 367 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
463 dpavlin 356
464     Arguments are:
465     record reference C<$rec>,
466     field C<$f>,
467     optional subfiled C<$sf>,
468     index for repeatable values C<$i>.
469    
470 dpavlin 367 Optinal variable C<$found> will be incremeted if there
471 dpavlin 356 is field.
472    
473     Returns value or empty string.
474    
475     =cut
476    
477     sub get_data {
478     my $self = shift;
479    
480     my ($rec,$f,$sf,$i,$found) = @_;
481 dpavlin 367
482 dpavlin 356 if ($$rec->{$f}) {
483 dpavlin 367 return '' if (! $$rec->{$f}->[$i]);
484 dpavlin 389 no strict 'refs';
485 dpavlin 356 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
486     $$found++ if (defined($$found));
487     return $$rec->{$f}->[$i]->{$sf};
488     } elsif ($$rec->{$f}->[$i]) {
489     $$found++ if (defined($$found));
490 dpavlin 366 # it still might have subfield, just
491     # not specified, so we'll dump all
492     if ($$rec->{$f}->[$i] =~ /HASH/o) {
493     my $out;
494     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
495     $out .= $$rec->{$f}->[$i]->{$k}." ";
496     }
497     return $out;
498     } else {
499     return $$rec->{$f}->[$i];
500     }
501 dpavlin 356 }
502     } else {
503     return '';
504     }
505     }
506    
507 dpavlin 352 =head2 fill_in
508    
509     Workhourse of all: takes record from in-memory structure of database and
510     strings with placeholders and returns string or array of with substituted
511     values from record.
512    
513 dpavlin 367 my $text = $webpac->fill_in($rec,'v250^a');
514 dpavlin 352
515     Optional argument is ordinal number for repeatable fields. By default,
516 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
517     element is 0).
518     Following example will read second value from repeatable field.
519 dpavlin 352
520 dpavlin 367 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
521 dpavlin 353
522     This function B<does not> perform parsing of format to inteligenty skip
523     delimiters before fields which aren't used.
524    
525 dpavlin 376 This method will automatically decode UTF-8 string to local code page
526     if needed.
527    
528 dpavlin 352 =cut
529    
530 dpavlin 372 sub fill_in {
531 dpavlin 371 my $self = shift;
532    
533 dpavlin 372 my $log = $self->_get_logger();
534 dpavlin 371
535 dpavlin 372 my $rec = shift || $log->logconfess("need data record");
536     my $format = shift || $log->logconfess("need format to parse");
537 dpavlin 352 # iteration (for repeatable fields)
538     my $i = shift || 0;
539    
540     # FIXME remove for speedup?
541 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
542 dpavlin 352
543 dpavlin 376 if (utf8::is_utf8($format)) {
544     $format = $self->_x($format);
545     }
546    
547 dpavlin 352 my $found = 0;
548    
549 dpavlin 359 my $eval_code;
550     # remove eval{...} from beginning
551     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
552    
553 dpavlin 352 # do actual replacement of placeholders
554 dpavlin 373 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
555 dpavlin 352
556 dpavlin 353 if ($found) {
557 dpavlin 373 $log->debug("format: $format");
558 dpavlin 359 if ($eval_code) {
559     my $eval = $self->fill_in($rec,$eval_code,$i);
560 dpavlin 371 return if (! $self->_eval($eval));
561 dpavlin 359 }
562 dpavlin 353 # do we have lookups?
563 dpavlin 373 if ($format =~ /$LOOKUP_REGEX/o) {
564     $log->debug("format '$format' has lookup");
565 dpavlin 353 return $self->lookup($format);
566     } else {
567     return $format;
568     }
569 dpavlin 352 } else {
570     return;
571     }
572     }
573    
574     =head2 lookup
575    
576 dpavlin 355 Perform lookups on format supplied to it.
577 dpavlin 352
578 dpavlin 367 my $text = $self->lookup('[v900]');
579 dpavlin 352
580 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
581    
582 dpavlin 352 =cut
583    
584     sub lookup {
585     my $self = shift;
586    
587 dpavlin 372 my $log = $self->_get_logger();
588 dpavlin 352
589 dpavlin 372 my $tmp = shift || $log->logconfess("need format");
590    
591 dpavlin 373 if ($tmp =~ /$LOOKUP_REGEX/o) {
592 dpavlin 352 my @in = ( $tmp );
593 dpavlin 372
594     $log->debug("lookup for: ",$tmp);
595    
596 dpavlin 352 my @out;
597     while (my $f = shift @in) {
598 dpavlin 373 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
599 dpavlin 352 my $k = $1;
600     if ($self->{'lookup'}->{$k}) {
601     foreach my $nv (@{$self->{'lookup'}->{$k}}) {
602     my $tmp2 = $f;
603 dpavlin 373 $tmp2 =~ s/lookup{$k}/$nv/g;
604 dpavlin 352 push @in, $tmp2;
605     }
606     } else {
607     undef $f;
608     }
609     } elsif ($f) {
610     push @out, $f;
611     }
612     }
613 dpavlin 373 $log->logconfess("return is array and it's not expected!") unless wantarray;
614 dpavlin 352 return @out;
615     } else {
616     return $tmp;
617     }
618     }
619    
620 dpavlin 356 =head2 parse
621    
622     Perform smart parsing of string, skipping delimiters for fields which aren't
623     defined. It can also eval code in format starting with C<eval{...}> and
624     return output or nothing depending on eval code.
625    
626 dpavlin 367 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
627 dpavlin 356
628     =cut
629    
630     sub parse {
631     my $self = shift;
632    
633 dpavlin 366 my ($rec, $format_utf8, $i) = @_;
634 dpavlin 356
635 dpavlin 366 return if (! $format_utf8);
636    
637 dpavlin 372 my $log = $self->_get_logger();
638 dpavlin 358
639 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
640     $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
641    
642 dpavlin 358 $i = 0 if (! $i);
643    
644 dpavlin 375 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
645 dpavlin 366
646 dpavlin 356 my @out;
647    
648 dpavlin 373 $log->debug("format: $format");
649    
650 dpavlin 356 my $eval_code;
651     # remove eval{...} from beginning
652     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
653    
654 dpavlin 358 my $prefix;
655     my $all_found=0;
656 dpavlin 356
657 dpavlin 373 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
658 dpavlin 356
659 dpavlin 358 my $del = $1 || '';
660 dpavlin 359 $prefix ||= $del if ($all_found == 0);
661 dpavlin 358
662     my $found = 0;
663     my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
664    
665 dpavlin 356 if ($found) {
666 dpavlin 358 push @out, $del;
667     push @out, $tmp;
668     $all_found += $found;
669 dpavlin 356 }
670     }
671    
672 dpavlin 358 return if (! $all_found);
673 dpavlin 356
674 dpavlin 373 my $out = join('',@out);
675 dpavlin 358
676 dpavlin 373 if ($out) {
677     # add rest of format (suffix)
678     $out .= $format;
679 dpavlin 367
680 dpavlin 373 # add prefix if not there
681     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
682    
683     $log->debug("result: $out");
684     }
685    
686 dpavlin 359 if ($eval_code) {
687     my $eval = $self->fill_in($rec,$eval_code,$i);
688 dpavlin 373 $log->debug("about to eval{",$eval,"} format: $out");
689 dpavlin 371 return if (! $self->_eval($eval));
690 dpavlin 359 }
691    
692 dpavlin 358 return $out;
693 dpavlin 356 }
694    
695 dpavlin 367 =head2 parse_to_arr
696    
697     Similar to C<parse>, but returns array of all repeatable fields
698    
699     my @arr = $webpac->parse_to_arr($rec,'v250^a');
700    
701     =cut
702    
703     sub parse_to_arr {
704     my $self = shift;
705    
706     my ($rec, $format_utf8) = @_;
707    
708 dpavlin 372 my $log = $self->_get_logger();
709    
710     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
711 dpavlin 367 return if (! $format_utf8);
712    
713     my $i = 0;
714     my @arr;
715    
716     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
717     push @arr, $v;
718     }
719    
720 dpavlin 373 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
721    
722 dpavlin 367 return @arr;
723     }
724    
725 dpavlin 373 =head2 fill_in_to_arr
726    
727     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
728     for fields which have lookups, so they shouldn't be parsed but rather
729     C<fill_id>ed.
730    
731     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
732    
733     =cut
734    
735     sub fill_in_to_arr {
736     my $self = shift;
737    
738     my ($rec, $format_utf8) = @_;
739    
740     my $log = $self->_get_logger();
741    
742     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
743     return if (! $format_utf8);
744    
745     my $i = 0;
746     my @arr;
747    
748     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
749     push @arr, @v;
750     }
751    
752     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
753    
754     return @arr;
755     }
756    
757 dpavlin 441 =head2 sort_arr
758 dpavlin 373
759 dpavlin 441 Sort array ignoring case and html in data
760    
761     my @sorted = $webpac->sort_arr(@unsorted);
762    
763     =cut
764    
765     sub sort_arr {
766     my $self = shift;
767    
768     my $log = $self->_get_logger();
769    
770     # FIXME add Schwartzian Transformation?
771    
772     my @sorted = sort {
773     $a =~ s#<[^>]+/*>##;
774     $b =~ s#<[^>]+/*>##;
775     lc($b) cmp lc($a)
776     } @_;
777     $log->debug("sorted values: ",sub { join(", ",@sorted) });
778    
779     return @sorted;
780     }
781    
782    
783 dpavlin 366 =head2 data_structure
784    
785     Create in-memory data structure which represents layout from C<import_xml>.
786     It is used later to produce output.
787    
788 dpavlin 368 my @ds = $webpac->data_structure($rec);
789 dpavlin 366
790 dpavlin 374 This method will also set C<$webpac->{'currnet_filename'}> if there is
791 dpavlin 398 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
792     <headline> tag.
793 dpavlin 374
794 dpavlin 366 =cut
795    
796 dpavlin 372 sub data_structure {
797 dpavlin 366 my $self = shift;
798    
799 dpavlin 372 my $log = $self->_get_logger();
800 dpavlin 366
801     my $rec = shift;
802 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
803 dpavlin 366
804 dpavlin 374 undef $self->{'currnet_filename'};
805 dpavlin 398 undef $self->{'headline'};
806 dpavlin 374
807 dpavlin 366 my @sorted_tags;
808     if ($self->{tags_by_order}) {
809     @sorted_tags = @{$self->{tags_by_order}};
810     } else {
811     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
812     $self->{tags_by_order} = \@sorted_tags;
813     }
814    
815 dpavlin 368 my @ds;
816 dpavlin 366
817 dpavlin 373 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
818    
819 dpavlin 366 foreach my $field (@sorted_tags) {
820    
821     my $row;
822    
823     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
824    
825     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
826 dpavlin 373 my $format = $tag->{'value'} || $tag->{'content'};
827 dpavlin 366
828 dpavlin 373 $log->debug("format: $format");
829    
830     my @v;
831     if ($format =~ /$LOOKUP_REGEX/o) {
832     @v = $self->fill_in_to_arr($rec,$format);
833     } else {
834     @v = $self->parse_to_arr($rec,$format);
835     }
836 dpavlin 367 next if (! @v);
837 dpavlin 366
838 dpavlin 439 if ($tag->{'sort'}) {
839 dpavlin 441 @v = $self->sort_arr(@v);
840 dpavlin 439 $log->warn("sort within tag is usually not what you want!");
841     }
842    
843 dpavlin 375 # use format?
844     if ($tag->{'format_name'}) {
845     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
846     }
847    
848 dpavlin 398 if ($field eq 'filename') {
849     $self->{'current_filename'} = join('',@v);
850     $log->debug("filename: ",$self->{'current_filename'});
851     } elsif ($field eq 'headline') {
852     $self->{'headline'} .= join('',@v);
853     $log->debug("headline: ",$self->{'headline'});
854     next; # don't return headline in data_structure!
855     }
856    
857 dpavlin 439 # delimiter will join repeatable fields
858     if ($tag->{'delimiter'}) {
859     @v = ( join($tag->{'delimiter'}, @v) );
860 dpavlin 366 }
861 dpavlin 373
862 dpavlin 439 # default types
863     my @types = qw(display swish);
864     # override by type attribute
865     @types = ( $tag->{'type'} ) if ($tag->{'type'});
866 dpavlin 374
867 dpavlin 439 foreach my $type (@types) {
868     # append to previous line?
869     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
870     if ($tag->{'append'}) {
871    
872     # I will delimit appended part with
873     # delimiter (or ,)
874     my $d = $tag->{'delimiter'};
875     # default delimiter
876     $d ||= ", ";
877    
878     my $last = pop @{$row->{$type}};
879     $d = "" if (! $last);
880     $last .= $d . join($d, @v);
881     push @{$row->{$type}}, $last;
882    
883     } else {
884     push @{$row->{$type}}, @v;
885     }
886     }
887    
888    
889 dpavlin 366 }
890    
891 dpavlin 368 if ($row) {
892     $row->{'tag'} = $field;
893 dpavlin 375
894     # TODO: name_sigular, name_plural
895     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
896     $row->{'name'} = $name ? $self->_x($name) : $field;
897    
898 dpavlin 439 # post-sort all values in field
899     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
900     $log->warn("sort at field tag not implemented");
901     }
902    
903 dpavlin 368 push @ds, $row;
904 dpavlin 374
905 dpavlin 373 $log->debug("row $field: ",sub { Dumper($row) });
906 dpavlin 368 }
907 dpavlin 366
908     }
909    
910 dpavlin 370 return @ds;
911 dpavlin 366
912     }
913    
914 dpavlin 370 =head2 output
915    
916     Create output from in-memory data structure using Template Toolkit template.
917    
918     my $text = $webpac->output( template => 'text.tt', data => @ds );
919    
920     =cut
921    
922     sub output {
923     my $self = shift;
924    
925     my $args = {@_};
926    
927 dpavlin 372 my $log = $self->_get_logger();
928 dpavlin 370
929 dpavlin 372 $log->logconfess("need template name") if (! $args->{'template'});
930     $log->logconfess("need data array") if (! $args->{'data'});
931    
932 dpavlin 370 my $out;
933    
934     $self->{'tt'}->process(
935     $args->{'template'},
936     $args,
937     \$out
938     ) || confess $self->{'tt'}->error();
939    
940     return $out;
941     }
942    
943 dpavlin 411 =head2 output_file
944    
945     Create output from in-memory data structure using Template Toolkit template
946     to a file.
947    
948     $webpac->output_file(
949     file => 'out.txt',
950     template => 'text.tt',
951     data => @ds
952     );
953    
954     =cut
955    
956     sub output_file {
957     my $self = shift;
958    
959     my $args = {@_};
960    
961     my $log = $self->_get_logger();
962    
963 dpavlin 421 my $file = $args->{'file'} || $log->logconfess("need file name");
964 dpavlin 411
965 dpavlin 421 $log->debug("creating file ",$file);
966 dpavlin 411
967 dpavlin 421 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
968 dpavlin 411 print $fh $self->output(
969     template => $args->{'template'},
970     data => $args->{'data'},
971     ) || $log->logdie("print: $!");
972     close($fh) || $log->logdie("close: $!");
973     }
974    
975 dpavlin 375 =head2 apply_format
976    
977     Apply format specified in tag with C<format_name="name"> and
978     C<format_delimiter=";;">.
979    
980     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
981    
982     Formats can contain C<lookup{...}> if you need them.
983    
984     =cut
985    
986     sub apply_format {
987     my $self = shift;
988    
989     my ($name,$delimiter,$data) = @_;
990    
991     my $log = $self->_get_logger();
992    
993     if (! $self->{'import_xml'}->{'format'}->{$name}) {
994     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
995     return $data;
996     }
997    
998     $log->warn("no delimiter for format $name") if (! $delimiter);
999    
1000     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1001    
1002     my @data = split(/\Q$delimiter\E/, $data);
1003    
1004     my $out = sprintf($format, @data);
1005     $log->debug("using format $name [$format] on $data to produce: $out");
1006    
1007     if ($out =~ m/$LOOKUP_REGEX/o) {
1008     return $self->lookup($out);
1009     } else {
1010     return $out;
1011     }
1012    
1013     }
1014    
1015    
1016 dpavlin 372 #
1017     #
1018     #
1019    
1020     =head1 INTERNAL METHODS
1021    
1022     Here is a quick list of internal methods, mostly useful to turn debugging
1023     on them (see L<LOGGING> below for explanation).
1024    
1025     =cut
1026    
1027     =head2 _eval
1028    
1029     Internal function to eval code without C<strict 'subs'>.
1030    
1031     =cut
1032    
1033     sub _eval {
1034     my $self = shift;
1035    
1036     my $code = shift || return;
1037    
1038     my $log = $self->_get_logger();
1039    
1040     no strict 'subs';
1041     my $ret = eval $code;
1042     if ($@) {
1043     $log->error("problem with eval code [$code]: $@");
1044     }
1045    
1046     $log->debug("eval: ",$code," [",$ret,"]");
1047    
1048     return $ret || 0;
1049     }
1050    
1051     =head2 _sort_by_order
1052    
1053     Sort xml tags data structure accoding to C<order=""> attribute.
1054    
1055     =cut
1056    
1057     sub _sort_by_order {
1058     my $self = shift;
1059    
1060     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1061     $self->{'import_xml'}->{'indexer'}->{$a};
1062     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1063     $self->{'import_xml'}->{'indexer'}->{$b};
1064    
1065     return $va <=> $vb;
1066     }
1067    
1068 dpavlin 375 =head2 _get_logger
1069    
1070     Get C<Log::Log4perl> object with a twist: domains are defined for each
1071     method
1072    
1073     my $log = $webpac->_get_logger();
1074    
1075     =cut
1076    
1077 dpavlin 372 sub _get_logger {
1078     my $self = shift;
1079    
1080 dpavlin 374 my $name = (caller(1))[3] || caller;
1081     return get_logger($name);
1082 dpavlin 372 }
1083    
1084 dpavlin 375 =head2 _x
1085    
1086     Convert string from UTF-8 to code page defined in C<import_xml>.
1087    
1088     my $text = $webpac->_x('utf8 text');
1089    
1090     =cut
1091    
1092     sub _x {
1093     my $self = shift;
1094     my $utf8 = shift || return;
1095    
1096     return $self->{'utf2cp'}->convert($utf8) ||
1097     $self->_get_logger()->logwarn("can't convert '$utf8'");
1098     }
1099    
1100 dpavlin 372 #
1101     #
1102     #
1103    
1104     =head1 LOGGING
1105    
1106     Logging in WebPAC is performed by L<Log::Log4perl> with config file
1107     C<log.conf>.
1108    
1109     Methods defined above have different levels of logging, so
1110     it's descriptions will be useful to turn (mostry B<debug> logging) on
1111     or off to see why WabPAC isn't perforing as you expect it (it might even
1112     be a bug!).
1113    
1114     B<This is different from normal Log4perl behaviour>. To repeat, you can
1115     also use method names, and not only classes (which are just few)
1116     to filter logging.
1117    
1118 dpavlin 422
1119     =head1 MEMORY USAGE
1120    
1121     C<low_mem> options is double-edged sword. If enabled, WebPAC
1122     will run on memory constraint machines (which doesn't have enough
1123     physical RAM to create memory structure for whole source database).
1124    
1125     If your machine has 512Mb or more of RAM and database is around 10000 records,
1126     memory shouldn't be an issue. If you don't have enough physical RAM, you
1127     might consider using virtual memory (if your operating system is handling it
1128     well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1129     parsed structure of ISIS database (this is what C<low_mem> option does).
1130    
1131     Hitting swap at end of reading source database is probably o.k. However,
1132     hitting swap before 90% will dramatically decrease performance and you will
1133     be better off with C<low_mem> and using rest of availble memory for
1134     operating system disk cache (Linux is particuallary good about this).
1135     However, every access to database record will require disk access, so
1136     generation phase will be slower 10-100 times.
1137    
1138     Parsed structures are essential - you just have option to trade RAM memory
1139     (which is fast) for disk space (which is slow). Be sure to have planty of
1140     disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1141    
1142     However, when WebPAC is running on desktop machines (or laptops :-), it's
1143     highly undesireable for system to start swapping. Using C<low_mem> option can
1144     reduce WecPAC memory usage to around 64Mb for same database with lookup
1145     fields and sorted indexes which stay in RAM. Performance will suffer, but
1146     memory usage will really be minimal. It might be also more confortable to
1147     run WebPAC reniced on those machines.
1148    
1149 dpavlin 372 =cut
1150    
1151 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26