/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 448 - (hide annotations)
Wed Sep 15 16:53:51 2004 UTC (19 years, 6 months ago) by dpavlin
File size: 26319 byte(s)
Major changes this time: updated for new bfilter (filtering on first
element), outline in sorted index when found in thesaurus.

1 dpavlin 354 package WebPAC;
2 dpavlin 352
3 dpavlin 367 use warnings;
4     use strict;
5    
6 dpavlin 352 use Carp;
7 dpavlin 353 use Text::Iconv;
8     use Config::IniFiles;
9 dpavlin 363 use XML::Simple;
10 dpavlin 370 use Template;
11 dpavlin 372 use Log::Log4perl qw(get_logger :levels);
12 dpavlin 422 use Time::HiRes qw(time);
13 dpavlin 352
14 dpavlin 358 use Data::Dumper;
15    
16 dpavlin 373 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17     #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18     my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19     my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20    
21 dpavlin 352 =head1 NAME
22    
23 dpavlin 354 WebPAC - base class for WebPAC
24 dpavlin 352
25     =head1 DESCRIPTION
26    
27 dpavlin 354 This module implements methods used by WebPAC.
28 dpavlin 352
29     =head1 METHODS
30    
31     =head2 new
32    
33 dpavlin 421 Create new instance of WebPAC using configuration specified by C<config_file>.
34 dpavlin 352
35 dpavlin 354 my $webpac = new WebPAC(
36 dpavlin 352 config_file => 'name.conf',
37 dpavlin 439 code_page => 'ISO-8859-2',
38     low_mem => 1,
39 dpavlin 352 );
40    
41     Default C<code_page> is C<ISO-8859-2>.
42    
43 dpavlin 422 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44    
45 dpavlin 421 This method will also read configuration files
46 dpavlin 353 C<global.conf> (used by indexer and Web font-end)
47     and configuration file specified by C<config_file>
48     which describes databases to be indexed.
49    
50 dpavlin 352 =cut
51    
52 dpavlin 363 # mapping between data type and tag which specify
53     # format in XML file
54     my %type2tag = (
55     'isis' => 'isis',
56     # 'excel' => 'column',
57     # 'marc' => 'marc',
58     # 'feed' => 'feed'
59     );
60    
61 dpavlin 352 sub new {
62     my $class = shift;
63     my $self = {@_};
64     bless($self, $class);
65    
66 dpavlin 422 $self->{'start_t'} = time();
67    
68 dpavlin 372 my $log_file = $self->{'log'} || "log.conf";
69     Log::Log4perl->init($log_file);
70    
71     my $log = $self->_get_logger();
72    
73 dpavlin 352 # fill in default values
74     # output codepage
75     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
76    
77 dpavlin 353 #
78     # read global.conf
79     #
80 dpavlin 372 $log->debug("read 'global.conf'");
81 dpavlin 352
82 dpavlin 372 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
83 dpavlin 352
84     # read global config parametars
85     foreach my $var (qw(
86     dbi_dbd
87     dbi_dsn
88     dbi_user
89     dbi_passwd
90     show_progress
91     my_unac_filter
92 dpavlin 370 output_template
93 dpavlin 352 )) {
94 dpavlin 370 $self->{'global_config'}->{$var} = $config->val('global', $var);
95 dpavlin 352 }
96    
97 dpavlin 353 #
98     # read indexer config file
99     #
100 dpavlin 352
101 dpavlin 372 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
102 dpavlin 352
103 dpavlin 370 # create UTF-8 convertor for import_xml files
104 dpavlin 366 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
105 dpavlin 370
106     # create Template toolkit instance
107     $self->{'tt'} = Template->new(
108     INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
109     # FILTERS => {
110     # 'foo' => \&foo_filter,
111     # },
112     EVAL_PERL => 1,
113     );
114    
115 dpavlin 421 # running with low_mem flag? well, use DBM::Deep then.
116     if ($self->{'low_mem'}) {
117 dpavlin 422 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118 dpavlin 421
119     my $db_file = "data.db";
120    
121     if (-e $db_file) {
122     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123     $log->debug("removed '$db_file' from last run");
124     }
125    
126 dpavlin 434 require DBM::Deep;
127 dpavlin 421
128     my $db = new DBM::Deep $db_file;
129    
130     $log->logdie("DBM::Deep error: $!") unless ($db);
131    
132     if ($db->error()) {
133     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134     } else {
135 dpavlin 422 $log->debug("using file '$db_file' for DBM::Deep");
136 dpavlin 421 }
137    
138     $self->{'db'} = $db;
139     }
140    
141 dpavlin 352 return $self;
142     }
143    
144     =head2 open_isis
145    
146     Open CDS/ISIS database using OpenIsis module and read all records to memory.
147    
148     $webpac->open_isis(
149     filename => '/data/ISIS/ISIS',
150     code_page => '852',
151 dpavlin 431 limit_mfn => 500,
152     start_mfn => 6000,
153 dpavlin 352 lookup => [ ... ],
154     );
155    
156     By default, ISIS code page is assumed to be C<852>.
157    
158 dpavlin 431 If optional parametar C<start_mfn> is set, this will be first MFN to read
159     from database (so you can skip beginning of your database if you need to).
160    
161 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
162     from database in example above.
163 dpavlin 352
164     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
165     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
166     value in index.
167    
168     lookup => [
169     { 'key' => 'd:v900', 'val' => 'v250^a' },
170     { 'eval' => '"v901^a" eq "Podruèje"',
171     'key' => 'pa:v561^4:v562^4:v461^1',
172     'val' => 'v900' },
173     ]
174    
175 dpavlin 367 Returns number of last record read into memory (size of database, really).
176    
177 dpavlin 352 =cut
178    
179     sub open_isis {
180     my $self = shift;
181     my $arg = {@_};
182    
183 dpavlin 372 my $log = $self->_get_logger();
184    
185     $log->logcroak("need filename") if (! $arg->{'filename'});
186 dpavlin 352 my $code_page = $arg->{'code_page'} || '852';
187    
188 dpavlin 436 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
189    
190 dpavlin 421 # store data in object
191     $self->{'isis_filename'} = $arg->{'filename'};
192     $self->{'isis_code_page'} = $code_page;
193    
194 dpavlin 353 use OpenIsis;
195    
196 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
197    
198     # create Text::Iconv object
199     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
200    
201 dpavlin 372 $log->info("reading ISIS database '",$arg->{'filename'},"'");
202 dpavlin 421 $log->debug("isis code page: $code_page");
203 dpavlin 363
204 dpavlin 352 my $isis_db = OpenIsis::open($arg->{'filename'});
205    
206     my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
207 dpavlin 431 my $startmfn = 1;
208 dpavlin 352
209 dpavlin 431 if (my $s = $self->{'start_mfn'}) {
210     $log->info("skipping to MFN $s");
211     $startmfn = $s;
212 dpavlin 448 } else {
213     $self->{'start_mfn'} = $startmfn;
214 dpavlin 431 }
215 dpavlin 357
216 dpavlin 431 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
217 dpavlin 363
218 dpavlin 431 $log->info("processing ",($maxmfn-$startmfn)." records...");
219    
220 dpavlin 352 # read database
221 dpavlin 431 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
222 dpavlin 352
223 dpavlin 418
224     $log->debug("mfn: $mfn\n");
225    
226 dpavlin 421 my $rec;
227    
228 dpavlin 352 # read record
229     my $row = OpenIsis::read( $isis_db, $mfn );
230     foreach my $k (keys %{$row}) {
231     if ($k ne "mfn") {
232     foreach my $l (@{$row->{$k}}) {
233     $l = $cp->convert($l);
234     # has subfields?
235     my $val;
236     if ($l =~ m/\^/) {
237     foreach my $t (split(/\^/,$l)) {
238     next if (! $t);
239     $val->{substr($t,0,1)} = substr($t,1);
240     }
241     } else {
242     $val = $l;
243     }
244    
245 dpavlin 421 push @{$rec->{$k}}, $val;
246 dpavlin 352 }
247 dpavlin 374 } else {
248 dpavlin 421 push @{$rec->{'000'}}, $mfn;
249 dpavlin 352 }
250    
251     }
252    
253 dpavlin 421 $log->confess("record $mfn empty?") unless ($rec);
254    
255     # store
256     if ($self->{'low_mem'}) {
257     $self->{'db'}->put($mfn, $rec);
258     } else {
259     $self->{'data'}->{$mfn} = $rec;
260     }
261    
262 dpavlin 352 # create lookup
263 dpavlin 355 $self->create_lookup($rec, @{$arg->{'lookup'}});
264 dpavlin 352
265 dpavlin 389 $self->progress_bar($mfn,$maxmfn);
266    
267 dpavlin 352 }
268    
269 dpavlin 448 $self->{'current_mfn'} = -1;
270 dpavlin 389 $self->{'last_pcnt'} = 0;
271 dpavlin 362
272 dpavlin 418 $log->debug("max mfn: $maxmfn");
273    
274 dpavlin 352 # store max mfn and return it.
275     return $self->{'max_mfn'} = $maxmfn;
276     }
277    
278 dpavlin 362 =head2 fetch_rec
279    
280     Fetch next record from database. It will also display progress bar (once
281     it's implemented, that is).
282    
283     my $rec = $webpac->fetch_rec;
284    
285     =cut
286    
287     sub fetch_rec {
288     my $self = shift;
289    
290 dpavlin 372 my $log = $self->_get_logger();
291 dpavlin 362
292 dpavlin 448 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
293 dpavlin 372
294 dpavlin 448 if ($self->{'current_mfn'} == -1) {
295     $self->{'current_mfn'} = $self->{'start_mfn'};
296     } else {
297     $self->{'current_mfn'}++;
298     }
299    
300     my $mfn = $self->{'current_mfn'};
301    
302 dpavlin 362 if ($mfn > $self->{'max_mfn'}) {
303     $self->{'current_mfn'} = $self->{'max_mfn'};
304 dpavlin 373 $log->debug("at EOF");
305 dpavlin 362 return;
306     }
307    
308 dpavlin 389 $self->progress_bar($mfn,$self->{'max_mfn'});
309    
310 dpavlin 421 if ($self->{'low_mem'}) {
311     return $self->{'db'}->get($mfn);
312     } else {
313     return $self->{'data'}->{$mfn};
314     }
315 dpavlin 362 }
316    
317 dpavlin 448 =head2 mfn
318    
319     Returns current record number (MFN).
320    
321     print $webpac->mfn;
322    
323     =cut
324    
325     sub mfn {
326     my $self = shift;
327     return $self->{'current_mfn'};
328     }
329    
330 dpavlin 389 =head2 progress_bar
331    
332     Draw progress bar on STDERR.
333    
334     $webpac->progress_bar($current, $max);
335    
336     =cut
337    
338     sub progress_bar {
339     my $self = shift;
340    
341     my ($curr,$max) = @_;
342    
343     my $log = $self->_get_logger();
344    
345     $log->logconfess("no current value!") if (! $curr);
346     $log->logconfess("no maximum value!") if (! $max);
347    
348     if ($curr > $max) {
349     $max = $curr;
350     $log->debug("overflow to $curr");
351     }
352    
353     $self->{'last_pcnt'} ||= 1;
354    
355 dpavlin 422 my $p = int($curr * 100 / $max);
356 dpavlin 389
357 dpavlin 422 # reset on re-run
358     if ($p < $self->{'last_pcnt'}) {
359     $self->{'last_pcnt'} = $p;
360     $self->{'last_t'} = time();
361     $self->{'last_curr'} = 1;
362     }
363    
364 dpavlin 389 if ($p != $self->{'last_pcnt'}) {
365 dpavlin 422
366     my $last_curr = $self->{'last_curr'} || $curr;
367     my $t = time();
368     my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
369     my $eta = ($max-$curr) / ($rate || 1);
370     printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
371 dpavlin 389 $self->{'last_pcnt'} = $p;
372 dpavlin 422 $self->{'last_t'} = time();
373     $self->{'last_curr'} = $curr;
374 dpavlin 389 }
375 dpavlin 412 print STDERR "\n" if ($p == 100);
376 dpavlin 389 }
377    
378 dpavlin 422 =head2 fmt_time
379    
380     Format time (in seconds) for display.
381    
382     print $webpac->fmt_time(time());
383    
384     This method is called by L<progress_bar> to display remaining time.
385    
386     =cut
387    
388     sub fmt_time {
389     my $self = shift;
390    
391     my $t = shift || 0;
392     my $out = "";
393    
394     my ($ss,$mm,$hh) = gmtime($t);
395     $out .= "${hh}h" if ($hh);
396     $out .= sprintf("%02d:%02d", $mm,$ss);
397     $out .= " " if ($hh == 0);
398     return $out;
399     }
400    
401 dpavlin 363 =head2 open_import_xml
402    
403     Read file from C<import_xml/> directory and parse it.
404    
405     $webpac->open_import_xml(type => 'isis');
406    
407     =cut
408    
409     sub open_import_xml {
410     my $self = shift;
411    
412 dpavlin 372 my $log = $self->_get_logger();
413    
414 dpavlin 363 my $arg = {@_};
415 dpavlin 372 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
416 dpavlin 363
417 dpavlin 366 $self->{'type'} = $arg->{'type'};
418 dpavlin 363
419 dpavlin 366 my $type_base = $arg->{'type'};
420 dpavlin 363 $type_base =~ s/_.*$//g;
421    
422 dpavlin 366 $self->{'tag'} = $type2tag{$type_base};
423    
424 dpavlin 375 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
425 dpavlin 366
426     my $f = "./import_xml/".$self->{'type'}.".xml";
427 dpavlin 372 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
428 dpavlin 363
429 dpavlin 375 $log->info("reading '$f'");
430 dpavlin 363
431 dpavlin 375 $self->{'import_xml_file'} = $f;
432    
433 dpavlin 363 $self->{'import_xml'} = XMLin($f,
434 dpavlin 366 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
435 dpavlin 363 );
436    
437 dpavlin 375 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
438    
439 dpavlin 363 }
440    
441 dpavlin 355 =head2 create_lookup
442    
443     Create lookup from record using lookup definition.
444    
445 dpavlin 367 $self->create_lookup($rec, @lookups);
446    
447     Called internally by C<open_*> methods.
448    
449 dpavlin 355 =cut
450    
451     sub create_lookup {
452     my $self = shift;
453    
454 dpavlin 372 my $log = $self->_get_logger();
455 dpavlin 355
456 dpavlin 372 my $rec = shift || $log->logconfess("need record to create lookup");
457     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
458    
459 dpavlin 355 foreach my $i (@_) {
460 dpavlin 418 $log->logconfess("need key") unless defined($i->{'key'});
461     $log->logconfess("need val") unless defined($i->{'val'});
462    
463     if (defined($i->{'eval'})) {
464     # eval first, so we can skip fill_in for key and val
465     my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
466     if ($self->_eval($eval)) {
467     my $key = $self->fill_in($rec,$i->{'key'}) || next;
468     my @val = $self->fill_in($rec,$i->{'val'}) || next;
469 dpavlin 373 $log->debug("stored $key = ",sub { join(" | ",@val) });
470 dpavlin 355 push @{$self->{'lookup'}->{$key}}, @val;
471     }
472     } else {
473 dpavlin 418 my $key = $self->fill_in($rec,$i->{'key'}) || next;
474     my @val = $self->fill_in($rec,$i->{'val'}) || next;
475     $log->debug("stored $key = ",sub { join(" | ",@val) });
476     push @{$self->{'lookup'}->{$key}}, @val;
477 dpavlin 355 }
478     }
479     }
480    
481 dpavlin 356 =head2 get_data
482    
483     Returns value from record.
484    
485 dpavlin 367 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
486 dpavlin 356
487     Arguments are:
488     record reference C<$rec>,
489     field C<$f>,
490     optional subfiled C<$sf>,
491     index for repeatable values C<$i>.
492    
493 dpavlin 367 Optinal variable C<$found> will be incremeted if there
494 dpavlin 356 is field.
495    
496     Returns value or empty string.
497    
498     =cut
499    
500     sub get_data {
501     my $self = shift;
502    
503     my ($rec,$f,$sf,$i,$found) = @_;
504 dpavlin 367
505 dpavlin 356 if ($$rec->{$f}) {
506 dpavlin 367 return '' if (! $$rec->{$f}->[$i]);
507 dpavlin 389 no strict 'refs';
508 dpavlin 356 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
509     $$found++ if (defined($$found));
510     return $$rec->{$f}->[$i]->{$sf};
511     } elsif ($$rec->{$f}->[$i]) {
512     $$found++ if (defined($$found));
513 dpavlin 366 # it still might have subfield, just
514     # not specified, so we'll dump all
515     if ($$rec->{$f}->[$i] =~ /HASH/o) {
516     my $out;
517     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
518     $out .= $$rec->{$f}->[$i]->{$k}." ";
519     }
520     return $out;
521     } else {
522     return $$rec->{$f}->[$i];
523     }
524 dpavlin 356 }
525     } else {
526     return '';
527     }
528     }
529    
530 dpavlin 352 =head2 fill_in
531    
532     Workhourse of all: takes record from in-memory structure of database and
533     strings with placeholders and returns string or array of with substituted
534     values from record.
535    
536 dpavlin 367 my $text = $webpac->fill_in($rec,'v250^a');
537 dpavlin 352
538     Optional argument is ordinal number for repeatable fields. By default,
539 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
540     element is 0).
541     Following example will read second value from repeatable field.
542 dpavlin 352
543 dpavlin 367 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
544 dpavlin 353
545     This function B<does not> perform parsing of format to inteligenty skip
546     delimiters before fields which aren't used.
547    
548 dpavlin 376 This method will automatically decode UTF-8 string to local code page
549     if needed.
550    
551 dpavlin 352 =cut
552    
553 dpavlin 372 sub fill_in {
554 dpavlin 371 my $self = shift;
555    
556 dpavlin 372 my $log = $self->_get_logger();
557 dpavlin 371
558 dpavlin 372 my $rec = shift || $log->logconfess("need data record");
559     my $format = shift || $log->logconfess("need format to parse");
560 dpavlin 352 # iteration (for repeatable fields)
561     my $i = shift || 0;
562    
563     # FIXME remove for speedup?
564 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
565 dpavlin 352
566 dpavlin 376 if (utf8::is_utf8($format)) {
567     $format = $self->_x($format);
568     }
569    
570 dpavlin 352 my $found = 0;
571    
572 dpavlin 359 my $eval_code;
573     # remove eval{...} from beginning
574     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
575    
576 dpavlin 352 # do actual replacement of placeholders
577 dpavlin 373 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
578 dpavlin 352
579 dpavlin 353 if ($found) {
580 dpavlin 373 $log->debug("format: $format");
581 dpavlin 359 if ($eval_code) {
582     my $eval = $self->fill_in($rec,$eval_code,$i);
583 dpavlin 371 return if (! $self->_eval($eval));
584 dpavlin 359 }
585 dpavlin 353 # do we have lookups?
586 dpavlin 373 if ($format =~ /$LOOKUP_REGEX/o) {
587     $log->debug("format '$format' has lookup");
588 dpavlin 353 return $self->lookup($format);
589     } else {
590     return $format;
591     }
592 dpavlin 352 } else {
593     return;
594     }
595     }
596    
597     =head2 lookup
598    
599 dpavlin 355 Perform lookups on format supplied to it.
600 dpavlin 352
601 dpavlin 367 my $text = $self->lookup('[v900]');
602 dpavlin 352
603 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
604    
605 dpavlin 352 =cut
606    
607     sub lookup {
608     my $self = shift;
609    
610 dpavlin 372 my $log = $self->_get_logger();
611 dpavlin 352
612 dpavlin 372 my $tmp = shift || $log->logconfess("need format");
613    
614 dpavlin 373 if ($tmp =~ /$LOOKUP_REGEX/o) {
615 dpavlin 352 my @in = ( $tmp );
616 dpavlin 372
617     $log->debug("lookup for: ",$tmp);
618    
619 dpavlin 352 my @out;
620     while (my $f = shift @in) {
621 dpavlin 373 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
622 dpavlin 352 my $k = $1;
623     if ($self->{'lookup'}->{$k}) {
624     foreach my $nv (@{$self->{'lookup'}->{$k}}) {
625     my $tmp2 = $f;
626 dpavlin 373 $tmp2 =~ s/lookup{$k}/$nv/g;
627 dpavlin 352 push @in, $tmp2;
628     }
629     } else {
630     undef $f;
631     }
632     } elsif ($f) {
633     push @out, $f;
634     }
635     }
636 dpavlin 373 $log->logconfess("return is array and it's not expected!") unless wantarray;
637 dpavlin 352 return @out;
638     } else {
639     return $tmp;
640     }
641     }
642    
643 dpavlin 356 =head2 parse
644    
645     Perform smart parsing of string, skipping delimiters for fields which aren't
646     defined. It can also eval code in format starting with C<eval{...}> and
647     return output or nothing depending on eval code.
648    
649 dpavlin 367 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
650 dpavlin 356
651     =cut
652    
653     sub parse {
654     my $self = shift;
655    
656 dpavlin 366 my ($rec, $format_utf8, $i) = @_;
657 dpavlin 356
658 dpavlin 366 return if (! $format_utf8);
659    
660 dpavlin 372 my $log = $self->_get_logger();
661 dpavlin 358
662 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
663     $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
664    
665 dpavlin 358 $i = 0 if (! $i);
666    
667 dpavlin 375 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
668 dpavlin 366
669 dpavlin 356 my @out;
670    
671 dpavlin 373 $log->debug("format: $format");
672    
673 dpavlin 356 my $eval_code;
674     # remove eval{...} from beginning
675     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
676    
677 dpavlin 358 my $prefix;
678     my $all_found=0;
679 dpavlin 356
680 dpavlin 373 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
681 dpavlin 356
682 dpavlin 358 my $del = $1 || '';
683 dpavlin 359 $prefix ||= $del if ($all_found == 0);
684 dpavlin 358
685     my $found = 0;
686     my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
687    
688 dpavlin 356 if ($found) {
689 dpavlin 358 push @out, $del;
690     push @out, $tmp;
691     $all_found += $found;
692 dpavlin 356 }
693     }
694    
695 dpavlin 358 return if (! $all_found);
696 dpavlin 356
697 dpavlin 373 my $out = join('',@out);
698 dpavlin 358
699 dpavlin 373 if ($out) {
700     # add rest of format (suffix)
701     $out .= $format;
702 dpavlin 367
703 dpavlin 373 # add prefix if not there
704     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
705    
706     $log->debug("result: $out");
707     }
708    
709 dpavlin 359 if ($eval_code) {
710     my $eval = $self->fill_in($rec,$eval_code,$i);
711 dpavlin 373 $log->debug("about to eval{",$eval,"} format: $out");
712 dpavlin 371 return if (! $self->_eval($eval));
713 dpavlin 359 }
714    
715 dpavlin 358 return $out;
716 dpavlin 356 }
717    
718 dpavlin 367 =head2 parse_to_arr
719    
720     Similar to C<parse>, but returns array of all repeatable fields
721    
722     my @arr = $webpac->parse_to_arr($rec,'v250^a');
723    
724     =cut
725    
726     sub parse_to_arr {
727     my $self = shift;
728    
729     my ($rec, $format_utf8) = @_;
730    
731 dpavlin 372 my $log = $self->_get_logger();
732    
733     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
734 dpavlin 367 return if (! $format_utf8);
735    
736     my $i = 0;
737     my @arr;
738    
739     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
740     push @arr, $v;
741     }
742    
743 dpavlin 373 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
744    
745 dpavlin 367 return @arr;
746     }
747    
748 dpavlin 373 =head2 fill_in_to_arr
749    
750     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
751     for fields which have lookups, so they shouldn't be parsed but rather
752     C<fill_id>ed.
753    
754     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
755    
756     =cut
757    
758     sub fill_in_to_arr {
759     my $self = shift;
760    
761     my ($rec, $format_utf8) = @_;
762    
763     my $log = $self->_get_logger();
764    
765     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
766     return if (! $format_utf8);
767    
768     my $i = 0;
769     my @arr;
770    
771     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
772     push @arr, @v;
773     }
774    
775     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
776    
777     return @arr;
778     }
779    
780 dpavlin 441 =head2 sort_arr
781 dpavlin 373
782 dpavlin 441 Sort array ignoring case and html in data
783    
784     my @sorted = $webpac->sort_arr(@unsorted);
785    
786     =cut
787    
788     sub sort_arr {
789     my $self = shift;
790    
791     my $log = $self->_get_logger();
792    
793     # FIXME add Schwartzian Transformation?
794    
795     my @sorted = sort {
796     $a =~ s#<[^>]+/*>##;
797     $b =~ s#<[^>]+/*>##;
798     lc($b) cmp lc($a)
799     } @_;
800     $log->debug("sorted values: ",sub { join(", ",@sorted) });
801    
802     return @sorted;
803     }
804    
805    
806 dpavlin 366 =head2 data_structure
807    
808     Create in-memory data structure which represents layout from C<import_xml>.
809     It is used later to produce output.
810    
811 dpavlin 368 my @ds = $webpac->data_structure($rec);
812 dpavlin 366
813 dpavlin 374 This method will also set C<$webpac->{'currnet_filename'}> if there is
814 dpavlin 398 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
815     <headline> tag.
816 dpavlin 374
817 dpavlin 366 =cut
818    
819 dpavlin 372 sub data_structure {
820 dpavlin 366 my $self = shift;
821    
822 dpavlin 372 my $log = $self->_get_logger();
823 dpavlin 366
824     my $rec = shift;
825 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
826 dpavlin 366
827 dpavlin 374 undef $self->{'currnet_filename'};
828 dpavlin 398 undef $self->{'headline'};
829 dpavlin 374
830 dpavlin 366 my @sorted_tags;
831     if ($self->{tags_by_order}) {
832     @sorted_tags = @{$self->{tags_by_order}};
833     } else {
834     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
835     $self->{tags_by_order} = \@sorted_tags;
836     }
837    
838 dpavlin 368 my @ds;
839 dpavlin 366
840 dpavlin 373 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
841    
842 dpavlin 366 foreach my $field (@sorted_tags) {
843    
844     my $row;
845    
846     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
847    
848     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
849 dpavlin 373 my $format = $tag->{'value'} || $tag->{'content'};
850 dpavlin 366
851 dpavlin 373 $log->debug("format: $format");
852    
853     my @v;
854     if ($format =~ /$LOOKUP_REGEX/o) {
855     @v = $self->fill_in_to_arr($rec,$format);
856     } else {
857     @v = $self->parse_to_arr($rec,$format);
858     }
859 dpavlin 367 next if (! @v);
860 dpavlin 366
861 dpavlin 439 if ($tag->{'sort'}) {
862 dpavlin 441 @v = $self->sort_arr(@v);
863 dpavlin 439 $log->warn("sort within tag is usually not what you want!");
864     }
865    
866 dpavlin 375 # use format?
867     if ($tag->{'format_name'}) {
868     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
869     }
870    
871 dpavlin 398 if ($field eq 'filename') {
872     $self->{'current_filename'} = join('',@v);
873     $log->debug("filename: ",$self->{'current_filename'});
874     } elsif ($field eq 'headline') {
875     $self->{'headline'} .= join('',@v);
876     $log->debug("headline: ",$self->{'headline'});
877     next; # don't return headline in data_structure!
878     }
879    
880 dpavlin 439 # delimiter will join repeatable fields
881     if ($tag->{'delimiter'}) {
882     @v = ( join($tag->{'delimiter'}, @v) );
883 dpavlin 366 }
884 dpavlin 373
885 dpavlin 439 # default types
886     my @types = qw(display swish);
887     # override by type attribute
888     @types = ( $tag->{'type'} ) if ($tag->{'type'});
889 dpavlin 374
890 dpavlin 439 foreach my $type (@types) {
891     # append to previous line?
892     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
893     if ($tag->{'append'}) {
894    
895     # I will delimit appended part with
896     # delimiter (or ,)
897     my $d = $tag->{'delimiter'};
898     # default delimiter
899     $d ||= ", ";
900    
901     my $last = pop @{$row->{$type}};
902     $d = "" if (! $last);
903     $last .= $d . join($d, @v);
904     push @{$row->{$type}}, $last;
905    
906     } else {
907     push @{$row->{$type}}, @v;
908     }
909     }
910    
911    
912 dpavlin 366 }
913    
914 dpavlin 368 if ($row) {
915     $row->{'tag'} = $field;
916 dpavlin 375
917     # TODO: name_sigular, name_plural
918     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
919     $row->{'name'} = $name ? $self->_x($name) : $field;
920    
921 dpavlin 439 # post-sort all values in field
922     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
923     $log->warn("sort at field tag not implemented");
924     }
925    
926 dpavlin 368 push @ds, $row;
927 dpavlin 374
928 dpavlin 373 $log->debug("row $field: ",sub { Dumper($row) });
929 dpavlin 368 }
930 dpavlin 366
931     }
932    
933 dpavlin 370 return @ds;
934 dpavlin 366
935     }
936    
937 dpavlin 370 =head2 output
938    
939     Create output from in-memory data structure using Template Toolkit template.
940    
941     my $text = $webpac->output( template => 'text.tt', data => @ds );
942    
943     =cut
944    
945     sub output {
946     my $self = shift;
947    
948     my $args = {@_};
949    
950 dpavlin 372 my $log = $self->_get_logger();
951 dpavlin 370
952 dpavlin 372 $log->logconfess("need template name") if (! $args->{'template'});
953     $log->logconfess("need data array") if (! $args->{'data'});
954    
955 dpavlin 370 my $out;
956    
957     $self->{'tt'}->process(
958     $args->{'template'},
959     $args,
960     \$out
961     ) || confess $self->{'tt'}->error();
962    
963     return $out;
964     }
965    
966 dpavlin 411 =head2 output_file
967    
968     Create output from in-memory data structure using Template Toolkit template
969     to a file.
970    
971     $webpac->output_file(
972     file => 'out.txt',
973     template => 'text.tt',
974     data => @ds
975     );
976    
977     =cut
978    
979     sub output_file {
980     my $self = shift;
981    
982     my $args = {@_};
983    
984     my $log = $self->_get_logger();
985    
986 dpavlin 421 my $file = $args->{'file'} || $log->logconfess("need file name");
987 dpavlin 411
988 dpavlin 421 $log->debug("creating file ",$file);
989 dpavlin 411
990 dpavlin 421 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
991 dpavlin 411 print $fh $self->output(
992     template => $args->{'template'},
993     data => $args->{'data'},
994     ) || $log->logdie("print: $!");
995     close($fh) || $log->logdie("close: $!");
996     }
997    
998 dpavlin 375 =head2 apply_format
999    
1000     Apply format specified in tag with C<format_name="name"> and
1001     C<format_delimiter=";;">.
1002    
1003     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1004    
1005     Formats can contain C<lookup{...}> if you need them.
1006    
1007     =cut
1008    
1009     sub apply_format {
1010     my $self = shift;
1011    
1012     my ($name,$delimiter,$data) = @_;
1013    
1014     my $log = $self->_get_logger();
1015    
1016     if (! $self->{'import_xml'}->{'format'}->{$name}) {
1017     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1018     return $data;
1019     }
1020    
1021     $log->warn("no delimiter for format $name") if (! $delimiter);
1022    
1023     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1024    
1025     my @data = split(/\Q$delimiter\E/, $data);
1026    
1027     my $out = sprintf($format, @data);
1028     $log->debug("using format $name [$format] on $data to produce: $out");
1029    
1030     if ($out =~ m/$LOOKUP_REGEX/o) {
1031     return $self->lookup($out);
1032     } else {
1033     return $out;
1034     }
1035    
1036     }
1037    
1038    
1039 dpavlin 372 #
1040     #
1041     #
1042    
1043     =head1 INTERNAL METHODS
1044    
1045     Here is a quick list of internal methods, mostly useful to turn debugging
1046     on them (see L<LOGGING> below for explanation).
1047    
1048     =cut
1049    
1050     =head2 _eval
1051    
1052     Internal function to eval code without C<strict 'subs'>.
1053    
1054     =cut
1055    
1056     sub _eval {
1057     my $self = shift;
1058    
1059     my $code = shift || return;
1060    
1061     my $log = $self->_get_logger();
1062    
1063     no strict 'subs';
1064     my $ret = eval $code;
1065     if ($@) {
1066     $log->error("problem with eval code [$code]: $@");
1067     }
1068    
1069     $log->debug("eval: ",$code," [",$ret,"]");
1070    
1071     return $ret || 0;
1072     }
1073    
1074     =head2 _sort_by_order
1075    
1076     Sort xml tags data structure accoding to C<order=""> attribute.
1077    
1078     =cut
1079    
1080     sub _sort_by_order {
1081     my $self = shift;
1082    
1083     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1084     $self->{'import_xml'}->{'indexer'}->{$a};
1085     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1086     $self->{'import_xml'}->{'indexer'}->{$b};
1087    
1088     return $va <=> $vb;
1089     }
1090    
1091 dpavlin 375 =head2 _get_logger
1092    
1093     Get C<Log::Log4perl> object with a twist: domains are defined for each
1094     method
1095    
1096     my $log = $webpac->_get_logger();
1097    
1098     =cut
1099    
1100 dpavlin 372 sub _get_logger {
1101     my $self = shift;
1102    
1103 dpavlin 374 my $name = (caller(1))[3] || caller;
1104     return get_logger($name);
1105 dpavlin 372 }
1106    
1107 dpavlin 375 =head2 _x
1108    
1109     Convert string from UTF-8 to code page defined in C<import_xml>.
1110    
1111     my $text = $webpac->_x('utf8 text');
1112    
1113     =cut
1114    
1115     sub _x {
1116     my $self = shift;
1117     my $utf8 = shift || return;
1118    
1119     return $self->{'utf2cp'}->convert($utf8) ||
1120     $self->_get_logger()->logwarn("can't convert '$utf8'");
1121     }
1122    
1123 dpavlin 372 #
1124     #
1125     #
1126    
1127     =head1 LOGGING
1128    
1129     Logging in WebPAC is performed by L<Log::Log4perl> with config file
1130     C<log.conf>.
1131    
1132     Methods defined above have different levels of logging, so
1133     it's descriptions will be useful to turn (mostry B<debug> logging) on
1134     or off to see why WabPAC isn't perforing as you expect it (it might even
1135     be a bug!).
1136    
1137     B<This is different from normal Log4perl behaviour>. To repeat, you can
1138     also use method names, and not only classes (which are just few)
1139     to filter logging.
1140    
1141 dpavlin 422
1142     =head1 MEMORY USAGE
1143    
1144     C<low_mem> options is double-edged sword. If enabled, WebPAC
1145     will run on memory constraint machines (which doesn't have enough
1146     physical RAM to create memory structure for whole source database).
1147    
1148     If your machine has 512Mb or more of RAM and database is around 10000 records,
1149     memory shouldn't be an issue. If you don't have enough physical RAM, you
1150     might consider using virtual memory (if your operating system is handling it
1151     well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1152     parsed structure of ISIS database (this is what C<low_mem> option does).
1153    
1154     Hitting swap at end of reading source database is probably o.k. However,
1155     hitting swap before 90% will dramatically decrease performance and you will
1156     be better off with C<low_mem> and using rest of availble memory for
1157     operating system disk cache (Linux is particuallary good about this).
1158     However, every access to database record will require disk access, so
1159     generation phase will be slower 10-100 times.
1160    
1161     Parsed structures are essential - you just have option to trade RAM memory
1162     (which is fast) for disk space (which is slow). Be sure to have planty of
1163     disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1164    
1165     However, when WebPAC is running on desktop machines (or laptops :-), it's
1166     highly undesireable for system to start swapping. Using C<low_mem> option can
1167     reduce WecPAC memory usage to around 64Mb for same database with lookup
1168     fields and sorted indexes which stay in RAM. Performance will suffer, but
1169     memory usage will really be minimal. It might be also more confortable to
1170     run WebPAC reniced on those machines.
1171    
1172 dpavlin 372 =cut
1173    
1174 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26