/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 459 - (hide annotations)
Tue Sep 21 19:08:11 2004 UTC (16 years, 9 months ago) by dpavlin
File size: 26360 byte(s)
fix warning

1 dpavlin 354 package WebPAC;
2 dpavlin 352
3 dpavlin 367 use warnings;
4     use strict;
5    
6 dpavlin 352 use Carp;
7 dpavlin 353 use Text::Iconv;
8     use Config::IniFiles;
9 dpavlin 363 use XML::Simple;
10 dpavlin 370 use Template;
11 dpavlin 372 use Log::Log4perl qw(get_logger :levels);
12 dpavlin 422 use Time::HiRes qw(time);
13 dpavlin 352
14 dpavlin 358 use Data::Dumper;
15    
16 dpavlin 373 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17     #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18     my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19     my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20    
21 dpavlin 352 =head1 NAME
22    
23 dpavlin 354 WebPAC - base class for WebPAC
24 dpavlin 352
25     =head1 DESCRIPTION
26    
27 dpavlin 354 This module implements methods used by WebPAC.
28 dpavlin 352
29     =head1 METHODS
30    
31     =head2 new
32    
33 dpavlin 421 Create new instance of WebPAC using configuration specified by C<config_file>.
34 dpavlin 352
35 dpavlin 354 my $webpac = new WebPAC(
36 dpavlin 352 config_file => 'name.conf',
37 dpavlin 439 code_page => 'ISO-8859-2',
38     low_mem => 1,
39 dpavlin 352 );
40    
41     Default C<code_page> is C<ISO-8859-2>.
42    
43 dpavlin 422 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44    
45 dpavlin 421 This method will also read configuration files
46 dpavlin 353 C<global.conf> (used by indexer and Web font-end)
47     and configuration file specified by C<config_file>
48     which describes databases to be indexed.
49    
50 dpavlin 352 =cut
51    
52 dpavlin 363 # mapping between data type and tag which specify
53     # format in XML file
54     my %type2tag = (
55     'isis' => 'isis',
56     # 'excel' => 'column',
57     # 'marc' => 'marc',
58     # 'feed' => 'feed'
59     );
60    
61 dpavlin 352 sub new {
62     my $class = shift;
63     my $self = {@_};
64     bless($self, $class);
65    
66 dpavlin 422 $self->{'start_t'} = time();
67    
68 dpavlin 372 my $log_file = $self->{'log'} || "log.conf";
69     Log::Log4perl->init($log_file);
70    
71     my $log = $self->_get_logger();
72    
73 dpavlin 352 # fill in default values
74     # output codepage
75     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
76    
77 dpavlin 353 #
78     # read global.conf
79     #
80 dpavlin 372 $log->debug("read 'global.conf'");
81 dpavlin 352
82 dpavlin 372 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
83 dpavlin 352
84     # read global config parametars
85     foreach my $var (qw(
86     dbi_dbd
87     dbi_dsn
88     dbi_user
89     dbi_passwd
90     show_progress
91     my_unac_filter
92 dpavlin 370 output_template
93 dpavlin 352 )) {
94 dpavlin 370 $self->{'global_config'}->{$var} = $config->val('global', $var);
95 dpavlin 352 }
96    
97 dpavlin 353 #
98     # read indexer config file
99     #
100 dpavlin 352
101 dpavlin 372 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
102 dpavlin 352
103 dpavlin 370 # create UTF-8 convertor for import_xml files
104 dpavlin 366 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
105 dpavlin 370
106     # create Template toolkit instance
107     $self->{'tt'} = Template->new(
108     INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
109     # FILTERS => {
110     # 'foo' => \&foo_filter,
111     # },
112     EVAL_PERL => 1,
113     );
114    
115 dpavlin 421 # running with low_mem flag? well, use DBM::Deep then.
116     if ($self->{'low_mem'}) {
117 dpavlin 422 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118 dpavlin 421
119     my $db_file = "data.db";
120    
121     if (-e $db_file) {
122     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123     $log->debug("removed '$db_file' from last run");
124     }
125    
126 dpavlin 434 require DBM::Deep;
127 dpavlin 421
128     my $db = new DBM::Deep $db_file;
129    
130     $log->logdie("DBM::Deep error: $!") unless ($db);
131    
132     if ($db->error()) {
133     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134     } else {
135 dpavlin 422 $log->debug("using file '$db_file' for DBM::Deep");
136 dpavlin 421 }
137    
138     $self->{'db'} = $db;
139     }
140    
141 dpavlin 352 return $self;
142     }
143    
144     =head2 open_isis
145    
146     Open CDS/ISIS database using OpenIsis module and read all records to memory.
147    
148     $webpac->open_isis(
149     filename => '/data/ISIS/ISIS',
150     code_page => '852',
151 dpavlin 431 limit_mfn => 500,
152     start_mfn => 6000,
153 dpavlin 352 lookup => [ ... ],
154     );
155    
156     By default, ISIS code page is assumed to be C<852>.
157    
158 dpavlin 431 If optional parametar C<start_mfn> is set, this will be first MFN to read
159     from database (so you can skip beginning of your database if you need to).
160    
161 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
162     from database in example above.
163 dpavlin 352
164     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
165     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
166     value in index.
167    
168     lookup => [
169     { 'key' => 'd:v900', 'val' => 'v250^a' },
170     { 'eval' => '"v901^a" eq "Podruèje"',
171     'key' => 'pa:v561^4:v562^4:v461^1',
172     'val' => 'v900' },
173     ]
174    
175 dpavlin 367 Returns number of last record read into memory (size of database, really).
176    
177 dpavlin 352 =cut
178    
179     sub open_isis {
180     my $self = shift;
181     my $arg = {@_};
182    
183 dpavlin 372 my $log = $self->_get_logger();
184    
185     $log->logcroak("need filename") if (! $arg->{'filename'});
186 dpavlin 352 my $code_page = $arg->{'code_page'} || '852';
187    
188 dpavlin 436 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
189    
190 dpavlin 421 # store data in object
191     $self->{'isis_filename'} = $arg->{'filename'};
192     $self->{'isis_code_page'} = $code_page;
193    
194 dpavlin 353 use OpenIsis;
195    
196 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
197    
198     # create Text::Iconv object
199     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
200    
201 dpavlin 372 $log->info("reading ISIS database '",$arg->{'filename'},"'");
202 dpavlin 421 $log->debug("isis code page: $code_page");
203 dpavlin 363
204 dpavlin 352 my $isis_db = OpenIsis::open($arg->{'filename'});
205    
206     my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
207 dpavlin 431 my $startmfn = 1;
208 dpavlin 352
209 dpavlin 431 if (my $s = $self->{'start_mfn'}) {
210     $log->info("skipping to MFN $s");
211     $startmfn = $s;
212 dpavlin 448 } else {
213     $self->{'start_mfn'} = $startmfn;
214 dpavlin 431 }
215 dpavlin 357
216 dpavlin 431 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
217 dpavlin 363
218 dpavlin 431 $log->info("processing ",($maxmfn-$startmfn)." records...");
219    
220 dpavlin 352 # read database
221 dpavlin 431 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
222 dpavlin 352
223 dpavlin 418
224     $log->debug("mfn: $mfn\n");
225    
226 dpavlin 421 my $rec;
227    
228 dpavlin 352 # read record
229     my $row = OpenIsis::read( $isis_db, $mfn );
230     foreach my $k (keys %{$row}) {
231     if ($k ne "mfn") {
232     foreach my $l (@{$row->{$k}}) {
233     $l = $cp->convert($l);
234     # has subfields?
235     my $val;
236     if ($l =~ m/\^/) {
237     foreach my $t (split(/\^/,$l)) {
238     next if (! $t);
239     $val->{substr($t,0,1)} = substr($t,1);
240     }
241     } else {
242     $val = $l;
243     }
244    
245 dpavlin 421 push @{$rec->{$k}}, $val;
246 dpavlin 352 }
247 dpavlin 374 } else {
248 dpavlin 421 push @{$rec->{'000'}}, $mfn;
249 dpavlin 352 }
250    
251     }
252    
253 dpavlin 421 $log->confess("record $mfn empty?") unless ($rec);
254    
255     # store
256     if ($self->{'low_mem'}) {
257     $self->{'db'}->put($mfn, $rec);
258     } else {
259     $self->{'data'}->{$mfn} = $rec;
260     }
261    
262 dpavlin 352 # create lookup
263 dpavlin 355 $self->create_lookup($rec, @{$arg->{'lookup'}});
264 dpavlin 352
265 dpavlin 389 $self->progress_bar($mfn,$maxmfn);
266    
267 dpavlin 352 }
268    
269 dpavlin 448 $self->{'current_mfn'} = -1;
270 dpavlin 389 $self->{'last_pcnt'} = 0;
271 dpavlin 362
272 dpavlin 418 $log->debug("max mfn: $maxmfn");
273    
274 dpavlin 352 # store max mfn and return it.
275     return $self->{'max_mfn'} = $maxmfn;
276     }
277    
278 dpavlin 362 =head2 fetch_rec
279    
280     Fetch next record from database. It will also display progress bar (once
281     it's implemented, that is).
282    
283     my $rec = $webpac->fetch_rec;
284    
285     =cut
286    
287     sub fetch_rec {
288     my $self = shift;
289    
290 dpavlin 372 my $log = $self->_get_logger();
291 dpavlin 362
292 dpavlin 448 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
293 dpavlin 372
294 dpavlin 448 if ($self->{'current_mfn'} == -1) {
295     $self->{'current_mfn'} = $self->{'start_mfn'};
296     } else {
297     $self->{'current_mfn'}++;
298     }
299    
300     my $mfn = $self->{'current_mfn'};
301    
302 dpavlin 362 if ($mfn > $self->{'max_mfn'}) {
303     $self->{'current_mfn'} = $self->{'max_mfn'};
304 dpavlin 373 $log->debug("at EOF");
305 dpavlin 362 return;
306     }
307    
308 dpavlin 389 $self->progress_bar($mfn,$self->{'max_mfn'});
309    
310 dpavlin 421 if ($self->{'low_mem'}) {
311     return $self->{'db'}->get($mfn);
312     } else {
313     return $self->{'data'}->{$mfn};
314     }
315 dpavlin 362 }
316    
317 dpavlin 448 =head2 mfn
318    
319     Returns current record number (MFN).
320    
321     print $webpac->mfn;
322    
323     =cut
324    
325     sub mfn {
326     my $self = shift;
327     return $self->{'current_mfn'};
328     }
329    
330 dpavlin 389 =head2 progress_bar
331    
332     Draw progress bar on STDERR.
333    
334     $webpac->progress_bar($current, $max);
335    
336     =cut
337    
338     sub progress_bar {
339     my $self = shift;
340    
341     my ($curr,$max) = @_;
342    
343     my $log = $self->_get_logger();
344    
345     $log->logconfess("no current value!") if (! $curr);
346     $log->logconfess("no maximum value!") if (! $max);
347    
348     if ($curr > $max) {
349     $max = $curr;
350     $log->debug("overflow to $curr");
351     }
352    
353     $self->{'last_pcnt'} ||= 1;
354    
355 dpavlin 454 my $p = int($curr * 100 / $max) || 1;
356 dpavlin 389
357 dpavlin 422 # reset on re-run
358     if ($p < $self->{'last_pcnt'}) {
359     $self->{'last_pcnt'} = $p;
360     $self->{'last_t'} = time();
361 dpavlin 453 $self->{'last_curr'} = undef;
362 dpavlin 422 }
363    
364 dpavlin 459 $self->{'last_t'} ||= time();
365    
366 dpavlin 389 if ($p != $self->{'last_pcnt'}) {
367 dpavlin 422
368     my $last_curr = $self->{'last_curr'} || $curr;
369     my $t = time();
370     my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
371     my $eta = ($max-$curr) / ($rate || 1);
372     printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
373 dpavlin 389 $self->{'last_pcnt'} = $p;
374 dpavlin 422 $self->{'last_t'} = time();
375     $self->{'last_curr'} = $curr;
376 dpavlin 389 }
377 dpavlin 412 print STDERR "\n" if ($p == 100);
378 dpavlin 389 }
379    
380 dpavlin 422 =head2 fmt_time
381    
382     Format time (in seconds) for display.
383    
384     print $webpac->fmt_time(time());
385    
386     This method is called by L<progress_bar> to display remaining time.
387    
388     =cut
389    
390     sub fmt_time {
391     my $self = shift;
392    
393     my $t = shift || 0;
394     my $out = "";
395    
396     my ($ss,$mm,$hh) = gmtime($t);
397     $out .= "${hh}h" if ($hh);
398     $out .= sprintf("%02d:%02d", $mm,$ss);
399     $out .= " " if ($hh == 0);
400     return $out;
401     }
402    
403 dpavlin 363 =head2 open_import_xml
404    
405     Read file from C<import_xml/> directory and parse it.
406    
407     $webpac->open_import_xml(type => 'isis');
408    
409     =cut
410    
411     sub open_import_xml {
412     my $self = shift;
413    
414 dpavlin 372 my $log = $self->_get_logger();
415    
416 dpavlin 363 my $arg = {@_};
417 dpavlin 372 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
418 dpavlin 363
419 dpavlin 366 $self->{'type'} = $arg->{'type'};
420 dpavlin 363
421 dpavlin 366 my $type_base = $arg->{'type'};
422 dpavlin 363 $type_base =~ s/_.*$//g;
423    
424 dpavlin 366 $self->{'tag'} = $type2tag{$type_base};
425    
426 dpavlin 375 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
427 dpavlin 366
428     my $f = "./import_xml/".$self->{'type'}.".xml";
429 dpavlin 372 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
430 dpavlin 363
431 dpavlin 375 $log->info("reading '$f'");
432 dpavlin 363
433 dpavlin 375 $self->{'import_xml_file'} = $f;
434    
435 dpavlin 363 $self->{'import_xml'} = XMLin($f,
436 dpavlin 366 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
437 dpavlin 363 );
438    
439 dpavlin 375 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
440    
441 dpavlin 363 }
442    
443 dpavlin 355 =head2 create_lookup
444    
445     Create lookup from record using lookup definition.
446    
447 dpavlin 367 $self->create_lookup($rec, @lookups);
448    
449     Called internally by C<open_*> methods.
450    
451 dpavlin 355 =cut
452    
453     sub create_lookup {
454     my $self = shift;
455    
456 dpavlin 372 my $log = $self->_get_logger();
457 dpavlin 355
458 dpavlin 372 my $rec = shift || $log->logconfess("need record to create lookup");
459     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
460    
461 dpavlin 355 foreach my $i (@_) {
462 dpavlin 418 $log->logconfess("need key") unless defined($i->{'key'});
463     $log->logconfess("need val") unless defined($i->{'val'});
464    
465     if (defined($i->{'eval'})) {
466     # eval first, so we can skip fill_in for key and val
467     my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
468     if ($self->_eval($eval)) {
469     my $key = $self->fill_in($rec,$i->{'key'}) || next;
470     my @val = $self->fill_in($rec,$i->{'val'}) || next;
471 dpavlin 373 $log->debug("stored $key = ",sub { join(" | ",@val) });
472 dpavlin 355 push @{$self->{'lookup'}->{$key}}, @val;
473     }
474     } else {
475 dpavlin 418 my $key = $self->fill_in($rec,$i->{'key'}) || next;
476     my @val = $self->fill_in($rec,$i->{'val'}) || next;
477     $log->debug("stored $key = ",sub { join(" | ",@val) });
478     push @{$self->{'lookup'}->{$key}}, @val;
479 dpavlin 355 }
480     }
481     }
482    
483 dpavlin 356 =head2 get_data
484    
485     Returns value from record.
486    
487 dpavlin 367 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
488 dpavlin 356
489     Arguments are:
490     record reference C<$rec>,
491     field C<$f>,
492     optional subfiled C<$sf>,
493     index for repeatable values C<$i>.
494    
495 dpavlin 367 Optinal variable C<$found> will be incremeted if there
496 dpavlin 356 is field.
497    
498     Returns value or empty string.
499    
500     =cut
501    
502     sub get_data {
503     my $self = shift;
504    
505     my ($rec,$f,$sf,$i,$found) = @_;
506 dpavlin 367
507 dpavlin 356 if ($$rec->{$f}) {
508 dpavlin 367 return '' if (! $$rec->{$f}->[$i]);
509 dpavlin 389 no strict 'refs';
510 dpavlin 356 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
511     $$found++ if (defined($$found));
512     return $$rec->{$f}->[$i]->{$sf};
513     } elsif ($$rec->{$f}->[$i]) {
514     $$found++ if (defined($$found));
515 dpavlin 366 # it still might have subfield, just
516     # not specified, so we'll dump all
517     if ($$rec->{$f}->[$i] =~ /HASH/o) {
518     my $out;
519     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
520     $out .= $$rec->{$f}->[$i]->{$k}." ";
521     }
522     return $out;
523     } else {
524     return $$rec->{$f}->[$i];
525     }
526 dpavlin 356 }
527     } else {
528     return '';
529     }
530     }
531    
532 dpavlin 352 =head2 fill_in
533    
534     Workhourse of all: takes record from in-memory structure of database and
535     strings with placeholders and returns string or array of with substituted
536     values from record.
537    
538 dpavlin 367 my $text = $webpac->fill_in($rec,'v250^a');
539 dpavlin 352
540     Optional argument is ordinal number for repeatable fields. By default,
541 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
542     element is 0).
543     Following example will read second value from repeatable field.
544 dpavlin 352
545 dpavlin 367 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
546 dpavlin 353
547     This function B<does not> perform parsing of format to inteligenty skip
548     delimiters before fields which aren't used.
549    
550 dpavlin 376 This method will automatically decode UTF-8 string to local code page
551     if needed.
552    
553 dpavlin 352 =cut
554    
555 dpavlin 372 sub fill_in {
556 dpavlin 371 my $self = shift;
557    
558 dpavlin 372 my $log = $self->_get_logger();
559 dpavlin 371
560 dpavlin 372 my $rec = shift || $log->logconfess("need data record");
561     my $format = shift || $log->logconfess("need format to parse");
562 dpavlin 352 # iteration (for repeatable fields)
563     my $i = shift || 0;
564    
565     # FIXME remove for speedup?
566 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
567 dpavlin 352
568 dpavlin 376 if (utf8::is_utf8($format)) {
569     $format = $self->_x($format);
570     }
571    
572 dpavlin 352 my $found = 0;
573    
574 dpavlin 359 my $eval_code;
575     # remove eval{...} from beginning
576     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
577    
578 dpavlin 352 # do actual replacement of placeholders
579 dpavlin 373 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
580 dpavlin 352
581 dpavlin 353 if ($found) {
582 dpavlin 373 $log->debug("format: $format");
583 dpavlin 359 if ($eval_code) {
584     my $eval = $self->fill_in($rec,$eval_code,$i);
585 dpavlin 371 return if (! $self->_eval($eval));
586 dpavlin 359 }
587 dpavlin 353 # do we have lookups?
588 dpavlin 373 if ($format =~ /$LOOKUP_REGEX/o) {
589     $log->debug("format '$format' has lookup");
590 dpavlin 353 return $self->lookup($format);
591     } else {
592     return $format;
593     }
594 dpavlin 352 } else {
595     return;
596     }
597     }
598    
599     =head2 lookup
600    
601 dpavlin 355 Perform lookups on format supplied to it.
602 dpavlin 352
603 dpavlin 367 my $text = $self->lookup('[v900]');
604 dpavlin 352
605 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
606    
607 dpavlin 352 =cut
608    
609     sub lookup {
610     my $self = shift;
611    
612 dpavlin 372 my $log = $self->_get_logger();
613 dpavlin 352
614 dpavlin 372 my $tmp = shift || $log->logconfess("need format");
615    
616 dpavlin 373 if ($tmp =~ /$LOOKUP_REGEX/o) {
617 dpavlin 352 my @in = ( $tmp );
618 dpavlin 372
619     $log->debug("lookup for: ",$tmp);
620    
621 dpavlin 352 my @out;
622     while (my $f = shift @in) {
623 dpavlin 373 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
624 dpavlin 352 my $k = $1;
625     if ($self->{'lookup'}->{$k}) {
626     foreach my $nv (@{$self->{'lookup'}->{$k}}) {
627     my $tmp2 = $f;
628 dpavlin 373 $tmp2 =~ s/lookup{$k}/$nv/g;
629 dpavlin 352 push @in, $tmp2;
630     }
631     } else {
632     undef $f;
633     }
634     } elsif ($f) {
635     push @out, $f;
636     }
637     }
638 dpavlin 373 $log->logconfess("return is array and it's not expected!") unless wantarray;
639 dpavlin 352 return @out;
640     } else {
641     return $tmp;
642     }
643     }
644    
645 dpavlin 356 =head2 parse
646    
647     Perform smart parsing of string, skipping delimiters for fields which aren't
648     defined. It can also eval code in format starting with C<eval{...}> and
649     return output or nothing depending on eval code.
650    
651 dpavlin 367 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
652 dpavlin 356
653     =cut
654    
655     sub parse {
656     my $self = shift;
657    
658 dpavlin 366 my ($rec, $format_utf8, $i) = @_;
659 dpavlin 356
660 dpavlin 366 return if (! $format_utf8);
661    
662 dpavlin 372 my $log = $self->_get_logger();
663 dpavlin 358
664 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
665     $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
666    
667 dpavlin 358 $i = 0 if (! $i);
668    
669 dpavlin 375 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
670 dpavlin 366
671 dpavlin 356 my @out;
672    
673 dpavlin 373 $log->debug("format: $format");
674    
675 dpavlin 356 my $eval_code;
676     # remove eval{...} from beginning
677     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
678    
679 dpavlin 358 my $prefix;
680     my $all_found=0;
681 dpavlin 356
682 dpavlin 373 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
683 dpavlin 356
684 dpavlin 358 my $del = $1 || '';
685 dpavlin 359 $prefix ||= $del if ($all_found == 0);
686 dpavlin 358
687     my $found = 0;
688     my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
689    
690 dpavlin 356 if ($found) {
691 dpavlin 358 push @out, $del;
692     push @out, $tmp;
693     $all_found += $found;
694 dpavlin 356 }
695     }
696    
697 dpavlin 358 return if (! $all_found);
698 dpavlin 356
699 dpavlin 373 my $out = join('',@out);
700 dpavlin 358
701 dpavlin 373 if ($out) {
702     # add rest of format (suffix)
703     $out .= $format;
704 dpavlin 367
705 dpavlin 373 # add prefix if not there
706     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
707    
708     $log->debug("result: $out");
709     }
710    
711 dpavlin 359 if ($eval_code) {
712     my $eval = $self->fill_in($rec,$eval_code,$i);
713 dpavlin 373 $log->debug("about to eval{",$eval,"} format: $out");
714 dpavlin 371 return if (! $self->_eval($eval));
715 dpavlin 359 }
716    
717 dpavlin 358 return $out;
718 dpavlin 356 }
719    
720 dpavlin 367 =head2 parse_to_arr
721    
722     Similar to C<parse>, but returns array of all repeatable fields
723    
724     my @arr = $webpac->parse_to_arr($rec,'v250^a');
725    
726     =cut
727    
728     sub parse_to_arr {
729     my $self = shift;
730    
731     my ($rec, $format_utf8) = @_;
732    
733 dpavlin 372 my $log = $self->_get_logger();
734    
735     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
736 dpavlin 367 return if (! $format_utf8);
737    
738     my $i = 0;
739     my @arr;
740    
741     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
742     push @arr, $v;
743     }
744    
745 dpavlin 373 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
746    
747 dpavlin 367 return @arr;
748     }
749    
750 dpavlin 373 =head2 fill_in_to_arr
751    
752     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
753     for fields which have lookups, so they shouldn't be parsed but rather
754     C<fill_id>ed.
755    
756     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
757    
758     =cut
759    
760     sub fill_in_to_arr {
761     my $self = shift;
762    
763     my ($rec, $format_utf8) = @_;
764    
765     my $log = $self->_get_logger();
766    
767     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
768     return if (! $format_utf8);
769    
770     my $i = 0;
771     my @arr;
772    
773     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
774     push @arr, @v;
775     }
776    
777     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
778    
779     return @arr;
780     }
781    
782 dpavlin 441 =head2 sort_arr
783 dpavlin 373
784 dpavlin 441 Sort array ignoring case and html in data
785    
786     my @sorted = $webpac->sort_arr(@unsorted);
787    
788     =cut
789    
790     sub sort_arr {
791     my $self = shift;
792    
793     my $log = $self->_get_logger();
794    
795     # FIXME add Schwartzian Transformation?
796    
797     my @sorted = sort {
798     $a =~ s#<[^>]+/*>##;
799     $b =~ s#<[^>]+/*>##;
800     lc($b) cmp lc($a)
801     } @_;
802     $log->debug("sorted values: ",sub { join(", ",@sorted) });
803    
804     return @sorted;
805     }
806    
807    
808 dpavlin 366 =head2 data_structure
809    
810     Create in-memory data structure which represents layout from C<import_xml>.
811     It is used later to produce output.
812    
813 dpavlin 368 my @ds = $webpac->data_structure($rec);
814 dpavlin 366
815 dpavlin 374 This method will also set C<$webpac->{'currnet_filename'}> if there is
816 dpavlin 398 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
817     <headline> tag.
818 dpavlin 374
819 dpavlin 366 =cut
820    
821 dpavlin 372 sub data_structure {
822 dpavlin 366 my $self = shift;
823    
824 dpavlin 372 my $log = $self->_get_logger();
825 dpavlin 366
826     my $rec = shift;
827 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
828 dpavlin 366
829 dpavlin 374 undef $self->{'currnet_filename'};
830 dpavlin 398 undef $self->{'headline'};
831 dpavlin 374
832 dpavlin 366 my @sorted_tags;
833     if ($self->{tags_by_order}) {
834     @sorted_tags = @{$self->{tags_by_order}};
835     } else {
836     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
837     $self->{tags_by_order} = \@sorted_tags;
838     }
839    
840 dpavlin 368 my @ds;
841 dpavlin 366
842 dpavlin 373 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
843    
844 dpavlin 366 foreach my $field (@sorted_tags) {
845    
846     my $row;
847    
848     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
849    
850     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
851 dpavlin 373 my $format = $tag->{'value'} || $tag->{'content'};
852 dpavlin 366
853 dpavlin 373 $log->debug("format: $format");
854    
855     my @v;
856     if ($format =~ /$LOOKUP_REGEX/o) {
857     @v = $self->fill_in_to_arr($rec,$format);
858     } else {
859     @v = $self->parse_to_arr($rec,$format);
860     }
861 dpavlin 367 next if (! @v);
862 dpavlin 366
863 dpavlin 439 if ($tag->{'sort'}) {
864 dpavlin 441 @v = $self->sort_arr(@v);
865 dpavlin 439 $log->warn("sort within tag is usually not what you want!");
866     }
867    
868 dpavlin 375 # use format?
869     if ($tag->{'format_name'}) {
870     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
871     }
872    
873 dpavlin 398 if ($field eq 'filename') {
874     $self->{'current_filename'} = join('',@v);
875     $log->debug("filename: ",$self->{'current_filename'});
876     } elsif ($field eq 'headline') {
877     $self->{'headline'} .= join('',@v);
878     $log->debug("headline: ",$self->{'headline'});
879     next; # don't return headline in data_structure!
880     }
881    
882 dpavlin 439 # delimiter will join repeatable fields
883     if ($tag->{'delimiter'}) {
884     @v = ( join($tag->{'delimiter'}, @v) );
885 dpavlin 366 }
886 dpavlin 373
887 dpavlin 439 # default types
888     my @types = qw(display swish);
889     # override by type attribute
890     @types = ( $tag->{'type'} ) if ($tag->{'type'});
891 dpavlin 374
892 dpavlin 439 foreach my $type (@types) {
893     # append to previous line?
894     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
895     if ($tag->{'append'}) {
896    
897     # I will delimit appended part with
898     # delimiter (or ,)
899     my $d = $tag->{'delimiter'};
900     # default delimiter
901     $d ||= ", ";
902    
903     my $last = pop @{$row->{$type}};
904     $d = "" if (! $last);
905     $last .= $d . join($d, @v);
906     push @{$row->{$type}}, $last;
907    
908     } else {
909     push @{$row->{$type}}, @v;
910     }
911     }
912    
913    
914 dpavlin 366 }
915    
916 dpavlin 368 if ($row) {
917     $row->{'tag'} = $field;
918 dpavlin 375
919     # TODO: name_sigular, name_plural
920     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
921     $row->{'name'} = $name ? $self->_x($name) : $field;
922    
923 dpavlin 439 # post-sort all values in field
924     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
925     $log->warn("sort at field tag not implemented");
926     }
927    
928 dpavlin 368 push @ds, $row;
929 dpavlin 374
930 dpavlin 373 $log->debug("row $field: ",sub { Dumper($row) });
931 dpavlin 368 }
932 dpavlin 366
933     }
934    
935 dpavlin 370 return @ds;
936 dpavlin 366
937     }
938    
939 dpavlin 370 =head2 output
940    
941     Create output from in-memory data structure using Template Toolkit template.
942    
943     my $text = $webpac->output( template => 'text.tt', data => @ds );
944    
945     =cut
946    
947     sub output {
948     my $self = shift;
949    
950     my $args = {@_};
951    
952 dpavlin 372 my $log = $self->_get_logger();
953 dpavlin 370
954 dpavlin 372 $log->logconfess("need template name") if (! $args->{'template'});
955     $log->logconfess("need data array") if (! $args->{'data'});
956    
957 dpavlin 370 my $out;
958    
959     $self->{'tt'}->process(
960     $args->{'template'},
961     $args,
962     \$out
963     ) || confess $self->{'tt'}->error();
964    
965     return $out;
966     }
967    
968 dpavlin 411 =head2 output_file
969    
970     Create output from in-memory data structure using Template Toolkit template
971     to a file.
972    
973     $webpac->output_file(
974     file => 'out.txt',
975     template => 'text.tt',
976     data => @ds
977     );
978    
979     =cut
980    
981     sub output_file {
982     my $self = shift;
983    
984     my $args = {@_};
985    
986     my $log = $self->_get_logger();
987    
988 dpavlin 421 my $file = $args->{'file'} || $log->logconfess("need file name");
989 dpavlin 411
990 dpavlin 421 $log->debug("creating file ",$file);
991 dpavlin 411
992 dpavlin 421 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
993 dpavlin 411 print $fh $self->output(
994     template => $args->{'template'},
995     data => $args->{'data'},
996     ) || $log->logdie("print: $!");
997     close($fh) || $log->logdie("close: $!");
998     }
999    
1000 dpavlin 375 =head2 apply_format
1001    
1002     Apply format specified in tag with C<format_name="name"> and
1003     C<format_delimiter=";;">.
1004    
1005     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1006    
1007     Formats can contain C<lookup{...}> if you need them.
1008    
1009     =cut
1010    
1011     sub apply_format {
1012     my $self = shift;
1013    
1014     my ($name,$delimiter,$data) = @_;
1015    
1016     my $log = $self->_get_logger();
1017    
1018     if (! $self->{'import_xml'}->{'format'}->{$name}) {
1019     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1020     return $data;
1021     }
1022    
1023     $log->warn("no delimiter for format $name") if (! $delimiter);
1024    
1025     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1026    
1027     my @data = split(/\Q$delimiter\E/, $data);
1028    
1029     my $out = sprintf($format, @data);
1030     $log->debug("using format $name [$format] on $data to produce: $out");
1031    
1032     if ($out =~ m/$LOOKUP_REGEX/o) {
1033     return $self->lookup($out);
1034     } else {
1035     return $out;
1036     }
1037    
1038     }
1039    
1040    
1041 dpavlin 372 #
1042     #
1043     #
1044    
1045     =head1 INTERNAL METHODS
1046    
1047     Here is a quick list of internal methods, mostly useful to turn debugging
1048     on them (see L<LOGGING> below for explanation).
1049    
1050     =cut
1051    
1052     =head2 _eval
1053    
1054     Internal function to eval code without C<strict 'subs'>.
1055    
1056     =cut
1057    
1058     sub _eval {
1059     my $self = shift;
1060    
1061     my $code = shift || return;
1062    
1063     my $log = $self->_get_logger();
1064    
1065     no strict 'subs';
1066     my $ret = eval $code;
1067     if ($@) {
1068     $log->error("problem with eval code [$code]: $@");
1069     }
1070    
1071     $log->debug("eval: ",$code," [",$ret,"]");
1072    
1073     return $ret || 0;
1074     }
1075    
1076     =head2 _sort_by_order
1077    
1078     Sort xml tags data structure accoding to C<order=""> attribute.
1079    
1080     =cut
1081    
1082     sub _sort_by_order {
1083     my $self = shift;
1084    
1085     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1086     $self->{'import_xml'}->{'indexer'}->{$a};
1087     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1088     $self->{'import_xml'}->{'indexer'}->{$b};
1089    
1090     return $va <=> $vb;
1091     }
1092    
1093 dpavlin 375 =head2 _get_logger
1094    
1095     Get C<Log::Log4perl> object with a twist: domains are defined for each
1096     method
1097    
1098     my $log = $webpac->_get_logger();
1099    
1100     =cut
1101    
1102 dpavlin 372 sub _get_logger {
1103     my $self = shift;
1104    
1105 dpavlin 374 my $name = (caller(1))[3] || caller;
1106     return get_logger($name);
1107 dpavlin 372 }
1108    
1109 dpavlin 375 =head2 _x
1110    
1111     Convert string from UTF-8 to code page defined in C<import_xml>.
1112    
1113     my $text = $webpac->_x('utf8 text');
1114    
1115     =cut
1116    
1117     sub _x {
1118     my $self = shift;
1119     my $utf8 = shift || return;
1120    
1121     return $self->{'utf2cp'}->convert($utf8) ||
1122     $self->_get_logger()->logwarn("can't convert '$utf8'");
1123     }
1124    
1125 dpavlin 372 #
1126     #
1127     #
1128    
1129     =head1 LOGGING
1130    
1131     Logging in WebPAC is performed by L<Log::Log4perl> with config file
1132     C<log.conf>.
1133    
1134     Methods defined above have different levels of logging, so
1135     it's descriptions will be useful to turn (mostry B<debug> logging) on
1136     or off to see why WabPAC isn't perforing as you expect it (it might even
1137     be a bug!).
1138    
1139     B<This is different from normal Log4perl behaviour>. To repeat, you can
1140     also use method names, and not only classes (which are just few)
1141     to filter logging.
1142    
1143 dpavlin 422
1144     =head1 MEMORY USAGE
1145    
1146     C<low_mem> options is double-edged sword. If enabled, WebPAC
1147     will run on memory constraint machines (which doesn't have enough
1148     physical RAM to create memory structure for whole source database).
1149    
1150     If your machine has 512Mb or more of RAM and database is around 10000 records,
1151     memory shouldn't be an issue. If you don't have enough physical RAM, you
1152     might consider using virtual memory (if your operating system is handling it
1153     well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1154     parsed structure of ISIS database (this is what C<low_mem> option does).
1155    
1156     Hitting swap at end of reading source database is probably o.k. However,
1157     hitting swap before 90% will dramatically decrease performance and you will
1158     be better off with C<low_mem> and using rest of availble memory for
1159     operating system disk cache (Linux is particuallary good about this).
1160     However, every access to database record will require disk access, so
1161     generation phase will be slower 10-100 times.
1162    
1163     Parsed structures are essential - you just have option to trade RAM memory
1164     (which is fast) for disk space (which is slow). Be sure to have planty of
1165     disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1166    
1167     However, when WebPAC is running on desktop machines (or laptops :-), it's
1168     highly undesireable for system to start swapping. Using C<low_mem> option can
1169     reduce WecPAC memory usage to around 64Mb for same database with lookup
1170     fields and sorted indexes which stay in RAM. Performance will suffer, but
1171     memory usage will really be minimal. It might be also more confortable to
1172     run WebPAC reniced on those machines.
1173    
1174 dpavlin 372 =cut
1175    
1176 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26