/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 555 - (hide annotations)
Fri Oct 29 22:09:04 2004 UTC (19 years, 5 months ago) by dpavlin
File size: 27613 byte(s)
Special suport for non-repeatable fileds:

Now you can specify s900^a instead of v900^a to get one and singe value.
Have in mind that if you DON'T have any repeatable value in this filed, you
WILL create infinite loop. So, use some repeatable value.

This is however, very useful for situation where you want to reference one
non-repeatable value from field which is repeatable.

There is experimental code that checks for infitite loops, but it's not
working right (and I'm too tired to fix this now).


1 dpavlin 354 package WebPAC;
2 dpavlin 352
3 dpavlin 367 use warnings;
4     use strict;
5    
6 dpavlin 352 use Carp;
7 dpavlin 353 use Text::Iconv;
8     use Config::IniFiles;
9 dpavlin 363 use XML::Simple;
10 dpavlin 370 use Template;
11 dpavlin 372 use Log::Log4perl qw(get_logger :levels);
12 dpavlin 422 use Time::HiRes qw(time);
13 dpavlin 352
14 dpavlin 358 use Data::Dumper;
15    
16 dpavlin 373 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17     #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18     my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19     my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20    
21 dpavlin 352 =head1 NAME
22    
23 dpavlin 354 WebPAC - base class for WebPAC
24 dpavlin 352
25     =head1 DESCRIPTION
26    
27 dpavlin 354 This module implements methods used by WebPAC.
28 dpavlin 352
29     =head1 METHODS
30    
31     =head2 new
32    
33 dpavlin 421 Create new instance of WebPAC using configuration specified by C<config_file>.
34 dpavlin 352
35 dpavlin 354 my $webpac = new WebPAC(
36 dpavlin 352 config_file => 'name.conf',
37 dpavlin 439 code_page => 'ISO-8859-2',
38     low_mem => 1,
39 dpavlin 500 filter => {
40     'lower' => sub { lc($_[0]) },
41     },
42 dpavlin 352 );
43    
44     Default C<code_page> is C<ISO-8859-2>.
45    
46 dpavlin 422 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
47    
48 dpavlin 500 There is optinal parametar C<filter> which specify different filters which
49     can be applied using C<filter{name}> notation.
50    
51 dpavlin 421 This method will also read configuration files
52 dpavlin 353 C<global.conf> (used by indexer and Web font-end)
53     and configuration file specified by C<config_file>
54     which describes databases to be indexed.
55    
56 dpavlin 352 =cut
57    
58 dpavlin 363 # mapping between data type and tag which specify
59     # format in XML file
60     my %type2tag = (
61     'isis' => 'isis',
62     # 'excel' => 'column',
63     # 'marc' => 'marc',
64     # 'feed' => 'feed'
65     );
66    
67 dpavlin 352 sub new {
68     my $class = shift;
69     my $self = {@_};
70     bless($self, $class);
71    
72 dpavlin 422 $self->{'start_t'} = time();
73    
74 dpavlin 372 my $log_file = $self->{'log'} || "log.conf";
75     Log::Log4perl->init($log_file);
76    
77     my $log = $self->_get_logger();
78    
79 dpavlin 352 # fill in default values
80     # output codepage
81     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
82    
83 dpavlin 353 #
84     # read global.conf
85     #
86 dpavlin 372 $log->debug("read 'global.conf'");
87 dpavlin 352
88 dpavlin 372 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
89 dpavlin 352
90     # read global config parametars
91     foreach my $var (qw(
92     dbi_dbd
93     dbi_dsn
94     dbi_user
95     dbi_passwd
96     show_progress
97     my_unac_filter
98 dpavlin 370 output_template
99 dpavlin 352 )) {
100 dpavlin 370 $self->{'global_config'}->{$var} = $config->val('global', $var);
101 dpavlin 352 }
102    
103 dpavlin 353 #
104     # read indexer config file
105     #
106 dpavlin 352
107 dpavlin 372 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
108 dpavlin 352
109 dpavlin 370 # create UTF-8 convertor for import_xml files
110 dpavlin 366 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
111 dpavlin 370
112     # create Template toolkit instance
113     $self->{'tt'} = Template->new(
114     INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
115     # FILTERS => {
116     # 'foo' => \&foo_filter,
117     # },
118     EVAL_PERL => 1,
119     );
120    
121 dpavlin 421 # running with low_mem flag? well, use DBM::Deep then.
122     if ($self->{'low_mem'}) {
123 dpavlin 422 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
124 dpavlin 421
125     my $db_file = "data.db";
126    
127     if (-e $db_file) {
128     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
129     $log->debug("removed '$db_file' from last run");
130     }
131    
132 dpavlin 434 require DBM::Deep;
133 dpavlin 421
134     my $db = new DBM::Deep $db_file;
135    
136     $log->logdie("DBM::Deep error: $!") unless ($db);
137    
138     if ($db->error()) {
139     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
140     } else {
141 dpavlin 422 $log->debug("using file '$db_file' for DBM::Deep");
142 dpavlin 421 }
143    
144     $self->{'db'} = $db;
145     }
146    
147 dpavlin 500 $log->debug("filters defined: ",Dumper($self->{'filter'}));
148    
149 dpavlin 352 return $self;
150     }
151    
152     =head2 open_isis
153    
154     Open CDS/ISIS database using OpenIsis module and read all records to memory.
155    
156     $webpac->open_isis(
157     filename => '/data/ISIS/ISIS',
158     code_page => '852',
159 dpavlin 431 limit_mfn => 500,
160     start_mfn => 6000,
161 dpavlin 352 lookup => [ ... ],
162     );
163    
164     By default, ISIS code page is assumed to be C<852>.
165    
166 dpavlin 431 If optional parametar C<start_mfn> is set, this will be first MFN to read
167     from database (so you can skip beginning of your database if you need to).
168    
169 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
170     from database in example above.
171 dpavlin 352
172     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
173     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
174     value in index.
175    
176     lookup => [
177     { 'key' => 'd:v900', 'val' => 'v250^a' },
178     { 'eval' => '"v901^a" eq "Podruèje"',
179     'key' => 'pa:v561^4:v562^4:v461^1',
180     'val' => 'v900' },
181     ]
182    
183 dpavlin 367 Returns number of last record read into memory (size of database, really).
184    
185 dpavlin 352 =cut
186    
187     sub open_isis {
188     my $self = shift;
189     my $arg = {@_};
190    
191 dpavlin 372 my $log = $self->_get_logger();
192    
193     $log->logcroak("need filename") if (! $arg->{'filename'});
194 dpavlin 352 my $code_page = $arg->{'code_page'} || '852';
195    
196 dpavlin 436 $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
197    
198 dpavlin 421 # store data in object
199     $self->{'isis_filename'} = $arg->{'filename'};
200     $self->{'isis_code_page'} = $code_page;
201    
202 dpavlin 353 use OpenIsis;
203    
204 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
205    
206     # create Text::Iconv object
207     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
208    
209 dpavlin 372 $log->info("reading ISIS database '",$arg->{'filename'},"'");
210 dpavlin 421 $log->debug("isis code page: $code_page");
211 dpavlin 363
212 dpavlin 352 my $isis_db = OpenIsis::open($arg->{'filename'});
213    
214     my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
215 dpavlin 431 my $startmfn = 1;
216 dpavlin 352
217 dpavlin 431 if (my $s = $self->{'start_mfn'}) {
218     $log->info("skipping to MFN $s");
219     $startmfn = $s;
220 dpavlin 448 } else {
221     $self->{'start_mfn'} = $startmfn;
222 dpavlin 431 }
223 dpavlin 357
224 dpavlin 431 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
225 dpavlin 363
226 dpavlin 431 $log->info("processing ",($maxmfn-$startmfn)." records...");
227    
228 dpavlin 352 # read database
229 dpavlin 431 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
230 dpavlin 352
231 dpavlin 418
232     $log->debug("mfn: $mfn\n");
233    
234 dpavlin 421 my $rec;
235    
236 dpavlin 352 # read record
237     my $row = OpenIsis::read( $isis_db, $mfn );
238     foreach my $k (keys %{$row}) {
239     if ($k ne "mfn") {
240     foreach my $l (@{$row->{$k}}) {
241     $l = $cp->convert($l);
242     # has subfields?
243     my $val;
244     if ($l =~ m/\^/) {
245     foreach my $t (split(/\^/,$l)) {
246     next if (! $t);
247     $val->{substr($t,0,1)} = substr($t,1);
248     }
249     } else {
250     $val = $l;
251     }
252    
253 dpavlin 421 push @{$rec->{$k}}, $val;
254 dpavlin 352 }
255 dpavlin 374 } else {
256 dpavlin 421 push @{$rec->{'000'}}, $mfn;
257 dpavlin 352 }
258    
259     }
260    
261 dpavlin 421 $log->confess("record $mfn empty?") unless ($rec);
262    
263     # store
264     if ($self->{'low_mem'}) {
265     $self->{'db'}->put($mfn, $rec);
266     } else {
267     $self->{'data'}->{$mfn} = $rec;
268     }
269    
270 dpavlin 352 # create lookup
271 dpavlin 355 $self->create_lookup($rec, @{$arg->{'lookup'}});
272 dpavlin 352
273 dpavlin 389 $self->progress_bar($mfn,$maxmfn);
274    
275 dpavlin 352 }
276    
277 dpavlin 448 $self->{'current_mfn'} = -1;
278 dpavlin 389 $self->{'last_pcnt'} = 0;
279 dpavlin 362
280 dpavlin 418 $log->debug("max mfn: $maxmfn");
281    
282 dpavlin 352 # store max mfn and return it.
283     return $self->{'max_mfn'} = $maxmfn;
284     }
285    
286 dpavlin 362 =head2 fetch_rec
287    
288     Fetch next record from database. It will also display progress bar (once
289     it's implemented, that is).
290    
291     my $rec = $webpac->fetch_rec;
292    
293     =cut
294    
295     sub fetch_rec {
296     my $self = shift;
297    
298 dpavlin 372 my $log = $self->_get_logger();
299 dpavlin 362
300 dpavlin 448 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
301 dpavlin 372
302 dpavlin 448 if ($self->{'current_mfn'} == -1) {
303     $self->{'current_mfn'} = $self->{'start_mfn'};
304     } else {
305     $self->{'current_mfn'}++;
306     }
307    
308     my $mfn = $self->{'current_mfn'};
309    
310 dpavlin 362 if ($mfn > $self->{'max_mfn'}) {
311     $self->{'current_mfn'} = $self->{'max_mfn'};
312 dpavlin 373 $log->debug("at EOF");
313 dpavlin 362 return;
314     }
315    
316 dpavlin 389 $self->progress_bar($mfn,$self->{'max_mfn'});
317    
318 dpavlin 421 if ($self->{'low_mem'}) {
319     return $self->{'db'}->get($mfn);
320     } else {
321     return $self->{'data'}->{$mfn};
322     }
323 dpavlin 362 }
324    
325 dpavlin 448 =head2 mfn
326    
327     Returns current record number (MFN).
328    
329     print $webpac->mfn;
330    
331     =cut
332    
333     sub mfn {
334     my $self = shift;
335     return $self->{'current_mfn'};
336     }
337    
338 dpavlin 389 =head2 progress_bar
339    
340     Draw progress bar on STDERR.
341    
342     $webpac->progress_bar($current, $max);
343    
344     =cut
345    
346     sub progress_bar {
347     my $self = shift;
348    
349     my ($curr,$max) = @_;
350    
351     my $log = $self->_get_logger();
352    
353     $log->logconfess("no current value!") if (! $curr);
354     $log->logconfess("no maximum value!") if (! $max);
355    
356     if ($curr > $max) {
357     $max = $curr;
358     $log->debug("overflow to $curr");
359     }
360    
361     $self->{'last_pcnt'} ||= 1;
362    
363 dpavlin 454 my $p = int($curr * 100 / $max) || 1;
364 dpavlin 389
365 dpavlin 422 # reset on re-run
366     if ($p < $self->{'last_pcnt'}) {
367     $self->{'last_pcnt'} = $p;
368     $self->{'last_t'} = time();
369 dpavlin 453 $self->{'last_curr'} = undef;
370 dpavlin 422 }
371    
372 dpavlin 459 $self->{'last_t'} ||= time();
373    
374 dpavlin 389 if ($p != $self->{'last_pcnt'}) {
375 dpavlin 422
376     my $last_curr = $self->{'last_curr'} || $curr;
377     my $t = time();
378     my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
379     my $eta = ($max-$curr) / ($rate || 1);
380     printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
381 dpavlin 389 $self->{'last_pcnt'} = $p;
382 dpavlin 422 $self->{'last_t'} = time();
383     $self->{'last_curr'} = $curr;
384 dpavlin 389 }
385 dpavlin 412 print STDERR "\n" if ($p == 100);
386 dpavlin 389 }
387    
388 dpavlin 422 =head2 fmt_time
389    
390     Format time (in seconds) for display.
391    
392     print $webpac->fmt_time(time());
393    
394     This method is called by L<progress_bar> to display remaining time.
395    
396     =cut
397    
398     sub fmt_time {
399     my $self = shift;
400    
401     my $t = shift || 0;
402     my $out = "";
403    
404     my ($ss,$mm,$hh) = gmtime($t);
405     $out .= "${hh}h" if ($hh);
406     $out .= sprintf("%02d:%02d", $mm,$ss);
407     $out .= " " if ($hh == 0);
408     return $out;
409     }
410    
411 dpavlin 363 =head2 open_import_xml
412    
413     Read file from C<import_xml/> directory and parse it.
414    
415     $webpac->open_import_xml(type => 'isis');
416    
417     =cut
418    
419     sub open_import_xml {
420     my $self = shift;
421    
422 dpavlin 372 my $log = $self->_get_logger();
423    
424 dpavlin 363 my $arg = {@_};
425 dpavlin 372 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
426 dpavlin 363
427 dpavlin 366 $self->{'type'} = $arg->{'type'};
428 dpavlin 363
429 dpavlin 366 my $type_base = $arg->{'type'};
430 dpavlin 363 $type_base =~ s/_.*$//g;
431    
432 dpavlin 366 $self->{'tag'} = $type2tag{$type_base};
433    
434 dpavlin 375 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
435 dpavlin 366
436     my $f = "./import_xml/".$self->{'type'}.".xml";
437 dpavlin 372 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
438 dpavlin 363
439 dpavlin 375 $log->info("reading '$f'");
440 dpavlin 363
441 dpavlin 375 $self->{'import_xml_file'} = $f;
442    
443 dpavlin 363 $self->{'import_xml'} = XMLin($f,
444 dpavlin 366 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
445 dpavlin 363 );
446    
447 dpavlin 375 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
448    
449 dpavlin 363 }
450    
451 dpavlin 355 =head2 create_lookup
452    
453     Create lookup from record using lookup definition.
454    
455 dpavlin 367 $self->create_lookup($rec, @lookups);
456    
457     Called internally by C<open_*> methods.
458    
459 dpavlin 355 =cut
460    
461     sub create_lookup {
462     my $self = shift;
463    
464 dpavlin 372 my $log = $self->_get_logger();
465 dpavlin 355
466 dpavlin 372 my $rec = shift || $log->logconfess("need record to create lookup");
467     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
468    
469 dpavlin 355 foreach my $i (@_) {
470 dpavlin 418 $log->logconfess("need key") unless defined($i->{'key'});
471     $log->logconfess("need val") unless defined($i->{'val'});
472    
473     if (defined($i->{'eval'})) {
474     # eval first, so we can skip fill_in for key and val
475     my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
476     if ($self->_eval($eval)) {
477     my $key = $self->fill_in($rec,$i->{'key'}) || next;
478     my @val = $self->fill_in($rec,$i->{'val'}) || next;
479 dpavlin 373 $log->debug("stored $key = ",sub { join(" | ",@val) });
480 dpavlin 355 push @{$self->{'lookup'}->{$key}}, @val;
481     }
482     } else {
483 dpavlin 418 my $key = $self->fill_in($rec,$i->{'key'}) || next;
484     my @val = $self->fill_in($rec,$i->{'val'}) || next;
485     $log->debug("stored $key = ",sub { join(" | ",@val) });
486     push @{$self->{'lookup'}->{$key}}, @val;
487 dpavlin 355 }
488     }
489     }
490    
491 dpavlin 356 =head2 get_data
492    
493     Returns value from record.
494    
495 dpavlin 367 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
496 dpavlin 356
497     Arguments are:
498     record reference C<$rec>,
499     field C<$f>,
500     optional subfiled C<$sf>,
501     index for repeatable values C<$i>.
502    
503 dpavlin 367 Optinal variable C<$found> will be incremeted if there
504 dpavlin 356 is field.
505    
506     Returns value or empty string.
507    
508     =cut
509    
510     sub get_data {
511     my $self = shift;
512    
513     my ($rec,$f,$sf,$i,$found) = @_;
514 dpavlin 367
515 dpavlin 356 if ($$rec->{$f}) {
516 dpavlin 367 return '' if (! $$rec->{$f}->[$i]);
517 dpavlin 389 no strict 'refs';
518 dpavlin 356 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
519     $$found++ if (defined($$found));
520     return $$rec->{$f}->[$i]->{$sf};
521     } elsif ($$rec->{$f}->[$i]) {
522     $$found++ if (defined($$found));
523 dpavlin 366 # it still might have subfield, just
524     # not specified, so we'll dump all
525     if ($$rec->{$f}->[$i] =~ /HASH/o) {
526     my $out;
527     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
528     $out .= $$rec->{$f}->[$i]->{$k}." ";
529     }
530     return $out;
531     } else {
532     return $$rec->{$f}->[$i];
533     }
534 dpavlin 356 }
535     } else {
536     return '';
537     }
538     }
539    
540 dpavlin 352 =head2 fill_in
541    
542     Workhourse of all: takes record from in-memory structure of database and
543     strings with placeholders and returns string or array of with substituted
544     values from record.
545    
546 dpavlin 367 my $text = $webpac->fill_in($rec,'v250^a');
547 dpavlin 352
548     Optional argument is ordinal number for repeatable fields. By default,
549 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
550     element is 0).
551     Following example will read second value from repeatable field.
552 dpavlin 352
553 dpavlin 367 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
554 dpavlin 353
555     This function B<does not> perform parsing of format to inteligenty skip
556     delimiters before fields which aren't used.
557    
558 dpavlin 376 This method will automatically decode UTF-8 string to local code page
559     if needed.
560    
561 dpavlin 352 =cut
562    
563 dpavlin 372 sub fill_in {
564 dpavlin 371 my $self = shift;
565    
566 dpavlin 372 my $log = $self->_get_logger();
567 dpavlin 371
568 dpavlin 372 my $rec = shift || $log->logconfess("need data record");
569     my $format = shift || $log->logconfess("need format to parse");
570 dpavlin 352 # iteration (for repeatable fields)
571     my $i = shift || 0;
572    
573 dpavlin 555 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
574    
575 dpavlin 352 # FIXME remove for speedup?
576 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
577 dpavlin 352
578 dpavlin 376 if (utf8::is_utf8($format)) {
579     $format = $self->_x($format);
580     }
581    
582 dpavlin 352 my $found = 0;
583    
584 dpavlin 359 my $eval_code;
585     # remove eval{...} from beginning
586     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
587    
588 dpavlin 500 my $filter_name;
589     # remove filter{...} from beginning
590     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
591    
592 dpavlin 352 # do actual replacement of placeholders
593 dpavlin 555 # repeatable fields
594 dpavlin 373 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
595 dpavlin 555 # non-repeatable fields
596     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
597 dpavlin 352
598 dpavlin 353 if ($found) {
599 dpavlin 373 $log->debug("format: $format");
600 dpavlin 359 if ($eval_code) {
601     my $eval = $self->fill_in($rec,$eval_code,$i);
602 dpavlin 371 return if (! $self->_eval($eval));
603 dpavlin 359 }
604 dpavlin 500 if ($filter_name && $self->{'filter'}->{$filter_name}) {
605     $log->debug("filter '$filter_name' for $format");
606     $format = $self->{'filter'}->{$filter_name}->($format);
607     return unless(defined($format));
608     $log->debug("filter result: $format");
609     }
610 dpavlin 353 # do we have lookups?
611 dpavlin 373 if ($format =~ /$LOOKUP_REGEX/o) {
612     $log->debug("format '$format' has lookup");
613 dpavlin 353 return $self->lookup($format);
614     } else {
615     return $format;
616     }
617 dpavlin 352 } else {
618     return;
619     }
620     }
621    
622     =head2 lookup
623    
624 dpavlin 355 Perform lookups on format supplied to it.
625 dpavlin 352
626 dpavlin 367 my $text = $self->lookup('[v900]');
627 dpavlin 352
628 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
629    
630 dpavlin 352 =cut
631    
632     sub lookup {
633     my $self = shift;
634    
635 dpavlin 372 my $log = $self->_get_logger();
636 dpavlin 352
637 dpavlin 372 my $tmp = shift || $log->logconfess("need format");
638    
639 dpavlin 373 if ($tmp =~ /$LOOKUP_REGEX/o) {
640 dpavlin 352 my @in = ( $tmp );
641 dpavlin 372
642     $log->debug("lookup for: ",$tmp);
643    
644 dpavlin 352 my @out;
645     while (my $f = shift @in) {
646 dpavlin 373 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
647 dpavlin 352 my $k = $1;
648     if ($self->{'lookup'}->{$k}) {
649     foreach my $nv (@{$self->{'lookup'}->{$k}}) {
650     my $tmp2 = $f;
651 dpavlin 373 $tmp2 =~ s/lookup{$k}/$nv/g;
652 dpavlin 352 push @in, $tmp2;
653     }
654     } else {
655     undef $f;
656     }
657     } elsif ($f) {
658     push @out, $f;
659     }
660     }
661 dpavlin 373 $log->logconfess("return is array and it's not expected!") unless wantarray;
662 dpavlin 352 return @out;
663     } else {
664     return $tmp;
665     }
666     }
667    
668 dpavlin 356 =head2 parse
669    
670     Perform smart parsing of string, skipping delimiters for fields which aren't
671     defined. It can also eval code in format starting with C<eval{...}> and
672     return output or nothing depending on eval code.
673    
674 dpavlin 367 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
675 dpavlin 356
676     =cut
677    
678     sub parse {
679     my $self = shift;
680    
681 dpavlin 366 my ($rec, $format_utf8, $i) = @_;
682 dpavlin 356
683 dpavlin 366 return if (! $format_utf8);
684    
685 dpavlin 372 my $log = $self->_get_logger();
686 dpavlin 358
687 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
688     $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
689    
690 dpavlin 358 $i = 0 if (! $i);
691    
692 dpavlin 375 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
693 dpavlin 366
694 dpavlin 356 my @out;
695    
696 dpavlin 373 $log->debug("format: $format");
697    
698 dpavlin 356 my $eval_code;
699     # remove eval{...} from beginning
700     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
701    
702 dpavlin 500 my $filter_name;
703     # remove filter{...} from beginning
704     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
705    
706 dpavlin 358 my $prefix;
707     my $all_found=0;
708 dpavlin 356
709 dpavlin 555 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
710 dpavlin 356
711 dpavlin 358 my $del = $1 || '';
712 dpavlin 359 $prefix ||= $del if ($all_found == 0);
713 dpavlin 358
714 dpavlin 555 # repeatable index
715     my $r = $i;
716     $r = 0 if (lc("$2") eq 's');
717    
718 dpavlin 358 my $found = 0;
719 dpavlin 555 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
720 dpavlin 358
721 dpavlin 356 if ($found) {
722 dpavlin 358 push @out, $del;
723     push @out, $tmp;
724     $all_found += $found;
725 dpavlin 356 }
726     }
727    
728 dpavlin 358 return if (! $all_found);
729 dpavlin 356
730 dpavlin 373 my $out = join('',@out);
731 dpavlin 358
732 dpavlin 373 if ($out) {
733     # add rest of format (suffix)
734     $out .= $format;
735 dpavlin 367
736 dpavlin 373 # add prefix if not there
737     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
738    
739     $log->debug("result: $out");
740     }
741    
742 dpavlin 359 if ($eval_code) {
743 dpavlin 501 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
744 dpavlin 500 $log->debug("about to eval{$eval} format: $out");
745 dpavlin 371 return if (! $self->_eval($eval));
746 dpavlin 359 }
747 dpavlin 500
748     if ($filter_name && $self->{'filter'}->{$filter_name}) {
749     $log->debug("about to filter{$filter_name} format: $out");
750     $out = $self->{'filter'}->{$filter_name}->($out);
751     return unless(defined($out));
752     $log->debug("filter result: $out");
753     }
754 dpavlin 359
755 dpavlin 358 return $out;
756 dpavlin 356 }
757    
758 dpavlin 367 =head2 parse_to_arr
759    
760     Similar to C<parse>, but returns array of all repeatable fields
761    
762     my @arr = $webpac->parse_to_arr($rec,'v250^a');
763    
764     =cut
765    
766     sub parse_to_arr {
767     my $self = shift;
768    
769     my ($rec, $format_utf8) = @_;
770    
771 dpavlin 372 my $log = $self->_get_logger();
772    
773     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
774 dpavlin 367 return if (! $format_utf8);
775    
776     my $i = 0;
777     my @arr;
778    
779     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
780     push @arr, $v;
781     }
782    
783 dpavlin 373 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
784    
785 dpavlin 367 return @arr;
786     }
787    
788 dpavlin 373 =head2 fill_in_to_arr
789    
790     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
791     for fields which have lookups, so they shouldn't be parsed but rather
792     C<fill_id>ed.
793    
794     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
795    
796     =cut
797    
798     sub fill_in_to_arr {
799     my $self = shift;
800    
801     my ($rec, $format_utf8) = @_;
802    
803     my $log = $self->_get_logger();
804    
805     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
806     return if (! $format_utf8);
807    
808     my $i = 0;
809     my @arr;
810    
811     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
812     push @arr, @v;
813     }
814    
815     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
816    
817     return @arr;
818     }
819    
820 dpavlin 441 =head2 sort_arr
821 dpavlin 373
822 dpavlin 441 Sort array ignoring case and html in data
823    
824     my @sorted = $webpac->sort_arr(@unsorted);
825    
826     =cut
827    
828     sub sort_arr {
829     my $self = shift;
830    
831     my $log = $self->_get_logger();
832    
833     # FIXME add Schwartzian Transformation?
834    
835     my @sorted = sort {
836     $a =~ s#<[^>]+/*>##;
837     $b =~ s#<[^>]+/*>##;
838     lc($b) cmp lc($a)
839     } @_;
840     $log->debug("sorted values: ",sub { join(", ",@sorted) });
841    
842     return @sorted;
843     }
844    
845    
846 dpavlin 366 =head2 data_structure
847    
848     Create in-memory data structure which represents layout from C<import_xml>.
849     It is used later to produce output.
850    
851 dpavlin 368 my @ds = $webpac->data_structure($rec);
852 dpavlin 366
853 dpavlin 374 This method will also set C<$webpac->{'currnet_filename'}> if there is
854 dpavlin 398 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
855     <headline> tag.
856 dpavlin 374
857 dpavlin 366 =cut
858    
859 dpavlin 372 sub data_structure {
860 dpavlin 366 my $self = shift;
861    
862 dpavlin 372 my $log = $self->_get_logger();
863 dpavlin 366
864     my $rec = shift;
865 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
866 dpavlin 366
867 dpavlin 374 undef $self->{'currnet_filename'};
868 dpavlin 398 undef $self->{'headline'};
869 dpavlin 374
870 dpavlin 366 my @sorted_tags;
871     if ($self->{tags_by_order}) {
872     @sorted_tags = @{$self->{tags_by_order}};
873     } else {
874     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
875     $self->{tags_by_order} = \@sorted_tags;
876     }
877    
878 dpavlin 368 my @ds;
879 dpavlin 366
880 dpavlin 373 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
881    
882 dpavlin 366 foreach my $field (@sorted_tags) {
883    
884     my $row;
885    
886     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
887    
888     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
889 dpavlin 373 my $format = $tag->{'value'} || $tag->{'content'};
890 dpavlin 366
891 dpavlin 373 $log->debug("format: $format");
892    
893     my @v;
894     if ($format =~ /$LOOKUP_REGEX/o) {
895     @v = $self->fill_in_to_arr($rec,$format);
896     } else {
897     @v = $self->parse_to_arr($rec,$format);
898     }
899 dpavlin 367 next if (! @v);
900 dpavlin 366
901 dpavlin 439 if ($tag->{'sort'}) {
902 dpavlin 441 @v = $self->sort_arr(@v);
903 dpavlin 439 $log->warn("sort within tag is usually not what you want!");
904     }
905    
906 dpavlin 375 # use format?
907     if ($tag->{'format_name'}) {
908     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
909     }
910    
911 dpavlin 398 if ($field eq 'filename') {
912     $self->{'current_filename'} = join('',@v);
913     $log->debug("filename: ",$self->{'current_filename'});
914     } elsif ($field eq 'headline') {
915     $self->{'headline'} .= join('',@v);
916     $log->debug("headline: ",$self->{'headline'});
917     next; # don't return headline in data_structure!
918     }
919    
920 dpavlin 439 # delimiter will join repeatable fields
921     if ($tag->{'delimiter'}) {
922     @v = ( join($tag->{'delimiter'}, @v) );
923 dpavlin 366 }
924 dpavlin 373
925 dpavlin 439 # default types
926     my @types = qw(display swish);
927     # override by type attribute
928     @types = ( $tag->{'type'} ) if ($tag->{'type'});
929 dpavlin 374
930 dpavlin 439 foreach my $type (@types) {
931     # append to previous line?
932     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
933     if ($tag->{'append'}) {
934    
935     # I will delimit appended part with
936     # delimiter (or ,)
937     my $d = $tag->{'delimiter'};
938     # default delimiter
939 dpavlin 501 $d ||= " ";
940 dpavlin 439
941     my $last = pop @{$row->{$type}};
942     $d = "" if (! $last);
943     $last .= $d . join($d, @v);
944     push @{$row->{$type}}, $last;
945    
946     } else {
947     push @{$row->{$type}}, @v;
948     }
949     }
950    
951    
952 dpavlin 366 }
953    
954 dpavlin 368 if ($row) {
955     $row->{'tag'} = $field;
956 dpavlin 375
957     # TODO: name_sigular, name_plural
958     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
959     $row->{'name'} = $name ? $self->_x($name) : $field;
960    
961 dpavlin 439 # post-sort all values in field
962     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
963     $log->warn("sort at field tag not implemented");
964     }
965    
966 dpavlin 368 push @ds, $row;
967 dpavlin 374
968 dpavlin 373 $log->debug("row $field: ",sub { Dumper($row) });
969 dpavlin 368 }
970 dpavlin 366
971     }
972    
973 dpavlin 370 return @ds;
974 dpavlin 366
975     }
976    
977 dpavlin 370 =head2 output
978    
979     Create output from in-memory data structure using Template Toolkit template.
980    
981     my $text = $webpac->output( template => 'text.tt', data => @ds );
982    
983     =cut
984    
985     sub output {
986     my $self = shift;
987    
988     my $args = {@_};
989    
990 dpavlin 372 my $log = $self->_get_logger();
991 dpavlin 370
992 dpavlin 372 $log->logconfess("need template name") if (! $args->{'template'});
993     $log->logconfess("need data array") if (! $args->{'data'});
994    
995 dpavlin 370 my $out;
996    
997     $self->{'tt'}->process(
998     $args->{'template'},
999     $args,
1000     \$out
1001     ) || confess $self->{'tt'}->error();
1002    
1003     return $out;
1004     }
1005    
1006 dpavlin 411 =head2 output_file
1007    
1008     Create output from in-memory data structure using Template Toolkit template
1009     to a file.
1010    
1011     $webpac->output_file(
1012     file => 'out.txt',
1013     template => 'text.tt',
1014     data => @ds
1015     );
1016    
1017     =cut
1018    
1019     sub output_file {
1020     my $self = shift;
1021    
1022     my $args = {@_};
1023    
1024     my $log = $self->_get_logger();
1025    
1026 dpavlin 421 my $file = $args->{'file'} || $log->logconfess("need file name");
1027 dpavlin 411
1028 dpavlin 421 $log->debug("creating file ",$file);
1029 dpavlin 411
1030 dpavlin 421 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1031 dpavlin 411 print $fh $self->output(
1032     template => $args->{'template'},
1033     data => $args->{'data'},
1034     ) || $log->logdie("print: $!");
1035     close($fh) || $log->logdie("close: $!");
1036     }
1037    
1038 dpavlin 375 =head2 apply_format
1039    
1040     Apply format specified in tag with C<format_name="name"> and
1041     C<format_delimiter=";;">.
1042    
1043     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1044    
1045     Formats can contain C<lookup{...}> if you need them.
1046    
1047     =cut
1048    
1049     sub apply_format {
1050     my $self = shift;
1051    
1052     my ($name,$delimiter,$data) = @_;
1053    
1054     my $log = $self->_get_logger();
1055    
1056     if (! $self->{'import_xml'}->{'format'}->{$name}) {
1057     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1058     return $data;
1059     }
1060    
1061     $log->warn("no delimiter for format $name") if (! $delimiter);
1062    
1063     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1064    
1065     my @data = split(/\Q$delimiter\E/, $data);
1066    
1067     my $out = sprintf($format, @data);
1068     $log->debug("using format $name [$format] on $data to produce: $out");
1069    
1070     if ($out =~ m/$LOOKUP_REGEX/o) {
1071     return $self->lookup($out);
1072     } else {
1073     return $out;
1074     }
1075    
1076     }
1077    
1078    
1079 dpavlin 372 #
1080     #
1081     #
1082    
1083     =head1 INTERNAL METHODS
1084    
1085     Here is a quick list of internal methods, mostly useful to turn debugging
1086     on them (see L<LOGGING> below for explanation).
1087    
1088     =cut
1089    
1090     =head2 _eval
1091    
1092     Internal function to eval code without C<strict 'subs'>.
1093    
1094     =cut
1095    
1096     sub _eval {
1097     my $self = shift;
1098    
1099     my $code = shift || return;
1100    
1101     my $log = $self->_get_logger();
1102    
1103     no strict 'subs';
1104     my $ret = eval $code;
1105     if ($@) {
1106     $log->error("problem with eval code [$code]: $@");
1107     }
1108    
1109     $log->debug("eval: ",$code," [",$ret,"]");
1110    
1111 dpavlin 500 return $ret || undef;
1112 dpavlin 372 }
1113    
1114     =head2 _sort_by_order
1115    
1116     Sort xml tags data structure accoding to C<order=""> attribute.
1117    
1118     =cut
1119    
1120     sub _sort_by_order {
1121     my $self = shift;
1122    
1123     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1124     $self->{'import_xml'}->{'indexer'}->{$a};
1125     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1126     $self->{'import_xml'}->{'indexer'}->{$b};
1127    
1128     return $va <=> $vb;
1129     }
1130    
1131 dpavlin 375 =head2 _get_logger
1132    
1133     Get C<Log::Log4perl> object with a twist: domains are defined for each
1134     method
1135    
1136     my $log = $webpac->_get_logger();
1137    
1138     =cut
1139    
1140 dpavlin 372 sub _get_logger {
1141     my $self = shift;
1142    
1143 dpavlin 374 my $name = (caller(1))[3] || caller;
1144     return get_logger($name);
1145 dpavlin 372 }
1146    
1147 dpavlin 375 =head2 _x
1148    
1149     Convert string from UTF-8 to code page defined in C<import_xml>.
1150    
1151     my $text = $webpac->_x('utf8 text');
1152    
1153     =cut
1154    
1155     sub _x {
1156     my $self = shift;
1157     my $utf8 = shift || return;
1158    
1159     return $self->{'utf2cp'}->convert($utf8) ||
1160     $self->_get_logger()->logwarn("can't convert '$utf8'");
1161     }
1162    
1163 dpavlin 372 #
1164     #
1165     #
1166    
1167     =head1 LOGGING
1168    
1169     Logging in WebPAC is performed by L<Log::Log4perl> with config file
1170     C<log.conf>.
1171    
1172     Methods defined above have different levels of logging, so
1173     it's descriptions will be useful to turn (mostry B<debug> logging) on
1174     or off to see why WabPAC isn't perforing as you expect it (it might even
1175     be a bug!).
1176    
1177     B<This is different from normal Log4perl behaviour>. To repeat, you can
1178     also use method names, and not only classes (which are just few)
1179     to filter logging.
1180    
1181 dpavlin 422
1182     =head1 MEMORY USAGE
1183    
1184     C<low_mem> options is double-edged sword. If enabled, WebPAC
1185     will run on memory constraint machines (which doesn't have enough
1186     physical RAM to create memory structure for whole source database).
1187    
1188     If your machine has 512Mb or more of RAM and database is around 10000 records,
1189     memory shouldn't be an issue. If you don't have enough physical RAM, you
1190     might consider using virtual memory (if your operating system is handling it
1191     well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1192     parsed structure of ISIS database (this is what C<low_mem> option does).
1193    
1194     Hitting swap at end of reading source database is probably o.k. However,
1195     hitting swap before 90% will dramatically decrease performance and you will
1196     be better off with C<low_mem> and using rest of availble memory for
1197     operating system disk cache (Linux is particuallary good about this).
1198     However, every access to database record will require disk access, so
1199     generation phase will be slower 10-100 times.
1200    
1201     Parsed structures are essential - you just have option to trade RAM memory
1202     (which is fast) for disk space (which is slow). Be sure to have planty of
1203     disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1204    
1205     However, when WebPAC is running on desktop machines (or laptops :-), it's
1206     highly undesireable for system to start swapping. Using C<low_mem> option can
1207     reduce WecPAC memory usage to around 64Mb for same database with lookup
1208     fields and sorted indexes which stay in RAM. Performance will suffer, but
1209     memory usage will really be minimal. It might be also more confortable to
1210     run WebPAC reniced on those machines.
1211    
1212 dpavlin 372 =cut
1213    
1214 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26