/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 431 - (hide annotations)
Sun Sep 12 20:31:34 2004 UTC (19 years, 7 months ago) by dpavlin
File size: 24602 byte(s)
new option: start_mfn

1 dpavlin 354 package WebPAC;
2 dpavlin 352
3 dpavlin 367 use warnings;
4     use strict;
5    
6 dpavlin 352 use Carp;
7 dpavlin 353 use Text::Iconv;
8     use Config::IniFiles;
9 dpavlin 363 use XML::Simple;
10 dpavlin 370 use Template;
11 dpavlin 372 use Log::Log4perl qw(get_logger :levels);
12 dpavlin 422 use Time::HiRes qw(time);
13 dpavlin 352
14 dpavlin 358 use Data::Dumper;
15    
16 dpavlin 373 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17     #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18     my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19     my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20    
21 dpavlin 352 =head1 NAME
22    
23 dpavlin 354 WebPAC - base class for WebPAC
24 dpavlin 352
25     =head1 DESCRIPTION
26    
27 dpavlin 354 This module implements methods used by WebPAC.
28 dpavlin 352
29     =head1 METHODS
30    
31     =head2 new
32    
33 dpavlin 421 Create new instance of WebPAC using configuration specified by C<config_file>.
34 dpavlin 352
35 dpavlin 354 my $webpac = new WebPAC(
36 dpavlin 352 config_file => 'name.conf',
37     [code_page => 'ISO-8859-2',]
38 dpavlin 421 [low_mem => 1,]
39 dpavlin 352 );
40    
41     Default C<code_page> is C<ISO-8859-2>.
42    
43 dpavlin 422 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44    
45 dpavlin 421 This method will also read configuration files
46 dpavlin 353 C<global.conf> (used by indexer and Web font-end)
47     and configuration file specified by C<config_file>
48     which describes databases to be indexed.
49    
50 dpavlin 352 =cut
51    
52 dpavlin 363 # mapping between data type and tag which specify
53     # format in XML file
54     my %type2tag = (
55     'isis' => 'isis',
56     # 'excel' => 'column',
57     # 'marc' => 'marc',
58     # 'feed' => 'feed'
59     );
60    
61 dpavlin 352 sub new {
62     my $class = shift;
63     my $self = {@_};
64     bless($self, $class);
65    
66 dpavlin 422 $self->{'start_t'} = time();
67    
68 dpavlin 372 my $log_file = $self->{'log'} || "log.conf";
69     Log::Log4perl->init($log_file);
70    
71     my $log = $self->_get_logger();
72    
73 dpavlin 352 # fill in default values
74     # output codepage
75     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
76    
77 dpavlin 353 #
78     # read global.conf
79     #
80 dpavlin 372 $log->debug("read 'global.conf'");
81 dpavlin 352
82 dpavlin 372 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
83 dpavlin 352
84     # read global config parametars
85     foreach my $var (qw(
86     dbi_dbd
87     dbi_dsn
88     dbi_user
89     dbi_passwd
90     show_progress
91     my_unac_filter
92 dpavlin 370 output_template
93 dpavlin 352 )) {
94 dpavlin 370 $self->{'global_config'}->{$var} = $config->val('global', $var);
95 dpavlin 352 }
96    
97 dpavlin 353 #
98     # read indexer config file
99     #
100 dpavlin 352
101 dpavlin 372 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
102 dpavlin 352
103 dpavlin 370 # create UTF-8 convertor for import_xml files
104 dpavlin 366 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
105 dpavlin 370
106     # create Template toolkit instance
107     $self->{'tt'} = Template->new(
108     INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
109     # FILTERS => {
110     # 'foo' => \&foo_filter,
111     # },
112     EVAL_PERL => 1,
113     );
114    
115 dpavlin 421 # running with low_mem flag? well, use DBM::Deep then.
116     if ($self->{'low_mem'}) {
117 dpavlin 422 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118 dpavlin 421
119     my $db_file = "data.db";
120    
121     if (-e $db_file) {
122     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123     $log->debug("removed '$db_file' from last run");
124     }
125    
126     use DBM::Deep;
127    
128     my $db = new DBM::Deep $db_file;
129    
130     $log->logdie("DBM::Deep error: $!") unless ($db);
131    
132     if ($db->error()) {
133     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134     } else {
135 dpavlin 422 $log->debug("using file '$db_file' for DBM::Deep");
136 dpavlin 421 }
137    
138     $self->{'db'} = $db;
139     }
140    
141 dpavlin 352 return $self;
142     }
143    
144     =head2 open_isis
145    
146     Open CDS/ISIS database using OpenIsis module and read all records to memory.
147    
148     $webpac->open_isis(
149     filename => '/data/ISIS/ISIS',
150     code_page => '852',
151 dpavlin 431 limit_mfn => 500,
152     start_mfn => 6000,
153 dpavlin 352 lookup => [ ... ],
154     );
155    
156     By default, ISIS code page is assumed to be C<852>.
157    
158 dpavlin 431 If optional parametar C<start_mfn> is set, this will be first MFN to read
159     from database (so you can skip beginning of your database if you need to).
160    
161 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
162     from database in example above.
163 dpavlin 352
164     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
165     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
166     value in index.
167    
168     lookup => [
169     { 'key' => 'd:v900', 'val' => 'v250^a' },
170     { 'eval' => '"v901^a" eq "Podruèje"',
171     'key' => 'pa:v561^4:v562^4:v461^1',
172     'val' => 'v900' },
173     ]
174    
175 dpavlin 367 Returns number of last record read into memory (size of database, really).
176    
177 dpavlin 352 =cut
178    
179     sub open_isis {
180     my $self = shift;
181     my $arg = {@_};
182    
183 dpavlin 372 my $log = $self->_get_logger();
184    
185     $log->logcroak("need filename") if (! $arg->{'filename'});
186 dpavlin 352 my $code_page = $arg->{'code_page'} || '852';
187    
188 dpavlin 421 # store data in object
189     $self->{'isis_filename'} = $arg->{'filename'};
190     $self->{'isis_code_page'} = $code_page;
191    
192 dpavlin 353 use OpenIsis;
193    
194 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
195    
196     # create Text::Iconv object
197     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
198    
199 dpavlin 372 $log->info("reading ISIS database '",$arg->{'filename'},"'");
200 dpavlin 421 $log->debug("isis code page: $code_page");
201 dpavlin 363
202 dpavlin 352 my $isis_db = OpenIsis::open($arg->{'filename'});
203    
204     my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
205 dpavlin 431 my $startmfn = 1;
206 dpavlin 352
207 dpavlin 431 if (my $s = $self->{'start_mfn'}) {
208     $log->info("skipping to MFN $s");
209     $startmfn = $s;
210     }
211 dpavlin 357
212 dpavlin 431 $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
213 dpavlin 363
214 dpavlin 431 $log->info("processing ",($maxmfn-$startmfn)." records...");
215    
216 dpavlin 352 # read database
217 dpavlin 431 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
218 dpavlin 352
219 dpavlin 418
220     $log->debug("mfn: $mfn\n");
221    
222 dpavlin 421 my $rec;
223    
224 dpavlin 352 # read record
225     my $row = OpenIsis::read( $isis_db, $mfn );
226     foreach my $k (keys %{$row}) {
227     if ($k ne "mfn") {
228     foreach my $l (@{$row->{$k}}) {
229     $l = $cp->convert($l);
230     # has subfields?
231     my $val;
232     if ($l =~ m/\^/) {
233     foreach my $t (split(/\^/,$l)) {
234     next if (! $t);
235     $val->{substr($t,0,1)} = substr($t,1);
236     }
237     } else {
238     $val = $l;
239     }
240    
241 dpavlin 421 push @{$rec->{$k}}, $val;
242 dpavlin 352 }
243 dpavlin 374 } else {
244 dpavlin 421 push @{$rec->{'000'}}, $mfn;
245 dpavlin 352 }
246    
247     }
248    
249 dpavlin 421 $log->confess("record $mfn empty?") unless ($rec);
250    
251     # store
252     if ($self->{'low_mem'}) {
253     $self->{'db'}->put($mfn, $rec);
254     } else {
255     $self->{'data'}->{$mfn} = $rec;
256     }
257    
258 dpavlin 352 # create lookup
259 dpavlin 355 $self->create_lookup($rec, @{$arg->{'lookup'}});
260 dpavlin 352
261 dpavlin 389 $self->progress_bar($mfn,$maxmfn);
262    
263 dpavlin 352 }
264    
265 dpavlin 431 $self->{'current_mfn'} = $startmfn;
266 dpavlin 389 $self->{'last_pcnt'} = 0;
267 dpavlin 362
268 dpavlin 418 $log->debug("max mfn: $maxmfn");
269    
270 dpavlin 352 # store max mfn and return it.
271     return $self->{'max_mfn'} = $maxmfn;
272     }
273    
274 dpavlin 362 =head2 fetch_rec
275    
276     Fetch next record from database. It will also display progress bar (once
277     it's implemented, that is).
278    
279     my $rec = $webpac->fetch_rec;
280    
281     =cut
282    
283     sub fetch_rec {
284     my $self = shift;
285    
286 dpavlin 372 my $log = $self->_get_logger();
287 dpavlin 362
288 dpavlin 372 my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");
289    
290 dpavlin 362 if ($mfn > $self->{'max_mfn'}) {
291     $self->{'current_mfn'} = $self->{'max_mfn'};
292 dpavlin 373 $log->debug("at EOF");
293 dpavlin 362 return;
294     }
295    
296 dpavlin 389 $self->progress_bar($mfn,$self->{'max_mfn'});
297    
298 dpavlin 421 if ($self->{'low_mem'}) {
299     return $self->{'db'}->get($mfn);
300     } else {
301     return $self->{'data'}->{$mfn};
302     }
303 dpavlin 362 }
304    
305 dpavlin 389 =head2 progress_bar
306    
307     Draw progress bar on STDERR.
308    
309     $webpac->progress_bar($current, $max);
310    
311     =cut
312    
313     sub progress_bar {
314     my $self = shift;
315    
316     my ($curr,$max) = @_;
317    
318     my $log = $self->_get_logger();
319    
320     $log->logconfess("no current value!") if (! $curr);
321     $log->logconfess("no maximum value!") if (! $max);
322    
323     if ($curr > $max) {
324     $max = $curr;
325     $log->debug("overflow to $curr");
326     }
327    
328     $self->{'last_pcnt'} ||= 1;
329    
330 dpavlin 422 my $p = int($curr * 100 / $max);
331 dpavlin 389
332 dpavlin 422 # reset on re-run
333     if ($p < $self->{'last_pcnt'}) {
334     $self->{'last_pcnt'} = $p;
335     $self->{'last_t'} = time();
336     $self->{'last_curr'} = 1;
337     }
338    
339 dpavlin 389 if ($p != $self->{'last_pcnt'}) {
340 dpavlin 422
341     my $last_curr = $self->{'last_curr'} || $curr;
342     my $t = time();
343     my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
344     my $eta = ($max-$curr) / ($rate || 1);
345     printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
346 dpavlin 389 $self->{'last_pcnt'} = $p;
347 dpavlin 422 $self->{'last_t'} = time();
348     $self->{'last_curr'} = $curr;
349 dpavlin 389 }
350 dpavlin 412 print STDERR "\n" if ($p == 100);
351 dpavlin 389 }
352    
353 dpavlin 422 =head2 fmt_time
354    
355     Format time (in seconds) for display.
356    
357     print $webpac->fmt_time(time());
358    
359     This method is called by L<progress_bar> to display remaining time.
360    
361     =cut
362    
363     sub fmt_time {
364     my $self = shift;
365    
366     my $t = shift || 0;
367     my $out = "";
368    
369     my ($ss,$mm,$hh) = gmtime($t);
370     $out .= "${hh}h" if ($hh);
371     $out .= sprintf("%02d:%02d", $mm,$ss);
372     $out .= " " if ($hh == 0);
373     return $out;
374     }
375    
376 dpavlin 363 =head2 open_import_xml
377    
378     Read file from C<import_xml/> directory and parse it.
379    
380     $webpac->open_import_xml(type => 'isis');
381    
382     =cut
383    
384     sub open_import_xml {
385     my $self = shift;
386    
387 dpavlin 372 my $log = $self->_get_logger();
388    
389 dpavlin 363 my $arg = {@_};
390 dpavlin 372 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
391 dpavlin 363
392 dpavlin 366 $self->{'type'} = $arg->{'type'};
393 dpavlin 363
394 dpavlin 366 my $type_base = $arg->{'type'};
395 dpavlin 363 $type_base =~ s/_.*$//g;
396    
397 dpavlin 366 $self->{'tag'} = $type2tag{$type_base};
398    
399 dpavlin 375 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
400 dpavlin 366
401     my $f = "./import_xml/".$self->{'type'}.".xml";
402 dpavlin 372 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
403 dpavlin 363
404 dpavlin 375 $log->info("reading '$f'");
405 dpavlin 363
406 dpavlin 375 $self->{'import_xml_file'} = $f;
407    
408 dpavlin 363 $self->{'import_xml'} = XMLin($f,
409 dpavlin 366 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
410 dpavlin 363 );
411    
412 dpavlin 375 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
413    
414 dpavlin 363 }
415    
416 dpavlin 355 =head2 create_lookup
417    
418     Create lookup from record using lookup definition.
419    
420 dpavlin 367 $self->create_lookup($rec, @lookups);
421    
422     Called internally by C<open_*> methods.
423    
424 dpavlin 355 =cut
425    
426     sub create_lookup {
427     my $self = shift;
428    
429 dpavlin 372 my $log = $self->_get_logger();
430 dpavlin 355
431 dpavlin 372 my $rec = shift || $log->logconfess("need record to create lookup");
432     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
433    
434 dpavlin 355 foreach my $i (@_) {
435 dpavlin 418 $log->logconfess("need key") unless defined($i->{'key'});
436     $log->logconfess("need val") unless defined($i->{'val'});
437    
438     if (defined($i->{'eval'})) {
439     # eval first, so we can skip fill_in for key and val
440     my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
441     if ($self->_eval($eval)) {
442     my $key = $self->fill_in($rec,$i->{'key'}) || next;
443     my @val = $self->fill_in($rec,$i->{'val'}) || next;
444 dpavlin 373 $log->debug("stored $key = ",sub { join(" | ",@val) });
445 dpavlin 355 push @{$self->{'lookup'}->{$key}}, @val;
446     }
447     } else {
448 dpavlin 418 my $key = $self->fill_in($rec,$i->{'key'}) || next;
449     my @val = $self->fill_in($rec,$i->{'val'}) || next;
450     $log->debug("stored $key = ",sub { join(" | ",@val) });
451     push @{$self->{'lookup'}->{$key}}, @val;
452 dpavlin 355 }
453     }
454     }
455    
456 dpavlin 356 =head2 get_data
457    
458     Returns value from record.
459    
460 dpavlin 367 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
461 dpavlin 356
462     Arguments are:
463     record reference C<$rec>,
464     field C<$f>,
465     optional subfiled C<$sf>,
466     index for repeatable values C<$i>.
467    
468 dpavlin 367 Optinal variable C<$found> will be incremeted if there
469 dpavlin 356 is field.
470    
471     Returns value or empty string.
472    
473     =cut
474    
475     sub get_data {
476     my $self = shift;
477    
478     my ($rec,$f,$sf,$i,$found) = @_;
479 dpavlin 367
480 dpavlin 356 if ($$rec->{$f}) {
481 dpavlin 367 return '' if (! $$rec->{$f}->[$i]);
482 dpavlin 389 no strict 'refs';
483 dpavlin 356 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
484     $$found++ if (defined($$found));
485     return $$rec->{$f}->[$i]->{$sf};
486     } elsif ($$rec->{$f}->[$i]) {
487     $$found++ if (defined($$found));
488 dpavlin 366 # it still might have subfield, just
489     # not specified, so we'll dump all
490     if ($$rec->{$f}->[$i] =~ /HASH/o) {
491     my $out;
492     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
493     $out .= $$rec->{$f}->[$i]->{$k}." ";
494     }
495     return $out;
496     } else {
497     return $$rec->{$f}->[$i];
498     }
499 dpavlin 356 }
500     } else {
501     return '';
502     }
503     }
504    
505 dpavlin 352 =head2 fill_in
506    
507     Workhourse of all: takes record from in-memory structure of database and
508     strings with placeholders and returns string or array of with substituted
509     values from record.
510    
511 dpavlin 367 my $text = $webpac->fill_in($rec,'v250^a');
512 dpavlin 352
513     Optional argument is ordinal number for repeatable fields. By default,
514 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
515     element is 0).
516     Following example will read second value from repeatable field.
517 dpavlin 352
518 dpavlin 367 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
519 dpavlin 353
520     This function B<does not> perform parsing of format to inteligenty skip
521     delimiters before fields which aren't used.
522    
523 dpavlin 376 This method will automatically decode UTF-8 string to local code page
524     if needed.
525    
526 dpavlin 352 =cut
527    
528 dpavlin 372 sub fill_in {
529 dpavlin 371 my $self = shift;
530    
531 dpavlin 372 my $log = $self->_get_logger();
532 dpavlin 371
533 dpavlin 372 my $rec = shift || $log->logconfess("need data record");
534     my $format = shift || $log->logconfess("need format to parse");
535 dpavlin 352 # iteration (for repeatable fields)
536     my $i = shift || 0;
537    
538     # FIXME remove for speedup?
539 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
540 dpavlin 352
541 dpavlin 376 if (utf8::is_utf8($format)) {
542     $format = $self->_x($format);
543     }
544    
545 dpavlin 352 my $found = 0;
546    
547 dpavlin 359 my $eval_code;
548     # remove eval{...} from beginning
549     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
550    
551 dpavlin 352 # do actual replacement of placeholders
552 dpavlin 373 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
553 dpavlin 352
554 dpavlin 353 if ($found) {
555 dpavlin 373 $log->debug("format: $format");
556 dpavlin 359 if ($eval_code) {
557     my $eval = $self->fill_in($rec,$eval_code,$i);
558 dpavlin 371 return if (! $self->_eval($eval));
559 dpavlin 359 }
560 dpavlin 353 # do we have lookups?
561 dpavlin 373 if ($format =~ /$LOOKUP_REGEX/o) {
562     $log->debug("format '$format' has lookup");
563 dpavlin 353 return $self->lookup($format);
564     } else {
565     return $format;
566     }
567 dpavlin 352 } else {
568     return;
569     }
570     }
571    
572     =head2 lookup
573    
574 dpavlin 355 Perform lookups on format supplied to it.
575 dpavlin 352
576 dpavlin 367 my $text = $self->lookup('[v900]');
577 dpavlin 352
578 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
579    
580 dpavlin 352 =cut
581    
582     sub lookup {
583     my $self = shift;
584    
585 dpavlin 372 my $log = $self->_get_logger();
586 dpavlin 352
587 dpavlin 372 my $tmp = shift || $log->logconfess("need format");
588    
589 dpavlin 373 if ($tmp =~ /$LOOKUP_REGEX/o) {
590 dpavlin 352 my @in = ( $tmp );
591 dpavlin 372
592     $log->debug("lookup for: ",$tmp);
593    
594 dpavlin 352 my @out;
595     while (my $f = shift @in) {
596 dpavlin 373 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
597 dpavlin 352 my $k = $1;
598     if ($self->{'lookup'}->{$k}) {
599     foreach my $nv (@{$self->{'lookup'}->{$k}}) {
600     my $tmp2 = $f;
601 dpavlin 373 $tmp2 =~ s/lookup{$k}/$nv/g;
602 dpavlin 352 push @in, $tmp2;
603     }
604     } else {
605     undef $f;
606     }
607     } elsif ($f) {
608     push @out, $f;
609     }
610     }
611 dpavlin 373 $log->logconfess("return is array and it's not expected!") unless wantarray;
612 dpavlin 352 return @out;
613     } else {
614     return $tmp;
615     }
616     }
617    
618 dpavlin 356 =head2 parse
619    
620     Perform smart parsing of string, skipping delimiters for fields which aren't
621     defined. It can also eval code in format starting with C<eval{...}> and
622     return output or nothing depending on eval code.
623    
624 dpavlin 367 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
625 dpavlin 356
626     =cut
627    
628     sub parse {
629     my $self = shift;
630    
631 dpavlin 366 my ($rec, $format_utf8, $i) = @_;
632 dpavlin 356
633 dpavlin 366 return if (! $format_utf8);
634    
635 dpavlin 372 my $log = $self->_get_logger();
636 dpavlin 358
637 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
638     $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
639    
640 dpavlin 358 $i = 0 if (! $i);
641    
642 dpavlin 375 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
643 dpavlin 366
644 dpavlin 356 my @out;
645    
646 dpavlin 373 $log->debug("format: $format");
647    
648 dpavlin 356 my $eval_code;
649     # remove eval{...} from beginning
650     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
651    
652 dpavlin 358 my $prefix;
653     my $all_found=0;
654 dpavlin 356
655 dpavlin 373 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
656 dpavlin 356
657 dpavlin 358 my $del = $1 || '';
658 dpavlin 359 $prefix ||= $del if ($all_found == 0);
659 dpavlin 358
660     my $found = 0;
661     my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
662    
663 dpavlin 356 if ($found) {
664 dpavlin 358 push @out, $del;
665     push @out, $tmp;
666     $all_found += $found;
667 dpavlin 356 }
668     }
669    
670 dpavlin 358 return if (! $all_found);
671 dpavlin 356
672 dpavlin 373 my $out = join('',@out);
673 dpavlin 358
674 dpavlin 373 if ($out) {
675     # add rest of format (suffix)
676     $out .= $format;
677 dpavlin 367
678 dpavlin 373 # add prefix if not there
679     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
680    
681     $log->debug("result: $out");
682     }
683    
684 dpavlin 359 if ($eval_code) {
685     my $eval = $self->fill_in($rec,$eval_code,$i);
686 dpavlin 373 $log->debug("about to eval{",$eval,"} format: $out");
687 dpavlin 371 return if (! $self->_eval($eval));
688 dpavlin 359 }
689    
690 dpavlin 358 return $out;
691 dpavlin 356 }
692    
693 dpavlin 367 =head2 parse_to_arr
694    
695     Similar to C<parse>, but returns array of all repeatable fields
696    
697     my @arr = $webpac->parse_to_arr($rec,'v250^a');
698    
699     =cut
700    
701     sub parse_to_arr {
702     my $self = shift;
703    
704     my ($rec, $format_utf8) = @_;
705    
706 dpavlin 372 my $log = $self->_get_logger();
707    
708     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
709 dpavlin 367 return if (! $format_utf8);
710    
711     my $i = 0;
712     my @arr;
713    
714     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
715     push @arr, $v;
716     }
717    
718 dpavlin 373 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
719    
720 dpavlin 367 return @arr;
721     }
722    
723 dpavlin 373 =head2 fill_in_to_arr
724    
725     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
726     for fields which have lookups, so they shouldn't be parsed but rather
727     C<fill_id>ed.
728    
729     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
730    
731     =cut
732    
733     sub fill_in_to_arr {
734     my $self = shift;
735    
736     my ($rec, $format_utf8) = @_;
737    
738     my $log = $self->_get_logger();
739    
740     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
741     return if (! $format_utf8);
742    
743     my $i = 0;
744     my @arr;
745    
746     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
747     push @arr, @v;
748     }
749    
750     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
751    
752     return @arr;
753     }
754    
755    
756 dpavlin 366 =head2 data_structure
757    
758     Create in-memory data structure which represents layout from C<import_xml>.
759     It is used later to produce output.
760    
761 dpavlin 368 my @ds = $webpac->data_structure($rec);
762 dpavlin 366
763 dpavlin 374 This method will also set C<$webpac->{'currnet_filename'}> if there is
764 dpavlin 398 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
765     <headline> tag.
766 dpavlin 374
767 dpavlin 366 =cut
768    
769 dpavlin 372 sub data_structure {
770 dpavlin 366 my $self = shift;
771    
772 dpavlin 372 my $log = $self->_get_logger();
773 dpavlin 366
774     my $rec = shift;
775 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
776 dpavlin 366
777 dpavlin 374 undef $self->{'currnet_filename'};
778 dpavlin 398 undef $self->{'headline'};
779 dpavlin 374
780 dpavlin 366 my @sorted_tags;
781     if ($self->{tags_by_order}) {
782     @sorted_tags = @{$self->{tags_by_order}};
783     } else {
784     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
785     $self->{tags_by_order} = \@sorted_tags;
786     }
787    
788 dpavlin 368 my @ds;
789 dpavlin 366
790 dpavlin 373 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
791    
792 dpavlin 366 foreach my $field (@sorted_tags) {
793    
794     my $row;
795    
796     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
797    
798     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
799 dpavlin 373 my $format = $tag->{'value'} || $tag->{'content'};
800 dpavlin 366
801 dpavlin 373 $log->debug("format: $format");
802    
803     my @v;
804     if ($format =~ /$LOOKUP_REGEX/o) {
805     @v = $self->fill_in_to_arr($rec,$format);
806     } else {
807     @v = $self->parse_to_arr($rec,$format);
808     }
809 dpavlin 367 next if (! @v);
810 dpavlin 366
811 dpavlin 375 # use format?
812     if ($tag->{'format_name'}) {
813     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
814     }
815    
816 dpavlin 398 if ($field eq 'filename') {
817     $self->{'current_filename'} = join('',@v);
818     $log->debug("filename: ",$self->{'current_filename'});
819     } elsif ($field eq 'headline') {
820     $self->{'headline'} .= join('',@v);
821     $log->debug("headline: ",$self->{'headline'});
822     next; # don't return headline in data_structure!
823     }
824    
825 dpavlin 366 # does tag have type?
826     if ($tag->{'type'}) {
827 dpavlin 367 push @{$row->{$tag->{'type'}}}, @v;
828 dpavlin 366 } else {
829 dpavlin 367 push @{$row->{'display'}}, @v;
830     push @{$row->{'swish'}}, @v;
831 dpavlin 366 }
832 dpavlin 373
833 dpavlin 374
834 dpavlin 366 }
835    
836 dpavlin 368 if ($row) {
837     $row->{'tag'} = $field;
838 dpavlin 375
839     # TODO: name_sigular, name_plural
840     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
841     $row->{'name'} = $name ? $self->_x($name) : $field;
842    
843 dpavlin 368 push @ds, $row;
844 dpavlin 374
845 dpavlin 373 $log->debug("row $field: ",sub { Dumper($row) });
846 dpavlin 368 }
847 dpavlin 366
848     }
849    
850 dpavlin 370 return @ds;
851 dpavlin 366
852     }
853    
854 dpavlin 370 =head2 output
855    
856     Create output from in-memory data structure using Template Toolkit template.
857    
858     my $text = $webpac->output( template => 'text.tt', data => @ds );
859    
860     =cut
861    
862     sub output {
863     my $self = shift;
864    
865     my $args = {@_};
866    
867 dpavlin 372 my $log = $self->_get_logger();
868 dpavlin 370
869 dpavlin 372 $log->logconfess("need template name") if (! $args->{'template'});
870     $log->logconfess("need data array") if (! $args->{'data'});
871    
872 dpavlin 370 my $out;
873    
874     $self->{'tt'}->process(
875     $args->{'template'},
876     $args,
877     \$out
878     ) || confess $self->{'tt'}->error();
879    
880     return $out;
881     }
882    
883 dpavlin 411 =head2 output_file
884    
885     Create output from in-memory data structure using Template Toolkit template
886     to a file.
887    
888     $webpac->output_file(
889     file => 'out.txt',
890     template => 'text.tt',
891     data => @ds
892     );
893    
894     =cut
895    
896     sub output_file {
897     my $self = shift;
898    
899     my $args = {@_};
900    
901     my $log = $self->_get_logger();
902    
903 dpavlin 421 my $file = $args->{'file'} || $log->logconfess("need file name");
904 dpavlin 411
905 dpavlin 421 $log->debug("creating file ",$file);
906 dpavlin 411
907 dpavlin 421 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
908 dpavlin 411 print $fh $self->output(
909     template => $args->{'template'},
910     data => $args->{'data'},
911     ) || $log->logdie("print: $!");
912     close($fh) || $log->logdie("close: $!");
913     }
914    
915 dpavlin 375 =head2 apply_format
916    
917     Apply format specified in tag with C<format_name="name"> and
918     C<format_delimiter=";;">.
919    
920     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
921    
922     Formats can contain C<lookup{...}> if you need them.
923    
924     =cut
925    
926     sub apply_format {
927     my $self = shift;
928    
929     my ($name,$delimiter,$data) = @_;
930    
931     my $log = $self->_get_logger();
932    
933     if (! $self->{'import_xml'}->{'format'}->{$name}) {
934     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
935     return $data;
936     }
937    
938     $log->warn("no delimiter for format $name") if (! $delimiter);
939    
940     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
941    
942     my @data = split(/\Q$delimiter\E/, $data);
943    
944     my $out = sprintf($format, @data);
945     $log->debug("using format $name [$format] on $data to produce: $out");
946    
947     if ($out =~ m/$LOOKUP_REGEX/o) {
948     return $self->lookup($out);
949     } else {
950     return $out;
951     }
952    
953     }
954    
955    
956 dpavlin 372 #
957     #
958     #
959    
960     =head1 INTERNAL METHODS
961    
962     Here is a quick list of internal methods, mostly useful to turn debugging
963     on them (see L<LOGGING> below for explanation).
964    
965     =cut
966    
967     =head2 _eval
968    
969     Internal function to eval code without C<strict 'subs'>.
970    
971     =cut
972    
973     sub _eval {
974     my $self = shift;
975    
976     my $code = shift || return;
977    
978     my $log = $self->_get_logger();
979    
980     no strict 'subs';
981     my $ret = eval $code;
982     if ($@) {
983     $log->error("problem with eval code [$code]: $@");
984     }
985    
986     $log->debug("eval: ",$code," [",$ret,"]");
987    
988     return $ret || 0;
989     }
990    
991     =head2 _sort_by_order
992    
993     Sort xml tags data structure accoding to C<order=""> attribute.
994    
995     =cut
996    
997     sub _sort_by_order {
998     my $self = shift;
999    
1000     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1001     $self->{'import_xml'}->{'indexer'}->{$a};
1002     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1003     $self->{'import_xml'}->{'indexer'}->{$b};
1004    
1005     return $va <=> $vb;
1006     }
1007    
1008 dpavlin 375 =head2 _get_logger
1009    
1010     Get C<Log::Log4perl> object with a twist: domains are defined for each
1011     method
1012    
1013     my $log = $webpac->_get_logger();
1014    
1015     =cut
1016    
1017 dpavlin 372 sub _get_logger {
1018     my $self = shift;
1019    
1020 dpavlin 374 my $name = (caller(1))[3] || caller;
1021     return get_logger($name);
1022 dpavlin 372 }
1023    
1024 dpavlin 375 =head2 _x
1025    
1026     Convert string from UTF-8 to code page defined in C<import_xml>.
1027    
1028     my $text = $webpac->_x('utf8 text');
1029    
1030     =cut
1031    
1032     sub _x {
1033     my $self = shift;
1034     my $utf8 = shift || return;
1035    
1036     return $self->{'utf2cp'}->convert($utf8) ||
1037     $self->_get_logger()->logwarn("can't convert '$utf8'");
1038     }
1039    
1040 dpavlin 372 #
1041     #
1042     #
1043    
1044     =head1 LOGGING
1045    
1046     Logging in WebPAC is performed by L<Log::Log4perl> with config file
1047     C<log.conf>.
1048    
1049     Methods defined above have different levels of logging, so
1050     it's descriptions will be useful to turn (mostry B<debug> logging) on
1051     or off to see why WabPAC isn't perforing as you expect it (it might even
1052     be a bug!).
1053    
1054     B<This is different from normal Log4perl behaviour>. To repeat, you can
1055     also use method names, and not only classes (which are just few)
1056     to filter logging.
1057    
1058 dpavlin 422
1059     =head1 MEMORY USAGE
1060    
1061     C<low_mem> options is double-edged sword. If enabled, WebPAC
1062     will run on memory constraint machines (which doesn't have enough
1063     physical RAM to create memory structure for whole source database).
1064    
1065     If your machine has 512Mb or more of RAM and database is around 10000 records,
1066     memory shouldn't be an issue. If you don't have enough physical RAM, you
1067     might consider using virtual memory (if your operating system is handling it
1068     well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1069     parsed structure of ISIS database (this is what C<low_mem> option does).
1070    
1071     Hitting swap at end of reading source database is probably o.k. However,
1072     hitting swap before 90% will dramatically decrease performance and you will
1073     be better off with C<low_mem> and using rest of availble memory for
1074     operating system disk cache (Linux is particuallary good about this).
1075     However, every access to database record will require disk access, so
1076     generation phase will be slower 10-100 times.
1077    
1078     Parsed structures are essential - you just have option to trade RAM memory
1079     (which is fast) for disk space (which is slow). Be sure to have planty of
1080     disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1081    
1082     However, when WebPAC is running on desktop machines (or laptops :-), it's
1083     highly undesireable for system to start swapping. Using C<low_mem> option can
1084     reduce WecPAC memory usage to around 64Mb for same database with lookup
1085     fields and sorted indexes which stay in RAM. Performance will suffer, but
1086     memory usage will really be minimal. It might be also more confortable to
1087     run WebPAC reniced on those machines.
1088    
1089 dpavlin 372 =cut
1090    
1091 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26