/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 422 - (hide annotations)
Sat Sep 11 08:36:38 2004 UTC (19 years, 6 months ago) by dpavlin
File size: 24276 byte(s)
better progress_bar, more documentation

1 dpavlin 354 package WebPAC;
2 dpavlin 352
3 dpavlin 367 use warnings;
4     use strict;
5    
6 dpavlin 352 use Carp;
7 dpavlin 353 use Text::Iconv;
8     use Config::IniFiles;
9 dpavlin 363 use XML::Simple;
10 dpavlin 370 use Template;
11 dpavlin 372 use Log::Log4perl qw(get_logger :levels);
12 dpavlin 422 use Time::HiRes qw(time);
13 dpavlin 352
14 dpavlin 358 use Data::Dumper;
15    
16 dpavlin 373 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17     #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18     my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19     my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20    
21 dpavlin 352 =head1 NAME
22    
23 dpavlin 354 WebPAC - base class for WebPAC
24 dpavlin 352
25     =head1 DESCRIPTION
26    
27 dpavlin 354 This module implements methods used by WebPAC.
28 dpavlin 352
29     =head1 METHODS
30    
31     =head2 new
32    
33 dpavlin 421 Create new instance of WebPAC using configuration specified by C<config_file>.
34 dpavlin 352
35 dpavlin 354 my $webpac = new WebPAC(
36 dpavlin 352 config_file => 'name.conf',
37     [code_page => 'ISO-8859-2',]
38 dpavlin 421 [low_mem => 1,]
39 dpavlin 352 );
40    
41     Default C<code_page> is C<ISO-8859-2>.
42    
43 dpavlin 422 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44    
45 dpavlin 421 This method will also read configuration files
46 dpavlin 353 C<global.conf> (used by indexer and Web font-end)
47     and configuration file specified by C<config_file>
48     which describes databases to be indexed.
49    
50 dpavlin 352 =cut
51    
52 dpavlin 363 # mapping between data type and tag which specify
53     # format in XML file
54     my %type2tag = (
55     'isis' => 'isis',
56     # 'excel' => 'column',
57     # 'marc' => 'marc',
58     # 'feed' => 'feed'
59     );
60    
61 dpavlin 352 sub new {
62     my $class = shift;
63     my $self = {@_};
64     bless($self, $class);
65    
66 dpavlin 422 $self->{'start_t'} = time();
67    
68 dpavlin 372 my $log_file = $self->{'log'} || "log.conf";
69     Log::Log4perl->init($log_file);
70    
71     my $log = $self->_get_logger();
72    
73 dpavlin 352 # fill in default values
74     # output codepage
75     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
76    
77 dpavlin 353 #
78     # read global.conf
79     #
80 dpavlin 372 $log->debug("read 'global.conf'");
81 dpavlin 352
82 dpavlin 372 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
83 dpavlin 352
84     # read global config parametars
85     foreach my $var (qw(
86     dbi_dbd
87     dbi_dsn
88     dbi_user
89     dbi_passwd
90     show_progress
91     my_unac_filter
92 dpavlin 370 output_template
93 dpavlin 352 )) {
94 dpavlin 370 $self->{'global_config'}->{$var} = $config->val('global', $var);
95 dpavlin 352 }
96    
97 dpavlin 353 #
98     # read indexer config file
99     #
100 dpavlin 352
101 dpavlin 372 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
102 dpavlin 352
103 dpavlin 370 # create UTF-8 convertor for import_xml files
104 dpavlin 366 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
105 dpavlin 370
106     # create Template toolkit instance
107     $self->{'tt'} = Template->new(
108     INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
109     # FILTERS => {
110     # 'foo' => \&foo_filter,
111     # },
112     EVAL_PERL => 1,
113     );
114    
115 dpavlin 421 # running with low_mem flag? well, use DBM::Deep then.
116     if ($self->{'low_mem'}) {
117 dpavlin 422 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118 dpavlin 421
119     my $db_file = "data.db";
120    
121     if (-e $db_file) {
122     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123     $log->debug("removed '$db_file' from last run");
124     }
125    
126     use DBM::Deep;
127    
128     my $db = new DBM::Deep $db_file;
129    
130     $log->logdie("DBM::Deep error: $!") unless ($db);
131    
132     if ($db->error()) {
133     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134     } else {
135 dpavlin 422 $log->debug("using file '$db_file' for DBM::Deep");
136 dpavlin 421 }
137    
138     $self->{'db'} = $db;
139     }
140    
141 dpavlin 352 return $self;
142     }
143    
144     =head2 open_isis
145    
146     Open CDS/ISIS database using OpenIsis module and read all records to memory.
147    
148     $webpac->open_isis(
149     filename => '/data/ISIS/ISIS',
150     code_page => '852',
151     limit_mfn => '500',
152     lookup => [ ... ],
153     );
154    
155     By default, ISIS code page is assumed to be C<852>.
156    
157 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
158     from database in example above.
159 dpavlin 352
160     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
161     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
162     value in index.
163    
164     lookup => [
165     { 'key' => 'd:v900', 'val' => 'v250^a' },
166     { 'eval' => '"v901^a" eq "Podruèje"',
167     'key' => 'pa:v561^4:v562^4:v461^1',
168     'val' => 'v900' },
169     ]
170    
171 dpavlin 367 Returns number of last record read into memory (size of database, really).
172    
173 dpavlin 352 =cut
174    
175     sub open_isis {
176     my $self = shift;
177     my $arg = {@_};
178    
179 dpavlin 372 my $log = $self->_get_logger();
180    
181     $log->logcroak("need filename") if (! $arg->{'filename'});
182 dpavlin 352 my $code_page = $arg->{'code_page'} || '852';
183    
184 dpavlin 421 # store data in object
185     $self->{'isis_filename'} = $arg->{'filename'};
186     $self->{'isis_code_page'} = $code_page;
187    
188 dpavlin 353 use OpenIsis;
189    
190 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
191    
192     # create Text::Iconv object
193     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
194    
195 dpavlin 372 $log->info("reading ISIS database '",$arg->{'filename'},"'");
196 dpavlin 421 $log->debug("isis code page: $code_page");
197 dpavlin 363
198 dpavlin 352 my $isis_db = OpenIsis::open($arg->{'filename'});
199    
200     my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
201    
202 dpavlin 357 $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});
203    
204 dpavlin 372 $log->info("processing $maxmfn records...");
205 dpavlin 363
206 dpavlin 352 # read database
207     for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
208    
209 dpavlin 418
210     $log->debug("mfn: $mfn\n");
211    
212 dpavlin 421 my $rec;
213    
214 dpavlin 352 # read record
215     my $row = OpenIsis::read( $isis_db, $mfn );
216     foreach my $k (keys %{$row}) {
217     if ($k ne "mfn") {
218     foreach my $l (@{$row->{$k}}) {
219     $l = $cp->convert($l);
220     # has subfields?
221     my $val;
222     if ($l =~ m/\^/) {
223     foreach my $t (split(/\^/,$l)) {
224     next if (! $t);
225     $val->{substr($t,0,1)} = substr($t,1);
226     }
227     } else {
228     $val = $l;
229     }
230    
231 dpavlin 421 push @{$rec->{$k}}, $val;
232 dpavlin 352 }
233 dpavlin 374 } else {
234 dpavlin 421 push @{$rec->{'000'}}, $mfn;
235 dpavlin 352 }
236    
237     }
238    
239 dpavlin 421 $log->confess("record $mfn empty?") unless ($rec);
240    
241     # store
242     if ($self->{'low_mem'}) {
243     $self->{'db'}->put($mfn, $rec);
244     } else {
245     $self->{'data'}->{$mfn} = $rec;
246     }
247    
248 dpavlin 352 # create lookup
249 dpavlin 355 $self->create_lookup($rec, @{$arg->{'lookup'}});
250 dpavlin 352
251 dpavlin 389 $self->progress_bar($mfn,$maxmfn);
252    
253 dpavlin 352 }
254    
255 dpavlin 362 $self->{'current_mfn'} = 1;
256 dpavlin 389 $self->{'last_pcnt'} = 0;
257 dpavlin 362
258 dpavlin 418 $log->debug("max mfn: $maxmfn");
259    
260 dpavlin 352 # store max mfn and return it.
261     return $self->{'max_mfn'} = $maxmfn;
262     }
263    
264 dpavlin 362 =head2 fetch_rec
265    
266     Fetch next record from database. It will also display progress bar (once
267     it's implemented, that is).
268    
269     my $rec = $webpac->fetch_rec;
270    
271     =cut
272    
273     sub fetch_rec {
274     my $self = shift;
275    
276 dpavlin 372 my $log = $self->_get_logger();
277 dpavlin 362
278 dpavlin 372 my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");
279    
280 dpavlin 362 if ($mfn > $self->{'max_mfn'}) {
281     $self->{'current_mfn'} = $self->{'max_mfn'};
282 dpavlin 373 $log->debug("at EOF");
283 dpavlin 362 return;
284     }
285    
286 dpavlin 389 $self->progress_bar($mfn,$self->{'max_mfn'});
287    
288 dpavlin 421 if ($self->{'low_mem'}) {
289     return $self->{'db'}->get($mfn);
290     } else {
291     return $self->{'data'}->{$mfn};
292     }
293 dpavlin 362 }
294    
295 dpavlin 389 =head2 progress_bar
296    
297     Draw progress bar on STDERR.
298    
299     $webpac->progress_bar($current, $max);
300    
301     =cut
302    
303     sub progress_bar {
304     my $self = shift;
305    
306     my ($curr,$max) = @_;
307    
308     my $log = $self->_get_logger();
309    
310     $log->logconfess("no current value!") if (! $curr);
311     $log->logconfess("no maximum value!") if (! $max);
312    
313     if ($curr > $max) {
314     $max = $curr;
315     $log->debug("overflow to $curr");
316     }
317    
318     $self->{'last_pcnt'} ||= 1;
319    
320 dpavlin 422 my $p = int($curr * 100 / $max);
321 dpavlin 389
322 dpavlin 422 # reset on re-run
323     if ($p < $self->{'last_pcnt'}) {
324     $self->{'last_pcnt'} = $p;
325     $self->{'last_t'} = time();
326     $self->{'last_curr'} = 1;
327     }
328    
329 dpavlin 389 if ($p != $self->{'last_pcnt'}) {
330 dpavlin 422
331     my $last_curr = $self->{'last_curr'} || $curr;
332     my $t = time();
333     my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
334     my $eta = ($max-$curr) / ($rate || 1);
335     printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
336 dpavlin 389 $self->{'last_pcnt'} = $p;
337 dpavlin 422 $self->{'last_t'} = time();
338     $self->{'last_curr'} = $curr;
339 dpavlin 389 }
340 dpavlin 412 print STDERR "\n" if ($p == 100);
341 dpavlin 389 }
342    
343 dpavlin 422 =head2 fmt_time
344    
345     Format time (in seconds) for display.
346    
347     print $webpac->fmt_time(time());
348    
349     This method is called by L<progress_bar> to display remaining time.
350    
351     =cut
352    
353     sub fmt_time {
354     my $self = shift;
355    
356     my $t = shift || 0;
357     my $out = "";
358    
359     my ($ss,$mm,$hh) = gmtime($t);
360     $out .= "${hh}h" if ($hh);
361     $out .= sprintf("%02d:%02d", $mm,$ss);
362     $out .= " " if ($hh == 0);
363     return $out;
364     }
365    
366 dpavlin 363 =head2 open_import_xml
367    
368     Read file from C<import_xml/> directory and parse it.
369    
370     $webpac->open_import_xml(type => 'isis');
371    
372     =cut
373    
374     sub open_import_xml {
375     my $self = shift;
376    
377 dpavlin 372 my $log = $self->_get_logger();
378    
379 dpavlin 363 my $arg = {@_};
380 dpavlin 372 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
381 dpavlin 363
382 dpavlin 366 $self->{'type'} = $arg->{'type'};
383 dpavlin 363
384 dpavlin 366 my $type_base = $arg->{'type'};
385 dpavlin 363 $type_base =~ s/_.*$//g;
386    
387 dpavlin 366 $self->{'tag'} = $type2tag{$type_base};
388    
389 dpavlin 375 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
390 dpavlin 366
391     my $f = "./import_xml/".$self->{'type'}.".xml";
392 dpavlin 372 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
393 dpavlin 363
394 dpavlin 375 $log->info("reading '$f'");
395 dpavlin 363
396 dpavlin 375 $self->{'import_xml_file'} = $f;
397    
398 dpavlin 363 $self->{'import_xml'} = XMLin($f,
399 dpavlin 366 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
400 dpavlin 363 );
401    
402 dpavlin 375 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
403    
404 dpavlin 363 }
405    
406 dpavlin 355 =head2 create_lookup
407    
408     Create lookup from record using lookup definition.
409    
410 dpavlin 367 $self->create_lookup($rec, @lookups);
411    
412     Called internally by C<open_*> methods.
413    
414 dpavlin 355 =cut
415    
416     sub create_lookup {
417     my $self = shift;
418    
419 dpavlin 372 my $log = $self->_get_logger();
420 dpavlin 355
421 dpavlin 372 my $rec = shift || $log->logconfess("need record to create lookup");
422     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
423    
424 dpavlin 355 foreach my $i (@_) {
425 dpavlin 418 $log->logconfess("need key") unless defined($i->{'key'});
426     $log->logconfess("need val") unless defined($i->{'val'});
427    
428     if (defined($i->{'eval'})) {
429     # eval first, so we can skip fill_in for key and val
430     my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
431     if ($self->_eval($eval)) {
432     my $key = $self->fill_in($rec,$i->{'key'}) || next;
433     my @val = $self->fill_in($rec,$i->{'val'}) || next;
434 dpavlin 373 $log->debug("stored $key = ",sub { join(" | ",@val) });
435 dpavlin 355 push @{$self->{'lookup'}->{$key}}, @val;
436     }
437     } else {
438 dpavlin 418 my $key = $self->fill_in($rec,$i->{'key'}) || next;
439     my @val = $self->fill_in($rec,$i->{'val'}) || next;
440     $log->debug("stored $key = ",sub { join(" | ",@val) });
441     push @{$self->{'lookup'}->{$key}}, @val;
442 dpavlin 355 }
443     }
444     }
445    
446 dpavlin 356 =head2 get_data
447    
448     Returns value from record.
449    
450 dpavlin 367 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
451 dpavlin 356
452     Arguments are:
453     record reference C<$rec>,
454     field C<$f>,
455     optional subfiled C<$sf>,
456     index for repeatable values C<$i>.
457    
458 dpavlin 367 Optinal variable C<$found> will be incremeted if there
459 dpavlin 356 is field.
460    
461     Returns value or empty string.
462    
463     =cut
464    
465     sub get_data {
466     my $self = shift;
467    
468     my ($rec,$f,$sf,$i,$found) = @_;
469 dpavlin 367
470 dpavlin 356 if ($$rec->{$f}) {
471 dpavlin 367 return '' if (! $$rec->{$f}->[$i]);
472 dpavlin 389 no strict 'refs';
473 dpavlin 356 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
474     $$found++ if (defined($$found));
475     return $$rec->{$f}->[$i]->{$sf};
476     } elsif ($$rec->{$f}->[$i]) {
477     $$found++ if (defined($$found));
478 dpavlin 366 # it still might have subfield, just
479     # not specified, so we'll dump all
480     if ($$rec->{$f}->[$i] =~ /HASH/o) {
481     my $out;
482     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
483     $out .= $$rec->{$f}->[$i]->{$k}." ";
484     }
485     return $out;
486     } else {
487     return $$rec->{$f}->[$i];
488     }
489 dpavlin 356 }
490     } else {
491     return '';
492     }
493     }
494    
495 dpavlin 352 =head2 fill_in
496    
497     Workhourse of all: takes record from in-memory structure of database and
498     strings with placeholders and returns string or array of with substituted
499     values from record.
500    
501 dpavlin 367 my $text = $webpac->fill_in($rec,'v250^a');
502 dpavlin 352
503     Optional argument is ordinal number for repeatable fields. By default,
504 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
505     element is 0).
506     Following example will read second value from repeatable field.
507 dpavlin 352
508 dpavlin 367 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
509 dpavlin 353
510     This function B<does not> perform parsing of format to inteligenty skip
511     delimiters before fields which aren't used.
512    
513 dpavlin 376 This method will automatically decode UTF-8 string to local code page
514     if needed.
515    
516 dpavlin 352 =cut
517    
518 dpavlin 372 sub fill_in {
519 dpavlin 371 my $self = shift;
520    
521 dpavlin 372 my $log = $self->_get_logger();
522 dpavlin 371
523 dpavlin 372 my $rec = shift || $log->logconfess("need data record");
524     my $format = shift || $log->logconfess("need format to parse");
525 dpavlin 352 # iteration (for repeatable fields)
526     my $i = shift || 0;
527    
528     # FIXME remove for speedup?
529 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
530 dpavlin 352
531 dpavlin 376 if (utf8::is_utf8($format)) {
532     $format = $self->_x($format);
533     }
534    
535 dpavlin 352 my $found = 0;
536    
537 dpavlin 359 my $eval_code;
538     # remove eval{...} from beginning
539     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
540    
541 dpavlin 352 # do actual replacement of placeholders
542 dpavlin 373 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
543 dpavlin 352
544 dpavlin 353 if ($found) {
545 dpavlin 373 $log->debug("format: $format");
546 dpavlin 359 if ($eval_code) {
547     my $eval = $self->fill_in($rec,$eval_code,$i);
548 dpavlin 371 return if (! $self->_eval($eval));
549 dpavlin 359 }
550 dpavlin 353 # do we have lookups?
551 dpavlin 373 if ($format =~ /$LOOKUP_REGEX/o) {
552     $log->debug("format '$format' has lookup");
553 dpavlin 353 return $self->lookup($format);
554     } else {
555     return $format;
556     }
557 dpavlin 352 } else {
558     return;
559     }
560     }
561    
562     =head2 lookup
563    
564 dpavlin 355 Perform lookups on format supplied to it.
565 dpavlin 352
566 dpavlin 367 my $text = $self->lookup('[v900]');
567 dpavlin 352
568 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
569    
570 dpavlin 352 =cut
571    
572     sub lookup {
573     my $self = shift;
574    
575 dpavlin 372 my $log = $self->_get_logger();
576 dpavlin 352
577 dpavlin 372 my $tmp = shift || $log->logconfess("need format");
578    
579 dpavlin 373 if ($tmp =~ /$LOOKUP_REGEX/o) {
580 dpavlin 352 my @in = ( $tmp );
581 dpavlin 372
582     $log->debug("lookup for: ",$tmp);
583    
584 dpavlin 352 my @out;
585     while (my $f = shift @in) {
586 dpavlin 373 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
587 dpavlin 352 my $k = $1;
588     if ($self->{'lookup'}->{$k}) {
589     foreach my $nv (@{$self->{'lookup'}->{$k}}) {
590     my $tmp2 = $f;
591 dpavlin 373 $tmp2 =~ s/lookup{$k}/$nv/g;
592 dpavlin 352 push @in, $tmp2;
593     }
594     } else {
595     undef $f;
596     }
597     } elsif ($f) {
598     push @out, $f;
599     }
600     }
601 dpavlin 373 $log->logconfess("return is array and it's not expected!") unless wantarray;
602 dpavlin 352 return @out;
603     } else {
604     return $tmp;
605     }
606     }
607    
608 dpavlin 356 =head2 parse
609    
610     Perform smart parsing of string, skipping delimiters for fields which aren't
611     defined. It can also eval code in format starting with C<eval{...}> and
612     return output or nothing depending on eval code.
613    
614 dpavlin 367 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
615 dpavlin 356
616     =cut
617    
618     sub parse {
619     my $self = shift;
620    
621 dpavlin 366 my ($rec, $format_utf8, $i) = @_;
622 dpavlin 356
623 dpavlin 366 return if (! $format_utf8);
624    
625 dpavlin 372 my $log = $self->_get_logger();
626 dpavlin 358
627 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
628     $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
629    
630 dpavlin 358 $i = 0 if (! $i);
631    
632 dpavlin 375 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
633 dpavlin 366
634 dpavlin 356 my @out;
635    
636 dpavlin 373 $log->debug("format: $format");
637    
638 dpavlin 356 my $eval_code;
639     # remove eval{...} from beginning
640     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
641    
642 dpavlin 358 my $prefix;
643     my $all_found=0;
644 dpavlin 356
645 dpavlin 373 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
646 dpavlin 356
647 dpavlin 358 my $del = $1 || '';
648 dpavlin 359 $prefix ||= $del if ($all_found == 0);
649 dpavlin 358
650     my $found = 0;
651     my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
652    
653 dpavlin 356 if ($found) {
654 dpavlin 358 push @out, $del;
655     push @out, $tmp;
656     $all_found += $found;
657 dpavlin 356 }
658     }
659    
660 dpavlin 358 return if (! $all_found);
661 dpavlin 356
662 dpavlin 373 my $out = join('',@out);
663 dpavlin 358
664 dpavlin 373 if ($out) {
665     # add rest of format (suffix)
666     $out .= $format;
667 dpavlin 367
668 dpavlin 373 # add prefix if not there
669     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
670    
671     $log->debug("result: $out");
672     }
673    
674 dpavlin 359 if ($eval_code) {
675     my $eval = $self->fill_in($rec,$eval_code,$i);
676 dpavlin 373 $log->debug("about to eval{",$eval,"} format: $out");
677 dpavlin 371 return if (! $self->_eval($eval));
678 dpavlin 359 }
679    
680 dpavlin 358 return $out;
681 dpavlin 356 }
682    
683 dpavlin 367 =head2 parse_to_arr
684    
685     Similar to C<parse>, but returns array of all repeatable fields
686    
687     my @arr = $webpac->parse_to_arr($rec,'v250^a');
688    
689     =cut
690    
691     sub parse_to_arr {
692     my $self = shift;
693    
694     my ($rec, $format_utf8) = @_;
695    
696 dpavlin 372 my $log = $self->_get_logger();
697    
698     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
699 dpavlin 367 return if (! $format_utf8);
700    
701     my $i = 0;
702     my @arr;
703    
704     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
705     push @arr, $v;
706     }
707    
708 dpavlin 373 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
709    
710 dpavlin 367 return @arr;
711     }
712    
713 dpavlin 373 =head2 fill_in_to_arr
714    
715     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
716     for fields which have lookups, so they shouldn't be parsed but rather
717     C<fill_id>ed.
718    
719     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
720    
721     =cut
722    
723     sub fill_in_to_arr {
724     my $self = shift;
725    
726     my ($rec, $format_utf8) = @_;
727    
728     my $log = $self->_get_logger();
729    
730     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
731     return if (! $format_utf8);
732    
733     my $i = 0;
734     my @arr;
735    
736     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
737     push @arr, @v;
738     }
739    
740     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
741    
742     return @arr;
743     }
744    
745    
746 dpavlin 366 =head2 data_structure
747    
748     Create in-memory data structure which represents layout from C<import_xml>.
749     It is used later to produce output.
750    
751 dpavlin 368 my @ds = $webpac->data_structure($rec);
752 dpavlin 366
753 dpavlin 374 This method will also set C<$webpac->{'currnet_filename'}> if there is
754 dpavlin 398 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
755     <headline> tag.
756 dpavlin 374
757 dpavlin 366 =cut
758    
759 dpavlin 372 sub data_structure {
760 dpavlin 366 my $self = shift;
761    
762 dpavlin 372 my $log = $self->_get_logger();
763 dpavlin 366
764     my $rec = shift;
765 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
766 dpavlin 366
767 dpavlin 374 undef $self->{'currnet_filename'};
768 dpavlin 398 undef $self->{'headline'};
769 dpavlin 374
770 dpavlin 366 my @sorted_tags;
771     if ($self->{tags_by_order}) {
772     @sorted_tags = @{$self->{tags_by_order}};
773     } else {
774     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
775     $self->{tags_by_order} = \@sorted_tags;
776     }
777    
778 dpavlin 368 my @ds;
779 dpavlin 366
780 dpavlin 373 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
781    
782 dpavlin 366 foreach my $field (@sorted_tags) {
783    
784     my $row;
785    
786     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
787    
788     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
789 dpavlin 373 my $format = $tag->{'value'} || $tag->{'content'};
790 dpavlin 366
791 dpavlin 373 $log->debug("format: $format");
792    
793     my @v;
794     if ($format =~ /$LOOKUP_REGEX/o) {
795     @v = $self->fill_in_to_arr($rec,$format);
796     } else {
797     @v = $self->parse_to_arr($rec,$format);
798     }
799 dpavlin 367 next if (! @v);
800 dpavlin 366
801 dpavlin 375 # use format?
802     if ($tag->{'format_name'}) {
803     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
804     }
805    
806 dpavlin 398 if ($field eq 'filename') {
807     $self->{'current_filename'} = join('',@v);
808     $log->debug("filename: ",$self->{'current_filename'});
809     } elsif ($field eq 'headline') {
810     $self->{'headline'} .= join('',@v);
811     $log->debug("headline: ",$self->{'headline'});
812     next; # don't return headline in data_structure!
813     }
814    
815 dpavlin 366 # does tag have type?
816     if ($tag->{'type'}) {
817 dpavlin 367 push @{$row->{$tag->{'type'}}}, @v;
818 dpavlin 366 } else {
819 dpavlin 367 push @{$row->{'display'}}, @v;
820     push @{$row->{'swish'}}, @v;
821 dpavlin 366 }
822 dpavlin 373
823 dpavlin 374
824 dpavlin 366 }
825    
826 dpavlin 368 if ($row) {
827     $row->{'tag'} = $field;
828 dpavlin 375
829     # TODO: name_sigular, name_plural
830     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
831     $row->{'name'} = $name ? $self->_x($name) : $field;
832    
833 dpavlin 368 push @ds, $row;
834 dpavlin 374
835 dpavlin 373 $log->debug("row $field: ",sub { Dumper($row) });
836 dpavlin 368 }
837 dpavlin 366
838     }
839    
840 dpavlin 370 return @ds;
841 dpavlin 366
842     }
843    
844 dpavlin 370 =head2 output
845    
846     Create output from in-memory data structure using Template Toolkit template.
847    
848     my $text = $webpac->output( template => 'text.tt', data => @ds );
849    
850     =cut
851    
852     sub output {
853     my $self = shift;
854    
855     my $args = {@_};
856    
857 dpavlin 372 my $log = $self->_get_logger();
858 dpavlin 370
859 dpavlin 372 $log->logconfess("need template name") if (! $args->{'template'});
860     $log->logconfess("need data array") if (! $args->{'data'});
861    
862 dpavlin 370 my $out;
863    
864     $self->{'tt'}->process(
865     $args->{'template'},
866     $args,
867     \$out
868     ) || confess $self->{'tt'}->error();
869    
870     return $out;
871     }
872    
873 dpavlin 411 =head2 output_file
874    
875     Create output from in-memory data structure using Template Toolkit template
876     to a file.
877    
878     $webpac->output_file(
879     file => 'out.txt',
880     template => 'text.tt',
881     data => @ds
882     );
883    
884     =cut
885    
886     sub output_file {
887     my $self = shift;
888    
889     my $args = {@_};
890    
891     my $log = $self->_get_logger();
892    
893 dpavlin 421 my $file = $args->{'file'} || $log->logconfess("need file name");
894 dpavlin 411
895 dpavlin 421 $log->debug("creating file ",$file);
896 dpavlin 411
897 dpavlin 421 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
898 dpavlin 411 print $fh $self->output(
899     template => $args->{'template'},
900     data => $args->{'data'},
901     ) || $log->logdie("print: $!");
902     close($fh) || $log->logdie("close: $!");
903     }
904    
905 dpavlin 375 =head2 apply_format
906    
907     Apply format specified in tag with C<format_name="name"> and
908     C<format_delimiter=";;">.
909    
910     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
911    
912     Formats can contain C<lookup{...}> if you need them.
913    
914     =cut
915    
916     sub apply_format {
917     my $self = shift;
918    
919     my ($name,$delimiter,$data) = @_;
920    
921     my $log = $self->_get_logger();
922    
923     if (! $self->{'import_xml'}->{'format'}->{$name}) {
924     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
925     return $data;
926     }
927    
928     $log->warn("no delimiter for format $name") if (! $delimiter);
929    
930     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
931    
932     my @data = split(/\Q$delimiter\E/, $data);
933    
934     my $out = sprintf($format, @data);
935     $log->debug("using format $name [$format] on $data to produce: $out");
936    
937     if ($out =~ m/$LOOKUP_REGEX/o) {
938     return $self->lookup($out);
939     } else {
940     return $out;
941     }
942    
943     }
944    
945    
946 dpavlin 372 #
947     #
948     #
949    
950     =head1 INTERNAL METHODS
951    
952     Here is a quick list of internal methods, mostly useful to turn debugging
953     on them (see L<LOGGING> below for explanation).
954    
955     =cut
956    
957     =head2 _eval
958    
959     Internal function to eval code without C<strict 'subs'>.
960    
961     =cut
962    
963     sub _eval {
964     my $self = shift;
965    
966     my $code = shift || return;
967    
968     my $log = $self->_get_logger();
969    
970     no strict 'subs';
971     my $ret = eval $code;
972     if ($@) {
973     $log->error("problem with eval code [$code]: $@");
974     }
975    
976     $log->debug("eval: ",$code," [",$ret,"]");
977    
978     return $ret || 0;
979     }
980    
981     =head2 _sort_by_order
982    
983     Sort xml tags data structure accoding to C<order=""> attribute.
984    
985     =cut
986    
987     sub _sort_by_order {
988     my $self = shift;
989    
990     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
991     $self->{'import_xml'}->{'indexer'}->{$a};
992     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
993     $self->{'import_xml'}->{'indexer'}->{$b};
994    
995     return $va <=> $vb;
996     }
997    
998 dpavlin 375 =head2 _get_logger
999    
1000     Get C<Log::Log4perl> object with a twist: domains are defined for each
1001     method
1002    
1003     my $log = $webpac->_get_logger();
1004    
1005     =cut
1006    
1007 dpavlin 372 sub _get_logger {
1008     my $self = shift;
1009    
1010 dpavlin 374 my $name = (caller(1))[3] || caller;
1011     return get_logger($name);
1012 dpavlin 372 }
1013    
1014 dpavlin 375 =head2 _x
1015    
1016     Convert string from UTF-8 to code page defined in C<import_xml>.
1017    
1018     my $text = $webpac->_x('utf8 text');
1019    
1020     =cut
1021    
1022     sub _x {
1023     my $self = shift;
1024     my $utf8 = shift || return;
1025    
1026     return $self->{'utf2cp'}->convert($utf8) ||
1027     $self->_get_logger()->logwarn("can't convert '$utf8'");
1028     }
1029    
1030 dpavlin 372 #
1031     #
1032     #
1033    
1034     =head1 LOGGING
1035    
1036     Logging in WebPAC is performed by L<Log::Log4perl> with config file
1037     C<log.conf>.
1038    
1039     Methods defined above have different levels of logging, so
1040     it's descriptions will be useful to turn (mostry B<debug> logging) on
1041     or off to see why WabPAC isn't perforing as you expect it (it might even
1042     be a bug!).
1043    
1044     B<This is different from normal Log4perl behaviour>. To repeat, you can
1045     also use method names, and not only classes (which are just few)
1046     to filter logging.
1047    
1048 dpavlin 422
1049     =head1 MEMORY USAGE
1050    
1051     C<low_mem> options is double-edged sword. If enabled, WebPAC
1052     will run on memory constraint machines (which doesn't have enough
1053     physical RAM to create memory structure for whole source database).
1054    
1055     If your machine has 512Mb or more of RAM and database is around 10000 records,
1056     memory shouldn't be an issue. If you don't have enough physical RAM, you
1057     might consider using virtual memory (if your operating system is handling it
1058     well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1059     parsed structure of ISIS database (this is what C<low_mem> option does).
1060    
1061     Hitting swap at end of reading source database is probably o.k. However,
1062     hitting swap before 90% will dramatically decrease performance and you will
1063     be better off with C<low_mem> and using rest of availble memory for
1064     operating system disk cache (Linux is particuallary good about this).
1065     However, every access to database record will require disk access, so
1066     generation phase will be slower 10-100 times.
1067    
1068     Parsed structures are essential - you just have option to trade RAM memory
1069     (which is fast) for disk space (which is slow). Be sure to have planty of
1070     disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1071    
1072     However, when WebPAC is running on desktop machines (or laptops :-), it's
1073     highly undesireable for system to start swapping. Using C<low_mem> option can
1074     reduce WecPAC memory usage to around 64Mb for same database with lookup
1075     fields and sorted indexes which stay in RAM. Performance will suffer, but
1076     memory usage will really be minimal. It might be also more confortable to
1077     run WebPAC reniced on those machines.
1078    
1079 dpavlin 372 =cut
1080    
1081 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26