/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 421 - (hide annotations)
Fri Sep 10 22:24:42 2004 UTC (19 years, 6 months ago) by dpavlin
File size: 22721 byte(s)
low_mem option for desktop class-machines

1 dpavlin 354 package WebPAC;
2 dpavlin 352
3 dpavlin 367 use warnings;
4     use strict;
5    
6 dpavlin 352 use Carp;
7 dpavlin 353 use Text::Iconv;
8     use Config::IniFiles;
9 dpavlin 363 use XML::Simple;
10 dpavlin 370 use Template;
11 dpavlin 372 use Log::Log4perl qw(get_logger :levels);
12 dpavlin 352
13 dpavlin 358 use Data::Dumper;
14    
15 dpavlin 373 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
16     #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
17     my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
18     my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
19    
20 dpavlin 352 =head1 NAME
21    
22 dpavlin 354 WebPAC - base class for WebPAC
23 dpavlin 352
24     =head1 DESCRIPTION
25    
26 dpavlin 354 This module implements methods used by WebPAC.
27 dpavlin 352
28     =head1 METHODS
29    
30     =head2 new
31    
32 dpavlin 421 Create new instance of WebPAC using configuration specified by C<config_file>.
33 dpavlin 352
34 dpavlin 354 my $webpac = new WebPAC(
35 dpavlin 352 config_file => 'name.conf',
36     [code_page => 'ISO-8859-2',]
37 dpavlin 421 [low_mem => 1,]
38 dpavlin 352 );
39    
40     Default C<code_page> is C<ISO-8859-2>.
41    
42 dpavlin 421 This method will also read configuration files
43 dpavlin 353 C<global.conf> (used by indexer and Web font-end)
44     and configuration file specified by C<config_file>
45     which describes databases to be indexed.
46    
47 dpavlin 421 C<low_mem> options is double-edged sword. If enabled, WebPAC
48     will run on memory constraint machines (which doesn't have enough
49     physical RAM to create memory structure for whole ISIS database).
50    
51     If your machine has 512Mb or more and database is around 10000 records,
52     memory shouldn't be an issue. If you don't have enough physical RAM, you
53     might consider using virtual memory (if your operating system is handling it
54     well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
55     parsed structure of ISIS database.
56    
57     However, when WebPAC is running on desktop machines (or laptops :-), it's
58     highly undesireable for system to start swapping. Using C<low_mem> option can
59     reduce WecPAC memory usage to 16Mb for same database with lookup fields and
60     sorted indexes which stay in RAM. Performance will suffer, but memory usage
61     will really be minimal. It might be also more confortable to run WebPAC reniced
62     on those machines.
63    
64 dpavlin 352 =cut
65    
66 dpavlin 363 # mapping between data type and tag which specify
67     # format in XML file
68     my %type2tag = (
69     'isis' => 'isis',
70     # 'excel' => 'column',
71     # 'marc' => 'marc',
72     # 'feed' => 'feed'
73     );
74    
75 dpavlin 352 sub new {
76     my $class = shift;
77     my $self = {@_};
78     bless($self, $class);
79    
80 dpavlin 372 my $log_file = $self->{'log'} || "log.conf";
81     Log::Log4perl->init($log_file);
82    
83     my $log = $self->_get_logger();
84    
85 dpavlin 352 # fill in default values
86     # output codepage
87     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
88    
89 dpavlin 353 #
90     # read global.conf
91     #
92 dpavlin 372 $log->debug("read 'global.conf'");
93 dpavlin 352
94 dpavlin 372 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
95 dpavlin 352
96     # read global config parametars
97     foreach my $var (qw(
98     dbi_dbd
99     dbi_dsn
100     dbi_user
101     dbi_passwd
102     show_progress
103     my_unac_filter
104 dpavlin 370 output_template
105 dpavlin 352 )) {
106 dpavlin 370 $self->{'global_config'}->{$var} = $config->val('global', $var);
107 dpavlin 352 }
108    
109 dpavlin 353 #
110     # read indexer config file
111     #
112 dpavlin 352
113 dpavlin 372 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
114 dpavlin 352
115 dpavlin 370 # create UTF-8 convertor for import_xml files
116 dpavlin 366 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
117 dpavlin 370
118     # create Template toolkit instance
119     $self->{'tt'} = Template->new(
120     INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
121     # FILTERS => {
122     # 'foo' => \&foo_filter,
123     # },
124     EVAL_PERL => 1,
125     );
126    
127 dpavlin 421 # running with low_mem flag? well, use DBM::Deep then.
128     if ($self->{'low_mem'}) {
129     $log->info("running with low_mem which impacts performance (<64 Mb memory usage)");
130    
131     my $db_file = "data.db";
132    
133     if (-e $db_file) {
134     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
135     $log->debug("removed '$db_file' from last run");
136     }
137    
138     use DBM::Deep;
139    
140     my $db = new DBM::Deep $db_file;
141    
142     $log->logdie("DBM::Deep error: $!") unless ($db);
143    
144     if ($db->error()) {
145     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
146     } else {
147     $log->debug("using file $db_file for DBM::Deep");
148     }
149    
150     $self->{'db'} = $db;
151     }
152    
153 dpavlin 352 return $self;
154     }
155    
156     =head2 open_isis
157    
158     Open CDS/ISIS database using OpenIsis module and read all records to memory.
159    
160     $webpac->open_isis(
161     filename => '/data/ISIS/ISIS',
162     code_page => '852',
163     limit_mfn => '500',
164     lookup => [ ... ],
165     );
166    
167     By default, ISIS code page is assumed to be C<852>.
168    
169 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
170     from database in example above.
171 dpavlin 352
172     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
173     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
174     value in index.
175    
176     lookup => [
177     { 'key' => 'd:v900', 'val' => 'v250^a' },
178     { 'eval' => '"v901^a" eq "Podruèje"',
179     'key' => 'pa:v561^4:v562^4:v461^1',
180     'val' => 'v900' },
181     ]
182    
183 dpavlin 367 Returns number of last record read into memory (size of database, really).
184    
185 dpavlin 352 =cut
186    
187     sub open_isis {
188     my $self = shift;
189     my $arg = {@_};
190    
191 dpavlin 372 my $log = $self->_get_logger();
192    
193     $log->logcroak("need filename") if (! $arg->{'filename'});
194 dpavlin 352 my $code_page = $arg->{'code_page'} || '852';
195    
196 dpavlin 421 # store data in object
197     $self->{'isis_filename'} = $arg->{'filename'};
198     $self->{'isis_code_page'} = $code_page;
199    
200 dpavlin 353 use OpenIsis;
201    
202 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
203    
204     # create Text::Iconv object
205     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
206    
207 dpavlin 372 $log->info("reading ISIS database '",$arg->{'filename'},"'");
208 dpavlin 421 $log->debug("isis code page: $code_page");
209 dpavlin 363
210 dpavlin 352 my $isis_db = OpenIsis::open($arg->{'filename'});
211    
212     my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
213    
214 dpavlin 357 $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});
215    
216 dpavlin 372 $log->info("processing $maxmfn records...");
217 dpavlin 363
218 dpavlin 352 # read database
219     for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
220    
221 dpavlin 418
222     $log->debug("mfn: $mfn\n");
223    
224 dpavlin 421 my $rec;
225    
226 dpavlin 352 # read record
227     my $row = OpenIsis::read( $isis_db, $mfn );
228     foreach my $k (keys %{$row}) {
229     if ($k ne "mfn") {
230     foreach my $l (@{$row->{$k}}) {
231     $l = $cp->convert($l);
232     # has subfields?
233     my $val;
234     if ($l =~ m/\^/) {
235     foreach my $t (split(/\^/,$l)) {
236     next if (! $t);
237     $val->{substr($t,0,1)} = substr($t,1);
238     }
239     } else {
240     $val = $l;
241     }
242    
243 dpavlin 421 push @{$rec->{$k}}, $val;
244 dpavlin 352 }
245 dpavlin 374 } else {
246 dpavlin 421 push @{$rec->{'000'}}, $mfn;
247 dpavlin 352 }
248    
249     }
250    
251 dpavlin 421 $log->confess("record $mfn empty?") unless ($rec);
252    
253     # store
254     if ($self->{'low_mem'}) {
255     $self->{'db'}->put($mfn, $rec);
256     } else {
257     $self->{'data'}->{$mfn} = $rec;
258     }
259    
260 dpavlin 352 # create lookup
261 dpavlin 355 $self->create_lookup($rec, @{$arg->{'lookup'}});
262 dpavlin 352
263 dpavlin 389 $self->progress_bar($mfn,$maxmfn);
264    
265 dpavlin 352 }
266    
267 dpavlin 362 $self->{'current_mfn'} = 1;
268 dpavlin 389 $self->{'last_pcnt'} = 0;
269 dpavlin 362
270 dpavlin 418 $log->debug("max mfn: $maxmfn");
271    
272 dpavlin 352 # store max mfn and return it.
273     return $self->{'max_mfn'} = $maxmfn;
274     }
275    
276 dpavlin 362 =head2 fetch_rec
277    
278     Fetch next record from database. It will also display progress bar (once
279     it's implemented, that is).
280    
281     my $rec = $webpac->fetch_rec;
282    
283     =cut
284    
285     sub fetch_rec {
286     my $self = shift;
287    
288 dpavlin 372 my $log = $self->_get_logger();
289 dpavlin 362
290 dpavlin 372 my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");
291    
292 dpavlin 362 if ($mfn > $self->{'max_mfn'}) {
293     $self->{'current_mfn'} = $self->{'max_mfn'};
294 dpavlin 373 $log->debug("at EOF");
295 dpavlin 362 return;
296     }
297    
298 dpavlin 389 $self->progress_bar($mfn,$self->{'max_mfn'});
299    
300 dpavlin 421 if ($self->{'low_mem'}) {
301     return $self->{'db'}->get($mfn);
302     } else {
303     return $self->{'data'}->{$mfn};
304     }
305 dpavlin 362 }
306    
307 dpavlin 389 =head2 progress_bar
308    
309     Draw progress bar on STDERR.
310    
311     $webpac->progress_bar($current, $max);
312    
313     =cut
314    
315     sub progress_bar {
316     my $self = shift;
317    
318     my ($curr,$max) = @_;
319    
320     my $log = $self->_get_logger();
321    
322     $log->logconfess("no current value!") if (! $curr);
323     $log->logconfess("no maximum value!") if (! $max);
324    
325     if ($curr > $max) {
326     $max = $curr;
327     $log->debug("overflow to $curr");
328     }
329    
330     $self->{'last_pcnt'} ||= 1;
331    
332     $self->{'last_pcnt'} = $curr if ($curr < $self->{'last_pcnt'});
333    
334     my $p = int($curr * 100 / $max);
335     if ($p != $self->{'last_pcnt'}) {
336     printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$curr,$max,"=" x ($p/2).">", $p );
337     $self->{'last_pcnt'} = $p;
338     }
339 dpavlin 412 print STDERR "\n" if ($p == 100);
340 dpavlin 389 }
341    
342 dpavlin 363 =head2 open_import_xml
343    
344     Read file from C<import_xml/> directory and parse it.
345    
346     $webpac->open_import_xml(type => 'isis');
347    
348     =cut
349    
350     sub open_import_xml {
351     my $self = shift;
352    
353 dpavlin 372 my $log = $self->_get_logger();
354    
355 dpavlin 363 my $arg = {@_};
356 dpavlin 372 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
357 dpavlin 363
358 dpavlin 366 $self->{'type'} = $arg->{'type'};
359 dpavlin 363
360 dpavlin 366 my $type_base = $arg->{'type'};
361 dpavlin 363 $type_base =~ s/_.*$//g;
362    
363 dpavlin 366 $self->{'tag'} = $type2tag{$type_base};
364    
365 dpavlin 375 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
366 dpavlin 366
367     my $f = "./import_xml/".$self->{'type'}.".xml";
368 dpavlin 372 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
369 dpavlin 363
370 dpavlin 375 $log->info("reading '$f'");
371 dpavlin 363
372 dpavlin 375 $self->{'import_xml_file'} = $f;
373    
374 dpavlin 363 $self->{'import_xml'} = XMLin($f,
375 dpavlin 366 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
376 dpavlin 363 );
377    
378 dpavlin 375 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
379    
380 dpavlin 363 }
381    
382 dpavlin 355 =head2 create_lookup
383    
384     Create lookup from record using lookup definition.
385    
386 dpavlin 367 $self->create_lookup($rec, @lookups);
387    
388     Called internally by C<open_*> methods.
389    
390 dpavlin 355 =cut
391    
392     sub create_lookup {
393     my $self = shift;
394    
395 dpavlin 372 my $log = $self->_get_logger();
396 dpavlin 355
397 dpavlin 372 my $rec = shift || $log->logconfess("need record to create lookup");
398     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
399    
400 dpavlin 355 foreach my $i (@_) {
401 dpavlin 418 $log->logconfess("need key") unless defined($i->{'key'});
402     $log->logconfess("need val") unless defined($i->{'val'});
403    
404     if (defined($i->{'eval'})) {
405     # eval first, so we can skip fill_in for key and val
406     my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
407     if ($self->_eval($eval)) {
408     my $key = $self->fill_in($rec,$i->{'key'}) || next;
409     my @val = $self->fill_in($rec,$i->{'val'}) || next;
410 dpavlin 373 $log->debug("stored $key = ",sub { join(" | ",@val) });
411 dpavlin 355 push @{$self->{'lookup'}->{$key}}, @val;
412     }
413     } else {
414 dpavlin 418 my $key = $self->fill_in($rec,$i->{'key'}) || next;
415     my @val = $self->fill_in($rec,$i->{'val'}) || next;
416     $log->debug("stored $key = ",sub { join(" | ",@val) });
417     push @{$self->{'lookup'}->{$key}}, @val;
418 dpavlin 355 }
419     }
420     }
421    
422 dpavlin 356 =head2 get_data
423    
424     Returns value from record.
425    
426 dpavlin 367 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
427 dpavlin 356
428     Arguments are:
429     record reference C<$rec>,
430     field C<$f>,
431     optional subfiled C<$sf>,
432     index for repeatable values C<$i>.
433    
434 dpavlin 367 Optinal variable C<$found> will be incremeted if there
435 dpavlin 356 is field.
436    
437     Returns value or empty string.
438    
439     =cut
440    
441     sub get_data {
442     my $self = shift;
443    
444     my ($rec,$f,$sf,$i,$found) = @_;
445 dpavlin 367
446 dpavlin 356 if ($$rec->{$f}) {
447 dpavlin 367 return '' if (! $$rec->{$f}->[$i]);
448 dpavlin 389 no strict 'refs';
449 dpavlin 356 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
450     $$found++ if (defined($$found));
451     return $$rec->{$f}->[$i]->{$sf};
452     } elsif ($$rec->{$f}->[$i]) {
453     $$found++ if (defined($$found));
454 dpavlin 366 # it still might have subfield, just
455     # not specified, so we'll dump all
456     if ($$rec->{$f}->[$i] =~ /HASH/o) {
457     my $out;
458     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
459     $out .= $$rec->{$f}->[$i]->{$k}." ";
460     }
461     return $out;
462     } else {
463     return $$rec->{$f}->[$i];
464     }
465 dpavlin 356 }
466     } else {
467     return '';
468     }
469     }
470    
471 dpavlin 352 =head2 fill_in
472    
473     Workhourse of all: takes record from in-memory structure of database and
474     strings with placeholders and returns string or array of with substituted
475     values from record.
476    
477 dpavlin 367 my $text = $webpac->fill_in($rec,'v250^a');
478 dpavlin 352
479     Optional argument is ordinal number for repeatable fields. By default,
480 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
481     element is 0).
482     Following example will read second value from repeatable field.
483 dpavlin 352
484 dpavlin 367 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
485 dpavlin 353
486     This function B<does not> perform parsing of format to inteligenty skip
487     delimiters before fields which aren't used.
488    
489 dpavlin 376 This method will automatically decode UTF-8 string to local code page
490     if needed.
491    
492 dpavlin 352 =cut
493    
494 dpavlin 372 sub fill_in {
495 dpavlin 371 my $self = shift;
496    
497 dpavlin 372 my $log = $self->_get_logger();
498 dpavlin 371
499 dpavlin 372 my $rec = shift || $log->logconfess("need data record");
500     my $format = shift || $log->logconfess("need format to parse");
501 dpavlin 352 # iteration (for repeatable fields)
502     my $i = shift || 0;
503    
504     # FIXME remove for speedup?
505 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
506 dpavlin 352
507 dpavlin 376 if (utf8::is_utf8($format)) {
508     $format = $self->_x($format);
509     }
510    
511 dpavlin 352 my $found = 0;
512    
513 dpavlin 359 my $eval_code;
514     # remove eval{...} from beginning
515     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
516    
517 dpavlin 352 # do actual replacement of placeholders
518 dpavlin 373 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
519 dpavlin 352
520 dpavlin 353 if ($found) {
521 dpavlin 373 $log->debug("format: $format");
522 dpavlin 359 if ($eval_code) {
523     my $eval = $self->fill_in($rec,$eval_code,$i);
524 dpavlin 371 return if (! $self->_eval($eval));
525 dpavlin 359 }
526 dpavlin 353 # do we have lookups?
527 dpavlin 373 if ($format =~ /$LOOKUP_REGEX/o) {
528     $log->debug("format '$format' has lookup");
529 dpavlin 353 return $self->lookup($format);
530     } else {
531     return $format;
532     }
533 dpavlin 352 } else {
534     return;
535     }
536     }
537    
538     =head2 lookup
539    
540 dpavlin 355 Perform lookups on format supplied to it.
541 dpavlin 352
542 dpavlin 367 my $text = $self->lookup('[v900]');
543 dpavlin 352
544 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
545    
546 dpavlin 352 =cut
547    
548     sub lookup {
549     my $self = shift;
550    
551 dpavlin 372 my $log = $self->_get_logger();
552 dpavlin 352
553 dpavlin 372 my $tmp = shift || $log->logconfess("need format");
554    
555 dpavlin 373 if ($tmp =~ /$LOOKUP_REGEX/o) {
556 dpavlin 352 my @in = ( $tmp );
557 dpavlin 372
558     $log->debug("lookup for: ",$tmp);
559    
560 dpavlin 352 my @out;
561     while (my $f = shift @in) {
562 dpavlin 373 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
563 dpavlin 352 my $k = $1;
564     if ($self->{'lookup'}->{$k}) {
565     foreach my $nv (@{$self->{'lookup'}->{$k}}) {
566     my $tmp2 = $f;
567 dpavlin 373 $tmp2 =~ s/lookup{$k}/$nv/g;
568 dpavlin 352 push @in, $tmp2;
569     }
570     } else {
571     undef $f;
572     }
573     } elsif ($f) {
574     push @out, $f;
575     }
576     }
577 dpavlin 373 $log->logconfess("return is array and it's not expected!") unless wantarray;
578 dpavlin 352 return @out;
579     } else {
580     return $tmp;
581     }
582     }
583    
584 dpavlin 356 =head2 parse
585    
586     Perform smart parsing of string, skipping delimiters for fields which aren't
587     defined. It can also eval code in format starting with C<eval{...}> and
588     return output or nothing depending on eval code.
589    
590 dpavlin 367 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
591 dpavlin 356
592     =cut
593    
594     sub parse {
595     my $self = shift;
596    
597 dpavlin 366 my ($rec, $format_utf8, $i) = @_;
598 dpavlin 356
599 dpavlin 366 return if (! $format_utf8);
600    
601 dpavlin 372 my $log = $self->_get_logger();
602 dpavlin 358
603 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
604     $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
605    
606 dpavlin 358 $i = 0 if (! $i);
607    
608 dpavlin 375 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
609 dpavlin 366
610 dpavlin 356 my @out;
611    
612 dpavlin 373 $log->debug("format: $format");
613    
614 dpavlin 356 my $eval_code;
615     # remove eval{...} from beginning
616     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
617    
618 dpavlin 358 my $prefix;
619     my $all_found=0;
620 dpavlin 356
621 dpavlin 373 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
622 dpavlin 356
623 dpavlin 358 my $del = $1 || '';
624 dpavlin 359 $prefix ||= $del if ($all_found == 0);
625 dpavlin 358
626     my $found = 0;
627     my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
628    
629 dpavlin 356 if ($found) {
630 dpavlin 358 push @out, $del;
631     push @out, $tmp;
632     $all_found += $found;
633 dpavlin 356 }
634     }
635    
636 dpavlin 358 return if (! $all_found);
637 dpavlin 356
638 dpavlin 373 my $out = join('',@out);
639 dpavlin 358
640 dpavlin 373 if ($out) {
641     # add rest of format (suffix)
642     $out .= $format;
643 dpavlin 367
644 dpavlin 373 # add prefix if not there
645     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
646    
647     $log->debug("result: $out");
648     }
649    
650 dpavlin 359 if ($eval_code) {
651     my $eval = $self->fill_in($rec,$eval_code,$i);
652 dpavlin 373 $log->debug("about to eval{",$eval,"} format: $out");
653 dpavlin 371 return if (! $self->_eval($eval));
654 dpavlin 359 }
655    
656 dpavlin 358 return $out;
657 dpavlin 356 }
658    
659 dpavlin 367 =head2 parse_to_arr
660    
661     Similar to C<parse>, but returns array of all repeatable fields
662    
663     my @arr = $webpac->parse_to_arr($rec,'v250^a');
664    
665     =cut
666    
667     sub parse_to_arr {
668     my $self = shift;
669    
670     my ($rec, $format_utf8) = @_;
671    
672 dpavlin 372 my $log = $self->_get_logger();
673    
674     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
675 dpavlin 367 return if (! $format_utf8);
676    
677     my $i = 0;
678     my @arr;
679    
680     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
681     push @arr, $v;
682     }
683    
684 dpavlin 373 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
685    
686 dpavlin 367 return @arr;
687     }
688    
689 dpavlin 373 =head2 fill_in_to_arr
690    
691     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
692     for fields which have lookups, so they shouldn't be parsed but rather
693     C<fill_id>ed.
694    
695     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
696    
697     =cut
698    
699     sub fill_in_to_arr {
700     my $self = shift;
701    
702     my ($rec, $format_utf8) = @_;
703    
704     my $log = $self->_get_logger();
705    
706     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
707     return if (! $format_utf8);
708    
709     my $i = 0;
710     my @arr;
711    
712     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
713     push @arr, @v;
714     }
715    
716     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
717    
718     return @arr;
719     }
720    
721    
722 dpavlin 366 =head2 data_structure
723    
724     Create in-memory data structure which represents layout from C<import_xml>.
725     It is used later to produce output.
726    
727 dpavlin 368 my @ds = $webpac->data_structure($rec);
728 dpavlin 366
729 dpavlin 374 This method will also set C<$webpac->{'currnet_filename'}> if there is
730 dpavlin 398 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
731     <headline> tag.
732 dpavlin 374
733 dpavlin 366 =cut
734    
735 dpavlin 372 sub data_structure {
736 dpavlin 366 my $self = shift;
737    
738 dpavlin 372 my $log = $self->_get_logger();
739 dpavlin 366
740     my $rec = shift;
741 dpavlin 372 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
742 dpavlin 366
743 dpavlin 374 undef $self->{'currnet_filename'};
744 dpavlin 398 undef $self->{'headline'};
745 dpavlin 374
746 dpavlin 366 my @sorted_tags;
747     if ($self->{tags_by_order}) {
748     @sorted_tags = @{$self->{tags_by_order}};
749     } else {
750     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
751     $self->{tags_by_order} = \@sorted_tags;
752     }
753    
754 dpavlin 368 my @ds;
755 dpavlin 366
756 dpavlin 373 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
757    
758 dpavlin 366 foreach my $field (@sorted_tags) {
759    
760     my $row;
761    
762     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
763    
764     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
765 dpavlin 373 my $format = $tag->{'value'} || $tag->{'content'};
766 dpavlin 366
767 dpavlin 373 $log->debug("format: $format");
768    
769     my @v;
770     if ($format =~ /$LOOKUP_REGEX/o) {
771     @v = $self->fill_in_to_arr($rec,$format);
772     } else {
773     @v = $self->parse_to_arr($rec,$format);
774     }
775 dpavlin 367 next if (! @v);
776 dpavlin 366
777 dpavlin 375 # use format?
778     if ($tag->{'format_name'}) {
779     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
780     }
781    
782 dpavlin 398 if ($field eq 'filename') {
783     $self->{'current_filename'} = join('',@v);
784     $log->debug("filename: ",$self->{'current_filename'});
785     } elsif ($field eq 'headline') {
786     $self->{'headline'} .= join('',@v);
787     $log->debug("headline: ",$self->{'headline'});
788     next; # don't return headline in data_structure!
789     }
790    
791 dpavlin 366 # does tag have type?
792     if ($tag->{'type'}) {
793 dpavlin 367 push @{$row->{$tag->{'type'}}}, @v;
794 dpavlin 366 } else {
795 dpavlin 367 push @{$row->{'display'}}, @v;
796     push @{$row->{'swish'}}, @v;
797 dpavlin 366 }
798 dpavlin 373
799 dpavlin 374
800 dpavlin 366 }
801    
802 dpavlin 368 if ($row) {
803     $row->{'tag'} = $field;
804 dpavlin 375
805     # TODO: name_sigular, name_plural
806     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
807     $row->{'name'} = $name ? $self->_x($name) : $field;
808    
809 dpavlin 368 push @ds, $row;
810 dpavlin 374
811 dpavlin 373 $log->debug("row $field: ",sub { Dumper($row) });
812 dpavlin 368 }
813 dpavlin 366
814     }
815    
816 dpavlin 370 return @ds;
817 dpavlin 366
818     }
819    
820 dpavlin 370 =head2 output
821    
822     Create output from in-memory data structure using Template Toolkit template.
823    
824     my $text = $webpac->output( template => 'text.tt', data => @ds );
825    
826     =cut
827    
828     sub output {
829     my $self = shift;
830    
831     my $args = {@_};
832    
833 dpavlin 372 my $log = $self->_get_logger();
834 dpavlin 370
835 dpavlin 372 $log->logconfess("need template name") if (! $args->{'template'});
836     $log->logconfess("need data array") if (! $args->{'data'});
837    
838 dpavlin 370 my $out;
839    
840     $self->{'tt'}->process(
841     $args->{'template'},
842     $args,
843     \$out
844     ) || confess $self->{'tt'}->error();
845    
846     return $out;
847     }
848    
849 dpavlin 411 =head2 output_file
850    
851     Create output from in-memory data structure using Template Toolkit template
852     to a file.
853    
854     $webpac->output_file(
855     file => 'out.txt',
856     template => 'text.tt',
857     data => @ds
858     );
859    
860     =cut
861    
862     sub output_file {
863     my $self = shift;
864    
865     my $args = {@_};
866    
867     my $log = $self->_get_logger();
868    
869 dpavlin 421 my $file = $args->{'file'} || $log->logconfess("need file name");
870 dpavlin 411
871 dpavlin 421 $log->debug("creating file ",$file);
872 dpavlin 411
873 dpavlin 421 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
874 dpavlin 411 print $fh $self->output(
875     template => $args->{'template'},
876     data => $args->{'data'},
877     ) || $log->logdie("print: $!");
878     close($fh) || $log->logdie("close: $!");
879     }
880    
881 dpavlin 375 =head2 apply_format
882    
883     Apply format specified in tag with C<format_name="name"> and
884     C<format_delimiter=";;">.
885    
886     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
887    
888     Formats can contain C<lookup{...}> if you need them.
889    
890     =cut
891    
892     sub apply_format {
893     my $self = shift;
894    
895     my ($name,$delimiter,$data) = @_;
896    
897     my $log = $self->_get_logger();
898    
899     if (! $self->{'import_xml'}->{'format'}->{$name}) {
900     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
901     return $data;
902     }
903    
904     $log->warn("no delimiter for format $name") if (! $delimiter);
905    
906     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
907    
908     my @data = split(/\Q$delimiter\E/, $data);
909    
910     my $out = sprintf($format, @data);
911     $log->debug("using format $name [$format] on $data to produce: $out");
912    
913     if ($out =~ m/$LOOKUP_REGEX/o) {
914     return $self->lookup($out);
915     } else {
916     return $out;
917     }
918    
919     }
920    
921    
922 dpavlin 372 #
923     #
924     #
925    
926     =head1 INTERNAL METHODS
927    
928     Here is a quick list of internal methods, mostly useful to turn debugging
929     on them (see L<LOGGING> below for explanation).
930    
931     =cut
932    
933     =head2 _eval
934    
935     Internal function to eval code without C<strict 'subs'>.
936    
937     =cut
938    
939     sub _eval {
940     my $self = shift;
941    
942     my $code = shift || return;
943    
944     my $log = $self->_get_logger();
945    
946     no strict 'subs';
947     my $ret = eval $code;
948     if ($@) {
949     $log->error("problem with eval code [$code]: $@");
950     }
951    
952     $log->debug("eval: ",$code," [",$ret,"]");
953    
954     return $ret || 0;
955     }
956    
957     =head2 _sort_by_order
958    
959     Sort xml tags data structure accoding to C<order=""> attribute.
960    
961     =cut
962    
963     sub _sort_by_order {
964     my $self = shift;
965    
966     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
967     $self->{'import_xml'}->{'indexer'}->{$a};
968     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
969     $self->{'import_xml'}->{'indexer'}->{$b};
970    
971     return $va <=> $vb;
972     }
973    
974 dpavlin 375 =head2 _get_logger
975    
976     Get C<Log::Log4perl> object with a twist: domains are defined for each
977     method
978    
979     my $log = $webpac->_get_logger();
980    
981     =cut
982    
983 dpavlin 372 sub _get_logger {
984     my $self = shift;
985    
986 dpavlin 374 my $name = (caller(1))[3] || caller;
987     return get_logger($name);
988 dpavlin 372 }
989    
990 dpavlin 375 =head2 _x
991    
992     Convert string from UTF-8 to code page defined in C<import_xml>.
993    
994     my $text = $webpac->_x('utf8 text');
995    
996     =cut
997    
998     sub _x {
999     my $self = shift;
1000     my $utf8 = shift || return;
1001    
1002     return $self->{'utf2cp'}->convert($utf8) ||
1003     $self->_get_logger()->logwarn("can't convert '$utf8'");
1004     }
1005    
1006 dpavlin 372 #
1007     #
1008     #
1009    
1010     =head1 LOGGING
1011    
1012     Logging in WebPAC is performed by L<Log::Log4perl> with config file
1013     C<log.conf>.
1014    
1015     Methods defined above have different levels of logging, so
1016     it's descriptions will be useful to turn (mostry B<debug> logging) on
1017     or off to see why WabPAC isn't perforing as you expect it (it might even
1018     be a bug!).
1019    
1020     B<This is different from normal Log4perl behaviour>. To repeat, you can
1021     also use method names, and not only classes (which are just few)
1022     to filter logging.
1023    
1024     =cut
1025    
1026 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26