/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 422 - (show annotations)
Sat Sep 11 08:36:38 2004 UTC (19 years, 6 months ago) by dpavlin
File size: 24276 byte(s)
better progress_bar, more documentation

1 package WebPAC;
2
3 use warnings;
4 use strict;
5
6 use Carp;
7 use Text::Iconv;
8 use Config::IniFiles;
9 use XML::Simple;
10 use Template;
11 use Log::Log4perl qw(get_logger :levels);
12 use Time::HiRes qw(time);
13
14 use Data::Dumper;
15
16 #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17 #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18 my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19 my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20
21 =head1 NAME
22
23 WebPAC - base class for WebPAC
24
25 =head1 DESCRIPTION
26
27 This module implements methods used by WebPAC.
28
29 =head1 METHODS
30
31 =head2 new
32
33 Create new instance of WebPAC using configuration specified by C<config_file>.
34
35 my $webpac = new WebPAC(
36 config_file => 'name.conf',
37 [code_page => 'ISO-8859-2',]
38 [low_mem => 1,]
39 );
40
41 Default C<code_page> is C<ISO-8859-2>.
42
43 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44
45 This method will also read configuration files
46 C<global.conf> (used by indexer and Web font-end)
47 and configuration file specified by C<config_file>
48 which describes databases to be indexed.
49
50 =cut
51
52 # mapping between data type and tag which specify
53 # format in XML file
54 my %type2tag = (
55 'isis' => 'isis',
56 # 'excel' => 'column',
57 # 'marc' => 'marc',
58 # 'feed' => 'feed'
59 );
60
61 sub new {
62 my $class = shift;
63 my $self = {@_};
64 bless($self, $class);
65
66 $self->{'start_t'} = time();
67
68 my $log_file = $self->{'log'} || "log.conf";
69 Log::Log4perl->init($log_file);
70
71 my $log = $self->_get_logger();
72
73 # fill in default values
74 # output codepage
75 $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
76
77 #
78 # read global.conf
79 #
80 $log->debug("read 'global.conf'");
81
82 my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
83
84 # read global config parametars
85 foreach my $var (qw(
86 dbi_dbd
87 dbi_dsn
88 dbi_user
89 dbi_passwd
90 show_progress
91 my_unac_filter
92 output_template
93 )) {
94 $self->{'global_config'}->{$var} = $config->val('global', $var);
95 }
96
97 #
98 # read indexer config file
99 #
100
101 $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
102
103 # create UTF-8 convertor for import_xml files
104 $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
105
106 # create Template toolkit instance
107 $self->{'tt'} = Template->new(
108 INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
109 # FILTERS => {
110 # 'foo' => \&foo_filter,
111 # },
112 EVAL_PERL => 1,
113 );
114
115 # running with low_mem flag? well, use DBM::Deep then.
116 if ($self->{'low_mem'}) {
117 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118
119 my $db_file = "data.db";
120
121 if (-e $db_file) {
122 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123 $log->debug("removed '$db_file' from last run");
124 }
125
126 use DBM::Deep;
127
128 my $db = new DBM::Deep $db_file;
129
130 $log->logdie("DBM::Deep error: $!") unless ($db);
131
132 if ($db->error()) {
133 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134 } else {
135 $log->debug("using file '$db_file' for DBM::Deep");
136 }
137
138 $self->{'db'} = $db;
139 }
140
141 return $self;
142 }
143
144 =head2 open_isis
145
146 Open CDS/ISIS database using OpenIsis module and read all records to memory.
147
148 $webpac->open_isis(
149 filename => '/data/ISIS/ISIS',
150 code_page => '852',
151 limit_mfn => '500',
152 lookup => [ ... ],
153 );
154
155 By default, ISIS code page is assumed to be C<852>.
156
157 If optional parametar C<limit_mfn> is set, it will read just 500 records
158 from database in example above.
159
160 C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
161 C<val>. Optional parametar C<eval> is perl code to evaluate before storing
162 value in index.
163
164 lookup => [
165 { 'key' => 'd:v900', 'val' => 'v250^a' },
166 { 'eval' => '"v901^a" eq "Podruèje"',
167 'key' => 'pa:v561^4:v562^4:v461^1',
168 'val' => 'v900' },
169 ]
170
171 Returns number of last record read into memory (size of database, really).
172
173 =cut
174
175 sub open_isis {
176 my $self = shift;
177 my $arg = {@_};
178
179 my $log = $self->_get_logger();
180
181 $log->logcroak("need filename") if (! $arg->{'filename'});
182 my $code_page = $arg->{'code_page'} || '852';
183
184 # store data in object
185 $self->{'isis_filename'} = $arg->{'filename'};
186 $self->{'isis_code_page'} = $code_page;
187
188 use OpenIsis;
189
190 #$self->{'isis_code_page'} = $code_page;
191
192 # create Text::Iconv object
193 my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
194
195 $log->info("reading ISIS database '",$arg->{'filename'},"'");
196 $log->debug("isis code page: $code_page");
197
198 my $isis_db = OpenIsis::open($arg->{'filename'});
199
200 my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
201
202 $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});
203
204 $log->info("processing $maxmfn records...");
205
206 # read database
207 for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
208
209
210 $log->debug("mfn: $mfn\n");
211
212 my $rec;
213
214 # read record
215 my $row = OpenIsis::read( $isis_db, $mfn );
216 foreach my $k (keys %{$row}) {
217 if ($k ne "mfn") {
218 foreach my $l (@{$row->{$k}}) {
219 $l = $cp->convert($l);
220 # has subfields?
221 my $val;
222 if ($l =~ m/\^/) {
223 foreach my $t (split(/\^/,$l)) {
224 next if (! $t);
225 $val->{substr($t,0,1)} = substr($t,1);
226 }
227 } else {
228 $val = $l;
229 }
230
231 push @{$rec->{$k}}, $val;
232 }
233 } else {
234 push @{$rec->{'000'}}, $mfn;
235 }
236
237 }
238
239 $log->confess("record $mfn empty?") unless ($rec);
240
241 # store
242 if ($self->{'low_mem'}) {
243 $self->{'db'}->put($mfn, $rec);
244 } else {
245 $self->{'data'}->{$mfn} = $rec;
246 }
247
248 # create lookup
249 $self->create_lookup($rec, @{$arg->{'lookup'}});
250
251 $self->progress_bar($mfn,$maxmfn);
252
253 }
254
255 $self->{'current_mfn'} = 1;
256 $self->{'last_pcnt'} = 0;
257
258 $log->debug("max mfn: $maxmfn");
259
260 # store max mfn and return it.
261 return $self->{'max_mfn'} = $maxmfn;
262 }
263
264 =head2 fetch_rec
265
266 Fetch next record from database. It will also display progress bar (once
267 it's implemented, that is).
268
269 my $rec = $webpac->fetch_rec;
270
271 =cut
272
273 sub fetch_rec {
274 my $self = shift;
275
276 my $log = $self->_get_logger();
277
278 my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");
279
280 if ($mfn > $self->{'max_mfn'}) {
281 $self->{'current_mfn'} = $self->{'max_mfn'};
282 $log->debug("at EOF");
283 return;
284 }
285
286 $self->progress_bar($mfn,$self->{'max_mfn'});
287
288 if ($self->{'low_mem'}) {
289 return $self->{'db'}->get($mfn);
290 } else {
291 return $self->{'data'}->{$mfn};
292 }
293 }
294
295 =head2 progress_bar
296
297 Draw progress bar on STDERR.
298
299 $webpac->progress_bar($current, $max);
300
301 =cut
302
303 sub progress_bar {
304 my $self = shift;
305
306 my ($curr,$max) = @_;
307
308 my $log = $self->_get_logger();
309
310 $log->logconfess("no current value!") if (! $curr);
311 $log->logconfess("no maximum value!") if (! $max);
312
313 if ($curr > $max) {
314 $max = $curr;
315 $log->debug("overflow to $curr");
316 }
317
318 $self->{'last_pcnt'} ||= 1;
319
320 my $p = int($curr * 100 / $max);
321
322 # reset on re-run
323 if ($p < $self->{'last_pcnt'}) {
324 $self->{'last_pcnt'} = $p;
325 $self->{'last_t'} = time();
326 $self->{'last_curr'} = 1;
327 }
328
329 if ($p != $self->{'last_pcnt'}) {
330
331 my $last_curr = $self->{'last_curr'} || $curr;
332 my $t = time();
333 my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
334 my $eta = ($max-$curr) / ($rate || 1);
335 printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
336 $self->{'last_pcnt'} = $p;
337 $self->{'last_t'} = time();
338 $self->{'last_curr'} = $curr;
339 }
340 print STDERR "\n" if ($p == 100);
341 }
342
343 =head2 fmt_time
344
345 Format time (in seconds) for display.
346
347 print $webpac->fmt_time(time());
348
349 This method is called by L<progress_bar> to display remaining time.
350
351 =cut
352
353 sub fmt_time {
354 my $self = shift;
355
356 my $t = shift || 0;
357 my $out = "";
358
359 my ($ss,$mm,$hh) = gmtime($t);
360 $out .= "${hh}h" if ($hh);
361 $out .= sprintf("%02d:%02d", $mm,$ss);
362 $out .= " " if ($hh == 0);
363 return $out;
364 }
365
366 =head2 open_import_xml
367
368 Read file from C<import_xml/> directory and parse it.
369
370 $webpac->open_import_xml(type => 'isis');
371
372 =cut
373
374 sub open_import_xml {
375 my $self = shift;
376
377 my $log = $self->_get_logger();
378
379 my $arg = {@_};
380 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
381
382 $self->{'type'} = $arg->{'type'};
383
384 my $type_base = $arg->{'type'};
385 $type_base =~ s/_.*$//g;
386
387 $self->{'tag'} = $type2tag{$type_base};
388
389 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
390
391 my $f = "./import_xml/".$self->{'type'}.".xml";
392 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
393
394 $log->info("reading '$f'");
395
396 $self->{'import_xml_file'} = $f;
397
398 $self->{'import_xml'} = XMLin($f,
399 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
400 );
401
402 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
403
404 }
405
406 =head2 create_lookup
407
408 Create lookup from record using lookup definition.
409
410 $self->create_lookup($rec, @lookups);
411
412 Called internally by C<open_*> methods.
413
414 =cut
415
416 sub create_lookup {
417 my $self = shift;
418
419 my $log = $self->_get_logger();
420
421 my $rec = shift || $log->logconfess("need record to create lookup");
422 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
423
424 foreach my $i (@_) {
425 $log->logconfess("need key") unless defined($i->{'key'});
426 $log->logconfess("need val") unless defined($i->{'val'});
427
428 if (defined($i->{'eval'})) {
429 # eval first, so we can skip fill_in for key and val
430 my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
431 if ($self->_eval($eval)) {
432 my $key = $self->fill_in($rec,$i->{'key'}) || next;
433 my @val = $self->fill_in($rec,$i->{'val'}) || next;
434 $log->debug("stored $key = ",sub { join(" | ",@val) });
435 push @{$self->{'lookup'}->{$key}}, @val;
436 }
437 } else {
438 my $key = $self->fill_in($rec,$i->{'key'}) || next;
439 my @val = $self->fill_in($rec,$i->{'val'}) || next;
440 $log->debug("stored $key = ",sub { join(" | ",@val) });
441 push @{$self->{'lookup'}->{$key}}, @val;
442 }
443 }
444 }
445
446 =head2 get_data
447
448 Returns value from record.
449
450 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
451
452 Arguments are:
453 record reference C<$rec>,
454 field C<$f>,
455 optional subfiled C<$sf>,
456 index for repeatable values C<$i>.
457
458 Optinal variable C<$found> will be incremeted if there
459 is field.
460
461 Returns value or empty string.
462
463 =cut
464
465 sub get_data {
466 my $self = shift;
467
468 my ($rec,$f,$sf,$i,$found) = @_;
469
470 if ($$rec->{$f}) {
471 return '' if (! $$rec->{$f}->[$i]);
472 no strict 'refs';
473 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
474 $$found++ if (defined($$found));
475 return $$rec->{$f}->[$i]->{$sf};
476 } elsif ($$rec->{$f}->[$i]) {
477 $$found++ if (defined($$found));
478 # it still might have subfield, just
479 # not specified, so we'll dump all
480 if ($$rec->{$f}->[$i] =~ /HASH/o) {
481 my $out;
482 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
483 $out .= $$rec->{$f}->[$i]->{$k}." ";
484 }
485 return $out;
486 } else {
487 return $$rec->{$f}->[$i];
488 }
489 }
490 } else {
491 return '';
492 }
493 }
494
495 =head2 fill_in
496
497 Workhourse of all: takes record from in-memory structure of database and
498 strings with placeholders and returns string or array of with substituted
499 values from record.
500
501 my $text = $webpac->fill_in($rec,'v250^a');
502
503 Optional argument is ordinal number for repeatable fields. By default,
504 it's assume to be first repeatable field (fields are perl array, so first
505 element is 0).
506 Following example will read second value from repeatable field.
507
508 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
509
510 This function B<does not> perform parsing of format to inteligenty skip
511 delimiters before fields which aren't used.
512
513 This method will automatically decode UTF-8 string to local code page
514 if needed.
515
516 =cut
517
518 sub fill_in {
519 my $self = shift;
520
521 my $log = $self->_get_logger();
522
523 my $rec = shift || $log->logconfess("need data record");
524 my $format = shift || $log->logconfess("need format to parse");
525 # iteration (for repeatable fields)
526 my $i = shift || 0;
527
528 # FIXME remove for speedup?
529 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
530
531 if (utf8::is_utf8($format)) {
532 $format = $self->_x($format);
533 }
534
535 my $found = 0;
536
537 my $eval_code;
538 # remove eval{...} from beginning
539 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
540
541 # do actual replacement of placeholders
542 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
543
544 if ($found) {
545 $log->debug("format: $format");
546 if ($eval_code) {
547 my $eval = $self->fill_in($rec,$eval_code,$i);
548 return if (! $self->_eval($eval));
549 }
550 # do we have lookups?
551 if ($format =~ /$LOOKUP_REGEX/o) {
552 $log->debug("format '$format' has lookup");
553 return $self->lookup($format);
554 } else {
555 return $format;
556 }
557 } else {
558 return;
559 }
560 }
561
562 =head2 lookup
563
564 Perform lookups on format supplied to it.
565
566 my $text = $self->lookup('[v900]');
567
568 Lookups can be nested (like C<[d:[a:[v900]]]>).
569
570 =cut
571
572 sub lookup {
573 my $self = shift;
574
575 my $log = $self->_get_logger();
576
577 my $tmp = shift || $log->logconfess("need format");
578
579 if ($tmp =~ /$LOOKUP_REGEX/o) {
580 my @in = ( $tmp );
581
582 $log->debug("lookup for: ",$tmp);
583
584 my @out;
585 while (my $f = shift @in) {
586 if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
587 my $k = $1;
588 if ($self->{'lookup'}->{$k}) {
589 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
590 my $tmp2 = $f;
591 $tmp2 =~ s/lookup{$k}/$nv/g;
592 push @in, $tmp2;
593 }
594 } else {
595 undef $f;
596 }
597 } elsif ($f) {
598 push @out, $f;
599 }
600 }
601 $log->logconfess("return is array and it's not expected!") unless wantarray;
602 return @out;
603 } else {
604 return $tmp;
605 }
606 }
607
608 =head2 parse
609
610 Perform smart parsing of string, skipping delimiters for fields which aren't
611 defined. It can also eval code in format starting with C<eval{...}> and
612 return output or nothing depending on eval code.
613
614 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
615
616 =cut
617
618 sub parse {
619 my $self = shift;
620
621 my ($rec, $format_utf8, $i) = @_;
622
623 return if (! $format_utf8);
624
625 my $log = $self->_get_logger();
626
627 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
628 $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
629
630 $i = 0 if (! $i);
631
632 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
633
634 my @out;
635
636 $log->debug("format: $format");
637
638 my $eval_code;
639 # remove eval{...} from beginning
640 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
641
642 my $prefix;
643 my $all_found=0;
644
645 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
646
647 my $del = $1 || '';
648 $prefix ||= $del if ($all_found == 0);
649
650 my $found = 0;
651 my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
652
653 if ($found) {
654 push @out, $del;
655 push @out, $tmp;
656 $all_found += $found;
657 }
658 }
659
660 return if (! $all_found);
661
662 my $out = join('',@out);
663
664 if ($out) {
665 # add rest of format (suffix)
666 $out .= $format;
667
668 # add prefix if not there
669 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
670
671 $log->debug("result: $out");
672 }
673
674 if ($eval_code) {
675 my $eval = $self->fill_in($rec,$eval_code,$i);
676 $log->debug("about to eval{",$eval,"} format: $out");
677 return if (! $self->_eval($eval));
678 }
679
680 return $out;
681 }
682
683 =head2 parse_to_arr
684
685 Similar to C<parse>, but returns array of all repeatable fields
686
687 my @arr = $webpac->parse_to_arr($rec,'v250^a');
688
689 =cut
690
691 sub parse_to_arr {
692 my $self = shift;
693
694 my ($rec, $format_utf8) = @_;
695
696 my $log = $self->_get_logger();
697
698 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
699 return if (! $format_utf8);
700
701 my $i = 0;
702 my @arr;
703
704 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
705 push @arr, $v;
706 }
707
708 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
709
710 return @arr;
711 }
712
713 =head2 fill_in_to_arr
714
715 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
716 for fields which have lookups, so they shouldn't be parsed but rather
717 C<fill_id>ed.
718
719 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
720
721 =cut
722
723 sub fill_in_to_arr {
724 my $self = shift;
725
726 my ($rec, $format_utf8) = @_;
727
728 my $log = $self->_get_logger();
729
730 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
731 return if (! $format_utf8);
732
733 my $i = 0;
734 my @arr;
735
736 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
737 push @arr, @v;
738 }
739
740 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
741
742 return @arr;
743 }
744
745
746 =head2 data_structure
747
748 Create in-memory data structure which represents layout from C<import_xml>.
749 It is used later to produce output.
750
751 my @ds = $webpac->data_structure($rec);
752
753 This method will also set C<$webpac->{'currnet_filename'}> if there is
754 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
755 <headline> tag.
756
757 =cut
758
759 sub data_structure {
760 my $self = shift;
761
762 my $log = $self->_get_logger();
763
764 my $rec = shift;
765 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
766
767 undef $self->{'currnet_filename'};
768 undef $self->{'headline'};
769
770 my @sorted_tags;
771 if ($self->{tags_by_order}) {
772 @sorted_tags = @{$self->{tags_by_order}};
773 } else {
774 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
775 $self->{tags_by_order} = \@sorted_tags;
776 }
777
778 my @ds;
779
780 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
781
782 foreach my $field (@sorted_tags) {
783
784 my $row;
785
786 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
787
788 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
789 my $format = $tag->{'value'} || $tag->{'content'};
790
791 $log->debug("format: $format");
792
793 my @v;
794 if ($format =~ /$LOOKUP_REGEX/o) {
795 @v = $self->fill_in_to_arr($rec,$format);
796 } else {
797 @v = $self->parse_to_arr($rec,$format);
798 }
799 next if (! @v);
800
801 # use format?
802 if ($tag->{'format_name'}) {
803 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
804 }
805
806 if ($field eq 'filename') {
807 $self->{'current_filename'} = join('',@v);
808 $log->debug("filename: ",$self->{'current_filename'});
809 } elsif ($field eq 'headline') {
810 $self->{'headline'} .= join('',@v);
811 $log->debug("headline: ",$self->{'headline'});
812 next; # don't return headline in data_structure!
813 }
814
815 # does tag have type?
816 if ($tag->{'type'}) {
817 push @{$row->{$tag->{'type'}}}, @v;
818 } else {
819 push @{$row->{'display'}}, @v;
820 push @{$row->{'swish'}}, @v;
821 }
822
823
824 }
825
826 if ($row) {
827 $row->{'tag'} = $field;
828
829 # TODO: name_sigular, name_plural
830 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
831 $row->{'name'} = $name ? $self->_x($name) : $field;
832
833 push @ds, $row;
834
835 $log->debug("row $field: ",sub { Dumper($row) });
836 }
837
838 }
839
840 return @ds;
841
842 }
843
844 =head2 output
845
846 Create output from in-memory data structure using Template Toolkit template.
847
848 my $text = $webpac->output( template => 'text.tt', data => @ds );
849
850 =cut
851
852 sub output {
853 my $self = shift;
854
855 my $args = {@_};
856
857 my $log = $self->_get_logger();
858
859 $log->logconfess("need template name") if (! $args->{'template'});
860 $log->logconfess("need data array") if (! $args->{'data'});
861
862 my $out;
863
864 $self->{'tt'}->process(
865 $args->{'template'},
866 $args,
867 \$out
868 ) || confess $self->{'tt'}->error();
869
870 return $out;
871 }
872
873 =head2 output_file
874
875 Create output from in-memory data structure using Template Toolkit template
876 to a file.
877
878 $webpac->output_file(
879 file => 'out.txt',
880 template => 'text.tt',
881 data => @ds
882 );
883
884 =cut
885
886 sub output_file {
887 my $self = shift;
888
889 my $args = {@_};
890
891 my $log = $self->_get_logger();
892
893 my $file = $args->{'file'} || $log->logconfess("need file name");
894
895 $log->debug("creating file ",$file);
896
897 open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
898 print $fh $self->output(
899 template => $args->{'template'},
900 data => $args->{'data'},
901 ) || $log->logdie("print: $!");
902 close($fh) || $log->logdie("close: $!");
903 }
904
905 =head2 apply_format
906
907 Apply format specified in tag with C<format_name="name"> and
908 C<format_delimiter=";;">.
909
910 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
911
912 Formats can contain C<lookup{...}> if you need them.
913
914 =cut
915
916 sub apply_format {
917 my $self = shift;
918
919 my ($name,$delimiter,$data) = @_;
920
921 my $log = $self->_get_logger();
922
923 if (! $self->{'import_xml'}->{'format'}->{$name}) {
924 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
925 return $data;
926 }
927
928 $log->warn("no delimiter for format $name") if (! $delimiter);
929
930 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
931
932 my @data = split(/\Q$delimiter\E/, $data);
933
934 my $out = sprintf($format, @data);
935 $log->debug("using format $name [$format] on $data to produce: $out");
936
937 if ($out =~ m/$LOOKUP_REGEX/o) {
938 return $self->lookup($out);
939 } else {
940 return $out;
941 }
942
943 }
944
945
946 #
947 #
948 #
949
950 =head1 INTERNAL METHODS
951
952 Here is a quick list of internal methods, mostly useful to turn debugging
953 on them (see L<LOGGING> below for explanation).
954
955 =cut
956
957 =head2 _eval
958
959 Internal function to eval code without C<strict 'subs'>.
960
961 =cut
962
963 sub _eval {
964 my $self = shift;
965
966 my $code = shift || return;
967
968 my $log = $self->_get_logger();
969
970 no strict 'subs';
971 my $ret = eval $code;
972 if ($@) {
973 $log->error("problem with eval code [$code]: $@");
974 }
975
976 $log->debug("eval: ",$code," [",$ret,"]");
977
978 return $ret || 0;
979 }
980
981 =head2 _sort_by_order
982
983 Sort xml tags data structure accoding to C<order=""> attribute.
984
985 =cut
986
987 sub _sort_by_order {
988 my $self = shift;
989
990 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
991 $self->{'import_xml'}->{'indexer'}->{$a};
992 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
993 $self->{'import_xml'}->{'indexer'}->{$b};
994
995 return $va <=> $vb;
996 }
997
998 =head2 _get_logger
999
1000 Get C<Log::Log4perl> object with a twist: domains are defined for each
1001 method
1002
1003 my $log = $webpac->_get_logger();
1004
1005 =cut
1006
1007 sub _get_logger {
1008 my $self = shift;
1009
1010 my $name = (caller(1))[3] || caller;
1011 return get_logger($name);
1012 }
1013
1014 =head2 _x
1015
1016 Convert string from UTF-8 to code page defined in C<import_xml>.
1017
1018 my $text = $webpac->_x('utf8 text');
1019
1020 =cut
1021
1022 sub _x {
1023 my $self = shift;
1024 my $utf8 = shift || return;
1025
1026 return $self->{'utf2cp'}->convert($utf8) ||
1027 $self->_get_logger()->logwarn("can't convert '$utf8'");
1028 }
1029
1030 #
1031 #
1032 #
1033
1034 =head1 LOGGING
1035
1036 Logging in WebPAC is performed by L<Log::Log4perl> with config file
1037 C<log.conf>.
1038
1039 Methods defined above have different levels of logging, so
1040 it's descriptions will be useful to turn (mostry B<debug> logging) on
1041 or off to see why WabPAC isn't perforing as you expect it (it might even
1042 be a bug!).
1043
1044 B<This is different from normal Log4perl behaviour>. To repeat, you can
1045 also use method names, and not only classes (which are just few)
1046 to filter logging.
1047
1048
1049 =head1 MEMORY USAGE
1050
1051 C<low_mem> options is double-edged sword. If enabled, WebPAC
1052 will run on memory constraint machines (which doesn't have enough
1053 physical RAM to create memory structure for whole source database).
1054
1055 If your machine has 512Mb or more of RAM and database is around 10000 records,
1056 memory shouldn't be an issue. If you don't have enough physical RAM, you
1057 might consider using virtual memory (if your operating system is handling it
1058 well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1059 parsed structure of ISIS database (this is what C<low_mem> option does).
1060
1061 Hitting swap at end of reading source database is probably o.k. However,
1062 hitting swap before 90% will dramatically decrease performance and you will
1063 be better off with C<low_mem> and using rest of availble memory for
1064 operating system disk cache (Linux is particuallary good about this).
1065 However, every access to database record will require disk access, so
1066 generation phase will be slower 10-100 times.
1067
1068 Parsed structures are essential - you just have option to trade RAM memory
1069 (which is fast) for disk space (which is slow). Be sure to have planty of
1070 disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1071
1072 However, when WebPAC is running on desktop machines (or laptops :-), it's
1073 highly undesireable for system to start swapping. Using C<low_mem> option can
1074 reduce WecPAC memory usage to around 64Mb for same database with lookup
1075 fields and sorted indexes which stay in RAM. Performance will suffer, but
1076 memory usage will really be minimal. It might be also more confortable to
1077 run WebPAC reniced on those machines.
1078
1079 =cut
1080
1081 1;

  ViewVC Help
Powered by ViewVC 1.1.26