/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 366 by dpavlin, Thu Jun 17 01:44:25 2004 UTC revision 563 by dpavlin, Sat Oct 30 23:58:36 2004 UTC
# Line 1  Line 1 
1  package WebPAC;  package WebPAC;
2    
3    use warnings;
4    use strict;
5    
6  use Carp;  use Carp;
7  use Text::Iconv;  use Text::Iconv;
8  use Config::IniFiles;  use Config::IniFiles;
9  use XML::Simple;  use XML::Simple;
10    use Template;
11    use Log::Log4perl qw(get_logger :levels);
12    use Time::HiRes qw(time);
13    
14  use Data::Dumper;  use Data::Dumper;
15    
16    #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17    #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18    my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19    my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20    
21  =head1 NAME  =head1 NAME
22    
23  WebPAC - base class for WebPAC  WebPAC - base class for WebPAC
# Line 19  This module implements methods used by W Line 30  This module implements methods used by W
30    
31  =head2 new  =head2 new
32    
33  This will create new instance of WebPAC using configuration specified by C<config_file>.  Create new instance of WebPAC using configuration specified by C<config_file>.
34    
35   my $webpac = new WebPAC(   my $webpac = new WebPAC(
36          config_file => 'name.conf',          config_file => 'name.conf',
37          [code_page => 'ISO-8859-2',]          code_page => 'ISO-8859-2',
38            low_mem => 1,
39            filter => {
40                    'lower' => sub { lc($_[0]) },
41            },
42   );   );
43    
44  Default C<code_page> is C<ISO-8859-2>.  Default C<code_page> is C<ISO-8859-2>.
45    
46  It will also read configuration files  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
47    
48    There is optinal parametar C<filter> which specify different filters which
49    can be applied using C<filter{name}> notation.
50    Same filters can be used in Template Toolkit files.
51    
52    This method will also read configuration files
53  C<global.conf> (used by indexer and Web font-end)  C<global.conf> (used by indexer and Web font-end)
54  and configuration file specified by C<config_file>  and configuration file specified by C<config_file>
55  which describes databases to be indexed.  which describes databases to be indexed.
# Line 49  sub new { Line 70  sub new {
70          my $self = {@_};          my $self = {@_};
71          bless($self, $class);          bless($self, $class);
72    
73            $self->{'start_t'} = time();
74    
75            my $log_file = $self->{'log'} || "log.conf";
76            Log::Log4perl->init($log_file);
77    
78            my $log = $self->_get_logger();
79    
80          # fill in default values          # fill in default values
81          # output codepage          # output codepage
82          $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});          $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
# Line 56  sub new { Line 84  sub new {
84          #          #
85          # read global.conf          # read global.conf
86          #          #
87            $log->debug("read 'global.conf'");
88    
89          $self->{global_config_file} = new Config::IniFiles( -file => 'global.conf' ) || croak "can't open 'global.conf'";          my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
90    
91          # read global config parametars          # read global config parametars
92          foreach my $var (qw(          foreach my $var (qw(
# Line 67  sub new { Line 96  sub new {
96                          dbi_passwd                          dbi_passwd
97                          show_progress                          show_progress
98                          my_unac_filter                          my_unac_filter
99                            output_template
100                  )) {                  )) {
101                  $self->{global_config}->{$var} = $self->{global_config_file}->val('global', $var);                  $self->{'global_config'}->{$var} = $config->val('global', $var);
102          }          }
103    
104          #          #
105          # read indexer config file          # read indexer config file
106          #          #
107    
108          $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || croak "can't open '$self->{config_file}'";          $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
109    
110            # create UTF-8 convertor for import_xml files
111          $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});          $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
112    
113            # create Template toolkit instance
114            $self->{'tt'} = Template->new(
115                    INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
116                    FILTERS => $self->{'filter'},
117                    EVAL_PERL => 1,
118            );
119    
120            # running with low_mem flag? well, use DBM::Deep then.
121            if ($self->{'low_mem'}) {
122                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
123    
124                    my $db_file = "data.db";
125    
126                    if (-e $db_file) {
127                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
128                            $log->debug("removed '$db_file' from last run");
129                    }
130    
131                    require DBM::Deep;
132    
133                    my $db = new DBM::Deep $db_file;
134    
135                    $log->logdie("DBM::Deep error: $!") unless ($db);
136    
137                    if ($db->error()) {
138                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
139                    } else {
140                            $log->debug("using file '$db_file' for DBM::Deep");
141                    }
142    
143                    $self->{'db'} = $db;
144            }
145    
146            $log->debug("filters defined: ",Dumper($self->{'filter'}));
147    
148          return $self;          return $self;
149  }  }
150    
# Line 88  Open CDS/ISIS database using OpenIsis mo Line 155  Open CDS/ISIS database using OpenIsis mo
155   $webpac->open_isis(   $webpac->open_isis(
156          filename => '/data/ISIS/ISIS',          filename => '/data/ISIS/ISIS',
157          code_page => '852',          code_page => '852',
158          limit_mfn => '500',          limit_mfn => 500,
159            start_mfn => 6000,
160          lookup => [ ... ],          lookup => [ ... ],
161   );   );
162    
163  By default, ISIS code page is assumed to be C<852>.  By default, ISIS code page is assumed to be C<852>.
164    
165    If optional parametar C<start_mfn> is set, this will be first MFN to read
166    from database (so you can skip beginning of your database if you need to).
167    
168  If optional parametar C<limit_mfn> is set, it will read just 500 records  If optional parametar C<limit_mfn> is set, it will read just 500 records
169  from database in example above.  from database in example above.
170    
 Returns number of last record read into memory (size of database, really).  
   
171  C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and  C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
172  C<val>. Optional parametar C<eval> is perl code to evaluate before storing  C<val>. Optional parametar C<eval> is perl code to evaluate before storing
173  value in index.  value in index.
# Line 110  value in index. Line 179  value in index.
179      'val' => 'v900' },      'val' => 'v900' },
180   ]   ]
181    
182    Returns number of last record read into memory (size of database, really).
183    
184  =cut  =cut
185    
186  sub open_isis {  sub open_isis {
187          my $self = shift;          my $self = shift;
188          my $arg = {@_};          my $arg = {@_};
189    
190          croak "need filename" if (! $arg->{'filename'});          my $log = $self->_get_logger();
191    
192            $log->logcroak("need filename") if (! $arg->{'filename'});
193          my $code_page = $arg->{'code_page'} || '852';          my $code_page = $arg->{'code_page'} || '852';
194    
195            $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
196    
197            # store data in object
198            $self->{'isis_filename'} = $arg->{'filename'};
199            $self->{'isis_code_page'} = $code_page;
200    
201          use OpenIsis;          use OpenIsis;
202    
203          #$self->{'isis_code_page'} = $code_page;          #$self->{'isis_code_page'} = $code_page;
# Line 126  sub open_isis { Line 205  sub open_isis {
205          # create Text::Iconv object          # create Text::Iconv object
206          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
207    
208          print STDERR "reading ISIS database '",$arg->{'filename'},"'\n" if ($self->{'debug'});          $log->info("reading ISIS database '",$arg->{'filename'},"'");
209            $log->debug("isis code page: $code_page");
210    
211          my $isis_db = OpenIsis::open($arg->{'filename'});          my $isis_db = OpenIsis::open($arg->{'filename'});
212    
213          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
214            my $startmfn = 1;
215    
216          $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});          if (my $s = $self->{'start_mfn'}) {
217                    $log->info("skipping to MFN $s");
218                    $startmfn = $s;
219            } else {
220                    $self->{'start_mfn'} = $startmfn;
221            }
222    
223            $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
224    
225          print STDERR "processing $maxmfn records...\n" if ($self->{'debug'});          $log->info("processing ",($maxmfn-$startmfn)." records...");
226    
227          # read database          # read database
228          for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {          for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
229    
230    
231                    $log->debug("mfn: $mfn\n");
232    
233                    my $rec;
234    
235                  # read record                  # read record
236                  my $row = OpenIsis::read( $isis_db, $mfn );                  my $row = OpenIsis::read( $isis_db, $mfn );
# Line 156  sub open_isis { Line 249  sub open_isis {
249                                                  $val = $l;                                                  $val = $l;
250                                          }                                          }
251    
252                                          push @{$self->{'data'}->{$mfn}->{$k}}, $val;                                          push @{$rec->{$k}}, $val;
253                                  }                                  }
254                            } else {
255                                    push @{$rec->{'000'}}, $mfn;
256                          }                          }
257    
258                  }                  }
259    
260                    $log->confess("record $mfn empty?") unless ($rec);
261    
262                    # store
263                    if ($self->{'low_mem'}) {
264                            $self->{'db'}->put($mfn, $rec);
265                    } else {
266                            $self->{'data'}->{$mfn} = $rec;
267                    }
268    
269                  # create lookup                  # create lookup
                 my $rec = $self->{'data'}->{$mfn};  
270                  $self->create_lookup($rec, @{$arg->{'lookup'}});                  $self->create_lookup($rec, @{$arg->{'lookup'}});
271    
272                    $self->progress_bar($mfn,$maxmfn);
273    
274          }          }
275    
276          $self->{'current_mfn'} = 1;          $self->{'current_mfn'} = -1;
277            $self->{'last_pcnt'} = 0;
278    
279            $log->debug("max mfn: $maxmfn");
280    
281          # store max mfn and return it.          # store max mfn and return it.
282          return $self->{'max_mfn'} = $maxmfn;          return $self->{'max_mfn'} = $maxmfn;
# Line 186  it's implemented, that is). Line 294  it's implemented, that is).
294  sub fetch_rec {  sub fetch_rec {
295          my $self = shift;          my $self = shift;
296    
297          my $mfn = $self->{'current_mfn'}++ || confess "it seems that you didn't load database!";          my $log = $self->_get_logger();
298    
299            $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
300    
301            if ($self->{'current_mfn'} == -1) {
302                    $self->{'current_mfn'} = $self->{'start_mfn'};
303            } else {
304                    $self->{'current_mfn'}++;
305            }
306    
307            my $mfn = $self->{'current_mfn'};
308    
309          if ($mfn > $self->{'max_mfn'}) {          if ($mfn > $self->{'max_mfn'}) {
310                  $self->{'current_mfn'} = $self->{'max_mfn'};                  $self->{'current_mfn'} = $self->{'max_mfn'};
311                    $log->debug("at EOF");
312                  return;                  return;
313          }          }
314    
315          return $self->{'data'}->{$mfn};          $self->progress_bar($mfn,$self->{'max_mfn'});
316    
317            if ($self->{'low_mem'}) {
318                    return $self->{'db'}->get($mfn);
319            } else {
320                    return $self->{'data'}->{$mfn};
321            }
322    }
323    
324    =head2 mfn
325    
326    Returns current record number (MFN).
327    
328     print $webpac->mfn;
329    
330    =cut
331    
332    sub mfn {
333            my $self = shift;
334            return $self->{'current_mfn'};
335    }
336    
337    =head2 progress_bar
338    
339    Draw progress bar on STDERR.
340    
341     $webpac->progress_bar($current, $max);
342    
343    =cut
344    
345    sub progress_bar {
346            my $self = shift;
347    
348            my ($curr,$max) = @_;
349    
350            my $log = $self->_get_logger();
351    
352            $log->logconfess("no current value!") if (! $curr);
353            $log->logconfess("no maximum value!") if (! $max);
354    
355            if ($curr > $max) {
356                    $max = $curr;
357                    $log->debug("overflow to $curr");
358            }
359    
360            $self->{'last_pcnt'} ||= 1;
361    
362            my $p = int($curr * 100 / $max) || 1;
363    
364            # reset on re-run
365            if ($p < $self->{'last_pcnt'}) {
366                    $self->{'last_pcnt'} = $p;
367                    $self->{'last_t'} = time();
368                    $self->{'last_curr'} = undef;
369            }
370    
371            $self->{'last_t'} ||= time();
372    
373            if ($p != $self->{'last_pcnt'}) {
374    
375                    my $last_curr = $self->{'last_curr'} || $curr;
376                    my $t = time();
377                    my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
378                    my $eta = ($max-$curr) / ($rate || 1);
379                    printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
380                    $self->{'last_pcnt'} = $p;
381                    $self->{'last_t'} = time();
382                    $self->{'last_curr'} = $curr;
383            }
384            print STDERR "\n" if ($p == 100);
385    }
386    
387    =head2 fmt_time
388    
389    Format time (in seconds) for display.
390    
391     print $webpac->fmt_time(time());
392    
393    This method is called by L<progress_bar> to display remaining time.
394    
395    =cut
396    
397    sub fmt_time {
398            my $self = shift;
399    
400            my $t = shift || 0;
401            my $out = "";
402    
403            my ($ss,$mm,$hh) = gmtime($t);
404            $out .= "${hh}h" if ($hh);
405            $out .= sprintf("%02d:%02d", $mm,$ss);
406            $out .= "  " if ($hh == 0);
407            return $out;
408  }  }
409    
410  =head2 open_import_xml  =head2 open_import_xml
# Line 207  Read file from C<import_xml/> directory Line 418  Read file from C<import_xml/> directory
418  sub open_import_xml {  sub open_import_xml {
419          my $self = shift;          my $self = shift;
420    
421            my $log = $self->_get_logger();
422    
423          my $arg = {@_};          my $arg = {@_};
424          confess "need type to load file from import_xml/" if (! $arg->{'type'});          $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
425    
426          $self->{'type'} = $arg->{'type'};          $self->{'type'} = $arg->{'type'};
427    
# Line 217  sub open_import_xml { Line 430  sub open_import_xml {
430    
431          $self->{'tag'} = $type2tag{$type_base};          $self->{'tag'} = $type2tag{$type_base};
432    
433          print STDERR "using type ",$self->{'type'}," tag ",$self->{'tag'},"\n" if ($self->{'debug'});          $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
434    
435          my $f = "./import_xml/".$self->{'type'}.".xml";          my $f = "./import_xml/".$self->{'type'}.".xml";
436          confess "import_xml file '$f' doesn't exist!" if (! -e "$f");          $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
437    
438          print STDERR "reading '$f'\n" if ($self->{'debug'});          $log->info("reading '$f'");
439    
440            $self->{'import_xml_file'} = $f;
441    
442          $self->{'import_xml'} = XMLin($f,          $self->{'import_xml'} = XMLin($f,
443                  ForceArray => [ $self->{'tag'}, 'config', 'format' ],                  ForceArray => [ $self->{'tag'}, 'config', 'format' ],
                 ForceContent => 1  
444          );          );
445    
446          print Dumper($self->{'import_xml'});          $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
447    
448  }  }
449    
# Line 237  sub open_import_xml { Line 451  sub open_import_xml {
451    
452  Create lookup from record using lookup definition.  Create lookup from record using lookup definition.
453    
454     $self->create_lookup($rec, @lookups);
455    
456    Called internally by C<open_*> methods.
457    
458  =cut  =cut
459    
460  sub create_lookup {  sub create_lookup {
461          my $self = shift;          my $self = shift;
462    
463          my $rec = shift || confess "need record to create lookup";          my $log = $self->_get_logger();
464          confess("need HASH as first argument!") if ($rec !~ /HASH/o);  
465            my $rec = shift || $log->logconfess("need record to create lookup");
466            $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
467    
468          foreach my $i (@_) {          foreach my $i (@_) {
469                  if ($i->{'eval'}) {                  $log->logconfess("need key") unless defined($i->{'key'});
470                          my $eval = $self->fill_in($rec,$i->{'eval'});                  $log->logconfess("need val") unless defined($i->{'val'});
471                          my $key = $self->fill_in($rec,$i->{'key'});  
472                          my @val = $self->fill_in($rec,$i->{'val'});                  if (defined($i->{'eval'})) {
473                          if ($key && @val && eval $eval) {                          # eval first, so we can skip fill_in for key and val
474                            my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
475                            if ($self->_eval($eval)) {
476                                    my $key = $self->fill_in($rec,$i->{'key'}) || next;
477                                    my @val = $self->fill_in($rec,$i->{'val'}) || next;
478                                    $log->debug("stored $key = ",sub { join(" | ",@val) });
479                                  push @{$self->{'lookup'}->{$key}}, @val;                                  push @{$self->{'lookup'}->{$key}}, @val;
480                          }                          }
481                  } else {                  } else {
482                          my $key = $self->fill_in($rec,$i->{'key'});                          my $key = $self->fill_in($rec,$i->{'key'}) || next;
483                          my @val = $self->fill_in($rec,$i->{'val'});                          my @val = $self->fill_in($rec,$i->{'val'}) || next;
484                          if ($key && @val) {                          $log->debug("stored $key = ",sub { join(" | ",@val) });
485                                  push @{$self->{'lookup'}->{$key}}, @val;                          push @{$self->{'lookup'}->{$key}}, @val;
                         }  
486                  }                  }
487          }          }
488  }  }
# Line 267  sub create_lookup { Line 491  sub create_lookup {
491    
492  Returns value from record.  Returns value from record.
493    
494   $self->get_data(\$rec,$f,$sf,$i,\$found);   my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
495    
496  Arguments are:  Arguments are:
497  record reference C<$rec>,  record reference C<$rec>,
# Line 275  field C<$f>, Line 499  field C<$f>,
499  optional subfiled C<$sf>,  optional subfiled C<$sf>,
500  index for repeatable values C<$i>.  index for repeatable values C<$i>.
501    
502  Optinal variable C<$found> will be incremeted if thre  Optinal variable C<$found> will be incremeted if there
503  is field.  is field.
504    
505  Returns value or empty string.  Returns value or empty string.
# Line 286  sub get_data { Line 510  sub get_data {
510          my $self = shift;          my $self = shift;
511    
512          my ($rec,$f,$sf,$i,$found) = @_;          my ($rec,$f,$sf,$i,$found) = @_;
513    
514          if ($$rec->{$f}) {          if ($$rec->{$f}) {
515                    return '' if (! $$rec->{$f}->[$i]);
516                    no strict 'refs';
517                  if ($sf && $$rec->{$f}->[$i]->{$sf}) {                  if ($sf && $$rec->{$f}->[$i]->{$sf}) {
518                          $$found++ if (defined($$found));                          $$found++ if (defined($$found));
519                          return $$rec->{$f}->[$i]->{$sf};                          return $$rec->{$f}->[$i]->{$sf};
# Line 315  Workhourse of all: takes record from in- Line 542  Workhourse of all: takes record from in-
542  strings with placeholders and returns string or array of with substituted  strings with placeholders and returns string or array of with substituted
543  values from record.  values from record.
544    
545   $webpac->fill_in($rec,'v250^a');   my $text = $webpac->fill_in($rec,'v250^a');
546    
547  Optional argument is ordinal number for repeatable fields. By default,  Optional argument is ordinal number for repeatable fields. By default,
548  it's assume to be first repeatable field (fields are perl array, so first  it's assume to be first repeatable field (fields are perl array, so first
549  element is 0).  element is 0).
550  Following example will read second value from repeatable field.  Following example will read second value from repeatable field.
551    
552   $webpac->fill_in($rec,'Title: v250^a',1);   my $text = $webpac->fill_in($rec,'Title: v250^a',1);
553    
554  This function B<does not> perform parsing of format to inteligenty skip  This function B<does not> perform parsing of format to inteligenty skip
555  delimiters before fields which aren't used.  delimiters before fields which aren't used.
556    
557    This method will automatically decode UTF-8 string to local code page
558    if needed.
559    
560  =cut  =cut
561    
562  sub fill_in {  sub fill_in {
563          my $self = shift;          my $self = shift;
564    
565          my $rec = shift || confess "need data record";          my $log = $self->_get_logger();
566          my $format = shift || confess "need format to parse";  
567            my $rec = shift || $log->logconfess("need data record");
568            my $format = shift || $log->logconfess("need format to parse");
569          # iteration (for repeatable fields)          # iteration (for repeatable fields)
570          my $i = shift || 0;          my $i = shift || 0;
571    
572            $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
573    
574          # FIXME remove for speedup?          # FIXME remove for speedup?
575          confess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
576    
577            if (utf8::is_utf8($format)) {
578                    $format = $self->_x($format);
579            }
580    
581          my $found = 0;          my $found = 0;
582    
# Line 346  sub fill_in { Line 584  sub fill_in {
584          # remove eval{...} from beginning          # remove eval{...} from beginning
585          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
586    
587            my $filter_name;
588            # remove filter{...} from beginning
589            $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
590    
591          # do actual replacement of placeholders          # do actual replacement of placeholders
592          $format =~ s/v(\d+)(?:\^(\w))*/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;          # repeatable fields
593            $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
594            # non-repeatable fields
595            $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
596    
597          if ($found) {          if ($found) {
598                    $log->debug("format: $format");
599                  if ($eval_code) {                  if ($eval_code) {
600                          my $eval = $self->fill_in($rec,$eval_code,$i);                          my $eval = $self->fill_in($rec,$eval_code,$i);
601                          return if (! eval $eval);                          return if (! $self->_eval($eval));
602                    }
603                    if ($filter_name && $self->{'filter'}->{$filter_name}) {
604                            $log->debug("filter '$filter_name' for $format");
605                            $format = $self->{'filter'}->{$filter_name}->($format);
606                            return unless(defined($format));
607                            $log->debug("filter result: $format");
608                  }                  }
609                  # do we have lookups?                  # do we have lookups?
610                  if ($format =~ /\[[^\[\]]+\]/o) {                  if ($format =~ /$LOOKUP_REGEX/o) {
611                            $log->debug("format '$format' has lookup");
612                          return $self->lookup($format);                          return $self->lookup($format);
613                  } else {                  } else {
614                          return $format;                          return $format;
# Line 369  sub fill_in { Line 622  sub fill_in {
622    
623  Perform lookups on format supplied to it.  Perform lookups on format supplied to it.
624    
625   my $txt = $self->lookup('[v900]');   my $text = $self->lookup('[v900]');
626    
627  Lookups can be nested (like C<[d:[a:[v900]]]>).  Lookups can be nested (like C<[d:[a:[v900]]]>).
628    
# Line 378  Lookups can be nested (like C<[d:[a:[v90 Line 631  Lookups can be nested (like C<[d:[a:[v90
631  sub lookup {  sub lookup {
632          my $self = shift;          my $self = shift;
633    
634          my $tmp = shift || confess "need format";          my $log = $self->_get_logger();
635    
636            my $tmp = shift || $log->logconfess("need format");
637    
638          if ($tmp =~ /\[[^\[\]]+\]/o) {          if ($tmp =~ /$LOOKUP_REGEX/o) {
639                  my @in = ( $tmp );                  my @in = ( $tmp );
640  #print "##lookup $tmp\n";  
641                    $log->debug("lookup for: ",$tmp);
642    
643                  my @out;                  my @out;
644                  while (my $f = shift @in) {                  while (my $f = shift @in) {
645                          if ($f =~ /\[([^\[\]]+)\]/) {                          if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
646                                  my $k = $1;                                  my $k = $1;
647                                  if ($self->{'lookup'}->{$k}) {                                  if ($self->{'lookup'}->{$k}) {
 #print "## lookup key = $k\n";  
648                                          foreach my $nv (@{$self->{'lookup'}->{$k}}) {                                          foreach my $nv (@{$self->{'lookup'}->{$k}}) {
649                                                  my $tmp2 = $f;                                                  my $tmp2 = $f;
650                                                  $tmp2 =~ s/\[$k\]/$nv/g;                                                  $tmp2 =~ s/lookup{$k}/$nv/g;
651                                                  push @in, $tmp2;                                                  push @in, $tmp2;
 #print "## lookup in => $tmp2\n";  
652                                          }                                          }
653                                  } else {                                  } else {
654                                          undef $f;                                          undef $f;
655                                  }                                  }
656                          } elsif ($f) {                          } elsif ($f) {
657                                  push @out, $f;                                  push @out, $f;
 #print "## lookup out => $f\n";  
658                          }                          }
659                  }                  }
660                    $log->logconfess("return is array and it's not expected!") unless wantarray;
661                  return @out;                  return @out;
662          } else {          } else {
663                  return $tmp;                  return $tmp;
# Line 415  Perform smart parsing of string, skippin Line 670  Perform smart parsing of string, skippin
670  defined. It can also eval code in format starting with C<eval{...}> and  defined. It can also eval code in format starting with C<eval{...}> and
671  return output or nothing depending on eval code.  return output or nothing depending on eval code.
672    
673   $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);   my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
674    
675  =cut  =cut
676    
# Line 426  sub parse { Line 681  sub parse {
681    
682          return if (! $format_utf8);          return if (! $format_utf8);
683    
684          confess("need HASH as first argument!") if ($rec !~ /HASH/o);          my $log = $self->_get_logger();
685          confess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});  
686            $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
687            $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
688    
689          $i = 0 if (! $i);          $i = 0 if (! $i);
690    
691          my $format = $self->{'utf2cp'}->convert($format_utf8) || confess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});          my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
692    
693          my @out;          my @out;
694    
695            $log->debug("format: $format");
696    
697          my $eval_code;          my $eval_code;
698          # remove eval{...} from beginning          # remove eval{...} from beginning
699          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
700    
701            my $filter_name;
702            # remove filter{...} from beginning
703            $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
704    
705          my $prefix;          my $prefix;
706          my $all_found=0;          my $all_found=0;
707    
708  #print "## $format\n";          while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
         while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))*//s) {  
 #print "## [ $1 | $2 | $3 ] $format\n";  
709    
710                  my $del = $1 || '';                  my $del = $1 || '';
711                  $prefix ||= $del if ($all_found == 0);                  $prefix ||= $del if ($all_found == 0);
712    
713                    # repeatable index
714                    my $r = $i;
715                    $r = 0 if (lc("$2") eq 's');
716    
717                  my $found = 0;                  my $found = 0;
718                  my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);                  my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
719    
720                  if ($found) {                  if ($found) {
721                          push @out, $del;                          push @out, $del;
# Line 461  sub parse { Line 726  sub parse {
726    
727          return if (! $all_found);          return if (! $all_found);
728    
729          my $out = join('',@out) . $format;          my $out = join('',@out);
730    
731            if ($out) {
732                    # add rest of format (suffix)
733                    $out .= $format;
734    
735                    # add prefix if not there
736                    $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
737    
738                    $log->debug("result: $out");
739            }
740    
         # add prefix if not there  
         $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);  
           
741          if ($eval_code) {          if ($eval_code) {
742                  my $eval = $self->fill_in($rec,$eval_code,$i);                  my $eval = $self->fill_in($rec,$eval_code,$i) || return;
743                  return if (! eval $eval);                  $log->debug("about to eval{$eval} format: $out");
744                    return if (! $self->_eval($eval));
745            }
746            
747            if ($filter_name && $self->{'filter'}->{$filter_name}) {
748                    $log->debug("about to filter{$filter_name} format: $out");
749                    $out = $self->{'filter'}->{$filter_name}->($out);
750                    return unless(defined($out));
751                    $log->debug("filter result: $out");
752          }          }
753    
754          return $out;          return $out;
755  }  }
756    
757  =head2 data_structure  =head2 parse_to_arr
758    
759  Create in-memory data structure which represents layout from C<import_xml>.  Similar to C<parse>, but returns array of all repeatable fields
 It is used later to produce output.  
760    
761   my $ds = $webpac->data_structure($rec);   my @arr = $webpac->parse_to_arr($rec,'v250^a');
762    
763  =cut  =cut
764    
765  # private method _sort_by_order  sub parse_to_arr {
 # sort subrouting using order="" attribute  
 sub _sort_by_order {  
766          my $self = shift;          my $self = shift;
767    
768          my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||          my ($rec, $format_utf8) = @_;
                 $self->{'import_xml'}->{'indexer'}->{$a};  
         my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||  
                 $self->{'import_xml'}->{'indexer'}->{$b};  
769    
770          return $va <=> $vb;          my $log = $self->_get_logger();
771    
772            $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
773            return if (! $format_utf8);
774    
775            my $i = 0;
776            my @arr;
777    
778            while (my $v = $self->parse($rec,$format_utf8,$i++)) {
779                    push @arr, $v;
780            }
781    
782            $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
783    
784            return @arr;
785  }  }
786    
787    =head2 fill_in_to_arr
788    
789    Similar to C<fill_in>, but returns array of all repeatable fields. Usable
790    for fields which have lookups, so they shouldn't be parsed but rather
791    C<fill_id>ed.
792    
793     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
794    
795    =cut
796    
797    sub fill_in_to_arr {
798            my $self = shift;
799    
800            my ($rec, $format_utf8) = @_;
801    
802            my $log = $self->_get_logger();
803    
804            $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
805            return if (! $format_utf8);
806    
807            my $i = 0;
808            my @arr;
809    
810            while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
811                    push @arr, @v;
812            }
813    
814            $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
815    
816            return @arr;
817    }
818    
819    =head2 sort_arr
820    
821    Sort array ignoring case and html in data
822    
823     my @sorted = $webpac->sort_arr(@unsorted);
824    
825    =cut
826    
827    sub sort_arr {
828            my $self = shift;
829    
830            my $log = $self->_get_logger();
831    
832            # FIXME add Schwartzian Transformation?
833    
834            my @sorted = sort {
835                    $a =~ s#<[^>]+/*>##;
836                    $b =~ s#<[^>]+/*>##;
837                    lc($b) cmp lc($a)
838            } @_;
839            $log->debug("sorted values: ",sub { join(", ",@sorted) });
840    
841            return @sorted;
842    }
843    
844    
845    =head2 data_structure
846    
847    Create in-memory data structure which represents layout from C<import_xml>.
848    It is used later to produce output.
849    
850     my @ds = $webpac->data_structure($rec);
851    
852    This method will also set C<$webpac->{'currnet_filename'}> if there is
853    <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
854    <headline> tag.
855    
856    =cut
857    
858  sub data_structure {  sub data_structure {
859          my $self = shift;          my $self = shift;
860    
861            my $log = $self->_get_logger();
862    
863          my $rec = shift;          my $rec = shift;
864          confess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
865    
866            undef $self->{'currnet_filename'};
867            undef $self->{'headline'};
868    
869          my @sorted_tags;          my @sorted_tags;
870          if ($self->{tags_by_order}) {          if ($self->{tags_by_order}) {
# Line 510  sub data_structure { Line 874  sub data_structure {
874                  $self->{tags_by_order} = \@sorted_tags;                  $self->{tags_by_order} = \@sorted_tags;
875          }          }
876    
877          my $ds;          my @ds;
878    
879            $log->debug("tags: ",sub { join(", ",@sorted_tags) });
880    
881          foreach my $field (@sorted_tags) {          foreach my $field (@sorted_tags) {
882    
883                  my $row;                  my $row;
                 my $i = 0;  
884    
885  #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});  #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
886    
887                  foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {                  foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
888                            my $format = $tag->{'value'} || $tag->{'content'};
889    
890                          my $v = $self->parse($rec,$tag->{'content'},$i);                          $log->debug("format: $format");
 print "## $i:",$tag->{'content'}," = ",($v || 'null'),"\n";  
   
                         next if (!$v || $v && $v eq '');  
891    
892                          # does tag have type?                          my @v;
893                          if ($tag->{'type'}) {                          if ($format =~ /$LOOKUP_REGEX/o) {
894                                  push @{$row->{$tag->{'type'}}}, $v;                                  @v = $self->fill_in_to_arr($rec,$format);
895                          } else {                          } else {
896                                  push @{$row->{'display'}}, $v;                                  @v = $self->parse_to_arr($rec,$format);
897                                  push @{$row->{'swish'}}, $v;                          }
898                            next if (! @v);
899    
900                            if ($tag->{'sort'}) {
901                                    @v = $self->sort_arr(@v);
902                            }
903    
904                            # use format?
905                            if ($tag->{'format_name'}) {
906                                    @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
907                            }
908    
909                            if ($field eq 'filename') {
910                                    $self->{'current_filename'} = join('',@v);
911                                    $log->debug("filename: ",$self->{'current_filename'});
912                            } elsif ($field eq 'headline') {
913                                    $self->{'headline'} .= join('',@v);
914                                    $log->debug("headline: ",$self->{'headline'});
915                                    next; # don't return headline in data_structure!
916                            }
917    
918                            # delimiter will join repeatable fields
919                            if ($tag->{'delimiter'}) {
920                                    @v = ( join($tag->{'delimiter'}, @v) );
921                            }
922    
923                            # default types
924                            my @types = qw(display swish);
925                            # override by type attribute
926                            @types = ( $tag->{'type'} ) if ($tag->{'type'});
927    
928                            foreach my $type (@types) {
929                                    # append to previous line?
930                                    $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
931                                    if ($tag->{'append'}) {
932    
933                                            # I will delimit appended part with
934                                            # delimiter (or ,)
935                                            my $d = $tag->{'delimiter'};
936                                            # default delimiter
937                                            $d ||= " ";
938    
939                                            my $last = pop @{$row->{$type}};
940                                            $d = "" if (! $last);
941                                            $last .= $d . join($d, @v);
942                                            push @{$row->{$type}}, $last;
943    
944                                    } else {
945                                            push @{$row->{$type}}, @v;
946                                    }
947                            }
948    
949    
950                    }
951    
952                    if ($row) {
953                            $row->{'tag'} = $field;
954    
955                            # TODO: name_sigular, name_plural
956                            my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
957                            $row->{'name'} = $name ? $self->_x($name) : $field;
958    
959                            # post-sort all values in field
960                            if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
961                                    $log->warn("sort at field tag not implemented");
962                          }                          }
963    
964                            push @ds, $row;
965    
966                            $log->debug("row $field: ",sub { Dumper($row) });
967                  }                  }
968    
969                  push @{$ds->{$field}}, $row if ($row);          }
970    
971            return @ds;
972    
973    }
974    
975    =head2 output
976    
977    Create output from in-memory data structure using Template Toolkit template.
978    
979    my $text = $webpac->output( template => 'text.tt', data => @ds );
980    
981    =cut
982    
983    sub output {
984            my $self = shift;
985    
986            my $args = {@_};
987    
988            my $log = $self->_get_logger();
989    
990            $log->logconfess("need template name") if (! $args->{'template'});
991            $log->logconfess("need data array") if (! $args->{'data'});
992    
993            my $out;
994    
995            $self->{'tt'}->process(
996                    $args->{'template'},
997                    $args,
998                    \$out
999            ) || confess $self->{'tt'}->error();
1000    
1001            return $out;
1002    }
1003    
1004    =head2 output_file
1005    
1006    Create output from in-memory data structure using Template Toolkit template
1007    to a file.
1008    
1009     $webpac->output_file(
1010            file => 'out.txt',
1011            template => 'text.tt',
1012            data => @ds
1013     );
1014    
1015    =cut
1016    
1017    sub output_file {
1018            my $self = shift;
1019    
1020            my $args = {@_};
1021    
1022            my $log = $self->_get_logger();
1023    
1024            my $file = $args->{'file'} || $log->logconfess("need file name");
1025    
1026            $log->debug("creating file ",$file);
1027    
1028            open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1029            print $fh $self->output(
1030                    template => $args->{'template'},
1031                    data => $args->{'data'},
1032            ) || $log->logdie("print: $!");
1033            close($fh) || $log->logdie("close: $!");
1034    }
1035    
1036    =head2 apply_format
1037    
1038    Apply format specified in tag with C<format_name="name"> and
1039    C<format_delimiter=";;">.
1040    
1041     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1042    
1043    Formats can contain C<lookup{...}> if you need them.
1044    
1045    =cut
1046    
1047    sub apply_format {
1048            my $self = shift;
1049    
1050            my ($name,$delimiter,$data) = @_;
1051    
1052            my $log = $self->_get_logger();
1053    
1054            if (! $self->{'import_xml'}->{'format'}->{$name}) {
1055                    $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1056                    return $data;
1057            }
1058    
1059            $log->warn("no delimiter for format $name") if (! $delimiter);
1060    
1061            my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1062    
1063            my @data = split(/\Q$delimiter\E/, $data);
1064    
1065            my $out = sprintf($format, @data);
1066            $log->debug("using format $name [$format] on $data to produce: $out");
1067    
1068            if ($out =~ m/$LOOKUP_REGEX/o) {
1069                    return $self->lookup($out);
1070            } else {
1071                    return $out;
1072            }
1073    
1074    }
1075    
1076    
1077    #
1078    #
1079    #
1080    
1081    =head1 INTERNAL METHODS
1082    
1083    Here is a quick list of internal methods, mostly useful to turn debugging
1084    on them (see L<LOGGING> below for explanation).
1085    
1086    =cut
1087    
1088    =head2 _eval
1089    
1090    Internal function to eval code without C<strict 'subs'>.
1091    
1092    =cut
1093    
1094    sub _eval {
1095            my $self = shift;
1096    
1097            my $code = shift || return;
1098    
1099            my $log = $self->_get_logger();
1100    
1101            no strict 'subs';
1102            my $ret = eval $code;
1103            if ($@) {
1104                    $log->error("problem with eval code [$code]: $@");
1105          }          }
1106    
1107          print Dumper($ds);          $log->debug("eval: ",$code," [",$ret,"]");
1108    
1109            return $ret || undef;
1110    }
1111    
1112    =head2 _sort_by_order
1113    
1114    Sort xml tags data structure accoding to C<order=""> attribute.
1115    
1116    =cut
1117    
1118    sub _sort_by_order {
1119            my $self = shift;
1120    
1121            my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1122                    $self->{'import_xml'}->{'indexer'}->{$a};
1123            my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1124                    $self->{'import_xml'}->{'indexer'}->{$b};
1125    
1126            return $va <=> $vb;
1127    }
1128    
1129    =head2 _get_logger
1130    
1131    Get C<Log::Log4perl> object with a twist: domains are defined for each
1132    method
1133    
1134     my $log = $webpac->_get_logger();
1135    
1136    =cut
1137    
1138    sub _get_logger {
1139            my $self = shift;
1140    
1141            my $name = (caller(1))[3] || caller;
1142            return get_logger($name);
1143  }  }
1144    
1145    =head2 _x
1146    
1147    Convert string from UTF-8 to code page defined in C<import_xml>.
1148    
1149     my $text = $webpac->_x('utf8 text');
1150    
1151    =cut
1152    
1153    sub _x {
1154            my $self = shift;
1155            my $utf8 = shift || return;
1156    
1157            return $self->{'utf2cp'}->convert($utf8) ||
1158                    $self->_get_logger()->logwarn("can't convert '$utf8'");
1159    }
1160    
1161    #
1162    #
1163    #
1164    
1165    =head1 LOGGING
1166    
1167    Logging in WebPAC is performed by L<Log::Log4perl> with config file
1168    C<log.conf>.
1169    
1170    Methods defined above have different levels of logging, so
1171    it's descriptions will be useful to turn (mostry B<debug> logging) on
1172    or off to see why WabPAC isn't perforing as you expect it (it might even
1173    be a bug!).
1174    
1175    B<This is different from normal Log4perl behaviour>. To repeat, you can
1176    also use method names, and not only classes (which are just few)
1177    to filter logging.
1178    
1179    
1180    =head1 MEMORY USAGE
1181    
1182    C<low_mem> options is double-edged sword. If enabled, WebPAC
1183    will run on memory constraint machines (which doesn't have enough
1184    physical RAM to create memory structure for whole source database).
1185    
1186    If your machine has 512Mb or more of RAM and database is around 10000 records,
1187    memory shouldn't be an issue. If you don't have enough physical RAM, you
1188    might consider using virtual memory (if your operating system is handling it
1189    well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1190    parsed structure of ISIS database (this is what C<low_mem> option does).
1191    
1192    Hitting swap at end of reading source database is probably o.k. However,
1193    hitting swap before 90% will dramatically decrease performance and you will
1194    be better off with C<low_mem> and using rest of availble memory for
1195    operating system disk cache (Linux is particuallary good about this).
1196    However, every access to database record will require disk access, so
1197    generation phase will be slower 10-100 times.
1198    
1199    Parsed structures are essential - you just have option to trade RAM memory
1200    (which is fast) for disk space (which is slow). Be sure to have planty of
1201    disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1202    
1203    However, when WebPAC is running on desktop machines (or laptops :-), it's
1204    highly undesireable for system to start swapping. Using C<low_mem> option can
1205    reduce WecPAC memory usage to around 64Mb for same database with lookup
1206    fields and sorted indexes which stay in RAM. Performance will suffer, but
1207    memory usage will really be minimal. It might be also more confortable to
1208    run WebPAC reniced on those machines.
1209    
1210    =cut
1211    
1212  1;  1;

Legend:
Removed from v.366  
changed lines
  Added in v.563

  ViewVC Help
Powered by ViewVC 1.1.26