/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 366 by dpavlin, Thu Jun 17 01:44:25 2004 UTC revision 453 by dpavlin, Wed Sep 15 21:21:36 2004 UTC
# Line 1  Line 1 
1  package WebPAC;  package WebPAC;
2    
3    use warnings;
4    use strict;
5    
6  use Carp;  use Carp;
7  use Text::Iconv;  use Text::Iconv;
8  use Config::IniFiles;  use Config::IniFiles;
9  use XML::Simple;  use XML::Simple;
10    use Template;
11    use Log::Log4perl qw(get_logger :levels);
12    use Time::HiRes qw(time);
13    
14  use Data::Dumper;  use Data::Dumper;
15    
16    #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17    #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18    my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19    my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20    
21  =head1 NAME  =head1 NAME
22    
23  WebPAC - base class for WebPAC  WebPAC - base class for WebPAC
# Line 19  This module implements methods used by W Line 30  This module implements methods used by W
30    
31  =head2 new  =head2 new
32    
33  This will create new instance of WebPAC using configuration specified by C<config_file>.  Create new instance of WebPAC using configuration specified by C<config_file>.
34    
35   my $webpac = new WebPAC(   my $webpac = new WebPAC(
36          config_file => 'name.conf',          config_file => 'name.conf',
37          [code_page => 'ISO-8859-2',]          code_page => 'ISO-8859-2',
38            low_mem => 1,
39   );   );
40    
41  Default C<code_page> is C<ISO-8859-2>.  Default C<code_page> is C<ISO-8859-2>.
42    
43  It will also read configuration files  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44    
45    This method will also read configuration files
46  C<global.conf> (used by indexer and Web font-end)  C<global.conf> (used by indexer and Web font-end)
47  and configuration file specified by C<config_file>  and configuration file specified by C<config_file>
48  which describes databases to be indexed.  which describes databases to be indexed.
# Line 49  sub new { Line 63  sub new {
63          my $self = {@_};          my $self = {@_};
64          bless($self, $class);          bless($self, $class);
65    
66            $self->{'start_t'} = time();
67    
68            my $log_file = $self->{'log'} || "log.conf";
69            Log::Log4perl->init($log_file);
70    
71            my $log = $self->_get_logger();
72    
73          # fill in default values          # fill in default values
74          # output codepage          # output codepage
75          $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});          $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
# Line 56  sub new { Line 77  sub new {
77          #          #
78          # read global.conf          # read global.conf
79          #          #
80            $log->debug("read 'global.conf'");
81    
82          $self->{global_config_file} = new Config::IniFiles( -file => 'global.conf' ) || croak "can't open 'global.conf'";          my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
83    
84          # read global config parametars          # read global config parametars
85          foreach my $var (qw(          foreach my $var (qw(
# Line 67  sub new { Line 89  sub new {
89                          dbi_passwd                          dbi_passwd
90                          show_progress                          show_progress
91                          my_unac_filter                          my_unac_filter
92                            output_template
93                  )) {                  )) {
94                  $self->{global_config}->{$var} = $self->{global_config_file}->val('global', $var);                  $self->{'global_config'}->{$var} = $config->val('global', $var);
95          }          }
96    
97          #          #
98          # read indexer config file          # read indexer config file
99          #          #
100    
101          $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || croak "can't open '$self->{config_file}'";          $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
102    
103            # create UTF-8 convertor for import_xml files
104          $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});          $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
105    
106            # create Template toolkit instance
107            $self->{'tt'} = Template->new(
108                    INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
109    #               FILTERS => {
110    #                       'foo' => \&foo_filter,
111    #               },
112                    EVAL_PERL => 1,
113            );
114    
115            # running with low_mem flag? well, use DBM::Deep then.
116            if ($self->{'low_mem'}) {
117                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118    
119                    my $db_file = "data.db";
120    
121                    if (-e $db_file) {
122                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123                            $log->debug("removed '$db_file' from last run");
124                    }
125    
126                    require DBM::Deep;
127    
128                    my $db = new DBM::Deep $db_file;
129    
130                    $log->logdie("DBM::Deep error: $!") unless ($db);
131    
132                    if ($db->error()) {
133                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134                    } else {
135                            $log->debug("using file '$db_file' for DBM::Deep");
136                    }
137    
138                    $self->{'db'} = $db;
139            }
140    
141          return $self;          return $self;
142  }  }
143    
# Line 88  Open CDS/ISIS database using OpenIsis mo Line 148  Open CDS/ISIS database using OpenIsis mo
148   $webpac->open_isis(   $webpac->open_isis(
149          filename => '/data/ISIS/ISIS',          filename => '/data/ISIS/ISIS',
150          code_page => '852',          code_page => '852',
151          limit_mfn => '500',          limit_mfn => 500,
152            start_mfn => 6000,
153          lookup => [ ... ],          lookup => [ ... ],
154   );   );
155    
156  By default, ISIS code page is assumed to be C<852>.  By default, ISIS code page is assumed to be C<852>.
157    
158    If optional parametar C<start_mfn> is set, this will be first MFN to read
159    from database (so you can skip beginning of your database if you need to).
160    
161  If optional parametar C<limit_mfn> is set, it will read just 500 records  If optional parametar C<limit_mfn> is set, it will read just 500 records
162  from database in example above.  from database in example above.
163    
 Returns number of last record read into memory (size of database, really).  
   
164  C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and  C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
165  C<val>. Optional parametar C<eval> is perl code to evaluate before storing  C<val>. Optional parametar C<eval> is perl code to evaluate before storing
166  value in index.  value in index.
# Line 110  value in index. Line 172  value in index.
172      'val' => 'v900' },      'val' => 'v900' },
173   ]   ]
174    
175    Returns number of last record read into memory (size of database, really).
176    
177  =cut  =cut
178    
179  sub open_isis {  sub open_isis {
180          my $self = shift;          my $self = shift;
181          my $arg = {@_};          my $arg = {@_};
182    
183          croak "need filename" if (! $arg->{'filename'});          my $log = $self->_get_logger();
184    
185            $log->logcroak("need filename") if (! $arg->{'filename'});
186          my $code_page = $arg->{'code_page'} || '852';          my $code_page = $arg->{'code_page'} || '852';
187    
188            $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
189    
190            # store data in object
191            $self->{'isis_filename'} = $arg->{'filename'};
192            $self->{'isis_code_page'} = $code_page;
193    
194          use OpenIsis;          use OpenIsis;
195    
196          #$self->{'isis_code_page'} = $code_page;          #$self->{'isis_code_page'} = $code_page;
# Line 126  sub open_isis { Line 198  sub open_isis {
198          # create Text::Iconv object          # create Text::Iconv object
199          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
200    
201          print STDERR "reading ISIS database '",$arg->{'filename'},"'\n" if ($self->{'debug'});          $log->info("reading ISIS database '",$arg->{'filename'},"'");
202            $log->debug("isis code page: $code_page");
203    
204          my $isis_db = OpenIsis::open($arg->{'filename'});          my $isis_db = OpenIsis::open($arg->{'filename'});
205    
206          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
207            my $startmfn = 1;
208    
209            if (my $s = $self->{'start_mfn'}) {
210                    $log->info("skipping to MFN $s");
211                    $startmfn = $s;
212            } else {
213                    $self->{'start_mfn'} = $startmfn;
214            }
215    
216          $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});          $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
217    
218          print STDERR "processing $maxmfn records...\n" if ($self->{'debug'});          $log->info("processing ",($maxmfn-$startmfn)." records...");
219    
220          # read database          # read database
221          for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {          for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
222    
223    
224                    $log->debug("mfn: $mfn\n");
225    
226                    my $rec;
227    
228                  # read record                  # read record
229                  my $row = OpenIsis::read( $isis_db, $mfn );                  my $row = OpenIsis::read( $isis_db, $mfn );
# Line 156  sub open_isis { Line 242  sub open_isis {
242                                                  $val = $l;                                                  $val = $l;
243                                          }                                          }
244    
245                                          push @{$self->{'data'}->{$mfn}->{$k}}, $val;                                          push @{$rec->{$k}}, $val;
246                                  }                                  }
247                            } else {
248                                    push @{$rec->{'000'}}, $mfn;
249                          }                          }
250    
251                  }                  }
252    
253                    $log->confess("record $mfn empty?") unless ($rec);
254    
255                    # store
256                    if ($self->{'low_mem'}) {
257                            $self->{'db'}->put($mfn, $rec);
258                    } else {
259                            $self->{'data'}->{$mfn} = $rec;
260                    }
261    
262                  # create lookup                  # create lookup
                 my $rec = $self->{'data'}->{$mfn};  
263                  $self->create_lookup($rec, @{$arg->{'lookup'}});                  $self->create_lookup($rec, @{$arg->{'lookup'}});
264    
265                    $self->progress_bar($mfn,$maxmfn);
266    
267          }          }
268    
269          $self->{'current_mfn'} = 1;          $self->{'current_mfn'} = -1;
270            $self->{'last_pcnt'} = 0;
271    
272            $log->debug("max mfn: $maxmfn");
273    
274          # store max mfn and return it.          # store max mfn and return it.
275          return $self->{'max_mfn'} = $maxmfn;          return $self->{'max_mfn'} = $maxmfn;
# Line 186  it's implemented, that is). Line 287  it's implemented, that is).
287  sub fetch_rec {  sub fetch_rec {
288          my $self = shift;          my $self = shift;
289    
290          my $mfn = $self->{'current_mfn'}++ || confess "it seems that you didn't load database!";          my $log = $self->_get_logger();
291    
292            $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
293    
294            if ($self->{'current_mfn'} == -1) {
295                    $self->{'current_mfn'} = $self->{'start_mfn'};
296            } else {
297                    $self->{'current_mfn'}++;
298            }
299    
300            my $mfn = $self->{'current_mfn'};
301    
302          if ($mfn > $self->{'max_mfn'}) {          if ($mfn > $self->{'max_mfn'}) {
303                  $self->{'current_mfn'} = $self->{'max_mfn'};                  $self->{'current_mfn'} = $self->{'max_mfn'};
304                    $log->debug("at EOF");
305                  return;                  return;
306          }          }
307    
308          return $self->{'data'}->{$mfn};          $self->progress_bar($mfn,$self->{'max_mfn'});
309    
310            if ($self->{'low_mem'}) {
311                    return $self->{'db'}->get($mfn);
312            } else {
313                    return $self->{'data'}->{$mfn};
314            }
315    }
316    
317    =head2 mfn
318    
319    Returns current record number (MFN).
320    
321     print $webpac->mfn;
322    
323    =cut
324    
325    sub mfn {
326            my $self = shift;
327            return $self->{'current_mfn'};
328    }
329    
330    =head2 progress_bar
331    
332    Draw progress bar on STDERR.
333    
334     $webpac->progress_bar($current, $max);
335    
336    =cut
337    
338    sub progress_bar {
339            my $self = shift;
340    
341            my ($curr,$max) = @_;
342    
343            my $log = $self->_get_logger();
344    
345            $log->logconfess("no current value!") if (! $curr);
346            $log->logconfess("no maximum value!") if (! $max);
347    
348            if ($curr > $max) {
349                    $max = $curr;
350                    $log->debug("overflow to $curr");
351            }
352    
353            $self->{'last_pcnt'} ||= 1;
354    
355            my $p = int($curr * 100 / $max);
356    
357            # reset on re-run
358            if ($p < $self->{'last_pcnt'}) {
359                    $self->{'last_pcnt'} = $p;
360                    $self->{'last_t'} = time();
361                    $self->{'last_curr'} = undef;
362            }
363    
364            if ($p != $self->{'last_pcnt'}) {
365    
366                    my $last_curr = $self->{'last_curr'} || $curr;
367                    my $t = time();
368                    my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
369                    my $eta = ($max-$curr) / ($rate || 1);
370                    printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
371                    $self->{'last_pcnt'} = $p;
372                    $self->{'last_t'} = time();
373                    $self->{'last_curr'} = $curr;
374            }
375            print STDERR "\n" if ($p == 100);
376    }
377    
378    =head2 fmt_time
379    
380    Format time (in seconds) for display.
381    
382     print $webpac->fmt_time(time());
383    
384    This method is called by L<progress_bar> to display remaining time.
385    
386    =cut
387    
388    sub fmt_time {
389            my $self = shift;
390    
391            my $t = shift || 0;
392            my $out = "";
393    
394            my ($ss,$mm,$hh) = gmtime($t);
395            $out .= "${hh}h" if ($hh);
396            $out .= sprintf("%02d:%02d", $mm,$ss);
397            $out .= "  " if ($hh == 0);
398            return $out;
399  }  }
400    
401  =head2 open_import_xml  =head2 open_import_xml
# Line 207  Read file from C<import_xml/> directory Line 409  Read file from C<import_xml/> directory
409  sub open_import_xml {  sub open_import_xml {
410          my $self = shift;          my $self = shift;
411    
412            my $log = $self->_get_logger();
413    
414          my $arg = {@_};          my $arg = {@_};
415          confess "need type to load file from import_xml/" if (! $arg->{'type'});          $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
416    
417          $self->{'type'} = $arg->{'type'};          $self->{'type'} = $arg->{'type'};
418    
# Line 217  sub open_import_xml { Line 421  sub open_import_xml {
421    
422          $self->{'tag'} = $type2tag{$type_base};          $self->{'tag'} = $type2tag{$type_base};
423    
424          print STDERR "using type ",$self->{'type'}," tag ",$self->{'tag'},"\n" if ($self->{'debug'});          $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
425    
426          my $f = "./import_xml/".$self->{'type'}.".xml";          my $f = "./import_xml/".$self->{'type'}.".xml";
427          confess "import_xml file '$f' doesn't exist!" if (! -e "$f");          $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
428    
429          print STDERR "reading '$f'\n" if ($self->{'debug'});          $log->info("reading '$f'");
430    
431            $self->{'import_xml_file'} = $f;
432    
433          $self->{'import_xml'} = XMLin($f,          $self->{'import_xml'} = XMLin($f,
434                  ForceArray => [ $self->{'tag'}, 'config', 'format' ],                  ForceArray => [ $self->{'tag'}, 'config', 'format' ],
                 ForceContent => 1  
435          );          );
436    
437          print Dumper($self->{'import_xml'});          $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
438    
439  }  }
440    
# Line 237  sub open_import_xml { Line 442  sub open_import_xml {
442    
443  Create lookup from record using lookup definition.  Create lookup from record using lookup definition.
444    
445     $self->create_lookup($rec, @lookups);
446    
447    Called internally by C<open_*> methods.
448    
449  =cut  =cut
450    
451  sub create_lookup {  sub create_lookup {
452          my $self = shift;          my $self = shift;
453    
454          my $rec = shift || confess "need record to create lookup";          my $log = $self->_get_logger();
455          confess("need HASH as first argument!") if ($rec !~ /HASH/o);  
456            my $rec = shift || $log->logconfess("need record to create lookup");
457            $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
458    
459          foreach my $i (@_) {          foreach my $i (@_) {
460                  if ($i->{'eval'}) {                  $log->logconfess("need key") unless defined($i->{'key'});
461                          my $eval = $self->fill_in($rec,$i->{'eval'});                  $log->logconfess("need val") unless defined($i->{'val'});
462                          my $key = $self->fill_in($rec,$i->{'key'});  
463                          my @val = $self->fill_in($rec,$i->{'val'});                  if (defined($i->{'eval'})) {
464                          if ($key && @val && eval $eval) {                          # eval first, so we can skip fill_in for key and val
465                            my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
466                            if ($self->_eval($eval)) {
467                                    my $key = $self->fill_in($rec,$i->{'key'}) || next;
468                                    my @val = $self->fill_in($rec,$i->{'val'}) || next;
469                                    $log->debug("stored $key = ",sub { join(" | ",@val) });
470                                  push @{$self->{'lookup'}->{$key}}, @val;                                  push @{$self->{'lookup'}->{$key}}, @val;
471                          }                          }
472                  } else {                  } else {
473                          my $key = $self->fill_in($rec,$i->{'key'});                          my $key = $self->fill_in($rec,$i->{'key'}) || next;
474                          my @val = $self->fill_in($rec,$i->{'val'});                          my @val = $self->fill_in($rec,$i->{'val'}) || next;
475                          if ($key && @val) {                          $log->debug("stored $key = ",sub { join(" | ",@val) });
476                                  push @{$self->{'lookup'}->{$key}}, @val;                          push @{$self->{'lookup'}->{$key}}, @val;
                         }  
477                  }                  }
478          }          }
479  }  }
# Line 267  sub create_lookup { Line 482  sub create_lookup {
482    
483  Returns value from record.  Returns value from record.
484    
485   $self->get_data(\$rec,$f,$sf,$i,\$found);   my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
486    
487  Arguments are:  Arguments are:
488  record reference C<$rec>,  record reference C<$rec>,
# Line 275  field C<$f>, Line 490  field C<$f>,
490  optional subfiled C<$sf>,  optional subfiled C<$sf>,
491  index for repeatable values C<$i>.  index for repeatable values C<$i>.
492    
493  Optinal variable C<$found> will be incremeted if thre  Optinal variable C<$found> will be incremeted if there
494  is field.  is field.
495    
496  Returns value or empty string.  Returns value or empty string.
# Line 286  sub get_data { Line 501  sub get_data {
501          my $self = shift;          my $self = shift;
502    
503          my ($rec,$f,$sf,$i,$found) = @_;          my ($rec,$f,$sf,$i,$found) = @_;
504    
505          if ($$rec->{$f}) {          if ($$rec->{$f}) {
506                    return '' if (! $$rec->{$f}->[$i]);
507                    no strict 'refs';
508                  if ($sf && $$rec->{$f}->[$i]->{$sf}) {                  if ($sf && $$rec->{$f}->[$i]->{$sf}) {
509                          $$found++ if (defined($$found));                          $$found++ if (defined($$found));
510                          return $$rec->{$f}->[$i]->{$sf};                          return $$rec->{$f}->[$i]->{$sf};
# Line 315  Workhourse of all: takes record from in- Line 533  Workhourse of all: takes record from in-
533  strings with placeholders and returns string or array of with substituted  strings with placeholders and returns string or array of with substituted
534  values from record.  values from record.
535    
536   $webpac->fill_in($rec,'v250^a');   my $text = $webpac->fill_in($rec,'v250^a');
537    
538  Optional argument is ordinal number for repeatable fields. By default,  Optional argument is ordinal number for repeatable fields. By default,
539  it's assume to be first repeatable field (fields are perl array, so first  it's assume to be first repeatable field (fields are perl array, so first
540  element is 0).  element is 0).
541  Following example will read second value from repeatable field.  Following example will read second value from repeatable field.
542    
543   $webpac->fill_in($rec,'Title: v250^a',1);   my $text = $webpac->fill_in($rec,'Title: v250^a',1);
544    
545  This function B<does not> perform parsing of format to inteligenty skip  This function B<does not> perform parsing of format to inteligenty skip
546  delimiters before fields which aren't used.  delimiters before fields which aren't used.
547    
548    This method will automatically decode UTF-8 string to local code page
549    if needed.
550    
551  =cut  =cut
552    
553  sub fill_in {  sub fill_in {
554          my $self = shift;          my $self = shift;
555    
556          my $rec = shift || confess "need data record";          my $log = $self->_get_logger();
557          my $format = shift || confess "need format to parse";  
558            my $rec = shift || $log->logconfess("need data record");
559            my $format = shift || $log->logconfess("need format to parse");
560          # iteration (for repeatable fields)          # iteration (for repeatable fields)
561          my $i = shift || 0;          my $i = shift || 0;
562    
563          # FIXME remove for speedup?          # FIXME remove for speedup?
564          confess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
565    
566            if (utf8::is_utf8($format)) {
567                    $format = $self->_x($format);
568            }
569    
570          my $found = 0;          my $found = 0;
571    
# Line 347  sub fill_in { Line 574  sub fill_in {
574          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
575    
576          # do actual replacement of placeholders          # do actual replacement of placeholders
577          $format =~ s/v(\d+)(?:\^(\w))*/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;          $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
578    
579          if ($found) {          if ($found) {
580                    $log->debug("format: $format");
581                  if ($eval_code) {                  if ($eval_code) {
582                          my $eval = $self->fill_in($rec,$eval_code,$i);                          my $eval = $self->fill_in($rec,$eval_code,$i);
583                          return if (! eval $eval);                          return if (! $self->_eval($eval));
584                  }                  }
585                  # do we have lookups?                  # do we have lookups?
586                  if ($format =~ /\[[^\[\]]+\]/o) {                  if ($format =~ /$LOOKUP_REGEX/o) {
587                            $log->debug("format '$format' has lookup");
588                          return $self->lookup($format);                          return $self->lookup($format);
589                  } else {                  } else {
590                          return $format;                          return $format;
# Line 369  sub fill_in { Line 598  sub fill_in {
598    
599  Perform lookups on format supplied to it.  Perform lookups on format supplied to it.
600    
601   my $txt = $self->lookup('[v900]');   my $text = $self->lookup('[v900]');
602    
603  Lookups can be nested (like C<[d:[a:[v900]]]>).  Lookups can be nested (like C<[d:[a:[v900]]]>).
604    
# Line 378  Lookups can be nested (like C<[d:[a:[v90 Line 607  Lookups can be nested (like C<[d:[a:[v90
607  sub lookup {  sub lookup {
608          my $self = shift;          my $self = shift;
609    
610          my $tmp = shift || confess "need format";          my $log = $self->_get_logger();
611    
612            my $tmp = shift || $log->logconfess("need format");
613    
614          if ($tmp =~ /\[[^\[\]]+\]/o) {          if ($tmp =~ /$LOOKUP_REGEX/o) {
615                  my @in = ( $tmp );                  my @in = ( $tmp );
616  #print "##lookup $tmp\n";  
617                    $log->debug("lookup for: ",$tmp);
618    
619                  my @out;                  my @out;
620                  while (my $f = shift @in) {                  while (my $f = shift @in) {
621                          if ($f =~ /\[([^\[\]]+)\]/) {                          if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
622                                  my $k = $1;                                  my $k = $1;
623                                  if ($self->{'lookup'}->{$k}) {                                  if ($self->{'lookup'}->{$k}) {
 #print "## lookup key = $k\n";  
624                                          foreach my $nv (@{$self->{'lookup'}->{$k}}) {                                          foreach my $nv (@{$self->{'lookup'}->{$k}}) {
625                                                  my $tmp2 = $f;                                                  my $tmp2 = $f;
626                                                  $tmp2 =~ s/\[$k\]/$nv/g;                                                  $tmp2 =~ s/lookup{$k}/$nv/g;
627                                                  push @in, $tmp2;                                                  push @in, $tmp2;
 #print "## lookup in => $tmp2\n";  
628                                          }                                          }
629                                  } else {                                  } else {
630                                          undef $f;                                          undef $f;
631                                  }                                  }
632                          } elsif ($f) {                          } elsif ($f) {
633                                  push @out, $f;                                  push @out, $f;
 #print "## lookup out => $f\n";  
634                          }                          }
635                  }                  }
636                    $log->logconfess("return is array and it's not expected!") unless wantarray;
637                  return @out;                  return @out;
638          } else {          } else {
639                  return $tmp;                  return $tmp;
# Line 415  Perform smart parsing of string, skippin Line 646  Perform smart parsing of string, skippin
646  defined. It can also eval code in format starting with C<eval{...}> and  defined. It can also eval code in format starting with C<eval{...}> and
647  return output or nothing depending on eval code.  return output or nothing depending on eval code.
648    
649   $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);   my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
650    
651  =cut  =cut
652    
# Line 426  sub parse { Line 657  sub parse {
657    
658          return if (! $format_utf8);          return if (! $format_utf8);
659    
660          confess("need HASH as first argument!") if ($rec !~ /HASH/o);          my $log = $self->_get_logger();
661          confess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});  
662            $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
663            $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
664    
665          $i = 0 if (! $i);          $i = 0 if (! $i);
666    
667          my $format = $self->{'utf2cp'}->convert($format_utf8) || confess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});          my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
668    
669          my @out;          my @out;
670    
671            $log->debug("format: $format");
672    
673          my $eval_code;          my $eval_code;
674          # remove eval{...} from beginning          # remove eval{...} from beginning
675          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
# Line 442  sub parse { Line 677  sub parse {
677          my $prefix;          my $prefix;
678          my $all_found=0;          my $all_found=0;
679    
680  #print "## $format\n";          while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
         while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))*//s) {  
 #print "## [ $1 | $2 | $3 ] $format\n";  
681    
682                  my $del = $1 || '';                  my $del = $1 || '';
683                  $prefix ||= $del if ($all_found == 0);                  $prefix ||= $del if ($all_found == 0);
# Line 461  sub parse { Line 694  sub parse {
694    
695          return if (! $all_found);          return if (! $all_found);
696    
697          my $out = join('',@out) . $format;          my $out = join('',@out);
698    
699            if ($out) {
700                    # add rest of format (suffix)
701                    $out .= $format;
702    
703                    # add prefix if not there
704                    $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
705    
706                    $log->debug("result: $out");
707            }
708    
         # add prefix if not there  
         $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);  
           
709          if ($eval_code) {          if ($eval_code) {
710                  my $eval = $self->fill_in($rec,$eval_code,$i);                  my $eval = $self->fill_in($rec,$eval_code,$i);
711                  return if (! eval $eval);                  $log->debug("about to eval{",$eval,"} format: $out");
712                    return if (! $self->_eval($eval));
713          }          }
714    
715          return $out;          return $out;
716  }  }
717    
718  =head2 data_structure  =head2 parse_to_arr
719    
720  Create in-memory data structure which represents layout from C<import_xml>.  Similar to C<parse>, but returns array of all repeatable fields
 It is used later to produce output.  
721    
722   my $ds = $webpac->data_structure($rec);   my @arr = $webpac->parse_to_arr($rec,'v250^a');
723    
724  =cut  =cut
725    
726  # private method _sort_by_order  sub parse_to_arr {
 # sort subrouting using order="" attribute  
 sub _sort_by_order {  
727          my $self = shift;          my $self = shift;
728    
729          my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||          my ($rec, $format_utf8) = @_;
                 $self->{'import_xml'}->{'indexer'}->{$a};  
         my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||  
                 $self->{'import_xml'}->{'indexer'}->{$b};  
730    
731          return $va <=> $vb;          my $log = $self->_get_logger();
732    
733            $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
734            return if (! $format_utf8);
735    
736            my $i = 0;
737            my @arr;
738    
739            while (my $v = $self->parse($rec,$format_utf8,$i++)) {
740                    push @arr, $v;
741            }
742    
743            $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
744    
745            return @arr;
746    }
747    
748    =head2 fill_in_to_arr
749    
750    Similar to C<fill_in>, but returns array of all repeatable fields. Usable
751    for fields which have lookups, so they shouldn't be parsed but rather
752    C<fill_id>ed.
753    
754     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
755    
756    =cut
757    
758    sub fill_in_to_arr {
759            my $self = shift;
760    
761            my ($rec, $format_utf8) = @_;
762    
763            my $log = $self->_get_logger();
764    
765            $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
766            return if (! $format_utf8);
767    
768            my $i = 0;
769            my @arr;
770    
771            while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
772                    push @arr, @v;
773            }
774    
775            $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
776    
777            return @arr;
778    }
779    
780    =head2 sort_arr
781    
782    Sort array ignoring case and html in data
783    
784     my @sorted = $webpac->sort_arr(@unsorted);
785    
786    =cut
787    
788    sub sort_arr {
789            my $self = shift;
790    
791            my $log = $self->_get_logger();
792    
793            # FIXME add Schwartzian Transformation?
794    
795            my @sorted = sort {
796                    $a =~ s#<[^>]+/*>##;
797                    $b =~ s#<[^>]+/*>##;
798                    lc($b) cmp lc($a)
799            } @_;
800            $log->debug("sorted values: ",sub { join(", ",@sorted) });
801    
802            return @sorted;
803  }  }
804    
805    
806    =head2 data_structure
807    
808    Create in-memory data structure which represents layout from C<import_xml>.
809    It is used later to produce output.
810    
811     my @ds = $webpac->data_structure($rec);
812    
813    This method will also set C<$webpac->{'currnet_filename'}> if there is
814    <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
815    <headline> tag.
816    
817    =cut
818    
819  sub data_structure {  sub data_structure {
820          my $self = shift;          my $self = shift;
821    
822            my $log = $self->_get_logger();
823    
824          my $rec = shift;          my $rec = shift;
825          confess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
826    
827            undef $self->{'currnet_filename'};
828            undef $self->{'headline'};
829    
830          my @sorted_tags;          my @sorted_tags;
831          if ($self->{tags_by_order}) {          if ($self->{tags_by_order}) {
# Line 510  sub data_structure { Line 835  sub data_structure {
835                  $self->{tags_by_order} = \@sorted_tags;                  $self->{tags_by_order} = \@sorted_tags;
836          }          }
837    
838          my $ds;          my @ds;
839    
840            $log->debug("tags: ",sub { join(", ",@sorted_tags) });
841    
842          foreach my $field (@sorted_tags) {          foreach my $field (@sorted_tags) {
843    
844                  my $row;                  my $row;
                 my $i = 0;  
845    
846  #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});  #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
847    
848                  foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {                  foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
849                            my $format = $tag->{'value'} || $tag->{'content'};
850    
851                          my $v = $self->parse($rec,$tag->{'content'},$i);                          $log->debug("format: $format");
 print "## $i:",$tag->{'content'}," = ",($v || 'null'),"\n";  
   
                         next if (!$v || $v && $v eq '');  
852    
853                          # does tag have type?                          my @v;
854                          if ($tag->{'type'}) {                          if ($format =~ /$LOOKUP_REGEX/o) {
855                                  push @{$row->{$tag->{'type'}}}, $v;                                  @v = $self->fill_in_to_arr($rec,$format);
856                          } else {                          } else {
857                                  push @{$row->{'display'}}, $v;                                  @v = $self->parse_to_arr($rec,$format);
858                                  push @{$row->{'swish'}}, $v;                          }
859                            next if (! @v);
860    
861                            if ($tag->{'sort'}) {
862                                    @v = $self->sort_arr(@v);
863                                    $log->warn("sort within tag is usually not what you want!");
864                            }
865    
866                            # use format?
867                            if ($tag->{'format_name'}) {
868                                    @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
869                            }
870    
871                            if ($field eq 'filename') {
872                                    $self->{'current_filename'} = join('',@v);
873                                    $log->debug("filename: ",$self->{'current_filename'});
874                            } elsif ($field eq 'headline') {
875                                    $self->{'headline'} .= join('',@v);
876                                    $log->debug("headline: ",$self->{'headline'});
877                                    next; # don't return headline in data_structure!
878                          }                          }
879    
880                            # delimiter will join repeatable fields
881                            if ($tag->{'delimiter'}) {
882                                    @v = ( join($tag->{'delimiter'}, @v) );
883                            }
884    
885                            # default types
886                            my @types = qw(display swish);
887                            # override by type attribute
888                            @types = ( $tag->{'type'} ) if ($tag->{'type'});
889    
890                            foreach my $type (@types) {
891                                    # append to previous line?
892                                    $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
893                                    if ($tag->{'append'}) {
894    
895                                            # I will delimit appended part with
896                                            # delimiter (or ,)
897                                            my $d = $tag->{'delimiter'};
898                                            # default delimiter
899                                            $d ||= ", ";
900    
901                                            my $last = pop @{$row->{$type}};
902                                            $d = "" if (! $last);
903                                            $last .= $d . join($d, @v);
904                                            push @{$row->{$type}}, $last;
905    
906                                    } else {
907                                            push @{$row->{$type}}, @v;
908                                    }
909                            }
910    
911    
912                  }                  }
913    
914                  push @{$ds->{$field}}, $row if ($row);                  if ($row) {
915                            $row->{'tag'} = $field;
916    
917                            # TODO: name_sigular, name_plural
918                            my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
919                            $row->{'name'} = $name ? $self->_x($name) : $field;
920    
921                            # post-sort all values in field
922                            if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
923                                    $log->warn("sort at field tag not implemented");
924                            }
925    
926                            push @ds, $row;
927    
928                            $log->debug("row $field: ",sub { Dumper($row) });
929                    }
930    
931          }          }
932    
933          print Dumper($ds);          return @ds;
934    
935  }  }
936    
937    =head2 output
938    
939    Create output from in-memory data structure using Template Toolkit template.
940    
941    my $text = $webpac->output( template => 'text.tt', data => @ds );
942    
943    =cut
944    
945    sub output {
946            my $self = shift;
947    
948            my $args = {@_};
949    
950            my $log = $self->_get_logger();
951    
952            $log->logconfess("need template name") if (! $args->{'template'});
953            $log->logconfess("need data array") if (! $args->{'data'});
954    
955            my $out;
956    
957            $self->{'tt'}->process(
958                    $args->{'template'},
959                    $args,
960                    \$out
961            ) || confess $self->{'tt'}->error();
962    
963            return $out;
964    }
965    
966    =head2 output_file
967    
968    Create output from in-memory data structure using Template Toolkit template
969    to a file.
970    
971     $webpac->output_file(
972            file => 'out.txt',
973            template => 'text.tt',
974            data => @ds
975     );
976    
977    =cut
978    
979    sub output_file {
980            my $self = shift;
981    
982            my $args = {@_};
983    
984            my $log = $self->_get_logger();
985    
986            my $file = $args->{'file'} || $log->logconfess("need file name");
987    
988            $log->debug("creating file ",$file);
989    
990            open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
991            print $fh $self->output(
992                    template => $args->{'template'},
993                    data => $args->{'data'},
994            ) || $log->logdie("print: $!");
995            close($fh) || $log->logdie("close: $!");
996    }
997    
998    =head2 apply_format
999    
1000    Apply format specified in tag with C<format_name="name"> and
1001    C<format_delimiter=";;">.
1002    
1003     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1004    
1005    Formats can contain C<lookup{...}> if you need them.
1006    
1007    =cut
1008    
1009    sub apply_format {
1010            my $self = shift;
1011    
1012            my ($name,$delimiter,$data) = @_;
1013    
1014            my $log = $self->_get_logger();
1015    
1016            if (! $self->{'import_xml'}->{'format'}->{$name}) {
1017                    $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1018                    return $data;
1019            }
1020    
1021            $log->warn("no delimiter for format $name") if (! $delimiter);
1022    
1023            my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1024    
1025            my @data = split(/\Q$delimiter\E/, $data);
1026    
1027            my $out = sprintf($format, @data);
1028            $log->debug("using format $name [$format] on $data to produce: $out");
1029    
1030            if ($out =~ m/$LOOKUP_REGEX/o) {
1031                    return $self->lookup($out);
1032            } else {
1033                    return $out;
1034            }
1035    
1036    }
1037    
1038    
1039    #
1040    #
1041    #
1042    
1043    =head1 INTERNAL METHODS
1044    
1045    Here is a quick list of internal methods, mostly useful to turn debugging
1046    on them (see L<LOGGING> below for explanation).
1047    
1048    =cut
1049    
1050    =head2 _eval
1051    
1052    Internal function to eval code without C<strict 'subs'>.
1053    
1054    =cut
1055    
1056    sub _eval {
1057            my $self = shift;
1058    
1059            my $code = shift || return;
1060    
1061            my $log = $self->_get_logger();
1062    
1063            no strict 'subs';
1064            my $ret = eval $code;
1065            if ($@) {
1066                    $log->error("problem with eval code [$code]: $@");
1067            }
1068    
1069            $log->debug("eval: ",$code," [",$ret,"]");
1070    
1071            return $ret || 0;
1072    }
1073    
1074    =head2 _sort_by_order
1075    
1076    Sort xml tags data structure accoding to C<order=""> attribute.
1077    
1078    =cut
1079    
1080    sub _sort_by_order {
1081            my $self = shift;
1082    
1083            my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1084                    $self->{'import_xml'}->{'indexer'}->{$a};
1085            my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1086                    $self->{'import_xml'}->{'indexer'}->{$b};
1087    
1088            return $va <=> $vb;
1089    }
1090    
1091    =head2 _get_logger
1092    
1093    Get C<Log::Log4perl> object with a twist: domains are defined for each
1094    method
1095    
1096     my $log = $webpac->_get_logger();
1097    
1098    =cut
1099    
1100    sub _get_logger {
1101            my $self = shift;
1102    
1103            my $name = (caller(1))[3] || caller;
1104            return get_logger($name);
1105    }
1106    
1107    =head2 _x
1108    
1109    Convert string from UTF-8 to code page defined in C<import_xml>.
1110    
1111     my $text = $webpac->_x('utf8 text');
1112    
1113    =cut
1114    
1115    sub _x {
1116            my $self = shift;
1117            my $utf8 = shift || return;
1118    
1119            return $self->{'utf2cp'}->convert($utf8) ||
1120                    $self->_get_logger()->logwarn("can't convert '$utf8'");
1121    }
1122    
1123    #
1124    #
1125    #
1126    
1127    =head1 LOGGING
1128    
1129    Logging in WebPAC is performed by L<Log::Log4perl> with config file
1130    C<log.conf>.
1131    
1132    Methods defined above have different levels of logging, so
1133    it's descriptions will be useful to turn (mostry B<debug> logging) on
1134    or off to see why WabPAC isn't perforing as you expect it (it might even
1135    be a bug!).
1136    
1137    B<This is different from normal Log4perl behaviour>. To repeat, you can
1138    also use method names, and not only classes (which are just few)
1139    to filter logging.
1140    
1141    
1142    =head1 MEMORY USAGE
1143    
1144    C<low_mem> options is double-edged sword. If enabled, WebPAC
1145    will run on memory constraint machines (which doesn't have enough
1146    physical RAM to create memory structure for whole source database).
1147    
1148    If your machine has 512Mb or more of RAM and database is around 10000 records,
1149    memory shouldn't be an issue. If you don't have enough physical RAM, you
1150    might consider using virtual memory (if your operating system is handling it
1151    well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1152    parsed structure of ISIS database (this is what C<low_mem> option does).
1153    
1154    Hitting swap at end of reading source database is probably o.k. However,
1155    hitting swap before 90% will dramatically decrease performance and you will
1156    be better off with C<low_mem> and using rest of availble memory for
1157    operating system disk cache (Linux is particuallary good about this).
1158    However, every access to database record will require disk access, so
1159    generation phase will be slower 10-100 times.
1160    
1161    Parsed structures are essential - you just have option to trade RAM memory
1162    (which is fast) for disk space (which is slow). Be sure to have planty of
1163    disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1164    
1165    However, when WebPAC is running on desktop machines (or laptops :-), it's
1166    highly undesireable for system to start swapping. Using C<low_mem> option can
1167    reduce WecPAC memory usage to around 64Mb for same database with lookup
1168    fields and sorted indexes which stay in RAM. Performance will suffer, but
1169    memory usage will really be minimal. It might be also more confortable to
1170    run WebPAC reniced on those machines.
1171    
1172    =cut
1173    
1174  1;  1;

Legend:
Removed from v.366  
changed lines
  Added in v.453

  ViewVC Help
Powered by ViewVC 1.1.26