/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 366 by dpavlin, Thu Jun 17 01:44:25 2004 UTC revision 459 by dpavlin, Tue Sep 21 19:08:11 2004 UTC
# Line 1  Line 1 
1  package WebPAC;  package WebPAC;
2    
3    use warnings;
4    use strict;
5    
6  use Carp;  use Carp;
7  use Text::Iconv;  use Text::Iconv;
8  use Config::IniFiles;  use Config::IniFiles;
9  use XML::Simple;  use XML::Simple;
10    use Template;
11    use Log::Log4perl qw(get_logger :levels);
12    use Time::HiRes qw(time);
13    
14  use Data::Dumper;  use Data::Dumper;
15    
16    #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17    #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18    my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19    my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20    
21  =head1 NAME  =head1 NAME
22    
23  WebPAC - base class for WebPAC  WebPAC - base class for WebPAC
# Line 19  This module implements methods used by W Line 30  This module implements methods used by W
30    
31  =head2 new  =head2 new
32    
33  This will create new instance of WebPAC using configuration specified by C<config_file>.  Create new instance of WebPAC using configuration specified by C<config_file>.
34    
35   my $webpac = new WebPAC(   my $webpac = new WebPAC(
36          config_file => 'name.conf',          config_file => 'name.conf',
37          [code_page => 'ISO-8859-2',]          code_page => 'ISO-8859-2',
38            low_mem => 1,
39   );   );
40    
41  Default C<code_page> is C<ISO-8859-2>.  Default C<code_page> is C<ISO-8859-2>.
42    
43  It will also read configuration files  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44    
45    This method will also read configuration files
46  C<global.conf> (used by indexer and Web font-end)  C<global.conf> (used by indexer and Web font-end)
47  and configuration file specified by C<config_file>  and configuration file specified by C<config_file>
48  which describes databases to be indexed.  which describes databases to be indexed.
# Line 49  sub new { Line 63  sub new {
63          my $self = {@_};          my $self = {@_};
64          bless($self, $class);          bless($self, $class);
65    
66            $self->{'start_t'} = time();
67    
68            my $log_file = $self->{'log'} || "log.conf";
69            Log::Log4perl->init($log_file);
70    
71            my $log = $self->_get_logger();
72    
73          # fill in default values          # fill in default values
74          # output codepage          # output codepage
75          $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});          $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
# Line 56  sub new { Line 77  sub new {
77          #          #
78          # read global.conf          # read global.conf
79          #          #
80            $log->debug("read 'global.conf'");
81    
82          $self->{global_config_file} = new Config::IniFiles( -file => 'global.conf' ) || croak "can't open 'global.conf'";          my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
83    
84          # read global config parametars          # read global config parametars
85          foreach my $var (qw(          foreach my $var (qw(
# Line 67  sub new { Line 89  sub new {
89                          dbi_passwd                          dbi_passwd
90                          show_progress                          show_progress
91                          my_unac_filter                          my_unac_filter
92                            output_template
93                  )) {                  )) {
94                  $self->{global_config}->{$var} = $self->{global_config_file}->val('global', $var);                  $self->{'global_config'}->{$var} = $config->val('global', $var);
95          }          }
96    
97          #          #
98          # read indexer config file          # read indexer config file
99          #          #
100    
101          $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || croak "can't open '$self->{config_file}'";          $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
102    
103            # create UTF-8 convertor for import_xml files
104          $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});          $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
105    
106            # create Template toolkit instance
107            $self->{'tt'} = Template->new(
108                    INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
109    #               FILTERS => {
110    #                       'foo' => \&foo_filter,
111    #               },
112                    EVAL_PERL => 1,
113            );
114    
115            # running with low_mem flag? well, use DBM::Deep then.
116            if ($self->{'low_mem'}) {
117                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118    
119                    my $db_file = "data.db";
120    
121                    if (-e $db_file) {
122                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123                            $log->debug("removed '$db_file' from last run");
124                    }
125    
126                    require DBM::Deep;
127    
128                    my $db = new DBM::Deep $db_file;
129    
130                    $log->logdie("DBM::Deep error: $!") unless ($db);
131    
132                    if ($db->error()) {
133                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134                    } else {
135                            $log->debug("using file '$db_file' for DBM::Deep");
136                    }
137    
138                    $self->{'db'} = $db;
139            }
140    
141          return $self;          return $self;
142  }  }
143    
# Line 88  Open CDS/ISIS database using OpenIsis mo Line 148  Open CDS/ISIS database using OpenIsis mo
148   $webpac->open_isis(   $webpac->open_isis(
149          filename => '/data/ISIS/ISIS',          filename => '/data/ISIS/ISIS',
150          code_page => '852',          code_page => '852',
151          limit_mfn => '500',          limit_mfn => 500,
152            start_mfn => 6000,
153          lookup => [ ... ],          lookup => [ ... ],
154   );   );
155    
156  By default, ISIS code page is assumed to be C<852>.  By default, ISIS code page is assumed to be C<852>.
157    
158    If optional parametar C<start_mfn> is set, this will be first MFN to read
159    from database (so you can skip beginning of your database if you need to).
160    
161  If optional parametar C<limit_mfn> is set, it will read just 500 records  If optional parametar C<limit_mfn> is set, it will read just 500 records
162  from database in example above.  from database in example above.
163    
 Returns number of last record read into memory (size of database, really).  
   
164  C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and  C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
165  C<val>. Optional parametar C<eval> is perl code to evaluate before storing  C<val>. Optional parametar C<eval> is perl code to evaluate before storing
166  value in index.  value in index.
# Line 110  value in index. Line 172  value in index.
172      'val' => 'v900' },      'val' => 'v900' },
173   ]   ]
174    
175    Returns number of last record read into memory (size of database, really).
176    
177  =cut  =cut
178    
179  sub open_isis {  sub open_isis {
180          my $self = shift;          my $self = shift;
181          my $arg = {@_};          my $arg = {@_};
182    
183          croak "need filename" if (! $arg->{'filename'});          my $log = $self->_get_logger();
184    
185            $log->logcroak("need filename") if (! $arg->{'filename'});
186          my $code_page = $arg->{'code_page'} || '852';          my $code_page = $arg->{'code_page'} || '852';
187    
188            $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
189    
190            # store data in object
191            $self->{'isis_filename'} = $arg->{'filename'};
192            $self->{'isis_code_page'} = $code_page;
193    
194          use OpenIsis;          use OpenIsis;
195    
196          #$self->{'isis_code_page'} = $code_page;          #$self->{'isis_code_page'} = $code_page;
# Line 126  sub open_isis { Line 198  sub open_isis {
198          # create Text::Iconv object          # create Text::Iconv object
199          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
200    
201          print STDERR "reading ISIS database '",$arg->{'filename'},"'\n" if ($self->{'debug'});          $log->info("reading ISIS database '",$arg->{'filename'},"'");
202            $log->debug("isis code page: $code_page");
203    
204          my $isis_db = OpenIsis::open($arg->{'filename'});          my $isis_db = OpenIsis::open($arg->{'filename'});
205    
206          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
207            my $startmfn = 1;
208    
209            if (my $s = $self->{'start_mfn'}) {
210                    $log->info("skipping to MFN $s");
211                    $startmfn = $s;
212            } else {
213                    $self->{'start_mfn'} = $startmfn;
214            }
215    
216          $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});          $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
217    
218          print STDERR "processing $maxmfn records...\n" if ($self->{'debug'});          $log->info("processing ",($maxmfn-$startmfn)." records...");
219    
220          # read database          # read database
221          for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {          for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
222    
223    
224                    $log->debug("mfn: $mfn\n");
225    
226                    my $rec;
227    
228                  # read record                  # read record
229                  my $row = OpenIsis::read( $isis_db, $mfn );                  my $row = OpenIsis::read( $isis_db, $mfn );
# Line 156  sub open_isis { Line 242  sub open_isis {
242                                                  $val = $l;                                                  $val = $l;
243                                          }                                          }
244    
245                                          push @{$self->{'data'}->{$mfn}->{$k}}, $val;                                          push @{$rec->{$k}}, $val;
246                                  }                                  }
247                            } else {
248                                    push @{$rec->{'000'}}, $mfn;
249                          }                          }
250    
251                  }                  }
252    
253                    $log->confess("record $mfn empty?") unless ($rec);
254    
255                    # store
256                    if ($self->{'low_mem'}) {
257                            $self->{'db'}->put($mfn, $rec);
258                    } else {
259                            $self->{'data'}->{$mfn} = $rec;
260                    }
261    
262                  # create lookup                  # create lookup
                 my $rec = $self->{'data'}->{$mfn};  
263                  $self->create_lookup($rec, @{$arg->{'lookup'}});                  $self->create_lookup($rec, @{$arg->{'lookup'}});
264    
265                    $self->progress_bar($mfn,$maxmfn);
266    
267          }          }
268    
269          $self->{'current_mfn'} = 1;          $self->{'current_mfn'} = -1;
270            $self->{'last_pcnt'} = 0;
271    
272            $log->debug("max mfn: $maxmfn");
273    
274          # store max mfn and return it.          # store max mfn and return it.
275          return $self->{'max_mfn'} = $maxmfn;          return $self->{'max_mfn'} = $maxmfn;
# Line 186  it's implemented, that is). Line 287  it's implemented, that is).
287  sub fetch_rec {  sub fetch_rec {
288          my $self = shift;          my $self = shift;
289    
290          my $mfn = $self->{'current_mfn'}++ || confess "it seems that you didn't load database!";          my $log = $self->_get_logger();
291    
292            $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
293    
294            if ($self->{'current_mfn'} == -1) {
295                    $self->{'current_mfn'} = $self->{'start_mfn'};
296            } else {
297                    $self->{'current_mfn'}++;
298            }
299    
300            my $mfn = $self->{'current_mfn'};
301    
302          if ($mfn > $self->{'max_mfn'}) {          if ($mfn > $self->{'max_mfn'}) {
303                  $self->{'current_mfn'} = $self->{'max_mfn'};                  $self->{'current_mfn'} = $self->{'max_mfn'};
304                    $log->debug("at EOF");
305                  return;                  return;
306          }          }
307    
308          return $self->{'data'}->{$mfn};          $self->progress_bar($mfn,$self->{'max_mfn'});
309    
310            if ($self->{'low_mem'}) {
311                    return $self->{'db'}->get($mfn);
312            } else {
313                    return $self->{'data'}->{$mfn};
314            }
315    }
316    
317    =head2 mfn
318    
319    Returns current record number (MFN).
320    
321     print $webpac->mfn;
322    
323    =cut
324    
325    sub mfn {
326            my $self = shift;
327            return $self->{'current_mfn'};
328    }
329    
330    =head2 progress_bar
331    
332    Draw progress bar on STDERR.
333    
334     $webpac->progress_bar($current, $max);
335    
336    =cut
337    
338    sub progress_bar {
339            my $self = shift;
340    
341            my ($curr,$max) = @_;
342    
343            my $log = $self->_get_logger();
344    
345            $log->logconfess("no current value!") if (! $curr);
346            $log->logconfess("no maximum value!") if (! $max);
347    
348            if ($curr > $max) {
349                    $max = $curr;
350                    $log->debug("overflow to $curr");
351            }
352    
353            $self->{'last_pcnt'} ||= 1;
354    
355            my $p = int($curr * 100 / $max) || 1;
356    
357            # reset on re-run
358            if ($p < $self->{'last_pcnt'}) {
359                    $self->{'last_pcnt'} = $p;
360                    $self->{'last_t'} = time();
361                    $self->{'last_curr'} = undef;
362            }
363    
364            $self->{'last_t'} ||= time();
365    
366            if ($p != $self->{'last_pcnt'}) {
367    
368                    my $last_curr = $self->{'last_curr'} || $curr;
369                    my $t = time();
370                    my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
371                    my $eta = ($max-$curr) / ($rate || 1);
372                    printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
373                    $self->{'last_pcnt'} = $p;
374                    $self->{'last_t'} = time();
375                    $self->{'last_curr'} = $curr;
376            }
377            print STDERR "\n" if ($p == 100);
378    }
379    
380    =head2 fmt_time
381    
382    Format time (in seconds) for display.
383    
384     print $webpac->fmt_time(time());
385    
386    This method is called by L<progress_bar> to display remaining time.
387    
388    =cut
389    
390    sub fmt_time {
391            my $self = shift;
392    
393            my $t = shift || 0;
394            my $out = "";
395    
396            my ($ss,$mm,$hh) = gmtime($t);
397            $out .= "${hh}h" if ($hh);
398            $out .= sprintf("%02d:%02d", $mm,$ss);
399            $out .= "  " if ($hh == 0);
400            return $out;
401  }  }
402    
403  =head2 open_import_xml  =head2 open_import_xml
# Line 207  Read file from C<import_xml/> directory Line 411  Read file from C<import_xml/> directory
411  sub open_import_xml {  sub open_import_xml {
412          my $self = shift;          my $self = shift;
413    
414            my $log = $self->_get_logger();
415    
416          my $arg = {@_};          my $arg = {@_};
417          confess "need type to load file from import_xml/" if (! $arg->{'type'});          $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
418    
419          $self->{'type'} = $arg->{'type'};          $self->{'type'} = $arg->{'type'};
420    
# Line 217  sub open_import_xml { Line 423  sub open_import_xml {
423    
424          $self->{'tag'} = $type2tag{$type_base};          $self->{'tag'} = $type2tag{$type_base};
425    
426          print STDERR "using type ",$self->{'type'}," tag ",$self->{'tag'},"\n" if ($self->{'debug'});          $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
427    
428          my $f = "./import_xml/".$self->{'type'}.".xml";          my $f = "./import_xml/".$self->{'type'}.".xml";
429          confess "import_xml file '$f' doesn't exist!" if (! -e "$f");          $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
430    
431            $log->info("reading '$f'");
432    
433          print STDERR "reading '$f'\n" if ($self->{'debug'});          $self->{'import_xml_file'} = $f;
434    
435          $self->{'import_xml'} = XMLin($f,          $self->{'import_xml'} = XMLin($f,
436                  ForceArray => [ $self->{'tag'}, 'config', 'format' ],                  ForceArray => [ $self->{'tag'}, 'config', 'format' ],
                 ForceContent => 1  
437          );          );
438    
439          print Dumper($self->{'import_xml'});          $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
440    
441  }  }
442    
# Line 237  sub open_import_xml { Line 444  sub open_import_xml {
444    
445  Create lookup from record using lookup definition.  Create lookup from record using lookup definition.
446    
447     $self->create_lookup($rec, @lookups);
448    
449    Called internally by C<open_*> methods.
450    
451  =cut  =cut
452    
453  sub create_lookup {  sub create_lookup {
454          my $self = shift;          my $self = shift;
455    
456          my $rec = shift || confess "need record to create lookup";          my $log = $self->_get_logger();
457          confess("need HASH as first argument!") if ($rec !~ /HASH/o);  
458            my $rec = shift || $log->logconfess("need record to create lookup");
459            $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
460    
461          foreach my $i (@_) {          foreach my $i (@_) {
462                  if ($i->{'eval'}) {                  $log->logconfess("need key") unless defined($i->{'key'});
463                          my $eval = $self->fill_in($rec,$i->{'eval'});                  $log->logconfess("need val") unless defined($i->{'val'});
464                          my $key = $self->fill_in($rec,$i->{'key'});  
465                          my @val = $self->fill_in($rec,$i->{'val'});                  if (defined($i->{'eval'})) {
466                          if ($key && @val && eval $eval) {                          # eval first, so we can skip fill_in for key and val
467                            my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
468                            if ($self->_eval($eval)) {
469                                    my $key = $self->fill_in($rec,$i->{'key'}) || next;
470                                    my @val = $self->fill_in($rec,$i->{'val'}) || next;
471                                    $log->debug("stored $key = ",sub { join(" | ",@val) });
472                                  push @{$self->{'lookup'}->{$key}}, @val;                                  push @{$self->{'lookup'}->{$key}}, @val;
473                          }                          }
474                  } else {                  } else {
475                          my $key = $self->fill_in($rec,$i->{'key'});                          my $key = $self->fill_in($rec,$i->{'key'}) || next;
476                          my @val = $self->fill_in($rec,$i->{'val'});                          my @val = $self->fill_in($rec,$i->{'val'}) || next;
477                          if ($key && @val) {                          $log->debug("stored $key = ",sub { join(" | ",@val) });
478                                  push @{$self->{'lookup'}->{$key}}, @val;                          push @{$self->{'lookup'}->{$key}}, @val;
                         }  
479                  }                  }
480          }          }
481  }  }
# Line 267  sub create_lookup { Line 484  sub create_lookup {
484    
485  Returns value from record.  Returns value from record.
486    
487   $self->get_data(\$rec,$f,$sf,$i,\$found);   my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
488    
489  Arguments are:  Arguments are:
490  record reference C<$rec>,  record reference C<$rec>,
# Line 275  field C<$f>, Line 492  field C<$f>,
492  optional subfiled C<$sf>,  optional subfiled C<$sf>,
493  index for repeatable values C<$i>.  index for repeatable values C<$i>.
494    
495  Optinal variable C<$found> will be incremeted if thre  Optinal variable C<$found> will be incremeted if there
496  is field.  is field.
497    
498  Returns value or empty string.  Returns value or empty string.
# Line 286  sub get_data { Line 503  sub get_data {
503          my $self = shift;          my $self = shift;
504    
505          my ($rec,$f,$sf,$i,$found) = @_;          my ($rec,$f,$sf,$i,$found) = @_;
506    
507          if ($$rec->{$f}) {          if ($$rec->{$f}) {
508                    return '' if (! $$rec->{$f}->[$i]);
509                    no strict 'refs';
510                  if ($sf && $$rec->{$f}->[$i]->{$sf}) {                  if ($sf && $$rec->{$f}->[$i]->{$sf}) {
511                          $$found++ if (defined($$found));                          $$found++ if (defined($$found));
512                          return $$rec->{$f}->[$i]->{$sf};                          return $$rec->{$f}->[$i]->{$sf};
# Line 315  Workhourse of all: takes record from in- Line 535  Workhourse of all: takes record from in-
535  strings with placeholders and returns string or array of with substituted  strings with placeholders and returns string or array of with substituted
536  values from record.  values from record.
537    
538   $webpac->fill_in($rec,'v250^a');   my $text = $webpac->fill_in($rec,'v250^a');
539    
540  Optional argument is ordinal number for repeatable fields. By default,  Optional argument is ordinal number for repeatable fields. By default,
541  it's assume to be first repeatable field (fields are perl array, so first  it's assume to be first repeatable field (fields are perl array, so first
542  element is 0).  element is 0).
543  Following example will read second value from repeatable field.  Following example will read second value from repeatable field.
544    
545   $webpac->fill_in($rec,'Title: v250^a',1);   my $text = $webpac->fill_in($rec,'Title: v250^a',1);
546    
547  This function B<does not> perform parsing of format to inteligenty skip  This function B<does not> perform parsing of format to inteligenty skip
548  delimiters before fields which aren't used.  delimiters before fields which aren't used.
549    
550    This method will automatically decode UTF-8 string to local code page
551    if needed.
552    
553  =cut  =cut
554    
555  sub fill_in {  sub fill_in {
556          my $self = shift;          my $self = shift;
557    
558          my $rec = shift || confess "need data record";          my $log = $self->_get_logger();
559          my $format = shift || confess "need format to parse";  
560            my $rec = shift || $log->logconfess("need data record");
561            my $format = shift || $log->logconfess("need format to parse");
562          # iteration (for repeatable fields)          # iteration (for repeatable fields)
563          my $i = shift || 0;          my $i = shift || 0;
564    
565          # FIXME remove for speedup?          # FIXME remove for speedup?
566          confess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
567    
568            if (utf8::is_utf8($format)) {
569                    $format = $self->_x($format);
570            }
571    
572          my $found = 0;          my $found = 0;
573    
# Line 347  sub fill_in { Line 576  sub fill_in {
576          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
577    
578          # do actual replacement of placeholders          # do actual replacement of placeholders
579          $format =~ s/v(\d+)(?:\^(\w))*/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;          $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
580    
581          if ($found) {          if ($found) {
582                    $log->debug("format: $format");
583                  if ($eval_code) {                  if ($eval_code) {
584                          my $eval = $self->fill_in($rec,$eval_code,$i);                          my $eval = $self->fill_in($rec,$eval_code,$i);
585                          return if (! eval $eval);                          return if (! $self->_eval($eval));
586                  }                  }
587                  # do we have lookups?                  # do we have lookups?
588                  if ($format =~ /\[[^\[\]]+\]/o) {                  if ($format =~ /$LOOKUP_REGEX/o) {
589                            $log->debug("format '$format' has lookup");
590                          return $self->lookup($format);                          return $self->lookup($format);
591                  } else {                  } else {
592                          return $format;                          return $format;
# Line 369  sub fill_in { Line 600  sub fill_in {
600    
601  Perform lookups on format supplied to it.  Perform lookups on format supplied to it.
602    
603   my $txt = $self->lookup('[v900]');   my $text = $self->lookup('[v900]');
604    
605  Lookups can be nested (like C<[d:[a:[v900]]]>).  Lookups can be nested (like C<[d:[a:[v900]]]>).
606    
# Line 378  Lookups can be nested (like C<[d:[a:[v90 Line 609  Lookups can be nested (like C<[d:[a:[v90
609  sub lookup {  sub lookup {
610          my $self = shift;          my $self = shift;
611    
612          my $tmp = shift || confess "need format";          my $log = $self->_get_logger();
613    
614          if ($tmp =~ /\[[^\[\]]+\]/o) {          my $tmp = shift || $log->logconfess("need format");
615    
616            if ($tmp =~ /$LOOKUP_REGEX/o) {
617                  my @in = ( $tmp );                  my @in = ( $tmp );
618  #print "##lookup $tmp\n";  
619                    $log->debug("lookup for: ",$tmp);
620    
621                  my @out;                  my @out;
622                  while (my $f = shift @in) {                  while (my $f = shift @in) {
623                          if ($f =~ /\[([^\[\]]+)\]/) {                          if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
624                                  my $k = $1;                                  my $k = $1;
625                                  if ($self->{'lookup'}->{$k}) {                                  if ($self->{'lookup'}->{$k}) {
 #print "## lookup key = $k\n";  
626                                          foreach my $nv (@{$self->{'lookup'}->{$k}}) {                                          foreach my $nv (@{$self->{'lookup'}->{$k}}) {
627                                                  my $tmp2 = $f;                                                  my $tmp2 = $f;
628                                                  $tmp2 =~ s/\[$k\]/$nv/g;                                                  $tmp2 =~ s/lookup{$k}/$nv/g;
629                                                  push @in, $tmp2;                                                  push @in, $tmp2;
 #print "## lookup in => $tmp2\n";  
630                                          }                                          }
631                                  } else {                                  } else {
632                                          undef $f;                                          undef $f;
633                                  }                                  }
634                          } elsif ($f) {                          } elsif ($f) {
635                                  push @out, $f;                                  push @out, $f;
 #print "## lookup out => $f\n";  
636                          }                          }
637                  }                  }
638                    $log->logconfess("return is array and it's not expected!") unless wantarray;
639                  return @out;                  return @out;
640          } else {          } else {
641                  return $tmp;                  return $tmp;
# Line 415  Perform smart parsing of string, skippin Line 648  Perform smart parsing of string, skippin
648  defined. It can also eval code in format starting with C<eval{...}> and  defined. It can also eval code in format starting with C<eval{...}> and
649  return output or nothing depending on eval code.  return output or nothing depending on eval code.
650    
651   $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);   my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
652    
653  =cut  =cut
654    
# Line 426  sub parse { Line 659  sub parse {
659    
660          return if (! $format_utf8);          return if (! $format_utf8);
661    
662          confess("need HASH as first argument!") if ($rec !~ /HASH/o);          my $log = $self->_get_logger();
663          confess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});  
664            $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
665            $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
666    
667          $i = 0 if (! $i);          $i = 0 if (! $i);
668    
669          my $format = $self->{'utf2cp'}->convert($format_utf8) || confess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});          my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
670    
671          my @out;          my @out;
672    
673            $log->debug("format: $format");
674    
675          my $eval_code;          my $eval_code;
676          # remove eval{...} from beginning          # remove eval{...} from beginning
677          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
# Line 442  sub parse { Line 679  sub parse {
679          my $prefix;          my $prefix;
680          my $all_found=0;          my $all_found=0;
681    
682  #print "## $format\n";          while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
         while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))*//s) {  
 #print "## [ $1 | $2 | $3 ] $format\n";  
683    
684                  my $del = $1 || '';                  my $del = $1 || '';
685                  $prefix ||= $del if ($all_found == 0);                  $prefix ||= $del if ($all_found == 0);
# Line 461  sub parse { Line 696  sub parse {
696    
697          return if (! $all_found);          return if (! $all_found);
698    
699          my $out = join('',@out) . $format;          my $out = join('',@out);
700    
701            if ($out) {
702                    # add rest of format (suffix)
703                    $out .= $format;
704    
705                    # add prefix if not there
706                    $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
707    
708                    $log->debug("result: $out");
709            }
710    
         # add prefix if not there  
         $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);  
           
711          if ($eval_code) {          if ($eval_code) {
712                  my $eval = $self->fill_in($rec,$eval_code,$i);                  my $eval = $self->fill_in($rec,$eval_code,$i);
713                  return if (! eval $eval);                  $log->debug("about to eval{",$eval,"} format: $out");
714                    return if (! $self->_eval($eval));
715          }          }
716    
717          return $out;          return $out;
718  }  }
719    
720  =head2 data_structure  =head2 parse_to_arr
721    
722  Create in-memory data structure which represents layout from C<import_xml>.  Similar to C<parse>, but returns array of all repeatable fields
 It is used later to produce output.  
723    
724   my $ds = $webpac->data_structure($rec);   my @arr = $webpac->parse_to_arr($rec,'v250^a');
725    
726  =cut  =cut
727    
728  # private method _sort_by_order  sub parse_to_arr {
 # sort subrouting using order="" attribute  
 sub _sort_by_order {  
729          my $self = shift;          my $self = shift;
730    
731          my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||          my ($rec, $format_utf8) = @_;
                 $self->{'import_xml'}->{'indexer'}->{$a};  
         my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||  
                 $self->{'import_xml'}->{'indexer'}->{$b};  
732    
733          return $va <=> $vb;          my $log = $self->_get_logger();
734    
735            $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
736            return if (! $format_utf8);
737    
738            my $i = 0;
739            my @arr;
740    
741            while (my $v = $self->parse($rec,$format_utf8,$i++)) {
742                    push @arr, $v;
743            }
744    
745            $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
746    
747            return @arr;
748  }  }
749    
750    =head2 fill_in_to_arr
751    
752    Similar to C<fill_in>, but returns array of all repeatable fields. Usable
753    for fields which have lookups, so they shouldn't be parsed but rather
754    C<fill_id>ed.
755    
756     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
757    
758    =cut
759    
760    sub fill_in_to_arr {
761            my $self = shift;
762    
763            my ($rec, $format_utf8) = @_;
764    
765            my $log = $self->_get_logger();
766    
767            $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
768            return if (! $format_utf8);
769    
770            my $i = 0;
771            my @arr;
772    
773            while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
774                    push @arr, @v;
775            }
776    
777            $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
778    
779            return @arr;
780    }
781    
782    =head2 sort_arr
783    
784    Sort array ignoring case and html in data
785    
786     my @sorted = $webpac->sort_arr(@unsorted);
787    
788    =cut
789    
790    sub sort_arr {
791            my $self = shift;
792    
793            my $log = $self->_get_logger();
794    
795            # FIXME add Schwartzian Transformation?
796    
797            my @sorted = sort {
798                    $a =~ s#<[^>]+/*>##;
799                    $b =~ s#<[^>]+/*>##;
800                    lc($b) cmp lc($a)
801            } @_;
802            $log->debug("sorted values: ",sub { join(", ",@sorted) });
803    
804            return @sorted;
805    }
806    
807    
808    =head2 data_structure
809    
810    Create in-memory data structure which represents layout from C<import_xml>.
811    It is used later to produce output.
812    
813     my @ds = $webpac->data_structure($rec);
814    
815    This method will also set C<$webpac->{'currnet_filename'}> if there is
816    <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
817    <headline> tag.
818    
819    =cut
820    
821  sub data_structure {  sub data_structure {
822          my $self = shift;          my $self = shift;
823    
824            my $log = $self->_get_logger();
825    
826          my $rec = shift;          my $rec = shift;
827          confess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
828    
829            undef $self->{'currnet_filename'};
830            undef $self->{'headline'};
831    
832          my @sorted_tags;          my @sorted_tags;
833          if ($self->{tags_by_order}) {          if ($self->{tags_by_order}) {
# Line 510  sub data_structure { Line 837  sub data_structure {
837                  $self->{tags_by_order} = \@sorted_tags;                  $self->{tags_by_order} = \@sorted_tags;
838          }          }
839    
840          my $ds;          my @ds;
841    
842            $log->debug("tags: ",sub { join(", ",@sorted_tags) });
843    
844          foreach my $field (@sorted_tags) {          foreach my $field (@sorted_tags) {
845    
846                  my $row;                  my $row;
                 my $i = 0;  
847    
848  #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});  #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
849    
850                  foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {                  foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
851                            my $format = $tag->{'value'} || $tag->{'content'};
852    
853                          my $v = $self->parse($rec,$tag->{'content'},$i);                          $log->debug("format: $format");
 print "## $i:",$tag->{'content'}," = ",($v || 'null'),"\n";  
   
                         next if (!$v || $v && $v eq '');  
854    
855                          # does tag have type?                          my @v;
856                          if ($tag->{'type'}) {                          if ($format =~ /$LOOKUP_REGEX/o) {
857                                  push @{$row->{$tag->{'type'}}}, $v;                                  @v = $self->fill_in_to_arr($rec,$format);
858                          } else {                          } else {
859                                  push @{$row->{'display'}}, $v;                                  @v = $self->parse_to_arr($rec,$format);
860                                  push @{$row->{'swish'}}, $v;                          }
861                            next if (! @v);
862    
863                            if ($tag->{'sort'}) {
864                                    @v = $self->sort_arr(@v);
865                                    $log->warn("sort within tag is usually not what you want!");
866                            }
867    
868                            # use format?
869                            if ($tag->{'format_name'}) {
870                                    @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
871                            }
872    
873                            if ($field eq 'filename') {
874                                    $self->{'current_filename'} = join('',@v);
875                                    $log->debug("filename: ",$self->{'current_filename'});
876                            } elsif ($field eq 'headline') {
877                                    $self->{'headline'} .= join('',@v);
878                                    $log->debug("headline: ",$self->{'headline'});
879                                    next; # don't return headline in data_structure!
880                          }                          }
881    
882                            # delimiter will join repeatable fields
883                            if ($tag->{'delimiter'}) {
884                                    @v = ( join($tag->{'delimiter'}, @v) );
885                            }
886    
887                            # default types
888                            my @types = qw(display swish);
889                            # override by type attribute
890                            @types = ( $tag->{'type'} ) if ($tag->{'type'});
891    
892                            foreach my $type (@types) {
893                                    # append to previous line?
894                                    $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
895                                    if ($tag->{'append'}) {
896    
897                                            # I will delimit appended part with
898                                            # delimiter (or ,)
899                                            my $d = $tag->{'delimiter'};
900                                            # default delimiter
901                                            $d ||= ", ";
902    
903                                            my $last = pop @{$row->{$type}};
904                                            $d = "" if (! $last);
905                                            $last .= $d . join($d, @v);
906                                            push @{$row->{$type}}, $last;
907    
908                                    } else {
909                                            push @{$row->{$type}}, @v;
910                                    }
911                            }
912    
913    
914                  }                  }
915    
916                  push @{$ds->{$field}}, $row if ($row);                  if ($row) {
917                            $row->{'tag'} = $field;
918    
919                            # TODO: name_sigular, name_plural
920                            my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
921                            $row->{'name'} = $name ? $self->_x($name) : $field;
922    
923                            # post-sort all values in field
924                            if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
925                                    $log->warn("sort at field tag not implemented");
926                            }
927    
928                            push @ds, $row;
929    
930                            $log->debug("row $field: ",sub { Dumper($row) });
931                    }
932    
933          }          }
934    
935          print Dumper($ds);          return @ds;
936    
937  }  }
938    
939    =head2 output
940    
941    Create output from in-memory data structure using Template Toolkit template.
942    
943    my $text = $webpac->output( template => 'text.tt', data => @ds );
944    
945    =cut
946    
947    sub output {
948            my $self = shift;
949    
950            my $args = {@_};
951    
952            my $log = $self->_get_logger();
953    
954            $log->logconfess("need template name") if (! $args->{'template'});
955            $log->logconfess("need data array") if (! $args->{'data'});
956    
957            my $out;
958    
959            $self->{'tt'}->process(
960                    $args->{'template'},
961                    $args,
962                    \$out
963            ) || confess $self->{'tt'}->error();
964    
965            return $out;
966    }
967    
968    =head2 output_file
969    
970    Create output from in-memory data structure using Template Toolkit template
971    to a file.
972    
973     $webpac->output_file(
974            file => 'out.txt',
975            template => 'text.tt',
976            data => @ds
977     );
978    
979    =cut
980    
981    sub output_file {
982            my $self = shift;
983    
984            my $args = {@_};
985    
986            my $log = $self->_get_logger();
987    
988            my $file = $args->{'file'} || $log->logconfess("need file name");
989    
990            $log->debug("creating file ",$file);
991    
992            open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
993            print $fh $self->output(
994                    template => $args->{'template'},
995                    data => $args->{'data'},
996            ) || $log->logdie("print: $!");
997            close($fh) || $log->logdie("close: $!");
998    }
999    
1000    =head2 apply_format
1001    
1002    Apply format specified in tag with C<format_name="name"> and
1003    C<format_delimiter=";;">.
1004    
1005     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1006    
1007    Formats can contain C<lookup{...}> if you need them.
1008    
1009    =cut
1010    
1011    sub apply_format {
1012            my $self = shift;
1013    
1014            my ($name,$delimiter,$data) = @_;
1015    
1016            my $log = $self->_get_logger();
1017    
1018            if (! $self->{'import_xml'}->{'format'}->{$name}) {
1019                    $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1020                    return $data;
1021            }
1022    
1023            $log->warn("no delimiter for format $name") if (! $delimiter);
1024    
1025            my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1026    
1027            my @data = split(/\Q$delimiter\E/, $data);
1028    
1029            my $out = sprintf($format, @data);
1030            $log->debug("using format $name [$format] on $data to produce: $out");
1031    
1032            if ($out =~ m/$LOOKUP_REGEX/o) {
1033                    return $self->lookup($out);
1034            } else {
1035                    return $out;
1036            }
1037    
1038    }
1039    
1040    
1041    #
1042    #
1043    #
1044    
1045    =head1 INTERNAL METHODS
1046    
1047    Here is a quick list of internal methods, mostly useful to turn debugging
1048    on them (see L<LOGGING> below for explanation).
1049    
1050    =cut
1051    
1052    =head2 _eval
1053    
1054    Internal function to eval code without C<strict 'subs'>.
1055    
1056    =cut
1057    
1058    sub _eval {
1059            my $self = shift;
1060    
1061            my $code = shift || return;
1062    
1063            my $log = $self->_get_logger();
1064    
1065            no strict 'subs';
1066            my $ret = eval $code;
1067            if ($@) {
1068                    $log->error("problem with eval code [$code]: $@");
1069            }
1070    
1071            $log->debug("eval: ",$code," [",$ret,"]");
1072    
1073            return $ret || 0;
1074    }
1075    
1076    =head2 _sort_by_order
1077    
1078    Sort xml tags data structure accoding to C<order=""> attribute.
1079    
1080    =cut
1081    
1082    sub _sort_by_order {
1083            my $self = shift;
1084    
1085            my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1086                    $self->{'import_xml'}->{'indexer'}->{$a};
1087            my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1088                    $self->{'import_xml'}->{'indexer'}->{$b};
1089    
1090            return $va <=> $vb;
1091    }
1092    
1093    =head2 _get_logger
1094    
1095    Get C<Log::Log4perl> object with a twist: domains are defined for each
1096    method
1097    
1098     my $log = $webpac->_get_logger();
1099    
1100    =cut
1101    
1102    sub _get_logger {
1103            my $self = shift;
1104    
1105            my $name = (caller(1))[3] || caller;
1106            return get_logger($name);
1107    }
1108    
1109    =head2 _x
1110    
1111    Convert string from UTF-8 to code page defined in C<import_xml>.
1112    
1113     my $text = $webpac->_x('utf8 text');
1114    
1115    =cut
1116    
1117    sub _x {
1118            my $self = shift;
1119            my $utf8 = shift || return;
1120    
1121            return $self->{'utf2cp'}->convert($utf8) ||
1122                    $self->_get_logger()->logwarn("can't convert '$utf8'");
1123    }
1124    
1125    #
1126    #
1127    #
1128    
1129    =head1 LOGGING
1130    
1131    Logging in WebPAC is performed by L<Log::Log4perl> with config file
1132    C<log.conf>.
1133    
1134    Methods defined above have different levels of logging, so
1135    it's descriptions will be useful to turn (mostry B<debug> logging) on
1136    or off to see why WabPAC isn't perforing as you expect it (it might even
1137    be a bug!).
1138    
1139    B<This is different from normal Log4perl behaviour>. To repeat, you can
1140    also use method names, and not only classes (which are just few)
1141    to filter logging.
1142    
1143    
1144    =head1 MEMORY USAGE
1145    
1146    C<low_mem> options is double-edged sword. If enabled, WebPAC
1147    will run on memory constraint machines (which doesn't have enough
1148    physical RAM to create memory structure for whole source database).
1149    
1150    If your machine has 512Mb or more of RAM and database is around 10000 records,
1151    memory shouldn't be an issue. If you don't have enough physical RAM, you
1152    might consider using virtual memory (if your operating system is handling it
1153    well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1154    parsed structure of ISIS database (this is what C<low_mem> option does).
1155    
1156    Hitting swap at end of reading source database is probably o.k. However,
1157    hitting swap before 90% will dramatically decrease performance and you will
1158    be better off with C<low_mem> and using rest of availble memory for
1159    operating system disk cache (Linux is particuallary good about this).
1160    However, every access to database record will require disk access, so
1161    generation phase will be slower 10-100 times.
1162    
1163    Parsed structures are essential - you just have option to trade RAM memory
1164    (which is fast) for disk space (which is slow). Be sure to have planty of
1165    disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1166    
1167    However, when WebPAC is running on desktop machines (or laptops :-), it's
1168    highly undesireable for system to start swapping. Using C<low_mem> option can
1169    reduce WecPAC memory usage to around 64Mb for same database with lookup
1170    fields and sorted indexes which stay in RAM. Performance will suffer, but
1171    memory usage will really be minimal. It might be also more confortable to
1172    run WebPAC reniced on those machines.
1173    
1174    =cut
1175    
1176  1;  1;

Legend:
Removed from v.366  
changed lines
  Added in v.459

  ViewVC Help
Powered by ViewVC 1.1.26