/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 366 by dpavlin, Thu Jun 17 01:44:25 2004 UTC revision 501 by dpavlin, Sun Oct 10 11:25:10 2004 UTC
# Line 1  Line 1 
1  package WebPAC;  package WebPAC;
2    
3    use warnings;
4    use strict;
5    
6  use Carp;  use Carp;
7  use Text::Iconv;  use Text::Iconv;
8  use Config::IniFiles;  use Config::IniFiles;
9  use XML::Simple;  use XML::Simple;
10    use Template;
11    use Log::Log4perl qw(get_logger :levels);
12    use Time::HiRes qw(time);
13    
14  use Data::Dumper;  use Data::Dumper;
15    
16    #my $LOOKUP_REGEX = '\[[^\[\]]+\]';
17    #my $LOOKUP_REGEX_SAVE = '\[([^\[\]]+)\]';
18    my $LOOKUP_REGEX = 'lookup{[^\{\}]+}';
19    my $LOOKUP_REGEX_SAVE = 'lookup{([^\{\}]+)}';
20    
21  =head1 NAME  =head1 NAME
22    
23  WebPAC - base class for WebPAC  WebPAC - base class for WebPAC
# Line 19  This module implements methods used by W Line 30  This module implements methods used by W
30    
31  =head2 new  =head2 new
32    
33  This will create new instance of WebPAC using configuration specified by C<config_file>.  Create new instance of WebPAC using configuration specified by C<config_file>.
34    
35   my $webpac = new WebPAC(   my $webpac = new WebPAC(
36          config_file => 'name.conf',          config_file => 'name.conf',
37          [code_page => 'ISO-8859-2',]          code_page => 'ISO-8859-2',
38            low_mem => 1,
39            filter => {
40                    'lower' => sub { lc($_[0]) },
41            },
42   );   );
43    
44  Default C<code_page> is C<ISO-8859-2>.  Default C<code_page> is C<ISO-8859-2>.
45    
46  It will also read configuration files  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
47    
48    There is optinal parametar C<filter> which specify different filters which
49    can be applied using C<filter{name}> notation.
50    
51    This method will also read configuration files
52  C<global.conf> (used by indexer and Web font-end)  C<global.conf> (used by indexer and Web font-end)
53  and configuration file specified by C<config_file>  and configuration file specified by C<config_file>
54  which describes databases to be indexed.  which describes databases to be indexed.
# Line 49  sub new { Line 69  sub new {
69          my $self = {@_};          my $self = {@_};
70          bless($self, $class);          bless($self, $class);
71    
72            $self->{'start_t'} = time();
73    
74            my $log_file = $self->{'log'} || "log.conf";
75            Log::Log4perl->init($log_file);
76    
77            my $log = $self->_get_logger();
78    
79          # fill in default values          # fill in default values
80          # output codepage          # output codepage
81          $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});          $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
# Line 56  sub new { Line 83  sub new {
83          #          #
84          # read global.conf          # read global.conf
85          #          #
86            $log->debug("read 'global.conf'");
87    
88          $self->{global_config_file} = new Config::IniFiles( -file => 'global.conf' ) || croak "can't open 'global.conf'";          my $config = new Config::IniFiles( -file => 'global.conf' ) || $log->logcroak("can't open 'global.conf'");
89    
90          # read global config parametars          # read global config parametars
91          foreach my $var (qw(          foreach my $var (qw(
# Line 67  sub new { Line 95  sub new {
95                          dbi_passwd                          dbi_passwd
96                          show_progress                          show_progress
97                          my_unac_filter                          my_unac_filter
98                            output_template
99                  )) {                  )) {
100                  $self->{global_config}->{$var} = $self->{global_config_file}->val('global', $var);                  $self->{'global_config'}->{$var} = $config->val('global', $var);
101          }          }
102    
103          #          #
104          # read indexer config file          # read indexer config file
105          #          #
106    
107          $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || croak "can't open '$self->{config_file}'";          $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || $log->logcroak("can't open '",$self->{config_file},"'");
108    
109            # create UTF-8 convertor for import_xml files
110          $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});          $self->{'utf2cp'} = Text::Iconv->new('UTF-8' ,$self->{'code_page'});
111    
112            # create Template toolkit instance
113            $self->{'tt'} = Template->new(
114                    INCLUDE_PATH => ($self->{'global_config_file'}->{'output_template'} || './output_template'),
115    #               FILTERS => {
116    #                       'foo' => \&foo_filter,
117    #               },
118                    EVAL_PERL => 1,
119            );
120    
121            # running with low_mem flag? well, use DBM::Deep then.
122            if ($self->{'low_mem'}) {
123                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
124    
125                    my $db_file = "data.db";
126    
127                    if (-e $db_file) {
128                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
129                            $log->debug("removed '$db_file' from last run");
130                    }
131    
132                    require DBM::Deep;
133    
134                    my $db = new DBM::Deep $db_file;
135    
136                    $log->logdie("DBM::Deep error: $!") unless ($db);
137    
138                    if ($db->error()) {
139                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
140                    } else {
141                            $log->debug("using file '$db_file' for DBM::Deep");
142                    }
143    
144                    $self->{'db'} = $db;
145            }
146    
147            $log->debug("filters defined: ",Dumper($self->{'filter'}));
148    
149          return $self;          return $self;
150  }  }
151    
# Line 88  Open CDS/ISIS database using OpenIsis mo Line 156  Open CDS/ISIS database using OpenIsis mo
156   $webpac->open_isis(   $webpac->open_isis(
157          filename => '/data/ISIS/ISIS',          filename => '/data/ISIS/ISIS',
158          code_page => '852',          code_page => '852',
159          limit_mfn => '500',          limit_mfn => 500,
160            start_mfn => 6000,
161          lookup => [ ... ],          lookup => [ ... ],
162   );   );
163    
164  By default, ISIS code page is assumed to be C<852>.  By default, ISIS code page is assumed to be C<852>.
165    
166    If optional parametar C<start_mfn> is set, this will be first MFN to read
167    from database (so you can skip beginning of your database if you need to).
168    
169  If optional parametar C<limit_mfn> is set, it will read just 500 records  If optional parametar C<limit_mfn> is set, it will read just 500 records
170  from database in example above.  from database in example above.
171    
 Returns number of last record read into memory (size of database, really).  
   
172  C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and  C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
173  C<val>. Optional parametar C<eval> is perl code to evaluate before storing  C<val>. Optional parametar C<eval> is perl code to evaluate before storing
174  value in index.  value in index.
# Line 110  value in index. Line 180  value in index.
180      'val' => 'v900' },      'val' => 'v900' },
181   ]   ]
182    
183    Returns number of last record read into memory (size of database, really).
184    
185  =cut  =cut
186    
187  sub open_isis {  sub open_isis {
188          my $self = shift;          my $self = shift;
189          my $arg = {@_};          my $arg = {@_};
190    
191          croak "need filename" if (! $arg->{'filename'});          my $log = $self->_get_logger();
192    
193            $log->logcroak("need filename") if (! $arg->{'filename'});
194          my $code_page = $arg->{'code_page'} || '852';          my $code_page = $arg->{'code_page'} || '852';
195    
196            $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
197    
198            # store data in object
199            $self->{'isis_filename'} = $arg->{'filename'};
200            $self->{'isis_code_page'} = $code_page;
201    
202          use OpenIsis;          use OpenIsis;
203    
204          #$self->{'isis_code_page'} = $code_page;          #$self->{'isis_code_page'} = $code_page;
# Line 126  sub open_isis { Line 206  sub open_isis {
206          # create Text::Iconv object          # create Text::Iconv object
207          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
208    
209          print STDERR "reading ISIS database '",$arg->{'filename'},"'\n" if ($self->{'debug'});          $log->info("reading ISIS database '",$arg->{'filename'},"'");
210            $log->debug("isis code page: $code_page");
211    
212          my $isis_db = OpenIsis::open($arg->{'filename'});          my $isis_db = OpenIsis::open($arg->{'filename'});
213    
214          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
215            my $startmfn = 1;
216    
217          $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});          if (my $s = $self->{'start_mfn'}) {
218                    $log->info("skipping to MFN $s");
219                    $startmfn = $s;
220            } else {
221                    $self->{'start_mfn'} = $startmfn;
222            }
223    
224            $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
225    
226          print STDERR "processing $maxmfn records...\n" if ($self->{'debug'});          $log->info("processing ",($maxmfn-$startmfn)." records...");
227    
228          # read database          # read database
229          for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {          for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
230    
231    
232                    $log->debug("mfn: $mfn\n");
233    
234                    my $rec;
235    
236                  # read record                  # read record
237                  my $row = OpenIsis::read( $isis_db, $mfn );                  my $row = OpenIsis::read( $isis_db, $mfn );
# Line 156  sub open_isis { Line 250  sub open_isis {
250                                                  $val = $l;                                                  $val = $l;
251                                          }                                          }
252    
253                                          push @{$self->{'data'}->{$mfn}->{$k}}, $val;                                          push @{$rec->{$k}}, $val;
254                                  }                                  }
255                            } else {
256                                    push @{$rec->{'000'}}, $mfn;
257                          }                          }
258    
259                  }                  }
260    
261                    $log->confess("record $mfn empty?") unless ($rec);
262    
263                    # store
264                    if ($self->{'low_mem'}) {
265                            $self->{'db'}->put($mfn, $rec);
266                    } else {
267                            $self->{'data'}->{$mfn} = $rec;
268                    }
269    
270                  # create lookup                  # create lookup
                 my $rec = $self->{'data'}->{$mfn};  
271                  $self->create_lookup($rec, @{$arg->{'lookup'}});                  $self->create_lookup($rec, @{$arg->{'lookup'}});
272    
273                    $self->progress_bar($mfn,$maxmfn);
274    
275          }          }
276    
277          $self->{'current_mfn'} = 1;          $self->{'current_mfn'} = -1;
278            $self->{'last_pcnt'} = 0;
279    
280            $log->debug("max mfn: $maxmfn");
281    
282          # store max mfn and return it.          # store max mfn and return it.
283          return $self->{'max_mfn'} = $maxmfn;          return $self->{'max_mfn'} = $maxmfn;
# Line 186  it's implemented, that is). Line 295  it's implemented, that is).
295  sub fetch_rec {  sub fetch_rec {
296          my $self = shift;          my $self = shift;
297    
298          my $mfn = $self->{'current_mfn'}++ || confess "it seems that you didn't load database!";          my $log = $self->_get_logger();
299    
300            $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
301    
302            if ($self->{'current_mfn'} == -1) {
303                    $self->{'current_mfn'} = $self->{'start_mfn'};
304            } else {
305                    $self->{'current_mfn'}++;
306            }
307    
308            my $mfn = $self->{'current_mfn'};
309    
310          if ($mfn > $self->{'max_mfn'}) {          if ($mfn > $self->{'max_mfn'}) {
311                  $self->{'current_mfn'} = $self->{'max_mfn'};                  $self->{'current_mfn'} = $self->{'max_mfn'};
312                    $log->debug("at EOF");
313                  return;                  return;
314          }          }
315    
316          return $self->{'data'}->{$mfn};          $self->progress_bar($mfn,$self->{'max_mfn'});
317    
318            if ($self->{'low_mem'}) {
319                    return $self->{'db'}->get($mfn);
320            } else {
321                    return $self->{'data'}->{$mfn};
322            }
323    }
324    
325    =head2 mfn
326    
327    Returns current record number (MFN).
328    
329     print $webpac->mfn;
330    
331    =cut
332    
333    sub mfn {
334            my $self = shift;
335            return $self->{'current_mfn'};
336    }
337    
338    =head2 progress_bar
339    
340    Draw progress bar on STDERR.
341    
342     $webpac->progress_bar($current, $max);
343    
344    =cut
345    
346    sub progress_bar {
347            my $self = shift;
348    
349            my ($curr,$max) = @_;
350    
351            my $log = $self->_get_logger();
352    
353            $log->logconfess("no current value!") if (! $curr);
354            $log->logconfess("no maximum value!") if (! $max);
355    
356            if ($curr > $max) {
357                    $max = $curr;
358                    $log->debug("overflow to $curr");
359            }
360    
361            $self->{'last_pcnt'} ||= 1;
362    
363            my $p = int($curr * 100 / $max) || 1;
364    
365            # reset on re-run
366            if ($p < $self->{'last_pcnt'}) {
367                    $self->{'last_pcnt'} = $p;
368                    $self->{'last_t'} = time();
369                    $self->{'last_curr'} = undef;
370            }
371    
372            $self->{'last_t'} ||= time();
373    
374            if ($p != $self->{'last_pcnt'}) {
375    
376                    my $last_curr = $self->{'last_curr'} || $curr;
377                    my $t = time();
378                    my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
379                    my $eta = ($max-$curr) / ($rate || 1);
380                    printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
381                    $self->{'last_pcnt'} = $p;
382                    $self->{'last_t'} = time();
383                    $self->{'last_curr'} = $curr;
384            }
385            print STDERR "\n" if ($p == 100);
386    }
387    
388    =head2 fmt_time
389    
390    Format time (in seconds) for display.
391    
392     print $webpac->fmt_time(time());
393    
394    This method is called by L<progress_bar> to display remaining time.
395    
396    =cut
397    
398    sub fmt_time {
399            my $self = shift;
400    
401            my $t = shift || 0;
402            my $out = "";
403    
404            my ($ss,$mm,$hh) = gmtime($t);
405            $out .= "${hh}h" if ($hh);
406            $out .= sprintf("%02d:%02d", $mm,$ss);
407            $out .= "  " if ($hh == 0);
408            return $out;
409  }  }
410    
411  =head2 open_import_xml  =head2 open_import_xml
# Line 207  Read file from C<import_xml/> directory Line 419  Read file from C<import_xml/> directory
419  sub open_import_xml {  sub open_import_xml {
420          my $self = shift;          my $self = shift;
421    
422            my $log = $self->_get_logger();
423    
424          my $arg = {@_};          my $arg = {@_};
425          confess "need type to load file from import_xml/" if (! $arg->{'type'});          $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
426    
427          $self->{'type'} = $arg->{'type'};          $self->{'type'} = $arg->{'type'};
428    
# Line 217  sub open_import_xml { Line 431  sub open_import_xml {
431    
432          $self->{'tag'} = $type2tag{$type_base};          $self->{'tag'} = $type2tag{$type_base};
433    
434          print STDERR "using type ",$self->{'type'}," tag ",$self->{'tag'},"\n" if ($self->{'debug'});          $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
435    
436          my $f = "./import_xml/".$self->{'type'}.".xml";          my $f = "./import_xml/".$self->{'type'}.".xml";
437          confess "import_xml file '$f' doesn't exist!" if (! -e "$f");          $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
438    
439            $log->info("reading '$f'");
440    
441          print STDERR "reading '$f'\n" if ($self->{'debug'});          $self->{'import_xml_file'} = $f;
442    
443          $self->{'import_xml'} = XMLin($f,          $self->{'import_xml'} = XMLin($f,
444                  ForceArray => [ $self->{'tag'}, 'config', 'format' ],                  ForceArray => [ $self->{'tag'}, 'config', 'format' ],
                 ForceContent => 1  
445          );          );
446    
447          print Dumper($self->{'import_xml'});          $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
448    
449  }  }
450    
# Line 237  sub open_import_xml { Line 452  sub open_import_xml {
452    
453  Create lookup from record using lookup definition.  Create lookup from record using lookup definition.
454    
455     $self->create_lookup($rec, @lookups);
456    
457    Called internally by C<open_*> methods.
458    
459  =cut  =cut
460    
461  sub create_lookup {  sub create_lookup {
462          my $self = shift;          my $self = shift;
463    
464          my $rec = shift || confess "need record to create lookup";          my $log = $self->_get_logger();
465          confess("need HASH as first argument!") if ($rec !~ /HASH/o);  
466            my $rec = shift || $log->logconfess("need record to create lookup");
467            $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
468    
469          foreach my $i (@_) {          foreach my $i (@_) {
470                  if ($i->{'eval'}) {                  $log->logconfess("need key") unless defined($i->{'key'});
471                          my $eval = $self->fill_in($rec,$i->{'eval'});                  $log->logconfess("need val") unless defined($i->{'val'});
472                          my $key = $self->fill_in($rec,$i->{'key'});  
473                          my @val = $self->fill_in($rec,$i->{'val'});                  if (defined($i->{'eval'})) {
474                          if ($key && @val && eval $eval) {                          # eval first, so we can skip fill_in for key and val
475                            my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
476                            if ($self->_eval($eval)) {
477                                    my $key = $self->fill_in($rec,$i->{'key'}) || next;
478                                    my @val = $self->fill_in($rec,$i->{'val'}) || next;
479                                    $log->debug("stored $key = ",sub { join(" | ",@val) });
480                                  push @{$self->{'lookup'}->{$key}}, @val;                                  push @{$self->{'lookup'}->{$key}}, @val;
481                          }                          }
482                  } else {                  } else {
483                          my $key = $self->fill_in($rec,$i->{'key'});                          my $key = $self->fill_in($rec,$i->{'key'}) || next;
484                          my @val = $self->fill_in($rec,$i->{'val'});                          my @val = $self->fill_in($rec,$i->{'val'}) || next;
485                          if ($key && @val) {                          $log->debug("stored $key = ",sub { join(" | ",@val) });
486                                  push @{$self->{'lookup'}->{$key}}, @val;                          push @{$self->{'lookup'}->{$key}}, @val;
                         }  
487                  }                  }
488          }          }
489  }  }
# Line 267  sub create_lookup { Line 492  sub create_lookup {
492    
493  Returns value from record.  Returns value from record.
494    
495   $self->get_data(\$rec,$f,$sf,$i,\$found);   my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
496    
497  Arguments are:  Arguments are:
498  record reference C<$rec>,  record reference C<$rec>,
# Line 275  field C<$f>, Line 500  field C<$f>,
500  optional subfiled C<$sf>,  optional subfiled C<$sf>,
501  index for repeatable values C<$i>.  index for repeatable values C<$i>.
502    
503  Optinal variable C<$found> will be incremeted if thre  Optinal variable C<$found> will be incremeted if there
504  is field.  is field.
505    
506  Returns value or empty string.  Returns value or empty string.
# Line 286  sub get_data { Line 511  sub get_data {
511          my $self = shift;          my $self = shift;
512    
513          my ($rec,$f,$sf,$i,$found) = @_;          my ($rec,$f,$sf,$i,$found) = @_;
514    
515          if ($$rec->{$f}) {          if ($$rec->{$f}) {
516                    return '' if (! $$rec->{$f}->[$i]);
517                    no strict 'refs';
518                  if ($sf && $$rec->{$f}->[$i]->{$sf}) {                  if ($sf && $$rec->{$f}->[$i]->{$sf}) {
519                          $$found++ if (defined($$found));                          $$found++ if (defined($$found));
520                          return $$rec->{$f}->[$i]->{$sf};                          return $$rec->{$f}->[$i]->{$sf};
# Line 315  Workhourse of all: takes record from in- Line 543  Workhourse of all: takes record from in-
543  strings with placeholders and returns string or array of with substituted  strings with placeholders and returns string or array of with substituted
544  values from record.  values from record.
545    
546   $webpac->fill_in($rec,'v250^a');   my $text = $webpac->fill_in($rec,'v250^a');
547    
548  Optional argument is ordinal number for repeatable fields. By default,  Optional argument is ordinal number for repeatable fields. By default,
549  it's assume to be first repeatable field (fields are perl array, so first  it's assume to be first repeatable field (fields are perl array, so first
550  element is 0).  element is 0).
551  Following example will read second value from repeatable field.  Following example will read second value from repeatable field.
552    
553   $webpac->fill_in($rec,'Title: v250^a',1);   my $text = $webpac->fill_in($rec,'Title: v250^a',1);
554    
555  This function B<does not> perform parsing of format to inteligenty skip  This function B<does not> perform parsing of format to inteligenty skip
556  delimiters before fields which aren't used.  delimiters before fields which aren't used.
557    
558    This method will automatically decode UTF-8 string to local code page
559    if needed.
560    
561  =cut  =cut
562    
563  sub fill_in {  sub fill_in {
564          my $self = shift;          my $self = shift;
565    
566          my $rec = shift || confess "need data record";          my $log = $self->_get_logger();
567          my $format = shift || confess "need format to parse";  
568            my $rec = shift || $log->logconfess("need data record");
569            my $format = shift || $log->logconfess("need format to parse");
570          # iteration (for repeatable fields)          # iteration (for repeatable fields)
571          my $i = shift || 0;          my $i = shift || 0;
572    
573          # FIXME remove for speedup?          # FIXME remove for speedup?
574          confess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
575    
576            if (utf8::is_utf8($format)) {
577                    $format = $self->_x($format);
578            }
579    
580          my $found = 0;          my $found = 0;
581    
# Line 346  sub fill_in { Line 583  sub fill_in {
583          # remove eval{...} from beginning          # remove eval{...} from beginning
584          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
585    
586            my $filter_name;
587            # remove filter{...} from beginning
588            $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
589    
590          # do actual replacement of placeholders          # do actual replacement of placeholders
591          $format =~ s/v(\d+)(?:\^(\w))*/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;          $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
592    
593          if ($found) {          if ($found) {
594                    $log->debug("format: $format");
595                  if ($eval_code) {                  if ($eval_code) {
596                          my $eval = $self->fill_in($rec,$eval_code,$i);                          my $eval = $self->fill_in($rec,$eval_code,$i);
597                          return if (! eval $eval);                          return if (! $self->_eval($eval));
598                    }
599                    if ($filter_name && $self->{'filter'}->{$filter_name}) {
600                            $log->debug("filter '$filter_name' for $format");
601                            $format = $self->{'filter'}->{$filter_name}->($format);
602                            return unless(defined($format));
603                            $log->debug("filter result: $format");
604                  }                  }
605                  # do we have lookups?                  # do we have lookups?
606                  if ($format =~ /\[[^\[\]]+\]/o) {                  if ($format =~ /$LOOKUP_REGEX/o) {
607                            $log->debug("format '$format' has lookup");
608                          return $self->lookup($format);                          return $self->lookup($format);
609                  } else {                  } else {
610                          return $format;                          return $format;
# Line 369  sub fill_in { Line 618  sub fill_in {
618    
619  Perform lookups on format supplied to it.  Perform lookups on format supplied to it.
620    
621   my $txt = $self->lookup('[v900]');   my $text = $self->lookup('[v900]');
622    
623  Lookups can be nested (like C<[d:[a:[v900]]]>).  Lookups can be nested (like C<[d:[a:[v900]]]>).
624    
# Line 378  Lookups can be nested (like C<[d:[a:[v90 Line 627  Lookups can be nested (like C<[d:[a:[v90
627  sub lookup {  sub lookup {
628          my $self = shift;          my $self = shift;
629    
630          my $tmp = shift || confess "need format";          my $log = $self->_get_logger();
631    
632            my $tmp = shift || $log->logconfess("need format");
633    
634          if ($tmp =~ /\[[^\[\]]+\]/o) {          if ($tmp =~ /$LOOKUP_REGEX/o) {
635                  my @in = ( $tmp );                  my @in = ( $tmp );
636  #print "##lookup $tmp\n";  
637                    $log->debug("lookup for: ",$tmp);
638    
639                  my @out;                  my @out;
640                  while (my $f = shift @in) {                  while (my $f = shift @in) {
641                          if ($f =~ /\[([^\[\]]+)\]/) {                          if ($f =~ /$LOOKUP_REGEX_SAVE/o) {
642                                  my $k = $1;                                  my $k = $1;
643                                  if ($self->{'lookup'}->{$k}) {                                  if ($self->{'lookup'}->{$k}) {
 #print "## lookup key = $k\n";  
644                                          foreach my $nv (@{$self->{'lookup'}->{$k}}) {                                          foreach my $nv (@{$self->{'lookup'}->{$k}}) {
645                                                  my $tmp2 = $f;                                                  my $tmp2 = $f;
646                                                  $tmp2 =~ s/\[$k\]/$nv/g;                                                  $tmp2 =~ s/lookup{$k}/$nv/g;
647                                                  push @in, $tmp2;                                                  push @in, $tmp2;
 #print "## lookup in => $tmp2\n";  
648                                          }                                          }
649                                  } else {                                  } else {
650                                          undef $f;                                          undef $f;
651                                  }                                  }
652                          } elsif ($f) {                          } elsif ($f) {
653                                  push @out, $f;                                  push @out, $f;
 #print "## lookup out => $f\n";  
654                          }                          }
655                  }                  }
656                    $log->logconfess("return is array and it's not expected!") unless wantarray;
657                  return @out;                  return @out;
658          } else {          } else {
659                  return $tmp;                  return $tmp;
# Line 415  Perform smart parsing of string, skippin Line 666  Perform smart parsing of string, skippin
666  defined. It can also eval code in format starting with C<eval{...}> and  defined. It can also eval code in format starting with C<eval{...}> and
667  return output or nothing depending on eval code.  return output or nothing depending on eval code.
668    
669   $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);   my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
670    
671  =cut  =cut
672    
# Line 426  sub parse { Line 677  sub parse {
677    
678          return if (! $format_utf8);          return if (! $format_utf8);
679    
680          confess("need HASH as first argument!") if ($rec !~ /HASH/o);          my $log = $self->_get_logger();
681          confess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});  
682            $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
683            $log->logconfess("need utf2cp Text::Iconv object!") if (! $self->{'utf2cp'});
684    
685          $i = 0 if (! $i);          $i = 0 if (! $i);
686    
687          my $format = $self->{'utf2cp'}->convert($format_utf8) || confess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});          my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
688    
689          my @out;          my @out;
690    
691            $log->debug("format: $format");
692    
693          my $eval_code;          my $eval_code;
694          # remove eval{...} from beginning          # remove eval{...} from beginning
695          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
696    
697            my $filter_name;
698            # remove filter{...} from beginning
699            $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
700    
701          my $prefix;          my $prefix;
702          my $all_found=0;          my $all_found=0;
703    
704  #print "## $format\n";          while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
         while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))*//s) {  
 #print "## [ $1 | $2 | $3 ] $format\n";  
705    
706                  my $del = $1 || '';                  my $del = $1 || '';
707                  $prefix ||= $del if ($all_found == 0);                  $prefix ||= $del if ($all_found == 0);
# Line 461  sub parse { Line 718  sub parse {
718    
719          return if (! $all_found);          return if (! $all_found);
720    
721          my $out = join('',@out) . $format;          my $out = join('',@out);
722    
723            if ($out) {
724                    # add rest of format (suffix)
725                    $out .= $format;
726    
727                    # add prefix if not there
728                    $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
729    
730                    $log->debug("result: $out");
731            }
732    
         # add prefix if not there  
         $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);  
           
733          if ($eval_code) {          if ($eval_code) {
734                  my $eval = $self->fill_in($rec,$eval_code,$i);                  my $eval = $self->fill_in($rec,$eval_code,$i) || return;
735                  return if (! eval $eval);                  $log->debug("about to eval{$eval} format: $out");
736                    return if (! $self->_eval($eval));
737            }
738            
739            if ($filter_name && $self->{'filter'}->{$filter_name}) {
740                    $log->debug("about to filter{$filter_name} format: $out");
741                    $out = $self->{'filter'}->{$filter_name}->($out);
742                    return unless(defined($out));
743                    $log->debug("filter result: $out");
744          }          }
745    
746          return $out;          return $out;
747  }  }
748    
749  =head2 data_structure  =head2 parse_to_arr
750    
751  Create in-memory data structure which represents layout from C<import_xml>.  Similar to C<parse>, but returns array of all repeatable fields
 It is used later to produce output.  
752    
753   my $ds = $webpac->data_structure($rec);   my @arr = $webpac->parse_to_arr($rec,'v250^a');
754    
755  =cut  =cut
756    
757  # private method _sort_by_order  sub parse_to_arr {
 # sort subrouting using order="" attribute  
 sub _sort_by_order {  
758          my $self = shift;          my $self = shift;
759    
760          my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||          my ($rec, $format_utf8) = @_;
                 $self->{'import_xml'}->{'indexer'}->{$a};  
         my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||  
                 $self->{'import_xml'}->{'indexer'}->{$b};  
761    
762          return $va <=> $vb;          my $log = $self->_get_logger();
763    
764            $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
765            return if (! $format_utf8);
766    
767            my $i = 0;
768            my @arr;
769    
770            while (my $v = $self->parse($rec,$format_utf8,$i++)) {
771                    push @arr, $v;
772            }
773    
774            $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
775    
776            return @arr;
777    }
778    
779    =head2 fill_in_to_arr
780    
781    Similar to C<fill_in>, but returns array of all repeatable fields. Usable
782    for fields which have lookups, so they shouldn't be parsed but rather
783    C<fill_id>ed.
784    
785     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
786    
787    =cut
788    
789    sub fill_in_to_arr {
790            my $self = shift;
791    
792            my ($rec, $format_utf8) = @_;
793    
794            my $log = $self->_get_logger();
795    
796            $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
797            return if (! $format_utf8);
798    
799            my $i = 0;
800            my @arr;
801    
802            while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
803                    push @arr, @v;
804            }
805    
806            $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
807    
808            return @arr;
809  }  }
810    
811    =head2 sort_arr
812    
813    Sort array ignoring case and html in data
814    
815     my @sorted = $webpac->sort_arr(@unsorted);
816    
817    =cut
818    
819    sub sort_arr {
820            my $self = shift;
821    
822            my $log = $self->_get_logger();
823    
824            # FIXME add Schwartzian Transformation?
825    
826            my @sorted = sort {
827                    $a =~ s#<[^>]+/*>##;
828                    $b =~ s#<[^>]+/*>##;
829                    lc($b) cmp lc($a)
830            } @_;
831            $log->debug("sorted values: ",sub { join(", ",@sorted) });
832    
833            return @sorted;
834    }
835    
836    
837    =head2 data_structure
838    
839    Create in-memory data structure which represents layout from C<import_xml>.
840    It is used later to produce output.
841    
842     my @ds = $webpac->data_structure($rec);
843    
844    This method will also set C<$webpac->{'currnet_filename'}> if there is
845    <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
846    <headline> tag.
847    
848    =cut
849    
850  sub data_structure {  sub data_structure {
851          my $self = shift;          my $self = shift;
852    
853            my $log = $self->_get_logger();
854    
855          my $rec = shift;          my $rec = shift;
856          confess("need HASH as first argument!") if ($rec !~ /HASH/o);          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
857    
858            undef $self->{'currnet_filename'};
859            undef $self->{'headline'};
860    
861          my @sorted_tags;          my @sorted_tags;
862          if ($self->{tags_by_order}) {          if ($self->{tags_by_order}) {
# Line 510  sub data_structure { Line 866  sub data_structure {
866                  $self->{tags_by_order} = \@sorted_tags;                  $self->{tags_by_order} = \@sorted_tags;
867          }          }
868    
869          my $ds;          my @ds;
870    
871            $log->debug("tags: ",sub { join(", ",@sorted_tags) });
872    
873          foreach my $field (@sorted_tags) {          foreach my $field (@sorted_tags) {
874    
875                  my $row;                  my $row;
                 my $i = 0;  
876    
877  #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});  #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
878    
879                  foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {                  foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
880                            my $format = $tag->{'value'} || $tag->{'content'};
881    
882                          my $v = $self->parse($rec,$tag->{'content'},$i);                          $log->debug("format: $format");
 print "## $i:",$tag->{'content'}," = ",($v || 'null'),"\n";  
   
                         next if (!$v || $v && $v eq '');  
883    
884                          # does tag have type?                          my @v;
885                          if ($tag->{'type'}) {                          if ($format =~ /$LOOKUP_REGEX/o) {
886                                  push @{$row->{$tag->{'type'}}}, $v;                                  @v = $self->fill_in_to_arr($rec,$format);
887                          } else {                          } else {
888                                  push @{$row->{'display'}}, $v;                                  @v = $self->parse_to_arr($rec,$format);
889                                  push @{$row->{'swish'}}, $v;                          }
890                            next if (! @v);
891    
892                            if ($tag->{'sort'}) {
893                                    @v = $self->sort_arr(@v);
894                                    $log->warn("sort within tag is usually not what you want!");
895                            }
896    
897                            # use format?
898                            if ($tag->{'format_name'}) {
899                                    @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
900                            }
901    
902                            if ($field eq 'filename') {
903                                    $self->{'current_filename'} = join('',@v);
904                                    $log->debug("filename: ",$self->{'current_filename'});
905                            } elsif ($field eq 'headline') {
906                                    $self->{'headline'} .= join('',@v);
907                                    $log->debug("headline: ",$self->{'headline'});
908                                    next; # don't return headline in data_structure!
909                            }
910    
911                            # delimiter will join repeatable fields
912                            if ($tag->{'delimiter'}) {
913                                    @v = ( join($tag->{'delimiter'}, @v) );
914                            }
915    
916                            # default types
917                            my @types = qw(display swish);
918                            # override by type attribute
919                            @types = ( $tag->{'type'} ) if ($tag->{'type'});
920    
921                            foreach my $type (@types) {
922                                    # append to previous line?
923                                    $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
924                                    if ($tag->{'append'}) {
925    
926                                            # I will delimit appended part with
927                                            # delimiter (or ,)
928                                            my $d = $tag->{'delimiter'};
929                                            # default delimiter
930                                            $d ||= " ";
931    
932                                            my $last = pop @{$row->{$type}};
933                                            $d = "" if (! $last);
934                                            $last .= $d . join($d, @v);
935                                            push @{$row->{$type}}, $last;
936    
937                                    } else {
938                                            push @{$row->{$type}}, @v;
939                                    }
940                          }                          }
941    
942    
943                  }                  }
944    
945                  push @{$ds->{$field}}, $row if ($row);                  if ($row) {
946                            $row->{'tag'} = $field;
947    
948                            # TODO: name_sigular, name_plural
949                            my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
950                            $row->{'name'} = $name ? $self->_x($name) : $field;
951    
952                            # post-sort all values in field
953                            if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
954                                    $log->warn("sort at field tag not implemented");
955                            }
956    
957                            push @ds, $row;
958    
959                            $log->debug("row $field: ",sub { Dumper($row) });
960                    }
961    
962          }          }
963    
964          print Dumper($ds);          return @ds;
965    
966    }
967    
968    =head2 output
969    
970    Create output from in-memory data structure using Template Toolkit template.
971    
972    my $text = $webpac->output( template => 'text.tt', data => @ds );
973    
974    =cut
975    
976    sub output {
977            my $self = shift;
978    
979            my $args = {@_};
980    
981            my $log = $self->_get_logger();
982    
983            $log->logconfess("need template name") if (! $args->{'template'});
984            $log->logconfess("need data array") if (! $args->{'data'});
985    
986            my $out;
987    
988            $self->{'tt'}->process(
989                    $args->{'template'},
990                    $args,
991                    \$out
992            ) || confess $self->{'tt'}->error();
993    
994            return $out;
995  }  }
996    
997    =head2 output_file
998    
999    Create output from in-memory data structure using Template Toolkit template
1000    to a file.
1001    
1002     $webpac->output_file(
1003            file => 'out.txt',
1004            template => 'text.tt',
1005            data => @ds
1006     );
1007    
1008    =cut
1009    
1010    sub output_file {
1011            my $self = shift;
1012    
1013            my $args = {@_};
1014    
1015            my $log = $self->_get_logger();
1016    
1017            my $file = $args->{'file'} || $log->logconfess("need file name");
1018    
1019            $log->debug("creating file ",$file);
1020    
1021            open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
1022            print $fh $self->output(
1023                    template => $args->{'template'},
1024                    data => $args->{'data'},
1025            ) || $log->logdie("print: $!");
1026            close($fh) || $log->logdie("close: $!");
1027    }
1028    
1029    =head2 apply_format
1030    
1031    Apply format specified in tag with C<format_name="name"> and
1032    C<format_delimiter=";;">.
1033    
1034     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
1035    
1036    Formats can contain C<lookup{...}> if you need them.
1037    
1038    =cut
1039    
1040    sub apply_format {
1041            my $self = shift;
1042    
1043            my ($name,$delimiter,$data) = @_;
1044    
1045            my $log = $self->_get_logger();
1046    
1047            if (! $self->{'import_xml'}->{'format'}->{$name}) {
1048                    $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
1049                    return $data;
1050            }
1051    
1052            $log->warn("no delimiter for format $name") if (! $delimiter);
1053    
1054            my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
1055    
1056            my @data = split(/\Q$delimiter\E/, $data);
1057    
1058            my $out = sprintf($format, @data);
1059            $log->debug("using format $name [$format] on $data to produce: $out");
1060    
1061            if ($out =~ m/$LOOKUP_REGEX/o) {
1062                    return $self->lookup($out);
1063            } else {
1064                    return $out;
1065            }
1066    
1067    }
1068    
1069    
1070    #
1071    #
1072    #
1073    
1074    =head1 INTERNAL METHODS
1075    
1076    Here is a quick list of internal methods, mostly useful to turn debugging
1077    on them (see L<LOGGING> below for explanation).
1078    
1079    =cut
1080    
1081    =head2 _eval
1082    
1083    Internal function to eval code without C<strict 'subs'>.
1084    
1085    =cut
1086    
1087    sub _eval {
1088            my $self = shift;
1089    
1090            my $code = shift || return;
1091    
1092            my $log = $self->_get_logger();
1093    
1094            no strict 'subs';
1095            my $ret = eval $code;
1096            if ($@) {
1097                    $log->error("problem with eval code [$code]: $@");
1098            }
1099    
1100            $log->debug("eval: ",$code," [",$ret,"]");
1101    
1102            return $ret || undef;
1103    }
1104    
1105    =head2 _sort_by_order
1106    
1107    Sort xml tags data structure accoding to C<order=""> attribute.
1108    
1109    =cut
1110    
1111    sub _sort_by_order {
1112            my $self = shift;
1113    
1114            my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
1115                    $self->{'import_xml'}->{'indexer'}->{$a};
1116            my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
1117                    $self->{'import_xml'}->{'indexer'}->{$b};
1118    
1119            return $va <=> $vb;
1120    }
1121    
1122    =head2 _get_logger
1123    
1124    Get C<Log::Log4perl> object with a twist: domains are defined for each
1125    method
1126    
1127     my $log = $webpac->_get_logger();
1128    
1129    =cut
1130    
1131    sub _get_logger {
1132            my $self = shift;
1133    
1134            my $name = (caller(1))[3] || caller;
1135            return get_logger($name);
1136    }
1137    
1138    =head2 _x
1139    
1140    Convert string from UTF-8 to code page defined in C<import_xml>.
1141    
1142     my $text = $webpac->_x('utf8 text');
1143    
1144    =cut
1145    
1146    sub _x {
1147            my $self = shift;
1148            my $utf8 = shift || return;
1149    
1150            return $self->{'utf2cp'}->convert($utf8) ||
1151                    $self->_get_logger()->logwarn("can't convert '$utf8'");
1152    }
1153    
1154    #
1155    #
1156    #
1157    
1158    =head1 LOGGING
1159    
1160    Logging in WebPAC is performed by L<Log::Log4perl> with config file
1161    C<log.conf>.
1162    
1163    Methods defined above have different levels of logging, so
1164    it's descriptions will be useful to turn (mostry B<debug> logging) on
1165    or off to see why WabPAC isn't perforing as you expect it (it might even
1166    be a bug!).
1167    
1168    B<This is different from normal Log4perl behaviour>. To repeat, you can
1169    also use method names, and not only classes (which are just few)
1170    to filter logging.
1171    
1172    
1173    =head1 MEMORY USAGE
1174    
1175    C<low_mem> options is double-edged sword. If enabled, WebPAC
1176    will run on memory constraint machines (which doesn't have enough
1177    physical RAM to create memory structure for whole source database).
1178    
1179    If your machine has 512Mb or more of RAM and database is around 10000 records,
1180    memory shouldn't be an issue. If you don't have enough physical RAM, you
1181    might consider using virtual memory (if your operating system is handling it
1182    well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1183    parsed structure of ISIS database (this is what C<low_mem> option does).
1184    
1185    Hitting swap at end of reading source database is probably o.k. However,
1186    hitting swap before 90% will dramatically decrease performance and you will
1187    be better off with C<low_mem> and using rest of availble memory for
1188    operating system disk cache (Linux is particuallary good about this).
1189    However, every access to database record will require disk access, so
1190    generation phase will be slower 10-100 times.
1191    
1192    Parsed structures are essential - you just have option to trade RAM memory
1193    (which is fast) for disk space (which is slow). Be sure to have planty of
1194    disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1195    
1196    However, when WebPAC is running on desktop machines (or laptops :-), it's
1197    highly undesireable for system to start swapping. Using C<low_mem> option can
1198    reduce WecPAC memory usage to around 64Mb for same database with lookup
1199    fields and sorted indexes which stay in RAM. Performance will suffer, but
1200    memory usage will really be minimal. It might be also more confortable to
1201    run WebPAC reniced on those machines.
1202    
1203    =cut
1204    
1205  1;  1;

Legend:
Removed from v.366  
changed lines
  Added in v.501

  ViewVC Help
Powered by ViewVC 1.1.26