/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 363 - (hide annotations)
Wed Jun 16 20:05:19 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 9574 byte(s)
open_import_xml, debug option to new

1 dpavlin 354 package WebPAC;
2 dpavlin 352
3     use Carp;
4 dpavlin 353 use Text::Iconv;
5     use Config::IniFiles;
6 dpavlin 363 use XML::Simple;
7 dpavlin 352
8 dpavlin 358 use Data::Dumper;
9    
10 dpavlin 352 =head1 NAME
11    
12 dpavlin 354 WebPAC - base class for WebPAC
13 dpavlin 352
14     =head1 DESCRIPTION
15    
16 dpavlin 354 This module implements methods used by WebPAC.
17 dpavlin 352
18     =head1 METHODS
19    
20     =head2 new
21    
22 dpavlin 354 This will create new instance of WebPAC using configuration specified by C<config_file>.
23 dpavlin 352
24 dpavlin 354 my $webpac = new WebPAC(
25 dpavlin 352 config_file => 'name.conf',
26     [code_page => 'ISO-8859-2',]
27     );
28    
29     Default C<code_page> is C<ISO-8859-2>.
30    
31 dpavlin 353 It will also read configuration files
32     C<global.conf> (used by indexer and Web font-end)
33     and configuration file specified by C<config_file>
34     which describes databases to be indexed.
35    
36 dpavlin 352 =cut
37    
38 dpavlin 363 # mapping between data type and tag which specify
39     # format in XML file
40     my %type2tag = (
41     'isis' => 'isis',
42     # 'excel' => 'column',
43     # 'marc' => 'marc',
44     # 'feed' => 'feed'
45     );
46    
47 dpavlin 352 sub new {
48     my $class = shift;
49     my $self = {@_};
50     bless($self, $class);
51    
52     # fill in default values
53     # output codepage
54     $self->{'code_page'} = 'ISO-8859-2' if (! $self->{'code_page'});
55    
56 dpavlin 353 #
57     # read global.conf
58     #
59 dpavlin 352
60     $self->{global_config_file} = new Config::IniFiles( -file => 'global.conf' ) || croak "can't open 'global.conf'";
61    
62     # read global config parametars
63     foreach my $var (qw(
64     dbi_dbd
65     dbi_dsn
66     dbi_user
67     dbi_passwd
68     show_progress
69     my_unac_filter
70     )) {
71     $self->{global_config}->{$var} = $self->{global_config_file}->val('global', $var);
72     }
73    
74 dpavlin 353 #
75     # read indexer config file
76     #
77 dpavlin 352
78     $self->{indexer_config_file} = new Config::IniFiles( -file => $self->{config_file} ) || croak "can't open '$self->{config_file}'";
79    
80     return $self;
81     }
82    
83     =head2 open_isis
84    
85     Open CDS/ISIS database using OpenIsis module and read all records to memory.
86    
87     $webpac->open_isis(
88     filename => '/data/ISIS/ISIS',
89     code_page => '852',
90     limit_mfn => '500',
91     lookup => [ ... ],
92     );
93    
94     By default, ISIS code page is assumed to be C<852>.
95    
96 dpavlin 353 If optional parametar C<limit_mfn> is set, it will read just 500 records
97     from database in example above.
98 dpavlin 352
99     Returns number of last record read into memory (size of database, really).
100    
101     C<lookup> argument is an array of lookups to create. Each lookup must have C<key> and
102     C<val>. Optional parametar C<eval> is perl code to evaluate before storing
103     value in index.
104    
105     lookup => [
106     { 'key' => 'd:v900', 'val' => 'v250^a' },
107     { 'eval' => '"v901^a" eq "Podruèje"',
108     'key' => 'pa:v561^4:v562^4:v461^1',
109     'val' => 'v900' },
110     ]
111    
112     =cut
113    
114     sub open_isis {
115     my $self = shift;
116     my $arg = {@_};
117    
118     croak "need filename" if (! $arg->{'filename'});
119     my $code_page = $arg->{'code_page'} || '852';
120    
121 dpavlin 353 use OpenIsis;
122    
123 dpavlin 352 #$self->{'isis_code_page'} = $code_page;
124    
125     # create Text::Iconv object
126     my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
127    
128 dpavlin 363 print STDERR "reading ISIS database '",$arg->{'filename'},"'\n" if ($self->{'debug'});
129    
130 dpavlin 352 my $isis_db = OpenIsis::open($arg->{'filename'});
131    
132     my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
133    
134 dpavlin 357 $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});
135    
136 dpavlin 363 print STDERR "processing $maxmfn records...\n" if ($self->{'debug'});
137    
138 dpavlin 352 # read database
139     for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
140    
141     # read record
142     my $row = OpenIsis::read( $isis_db, $mfn );
143     foreach my $k (keys %{$row}) {
144     if ($k ne "mfn") {
145     foreach my $l (@{$row->{$k}}) {
146     $l = $cp->convert($l);
147     # has subfields?
148     my $val;
149     if ($l =~ m/\^/) {
150     foreach my $t (split(/\^/,$l)) {
151     next if (! $t);
152     $val->{substr($t,0,1)} = substr($t,1);
153     }
154     } else {
155     $val = $l;
156     }
157    
158     push @{$self->{'data'}->{$mfn}->{$k}}, $val;
159     }
160     }
161    
162     }
163    
164     # create lookup
165 dpavlin 355 my $rec = $self->{'data'}->{$mfn};
166     $self->create_lookup($rec, @{$arg->{'lookup'}});
167 dpavlin 352
168     }
169    
170 dpavlin 362 $self->{'current_mfn'} = 1;
171    
172 dpavlin 352 # store max mfn and return it.
173     return $self->{'max_mfn'} = $maxmfn;
174     }
175    
176 dpavlin 362 =head2 fetch_rec
177    
178     Fetch next record from database. It will also display progress bar (once
179     it's implemented, that is).
180    
181     my $rec = $webpac->fetch_rec;
182    
183     =cut
184    
185     sub fetch_rec {
186     my $self = shift;
187    
188     my $mfn = $self->{'current_mfn'}++ || confess "it seems that you didn't load database!";
189    
190     if ($mfn > $self->{'max_mfn'}) {
191     $self->{'current_mfn'} = $self->{'max_mfn'};
192     return;
193     }
194    
195     return $self->{'data'}->{$mfn};
196     }
197    
198 dpavlin 363 =head2 open_import_xml
199    
200     Read file from C<import_xml/> directory and parse it.
201    
202     $webpac->open_import_xml(type => 'isis');
203    
204     =cut
205    
206     sub open_import_xml {
207     my $self = shift;
208    
209     my $arg = {@_};
210     confess "need type to load file from import_xml/" if (! $arg->{'type'});
211    
212     my $type = $arg->{'type'};
213    
214     my $type_base = $type;
215     $type_base =~ s/_.*$//g;
216    
217     my $f = "./import_xml/$type.xml";
218     confess "import_xml file '$f' doesn't exist!" if (! -e "$f");
219    
220     print STDERR "reading '$f'\n" if ($self->{'debug'});
221    
222     $self->{'import_xml'} = XMLin($f,
223     ForceArray => [ $type2tag{$type_base}, 'config', 'format' ],
224     ForceContent => 1
225     );
226    
227     print Dumper($self->{'import_xml'});
228    
229     }
230    
231 dpavlin 355 =head2 create_lookup
232    
233     Create lookup from record using lookup definition.
234    
235     =cut
236    
237     sub create_lookup {
238     my $self = shift;
239    
240     my $rec = shift || confess "need record to create lookup";
241     confess("need HASH as first argument!") if ($rec !~ /HASH/o);
242    
243     foreach my $i (@_) {
244     if ($i->{'eval'}) {
245     my $eval = $self->fill_in($rec,$i->{'eval'});
246     my $key = $self->fill_in($rec,$i->{'key'});
247     my @val = $self->fill_in($rec,$i->{'val'});
248     if ($key && @val && eval $eval) {
249     push @{$self->{'lookup'}->{$key}}, @val;
250     }
251     } else {
252     my $key = $self->fill_in($rec,$i->{'key'});
253     my @val = $self->fill_in($rec,$i->{'val'});
254     if ($key && @val) {
255     push @{$self->{'lookup'}->{$key}}, @val;
256     }
257     }
258     }
259     }
260    
261 dpavlin 356 =head2 get_data
262    
263     Returns value from record.
264    
265     $self->get_data(\$rec,$f,$sf,$i,\$found);
266    
267     Arguments are:
268     record reference C<$rec>,
269     field C<$f>,
270     optional subfiled C<$sf>,
271     index for repeatable values C<$i>.
272    
273     Optinal variable C<$found> will be incremeted if thre
274     is field.
275    
276     Returns value or empty string.
277    
278     =cut
279    
280     sub get_data {
281     my $self = shift;
282    
283     my ($rec,$f,$sf,$i,$found) = @_;
284     if ($$rec->{$f}) {
285     if ($sf && $$rec->{$f}->[$i]->{$sf}) {
286     $$found++ if (defined($$found));
287     return $$rec->{$f}->[$i]->{$sf};
288     } elsif ($$rec->{$f}->[$i]) {
289     $$found++ if (defined($$found));
290     return $$rec->{$f}->[$i];
291     }
292     } else {
293     return '';
294     }
295     }
296    
297 dpavlin 352 =head2 fill_in
298    
299     Workhourse of all: takes record from in-memory structure of database and
300     strings with placeholders and returns string or array of with substituted
301     values from record.
302    
303     $webpac->fill_in($rec,'v250^a');
304    
305     Optional argument is ordinal number for repeatable fields. By default,
306 dpavlin 353 it's assume to be first repeatable field (fields are perl array, so first
307     element is 0).
308     Following example will read second value from repeatable field.
309 dpavlin 352
310 dpavlin 353 $webpac->fill_in($rec,'Title: v250^a',1);
311    
312     This function B<does not> perform parsing of format to inteligenty skip
313     delimiters before fields which aren't used.
314    
315 dpavlin 352 =cut
316    
317     sub fill_in {
318     my $self = shift;
319    
320     my $rec = shift || confess "need data record";
321     my $format = shift || confess "need format to parse";
322     # iteration (for repeatable fields)
323     my $i = shift || 0;
324    
325     # FIXME remove for speedup?
326 dpavlin 355 confess("need HASH as first argument!") if ($rec !~ /HASH/o);
327 dpavlin 352
328     my $found = 0;
329    
330 dpavlin 359 my $eval_code;
331     # remove eval{...} from beginning
332     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
333    
334 dpavlin 352 # do actual replacement of placeholders
335 dpavlin 356 $format =~ s/v(\d+)(?:\^(\w))*/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
336 dpavlin 352
337 dpavlin 353 if ($found) {
338 dpavlin 359 if ($eval_code) {
339     my $eval = $self->fill_in($rec,$eval_code,$i);
340     return if (! eval $eval);
341     }
342 dpavlin 353 # do we have lookups?
343     if ($format =~ /\[[^\[\]]+\]/o) {
344     return $self->lookup($format);
345     } else {
346     return $format;
347     }
348 dpavlin 352 } else {
349     return;
350     }
351     }
352    
353     =head2 lookup
354    
355 dpavlin 355 Perform lookups on format supplied to it.
356 dpavlin 352
357     my $txt = $self->lookup('[v900]');
358    
359 dpavlin 355 Lookups can be nested (like C<[d:[a:[v900]]]>).
360    
361 dpavlin 352 =cut
362    
363     sub lookup {
364     my $self = shift;
365    
366     my $tmp = shift || confess "need format";
367    
368 dpavlin 353 if ($tmp =~ /\[[^\[\]]+\]/o) {
369 dpavlin 352 my @in = ( $tmp );
370 dpavlin 353 #print "##lookup $tmp\n";
371 dpavlin 352 my @out;
372     while (my $f = shift @in) {
373     if ($f =~ /\[([^\[\]]+)\]/) {
374     my $k = $1;
375     if ($self->{'lookup'}->{$k}) {
376 dpavlin 353 #print "## lookup key = $k\n";
377 dpavlin 352 foreach my $nv (@{$self->{'lookup'}->{$k}}) {
378     my $tmp2 = $f;
379     $tmp2 =~ s/\[$k\]/$nv/g;
380     push @in, $tmp2;
381 dpavlin 353 #print "## lookup in => $tmp2\n";
382 dpavlin 352 }
383     } else {
384     undef $f;
385     }
386     } elsif ($f) {
387     push @out, $f;
388 dpavlin 353 #print "## lookup out => $f\n";
389 dpavlin 352 }
390     }
391     return @out;
392     } else {
393     return $tmp;
394     }
395     }
396    
397 dpavlin 356 =head2 parse
398    
399     Perform smart parsing of string, skipping delimiters for fields which aren't
400     defined. It can also eval code in format starting with C<eval{...}> and
401     return output or nothing depending on eval code.
402    
403     $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
404    
405     =cut
406    
407     sub parse {
408     my $self = shift;
409    
410     my ($rec, $format, $i) = @_;
411    
412 dpavlin 358 confess("need HASH as first argument!") if ($rec !~ /HASH/o);
413    
414     $i = 0 if (! $i);
415    
416 dpavlin 356 my @out;
417    
418     my $eval_code;
419     # remove eval{...} from beginning
420     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
421    
422 dpavlin 358 my $prefix;
423     my $all_found=0;
424 dpavlin 356
425 dpavlin 359 #print "## $format\n";
426 dpavlin 358 while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))*//s) {
427 dpavlin 359 #print "## [ $1 | $2 | $3 ] $format\n";
428 dpavlin 356
429 dpavlin 358 my $del = $1 || '';
430 dpavlin 359 $prefix ||= $del if ($all_found == 0);
431 dpavlin 358
432     my $found = 0;
433     my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
434    
435 dpavlin 356 if ($found) {
436 dpavlin 358 push @out, $del;
437     push @out, $tmp;
438     $all_found += $found;
439 dpavlin 356 }
440     }
441    
442 dpavlin 358 return if (! $all_found);
443 dpavlin 356
444 dpavlin 358 my $out = join('',@out) . $format;
445    
446     # add prefix if not there
447     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
448    
449 dpavlin 359 if ($eval_code) {
450     my $eval = $self->fill_in($rec,$eval_code,$i);
451     return if (! eval $eval);
452     }
453    
454 dpavlin 358 return $out;
455 dpavlin 356 }
456    
457 dpavlin 352 1;

  ViewVC Help
Powered by ViewVC 1.1.26