/[webpac]/trunk/all2xml.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/all2xml.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 641 - (hide annotations)
Sun Jan 23 02:02:10 2005 UTC (19 years, 2 months ago) by dpavlin
File MIME type: text/plain
File size: 26325 byte(s)
New implementation of indexes: now it uses only two tables (index for all
data and tags for all tags). Currently, it doesn't enforce relation between
them on RDBMS level (I have to test this code against SQLite and MySQL
before enforcing that).
Removed swish-e output while indexing, database is used as default tag to
enable filtering by database (there isn't possiblity to set tag to something
else yet!). Output usage count in index.

1 dpavlin 1 #!/usr/bin/perl -w
2    
3     use strict;
4 dpavlin 632 use Biblio::Isis;
5 dpavlin 1 use Getopt::Std;
6     use Data::Dumper;
7     use XML::Simple;
8 dpavlin 10 use Text::Iconv;
9 dpavlin 13 use Config::IniFiles;
10 dpavlin 40 use Encode;
11 dpavlin 177 #use GDBM_File;
12     use Fcntl; # for O_RDWR
13     use TDB_File;
14 dpavlin 1
15 dpavlin 10 $|=1;
16 dpavlin 9
17 dpavlin 13 my $config_file = $0;
18     $config_file =~ s/\.pl$/.conf/;
19 dpavlin 320 $config_file = $ARGV[0] if ($ARGV[0] && -f $ARGV[0]);
20 dpavlin 13 die "FATAL: can't find configuration file '$config_file'" if (! -e $config_file);
21    
22 dpavlin 10 my $config;
23    
24 dpavlin 58 #use index_DBI; # default DBI module for index
25 dpavlin 641 #use index_DBI_cache; # faster DBI module using memory cache
26     use index_DBI_tag; # tag support for indexes
27 dpavlin 50 my $index;
28 dpavlin 10
29 dpavlin 1 my %opts;
30    
31 dpavlin 7 # usage:
32     # -d directory name
33     # -m multiple directories
34     # -q quiet
35     # -s run swish
36 dpavlin 1
37 dpavlin 7 getopts('d:m:qs', \%opts);
38    
39 dpavlin 43 my $path; # this is name of database
40 dpavlin 1
41 dpavlin 57 Text::Iconv->raise_error(0); # Conversion errors don't raise exceptions
42 dpavlin 1
43 dpavlin 40 # this is encoding of all files on disk, including import_xml/*.xml file and
44     # filter/*.pm files! It will be used to store strings in perl internally!
45     my $codepage = 'ISO-8859-2';
46 dpavlin 1
47 dpavlin 40 my $utf2cp = Text::Iconv->new('UTF-8',$codepage);
48     # this function will convert data from XML files to local encoding
49     sub x {
50     return $utf2cp->convert($_[0]);
51     }
52 dpavlin 3
53 dpavlin 54 # decode isis/excel or other import codepage
54     my $import2cp;
55 dpavlin 10
56 dpavlin 40 # outgoing xml must be in UTF-8
57     my $cp2utf = Text::Iconv->new($codepage,'UTF-8');
58 dpavlin 29
59 dpavlin 54 # mapping between data type and tag which specify
60     # format in XML file
61     my %type2tag = (
62     'isis' => 'isis',
63 dpavlin 62 'excel' => 'column',
64     'marc' => 'marc',
65 dpavlin 67 'feed' => 'feed'
66 dpavlin 54 );
67 dpavlin 3
68 dpavlin 170 my $cache; # for cacheing
69    
70 dpavlin 177 # lookup hash (tied to file)
71     my %lhash;
72     # this option will cache all lookup entries in memory.
73     # if you are tight on memory, turn this off
74     my $use_lhash_cache = 1;
75    
76 dpavlin 195 my $last_field_name; # cache to prevent repeated fields
77    
78 dpavlin 54 sub data2xml {
79    
80 dpavlin 10 use xmlify;
81    
82 dpavlin 54 my $type = shift @_;
83 dpavlin 3 my $row = shift @_;
84 dpavlin 13 my $add_xml = shift @_;
85 dpavlin 59 # needed to read values from configuration file
86     my $cfg = shift @_;
87     my $database = shift @_;
88 dpavlin 3
89     my $xml;
90    
91 dpavlin 10 use parse_format;
92 dpavlin 3
93 dpavlin 13 my $html = ""; # html formatted display output
94 dpavlin 10
95 dpavlin 13 my %field_usage; # counter for usage of each field
96    
97 dpavlin 32 # sort subrouting using order="" attribute
98     sub by_order {
99 dpavlin 98 my $va = $config->{indexer}->{$a}->{order} ||
100     $config->{indexer}->{$a};
101     my $vb = $config->{indexer}->{$b}->{order} ||
102     $config->{indexer}->{$b};
103 dpavlin 29
104 dpavlin 98 return $va <=> $vb;
105 dpavlin 32 }
106 dpavlin 3
107 dpavlin 170 my @sorted_tags;
108 dpavlin 180 if ($cache->{tags_by_order}) {
109     @sorted_tags = @{$cache->{tags_by_order}};
110 dpavlin 170 } else {
111     @sorted_tags = sort by_order keys %{$config->{indexer}};
112 dpavlin 180 $cache->{tags_by_order} = \@sorted_tags;
113 dpavlin 170 }
114 dpavlin 32
115 dpavlin 177 # lookup key
116     my $lookup_key;
117    
118 dpavlin 178 # cache for field in pages
119 dpavlin 180 delete $cache->{display_data};
120     delete $cache->{swish_data};
121     delete $cache->{swish_exact_data};
122 dpavlin 182 delete $cache->{index_data};
123 dpavlin 188 delete $cache->{index_delimiter};
124 dpavlin 178 my @page_fields; # names of fields
125    
126    
127     # subs used to produce output
128    
129     sub get_field_name($$$) {
130     my ($config,$field,$field_usage) = @_;
131    
132     # find field name (signular, plural)
133     my $field_name = "";
134     if ($config->{indexer}->{$field}->{name_singular} && $field_usage == 1) {
135     $field_name = $config->{indexer}->{$field}->{name_singular};
136     } elsif ($config->{indexer}->{$field}->{name_plural}) {
137     $field_name = $config->{indexer}->{$field}->{name_plural};
138     } elsif ($config->{indexer}->{$field}->{name}) {
139     $field_name = $config->{indexer}->{$field}->{name};
140     } else {
141     print STDERR "WARNING: field '$field' doesn't have 'name' attribute!";
142     }
143 dpavlin 599
144 dpavlin 178 if ($field_name) {
145 dpavlin 599 $field_name = x($field_name);
146 dpavlin 195 if (! $last_field_name) {
147 dpavlin 599 $last_field_name = $field_name;
148 dpavlin 195 return $last_field_name;
149     } elsif ($field_name ne $last_field_name) {
150 dpavlin 599 $last_field_name = $field_name;
151 dpavlin 195 return $last_field_name;
152     }
153 dpavlin 178 }
154     }
155    
156 dpavlin 279
157     # init variables for different types
158 dpavlin 274 sub init_visible_type($) {
159     my $type = shift;
160 dpavlin 178
161 dpavlin 274 # swish, swish_exact, display, index, index_lookup
162     # swish and display defaults
163     my ($s,$se,$d,$i,$il) = (1,0,1,0,0);
164     if (lc($type) eq "display") {
165     $s = 0;
166     } elsif (lc($type) eq "swish") {
167     $d = 0;
168     } elsif (lc($type) eq "index") {
169     ($s,$se,$d,$i) = (0,1,0,1);
170     } elsif (lc($type) eq "swish_exact") {
171     ($s,$se,$d,$i) = (0,1,0,0);
172     } elsif (lc($type) =~ /^lookup/) {
173     ($s,$se,$d,$i,$il) = (0,1,0,0,1);
174 dpavlin 333 } elsif ($type) {
175     print STDERR "WARNING: unknown type: $type\n";
176 dpavlin 274 }
177     return ($s,$se,$d,$i,$il);
178     }
179    
180 dpavlin 279
181     # convert
182     #
183     # <tag>
184     # <delimiter>, </delimiter>
185     # <value>200a</value>
186     # </tag>
187     #
188     # to
189     #
190     # <tag delimiter=", ">200a</tag>
191     #
192     # but without loosing spaces in delimiter (becasue
193     # new XML::Simple strips spaces in attribute values
194     # as defined in XML specification)
195     #
196     sub unroll_x($) {
197     my $x = shift;
198    
199     if (defined $x->{value}) {
200     my ($v,$d) = ($x->{value}->{content}, $x->{delimiter}->{content});
201     delete $x->{value};
202     delete $x->{delimiter};
203     $x->{content} = $v;
204     $x->{delimiter} = $d;
205     }
206     return $x;
207     }
208    
209 dpavlin 178 # begin real work: go field by field
210 dpavlin 170 foreach my $field (@sorted_tags) {
211    
212 dpavlin 40 $field=x($field);
213 dpavlin 13 $field_usage{$field}++;
214    
215 dpavlin 10 my $swish_data = "";
216 dpavlin 163 my $swish_exact_data = "";
217 dpavlin 3 my $display_data = "";
218 dpavlin 182 my @index_data;
219 dpavlin 35 my $line_delimiter;
220 dpavlin 3
221 dpavlin 34 my ($swish,$display);
222    
223 dpavlin 54 my $tag = $type2tag{$type} || die "can't find which tag to use for type $type";
224 dpavlin 178
225     # is this field page-by-page?
226     my $iterate_by_page = $config->{indexer}->{$field}->{iterate_by_page};
227     push @page_fields,$field if ($iterate_by_page);
228     my %page_max = ();
229     # default line_delimiter if using
230     my $page_line_delimiter = $config->{indexer}->{$field}->{page_line_delimiter} || '<br/>';
231 dpavlin 188 $cache->{index_delimiter}->{$field} = $config->{indexer}->{$field}->{index_delimiter};
232 dpavlin 178
233 dpavlin 195 my $format_name = $config->{indexer}->{$field}->{format_name};
234     my $format_delimiter = $config->{indexer}->{$field}->{format_delimiter};
235     if ($format_name && $format_delimiter) {
236     $cache->{format}->{$field}->{format_name} = $format_name;
237     $cache->{format}->{$field}->{format_delimiter} = $format_delimiter;
238     }
239    
240 dpavlin 54 foreach my $x (@{$config->{indexer}->{$field}->{$tag}}) {
241 dpavlin 3
242 dpavlin 279 $x = unroll_x($x);
243    
244 dpavlin 40 my $format = x($x->{content});
245     my $delimiter = x($x->{delimiter}) || ' ';
246 dpavlin 3
247 dpavlin 178 my $repeat_off = 0; # init repeatable offset
248 dpavlin 29
249 dpavlin 274 my ($s,$se,$d,$i,$il) = init_visible_type($x->{type});
250 dpavlin 29
251 dpavlin 35 # what will separate last line from this one?
252 dpavlin 182 if ($display_data && $x->{append}) {
253 dpavlin 290 $line_delimiter = $delimiter;
254 dpavlin 35 } elsif ($display_data) {
255 dpavlin 34 $line_delimiter = '<br/>';
256     }
257 dpavlin 29
258 dpavlin 34 # init vars so that we go into while...
259     ($swish,$display) = (1,1);
260 dpavlin 29
261 dpavlin 195 sub mkformat($$) {
262 dpavlin 153 my $x = shift || die "mkformat needs tag reference";
263     my $data = shift || return;
264 dpavlin 263 my $format_name = x($x->{format_name}) || return $data;
265 dpavlin 153 my $fmt = x($config->{format}->{$format_name}->{content}) || die "<format name=\"$format_name\"> is not defined!";
266     my $format_delimiter = x($x->{format_delimiter});
267     my @data;
268     if ($format_delimiter) {
269     @data = split(/$format_delimiter/,$data);
270     } else {
271     push @data,$data;
272     }
273    
274     if ($fmt) {
275     my $nr = scalar $fmt =~ s/%s/%s/g;
276     if (($#data+1) == $nr) {
277 dpavlin 263 return sprintf($fmt,@data);
278 dpavlin 153 } else {
279 dpavlin 207 #print STDERR "mkformat: [$data] can't be split on [$format_delimiter] to $nr fields!\n";
280 dpavlin 263 return $data;
281 dpavlin 153 }
282     } else {
283     print STDERR "usage of link '$format_name' without defined format (<link> tag)\n";
284     }
285     }
286    
287 dpavlin 90 # while because of repeatable fields
288     while ($swish || $display) {
289 dpavlin 178 my $page = $repeat_off;
290     $page_max{$field} = $page if ($iterate_by_page && $page > ($page_max{$field} || 0));
291 dpavlin 54 ($swish,$display) = parse_format($type, $format,$row,$repeat_off++,$import2cp);
292 dpavlin 90 if ($repeat_off > 1000) {
293     print STDERR "loop (more than 1000 repeatable fields) deteced in $row, $format\n";
294     last;
295     }
296 dpavlin 177
297     # is this field is lookup?
298     if ($display && $x->{lookup}) {
299 dpavlin 178 my $null = "<!-- null -->";
300 dpavlin 177 if ($use_lhash_cache) {
301     if (!defined($cache->{lhash}->{$display})) {
302     my $new_display = $lhash{$display};
303 dpavlin 178 if (defined($new_display)) {
304 dpavlin 177 #print STDERR "lookup cache store '$display' = '$new_display'\n";
305     $display = $new_display;
306     $cache->{lhash}->{$display} = $new_display;
307     } else {
308 dpavlin 255 # print STDERR "WARNING: lookup for '$display' didn't find anything.\n";
309 dpavlin 177 $display = "";
310 dpavlin 178 $cache->{lhash}->{$display} = $null;
311 dpavlin 177 }
312     } else {
313     $display = $cache->{lhash}->{$display};
314     }
315     } else {
316 dpavlin 178 $display = $lhash{$display} || $null;
317 dpavlin 177 }
318     }
319    
320 dpavlin 29 # filter="name" ; filter this field through
321     # filter/[name].pm
322     my $filter = $x->{filter};
323 dpavlin 170 if ($filter && !$cache->{filter_loaded}->{$filter}) {
324 dpavlin 29 require "filter/".$filter.".pm";
325 dpavlin 170 $cache->{filter_loaded}->{$filter}++;
326 dpavlin 20 }
327 dpavlin 29 # type="swish" ; field for swish
328 dpavlin 163 if ($swish) {
329 dpavlin 259 my $tmp = $swish;
330 dpavlin 163 if ($filter && ($s || $se)) {
331 dpavlin 29 no strict 'refs';
332 dpavlin 259 $tmp = join(" ",&$filter($tmp)) if ($s || $se);
333 dpavlin 29 }
334 dpavlin 234
335 dpavlin 255 $swish_data .= $tmp if ($s && $tmp);
336 dpavlin 256 $swish_exact_data .= "xxbxx $tmp xxexx " if ($tmp && $tmp ne "" && $se);
337 dpavlin 29 }
338 dpavlin 17
339 dpavlin 29 # type="display" ; field for display
340     if ($d && $display) {
341 dpavlin 207 my $ldel = $delimiter;
342 dpavlin 35 if ($line_delimiter && $display_data) {
343 dpavlin 207 $ldel = $line_delimiter;
344 dpavlin 35 }
345 dpavlin 29 if ($filter) {
346     no strict 'refs';
347 dpavlin 207 my @arr;
348     foreach my $tmp (&$filter($display)) {
349     my $tmp2 = mkformat($x,$tmp);
350     push @arr,$tmp2 if ($tmp2);
351 dpavlin 138 }
352 dpavlin 207 $display_data .= $ldel if ($display_data && @arr);
353     $display_data .= join($delimiter,@arr);
354 dpavlin 29 } else {
355 dpavlin 207 $display_data .= $ldel if ($display_data);
356     my $tmp = mkformat($x,$display);
357 dpavlin 224 $display_data .= $tmp if ($tmp);
358 dpavlin 29 }
359 dpavlin 20 }
360 dpavlin 29
361     # type="index" ; insert into index
362 dpavlin 182 my $idisplay;
363 dpavlin 29 if ($i && $display) {
364 dpavlin 182 $idisplay = $display;
365 dpavlin 177 if ($filter) {
366     no strict 'refs';
367 dpavlin 182 $idisplay = &$filter($idisplay);
368 dpavlin 177 }
369 dpavlin 255 push @index_data, $idisplay if ($idisplay && !$iterate_by_page);
370 dpavlin 97 }
371    
372 dpavlin 177 # store fields in lookup
373     if ($il && $display) {
374     if (lc($x->{type}) eq "lookup_key") {
375     if ($lookup_key) {
376     print STDERR "WARNING: try to redefine lookup_key (keys shouldn't be repeatable fields!)";
377     } else {
378 dpavlin 233 if ($filter) {
379     no strict 'refs';
380     $lookup_key = &$filter($display);
381     } else {
382     $lookup_key = $display;
383     }
384 dpavlin 177 }
385     } elsif (lc($x->{type}) eq "lookup_val") {
386     if ($lookup_key) {
387 dpavlin 233 if ($filter) {
388     no strict 'refs';
389     $lhash{$lookup_key} = &$filter($display);
390     } else {
391     $lhash{$lookup_key} = $display;
392     }
393 dpavlin 177 } else {
394     print STDERR "WARNING: no lookup_key defined for '$display'?";
395     }
396 dpavlin 20 }
397 dpavlin 178
398 dpavlin 20 }
399 dpavlin 178
400     # store data for page-by-page repeatable fields
401     if ($iterate_by_page) {
402     sub iterate_fld($$$$$$) {
403     my ($cache,$what,$field,$page,$data,$append) = @_;
404     return if (!$data);
405 dpavlin 182
406     my $ldel = $page_line_delimiter;
407     $ldel = " " if ($append);
408     #print STDERR "line delimiter: ",Dumper($ldel) if ($ldel);
409 dpavlin 178 if (! $cache->{$what}->{$field}->[$page]) {
410     $cache->{$what}->{$field}->[$page] = $data;
411     } else {
412 dpavlin 182 $cache->{$what}->{$field}->[$page] .= $ldel.$data;
413 dpavlin 178 }
414     }
415    
416     if ($display_data) {
417     iterate_fld($cache,'display_data',$field,$page,$display_data,$x->{append});
418     }
419     $display_data = "";
420     if ($swish_data) {
421     iterate_fld($cache,'swish_data',$field,$page,$swish_data,$x->{append});
422     $swish_data = "";
423     }
424     if ($swish_exact_data) {
425     iterate_fld($cache,'swish_exact_data',$field,$page,$swish_exact_data,$x->{append});
426     $swish_exact_data = "";
427     }
428 dpavlin 182
429     if ($idisplay) {
430     my $ldel=$page_line_delimiter;
431     my @index_data;
432     if ($cache->{index_data}->{$field}->[$page]) {
433    
434     @index_data = @{$cache->{index_data}->{$field}->[$page]};
435     }
436     if ($x->{append}) {
437 dpavlin 195 if (@index_data) {
438     $index_data[$#index_data] .= $idisplay;
439     } else {
440     push @index_data, $idisplay;
441     }
442 dpavlin 182 } else {
443     push @index_data, $idisplay;
444     }
445     $idisplay = "";
446     @{$cache->{index_data}->{$field}->[$page]} = @index_data;
447     }
448 dpavlin 178 }
449 dpavlin 17 }
450 dpavlin 177
451 dpavlin 182 if (! $iterate_by_page) {
452 dpavlin 188 my $idel = $x->{index_delimiter};
453 dpavlin 182 # fill data in index
454 dpavlin 188 foreach my $tmp (@index_data) {
455     my $i = $d = $tmp;
456     if ($idel && $tmp =~ m/$idel/) {
457     ($i,$d) = split(/$idel/,$tmp);
458     }
459     $index->insert($field, $i, $d, $path);
460 dpavlin 182 }
461     @index_data = ();
462 dpavlin 177 }
463 dpavlin 3 }
464 dpavlin 9
465 dpavlin 59 # now try to parse variables from configuration file
466     foreach my $x (@{$config->{indexer}->{$field}->{'config'}}) {
467 dpavlin 13
468 dpavlin 279 $x = unroll_x($x);
469    
470 dpavlin 62 my $delimiter = x($x->{delimiter}) || ' ';
471 dpavlin 59 my $val = $cfg->val($database, x($x->{content}));
472    
473 dpavlin 274 # FIXME index_lookup is not supported!
474     my ($s,$se,$d,$i,$il) = init_visible_type($x->{type});
475 dpavlin 59
476     if ($val) {
477 dpavlin 62 $display_data .= $delimiter.$val if ($d);
478 dpavlin 274 $swish_data .= " ".$val if ($s);
479 dpavlin 59 $index->insert($field, $val, $path) if ($i);
480     }
481    
482 dpavlin 178 if ($iterate_by_page) {
483     # FIXME data from config tag will appear just
484     # on first page!!!
485     my $page = 0;
486     if ($display_data) {
487     $cache->{display_data}->{$field}->[$page] = $display_data;
488     $display_data = "";
489     }
490     if ($swish_data) {
491     $cache->{swish_data}->{$field}->[$page] = $swish_data;
492     $swish_data = "";
493     }
494     if ($swish_exact_data) {
495     $cache->{swish_exact_data}->{$field}->[$page] = $swish_exact_data;
496     $swish_exact_data = "";
497     }
498     }
499 dpavlin 59 }
500    
501 dpavlin 178 # save data page-by-page
502     foreach my $field (@page_fields) {
503     my $nr_pages = $page_max{$field} || next;
504     #print STDERR "field '$field' iterate over ",($nr_pages || 0)," pages...\n";
505     #print STDERR Dumper($cache->{display_data});
506     for (my $page=0; $page <= $nr_pages; $page++) {
507 dpavlin 195 my $display_data;
508     if ($cache->{format}->{$field}) {
509 dpavlin 207 my $tmp = mkformat($cache->{format}->{$field},$cache->{display_data}->{$field}->[$page]);
510     $display_data=$tmp if ($tmp);
511 dpavlin 195 } else {
512     $display_data = $cache->{display_data}->{$field}->[$page];
513     }
514 dpavlin 178 if ($display_data) { # default
515     if ($field eq "headline") {
516     $xml .= xmlify("headline", $display_data);
517     } else {
518 dpavlin 29
519 dpavlin 178 # fallback to empty field name if needed
520     $html .= get_field_name($config,$field,$field_usage{$field}) || '';
521     $html .= "#-#".$display_data."###\n";
522     }
523     }
524    
525     my $swish_data = $cache->{swish_data}->{$field}->[$page];
526     if ($swish_data) {
527     # remove extra spaces
528     $swish_data =~ s/ +/ /g;
529     $swish_data =~ s/ +$//g;
530 dpavlin 13
531 dpavlin 320 $xml .= xmlify($field."_swish", my_unac_string($codepage,$swish_data));
532 dpavlin 13 }
533 dpavlin 178
534     my $swish_exact_data = $cache->{swish_exact_data}->{$field}->[$page];
535     if ($swish_exact_data) {
536     $swish_exact_data =~ s/ +/ /g;
537     $swish_exact_data =~ s/ +$//g;
538    
539     # add delimiters before and after word.
540     # That is required to produce exact match
541 dpavlin 320 $xml .= xmlify($field."_swish_exact", my_unac_string($codepage,$swish_exact_data));
542 dpavlin 13 }
543 dpavlin 182
544 dpavlin 188 my $idel = $cache->{index_delimiter}->{$field};
545     foreach my $tmp (@{$cache->{index_data}->{$field}->[$page]}) {
546     my $i = $tmp;
547     my $d = $tmp;
548     if ($idel && $tmp =~ m/$idel/) {
549     ($i,$d) = split(/$idel/,$tmp);
550     }
551     $index->insert($field, $i, $d, $path);
552     #print STDERR "index [$idel] $field: $i --> $d [$path]\n";
553 dpavlin 182 }
554 dpavlin 13 }
555 dpavlin 178
556 dpavlin 10 }
557 dpavlin 178
558     if (! $iterate_by_page) {
559     if ($display_data) {
560     if ($field eq "headline") {
561     $xml .= xmlify("headline", $display_data);
562     } else {
563 dpavlin 20
564 dpavlin 178 # fallback to empty field name if needed
565     $html .= get_field_name($config,$field,$field_usage{$field}) || '';
566     $html .= "#-#".$display_data."###\n";
567     }
568     }
569     if ($swish_data) {
570     # remove extra spaces
571     $swish_data =~ s/ +/ /g;
572     $swish_data =~ s/ +$//g;
573 dpavlin 10
574 dpavlin 320 $xml .= xmlify($field."_swish", my_unac_string($codepage,$swish_data));
575 dpavlin 178 }
576 dpavlin 9
577 dpavlin 178 if ($swish_exact_data) {
578     $swish_exact_data =~ s/ +/ /g;
579     $swish_exact_data =~ s/ +$//g;
580    
581     # add delimiters before and after word.
582     # That is required to produce exact match
583 dpavlin 320 $xml .= xmlify($field."_swish_exact", my_unac_string($codepage,$swish_exact_data));
584 dpavlin 178 }
585 dpavlin 163 }
586 dpavlin 3 }
587 dpavlin 13
588     # dump formatted output in <html>
589     if ($html) {
590 dpavlin 81 #$xml .= xmlify("html",$html);
591     $xml .= "<html><![CDATA[ $html ]]></html>";
592 dpavlin 13 }
593    
594 dpavlin 3 if ($xml) {
595 dpavlin 13 $xml .= $add_xml if ($add_xml);
596 dpavlin 10 return "<xml>\n$xml</xml>\n";
597 dpavlin 3 } else {
598     return;
599     }
600     }
601    
602     ##########################################################################
603    
604 dpavlin 54 # read configuration for this script
605 dpavlin 13 my $cfg = new Config::IniFiles( -file => $config_file );
606 dpavlin 1
607 dpavlin 54 # read global.conf configuration
608     my $cfg_global = new Config::IniFiles( -file => 'global.conf' );
609    
610 dpavlin 50 # open index
611     $index = new index_DBI(
612 dpavlin 54 $cfg_global->val('global', 'dbi_dbd'),
613     $cfg_global->val('global', 'dbi_dsn'),
614     $cfg_global->val('global', 'dbi_user'),
615     $cfg_global->val('global', 'dbi_passwd') || '',
616 dpavlin 50 );
617    
618 dpavlin 97 my $show_progress = $cfg_global->val('global', 'show_progress');
619    
620 dpavlin 320 my $my_unac_filter = $cfg_global->val('global', 'my_unac_filter');
621     if ($my_unac_filter) {
622     print STDERR "using $my_unac_filter to filter characters for search\n";
623     require $my_unac_filter;
624     } else {
625     print STDERR "### fallback to default my_unac_string!\n";
626     eval q{
627     sub main::my_unac_string($$) {
628     my ($charset, $string) = (@_);
629     return $string;
630     }
631     };
632 dpavlin 164 }
633    
634 dpavlin 13 foreach my $database ($cfg->Sections) {
635 dpavlin 1
636 dpavlin 54 my $type = lc($cfg -> val($database, 'type')) || die "$database doesn't have 'type' defined";
637 dpavlin 40 my $add_xml = $cfg -> val($database, 'xml'); # optional
638 dpavlin 1
639 dpavlin 177 # create new lookup file
640     my $lookup_file = $cfg -> val($database, 'lookup_newfile'); # optional
641     if ($lookup_file) {
642     #tie %lhash, 'GDBM_File', $lookup_file, &GDBM_NEWDB, 0644;
643 dpavlin 379 if (! -e $lookup_file) {
644     open(LOOKUP, "> $lookup_file") || die "can't create $lookup_file': $!";
645     close(LOOKUP);
646     }
647 dpavlin 177 tie %lhash, 'TDB_File', $lookup_file, TDB_CLEAR_IF_FIRST, O_RDWR, 0644;
648     print STDERR "creating lookup file '$lookup_file'\n";
649 dpavlin 231 # delete memory cache for lookup file
650     delete $cache->{lhash};
651 dpavlin 177 }
652    
653     # open existing lookup file
654     $lookup_file = $cfg -> val($database, 'lookup_open'); # optional
655     if ($lookup_file) {
656     #tie %lhash, 'GDBM_File', $lookup_file, &GDBM_READER, 0644;
657     tie %lhash, 'TDB_File', $lookup_file, TDB_DEFAULT, O_RDWR, 0644;
658     print STDERR "opening lookup file '$lookup_file'\n";
659     }
660    
661 dpavlin 54 print STDERR "reading ./import_xml/$type.xml\n";
662 dpavlin 1
663 dpavlin 58 # extract just type basic
664     my $type_base = $type;
665     $type_base =~ s/_.+$//g;
666 dpavlin 40
667 dpavlin 279 $config=XMLin("./import_xml/$type.xml", ForceArray => [ $type2tag{$type_base}, 'config', 'format' ], ForceContent => 1 );
668 dpavlin 58
669 dpavlin 618 # helper for progress bar
670     sub fmt_time {
671     my $t = shift || 0;
672     my $out = "";
673    
674     my ($ss,$mm,$hh) = gmtime($t);
675     $out .= "${hh}h" if ($hh);
676     $out .= sprintf("%02d:%02d", $mm,$ss);
677     $out .= " " if ($hh == 0);
678     return $out;
679     }
680    
681 dpavlin 54 # output current progress indicator
682     my $last_p = 0;
683 dpavlin 618 my $start_t = time();
684 dpavlin 54 sub progress {
685 dpavlin 101 return if (! $show_progress);
686 dpavlin 620 my $current = shift;
687 dpavlin 57 my $total = shift || 1;
688 dpavlin 54 my $p = int($current * 100 / $total);
689 dpavlin 628 if ($p < $last_p || $current == 1) {
690 dpavlin 626 $start_t = time();
691 dpavlin 628 $last_p = 0;
692 dpavlin 626 } elsif ($p != $last_p) {
693 dpavlin 618 my $rate = ($current / (time() - $start_t || 1));
694     my $eta = ($total-$current) / ($rate || 1);
695     printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$current,"=" x ($p/3)."$p%>", $total, $rate, fmt_time($eta));
696 dpavlin 54 $last_p = $p;
697     }
698 dpavlin 3 }
699    
700 dpavlin 67 my $fake_dir = 1;
701 dpavlin 628 my $fake_pos = 0;
702     my $last_fake_t = time();
703 dpavlin 67 sub fakeprogress {
704 dpavlin 104 return if (! $show_progress);
705 dpavlin 67 my $current = shift @_;
706    
707 dpavlin 628 my @ind = ('-','\\','|','/','-','\\','|','/');
708 dpavlin 67
709 dpavlin 628 if ($current < $fake_pos) {
710     $start_t = time();
711     $last_fake_t = 0;
712     $fake_dir = 1;
713     $fake_pos = 0;
714 dpavlin 67 }
715 dpavlin 628
716     if (time()-$last_fake_t >= 1) {
717     $last_fake_t = time();
718     $fake_pos += $fake_dir;
719     $fake_dir = -$fake_dir if ($fake_pos > 38);
720     }
721    
722     if ($current % 10 == 0) {
723     my $rate = ($current / (time() - $start_t || 1));
724     printf STDERR ("%5d [%-38s] %0.1f/s\r",$current, " " x $fake_pos .$ind[($current / 10) % 8], $rate);
725     }
726 dpavlin 67 }
727    
728 dpavlin 54 # now read database
729     print STDERR "using: $type...\n";
730 dpavlin 1
731 dpavlin 180 # erase cache for tags by order in this database
732     delete $cache->{tags_by_order};
733    
734 dpavlin 58 if ($type_base eq "isis") {
735    
736 dpavlin 54 my $isis_db = $cfg -> val($database, 'isis_db') || die "$database doesn't have 'isis_db' defined!";
737 dpavlin 1
738 dpavlin 54 $import2cp = Text::Iconv->new($config->{isis_codepage},$codepage);
739 dpavlin 632 my $db = new Biblio::Isis( isisdb => $isis_db );
740 dpavlin 10
741 dpavlin 632 my $max_rowid = $db->count || die "can't find maxmfn";
742 dpavlin 109
743 dpavlin 54 print STDERR "Reading database: $isis_db [$max_rowid rows]\n";
744    
745 dpavlin 641 $path = $database;
746 dpavlin 54
747     for (my $row_id = 1; $row_id <= $max_rowid; $row_id++ ) {
748 dpavlin 622 my $row = $db->to_hash( $row_id );
749     if ($row) {
750    
751     $row->{mfn} = $row_id;
752     $row->{record} = $db->{record};
753    
754 dpavlin 54 progress($row->{mfn}, $max_rowid);
755    
756     my $swishpath = $path."#".int($row->{mfn});
757    
758 dpavlin 59 if (my $xml = data2xml($type_base,$row,$add_xml,$cfg,$database)) {
759 dpavlin 54 $xml = $cp2utf->convert($xml);
760     use bytes; # as opposed to chars
761     print "Path-Name: $swishpath\n";
762     print "Content-Length: ".(length($xml)+1)."\n";
763     print "Document-Type: XML\n\n$xml\n";
764     }
765 dpavlin 3 }
766 dpavlin 54 }
767     print STDERR "\n";
768 dpavlin 3
769 dpavlin 58 } elsif ($type_base eq "excel") {
770 dpavlin 255 require Spreadsheet::ParseExcel;
771     require Spreadsheet::ParseExcel::Utility;
772     import Spreadsheet::ParseExcel::Utility qw(int2col);
773 dpavlin 54
774     $import2cp = Text::Iconv->new($config->{excel_codepage},$codepage);
775     my $excel_file = $cfg -> val($database, 'excel_file') || die "$database doesn't have 'excel_file' defined!";
776 dpavlin 43
777 dpavlin 54 my $sheet = x($config->{sheet}) || die "no sheet in $type.xml";
778 dpavlin 56 my $start_row = x($config->{start_row}) - 1 || die "no start_row in $type.xml";
779 dpavlin 54
780     my $oBook = Spreadsheet::ParseExcel::Workbook->Parse($excel_file) || die "can't open Excel file '$excel_file'";
781    
782     my $sheet_nr = 0;
783     foreach my $oWks (@{$oBook->{Worksheet}}) {
784     #print STDERR "-- SHEET $sheet_nr:", $oWks->{Name}, "\n";
785     last if ($oWks->{Name} eq $sheet);
786     $sheet_nr++;
787     }
788    
789     my $oWorksheet = $oBook->{Worksheet}[$sheet_nr];
790    
791     print STDERR "using sheet: ",$oWorksheet->{Name},"\n";
792     defined ($oWorksheet) || die "can't find sheet '$sheet' in $excel_file";
793     my $end_row = x($config->{end_row}) || $oWorksheet->{MaxRow};
794    
795 dpavlin 56 for(my $iR = $start_row ; defined $end_row && $iR <= $end_row ; $iR++) {
796 dpavlin 54 my $row;
797     for(my $iC = $oWorksheet->{MinCol} ; defined $oWorksheet->{MaxCol} && $iC <= $oWorksheet->{MaxCol} ; $iC++) {
798     my $cell = $oWorksheet->{Cells}[$iR][$iC];
799     if ($cell) {
800 dpavlin 488 # this conversion is a cludge.
801     # Files from Excell could have
802     # characters which don't fit into
803     # destination encoding.
804     $row->{int2col($iC)} = $utf2cp->convert($cell->Value) || $cell->Value;
805 dpavlin 54 }
806     }
807    
808     progress($iR, $end_row);
809    
810     # print "row[$iR/$end_row] ";
811     # foreach (keys %{$row}) {
812     # print "$_: ",$row->{$_},"\t";
813     # }
814     # print "\n";
815    
816     my $swishpath = $database."#".$iR;
817    
818     next if (! $row);
819    
820 dpavlin 59 if (my $xml = data2xml($type_base,$row,$add_xml,$cfg,$database)) {
821 dpavlin 44 $xml = $cp2utf->convert($xml);
822 dpavlin 35 use bytes; # as opposed to chars
823 dpavlin 43 print "Path-Name: $swishpath\n";
824 dpavlin 10 print "Content-Length: ".(length($xml)+1)."\n";
825     print "Document-Type: XML\n\n$xml\n";
826 dpavlin 3 }
827 dpavlin 1 }
828 dpavlin 62 } elsif ($type_base eq "marc") {
829 dpavlin 67
830 dpavlin 620 require MARC::File::USMARC;
831 dpavlin 62
832     $import2cp = Text::Iconv->new($config->{marc_codepage},$codepage);
833     my $marc_file = $cfg -> val($database, 'marc_file') || die "$database doesn't have 'marc_file' defined!";
834    
835     # optional argument is format
836 dpavlin 620 warn "marc_format is no longer used!" if ($config->{marc_format});
837 dpavlin 62 print STDERR "Reading MARC file '$marc_file'\n";
838    
839 dpavlin 620 my $marc = MARC::File::USMARC->in( $marc_file )
840     || die "Can't open MARC file '$marc_file': ".$MARC::File::ERROR;
841 dpavlin 62
842 dpavlin 620 # count records in MARC file
843     sub marc_count {
844     my $filename = shift || die;
845     my $file = MARC::File::USMARC->in($filename) || die $MARC::File::ERROR;
846     my $count = 0;
847     while ($file->skip()) {
848     $count++;
849     }
850     return $count;
851     }
852 dpavlin 62
853 dpavlin 620 my $count = marc_count($marc_file) || warn "no records in '$marc_file'?";
854 dpavlin 62
855 dpavlin 620 my $i = 0;
856 dpavlin 62
857 dpavlin 620 while( my $rec = $marc->next() ) {
858 dpavlin 62
859 dpavlin 620 progress($i++,$count);
860 dpavlin 215
861 dpavlin 62 my $swishpath = $database."#".$i;
862    
863 dpavlin 620 if (my $xml = data2xml($type_base,$rec,$add_xml,$cfg,$database)) {
864 dpavlin 62 $xml = $cp2utf->convert($xml);
865     use bytes; # as opposed to chars
866     print "Path-Name: $swishpath\n";
867     print "Content-Length: ".(length($xml)+1)."\n";
868     print "Document-Type: XML\n\n$xml\n";
869     }
870     }
871 dpavlin 215
872     print STDERR "\n";
873    
874 dpavlin 67 } elsif ($type_base eq "feed") {
875    
876     $import2cp = Text::Iconv->new($config->{feed_codepage},$codepage);
877     my $prog = x($config->{prog}) || die "$database doesn't have 'prog' defined!";
878    
879     print STDERR "Reading feed from program '$prog'\n";
880    
881     open(FEED,"feeds/$prog |") || die "can't start $prog: $!";
882    
883 dpavlin 74 my $i=1; # record nr.
884 dpavlin 67
885     my $data;
886     my $line=1;
887    
888     while (<FEED>) {
889     chomp;
890    
891     if (/^$/) {
892 dpavlin 74 my $swishpath = $database."#".$i++;
893 dpavlin 67
894     if (my $xml = data2xml($type_base,$data,$add_xml,$cfg,$database)) {
895     $xml = $cp2utf->convert($xml);
896     use bytes; # as opposed to chars
897     print "Path-Name: $swishpath\n";
898     print "Content-Length: ".(length($xml)+1)."\n";
899     print "Document-Type: XML\n\n$xml\n";
900     }
901     $line = 1;
902     $data = {};
903     next;
904     }
905    
906 dpavlin 74 $line = $1 if (s/^(\d+):\s*//);
907 dpavlin 67 $data->{$line++} = $_;
908    
909 dpavlin 74 fakeprogress($i);
910 dpavlin 67
911     }
912 dpavlin 177 # close lookup
913     untie %lhash if (%lhash);
914 dpavlin 1 }
915     }
916 dpavlin 3
917 dpavlin 10 # call this to commit index
918     $index->close;
919 dpavlin 3
920     1;
921     __END__
922     ##########################################################################
923    
924     =head1 NAME
925    
926 dpavlin 81 all2xml.pl - read various file formats and dump XML for SWISH-E
927 dpavlin 3
928 dpavlin 298 =head1 SYNOPSYS
929    
930     $ all2xml.pl [test.conf]
931    
932 dpavlin 3 =head1 DESCRIPTION
933    
934 dpavlin 623 This command will read ISIS data file using IsisDB perl module, MARC
935 dpavlin 81 records using MARC module and optionally Micro$oft Excel files to
936     create one XML file for usage with I<SWISH-E> indexer. Dispite it's name,
937     this script B<isn't general xml generator> from isis files (isis allready
938     has something like that). Output of this script is tailor-made for SWISH-E.
939 dpavlin 3
940 dpavlin 298 If no configuration file is specified, it will use default one called
941     C<all2xml.conf>.
942    
943 dpavlin 81 =head1 BUGS
944    
945     Documentation is really lacking. However, in true Open Source spirit, source
946     is best documentation. I even made considerable effort to comment parts
947     which are not intuitively clear, so...
948    
949 dpavlin 3 =head1 AUTHOR
950    
951     Dobrica Pavlinusic <dpavlin@rot13.org>
952    
953     =head1 COPYRIGHT
954    
955     GNU Public License (GPL) v2 or later
956    
957     =head1 SEE ALSO
958    
959     SWISH-E web site at http://www.swish-e.org
960    
961     =cut

Properties

Name Value
cvs2svn:cvs-rev 1.64
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26