/[webpac]/trunk/all2xml.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/all2xml.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 641 - (show annotations)
Sun Jan 23 02:02:10 2005 UTC (19 years, 2 months ago) by dpavlin
File MIME type: text/plain
File size: 26325 byte(s)
New implementation of indexes: now it uses only two tables (index for all
data and tags for all tags). Currently, it doesn't enforce relation between
them on RDBMS level (I have to test this code against SQLite and MySQL
before enforcing that).
Removed swish-e output while indexing, database is used as default tag to
enable filtering by database (there isn't possiblity to set tag to something
else yet!). Output usage count in index.

1 #!/usr/bin/perl -w
2
3 use strict;
4 use Biblio::Isis;
5 use Getopt::Std;
6 use Data::Dumper;
7 use XML::Simple;
8 use Text::Iconv;
9 use Config::IniFiles;
10 use Encode;
11 #use GDBM_File;
12 use Fcntl; # for O_RDWR
13 use TDB_File;
14
15 $|=1;
16
17 my $config_file = $0;
18 $config_file =~ s/\.pl$/.conf/;
19 $config_file = $ARGV[0] if ($ARGV[0] && -f $ARGV[0]);
20 die "FATAL: can't find configuration file '$config_file'" if (! -e $config_file);
21
22 my $config;
23
24 #use index_DBI; # default DBI module for index
25 #use index_DBI_cache; # faster DBI module using memory cache
26 use index_DBI_tag; # tag support for indexes
27 my $index;
28
29 my %opts;
30
31 # usage:
32 # -d directory name
33 # -m multiple directories
34 # -q quiet
35 # -s run swish
36
37 getopts('d:m:qs', \%opts);
38
39 my $path; # this is name of database
40
41 Text::Iconv->raise_error(0); # Conversion errors don't raise exceptions
42
43 # this is encoding of all files on disk, including import_xml/*.xml file and
44 # filter/*.pm files! It will be used to store strings in perl internally!
45 my $codepage = 'ISO-8859-2';
46
47 my $utf2cp = Text::Iconv->new('UTF-8',$codepage);
48 # this function will convert data from XML files to local encoding
49 sub x {
50 return $utf2cp->convert($_[0]);
51 }
52
53 # decode isis/excel or other import codepage
54 my $import2cp;
55
56 # outgoing xml must be in UTF-8
57 my $cp2utf = Text::Iconv->new($codepage,'UTF-8');
58
59 # mapping between data type and tag which specify
60 # format in XML file
61 my %type2tag = (
62 'isis' => 'isis',
63 'excel' => 'column',
64 'marc' => 'marc',
65 'feed' => 'feed'
66 );
67
68 my $cache; # for cacheing
69
70 # lookup hash (tied to file)
71 my %lhash;
72 # this option will cache all lookup entries in memory.
73 # if you are tight on memory, turn this off
74 my $use_lhash_cache = 1;
75
76 my $last_field_name; # cache to prevent repeated fields
77
78 sub data2xml {
79
80 use xmlify;
81
82 my $type = shift @_;
83 my $row = shift @_;
84 my $add_xml = shift @_;
85 # needed to read values from configuration file
86 my $cfg = shift @_;
87 my $database = shift @_;
88
89 my $xml;
90
91 use parse_format;
92
93 my $html = ""; # html formatted display output
94
95 my %field_usage; # counter for usage of each field
96
97 # sort subrouting using order="" attribute
98 sub by_order {
99 my $va = $config->{indexer}->{$a}->{order} ||
100 $config->{indexer}->{$a};
101 my $vb = $config->{indexer}->{$b}->{order} ||
102 $config->{indexer}->{$b};
103
104 return $va <=> $vb;
105 }
106
107 my @sorted_tags;
108 if ($cache->{tags_by_order}) {
109 @sorted_tags = @{$cache->{tags_by_order}};
110 } else {
111 @sorted_tags = sort by_order keys %{$config->{indexer}};
112 $cache->{tags_by_order} = \@sorted_tags;
113 }
114
115 # lookup key
116 my $lookup_key;
117
118 # cache for field in pages
119 delete $cache->{display_data};
120 delete $cache->{swish_data};
121 delete $cache->{swish_exact_data};
122 delete $cache->{index_data};
123 delete $cache->{index_delimiter};
124 my @page_fields; # names of fields
125
126
127 # subs used to produce output
128
129 sub get_field_name($$$) {
130 my ($config,$field,$field_usage) = @_;
131
132 # find field name (signular, plural)
133 my $field_name = "";
134 if ($config->{indexer}->{$field}->{name_singular} && $field_usage == 1) {
135 $field_name = $config->{indexer}->{$field}->{name_singular};
136 } elsif ($config->{indexer}->{$field}->{name_plural}) {
137 $field_name = $config->{indexer}->{$field}->{name_plural};
138 } elsif ($config->{indexer}->{$field}->{name}) {
139 $field_name = $config->{indexer}->{$field}->{name};
140 } else {
141 print STDERR "WARNING: field '$field' doesn't have 'name' attribute!";
142 }
143
144 if ($field_name) {
145 $field_name = x($field_name);
146 if (! $last_field_name) {
147 $last_field_name = $field_name;
148 return $last_field_name;
149 } elsif ($field_name ne $last_field_name) {
150 $last_field_name = $field_name;
151 return $last_field_name;
152 }
153 }
154 }
155
156
157 # init variables for different types
158 sub init_visible_type($) {
159 my $type = shift;
160
161 # swish, swish_exact, display, index, index_lookup
162 # swish and display defaults
163 my ($s,$se,$d,$i,$il) = (1,0,1,0,0);
164 if (lc($type) eq "display") {
165 $s = 0;
166 } elsif (lc($type) eq "swish") {
167 $d = 0;
168 } elsif (lc($type) eq "index") {
169 ($s,$se,$d,$i) = (0,1,0,1);
170 } elsif (lc($type) eq "swish_exact") {
171 ($s,$se,$d,$i) = (0,1,0,0);
172 } elsif (lc($type) =~ /^lookup/) {
173 ($s,$se,$d,$i,$il) = (0,1,0,0,1);
174 } elsif ($type) {
175 print STDERR "WARNING: unknown type: $type\n";
176 }
177 return ($s,$se,$d,$i,$il);
178 }
179
180
181 # convert
182 #
183 # <tag>
184 # <delimiter>, </delimiter>
185 # <value>200a</value>
186 # </tag>
187 #
188 # to
189 #
190 # <tag delimiter=", ">200a</tag>
191 #
192 # but without loosing spaces in delimiter (becasue
193 # new XML::Simple strips spaces in attribute values
194 # as defined in XML specification)
195 #
196 sub unroll_x($) {
197 my $x = shift;
198
199 if (defined $x->{value}) {
200 my ($v,$d) = ($x->{value}->{content}, $x->{delimiter}->{content});
201 delete $x->{value};
202 delete $x->{delimiter};
203 $x->{content} = $v;
204 $x->{delimiter} = $d;
205 }
206 return $x;
207 }
208
209 # begin real work: go field by field
210 foreach my $field (@sorted_tags) {
211
212 $field=x($field);
213 $field_usage{$field}++;
214
215 my $swish_data = "";
216 my $swish_exact_data = "";
217 my $display_data = "";
218 my @index_data;
219 my $line_delimiter;
220
221 my ($swish,$display);
222
223 my $tag = $type2tag{$type} || die "can't find which tag to use for type $type";
224
225 # is this field page-by-page?
226 my $iterate_by_page = $config->{indexer}->{$field}->{iterate_by_page};
227 push @page_fields,$field if ($iterate_by_page);
228 my %page_max = ();
229 # default line_delimiter if using
230 my $page_line_delimiter = $config->{indexer}->{$field}->{page_line_delimiter} || '<br/>';
231 $cache->{index_delimiter}->{$field} = $config->{indexer}->{$field}->{index_delimiter};
232
233 my $format_name = $config->{indexer}->{$field}->{format_name};
234 my $format_delimiter = $config->{indexer}->{$field}->{format_delimiter};
235 if ($format_name && $format_delimiter) {
236 $cache->{format}->{$field}->{format_name} = $format_name;
237 $cache->{format}->{$field}->{format_delimiter} = $format_delimiter;
238 }
239
240 foreach my $x (@{$config->{indexer}->{$field}->{$tag}}) {
241
242 $x = unroll_x($x);
243
244 my $format = x($x->{content});
245 my $delimiter = x($x->{delimiter}) || ' ';
246
247 my $repeat_off = 0; # init repeatable offset
248
249 my ($s,$se,$d,$i,$il) = init_visible_type($x->{type});
250
251 # what will separate last line from this one?
252 if ($display_data && $x->{append}) {
253 $line_delimiter = $delimiter;
254 } elsif ($display_data) {
255 $line_delimiter = '<br/>';
256 }
257
258 # init vars so that we go into while...
259 ($swish,$display) = (1,1);
260
261 sub mkformat($$) {
262 my $x = shift || die "mkformat needs tag reference";
263 my $data = shift || return;
264 my $format_name = x($x->{format_name}) || return $data;
265 my $fmt = x($config->{format}->{$format_name}->{content}) || die "<format name=\"$format_name\"> is not defined!";
266 my $format_delimiter = x($x->{format_delimiter});
267 my @data;
268 if ($format_delimiter) {
269 @data = split(/$format_delimiter/,$data);
270 } else {
271 push @data,$data;
272 }
273
274 if ($fmt) {
275 my $nr = scalar $fmt =~ s/%s/%s/g;
276 if (($#data+1) == $nr) {
277 return sprintf($fmt,@data);
278 } else {
279 #print STDERR "mkformat: [$data] can't be split on [$format_delimiter] to $nr fields!\n";
280 return $data;
281 }
282 } else {
283 print STDERR "usage of link '$format_name' without defined format (<link> tag)\n";
284 }
285 }
286
287 # while because of repeatable fields
288 while ($swish || $display) {
289 my $page = $repeat_off;
290 $page_max{$field} = $page if ($iterate_by_page && $page > ($page_max{$field} || 0));
291 ($swish,$display) = parse_format($type, $format,$row,$repeat_off++,$import2cp);
292 if ($repeat_off > 1000) {
293 print STDERR "loop (more than 1000 repeatable fields) deteced in $row, $format\n";
294 last;
295 }
296
297 # is this field is lookup?
298 if ($display && $x->{lookup}) {
299 my $null = "<!-- null -->";
300 if ($use_lhash_cache) {
301 if (!defined($cache->{lhash}->{$display})) {
302 my $new_display = $lhash{$display};
303 if (defined($new_display)) {
304 #print STDERR "lookup cache store '$display' = '$new_display'\n";
305 $display = $new_display;
306 $cache->{lhash}->{$display} = $new_display;
307 } else {
308 # print STDERR "WARNING: lookup for '$display' didn't find anything.\n";
309 $display = "";
310 $cache->{lhash}->{$display} = $null;
311 }
312 } else {
313 $display = $cache->{lhash}->{$display};
314 }
315 } else {
316 $display = $lhash{$display} || $null;
317 }
318 }
319
320 # filter="name" ; filter this field through
321 # filter/[name].pm
322 my $filter = $x->{filter};
323 if ($filter && !$cache->{filter_loaded}->{$filter}) {
324 require "filter/".$filter.".pm";
325 $cache->{filter_loaded}->{$filter}++;
326 }
327 # type="swish" ; field for swish
328 if ($swish) {
329 my $tmp = $swish;
330 if ($filter && ($s || $se)) {
331 no strict 'refs';
332 $tmp = join(" ",&$filter($tmp)) if ($s || $se);
333 }
334
335 $swish_data .= $tmp if ($s && $tmp);
336 $swish_exact_data .= "xxbxx $tmp xxexx " if ($tmp && $tmp ne "" && $se);
337 }
338
339 # type="display" ; field for display
340 if ($d && $display) {
341 my $ldel = $delimiter;
342 if ($line_delimiter && $display_data) {
343 $ldel = $line_delimiter;
344 }
345 if ($filter) {
346 no strict 'refs';
347 my @arr;
348 foreach my $tmp (&$filter($display)) {
349 my $tmp2 = mkformat($x,$tmp);
350 push @arr,$tmp2 if ($tmp2);
351 }
352 $display_data .= $ldel if ($display_data && @arr);
353 $display_data .= join($delimiter,@arr);
354 } else {
355 $display_data .= $ldel if ($display_data);
356 my $tmp = mkformat($x,$display);
357 $display_data .= $tmp if ($tmp);
358 }
359 }
360
361 # type="index" ; insert into index
362 my $idisplay;
363 if ($i && $display) {
364 $idisplay = $display;
365 if ($filter) {
366 no strict 'refs';
367 $idisplay = &$filter($idisplay);
368 }
369 push @index_data, $idisplay if ($idisplay && !$iterate_by_page);
370 }
371
372 # store fields in lookup
373 if ($il && $display) {
374 if (lc($x->{type}) eq "lookup_key") {
375 if ($lookup_key) {
376 print STDERR "WARNING: try to redefine lookup_key (keys shouldn't be repeatable fields!)";
377 } else {
378 if ($filter) {
379 no strict 'refs';
380 $lookup_key = &$filter($display);
381 } else {
382 $lookup_key = $display;
383 }
384 }
385 } elsif (lc($x->{type}) eq "lookup_val") {
386 if ($lookup_key) {
387 if ($filter) {
388 no strict 'refs';
389 $lhash{$lookup_key} = &$filter($display);
390 } else {
391 $lhash{$lookup_key} = $display;
392 }
393 } else {
394 print STDERR "WARNING: no lookup_key defined for '$display'?";
395 }
396 }
397
398 }
399
400 # store data for page-by-page repeatable fields
401 if ($iterate_by_page) {
402 sub iterate_fld($$$$$$) {
403 my ($cache,$what,$field,$page,$data,$append) = @_;
404 return if (!$data);
405
406 my $ldel = $page_line_delimiter;
407 $ldel = " " if ($append);
408 #print STDERR "line delimiter: ",Dumper($ldel) if ($ldel);
409 if (! $cache->{$what}->{$field}->[$page]) {
410 $cache->{$what}->{$field}->[$page] = $data;
411 } else {
412 $cache->{$what}->{$field}->[$page] .= $ldel.$data;
413 }
414 }
415
416 if ($display_data) {
417 iterate_fld($cache,'display_data',$field,$page,$display_data,$x->{append});
418 }
419 $display_data = "";
420 if ($swish_data) {
421 iterate_fld($cache,'swish_data',$field,$page,$swish_data,$x->{append});
422 $swish_data = "";
423 }
424 if ($swish_exact_data) {
425 iterate_fld($cache,'swish_exact_data',$field,$page,$swish_exact_data,$x->{append});
426 $swish_exact_data = "";
427 }
428
429 if ($idisplay) {
430 my $ldel=$page_line_delimiter;
431 my @index_data;
432 if ($cache->{index_data}->{$field}->[$page]) {
433
434 @index_data = @{$cache->{index_data}->{$field}->[$page]};
435 }
436 if ($x->{append}) {
437 if (@index_data) {
438 $index_data[$#index_data] .= $idisplay;
439 } else {
440 push @index_data, $idisplay;
441 }
442 } else {
443 push @index_data, $idisplay;
444 }
445 $idisplay = "";
446 @{$cache->{index_data}->{$field}->[$page]} = @index_data;
447 }
448 }
449 }
450
451 if (! $iterate_by_page) {
452 my $idel = $x->{index_delimiter};
453 # fill data in index
454 foreach my $tmp (@index_data) {
455 my $i = $d = $tmp;
456 if ($idel && $tmp =~ m/$idel/) {
457 ($i,$d) = split(/$idel/,$tmp);
458 }
459 $index->insert($field, $i, $d, $path);
460 }
461 @index_data = ();
462 }
463 }
464
465 # now try to parse variables from configuration file
466 foreach my $x (@{$config->{indexer}->{$field}->{'config'}}) {
467
468 $x = unroll_x($x);
469
470 my $delimiter = x($x->{delimiter}) || ' ';
471 my $val = $cfg->val($database, x($x->{content}));
472
473 # FIXME index_lookup is not supported!
474 my ($s,$se,$d,$i,$il) = init_visible_type($x->{type});
475
476 if ($val) {
477 $display_data .= $delimiter.$val if ($d);
478 $swish_data .= " ".$val if ($s);
479 $index->insert($field, $val, $path) if ($i);
480 }
481
482 if ($iterate_by_page) {
483 # FIXME data from config tag will appear just
484 # on first page!!!
485 my $page = 0;
486 if ($display_data) {
487 $cache->{display_data}->{$field}->[$page] = $display_data;
488 $display_data = "";
489 }
490 if ($swish_data) {
491 $cache->{swish_data}->{$field}->[$page] = $swish_data;
492 $swish_data = "";
493 }
494 if ($swish_exact_data) {
495 $cache->{swish_exact_data}->{$field}->[$page] = $swish_exact_data;
496 $swish_exact_data = "";
497 }
498 }
499 }
500
501 # save data page-by-page
502 foreach my $field (@page_fields) {
503 my $nr_pages = $page_max{$field} || next;
504 #print STDERR "field '$field' iterate over ",($nr_pages || 0)," pages...\n";
505 #print STDERR Dumper($cache->{display_data});
506 for (my $page=0; $page <= $nr_pages; $page++) {
507 my $display_data;
508 if ($cache->{format}->{$field}) {
509 my $tmp = mkformat($cache->{format}->{$field},$cache->{display_data}->{$field}->[$page]);
510 $display_data=$tmp if ($tmp);
511 } else {
512 $display_data = $cache->{display_data}->{$field}->[$page];
513 }
514 if ($display_data) { # default
515 if ($field eq "headline") {
516 $xml .= xmlify("headline", $display_data);
517 } else {
518
519 # fallback to empty field name if needed
520 $html .= get_field_name($config,$field,$field_usage{$field}) || '';
521 $html .= "#-#".$display_data."###\n";
522 }
523 }
524
525 my $swish_data = $cache->{swish_data}->{$field}->[$page];
526 if ($swish_data) {
527 # remove extra spaces
528 $swish_data =~ s/ +/ /g;
529 $swish_data =~ s/ +$//g;
530
531 $xml .= xmlify($field."_swish", my_unac_string($codepage,$swish_data));
532 }
533
534 my $swish_exact_data = $cache->{swish_exact_data}->{$field}->[$page];
535 if ($swish_exact_data) {
536 $swish_exact_data =~ s/ +/ /g;
537 $swish_exact_data =~ s/ +$//g;
538
539 # add delimiters before and after word.
540 # That is required to produce exact match
541 $xml .= xmlify($field."_swish_exact", my_unac_string($codepage,$swish_exact_data));
542 }
543
544 my $idel = $cache->{index_delimiter}->{$field};
545 foreach my $tmp (@{$cache->{index_data}->{$field}->[$page]}) {
546 my $i = $tmp;
547 my $d = $tmp;
548 if ($idel && $tmp =~ m/$idel/) {
549 ($i,$d) = split(/$idel/,$tmp);
550 }
551 $index->insert($field, $i, $d, $path);
552 #print STDERR "index [$idel] $field: $i --> $d [$path]\n";
553 }
554 }
555
556 }
557
558 if (! $iterate_by_page) {
559 if ($display_data) {
560 if ($field eq "headline") {
561 $xml .= xmlify("headline", $display_data);
562 } else {
563
564 # fallback to empty field name if needed
565 $html .= get_field_name($config,$field,$field_usage{$field}) || '';
566 $html .= "#-#".$display_data."###\n";
567 }
568 }
569 if ($swish_data) {
570 # remove extra spaces
571 $swish_data =~ s/ +/ /g;
572 $swish_data =~ s/ +$//g;
573
574 $xml .= xmlify($field."_swish", my_unac_string($codepage,$swish_data));
575 }
576
577 if ($swish_exact_data) {
578 $swish_exact_data =~ s/ +/ /g;
579 $swish_exact_data =~ s/ +$//g;
580
581 # add delimiters before and after word.
582 # That is required to produce exact match
583 $xml .= xmlify($field."_swish_exact", my_unac_string($codepage,$swish_exact_data));
584 }
585 }
586 }
587
588 # dump formatted output in <html>
589 if ($html) {
590 #$xml .= xmlify("html",$html);
591 $xml .= "<html><![CDATA[ $html ]]></html>";
592 }
593
594 if ($xml) {
595 $xml .= $add_xml if ($add_xml);
596 return "<xml>\n$xml</xml>\n";
597 } else {
598 return;
599 }
600 }
601
602 ##########################################################################
603
604 # read configuration for this script
605 my $cfg = new Config::IniFiles( -file => $config_file );
606
607 # read global.conf configuration
608 my $cfg_global = new Config::IniFiles( -file => 'global.conf' );
609
610 # open index
611 $index = new index_DBI(
612 $cfg_global->val('global', 'dbi_dbd'),
613 $cfg_global->val('global', 'dbi_dsn'),
614 $cfg_global->val('global', 'dbi_user'),
615 $cfg_global->val('global', 'dbi_passwd') || '',
616 );
617
618 my $show_progress = $cfg_global->val('global', 'show_progress');
619
620 my $my_unac_filter = $cfg_global->val('global', 'my_unac_filter');
621 if ($my_unac_filter) {
622 print STDERR "using $my_unac_filter to filter characters for search\n";
623 require $my_unac_filter;
624 } else {
625 print STDERR "### fallback to default my_unac_string!\n";
626 eval q{
627 sub main::my_unac_string($$) {
628 my ($charset, $string) = (@_);
629 return $string;
630 }
631 };
632 }
633
634 foreach my $database ($cfg->Sections) {
635
636 my $type = lc($cfg -> val($database, 'type')) || die "$database doesn't have 'type' defined";
637 my $add_xml = $cfg -> val($database, 'xml'); # optional
638
639 # create new lookup file
640 my $lookup_file = $cfg -> val($database, 'lookup_newfile'); # optional
641 if ($lookup_file) {
642 #tie %lhash, 'GDBM_File', $lookup_file, &GDBM_NEWDB, 0644;
643 if (! -e $lookup_file) {
644 open(LOOKUP, "> $lookup_file") || die "can't create $lookup_file': $!";
645 close(LOOKUP);
646 }
647 tie %lhash, 'TDB_File', $lookup_file, TDB_CLEAR_IF_FIRST, O_RDWR, 0644;
648 print STDERR "creating lookup file '$lookup_file'\n";
649 # delete memory cache for lookup file
650 delete $cache->{lhash};
651 }
652
653 # open existing lookup file
654 $lookup_file = $cfg -> val($database, 'lookup_open'); # optional
655 if ($lookup_file) {
656 #tie %lhash, 'GDBM_File', $lookup_file, &GDBM_READER, 0644;
657 tie %lhash, 'TDB_File', $lookup_file, TDB_DEFAULT, O_RDWR, 0644;
658 print STDERR "opening lookup file '$lookup_file'\n";
659 }
660
661 print STDERR "reading ./import_xml/$type.xml\n";
662
663 # extract just type basic
664 my $type_base = $type;
665 $type_base =~ s/_.+$//g;
666
667 $config=XMLin("./import_xml/$type.xml", ForceArray => [ $type2tag{$type_base}, 'config', 'format' ], ForceContent => 1 );
668
669 # helper for progress bar
670 sub fmt_time {
671 my $t = shift || 0;
672 my $out = "";
673
674 my ($ss,$mm,$hh) = gmtime($t);
675 $out .= "${hh}h" if ($hh);
676 $out .= sprintf("%02d:%02d", $mm,$ss);
677 $out .= " " if ($hh == 0);
678 return $out;
679 }
680
681 # output current progress indicator
682 my $last_p = 0;
683 my $start_t = time();
684 sub progress {
685 return if (! $show_progress);
686 my $current = shift;
687 my $total = shift || 1;
688 my $p = int($current * 100 / $total);
689 if ($p < $last_p || $current == 1) {
690 $start_t = time();
691 $last_p = 0;
692 } elsif ($p != $last_p) {
693 my $rate = ($current / (time() - $start_t || 1));
694 my $eta = ($total-$current) / ($rate || 1);
695 printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$current,"=" x ($p/3)."$p%>", $total, $rate, fmt_time($eta));
696 $last_p = $p;
697 }
698 }
699
700 my $fake_dir = 1;
701 my $fake_pos = 0;
702 my $last_fake_t = time();
703 sub fakeprogress {
704 return if (! $show_progress);
705 my $current = shift @_;
706
707 my @ind = ('-','\\','|','/','-','\\','|','/');
708
709 if ($current < $fake_pos) {
710 $start_t = time();
711 $last_fake_t = 0;
712 $fake_dir = 1;
713 $fake_pos = 0;
714 }
715
716 if (time()-$last_fake_t >= 1) {
717 $last_fake_t = time();
718 $fake_pos += $fake_dir;
719 $fake_dir = -$fake_dir if ($fake_pos > 38);
720 }
721
722 if ($current % 10 == 0) {
723 my $rate = ($current / (time() - $start_t || 1));
724 printf STDERR ("%5d [%-38s] %0.1f/s\r",$current, " " x $fake_pos .$ind[($current / 10) % 8], $rate);
725 }
726 }
727
728 # now read database
729 print STDERR "using: $type...\n";
730
731 # erase cache for tags by order in this database
732 delete $cache->{tags_by_order};
733
734 if ($type_base eq "isis") {
735
736 my $isis_db = $cfg -> val($database, 'isis_db') || die "$database doesn't have 'isis_db' defined!";
737
738 $import2cp = Text::Iconv->new($config->{isis_codepage},$codepage);
739 my $db = new Biblio::Isis( isisdb => $isis_db );
740
741 my $max_rowid = $db->count || die "can't find maxmfn";
742
743 print STDERR "Reading database: $isis_db [$max_rowid rows]\n";
744
745 $path = $database;
746
747 for (my $row_id = 1; $row_id <= $max_rowid; $row_id++ ) {
748 my $row = $db->to_hash( $row_id );
749 if ($row) {
750
751 $row->{mfn} = $row_id;
752 $row->{record} = $db->{record};
753
754 progress($row->{mfn}, $max_rowid);
755
756 my $swishpath = $path."#".int($row->{mfn});
757
758 if (my $xml = data2xml($type_base,$row,$add_xml,$cfg,$database)) {
759 $xml = $cp2utf->convert($xml);
760 use bytes; # as opposed to chars
761 print "Path-Name: $swishpath\n";
762 print "Content-Length: ".(length($xml)+1)."\n";
763 print "Document-Type: XML\n\n$xml\n";
764 }
765 }
766 }
767 print STDERR "\n";
768
769 } elsif ($type_base eq "excel") {
770 require Spreadsheet::ParseExcel;
771 require Spreadsheet::ParseExcel::Utility;
772 import Spreadsheet::ParseExcel::Utility qw(int2col);
773
774 $import2cp = Text::Iconv->new($config->{excel_codepage},$codepage);
775 my $excel_file = $cfg -> val($database, 'excel_file') || die "$database doesn't have 'excel_file' defined!";
776
777 my $sheet = x($config->{sheet}) || die "no sheet in $type.xml";
778 my $start_row = x($config->{start_row}) - 1 || die "no start_row in $type.xml";
779
780 my $oBook = Spreadsheet::ParseExcel::Workbook->Parse($excel_file) || die "can't open Excel file '$excel_file'";
781
782 my $sheet_nr = 0;
783 foreach my $oWks (@{$oBook->{Worksheet}}) {
784 #print STDERR "-- SHEET $sheet_nr:", $oWks->{Name}, "\n";
785 last if ($oWks->{Name} eq $sheet);
786 $sheet_nr++;
787 }
788
789 my $oWorksheet = $oBook->{Worksheet}[$sheet_nr];
790
791 print STDERR "using sheet: ",$oWorksheet->{Name},"\n";
792 defined ($oWorksheet) || die "can't find sheet '$sheet' in $excel_file";
793 my $end_row = x($config->{end_row}) || $oWorksheet->{MaxRow};
794
795 for(my $iR = $start_row ; defined $end_row && $iR <= $end_row ; $iR++) {
796 my $row;
797 for(my $iC = $oWorksheet->{MinCol} ; defined $oWorksheet->{MaxCol} && $iC <= $oWorksheet->{MaxCol} ; $iC++) {
798 my $cell = $oWorksheet->{Cells}[$iR][$iC];
799 if ($cell) {
800 # this conversion is a cludge.
801 # Files from Excell could have
802 # characters which don't fit into
803 # destination encoding.
804 $row->{int2col($iC)} = $utf2cp->convert($cell->Value) || $cell->Value;
805 }
806 }
807
808 progress($iR, $end_row);
809
810 # print "row[$iR/$end_row] ";
811 # foreach (keys %{$row}) {
812 # print "$_: ",$row->{$_},"\t";
813 # }
814 # print "\n";
815
816 my $swishpath = $database."#".$iR;
817
818 next if (! $row);
819
820 if (my $xml = data2xml($type_base,$row,$add_xml,$cfg,$database)) {
821 $xml = $cp2utf->convert($xml);
822 use bytes; # as opposed to chars
823 print "Path-Name: $swishpath\n";
824 print "Content-Length: ".(length($xml)+1)."\n";
825 print "Document-Type: XML\n\n$xml\n";
826 }
827 }
828 } elsif ($type_base eq "marc") {
829
830 require MARC::File::USMARC;
831
832 $import2cp = Text::Iconv->new($config->{marc_codepage},$codepage);
833 my $marc_file = $cfg -> val($database, 'marc_file') || die "$database doesn't have 'marc_file' defined!";
834
835 # optional argument is format
836 warn "marc_format is no longer used!" if ($config->{marc_format});
837 print STDERR "Reading MARC file '$marc_file'\n";
838
839 my $marc = MARC::File::USMARC->in( $marc_file )
840 || die "Can't open MARC file '$marc_file': ".$MARC::File::ERROR;
841
842 # count records in MARC file
843 sub marc_count {
844 my $filename = shift || die;
845 my $file = MARC::File::USMARC->in($filename) || die $MARC::File::ERROR;
846 my $count = 0;
847 while ($file->skip()) {
848 $count++;
849 }
850 return $count;
851 }
852
853 my $count = marc_count($marc_file) || warn "no records in '$marc_file'?";
854
855 my $i = 0;
856
857 while( my $rec = $marc->next() ) {
858
859 progress($i++,$count);
860
861 my $swishpath = $database."#".$i;
862
863 if (my $xml = data2xml($type_base,$rec,$add_xml,$cfg,$database)) {
864 $xml = $cp2utf->convert($xml);
865 use bytes; # as opposed to chars
866 print "Path-Name: $swishpath\n";
867 print "Content-Length: ".(length($xml)+1)."\n";
868 print "Document-Type: XML\n\n$xml\n";
869 }
870 }
871
872 print STDERR "\n";
873
874 } elsif ($type_base eq "feed") {
875
876 $import2cp = Text::Iconv->new($config->{feed_codepage},$codepage);
877 my $prog = x($config->{prog}) || die "$database doesn't have 'prog' defined!";
878
879 print STDERR "Reading feed from program '$prog'\n";
880
881 open(FEED,"feeds/$prog |") || die "can't start $prog: $!";
882
883 my $i=1; # record nr.
884
885 my $data;
886 my $line=1;
887
888 while (<FEED>) {
889 chomp;
890
891 if (/^$/) {
892 my $swishpath = $database."#".$i++;
893
894 if (my $xml = data2xml($type_base,$data,$add_xml,$cfg,$database)) {
895 $xml = $cp2utf->convert($xml);
896 use bytes; # as opposed to chars
897 print "Path-Name: $swishpath\n";
898 print "Content-Length: ".(length($xml)+1)."\n";
899 print "Document-Type: XML\n\n$xml\n";
900 }
901 $line = 1;
902 $data = {};
903 next;
904 }
905
906 $line = $1 if (s/^(\d+):\s*//);
907 $data->{$line++} = $_;
908
909 fakeprogress($i);
910
911 }
912 # close lookup
913 untie %lhash if (%lhash);
914 }
915 }
916
917 # call this to commit index
918 $index->close;
919
920 1;
921 __END__
922 ##########################################################################
923
924 =head1 NAME
925
926 all2xml.pl - read various file formats and dump XML for SWISH-E
927
928 =head1 SYNOPSYS
929
930 $ all2xml.pl [test.conf]
931
932 =head1 DESCRIPTION
933
934 This command will read ISIS data file using IsisDB perl module, MARC
935 records using MARC module and optionally Micro$oft Excel files to
936 create one XML file for usage with I<SWISH-E> indexer. Dispite it's name,
937 this script B<isn't general xml generator> from isis files (isis allready
938 has something like that). Output of this script is tailor-made for SWISH-E.
939
940 If no configuration file is specified, it will use default one called
941 C<all2xml.conf>.
942
943 =head1 BUGS
944
945 Documentation is really lacking. However, in true Open Source spirit, source
946 is best documentation. I even made considerable effort to comment parts
947 which are not intuitively clear, so...
948
949 =head1 AUTHOR
950
951 Dobrica Pavlinusic <dpavlin@rot13.org>
952
953 =head1 COPYRIGHT
954
955 GNU Public License (GPL) v2 or later
956
957 =head1 SEE ALSO
958
959 SWISH-E web site at http://www.swish-e.org
960
961 =cut

Properties

Name Value
cvs2svn:cvs-rev 1.64
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26