/[webpac]/trunk/all2xml.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/all2xml.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 163 - (show annotations)
Thu Nov 20 21:23:40 2003 UTC (20 years, 4 months ago) by dpavlin
File MIME type: text/plain
File size: 16781 byte(s)
Added type="swish_exact" to save data into swish index with boundaries
xxbxx data xxexxx. This is helpful to implement exact match from beginning
of query and exact match to full query which are defined using e[nr] field
in web user interface (with same [nr] as f[nr] and v[nr] fields) which
have to have value 1 (from beginning) 2 (from end, not that useful...) or
3 (1+2 - exact match)

1 #!/usr/bin/perl -w
2
3 use strict;
4 use OpenIsis;
5 use Getopt::Std;
6 use Data::Dumper;
7 use XML::Simple;
8 use Text::Unaccent 1.02; # 1.01 won't compile on my platform,
9 use Text::Iconv;
10 use Config::IniFiles;
11 use Encode;
12
13 $|=1;
14
15 my $config_file = $0;
16 $config_file =~ s/\.pl$/.conf/;
17 die "FATAL: can't find configuration file '$config_file'" if (! -e $config_file);
18
19 my $config;
20
21 #use index_DBI; # default DBI module for index
22 use index_DBI_cache; # faster DBI module using memory cache
23 my $index;
24
25 my %opts;
26
27 # usage:
28 # -d directory name
29 # -m multiple directories
30 # -q quiet
31 # -s run swish
32
33 getopts('d:m:qs', \%opts);
34
35 my $path; # this is name of database
36
37 Text::Iconv->raise_error(0); # Conversion errors don't raise exceptions
38
39 # this is encoding of all files on disk, including import_xml/*.xml file and
40 # filter/*.pm files! It will be used to store strings in perl internally!
41 my $codepage = 'ISO-8859-2';
42
43 my $utf2cp = Text::Iconv->new('UTF-8',$codepage);
44 # this function will convert data from XML files to local encoding
45 sub x {
46 return $utf2cp->convert($_[0]);
47 }
48
49 # decode isis/excel or other import codepage
50 my $import2cp;
51
52 # outgoing xml must be in UTF-8
53 my $cp2utf = Text::Iconv->new($codepage,'UTF-8');
54
55 # mapping between data type and tag which specify
56 # format in XML file
57 my %type2tag = (
58 'isis' => 'isis',
59 'excel' => 'column',
60 'marc' => 'marc',
61 'feed' => 'feed'
62 );
63
64 sub data2xml {
65
66 use xmlify;
67
68 my $type = shift @_;
69 my $row = shift @_;
70 my $add_xml = shift @_;
71 # needed to read values from configuration file
72 my $cfg = shift @_;
73 my $database = shift @_;
74
75 my $xml;
76
77 use parse_format;
78
79 my $html = ""; # html formatted display output
80
81 my %field_usage; # counter for usage of each field
82
83 # sort subrouting using order="" attribute
84 sub by_order {
85 my $va = $config->{indexer}->{$a}->{order} ||
86 $config->{indexer}->{$a};
87 my $vb = $config->{indexer}->{$b}->{order} ||
88 $config->{indexer}->{$b};
89
90 return $va <=> $vb;
91 }
92
93 foreach my $field (sort by_order keys %{$config->{indexer}}) {
94
95 $field=x($field);
96 $field_usage{$field}++;
97
98 my $swish_data = "";
99 my $swish_exact_data = "";
100 my $display_data = "";
101 my $line_delimiter;
102
103 my ($swish,$display);
104
105 my $tag = $type2tag{$type} || die "can't find which tag to use for type $type";
106 foreach my $x (@{$config->{indexer}->{$field}->{$tag}}) {
107
108 my $format = x($x->{content});
109 my $delimiter = x($x->{delimiter}) || ' ';
110
111 my $repeat_off = 0; # repeatable offset
112
113 my ($s,$se,$d,$i) = (1,0,1,0); # swish, display default
114 $s = 0 if (lc($x->{type}) eq "display");
115 $d = 0 if (lc($x->{type}) eq "swish");
116 $se = 1 if (lc($x->{type}) eq "swish_exact");
117 ($s,$d,$i) = (0,0,1) if (lc($x->{type}) eq "index");
118
119 # what will separate last line from this one?
120 if ($display_data && $x->{append} && $x->{append} eq "1") {
121 $line_delimiter = ' ';
122 } elsif ($display_data) {
123 $line_delimiter = '<br/>';
124 }
125
126 # init vars so that we go into while...
127 ($swish,$display) = (1,1);
128
129 # placeholder for all repeatable entries for index
130 my @index_data;
131 my $index_filter;
132
133 sub mkformat {
134 my $x = shift || die "mkformat needs tag reference";
135 my $data = shift || return;
136 my $format_name = x($x->{format_name}) || return $data;
137 my $fmt = x($config->{format}->{$format_name}->{content}) || die "<format name=\"$format_name\"> is not defined!";
138 my $format_delimiter = x($x->{format_delimiter});
139 my @data;
140 if ($format_delimiter) {
141 @data = split(/$format_delimiter/,$data);
142 } else {
143 push @data,$data;
144 }
145
146 if ($fmt) {
147 my $nr = scalar $fmt =~ s/%s/%s/g;
148 if (($#data+1) == $nr) {
149 return sprintf($fmt,@data);
150 } else {
151 print STDERR "mkformat: [$data] can't be split on [$format_delimiter] to $nr fields!\n";
152 return $data;
153 }
154 } else {
155 print STDERR "usage of link '$format_name' without defined format (<link> tag)\n";
156 }
157 }
158
159 # while because of repeatable fields
160 while ($swish || $display) {
161 ($swish,$display) = parse_format($type, $format,$row,$repeat_off++,$import2cp);
162 if ($repeat_off > 1000) {
163 print STDERR "loop (more than 1000 repeatable fields) deteced in $row, $format\n";
164 last;
165 }
166
167 # filter="name" ; filter this field through
168 # filter/[name].pm
169 my $filter = $x->{filter};
170 if ($filter) {
171 require "filter/".$filter.".pm";
172 }
173 # type="swish" ; field for swish
174 if ($swish) {
175 if ($filter && ($s || $se)) {
176 no strict 'refs';
177 my $tmp = join(" ",&$filter($swish)) if ($s || $se);
178 $swish_data .= $tmp if ($s);
179 $swish_exact_data .= $tmp if ($se);
180
181 } else {
182 $swish_data .= $swish if ($s);
183 $swish_exact_data .= $swish if ($se);
184 }
185 }
186
187 # type="display" ; field for display
188 if ($d && $display) {
189 if ($line_delimiter && $display_data) {
190 $display_data .= $line_delimiter;
191 undef $line_delimiter;
192 }
193 if ($filter) {
194 no strict 'refs';
195 if ($display_data) {
196 $display_data .= $delimiter.join($delimiter,mkformat($x,&$filter($display)));
197 } else {
198 $display_data = join($delimiter,mkformat($x,&$filter($display)));
199 }
200 } else {
201 if ($display_data) {
202 $display_data .= $delimiter.mkformat($x,$display);
203 } else {
204 $display_data = mkformat($x,$display);
205 }
206 }
207 }
208
209 # type="index" ; insert into index
210 if ($i && $display) {
211 push @index_data, $display;
212 $index_filter = $filter if ($filter);
213 }
214 }
215
216 # fill data in index
217 if (@index_data) {
218 if ($index_filter) {
219 no strict 'refs';
220 foreach my $d (@index_data) {
221 $index->insert($field, &$index_filter($d), $path);
222 }
223 } else {
224 foreach my $d (@index_data) {
225 $index->insert($field, $d, $path);
226 }
227 }
228 }
229 }
230
231 # now try to parse variables from configuration file
232 foreach my $x (@{$config->{indexer}->{$field}->{'config'}}) {
233
234 my $delimiter = x($x->{delimiter}) || ' ';
235 my $val = $cfg->val($database, x($x->{content}));
236
237 my ($s,$d,$i) = (1,1,0); # swish, display default
238 $s = 0 if (lc($x->{type}) eq "display");
239 $d = 0 if (lc($x->{type}) eq "swish");
240 # no support for swish exact in config.
241 # IMHO, it's useless
242 ($s,$d,$i) = (0,0,1) if (lc($x->{type}) eq "index");
243
244 if ($val) {
245 $display_data .= $delimiter.$val if ($d);
246 $swish_data .= $val if ($s);
247 $index->insert($field, $val, $path) if ($i);
248 }
249
250 }
251
252
253 if ($display_data) {
254
255 if ($field eq "headline") {
256 $xml .= xmlify("headline", $display_data);
257 } else {
258
259 # find field name (signular, plural)
260 my $field_name = "";
261 if ($config->{indexer}->{$field}->{name_singular} && $field_usage{$field} == 1) {
262 $field_name = $config->{indexer}->{$field}->{name_singular}."#-#";
263 } elsif ($config->{indexer}->{$field}->{name_plural}) {
264 $field_name = $config->{indexer}->{$field}->{name_plural}."#-#";
265 } elsif ($config->{indexer}->{$field}->{name}) {
266 $field_name = $config->{indexer}->{$field}->{name}."#-#";
267 } else {
268 print STDERR "WARNING: field '$field' doesn't have 'name' attribute!";
269 }
270 if ($field_name) {
271 $html .= x($field_name);
272 }
273 $html .= $display_data."###\n";
274 }
275 }
276 if ($swish_data) {
277 # remove extra spaces
278 $swish_data =~ s/ +/ /g;
279 $swish_data =~ s/ +$//g;
280
281 $xml .= xmlify($field."_swish", unac_string($codepage,$swish_data));
282 }
283
284 if ($swish_exact_data) {
285 $swish_exact_data =~ s/ +/ /g;
286 $swish_exact_data =~ s/ +$//g;
287
288 # add delimiters before and after word.
289 # That is required to produce exact match
290 $xml .= xmlify($field."_swish_exact", unac_string($codepage,'xxbxx '.$swish_exact_data.' xxexx'));
291 }
292
293
294 }
295
296 # dump formatted output in <html>
297 if ($html) {
298 #$xml .= xmlify("html",$html);
299 $xml .= "<html><![CDATA[ $html ]]></html>";
300 }
301
302 if ($xml) {
303 $xml .= $add_xml if ($add_xml);
304 return "<xml>\n$xml</xml>\n";
305 } else {
306 return;
307 }
308 }
309
310 ##########################################################################
311
312 # read configuration for this script
313 my $cfg = new Config::IniFiles( -file => $config_file );
314
315 # read global.conf configuration
316 my $cfg_global = new Config::IniFiles( -file => 'global.conf' );
317
318 # open index
319 $index = new index_DBI(
320 $cfg_global->val('global', 'dbi_dbd'),
321 $cfg_global->val('global', 'dbi_dsn'),
322 $cfg_global->val('global', 'dbi_user'),
323 $cfg_global->val('global', 'dbi_passwd') || '',
324 );
325
326 my $show_progress = $cfg_global->val('global', 'show_progress');
327
328 foreach my $database ($cfg->Sections) {
329
330 my $type = lc($cfg -> val($database, 'type')) || die "$database doesn't have 'type' defined";
331 my $add_xml = $cfg -> val($database, 'xml'); # optional
332
333 print STDERR "reading ./import_xml/$type.xml\n";
334
335 # extract just type basic
336 my $type_base = $type;
337 $type_base =~ s/_.+$//g;
338
339 $config=XMLin("./import_xml/$type.xml", forcearray => [ $type2tag{$type_base}, 'config', 'format' ], forcecontent => 1);
340
341 # output current progress indicator
342 my $last_p = 0;
343 sub progress {
344 return if (! $show_progress);
345 my $current = shift;
346 my $total = shift || 1;
347 my $p = int($current * 100 / $total);
348 if ($p != $last_p) {
349 printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$current,$total,"=" x ($p/2).">", $p );
350 $last_p = $p;
351 }
352 }
353
354 my $fake_dir = 1;
355 sub fakeprogress {
356 return if (! $show_progress);
357 my $current = shift @_;
358
359 my @ind = ('-','\\','|','/','-','\\','|','/', '-');
360
361 $last_p += $fake_dir;
362 $fake_dir = -$fake_dir if ($last_p > 1000 || $last_p < 0);
363 if ($last_p % 10 == 0) {
364 printf STDERR ("%5d / %5s [%-51s]\r",$current,"?"," " x ($last_p/20).$ind[($last_p/20) % $#ind]);
365 }
366 }
367
368 # now read database
369 print STDERR "using: $type...\n";
370
371 if ($type_base eq "isis") {
372
373 my $isis_db = $cfg -> val($database, 'isis_db') || die "$database doesn't have 'isis_db' defined!";
374
375 $import2cp = Text::Iconv->new($config->{isis_codepage},$codepage);
376 my $db = OpenIsis::open( $isis_db );
377
378 # check if .txt database for OpenIsis is zero length,
379 # if so, erase it and re-open database
380 sub check_txt_db {
381 my $isis_db = shift || die "need isis database name";
382 my $reopen = 0;
383
384 if (-e $isis_db.".TXT") {
385 print STDERR "WARNING: removing $isis_db.TXT OpenIsis database...\n";
386 unlink $isis_db.".TXT" || warn "FATAL: unlink error on '$isis_db.TXT': $!";
387 $reopen++;
388 }
389 if (-e $isis_db.".PTR") {
390 print STDERR "WARNING: removing $isis_db.PTR OpenIsis database...\n";
391 unlink $isis_db.".PTR" || warn "FATAL: unlink error on '$isis_db.PTR': $!";
392 $reopen++;
393 }
394 return OpenIsis::open( $isis_db ) if ($reopen);
395 }
396
397 # EOF error
398 if ($db == -1) {
399 $db = check_txt_db($isis_db);
400 if ($db == -1) {
401 print STDERR "FATAL: OpenIsis can't open zero size file $isis_db\n";
402 next;
403 }
404 }
405
406 # OpenIsis::ERR_BADF
407 if ($db == -4) {
408 print STDERR "FATAL: OpenIsis can't find file $isis_db\n";
409 next;
410 # OpenIsis::ERR_IO
411 } elsif ($db == -5) {
412 print STDERR "FATAL: OpenIsis can't access file $isis_db\n";
413 next;
414 } elsif ($db < 0) {
415 print STDERR "FATAL: OpenIsis unknown error $db with file $isis_db\n";
416 next;
417 }
418
419 my $max_rowid = OpenIsis::maxRowid( $db );
420
421 # if 0 records, try to rease isis .txt database
422 if ($max_rowid == 0) {
423 # force removal of database
424 $db = check_txt_db($isis_db);
425 $max_rowid = OpenIsis::maxRowid( $db );
426 }
427
428 print STDERR "Reading database: $isis_db [$max_rowid rows]\n";
429
430 my $path = $database;
431
432 for (my $row_id = 1; $row_id <= $max_rowid; $row_id++ ) {
433 my $row = OpenIsis::read( $db, $row_id );
434 if ($row && $row->{mfn}) {
435
436 progress($row->{mfn}, $max_rowid);
437
438 my $swishpath = $path."#".int($row->{mfn});
439
440 if (my $xml = data2xml($type_base,$row,$add_xml,$cfg,$database)) {
441 $xml = $cp2utf->convert($xml);
442 use bytes; # as opposed to chars
443 print "Path-Name: $swishpath\n";
444 print "Content-Length: ".(length($xml)+1)."\n";
445 print "Document-Type: XML\n\n$xml\n";
446 }
447 }
448 }
449 # for this to work with current version of OpenIsis (0.9.0)
450 # you might need my patch from
451 # http://www.rot13.org/~dpavlin/projects/openisis-0.9.0-perl_close.diff
452 OpenIsis::close($db);
453 print STDERR "\n";
454
455 } elsif ($type_base eq "excel") {
456 use Spreadsheet::ParseExcel;
457 use Spreadsheet::ParseExcel::Utility qw(int2col);
458
459 $import2cp = Text::Iconv->new($config->{excel_codepage},$codepage);
460 my $excel_file = $cfg -> val($database, 'excel_file') || die "$database doesn't have 'excel_file' defined!";
461
462 my $sheet = x($config->{sheet}) || die "no sheet in $type.xml";
463 my $start_row = x($config->{start_row}) - 1 || die "no start_row in $type.xml";
464
465 my $oBook = Spreadsheet::ParseExcel::Workbook->Parse($excel_file) || die "can't open Excel file '$excel_file'";
466
467 my $sheet_nr = 0;
468 foreach my $oWks (@{$oBook->{Worksheet}}) {
469 #print STDERR "-- SHEET $sheet_nr:", $oWks->{Name}, "\n";
470 last if ($oWks->{Name} eq $sheet);
471 $sheet_nr++;
472 }
473
474 my $oWorksheet = $oBook->{Worksheet}[$sheet_nr];
475
476 print STDERR "using sheet: ",$oWorksheet->{Name},"\n";
477 defined ($oWorksheet) || die "can't find sheet '$sheet' in $excel_file";
478 my $end_row = x($config->{end_row}) || $oWorksheet->{MaxRow};
479
480 for(my $iR = $start_row ; defined $end_row && $iR <= $end_row ; $iR++) {
481 my $row;
482 for(my $iC = $oWorksheet->{MinCol} ; defined $oWorksheet->{MaxCol} && $iC <= $oWorksheet->{MaxCol} ; $iC++) {
483 my $cell = $oWorksheet->{Cells}[$iR][$iC];
484 if ($cell) {
485 $row->{int2col($iC)} = $cell->Value;
486 }
487 }
488
489 progress($iR, $end_row);
490
491 # print "row[$iR/$end_row] ";
492 # foreach (keys %{$row}) {
493 # print "$_: ",$row->{$_},"\t";
494 # }
495 # print "\n";
496
497 my $swishpath = $database."#".$iR;
498
499 next if (! $row);
500
501 if (my $xml = data2xml($type_base,$row,$add_xml,$cfg,$database)) {
502 $xml = $cp2utf->convert($xml);
503 use bytes; # as opposed to chars
504 print "Path-Name: $swishpath\n";
505 print "Content-Length: ".(length($xml)+1)."\n";
506 print "Document-Type: XML\n\n$xml\n";
507 }
508 }
509 } elsif ($type_base eq "marc") {
510
511 use MARC;
512
513 $import2cp = Text::Iconv->new($config->{marc_codepage},$codepage);
514 my $marc_file = $cfg -> val($database, 'marc_file') || die "$database doesn't have 'marc_file' defined!";
515
516 # optional argument is format
517 my $format = x($config->{format}) || 'usmarc';
518
519 print STDERR "Reading MARC file '$marc_file'\n";
520
521 my $marc = new MARC;
522 my $nr = $marc->openmarc({
523 file=>$marc_file, format=>$format
524 }) || die "Can't open MARC file '$marc_file'";
525
526 my $i=0; # record nr.
527
528 my $rec;
529
530 while ($marc->nextmarc(1)) {
531
532 # XXX
533 fakeprogress($i++);
534
535 my $swishpath = $database."#".$i;
536
537 if (my $xml = data2xml($type_base,$marc,$add_xml,$cfg,$database)) {
538 $xml = $cp2utf->convert($xml);
539 use bytes; # as opposed to chars
540 print "Path-Name: $swishpath\n";
541 print "Content-Length: ".(length($xml)+1)."\n";
542 print "Document-Type: XML\n\n$xml\n";
543 }
544 }
545 } elsif ($type_base eq "feed") {
546
547 $import2cp = Text::Iconv->new($config->{feed_codepage},$codepage);
548 my $prog = x($config->{prog}) || die "$database doesn't have 'prog' defined!";
549
550 print STDERR "Reading feed from program '$prog'\n";
551
552 open(FEED,"feeds/$prog |") || die "can't start $prog: $!";
553
554 my $i=1; # record nr.
555
556 my $data;
557 my $line=1;
558
559 while (<FEED>) {
560 chomp;
561
562 if (/^$/) {
563 my $swishpath = $database."#".$i++;
564
565 if (my $xml = data2xml($type_base,$data,$add_xml,$cfg,$database)) {
566 $xml = $cp2utf->convert($xml);
567 use bytes; # as opposed to chars
568 print "Path-Name: $swishpath\n";
569 print "Content-Length: ".(length($xml)+1)."\n";
570 print "Document-Type: XML\n\n$xml\n";
571 }
572 $line = 1;
573 $data = {};
574 next;
575 }
576
577 $line = $1 if (s/^(\d+):\s*//);
578 $data->{$line++} = $_;
579
580 fakeprogress($i);
581
582 }
583 }
584 }
585
586 # call this to commit index
587 $index->close;
588
589 1;
590 __END__
591 ##########################################################################
592
593 =head1 NAME
594
595 all2xml.pl - read various file formats and dump XML for SWISH-E
596
597 =head1 DESCRIPTION
598
599 This command will read ISIS data file using OpenIsis perl module, MARC
600 records using MARC module and optionally Micro$oft Excel files to
601 create one XML file for usage with I<SWISH-E> indexer. Dispite it's name,
602 this script B<isn't general xml generator> from isis files (isis allready
603 has something like that). Output of this script is tailor-made for SWISH-E.
604
605 =head1 BUGS
606
607 Documentation is really lacking. However, in true Open Source spirit, source
608 is best documentation. I even made considerable effort to comment parts
609 which are not intuitively clear, so...
610
611 =head1 AUTHOR
612
613 Dobrica Pavlinusic <dpavlin@rot13.org>
614
615 =head1 COPYRIGHT
616
617 GNU Public License (GPL) v2 or later
618
619 =head1 SEE ALSO
620
621 SWISH-E web site at http://www.swish-e.org
622
623 =cut

Properties

Name Value
cvs2svn:cvs-rev 1.45
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26