/[webpac]/trunk/all2xml.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/all2xml.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3 - (show annotations)
Sat Nov 30 00:36:34 2002 UTC (21 years, 3 months ago) by dpavlin
File MIME type: text/plain
File size: 4996 byte(s)
first really working version -- creates xml file for swish + swish config

1 #!/usr/bin/perl -w
2
3 use strict;
4 use OpenIsis;
5 use Getopt::Std;
6 use Data::Dumper;
7 use XML::Simple;
8 use Text::Unaccent;
9 require Unicode::Map8;
10
# Load the indexer configuration.  With undef as the file name XML::Simple
# looks for an XML file named after this script (same base name, ".xml"
# extension).  Every <isis> element is forced into an array and element
# text is always stored under the 'content' key, so isis2xml can treat all
# field definitions the same way.
my $config=XMLin(undef, forcearray => [ 'isis' ], forcecontent => 1);

# Command line options:
#   -d dir       database directory below isis_data (default: "ps")
#   -m db1,db2   comma separated list of databases to read
#   -q           quiet, don't print the progress indicator
my %opts;

getopts('d:m:q', \%opts);

my $db_dir = $opts{d} || "ps";	# FIX

#die "usage: $0 -d [database_dir] -m [database1,database2] " if (! %opts);

#print Dumper($config->{indexer});
#print "-" x 70,"\n";

# Character map used to convert text from the ISIS code page to UTF-8.
# Fail with a message that names the offending configuration value instead
# of a bare "Died".
my $isis_map = Unicode::Map8->new($config->{isis_codepage})
    or die "FATAL: can't create Unicode::Map8 mapping for isis_codepage '"
        . (defined $config->{isis_codepage} ? $config->{isis_codepage} : '')
        . "'\n";
26
# Return the value of one ISIS field, or of one of its subfields.
#
#   $row      - record hash; each field id maps to an array ref of
#               occurrences
#   $isis_id  - field id to look up
#   $subfield - optional subfield code; when undef or empty the whole
#               field is returned
#
# Only the first occurrence of the field is used.  Returns nothing (undef
# in scalar context) when the field or the requested subfield is missing
# or empty.  A value of "0" counts as data.
sub isis_sf {
    my ($row, $isis_id, $subfield) = @_;

    # test with exists first so that a missing field doesn't autovivify
    # an empty array inside the caller's record
    return unless (exists $row->{$isis_id});

    my $value = $row->{$isis_id}->[0];
    return unless (defined $value && $value ne "");

    # no subfield requested: hand back the whole field
    return $value if (! defined $subfield || length($subfield) == 0);

    # split into subfields only when one is actually needed
    my $sf = OpenIsis::subfields($value);
    if ($sf && defined $sf->{$subfield} && $sf->{$subfield} ne "") {
        return $sf->{$subfield};
    }
    return;
}

# Convert one ISIS record into an XML document.
#
#   $row - record hash as passed on to isis_sf
#
# For every field configured under $config->{indexer} the <isis> format
# definitions are evaluated against the record.  A format is a sequence of
# three-digit field ids, each optionally followed by one subfield
# character, separated by literal text.  Literal text in front of a field
# is emitted as its prefix only when that field has data; text after the
# last field is appended as a suffix only when something was emitted.
#
# A definition with type "display" contributes only to the display text,
# one with type "index" only to the index text, anything else to both.
# Display text is converted to UTF-8 through $isis_map, index text is
# passed through unac_string.
#
# Returns the XML string, or nothing if the record produced no data.
sub isis2xml {
    my $row = shift @_;

    my $xml;

    foreach my $field (keys %{$config->{indexer}}) {

        my $display_data = "";
        my $index_data = "";

        foreach my $x (@{$config->{indexer}->{$field}->{isis}}) {

            my $display_tmp = "";
            my $index_tmp = "";

            # both the content and the type attribute may be absent
            my $format = defined $x->{content} ? $x->{content} : "";
            my $type = defined $x->{type} ? lc($x->{type}) : "";

            my $i = ($type eq "display") ? 0 : 1;	# goes into index
            my $d = ($type eq "index") ? 0 : 1;	# goes into display

            # parse format; every branch either consumes input or
            # leaves the loop, so this always terminates
            my $prefix = "";
            while (length($format)) {
                if ($format =~ s/^(\d\d\d)(\w?)//) {
                    my $isis_tmp = isis_sf($row,$1,$2);
                    if (defined $isis_tmp && $isis_tmp ne "") {
                        $display_tmp .= $prefix . $isis_tmp if ($d);
                        $index_tmp .= $isis_tmp." " if ($i);
                    }
                    # prefix belongs to this field only
                    $prefix = "";
                } elsif ($format =~ s/^([^\d]+)//) {
                    $prefix = $1;
                } else {
                    print STDERR "WARNING: unparsed format '$format'\n";
                    last;
                }
            }
            # add suffix
            $display_tmp .= $prefix if ($display_tmp ne "");

            $display_data .= $display_tmp;
            $index_data .= $index_tmp;
        }

        $xml->{$field}->{display} .= $isis_map->tou($display_data)->utf8
            if ($display_data ne "");
        $xml->{$field}->{index} .= unac_string($config->{isis_codepage},$index_data)
            if ($index_data ne "");
    }

    return XMLout($xml, rootname => 'xml', noattr => 1 ) if ($xml);
    return;
}
106
107 ##########################################################################
108
# Main program: find the ISIS databases, read every record and print it
# to STDOUT as an XML document preceded by Path-Name, Content-Length and
# Document-Type headers.  Progress and diagnostics go to STDERR.

my @isis_dirs = ( '.' );	# use dirname as database name

if ($opts{m}) {
    @isis_dirs = split(/,/,$opts{m});
}

# Collect the databases present below every directory.  A database of
# type X lives in <isis_data>/<db_dir>/<dir>/X and is opened as .../X/X.
my @isis_dbs;

foreach my $dir (@isis_dirs) {
    foreach my $type (qw(LIBRI PERI AMS)) {
        my $path = $config->{isis_data}."/$db_dir/$dir/$type";
        push @isis_dbs, "$path/$type" if (-e $path);
    }
}

# nothing to index is a real failure: stop with a non-zero exit status
die "FATAL: Can't find isis database.\nPerhaps isis_data (".$config->{isis_data}.") has wrong value?\n" if (! @isis_dbs);

foreach my $isis_db (@isis_dbs) {

    my $db = OpenIsis::open( $isis_db );
    # FIX: this failure check is disabled.
    # NOTE(review): a plain "! $db" test would reject a handle of 0, which
    # may be why it was switched off -- confirm what OpenIsis::open
    # returns on error and re-enable the check accordingly.
    if (0) {
        print STDERR "WARNING: can't open '$isis_db'\n";
        next ;
    }

    my $max_rowid = OpenIsis::maxRowid( $db );

    print STDERR "Reading database: $isis_db [$max_rowid rows]\n";

    my $last_p = 0;

    foreach my $row_id (1 .. $max_rowid) {
        my $row = OpenIsis::read( $db, $row_id );
        next unless ($row && $row->{mfn});

        # output current process indicator, only when the percentage changes
        my $p = int($row->{mfn} * 100 / $max_rowid);
        if ($p != $last_p) {
            printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$row->{mfn},$max_rowid,"=" x ($p/2).">", $p ) if (! $opts{q});
            $last_p = $p;
        }

        if (my $xml = isis2xml($row)) {
            print "Path-Name: $isis_db#".$row->{mfn}."\n";
            # +1 accounts for the newline printed after the document
            print "Content-Length: ".(length($xml)+1)."\n";
            print "Document-Type: XML\n\n$xml\n";
        }
    }
    # finish the progress line
    print STDERR "\n";
}
177
178
179 1;
180 __END__
181 ##########################################################################
182
183 =head1 NAME
184
185 all2xml.pl - read isis file and dump XML
186
187 =head1 DESCRIPTION
188
189 This command will read ISIS data file using OpenIsis perl module and
190 create XML file for usage with I<SWISH-E>
191 indexer. Despite its name, this script B<isn't a general XML generator>
192 from isis files (isis already has something like that). Output of this
193 script is tailor-made for SWISH-E.
194
195 =head1 AUTHOR
196
197 Dobrica Pavlinusic <dpavlin@rot13.org>
198
199 =head1 COPYRIGHT
200
201 GNU General Public License (GPL) v2 or later
202
203 =head1 SEE ALSO
204
205 SWISH-E web site at http://www.swish-e.org
206
207 =cut

Properties

Name Value
cvs2svn:cvs-rev 1.2
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26