/[webpac]/branches/tehnika/all2xml.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /branches/tehnika/all2xml.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 13 - (show annotations)
Sun Feb 16 22:41:37 2003 UTC (16 years, 8 months ago) by dpavlin
Original Path: trunk/all2xml.pl
File MIME type: text/plain
File size: 6528 byte(s)
added configuration file with database descriptions,
moved isis.xml definition file in separate directory (in preparation for MARK),
support for different encodings in different files,
various fixes, improvements and badly written parts which will change ;-)

1 #!/usr/bin/perl -w
2
3 use strict;
4 use OpenIsis;
5 use Getopt::Std;
6 use Data::Dumper;
7 use XML::Simple;
8 use Text::Unaccent 1.02; # 1.01 won't compile on my platform,
9 use Text::Iconv;
10 use Config::IniFiles;
11
# Turn on autoflush for the currently selected output handle so the record
# stream is written as soon as it is produced.
$| = 1;

# The configuration file is expected next to the script, under the same
# name with the .pl suffix replaced by .conf.
my $config_file = $0;
$config_file =~ s/\.pl$/.conf/;
die "FATAL: can't find configuration file '$config_file'" if (! -e $config_file);

# Parsed ./import_xml/<type>.xml for the database currently being
# processed; assigned in the main loop and read by isis2xml().
my $config;

use index_DBI;		# there is no other, right now ;-)

# Direct class-method call instead of the indirect-object form
# "new index_DBI()", which perl can mis-parse.
my $index = index_DBI->new();	# open index

my %opts;

# usage:
#	-d directory name
#	-m multiple directories
#	-q quiet
#	-s run swish
#
# NOTE(review): only -q is consulted in the code visible in this file;
# -d, -m and -s are parsed but not acted upon here.

getopts('d:m:qs', \%opts);

# NOTE(review): $db_dir is passed to $index->insert() in isis2xml() but is
# not assigned anywhere in the visible code, so it is undef there --
# confirm whether it should carry the database path.
my $db_dir;

#die "usage: $0 -d [database_dir] -m [database1,database2] " if (! %opts);

#print Dumper($config->{indexer});
#print "-" x 70,"\n";

Text::Iconv->raise_error(1);	# Conversion errors raise exceptions

# Character set converters. The first three are (re)created in the main
# loop for every configured database, because each database can declare
# its own encodings.
#my $isis_codepage = Text::Iconv->new($config->{isis_codepage},'UTF8');
#my $index_codepage = Text::Iconv->new($config->{isis_codepage},$config->{index_codepage});
my $isis_codepage;	# isis_codepage -> UTF8
my $index_codepage;	# isis_codepage -> index_codepage
my $xml_codepage;	# xml_codepage (from the .conf section) -> UTF8
# Fixed UTF8 -> ISO8859-1 converter applied to format strings in isis2xml().
my $cludge_codepage = Text::Iconv->new('UTF8','ISO8859-1');
49
# isis2xml($row, $add_xml)
#
# Build the XML document for one ISIS record.
#
# For every field listed under $config->{indexer} the <isis> format
# entries are expanded with parse_format() and collected into up to three
# strings, depending on the (optional) "type" attribute of each entry:
#
#	(no type)	swish and display
#	display		display only
#	swish		swish only
#	index		index only
#
# Display data is converted with $isis_codepage and emitted either as a
# <headline> element (for the field named "headline") or appended to the
# <html> element as "<label>#-#<data>###\n". Swish data is converted to
# ISO8859-2, unaccented and emitted as <FIELD_swish>. Index data is
# converted with $index_codepage and handed to $index->insert().
#
# Parameters:
#	$row		record as returned by OpenIsis::read()
#	$add_xml	optional XML fragment appended verbatim before </xml>
#
# Returns the document as "<xml>\n...</xml>\n", or nothing (bare return)
# when the record produced no XML at all.
#
# Relies on the file-scoped $config, $index, $db_dir and the
# *_codepage converters being set up by the caller.
sub isis2xml {

	use xmlify;
	use parse_format;

	my $row = shift @_;
	my $add_xml = shift @_;

	my $xml;
	my $html = "";		# html formatted display output

	# counter for usage of each field
	# NOTE(review): this hash is reset on every call and each key is
	# visited once, so the counter is always 1 and name_singular wins
	# whenever it is configured -- confirm that is the intent.
	my %field_usage;

	# isis_codepage -> ISO8859-2 converter for swish data; created on
	# first use and shared by all fields of this record instead of being
	# rebuilt for every field.
	my $swish_codepage;

	foreach my $field (keys %{$config->{indexer}}) {

		$field_usage{$field}++;

		my $field_conf = $config->{indexer}->{$field};

		my $swish_data = "";
		my $display_data = "";
		my $index_data = "";

		foreach my $x (@{$field_conf->{isis}}) {

			my $format = $x->{content};

			# "type" is optional; guard it so that a missing
			# attribute does not trigger an uninitialized-value
			# warning under -w.
			my $type = defined($x->{type}) ? lc($x->{type}) : "";

			my ($s,$d,$i) = (1,1,0);	# swish, display default
			$s = 0 if ($type eq "display");
			$d = 0 if ($type eq "swish");
			($s,$d,$i) = (0,0,1) if ($type eq "index");
			#print STDERR "## s: $s d: $d i: $i ## $format ##\n";

			# FIX: this is ugly, UGLY, cludge: OpenIsis return
			# UTF8 encoding of strings, but as if source charset
			# is ISO8859-1 and not some other. This breaks our
			# isis character encoding, so we convert it first
			# back to ISO8859-1 (which can actually be different
			# encoding in isis)
			$format = $cludge_codepage->convert($format);
			my ($swish,$display) = parse_format($format,$row);
			#print STDERR "swish: $swish<-- display: $display<--\n";

			$swish_data .= $swish if ($s && $swish);
			$display_data .= $display if ($d && $display);
			$index_data .= $display if ($i && $display);
		}

		#print STDERR "s_d: $swish_data\nd_d: $display_data\n" if ($swish_data);
		if ($display_data) {
			my $converted = $isis_codepage->convert($display_data);
			die "Can't convert '$display_data' !" if (! $converted);
			$display_data = $converted;
			# FIX: this is removed and replaced by html tag.
			#$xml .= xmlify($field."_display", $display_data);

			if ($field eq "headline") {
				$xml .= xmlify("headline", $display_data);
			} else {

				# find field name (singular, plural)
				my $label;
				if ($field_conf->{name_singular} && $field_usage{$field} == 1) {
					$label = $field_conf->{name_singular};
				} elsif ($field_conf->{name_plural}) {
					$label = $field_conf->{name_plural};
				} else {
					$label = $field_conf->{name};
				}
				# No name configured at all: keep emitting the
				# bare "#-#" separator, but without concatenating
				# an undefined value.
				$label = "" if (! defined($label));

				$html .= $xml_codepage->convert($label."#-#");
				$html .= $display_data."###\n";
			}
		}

		if ($swish_data) {
			$swish_codepage ||= Text::Iconv->new($config->{isis_codepage},'ISO8859-2');
			$swish_data = $swish_codepage->convert($swish_data);
			$xml .= xmlify($field."_swish",unac_string('ISO8859-2',$swish_data));
			#$swish_data = $isis_codepage->convert($swish_data)."##" || $swish_data;
			#$xml .= xmlify($field."_swish",unac_string($config->{isis_codepage},$swish_data));
		}

		# index
		if ($index_data) {
			# fall back to the unconverted string if conversion
			# yields a false value
			$index_data = $index_codepage->convert($index_data) || $index_data;
			# NOTE(review): $db_dir is never assigned in the visible
			# code, so undef is passed here -- confirm.
			$index->insert($field, $index_data, $db_dir);
		}

	}

	# dump formatted output in <html>
	if ($html) {
		$xml .= xmlify("html",$html);
	}

	if ($xml) {
		#print STDERR "x: $xml\n";
		$xml .= $add_xml if ($add_xml);
		return "<xml>\n$xml</xml>\n";
	} else {
		return;
	}
}
154
155 ##########################################################################
156
# Main program: walk every section of the .conf file; each section
# describes one ISIS database. For each database, load its import
# definition, set up the character set converters, then read all rows and
# print every record that yields XML to STDOUT, preceded by Path-Name,
# Content-Length and Document-Type header lines.

# Config::IniFiles->new returns undef when the file cannot be parsed;
# report the module's own diagnostics instead of failing later on a
# method call against undef.
my $cfg = Config::IniFiles->new( -file => $config_file )
	or die "FATAL: can't parse configuration file '$config_file': "
		. join("; ", @Config::IniFiles::errors) . "\n";

foreach my $database ($cfg->Sections) {

	my $isis_db = $cfg->val($database, 'isis_db');
	my $type = $cfg->val($database, 'type');
	my $add_xml = $cfg->val($database, 'xml');

	# Without a type there is no import definition to load; fail with a
	# message that names the offending section.
	die "FATAL: section [$database] in '$config_file' has no 'type'\n"
		if (! defined($type));

	# read configuration for this type
	$config = XMLin("./import_xml/$type.xml", forcearray => [ 'isis' ], forcecontent => 1);
	$isis_codepage = Text::Iconv->new($config->{isis_codepage},'UTF8');
	$index_codepage = Text::Iconv->new($config->{isis_codepage},$config->{index_codepage});
	$xml_codepage = Text::Iconv->new($cfg->val($database,'xml_codepage'),'UTF8');

	my $db = OpenIsis::open( $isis_db );
	# NOTE(review): the open-failure check was disabled in the original
	# (wrapped in "if (0)", marked FIX) and stays disabled here. The
	# intended code was:
	#	if (! $db ) {
	#		print STDERR "WARNING: can't open '$isis_db'\n";
	#		next ;
	#	}
	# Confirm the return convention of OpenIsis::open before enabling it;
	# "! $db" rejects a handle of 0.

	my $max_rowid = OpenIsis::maxRowid( $db );

	print STDERR "Reading database: $isis_db [$max_rowid rows]\n";

	my $path = $database;	# was $isis_db

	my $last_p = 0;		# last percentage shown by the progress bar

	for my $row_id (1 .. $max_rowid) {
		my $row = OpenIsis::read( $db, $row_id );
		next if (! ($row && $row->{mfn}));

		#print STDERR "mfn: ",$row->{mfn},"\n";
		# output current process indicator, redrawn only when the
		# integer percentage changes
		my $p = int($row->{mfn} * 100 / $max_rowid);
		if ($p != $last_p) {
			printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$row->{mfn},$max_rowid,"=" x ($p/2).">", $p ) if (! $opts{q});
			$last_p = $p;
		}

		if (my $xml = isis2xml($row,$add_xml)) {
			#print STDERR "--ret-->$xml\n";
			print "Path-Name: $path#".int($row->{mfn})."\n";
			# +1 accounts for the newline printed after $xml below
			print "Content-Length: ".(length($xml)+1)."\n";
			print "Document-Type: XML\n\n$xml\n";
		}
	}
	print STDERR "\n";
}

# call this to commit index
$index->close;

1;
215 __END__
216 ##########################################################################
217
218 =head1 NAME
219
220 all2xml.pl - read isis file and dump XML
221
222 =head1 DESCRIPTION
223
224 This command will read an ISIS data file using the OpenIsis perl module and
225 create an XML file for usage with the I<SWISH-E>
226 indexer. Despite its name, this script B<isn't a general xml generator>
227 from isis files (isis already has something like that). Output of this
228 script is tailor-made for SWISH-E.
229
230 =head1 AUTHOR
231
232 Dobrica Pavlinusic <dpavlin@rot13.org>
233
234 =head1 COPYRIGHT
235
236 GNU Public License (GPL) v2 or later
237
238 =head1 SEE ALSO
239
240 SWISH-E web site at http://www.swish-e.org
241
242 =cut

Properties

Name Value
cvs2svn:cvs-rev 1.8
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26