/[webpac]/trunk/all2xml.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/all2xml.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 13 - (hide annotations)
Sun Feb 16 22:41:37 2003 UTC (21 years, 1 month ago) by dpavlin
File MIME type: text/plain
File size: 6528 byte(s)
added configuration file with database descriptions,
moved isis.xml definition file in separate directory (in preparation for MARK),
support for different encodings in different files,
various fixes, improvements and badly written parts which will change ;-)

1 dpavlin 1 #!/usr/bin/perl -w
2    
3     use strict;
4     use OpenIsis;
5     use Getopt::Std;
6     use Data::Dumper;
7     use XML::Simple;
8 dpavlin 5 use Text::Unaccent 1.02; # 1.01 won't compile on my platform,
9 dpavlin 10 use Text::Iconv;
10 dpavlin 13 use Config::IniFiles;
11 dpavlin 1
# Turn on autoflush for the currently selected output handle.
12 dpavlin 10 $|=1;
13 dpavlin 9
# The configuration file name is derived from the script path ($0) by
# replacing a trailing ".pl" with ".conf"; the script aborts if that file
# does not exist.
14 dpavlin 13 my $config_file = $0;
15     $config_file =~ s/\.pl$/.conf/;
16     die "FATAL: can't find configuration file '$config_file'" if (! -e $config_file);
17    
# Per-database-type settings; assigned in the main loop from
# ./import_xml/<type>.xml and read by isis2xml().
18 dpavlin 10 my $config;
19    
20     use index_DBI; # there is no other, right now ;-)
21    
# Index handle shared by isis2xml() (insert) and the end of the script (close).
22     my $index = new index_DBI(); # open index
23    
24 dpavlin 1 my %opts;
25    
# NOTE(review): in the code shown only $opts{q} is ever read; -d, -m and -s
# are accepted by getopts but not acted upon -- confirm.
26 dpavlin 7 # usage:
27     # -d directory name
28     # -m multiple directories
29     # -q quiet
30     # -s run swish
31 dpavlin 1
32 dpavlin 7 getopts('d:m:qs', \%opts);
33    
# NOTE(review): $db_dir is declared here and passed to $index->insert() in
# isis2xml(), but nothing in the code shown assigns it -- confirm.
34 dpavlin 13 my $db_dir;
35 dpavlin 1
36 dpavlin 3 #die "usage: $0 -d [database_dir] -m [database1,database2] " if (! %opts);
37 dpavlin 1
38 dpavlin 3 #print Dumper($config->{indexer});
39     #print "-" x 70,"\n";
40 dpavlin 1
41 dpavlin 10 Text::Iconv->raise_error(1); # Conversion errors raise exceptions
42 dpavlin 3
# The converters below are file-level because isis2xml() uses them; three of
# them are (re)created for every database in the main loop, since their
# source encodings come from per-database configuration.
43 dpavlin 13 #my $isis_codepage = Text::Iconv->new($config->{isis_codepage},'UTF8');
44     #my $index_codepage = Text::Iconv->new($config->{isis_codepage},$config->{index_codepage});
45     my $isis_codepage;
46     my $index_codepage;
# Fixed UTF8 -> ISO8859-1 converter, applied to format strings in isis2xml().
47 dpavlin 10 my $cludge_codepage = Text::Iconv->new('UTF8','ISO8859-1');
48 dpavlin 13 my $xml_codepage;
49 dpavlin 10
# isis2xml($row, $add_xml)
#
# Build the XML text for one record.
#
#   $row     - record, passed unchanged to parse_format() for every format
#   $add_xml - optional text appended verbatim to the generated XML
#
# For each field under $config->{indexer}, every entry of the field's "isis"
# list supplies a format string (its "content") and an optional "type":
#   "display" -> display text only
#   "swish"   -> swish text only
#   "index"   -> index text only
#   otherwise -> both swish and display text
#
# Side effect: accumulated index text is passed to $index->insert().
#
# Returns "<xml>\n...</xml>\n" when any XML was produced; otherwise a bare
# return (undef in scalar context).
50 dpavlin 3 sub isis2xml {
51    
# "use" is processed at compile time, so its position inside the sub body
# does not delay or scope the module load.
52 dpavlin 10 use xmlify;
53    
54 dpavlin 3 my $row = shift @_;
55 dpavlin 13 my $add_xml = shift @_;
56 dpavlin 3
57     my $xml;
58    
59 dpavlin 10 use parse_format;
60 dpavlin 3
61 dpavlin 13 my $html = ""; # html formatted display output
62 dpavlin 10
63 dpavlin 13 my %field_usage; # counter for usage of each field
64    
65 dpavlin 3 foreach my $field (keys %{$config->{indexer}}) {
66    
# NOTE(review): %field_usage is local to this call and each hash key is
# visited once, so this counter is always 1 when tested below; the
# name_plural branch is reached only when name_singular is unset -- confirm
# that this is the intent.
67 dpavlin 13 $field_usage{$field}++;
68    
69 dpavlin 10 my $swish_data = "";
70 dpavlin 3 my $display_data = "";
71     my $index_data = "";
72    
73     foreach my $x (@{$config->{indexer}->{$field}->{isis}}) {
74    
75     my $format = $x->{content};
# $s / $d / $i select the swish, display and index accumulators respectively.
76 dpavlin 10 my ($s,$d,$i) = (1,1,0); # swish, display default
77 dpavlin 9 $s = 0 if (lc($x->{type}) eq "display");
78     $d = 0 if (lc($x->{type}) eq "swish");
79     ($s,$d,$i) = (0,0,1) if (lc($x->{type}) eq "index");
80     #print STDERR "## s: $s d: $d i: $i ## $format ##\n";
81 dpavlin 3
# The format string itself (not the record data) is converted from UTF8 to
# ISO8859-1 before being parsed; see the explanation a few lines below.
82 dpavlin 10 $format = $cludge_codepage->convert($format);
83     my ($swish,$display) = parse_format($format,$row);
84     #print STDERR "s: $swish\nd: $display\n" if ($swish);
85 dpavlin 3
86 dpavlin 10 #print STDERR "swish: $swish<-- display: $display<--\n";
87     # FIX: this is ugly, UGLY, cludge: OpenIsis return
88     # UTF8 encoding of strings, but as if source charset
89     # is ISO8859-1 and not some other. This breaks our
90     # isis character encoding, so we convert it first
91     # back to ISO8859-1 (which can actually be different
92     # encoding in isis)
93    
# Empty or false results are skipped. Index text is taken from the display
# value, not the swish value.
94     $swish_data .= $swish if ($s && $swish);
95     $display_data .= $display if ($d && $display);
96     $index_data .= $display if ($i && $display);
97 dpavlin 3 }
98 dpavlin 9
99 dpavlin 13
100 dpavlin 10 #print STDERR "s_d: $swish_data\nd_d: $display_data\n" if ($swish_data);
101     if ($display_data) {
# Any false value returned by convert() is treated as a fatal error.
102 dpavlin 13 $display_data = $isis_codepage->convert($display_data) || die "Can't convert '$display_data' !";
103     # FIX: this is removed and replaced by html tag.
104     #$xml .= xmlify($field."_display", $display_data);
105    
# "headline" gets its own element; every other field is appended to the
# html buffer as <name>#-#<data>###\n.
106     if ($field eq "headline") {
107     $xml .= xmlify("headline", $display_data);
108     } else {
109    
110     # find field name (singular, plural)
111     my $field_name = "";
112     if ($config->{indexer}->{$field}->{name_singular} && $field_usage{$field} == 1) {
113     $field_name = $config->{indexer}->{$field}->{name_singular}."#-#";
114     } elsif ($config->{indexer}->{$field}->{name_plural}) {
115     $field_name = $config->{indexer}->{$field}->{name_plural}."#-#";
116     } else {
117     $field_name = $config->{indexer}->{$field}->{name}."#-#";
118     }
# NOTE(review): every branch above appends "#-#", so $field_name is never
# empty and this test is always true.
119     if ($field_name) {
120     $html .= $xml_codepage->convert($field_name);
121     }
122     $html .= $display_data."###\n";
123     }
124 dpavlin 10 }
125     if ($swish_data) {
# Swish text is converted to ISO8859-2 and passed through unac_string()
# before being emitted as <field>_swish.
# NOTE(review): a new Text::Iconv object is constructed every time this
# branch runs, i.e. per field per record.
126     my $i = Text::Iconv->new($config->{isis_codepage},'ISO8859-2');
127     $swish_data = $i->convert($swish_data);
128     $xml .= xmlify($field."_swish",unac_string('ISO8859-2',$swish_data));
129     #$swish_data = $isis_codepage->convert($swish_data)."##" || $swish_data;
130     #$xml .= xmlify($field."_swish",unac_string($config->{isis_codepage},$swish_data));
131     }
132    
133 dpavlin 9 # index
# The second half of this condition is redundant: a true string is never "".
134     if ($index_data && $index_data ne "") {
# Falls back to the unconverted text when convert() returns a false value.
135 dpavlin 10 $index_data = $index_codepage->convert($index_data) || $index_data;
# NOTE(review): $db_dir is the file-level variable, which is not assigned
# anywhere in the code shown -- confirm what insert() expects here.
136     $index->insert($field, $index_data, $db_dir);
137 dpavlin 9 }
138    
139 dpavlin 3 }
140 dpavlin 13
141     # dump formatted output in <html>
142     if ($html) {
143     $xml .= xmlify("html",$html);
144     }
145    
# $add_xml is appended only when some XML was generated for this record.
146 dpavlin 3 if ($xml) {
147 dpavlin 10 #print STDERR "x: $xml\n";
148 dpavlin 13 $xml .= $add_xml if ($add_xml);
149 dpavlin 10 return "<xml>\n$xml</xml>\n";
150 dpavlin 3 } else {
151     return;
152     }
153     }
154    
155     ##########################################################################
156    
# Main program: every section of the INI configuration file describes one
# database to export. The keys read per section are isis_db, type, xml and
# xml_codepage.
157 dpavlin 13 my $cfg = new Config::IniFiles( -file => $config_file );
158 dpavlin 1
159 dpavlin 13 foreach my $database ($cfg->Sections) {
160 dpavlin 1
161 dpavlin 13 my $isis_db = $cfg -> val($database, 'isis_db');
162     my $type = $cfg -> val($database, 'type');
163     my $add_xml = $cfg -> val($database, 'xml');
164 dpavlin 1
# Load the field definitions for this database type into the file-level
# $config and rebuild the converters that depend on its code pages;
# isis2xml() reads all of these as globals.
165 dpavlin 13 # read configuration for this type
166     $config=XMLin("./import_xml/$type.xml", forcearray => [ 'isis' ], forcecontent => 1);
167     $isis_codepage = Text::Iconv->new($config->{isis_codepage},'UTF8');
168     $index_codepage = Text::Iconv->new($config->{isis_codepage},$config->{index_codepage});
169     $xml_codepage = Text::Iconv->new($cfg->val($database,'xml_codepage'),'UTF8');
170 dpavlin 1
171 dpavlin 3 my $db = OpenIsis::open( $isis_db );
# NOTE(review): the open-failure check is disabled -- the condition is the
# constant 0, so the warning and "next" never run. The intended test is the
# commented-out "if (! $db )" below.
172     if (0) {
173     # # FIX
174     # if (! $db ) {
175     print STDERR "WARNING: can't open '$isis_db'\n";
176     next ;
177     }
178    
179 dpavlin 1 my $max_rowid = OpenIsis::maxRowid( $db );
180    
181 dpavlin 3 print STDERR "Reading database: $isis_db [$max_rowid rows]\n";
182 dpavlin 1
# The INI section name (not the database path) is used in Path-Name headers.
183 dpavlin 13 my $path = $database; # was $isis_db
184 dpavlin 10
# Last percentage shown on the progress line.
185 dpavlin 3 my $last_p = 0;
186    
187 dpavlin 10 # { my $row_id = 4514;
188 dpavlin 3 # FIX
189 dpavlin 1 for (my $row_id = 1; $row_id <= $max_rowid; $row_id++ ) {
190     my $row = OpenIsis::read( $db, $row_id );
# Rows that could not be read, or that have no true mfn, are skipped.
191 dpavlin 3 if ($row && $row->{mfn}) {
192 dpavlin 10 #print STDERR "mfn: ",$row->{mfn},"\n";
193 dpavlin 3 # output current process indicator
# $p is the integer percentage mfn / $max_rowid. The progress line is
# redrawn on STDERR only when $p changes and -q was not given; it ends in
# "\r" so each redraw overwrites the previous one.
194     my $p = int($row->{mfn} * 100 / $max_rowid);
195     if ($p != $last_p) {
196     printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$row->{mfn},$max_rowid,"=" x ($p/2).">", $p ) if (! $opts{q});
197     $last_p = $p;
198     }
199    
# One document per record is written to STDOUT: three header lines, a blank
# line, then the XML. Content-Length is length($xml)+1 because a newline is
# printed after $xml.
# NOTE(review): presumably this is the input format SWISH-E expects (see the
# POD below) -- confirm against the SWISH-E documentation.
200 dpavlin 13 if (my $xml = isis2xml($row,$add_xml)) {
201 dpavlin 10 #print STDERR "--ret-->$xml\n";
202     print "Path-Name: $path#".int($row->{mfn})."\n";
203     print "Content-Length: ".(length($xml)+1)."\n";
204     print "Document-Type: XML\n\n$xml\n";
205 dpavlin 3 }
206 dpavlin 1 }
207     }
# Move past the "\r"-terminated progress line.
208 dpavlin 3 print STDERR "\n";
209 dpavlin 1 }
210 dpavlin 3
211 dpavlin 10 # call this to commit index
212     $index->close;
213 dpavlin 3
214     1;
215     __END__
216     ##########################################################################
217    
218     =head1 NAME
219    
220     all2xml.pl - read isis file and dump XML
221    
222     =head1 DESCRIPTION
223    
224     This command will read ISIS data file using OpenIsis perl module and
225     create XML file for usage with I<SWISH-E>
226     indexer. Despite its name, this script B<isn't a general xml generator>
227     from isis files (isis already has something like that). Output of this
228     script is tailor-made for SWISH-E.
229    
230     =head1 AUTHOR
231    
232     Dobrica Pavlinusic <dpavlin@rot13.org>
233    
234     =head1 COPYRIGHT
235    
236     GNU Public License (GPL) v2 or later
237    
238     =head1 SEE ALSO
239    
240     SWISH-E web site at http://www.swish-e.org
241    
242     =cut

Properties

Name Value
cvs2svn:cvs-rev 1.8
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26