/[webpac]/branches/ffzg/all2xml.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /branches/ffzg/all2xml.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 5 - (show annotations)
Sat Jan 11 06:14:48 2003 UTC (16 years, 10 months ago) by dpavlin
Original Path: trunk/all2xml.pl
File MIME type: text/plain
File size: 5107 byte(s)
require 1.02 version of Text::Unaccent (1.01 can't pass 'make test' here!)

1 #!/usr/bin/perl -w
2
3 use strict;
4 use OpenIsis;
5 use Getopt::Std;
6 use Data::Dumper;
7 use XML::Simple;
8 use Text::Unaccent 1.02; # 1.01 won't compile on my platform,
9 require Unicode::Map8;
10
# Load the XML configuration.  With an undefined filename XML::Simple
# looks for a file named after this script with an ".xml" extension.
#   forcearray   - <isis> elements always become array refs, even when
#                  a field has only one of them
#   forcecontent - element text is always stored under the 'content' key
my $config = XMLin(undef, forcearray => [ 'isis' ], forcecontent => 1);

# Command line options:
#   -d dir        database directory below isis_data
#   -m db1,db2    comma separated list of databases to process
#   -q            quiet, don't print the progress indicator
my %opts;

getopts('d:m:q', \%opts);

my $db_dir = $opts{d} || "ps";	# FIX

#die "usage: $0 -d [database_dir] -m [database1,database2] " if (! %opts);

#print Dumper($config->{indexer});
#print "-" x 70,"\n";

# how to convert isis code page to UTF8?
# Fail with a useful message (instead of a bare die) when the character
# map for the configured code page can't be created.
my $isis_map = Unicode::Map8->new($config->{isis_codepage})
	or die "$0: can't create Unicode::Map8 character map for isis_codepage '"
		. (defined $config->{isis_codepage} ? $config->{isis_codepage} : '(undef)')
		. "'\n";
26
# isis_sf($row, $isis_id, $subfield)
#
# Return the value of the first occurrence of field $isis_id in $row.
#
#   $row      - hash ref of one record; each field id maps to an array
#               ref of occurrences
#   $isis_id  - field id (three digits in the indexer format)
#   $subfield - optional one character subfield code
#
# Without a subfield the whole field is returned with every "^x"
# subfield marker replaced by a single space.  With a subfield only
# that subfield is returned.  Returns nothing (false) when the field or
# the requested subfield is missing or has a false value.
sub isis_sf {
	my ($row, $isis_id, $subfield) = @_;

	# look the field up without autovivifying it in the caller's record
	my $occurrences = $row->{$isis_id} or return;
	my $value = $occurrences->[0] or return;

	if (! defined $subfield || length($subfield) == 0) {
		# subfield list undef, empty or no defined subfields for this record
		my $all_sf = $value;
		$all_sf =~ s/\^./ /g;	# nuke subfield definitions
		return $all_sf;
	}

	# only split the field into subfields when one was asked for;
	# the result is used as a hash ref keyed by subfield code
	my $sf = OpenIsis::subfields($value);
	return $sf->{$subfield} if ($sf && $sf->{$subfield});

	return;
}

# isis2xml($row)
#
# Convert one isis record into the XML document fed to the indexer.
#
# For every field configured in $config->{indexer} the list of <isis>
# format definitions is evaluated.  A format is a sequence of
# "NNNs" items (three digit field id, optional subfield character)
# separated by arbitrary non-digit text which is used as delimiter in
# the display value.  The 'type' attribute of a definition restricts it
# to "display" or "index"; anything else feeds both.
#
# Display data is converted from the isis code page to UTF-8, index
# data is passed through unac_string() to strip accents.
#
# Returns XML as a string, or nothing if the record produced no data.
sub isis2xml {

	my $row = shift @_;

	my $xml;

	foreach my $field (keys %{$config->{indexer}}) {

		my $display_data = "";
		my $index_data = "";

		foreach my $x (@{$config->{indexer}->{$field}->{isis}}) {

			my $display_tmp = "";
			my $index_tmp = "";

			# guard against missing content/type so -w stays quiet
			my $format = defined $x->{content} ? $x->{content} : "";
			my $type = defined $x->{type} ? lc($x->{type}) : "";

			my $i = ($type eq "display") ? 0 : 1;	# goes into index
			my $d = ($type eq "index") ? 0 : 1;	# goes into display

			# parse format
			# length() instead of plain truth test so that a trailing
			# "0" is reported as unparsed instead of silently dropped
			my $prefix = "";
			while (length($format)) {
				if ($format =~ s/^(\d\d\d)(\w?)//) {
					# copy captures before calling anything else
					my ($isis_id, $subfield) = ($1, $2);
					my $isis_tmp = isis_sf($row, $isis_id, $subfield);
					if ($isis_tmp) {
						$display_tmp .= $prefix . $isis_tmp if ($d);
						$index_tmp .= $isis_tmp." " if ($i);
					}
					# delimiter is used up even if the field was empty
					$prefix = "";
				} elsif ($format =~ s/^([^\d]+)//) {
					# literal text: delimiter for the next field (or suffix)
					$prefix = $1;
				} else {
					print STDERR "WARNING: unparsed format '$format'\n";
					last;
				}
			}
			# add suffix
			$display_tmp .= $prefix if ($display_tmp);

			$display_data .= $display_tmp;
			$index_data .= $index_tmp;

		}

		$xml->{$field}->{display} .= $isis_map->tou($display_data)->utf8 if ($display_data);
		$xml->{$field}->{index} .= unac_string($config->{isis_codepage},$index_data) if ($index_data);

	}

	return unless ($xml);

	return XMLout($xml, rootname => 'xml', noattr => 1 );
}
108
109 ##########################################################################
110
# Directories (below isis_data/$db_dir) which hold the databases.
my @isis_dirs = ( '.' );	# use dirname as database name

if ($opts{m}) {
	@isis_dirs = split(/,/,$opts{m});
}

# Databases looked for in every directory.  Each one lives in a
# sub-directory of the same name.  ARTI is deliberately not in the list:
# it was checked for but never added in the original code.
my @isis_db_names = qw(LIBRI PERI AMS);

my @isis_dbs;

foreach my $dir (@isis_dirs) {
	foreach my $name (@isis_db_names) {
		my $path = $config->{isis_data}."/$db_dir/$dir/$name";
		push @isis_dbs, "$path/$name" if (-e $path);
	}
}

# Nothing to do without a database: report it and exit with an error
# status instead of silently finishing with success.
if (! @isis_dbs) {
	print STDERR "FATAL: Can't find isis database.\nPerhaps isis_data (".$config->{isis_data}.") has wrong value?\n";
	exit 1;
}

foreach my $isis_db (@isis_dbs) {

	my $db = OpenIsis::open( $isis_db );
	# FIX
	# NOTE(review): the original check for a failed open ("! $db") was
	# disabled with "if (0)".  Presumably a false handle can be valid,
	# so a plain truth test is wrong -- confirm how OpenIsis::open
	# reports errors and emit
	#   "WARNING: can't open '$isis_db'\n"
	# followed by "next" for that case.

	my $max_rowid = OpenIsis::maxRowid( $db );

	print STDERR "Reading database: $isis_db [$max_rowid rows]\n";

	my $last_p = 0;		# last percentage shown

	foreach my $row_id (1 .. $max_rowid) {
		my $row = OpenIsis::read( $db, $row_id );
		next unless ($row && $row->{mfn});

		# output current process indicator, only when percentage changes
		my $p = int($row->{mfn} * 100 / $max_rowid);
		if ($p != $last_p) {
			printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$row->{mfn},$max_rowid,"=" x ($p/2).">", $p ) if (! $opts{q});
			$last_p = $p;
		}

		# one document per record: headers, empty line, then the XML
		# followed by a newline (hence the +1 in Content-Length)
		if (my $xml = isis2xml($row)) {
			print "Path-Name: $isis_db#".$row->{mfn}."\n";
			print "Content-Length: ".(length($xml)+1)."\n";
			print "Document-Type: XML\n\n$xml\n";
		}
	}
	# terminate the progress indicator line
	print STDERR "\n";
}
179
180
181 1;
182 __END__
183 ##########################################################################
184
185 =head1 NAME
186
187 all2xml.pl - read isis file and dump XML
188
189 =head1 DESCRIPTION
190
191 This command will read ISIS data file using OpenIsis perl module and
192 create XML file for usage with I<SWISH-E>
193 indexer. Despite its name, this script B<isn't a general xml generator>
194 from isis files (isis already has something like that). Output of this
195 script is tailor-made for SWISH-E.
196
197 =head1 AUTHOR
198
199 Dobrica Pavlinusic <dpavlin@rot13.org>
200
201 =head1 COPYRIGHT
202
203 GNU Public License (GPL) v2 or later
204
205 =head1 SEE ALSO
206
207 SWISH-E web site at http://www.swish-e.org
208
209 =cut

Properties

Name Value
cvs2svn:cvs-rev 1.4
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26