/[webpac]/trunk/all2xml.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/all2xml.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3 - (hide annotations)
Sat Nov 30 00:36:34 2002 UTC (21 years, 3 months ago) by dpavlin
File MIME type: text/plain
File size: 4996 byte(s)
first really working version -- creates xml file for swish + swish config

1 dpavlin 1 #!/usr/bin/perl -w
2    
3     use strict;
4     use OpenIsis;
5     use Getopt::Std;
6     use Data::Dumper;
7     use XML::Simple;
8 dpavlin 3 use Text::Unaccent;
9     require Unicode::Map8;
10 dpavlin 1
# Load the configuration. Passing undef as the file name lets XML::Simple
# pick its default configuration file (see the XML::Simple documentation
# for the lookup rules). Every <isis> element is forced into an array and
# element text is always stored under the 'content' key.
my $config = XMLin(undef, forcearray => [ 'isis' ], forcecontent => 1);

# Command line options:
#   -d dir    database directory below isis_data (defaults to "ps")
#   -m a,b,c  comma separated list of sub-directories to scan
#   -q        quiet: do not draw the progress indicator
my %opts;

getopts('d:m:q', \%opts);

my $db_dir = $opts{d} || "ps";	# FIX

#die "usage: $0 -d [database_dir] -m [database1,database2] " if (! %opts);

#print Dumper($config->{indexer});
#print "-" x 70,"\n";

# how to convert isis code page to UTF8?
# Unicode::Map8->new returns a false value when the mapping cannot be
# loaded; say which code page was requested instead of dying silently.
my $isis_map = Unicode::Map8->new($config->{isis_codepage})
	|| die "FATAL: can't load Unicode::Map8 mapping for isis_codepage '"
		. (defined $config->{isis_codepage} ? $config->{isis_codepage} : '')
		. "'\n";

26    
# isis_sf($row, $isis_id, $subfield)
#
# Return the first occurrence of field $isis_id from record $row.
# If $subfield is undef or empty the whole field value is returned,
# otherwise only that subfield (as parsed by OpenIsis::subfields).
# Returns nothing (undef in scalar context) when the field or subfield
# is missing or has a false value.
#
# Kept as a package-level sub: named subs declared inside another sub are
# installed at package scope anyway, so nesting it was only misleading.
sub isis_sf {
	my ($row, $isis_id, $subfield) = @_;

	# test with exists() first so that a missing field is not
	# autovivified into the record as an empty array
	return unless (exists $row->{$isis_id});

	my $value = $row->{$isis_id}->[0];
	return unless ($value);

	# subfield list undef or empty: caller wants the whole field
	return $value if (! defined $subfield || length($subfield) == 0);

	# parse subfields only when one was actually requested
	my $sf = OpenIsis::subfields($value);
	return $sf->{$subfield} if ($sf && $sf->{$subfield});

	return;
}

# isis2xml($row)
#
# Convert one ISIS record into the XML fragment fed to the indexer.
#
# For every field configured under $config->{indexer} the <isis> format
# strings are expanded. A format is a sequence of three-digit field
# numbers, each optionally followed by a one-character subfield code,
# separated by literal text. Literal text in front of a field is emitted
# only if that field has a value; trailing literal text is appended only
# if anything was produced for that format.
#
# Elements with type "display" feed only the display string, type "index"
# only the index string, anything else feeds both. Display data is
# converted to UTF-8 through $isis_map, index data is passed through
# unac_string().
#
# Returns the XML string, or nothing if no configured field produced data.
sub isis2xml {

	my $row = shift @_;

	my $xml;

	foreach my $field (keys %{$config->{indexer}}) {

		my $display_data = "";
		my $index_data = "";

		foreach my $x (@{$config->{indexer}->{$field}->{isis}}) {

			my $display_tmp = "";
			my $index_tmp = "";

			# a missing type attribute or empty element must not
			# produce "uninitialized value" warnings under -w
			my $format = defined $x->{content} ? $x->{content} : "";
			my $type = defined $x->{type} ? lc($x->{type}) : "";

			my $i = ($type eq "display") ? 0 : 1;	# feed index data
			my $d = ($type eq "index") ? 0 : 1;	# feed display data

			# parse format
			my $prefix = "";
			if ($format =~ s/^([^\d]+)//) {
				$prefix = $1;
			}
			# length() instead of truthiness: a leftover "0" must
			# reach the "unparsed format" warning, not end the loop
			while (length($format)) {
				if ($format =~ s/^(\d\d\d)(\w?)//) {
					my ($isis_id, $subfield) = ($1, $2);
					my $isis_tmp = isis_sf($row, $isis_id, $subfield);
					if ($isis_tmp) {
						$display_tmp .= $prefix . $isis_tmp if ($d);
						$index_tmp .= $isis_tmp." " if ($i);
					}
					# prefix belongs to this field only
					$prefix = "";
				} elsif ($format =~ s/^([^\d]+)//) {
					$prefix = $1;
				} else {
					print STDERR "WARNING: unparsed format '$format'\n";
					last;
				}
			}
			# add suffix
			$display_tmp .= $prefix if ($display_tmp);

			$display_data .= $display_tmp;
			$index_data .= $index_tmp;

		}

		$xml->{$field}->{display} .= $isis_map->tou($display_data)->utf8 if ($display_data);
		$xml->{$field}->{index} .= unac_string($config->{isis_codepage},$index_data) if ($index_data);

	}

	if ($xml) {
		return XMLout($xml, rootname => 'xml', noattr => 1 );
	} else {
		return;
	}
}
106    
107     ##########################################################################
108    
# Main driver: locate the ISIS databases, read every record and print it
# on STDOUT as one document (Path-Name / Content-Length / Document-Type
# headers followed by the XML produced by isis2xml). Progress and
# diagnostics go to STDERR.

my @isis_dirs = ( '.' );	# use dirname as database name

if ($opts{m}) {
	@isis_dirs = split(/,/,$opts{m});
}

# Database names looked up in every directory; the master file is
# expected at <isis_data>/<db_dir>/<dir>/<NAME>/<NAME>.
# ARTI was probed by earlier code but its push was commented out, so it
# is deliberately not listed here.
my @isis_db_names = qw(LIBRI PERI AMS);

my @isis_dbs;

foreach my $dir (@isis_dirs) {
	foreach my $name (@isis_db_names) {
		my $path = $config->{isis_data}."/$db_dir/$dir/$name";
		push @isis_dbs, "$path/$name" if (-e $path);
	}
}

# The message says FATAL, so really stop (with a failure exit status)
# instead of falling through and exiting successfully with no output.
if (! @isis_dbs) {
	print STDERR "FATAL: Can't find isis database.\nPerhaps isis_data (".$config->{isis_data}.") has wrong value?\n";
	exit 1;
}

foreach my $isis_db (@isis_dbs) {

	my $db = OpenIsis::open( $isis_db );
	# FIXME: the result of OpenIsis::open is not checked. The earlier
	# "if (! $db)" test was disabled by the author; presumably a false
	# value can be a valid handle -- confirm against the OpenIsis
	# documentation before adding a check here.

	my $max_rowid = OpenIsis::maxRowid( $db );

	print STDERR "Reading database: $isis_db [$max_rowid rows]\n";

	my $last_p = 0;

	for my $row_id (1 .. $max_rowid) {
		my $row = OpenIsis::read( $db, $row_id );
		next unless ($row && $row->{mfn});

		# output current process indicator, redrawn only when the
		# integer percentage changes
		my $p = int($row->{mfn} * 100 / $max_rowid);
		if ($p != $last_p) {
			printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$row->{mfn},$max_rowid,"=" x ($p/2).">", $p ) if (! $opts{q});
			$last_p = $p;
		}

		if (my $xml = isis2xml($row)) {
			print "Path-Name: $isis_db#".$row->{mfn}."\n";
			# +1 accounts for the newline printed after $xml
			print "Content-Length: ".(length($xml)+1)."\n";
			print "Document-Type: XML\n\n$xml\n";
		}
	}
	# finish the progress line
	print STDERR "\n";
}
177 dpavlin 3
178    
179     1;
180     __END__
181     ##########################################################################
182    
183     =head1 NAME
184    
185     isis2xml.pl - read isis file and dump XML
186    
187     =head1 DESCRIPTION
188    
189     This command will read an ISIS data file using the OpenIsis perl module and
190     create an XML file for use with the I<SWISH-E>
191     indexer. Despite its name, this script B<isn't a general XML generator>
192     for ISIS files (ISIS already has something like that). The output of this
193     script is tailor-made for SWISH-E.
194    
195     =head1 AUTHOR
196    
197     Dobrica Pavlinusic <dpavlin@rot13.org>
198    
199     =head1 COPYRIGHT
200    
201     GNU Public License (GPL) v2 or later
202    
203     =head1 SEE ALSO
204    
205     SWISH-E web site at http://www.swish-e.org
206    
207     =cut

Properties

Name Value
cvs2svn:cvs-rev 1.2
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26