/[webpac]/trunk2/all2all.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/all2all.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 546 - (show annotations)
Tue Oct 26 18:59:38 2004 UTC (19 years, 5 months ago) by dpavlin
File MIME type: text/plain
File size: 9139 byte(s)
Removed all unaccented letters except in bfilter for characters in "part".
This solves problem of different browsers sorting differently according to
current locale setting. This way, only US ASCII characters are used and
everything is (hopefully) fine.

1 #!/usr/bin/perl -w
2
3 =head1 NAME
4
5 all2all.pl - basic script for all WebPAC needs
6
7 =cut
8
9 use strict;
10 use locale;
11 use YAML;
12 use Carp;
13 use Getopt::Long;
14 use Text::Unaccent 1.02;
15
16 use lib './lib';
17 use WebPAC;
18 use WebPAC::jsFind;
19 use WebPAC::Index;
20 use WebPAC::Tree;
21
22 # options which can be changed via command line
23 #
24 my $code_page = 'ISO-8859-2';
25 my ($limit_mfn, $start_mfn, $debug, $low_mem);
26 my $index_path = './out/index';
27
28 my $result = GetOptions(
29 "code_page=s" => \$code_page,
30 "limit_mfn=i" => \$limit_mfn,
31 "start_mfn=i" => \$start_mfn,
32 "debug!" => \$debug,
33 "low_mem!" => \$low_mem,
34 );
35
36 my $filter = {
37 'CROVOC' => sub {
38 my $tmp = shift || return;
39 return undef unless ($tmp =~ s/\s*CROVOC.*$/ */);
40 return $tmp;
41 },
42 'CROVOC_tree' => sub {
43 my $tmp = shift || return;
44 $tmp =~ s/\s*CROVOC.*$/ */;
45 $tmp =~ s/\s*EUROVOC.*//;
46 return $tmp;
47 },
48 };
49
50 ## remove accented characters
51 #
52 sub unac {
53 my $string = shift || return;
54 $string = unac_string($code_page,$string);
55 $string =~ tr/ðÐ/dD/;
56 return $string;
57 }
58 sub unac_2 {
59 my $string = shift || return;
60 if (length($string) > 2) {
61 my $pr = substr($string,0,2);
62 $string = unac_string($code_page,substr($string,2));
63 $string =~ tr/ðÐ/dD/;
64 $string = $pr . $string;
65 }
66 return lc($string);
67 }
68
69 # create WebPAC object
70 #
71 my $webpac = new WebPAC(
72 code_page => $code_page,
73 limit_mfn => $limit_mfn,
74 start_mfn => $start_mfn,
75 debug => $debug,
76 low_mem => $low_mem,
77 filter => $filter,
78 ) || die;
79
80 my $log = $webpac->_get_logger() || die "can't get logger";
81
82 $log->debug("creating WebPAC::jsFind object");
83
84 my $index = new WebPAC::jsFind(
85 index_path => $index_path,
86 keys => 10,
87 ) || die;
88
89 my $thes;
90
91 $|=1;
92
93 my $maxmfn = $webpac->open_isis(
94 filename => shift @ARGV || '/data/hidra/THS/THS',
95 lookup => [
96 { 'key' => 'd:v900', 'val' => 'filter{CROVOC_tree}v250^a v800' },
97 # { 'eval' => '"v901^a" eq "Podruèje"', 'key' => 'pa:v561^4:v562^4:v461^1', 'val' => 'v900' },
98 # { 'eval '=> '"v901^a" eq "Mikrotezaurus"', 'key' => 'a:v561^4:v562^4:v461^1', 'val' => 'v900' },
99 # { 'eval' => '"v901^a" eq "Deskriptor"', 'key' => 'a:v561^4:v562^4:v461^1', 'val' => 'v900' },
100 { 'key' => 'a:v561^4:v562^4:v461^1', 'val' => 'v900' },
101 { 'key' => '900_mfn:v900', 'val' => 'v000' },
102 # tree structure
103 { 'eval' => 'length("v251") == 2 && "v800" =~ m/EUROVOC/ || "v800" =~ m/CROVOC/ && "v251" =~ m/^(H|HD|L|Z|P)$/', 'key' => 'root:v251', 'val' => 'v900' },
104 { 'eval' => '"v251"', 'key' => 'code:v900', 'val' => 'v561^4:v251' },
105 { 'eval' => '"v561^4" && "v562^4"', 'key' => 'code:v900', 'val' => 'v561^4:v562^4' },
106 ],
107 );
108
109 $log->debug("isis file ",$webpac->{'isis_filename'}," opened");
110
111 $log->info("rows: $maxmfn");
112
113 $webpac->open_import_xml(type => 'isis_hidra_ths');
114
115 if(1) { # XXX
116
117 while (my $rec = $webpac->fetch_rec) {
118
119 my @ds = $webpac->data_structure($rec);
120
121 if (0 && $log->is_debug) {
122 $log->debug("rec = ",Dump($rec));
123 $log->debug("ds = ",Dump(\@ds));
124 }
125
126 next if (! @ds);
127
128 my $filename = $webpac->{'current_filename'} || $log->logdie("no current_filename in webpac object");
129
130 if ($filename) {
131 $webpac->output_file(
132 file => $filename,
133 template => 'html.tt',
134 data => \@ds,
135 headline => $webpac->{'headline'},
136 );
137 } else {
138 print $webpac->output(
139 template => 'text.tt',
140 data => \@ds,
141 headline => $webpac->{'headline'},
142 );
143 }
144
145 my $headline = $webpac->{'headline'};
146
147 my $f = $filename;
148 $f =~ s!out/!!;
149
150 # save into index
151 foreach my $ds (@ds) {
152 next if (! $ds->{'swish'});
153
154 # strip all non word characters from beginning or end
155 # of word
156 my $words = join(" ",@{$ds->{'swish'}});
157 $words =~ s/^\W+//;
158 $words =~ s/\W*\s+\W*/ /g;
159 $words =~ s/\W+$//;
160
161 # first try to generate headline for this entry from index
162 my $h = $ds->{'index'}->[0];
163 # then, from display
164 $h ||= $ds->{'display'}->[0];
165 # and as last resport, fallback to headline
166 $h ||= $headline;
167
168 $index->insert(
169 index_name => $ds->{'tag'},
170 #path => $f,
171 path => $webpac->mfn,
172 headline => $h,
173 words => unac($words),
174 );
175 }
176
177 # save into sorted index (thesaurus)
178 foreach my $ds (@ds) {
179 next if (! $ds->{'index'});
180
181 $thes->{$ds->{'tag'}} ||= new WebPAC::Index( name => $ds->{'tag'} );
182
183 foreach my $h (@{$ds->{'index'}}) {
184 $thes->{$ds->{'tag'}}->insert(
185 sort_by => unac_2($h),
186 mfn => $webpac->mfn,
187 headline => $h,
188 );
189 }
190 }
191
192 # print Dump(\@ds);
193
194 }
195
196 foreach my $t (keys %{$thes}) {
197
198 my @e = $thes->{$t}->elements;
199 if (! @e) {
200 $log->logwarn("no elements in sorted index $t?");
201 next;
202 }
203
204 my $file = "./out/bfilter/$t.txt";
205 $log->info("saving sorted index $t to '$file' [".scalar(@e)." elements]");
206
207 $webpac->output_file(
208 file => $file,
209 template => 'index.tt',
210 data => \@e,
211 index_name => $t,
212 );
213 }
214
215 if (0 && $log->is_debug) {
216 $log->debug("lookup hash: ",Dump($webpac->{'lookup'}));
217 $log->debug("data hash: ",Dump($webpac->{'data'}));
218 foreach my $t (keys %{$thes}) {
219 $log->debug("thesaurus $t hash: ",Dump($thes->{$t}));
220 }
221 }
222
223 } # XXX if(0)
224
225 #$log->debug("lookup hash: ",Dump($webpac->{'lookup'}));
226
227 $log->info("creating tree");
228
229 #
230 # define tree structure
231 #
232
233 my $l = $webpac->{'lookup'} || $log->logconfess("can't find lookup");
234
235 my @tree = ({
236 # level 0
237 code_arr => sub { sort keys %{$l} },
238 filter_code => sub {
239 my $t = shift;
240 return $t if ($t =~ s/root://);
241 },
242 lookup_v900 => sub { shift @{$l->{"root:".$_[0]}} },
243 lookup_term => sub { shift @{$l->{"d:".$_[1]}} },
244 lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[1]}} },
245 have_children => sub { return $l->{"a:".$_[0]."::"} },
246 have_children_at_level => sub {
247 return unless (defined($l->{"code:".$_[1]}));
248 my $code = shift @{$l->{"code:".$_[1]}};
249 print STDERR "## $_[1] -> $code\n";
250 return undef unless($code);
251 return(9, $l->{"a:$code:"} ) if (defined($l->{"a:$code:"}));
252 },
253 style => 'display: none',
254 },{
255 # 1
256 code_arr => sub { @{$_[0]} },
257 filter_code => sub { shift }, # nop
258 lookup_v900 => sub { shift @{$l->{"code:".$_[0]}} },
259 lookup_term => sub { shift @{$l->{"d:".$_[0]}} },
260 lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[0]}} },
261 have_children => sub { return $l->{"a:".$_[1].":"} },
262 style => 'display: none',
263 },{
264 # 2
265 code_arr => sub { @{$_[0]} },
266 filter_code => sub { shift },
267 lookup_v900 => sub { shift @{$l->{"code:".$_[0]}} },
268 lookup_term => sub { shift @{$l->{"d:".$_[0]}} },
269 lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[0]}} },
270 have_children => sub { return $l->{"a:".$_[1].":".$_[0]} },
271 #style => 'display: none',
272 },{
273 # 3 u¾i pojam
274 code_arr => sub { @{$_[0]} },
275 filter_code => sub { shift },
276 lookup_v900 => sub { shift @{$l->{"code:".$_[0]}} },
277 lookup_term => sub { shift @{$l->{"d:".$_[0]}} },
278 lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[0]}} },
279 have_children => sub { return $l->{"a:".$_[1].":".$_[0]} },
280 },{
281 # 4
282 code_arr => sub { @{$_[0]} },
283 filter_code => sub { shift },
284 lookup_v900 => sub { shift @{$l->{"code:".$_[0]}} },
285 lookup_term => sub { shift @{$l->{"d:".$_[0]}} },
286 lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[0]}} },
287 have_children => sub { return $l->{"a:".$_[1].":".$_[0]} },
288 },{
289 # 5
290 code_arr => sub { @{$_[0]} },
291 filter_code => sub { shift },
292 lookup_v900 => sub { shift @{$l->{"code:".$_[0]}} },
293 lookup_term => sub { shift @{$l->{"d:".$_[0]}} },
294 lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[0]}} },
295 have_children => sub { return $l->{"a:".$_[1].":".$_[0]} },
296 },{
297 # 6
298 code_arr => sub { @{$_[0]} },
299 filter_code => sub { shift },
300 lookup_v900 => sub { shift @{$l->{"code:".$_[0]}} },
301 lookup_term => sub { shift @{$l->{"d:".$_[0]}} },
302 lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[0]}} },
303 have_children => sub { return $l->{"a:".$_[1].":".$_[0]} },
304 },{
305 # 7
306 code_arr => sub { @{$_[0]} },
307 filter_code => sub { shift },
308 lookup_v900 => sub { shift @{$l->{"code:".$_[0]}} },
309 lookup_term => sub { shift @{$l->{"d:".$_[0]}} },
310 lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[0]}} },
311 have_children => sub { return $l->{"a:".$_[1].":".$_[0]} },
312 },{
313 # 8
314 code_arr => sub { @{$_[0]} },
315 filter_code => sub { shift },
316 lookup_v900 => sub { shift @{$l->{"code:".$_[0]}} },
317 lookup_term => sub { shift @{$l->{"d:".$_[0]}} },
318 lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[0]}} },
319 # have_children => sub { return $l->{"a:".$_[1].":".$_[0]} },
320 have_children => sub { 0 },
321 },{
322 # 9 - level which is never reached except explicitly
323 code_arr => sub { @{$_[0]} },
324 filter_code => sub { shift },
325 lookup_v900 => sub { shift @{$l->{"code:".$_[0]}} },
326 lookup_term => sub { shift @{$l->{"d:".$_[0]}} },
327 lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[0]}} },
328 have_children => sub { 0 },
329 have_children_at_level => sub { defined($l->{"a:".$_[1].":".$_[0]}) && return (9,$l->{"a:".$_[1].":".$_[0]}) },
330 },{
331 });
332
333 my $tree = new WebPAC::Tree(
334 tree => \@tree,
335 );
336
337 $tree->output(
338 dir => './out',
339 html => 'browse.html',
340 template => './output_template/tree.tt',
341 js => 'tree-ids.js',
342 );
343
344 $tree->output(
345 dir => './eurovoc',
346 html => 'hijerarhija.html',
347 template => './output_template/hijerarhija.tt',
348 js => 'tree-ids.js',
349 );
350
351
352 $log->info("closing index");
353 $index->close;
354
355 $log->info("elapsed time: ",$webpac->fmt_time(time() - $webpac->{'start_t'}));

Properties

Name Value
cvs2svn:cvs-rev 1.64
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26