/[webpac]/trunk2/all2all.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/all2all.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 546 - (hide annotations)
Tue Oct 26 18:59:38 2004 UTC (14 years, 8 months ago) by dpavlin
File MIME type: text/plain
File size: 9139 byte(s)
Removed all unaccented letters except in bfilter for characters in "part".
This solves problem of different browsers sorting differently according to
current locale setting. This way, only US ASCII characters are used and
everything is (hopefully) fine.

1 dpavlin 1 #!/usr/bin/perl -w
2    
3 dpavlin 348 =head1 NAME
4    
5 dpavlin 354 all2all.pl - basic script for all WebPAC needs
6 dpavlin 348
7     =cut
8    
9 dpavlin 1 use strict;
10 dpavlin 490 use locale;
11 dpavlin 454 use YAML;
12 dpavlin 352 use Carp;
13 dpavlin 437 use Getopt::Long;
14 dpavlin 546 use Text::Unaccent 1.02;
15 dpavlin 1
16 dpavlin 352 use lib './lib';
17 dpavlin 354 use WebPAC;
18 dpavlin 390 use WebPAC::jsFind;
19 dpavlin 410 use WebPAC::Index;
20 dpavlin 441 use WebPAC::Tree;
21 dpavlin 348
22 dpavlin 437 # options which can be changed via command line
23     #
24     my $code_page = 'ISO-8859-2';
25     my ($limit_mfn, $start_mfn, $debug, $low_mem);
26     my $index_path = './out/index';
27    
28     my $result = GetOptions(
29     "code_page=s" => \$code_page,
30     "limit_mfn=i" => \$limit_mfn,
31     "start_mfn=i" => \$start_mfn,
32     "debug!" => \$debug,
33     "low_mem!" => \$low_mem,
34     );
35    
36 dpavlin 500 my $filter = {
37     'CROVOC' => sub {
38     my $tmp = shift || return;
39 dpavlin 503 return undef unless ($tmp =~ s/\s*CROVOC.*$/ */);
40 dpavlin 500 return $tmp;
41     },
42 dpavlin 501 'CROVOC_tree' => sub {
43     my $tmp = shift || return;
44 dpavlin 503 $tmp =~ s/\s*CROVOC.*$/ */;
45     $tmp =~ s/\s*EUROVOC.*//;
46 dpavlin 501 return $tmp;
47     },
48 dpavlin 500 };
49    
50 dpavlin 546 ## remove accented characters
51     #
52     sub unac {
53     my $string = shift || return;
54     $string = unac_string($code_page,$string);
55     $string =~ tr/ðÐ/dD/;
56     return $string;
57     }
58     sub unac_2 {
59     my $string = shift || return;
60     if (length($string) > 2) {
61     my $pr = substr($string,0,2);
62     $string = unac_string($code_page,substr($string,2));
63     $string =~ tr/ðÐ/dD/;
64     $string = $pr . $string;
65     }
66     return lc($string);
67     }
68    
69 dpavlin 437 # create WebPAC object
70     #
71 dpavlin 354 my $webpac = new WebPAC(
72 dpavlin 437 code_page => $code_page,
73     limit_mfn => $limit_mfn,
74     start_mfn => $start_mfn,
75     debug => $debug,
76     low_mem => $low_mem,
77 dpavlin 500 filter => $filter,
78 dpavlin 352 ) || die;
79    
80 dpavlin 421 my $log = $webpac->_get_logger() || die "can't get logger";
81 dpavlin 373
82 dpavlin 421 $log->debug("creating WebPAC::jsFind object");
83    
84 dpavlin 390 my $index = new WebPAC::jsFind(
85 dpavlin 437 index_path => $index_path,
86 dpavlin 413 keys => 10,
87 dpavlin 390 ) || die;
88    
89 dpavlin 410 my $thes;
90    
91 dpavlin 10 $|=1;
92 dpavlin 9
93 dpavlin 352 my $maxmfn = $webpac->open_isis(
94 dpavlin 357 filename => shift @ARGV || '/data/hidra/THS/THS',
95 dpavlin 352 lookup => [
96 dpavlin 501 { 'key' => 'd:v900', 'val' => 'filter{CROVOC_tree}v250^a v800' },
97 dpavlin 352 # { 'eval' => '"v901^a" eq "Podruèje"', 'key' => 'pa:v561^4:v562^4:v461^1', 'val' => 'v900' },
98     # { 'eval '=> '"v901^a" eq "Mikrotezaurus"', 'key' => 'a:v561^4:v562^4:v461^1', 'val' => 'v900' },
99     # { 'eval' => '"v901^a" eq "Deskriptor"', 'key' => 'a:v561^4:v562^4:v461^1', 'val' => 'v900' },
100     { 'key' => 'a:v561^4:v562^4:v461^1', 'val' => 'v900' },
101 dpavlin 374 { 'key' => '900_mfn:v900', 'val' => 'v000' },
102 dpavlin 419 # tree structure
103 dpavlin 471 { 'eval' => 'length("v251") == 2 && "v800" =~ m/EUROVOC/ || "v800" =~ m/CROVOC/ && "v251" =~ m/^(H|HD|L|Z|P)$/', 'key' => 'root:v251', 'val' => 'v900' },
104 dpavlin 419 { 'eval' => '"v251"', 'key' => 'code:v900', 'val' => 'v561^4:v251' },
105 dpavlin 460 { 'eval' => '"v561^4" && "v562^4"', 'key' => 'code:v900', 'val' => 'v561^4:v562^4' },
106 dpavlin 352 ],
107 dpavlin 54 );
108 dpavlin 3
109 dpavlin 421 $log->debug("isis file ",$webpac->{'isis_filename'}," opened");
110    
111 dpavlin 372 $log->info("rows: $maxmfn");
112 dpavlin 170
113 dpavlin 363 $webpac->open_import_xml(type => 'isis_hidra_ths');
114    
115 dpavlin 425 if(1) { # XXX
116 dpavlin 424
117 dpavlin 362 while (my $rec = $webpac->fetch_rec) {
118 dpavlin 358
119 dpavlin 370 my @ds = $webpac->data_structure($rec);
120 dpavlin 366
121 dpavlin 411 if (0 && $log->is_debug) {
122 dpavlin 454 $log->debug("rec = ",Dump($rec));
123     $log->debug("ds = ",Dump(\@ds));
124 dpavlin 372 }
125    
126 dpavlin 374 next if (! @ds);
127 dpavlin 366
128 dpavlin 421 my $filename = $webpac->{'current_filename'} || $log->logdie("no current_filename in webpac object");
129 dpavlin 374
130     if ($filename) {
131 dpavlin 411 $webpac->output_file(
132     file => $filename,
133 dpavlin 374 template => 'html.tt',
134     data => \@ds,
135 dpavlin 398 headline => $webpac->{'headline'},
136 dpavlin 374 );
137     } else {
138     print $webpac->output(
139     template => 'text.tt',
140     data => \@ds,
141 dpavlin 398 headline => $webpac->{'headline'},
142 dpavlin 374 );
143     }
144    
145 dpavlin 398 my $headline = $webpac->{'headline'};
146 dpavlin 390
147 dpavlin 392 my $f = $filename;
148     $f =~ s!out/!!;
149    
150 dpavlin 390 # save into index
151     foreach my $ds (@ds) {
152     next if (! $ds->{'swish'});
153    
154 dpavlin 466 # strip all non word characters from beginning or end
155     # of word
156     my $words = join(" ",@{$ds->{'swish'}});
157     $words =~ s/^\W+//;
158     $words =~ s/\W*\s+\W*/ /g;
159     $words =~ s/\W+$//;
160    
161 dpavlin 504 # first try to generate headline for this entry from index
162 dpavlin 520 my $h = $ds->{'index'}->[0];
163 dpavlin 504 # then, from display
164 dpavlin 520 $h ||= $ds->{'display'}->[0];
165 dpavlin 504 # and as last resport, fallback to headline
166     $h ||= $headline;
167    
168 dpavlin 390 $index->insert(
169     index_name => $ds->{'tag'},
170 dpavlin 466 #path => $f,
171     path => $webpac->mfn,
172 dpavlin 504 headline => $h,
173 dpavlin 546 words => unac($words),
174 dpavlin 390 );
175     }
176    
177 dpavlin 410 # save into sorted index (thesaurus)
178     foreach my $ds (@ds) {
179     next if (! $ds->{'index'});
180    
181 dpavlin 511 $thes->{$ds->{'tag'}} ||= new WebPAC::Index( name => $ds->{'tag'} );
182 dpavlin 410
183 dpavlin 415 foreach my $h (@{$ds->{'index'}}) {
184     $thes->{$ds->{'tag'}}->insert(
185 dpavlin 546 sort_by => unac_2($h),
186     mfn => $webpac->mfn,
187 dpavlin 415 headline => $h,
188     );
189     }
190 dpavlin 410 }
191    
192 dpavlin 454 # print Dump(\@ds);
193 dpavlin 390
194 dpavlin 3 }
195    
196 dpavlin 411 foreach my $t (keys %{$thes}) {
197    
198     my @e = $thes->{$t}->elements;
199     if (! @e) {
200     $log->logwarn("no elements in sorted index $t?");
201     next;
202     }
203    
204 dpavlin 415 my $file = "./out/bfilter/$t.txt";
205     $log->info("saving sorted index $t to '$file' [".scalar(@e)." elements]");
206 dpavlin 411
207     $webpac->output_file(
208 dpavlin 415 file => $file,
209 dpavlin 411 template => 'index.tt',
210     data => \@e,
211     index_name => $t,
212     );
213     }
214    
215     if (0 && $log->is_debug) {
216 dpavlin 454 $log->debug("lookup hash: ",Dump($webpac->{'lookup'}));
217     $log->debug("data hash: ",Dump($webpac->{'data'}));
218 dpavlin 410 foreach my $t (keys %{$thes}) {
219 dpavlin 454 $log->debug("thesaurus $t hash: ",Dump($thes->{$t}));
220 dpavlin 410 }
221 dpavlin 372 }
222 dpavlin 390
223 dpavlin 424 } # XXX if(0)
224    
225 dpavlin 530 #$log->debug("lookup hash: ",Dump($webpac->{'lookup'}));
226 dpavlin 420
227 dpavlin 441 $log->info("creating tree");
228    
229 dpavlin 419 #
230 dpavlin 441 # define tree structure
231 dpavlin 419 #
232    
233 dpavlin 424 my $l = $webpac->{'lookup'} || $log->logconfess("can't find lookup");
234 dpavlin 419
235 dpavlin 424 my @tree = ({
236     # level 0
237     code_arr => sub { sort keys %{$l} },
238     filter_code => sub {
239     my $t = shift;
240     return $t if ($t =~ s/root://);
241     },
242     lookup_v900 => sub { shift @{$l->{"root:".$_[0]}} },
243     lookup_term => sub { shift @{$l->{"d:".$_[1]}} },
244     lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[1]}} },
245 dpavlin 454 have_children => sub { return $l->{"a:".$_[0]."::"} },
246     have_children_at_level => sub {
247 dpavlin 471 return unless (defined($l->{"code:".$_[1]}));
248     my $code = shift @{$l->{"code:".$_[1]}};
249     print STDERR "## $_[1] -> $code\n";
250     return undef unless($code);
251     return(9, $l->{"a:$code:"} ) if (defined($l->{"a:$code:"}));
252 dpavlin 454 },
253 dpavlin 424 style => 'display: none',
254     },{
255     # 1
256 dpavlin 454 code_arr => sub { @{$_[0]} },
257 dpavlin 424 filter_code => sub { shift }, # nop
258     lookup_v900 => sub { shift @{$l->{"code:".$_[0]}} },
259     lookup_term => sub { shift @{$l->{"d:".$_[0]}} },
260     lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[0]}} },
261 dpavlin 454 have_children => sub { return $l->{"a:".$_[1].":"} },
262 dpavlin 424 style => 'display: none',
263     },{
264     # 2
265 dpavlin 454 code_arr => sub { @{$_[0]} },
266 dpavlin 424 filter_code => sub { shift },
267 dpavlin 460 lookup_v900 => sub { shift @{$l->{"code:".$_[0]}} },
268 dpavlin 424 lookup_term => sub { shift @{$l->{"d:".$_[0]}} },
269     lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[0]}} },
270 dpavlin 460 have_children => sub { return $l->{"a:".$_[1].":".$_[0]} },
271 dpavlin 454 #style => 'display: none',
272 dpavlin 425 },{
273     # 3 u¾i pojam
274 dpavlin 454 code_arr => sub { @{$_[0]} },
275 dpavlin 425 filter_code => sub { shift },
276 dpavlin 460 lookup_v900 => sub { shift @{$l->{"code:".$_[0]}} },
277 dpavlin 426 lookup_term => sub { shift @{$l->{"d:".$_[0]}} },
278     lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[0]}} },
279 dpavlin 460 have_children => sub { return $l->{"a:".$_[1].":".$_[0]} },
280 dpavlin 426 },{
281     # 4
282 dpavlin 454 code_arr => sub { @{$_[0]} },
283 dpavlin 426 filter_code => sub { shift },
284 dpavlin 460 lookup_v900 => sub { shift @{$l->{"code:".$_[0]}} },
285 dpavlin 426 lookup_term => sub { shift @{$l->{"d:".$_[0]}} },
286     lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[0]}} },
287 dpavlin 460 have_children => sub { return $l->{"a:".$_[1].":".$_[0]} },
288 dpavlin 427 },{
289     # 5
290 dpavlin 454 code_arr => sub { @{$_[0]} },
291 dpavlin 427 filter_code => sub { shift },
292 dpavlin 460 lookup_v900 => sub { shift @{$l->{"code:".$_[0]}} },
293 dpavlin 427 lookup_term => sub { shift @{$l->{"d:".$_[0]}} },
294     lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[0]}} },
295 dpavlin 460 have_children => sub { return $l->{"a:".$_[1].":".$_[0]} },
296 dpavlin 427 },{
297     # 6
298 dpavlin 454 code_arr => sub { @{$_[0]} },
299 dpavlin 427 filter_code => sub { shift },
300 dpavlin 460 lookup_v900 => sub { shift @{$l->{"code:".$_[0]}} },
301 dpavlin 427 lookup_term => sub { shift @{$l->{"d:".$_[0]}} },
302     lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[0]}} },
303 dpavlin 460 have_children => sub { return $l->{"a:".$_[1].":".$_[0]} },
304 dpavlin 427 },{
305     # 7
306 dpavlin 454 code_arr => sub { @{$_[0]} },
307 dpavlin 427 filter_code => sub { shift },
308 dpavlin 460 lookup_v900 => sub { shift @{$l->{"code:".$_[0]}} },
309 dpavlin 427 lookup_term => sub { shift @{$l->{"d:".$_[0]}} },
310     lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[0]}} },
311 dpavlin 460 have_children => sub { return $l->{"a:".$_[1].":".$_[0]} },
312 dpavlin 427 },{
313     # 8
314 dpavlin 454 code_arr => sub { @{$_[0]} },
315 dpavlin 427 filter_code => sub { shift },
316 dpavlin 460 lookup_v900 => sub { shift @{$l->{"code:".$_[0]}} },
317 dpavlin 427 lookup_term => sub { shift @{$l->{"d:".$_[0]}} },
318     lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[0]}} },
319 dpavlin 460 # have_children => sub { return $l->{"a:".$_[1].":".$_[0]} },
320 dpavlin 424 have_children => sub { 0 },
321 dpavlin 454 },{
322     # 9 - level which is never reached except explicitly
323 dpavlin 471 code_arr => sub { @{$_[0]} },
324 dpavlin 454 filter_code => sub { shift },
325 dpavlin 460 lookup_v900 => sub { shift @{$l->{"code:".$_[0]}} },
326 dpavlin 454 lookup_term => sub { shift @{$l->{"d:".$_[0]}} },
327     lookup_mfn => sub { shift @{$l->{"900_mfn:".$_[0]}} },
328     have_children => sub { 0 },
329 dpavlin 471 have_children_at_level => sub { defined($l->{"a:".$_[1].":".$_[0]}) && return (9,$l->{"a:".$_[1].":".$_[0]}) },
330 dpavlin 454 },{
331 dpavlin 424 });
332 dpavlin 419
333 dpavlin 441 my $tree = new WebPAC::Tree(
334 dpavlin 530 tree => \@tree,
335     );
336    
337     $tree->output(
338     dir => './out',
339     html => 'browse.html',
340 dpavlin 441 template => './output_template/tree.tt',
341     js => 'tree-ids.js',
342     );
343 dpavlin 428
344 dpavlin 530 $tree->output(
345     dir => './eurovoc',
346     html => 'hijerarhija.html',
347     template => './output_template/hijerarhija.tt',
348     js => 'tree-ids.js',
349     );
350    
351    
352 dpavlin 428 $log->info("closing index");
353 dpavlin 390 $index->close;
354    
355 dpavlin 422 $log->info("elapsed time: ",$webpac->fmt_time(time() - $webpac->{'start_t'}));

Properties

Name Value
cvs2svn:cvs-rev 1.64
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26