/[webpac-proto]/isis2stream.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /isis2stream.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (show annotations)
Fri Jun 14 18:24:05 2002 UTC (21 years, 9 months ago) by dpavlin
Branch: MAIN
Changes since 1.1: +1 -1 lines
File MIME type: text/plain
izbaceni interpunkcijski znakovi kod indeksiranja

1 #!/usr/bin/perl -w
2
3 use strict;
4 use OpenIsis;
5 use Getopt::Std;
6 #use Data::Dumper;
7
# --- configuration ---------------------------------------------------

# Root directory below which the per-database output is written.
my $install_dir="/local/index";
# MPS indexer command line, options included (it is run through the shell).
my $mpsindex="/local/mps-5.3/bin/mpsindex -l 9 -b";
# Directory that holds the ISIS databases.
my $isis_data="/var/autofs/misc/isis_data/";
#my $isis_data="/mnt/20020606/Isis/Data/";	# at home

my %opts;

# Both -d and -D take an argument; only -d is used below.
getopt('dD', \%opts);

die "usage: $0 -d [database_dir] " if (! $opts{d});

my $db_dir = $opts{d};

# Create the output directories and fail right here if that is not
# possible, instead of leaving it to the open() calls further down.
foreach my $path ("$install_dir/$db_dir", "$install_dir/$db_dir/data") {
    next if (-e $path);
    mkdir($path, 0777) || die "can't create directory $path: $!";
}

my $dir="$install_dir/$db_dir/data";

# S   - copy of everything that is sent to the indexer
# R   - bibliographic records
# MPS - pipe into the MPS indexer
open(S, '>', "$dir/stream") || die "can't open output $dir/stream: $!";
open(R, '>', "$dir/bib") || die "can't open output $dir/bib: $!";
# NOTE(review): $db_dir comes straight from the command line and is
# interpolated into a shell command here -- only run this with trusted input.
open(MPS,"| $mpsindex -d $install_dir/$db_dir -autokey") || die "can't start MPS indexer $mpsindex: $!";
#open(MPS,"> /tmp/mpsindex") || die "mps: $!";
31
# Header that opens the stream.  The number in the second column of each
# "F" line is the index id that the conversion code below passes as its
# mps_id argument (1 for field 700, 2 for field 200, ...).
# NOTE(review): the meaning of the "V" and "L" lines is not visible in this
# file -- presumably format version and language; confirm with the MPS docs.
my $s = <<'END_OF_HEADER';
V 5 3
L hr-HR
F 700+ 1 Autor
F 200+ 2 Naslov
F 210 3 Izdavanje
F 225 4 Nakladnièka cjelina
F 300+ 5 Napomene
F 330 6 Sadr¾aj
F 464 7 Analitièki radovi
F 610 8 Kljuène rijeèi
F 675 9 UDK
F 686 10 CC
F 990 11 Signatura
F 991 12 Inventarni broj
F 10 13 ISBN
END_OF_HEADER

# The same header goes to the stream copy first and to the indexer second.
foreach my $handle (\*S, \*MPS) {
    print $handle $s;
}
51
#
# expand(nr, "space separated string", ...);
#
# Returns one "W <word> <nr>" line for every whitespace-separated word of
# every string argument.  Each string is passed through c_852_czs() before
# it is split, which performs the same character translation and ASCII
# folding that used to be spelled out here word by word.
#
# Undefined and empty arguments are skipped (they no longer end the loop,
# and a field that is just "0" is indexed), and leading whitespace no
# longer produces an empty word.
#

sub expand {
    my $nr = shift @_;
    my $out = "";
    foreach my $fld (@_) {
        next unless (defined $fld && length $fld);
        # FIX: this should be replaced by stemmer!
        # split ' ' ignores leading whitespace, so no empty words
        foreach my $w (split(' ', c_852_czs($fld))) {
            $out .= "W $w $nr\n";
        }
    }
    return $out;
}
71
72 #--------------------------------------------------------------------
73
74 ################### ERASE###############3
75
# Expand sub-fields from an ISIS field
# (^a.....^b....)
# into the file-level hash %data, keyed by the one-character sub-field
# code.  A value that does not start with "^" is stored whole under the
# key "all", with a marker text appended.  %data is emptied on every call;
# an undefined or empty argument leaves it empty.
#
# Values are stored without any character translation.  (An earlier
# version translated a scratch copy and then threw the result away; that
# dead work is gone.)
my %data;	# FIX
sub ex_sf {
    %data = ();
    my $in = $_[0];
    return unless (defined $in && length $in);

    if ($in =~ m/^\^/) {
        foreach my $fld (split(/\^/,$in)) {
            # exactly one code character, the rest is the value
            # (a greedy \w+ would eat the value: "aTitle" => key "aTitl")
            $data{$1} = $2 if ($fld =~ m/^(\w)(.+)$/);
        }
    } else {
        $data{all} = $in."<-- iz polja bez podpolja";
    }
    return;
}
95
# Dump all sub-fields: every value held in %data is run through expand()
# with the given nr, visiting the keys in ascending string order, and the
# results are returned as one string.
sub all_sf {
    my ($nr) = @_;
    return join("", map { expand($nr, $data{$_}) } sort keys %data);
}
105
# Same as all_sf(), but the keys of %data are visited in descending
# string order.
sub all_sf_r {
    my ($nr) = @_;
    return join("", map { expand($nr, $data{$_}) } reverse sort keys %data);
}
114
# all_sf2bib(nr [, max_in_line [, separator]])
#
# Joins the values held in %data (keys in ascending string order) with the
# separator (default: one space) and returns them as "<nr> <values>" lines.
# A line is flushed after every max_in_line values; with the default of 0
# the counter never matches, so everything ends up on a single line.
sub all_sf2bib {
    my ($nr, $max_in_line, $sep) = @_;
    $max_in_line ||= 0;
    $sep ||= ' ';

    my $bib = "";
    my $line;
    my $count = 0;
    for my $key (sort keys %data) {
        $line = $line ? $line . $sep . $data{$key} : $data{$key};
        next unless (++$count == $max_in_line);
        # group is complete: flush it and start over
        $bib .= "$nr $line\n" if ($line);
        $count = 0;
        $line = "";
    }
    # whatever is left from an incomplete group
    $bib .= "$nr $line\n" if ($line);
    return $bib;
}
138
# all_sf2bib_r(nr [, max_in_line [, separator]])
#
# Same as all_sf2bib(), but the keys of %data are visited in descending
# string order: values are joined with the separator (default: one space)
# and returned as "<nr> <values>" lines, flushing a line after every
# max_in_line values (0, the default, means a single line).
sub all_sf2bib_r {
    my ($nr, $max_in_line, $sep) = @_;
    $max_in_line ||= 0;
    $sep ||= ' ';

    my $bib = "";
    my $line;
    my $count = 0;
    for my $key (reverse sort keys %data) {
        $line = $line ? $line . $sep . $data{$key} : $data{$key};
        next unless (++$count == $max_in_line);
        # group is complete: flush it and start over
        $bib .= "$nr $line\n" if ($line);
        $count = 0;
        $line = "";
    }
    # whatever is left from an incomplete group
    $bib .= "$nr $line\n" if ($line);
    return $bib;
}
162
163 #--------------------------------------------------------------------
164
# c_852_iso(string)
#
# Returns a copy of the argument with every character of the first tr list
# replaced by the character at the same position of the second list.
# NOTE(review): going by the sub name this is a code page 852 -> ISO 8859-2
# table; the literal below is shown after a lossy re-encoding, so verify it
# against the original bytes before touching it.
165 sub c_852_iso {
166 my $tmp = $_[0];
# the translation is skipped for false values (undef, "", "0"), which are
# handed back unchanged
167 $tmp =~ tr/€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúüýþÿ/Çüéâäùæç³ëÕõî¬ÄÆÉÅåôö¥µ¦¶ÖÜ«»£×èáíóú¡±®¾ÊꔼȺ—„•’ˆƒÁÂ̪‡€œ›¯¿‹Ÿ–˜Ž‘Ã㍐…†“™ž¤ðÐÏËïÒÍÎ슂‰šÞٝÓßÔÑñò©¹ÀÚàÛýÝþ´­½²·¢§÷¸°¨ÿØøŒ / if ($tmp);
168 return $tmp;
169 }
170
# c_852_czs(string)
#
# Returns a copy of the argument after three steps: the same translation
# table that c_852_iso() uses, then a second table whose replacement list
# consists of plain ASCII letters and punctuation only, and finally every
# "ß" is rewritten as "ss".
# Unlike c_852_iso() there is no guard for false values here.
# NOTE(review): both literals are shown after a lossy re-encoding; verify
# them against the original bytes before touching them.
171 sub c_852_czs {
172 my $tmp = $_[0];
# step 1: same table as in c_852_iso()
173 $tmp =~ tr/€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúüýþÿ/Çüéâäùæç³ëÕõî¬ÄÆÉÅåôö¥µ¦¶ÖÜ«»£×èáíóú¡±®¾ÊꔼȺ—„•’ˆƒÁÂ̪‡€œ›¯¿‹Ÿ–˜Ž‘Ã㍐…†“™ž¤ðÐÏËïÒÍÎ슂‰šÞٝÓßÔÑñò©¹ÀÚàÛýÝþ´­½²·¢§÷¸°¨ÿØøŒ /;
# step 2: fold the translated characters to ASCII
174 $tmp =~ tr/Çüéâäùæç³ëÕõî¬ÄÆÉÅåôö¥µ¦¶ÖÜ«»£èáíóú¡±®¾Êê¼ÈºÁÂ̪¯¿ÃãðÐÏËïÒÍÎìÞÙÓÔÑñò©¹ÀÚàÛýÝþ´­½²·¢¸¨ÿØø/CueaauccleOoiZACELlooLlSsOUTtLcaiouAaZzEezCsAAESZzAadDDEdNIIeTUOoNnnSsRUrUyYt'-".'',"'Rr/;
# step 3: tr maps one character to one character, so the two-letter
# replacement needs a substitution
175 $tmp =~ s/ß/ss/g;
176 return $tmp;
177 }
178
#--------------------------------------------------------------------
#
# mps_expand(nr,"space separated string");
#
# Returns one "W <word> <nr>" line per word of every string argument, with
# the whole result passed through c_852_czs().  Punctuation that is
# directly followed by whitespace is removed before the string is split;
# a string without any whitespace is emitted unchanged as a single word.
#
# Undefined and empty arguments are skipped (a field that is just "0" is
# indexed), and leading whitespace no longer produces an empty word.
#

sub mps_expand {
    my $nr = shift @_;
    my $out = "";
    foreach my $arg (@_) {
        next unless (defined $arg && length $arg);
        # work on a copy: @_ aliases the caller's values and the
        # substitution below must not change them
        my $fld = $arg;
        if ($fld =~ s/\s*[,;\.!?'"<>\[\]]*\s+/ /g) {
            # split ' ' ignores leading whitespace, so no empty words
            foreach my $w (split(' ',$fld)) {
                # FIX: this should be replaced by stemmer!
                $out .= "W $w $nr\n";
            }
        } else {
            $out .= "W $fld $nr\n";
        }
    }
    return c_852_czs($out);
}
199
#--------------------------------------------------------------------

# $mps .= sf_to_mps($subfield_hash,"subfields",mps_id);
#
# subfields options:
#   *     - all (no sort)
#   >     - all, sort ascending
#   <     - all, sort descending
#   abc   - just these sub-fields, in the order given
#   abc>  - just these sub-fields, sorted ascending
#   abc<  - just these sub-fields, sorted descending

# subfields_str_2_arr("subfields" [, $subfield_hash])
#
# Turns a sub-field specification (see above) into the list of
# one-character sub-field codes.  The hash reference is only consulted for
# the "all" forms (*, >, <); without it those forms give an empty list.
# An undefined or empty specification gives an empty list.  "0" is a valid
# specification, and sort markers never show up in the result.
sub subfields_str_2_arr {
    my ($subfields, $sf_hash) = @_;
    return unless (defined $subfields && length $subfields);

    my @sf_arr;

    if ($subfields eq "*" || $subfields eq ">" || $subfields eq "<") {
        @sf_arr = keys %{$sf_hash} if ($sf_hash);
        @sf_arr = sort @sf_arr if ($subfields eq ">");
        @sf_arr = sort {$b cmp $a} @sf_arr if ($subfields eq "<");
        return @sf_arr;
    }

    # ">" wins over "<" when both are present
    my $ascending = ($subfields =~ m/>/) ? 1 : 0;
    my $descending = ($subfields =~ m/</) ? 1 : 0;
    $subfields =~ tr/<>//d;	# drop every marker, not just the first one

    @sf_arr = split(//,$subfields);
    if ($ascending) {
        @sf_arr = sort @sf_arr;
    } elsif ($descending) {
        @sf_arr = sort {$b cmp $a} @sf_arr;
    }
    return @sf_arr;
}
229
230
# sf_to_mps($sf_hash, "subfields", mps_id)
#
# Resolves the sub-field specification with subfields_str_2_arr(), runs
# the value of each selected sub-field through mps_expand() with the given
# id, and returns the concatenated result.
sub sf_to_mps {
    my ($sf_hash,$subfields,$mps_id) = @_;
    return join("",
        map { mps_expand($mps_id,$sf_hash->{$_}) }
        subfields_str_2_arr($subfields,$sf_hash)
    );
}
241
#--------------------------------------------------------------------

# $mps .= isis_to_mps ($row,isis_id,mps_id[,"subfields"])
#
# Walks over the occurrences of field isis_id in $row, stopping at the
# first occurrence that is false.  An occurrence that
# OpenIsis::subfields() splits into sub-fields contributes the listed
# sub-fields (one character per code), or all of them when no list is
# given; any other occurrence contributes its plain value.  Everything
# goes through mps_expand() with mps_id; the collected text is returned.
# Dies when row, isis_id or mps_id is missing.

sub isis_to_mps {
    my ($row, $isis_id, $mps_id, $subfields) = @_;
    $row     || die;
    $isis_id || die;
    $mps_id  || die;

    my $out = "";

    for (my $i = 0; $row->{$isis_id}->[$i]; $i++) {
        my $sf_hash = OpenIsis::subfields($row->{$isis_id}->[$i]);

        if (scalar(keys %{$sf_hash}) > 0) {
            my @codes = $subfields ? split(//,$subfields) : keys %{$sf_hash};
            foreach my $code (@codes) {
                $out .= mps_expand($mps_id,$sf_hash->{$code});
            }
            next;
        }

        # no sub-fields at all: index the value as it is
        $out .= mps_expand($mps_id,$row->{$isis_id}->[$i]);
    }
    return $out;
}
275
#--------------------------------------------------------------------

# $bib .= isis_to_bib ($row,isis_id,bib_id,"subfields",[,group size][,"group sort"][,"separator"])
#
# fields:
#   * - all (no sort)
#   > - all, sort ascending
#   < - all, sort descending
#
# Returns "<bib_id> <text>" lines for the occurrences of field isis_id in
# $row, stopping at the first occurrence that is false.  An occurrence
# without sub-fields gives one line with its plain value.  Otherwise the
# sub-fields selected by "subfields" (default "*") are joined with the
# separator (default: one space): all on one line when group size is 0
# (the default), or group-size codes per line, each group ordered by
# "group sort" (a sort marker appended to the group's codes).
# Dies when row, isis_id or bib_id is missing.

# bib_grp($sf_hash, $bib_id, $sep, @subfield_codes)
#
# Returns "<bib_id> <values>\n" with the values of the given sub-field
# codes joined by $sep, or "" when none of them has a value.  Undefined
# and empty values are left out, so no dangling separator is produced and
# a value of "0" is kept.
sub bib_grp {
    my $sf_hash = shift @_ || return "";
    my $bib_id = shift @_;
    my $sep = shift @_;

    my @present = grep {
        defined($sf_hash->{$_}) && length($sf_hash->{$_})
    } @_;
    return "" unless (@present);
    return "$bib_id " . join($sep, map { $sf_hash->{$_} } @present) . "\n";
}

sub isis_to_bib {
    my $row = shift @_ || die;
    my $isis_id = shift @_ || die;
    my $bib_id = shift @_ || die;
    my $subfields = shift @_ || '*';
    my $group_size = shift @_ || 0;
    my $group_sort = shift @_ || '';
    my $sep = shift @_ || ' ';

    my $bib="";

    for (my $i = 0; $row->{$isis_id}->[$i]; $i++) {
        my $sf_hash = OpenIsis::subfields($row->{$isis_id}->[$i]);

        if (scalar(keys %{$sf_hash}) == 0) {
            # no subfields, use just value!
            $bib .= "$bib_id ".$row->{$isis_id}->[$i]."\n";
            next;
        }

        if (! $group_size) {
            $bib .= bib_grp( $sf_hash, $bib_id, $sep, subfields_str_2_arr($subfields,$sf_hash) );
            next;
        }

        # Codes are one character each, so the selected codes are kept as
        # a string and consumed group_size characters at a time.  The test
        # is on length, not truth, so a remaining "0" code is not lost.
        my $tmp_flds = join("",subfields_str_2_arr($subfields,$sf_hash));
        while (length($tmp_flds)) {
            my $tmp_fld_grp = substr($tmp_flds,0,$group_size);
            $bib .= bib_grp( $sf_hash, $bib_id, $sep, subfields_str_2_arr($tmp_fld_grp.$group_sort) );
            last if (length($tmp_flds) <= $group_size);
            $tmp_flds = substr($tmp_flds,$group_size);
        }
    }
    return $bib;
}
346
#--------------------------------------------------------------------

# isis_sf($row, isis_id [, "subfields" [, prefix [, postfix [, separators...]]]])
#
# Looks only at the first occurrence of field isis_id in $row.
# With a single sub-field code (default 'a') the result is
# prefix . value . postfix, or '' when that sub-field is empty.
# With several codes the non-empty values are joined; one separator is
# taken from the list for every code, present or not, and a missing or
# false separator counts as a single space.  The joined text is wrapped in
# prefix and postfix.  '' is returned when there is nothing to show.

sub isis_sf {
    my ($row, $isis_id, $subfield, $prefix, $postfix, @sep) = @_;
    $subfield ||= 'a';
    $prefix   ||= '';
    $postfix  ||= '';

    return '' unless ($row->{$isis_id}->[0]);

    my $sf = OpenIsis::subfields($row->{$isis_id}->[0]);

    if (length($subfield) == 1) {
        return $sf->{$subfield} ? $prefix . $sf->{$subfield} . $postfix : '';
    }

    my $joined;
    foreach my $code (split(//,$subfield)) {
        # the separator is used up even when this sub-field is skipped
        my $glue = shift @sep || ' ';
        next unless ($sf->{$code});
        $joined = $joined ? $joined . $glue . $sf->{$code} : $sf->{$code};
    }
    return $joined ? $prefix . $joined . $postfix : '';
}
382
383 #--------------------------------------------------------------------
384 #--------------------------------------------------------------------
385
386
# Main conversion: every ISIS record that has a title (field 200) is
# written to R as a bibliographic record and, as index entries that point
# back into that file, to both S and the MPS indexer pipe.

# offset in R at which the next record starts
my $last_tell=0;

my $db = OpenIsis::open( "$isis_data/$db_dir/LIBRI/LIBRI" );

my $max_rowid = OpenIsis::maxRowid( $db );

# last percentage reported to the indexer
my $last_pcnt = 0;

for (my $row_id = 1; $row_id <= $max_rowid; $row_id++ ) {
    my $row = OpenIsis::read( $db, $row_id );

    # records without a title are not converted
    next unless ($row->{'200'}->[0]);

    my $bib = "%MFN $row->{mfn}\n";
    my $mps = "";

    # progress message, once per percent
    my $pcnt = int($row->{mfn} * 100 / $max_rowid);
    if ($pcnt != $last_pcnt) {
        printf MPS ("M %5d / %5d -- %-2d %%\n",$row->{mfn},$max_rowid,$pcnt);
        $last_pcnt = $pcnt;
    }

    my $headline = "";
    $headline .= isis_sf($row,'200','a',"'");
    $headline .= isis_sf($row,'200','e'," : ","'");

    # author
    $bib .= isis_to_bib($row,'700','%700+','*',2,'<');
    $bib .= isis_to_bib($row,'701','%700+','*',2,'<');
    $bib .= isis_to_bib($row,'710','%700+','*',2,'<');
    $bib .= isis_to_bib($row,'711','%700+','*',2,'<');
    $bib .= isis_to_bib($row,'503','%700+','*',2,'<');

    $mps .= isis_to_mps($row,'700',1);
    $mps .= isis_to_mps($row,'701',1);
    $mps .= isis_to_mps($row,'710',1);
    $mps .= isis_to_mps($row,'711',1);
    $mps .= isis_to_mps($row,'503',1);
    $mps .= isis_to_mps($row,'702',1);

    $bib .= isis_to_bib($row,'205','%205');

    # title
    my $sf = OpenIsis::subfields($row->{'200'}->[0]);
    my $book = "";
    $book .= $sf->{a} if ($sf->{a});
    $book .= " ; ".$sf->{k} if ($sf->{k});
    $book .= " = ".$sf->{d} if ($sf->{d});
    $book .= " : ".$sf->{e} if ($sf->{e});
    $book .= " / ".$sf->{f} if ($sf->{f});
    $book .= " ; ".$sf->{g} if ($sf->{g});
    $book .= ". ".$sf->{c} if ($sf->{c});
    $book .= " / ".$sf->{x} if ($sf->{x});
    $book .= " ; ".$sf->{y} if ($sf->{y});
    $bib .= "%200 $book\n" if ($book);

    $mps .= isis_to_mps($row,'200',2,"akcde");
    $mps .= isis_to_mps($row,'532',2);
    $mps .= isis_to_mps($row,'424',2);

    $headline .= isis_sf($row,'700','b'," ");
    $headline .= isis_sf($row,'700','a'," ");

    # publisher
    $mps .= isis_to_mps($row,'210',3);
#   if (my $sf = OpenIsis::subfields($row->{'210'}->[0])) {
#       my $tmp;
#       $tmp .= $sf->{a} if ($sf->{a});
#       $tmp .= " : ".$sf->{c} if ($sf->{c});
#       $tmp .= ", ".$sf->{d} if ($sf->{d});
#       $bib .= "%210 $tmp\n" if ($tmp);
#   }
    $bib .= "%210 ".isis_sf($row,'210','acd', '','', ('',' : ',', ') )."\n";

    if (my $year = isis_sf($row,'210','d')) {
        $year =~ s/^\s*cop\.*\s*//i;
        $year =~ s/[\[\]]*//g;
        # only a year without "?" and with four digits in a row becomes a date
        $mps .= "D ${year}\n" if ($year !~ m/\?/ && $year =~ /\d{4}/);
        $headline .= " ($year)";
    }

    $bib .= isis_to_bib($row,'215','%215', '*', undef, undef, ', ');

#   $bib .= isis_to_bib($row,'225','%225', 'aehivw');
    $bib .= "%225 ".isis_sf($row,'225','aevhiw', '(',')', ('',' : ',' ; ','. ',', ',' ; '))."\n";
    $mps .= isis_to_mps($row,'225',4);

    $bib .= isis_to_bib($row,'300','%300+');
    $bib .= isis_to_bib($row,'320','%300+');
    $bib .= isis_to_bib($row,'327','%300+');
    $mps .= isis_to_mps($row,'300',5);
    $mps .= isis_to_mps($row,'320',5);
    $mps .= isis_to_mps($row,'327',5);

    $bib .= isis_to_bib($row,'330','%330');
    $mps .= isis_to_mps($row,'330',6);

    $bib .= isis_to_bib($row,'423','%423');
    $bib .= isis_to_bib($row,'464','%464');
    $mps .= isis_to_mps($row,'464',7);
    $bib .= isis_to_bib($row,'610','%610');
    $mps .= isis_to_mps($row,'610',8);

    $bib .= isis_to_bib($row,'675','%675+');
    $mps .= isis_to_mps($row,'675',9);
    $bib .= isis_to_bib($row,'686','%675+');
    $mps .= isis_to_mps($row,'686',10);

    $bib .= isis_to_bib($row,'990','%990');
    $mps .= isis_to_mps($row,'990',11);

    $bib .= isis_to_bib($row,'991','%991');
    $mps .= isis_to_mps($row,'991',12);

    # ISBN: indexed once with and once without dashes
    if (my $isbn = $row->{10}->[0]) {
        $isbn =~ s/ +//g;	# remove spaces
        $mps .= "W $isbn 13\n";
        $bib .= "%ISBN $isbn\n";
        $isbn =~ s/-//g;
        $mps .= "W $isbn 13\n";
    }
    # Field 10 is declared as index 13 (ISBN) in the stream header; this
    # call used to file it under 12, the inventory number index.
    $mps .= isis_to_mps($row,'10',13);

    $mps .= isis_to_mps($row,'532',1);

    $bib .= isis_to_bib($row,'994','%994a','a');

    # headline
    if ($headline) {
        $headline .= " [".$row->{mfn}."]";	## debug MFN!
        $mps .= "H ".c_852_iso($headline)."\n";
    } else {
        $mps .= "H nepoznato\n";
    }


    #if ($db_dir eq "sf") {
    #   print "MFN: $row->{mfn} ROW ID: $row_id\n";
    #   if ($row->{mfn} >= 146) {
    #       print Dumper($row);
    #   }
    #}

    print R c_852_iso($bib);

    # The "T" line carries length, file name, start offset and end offset
    # of the record just written to R.
    $mps .= "T document text/plain ".(tell(R) - $last_tell)." $dir/bib $last_tell ".tell(R)."\n";
    $last_tell=tell(R);

    # blank line between records; it is counted into the next record's span
    print R "\n";

    $mps .= "E\n";


    print S $mps;
    print MPS $mps;
}
print S "M over and out\nX\n";
print MPS "M over and out\nX\n";

# Buffered write errors only show up on close, and closing the pipe is
# what waits for the indexer and reports how it exited.
close(S) || warn "can't close $dir/stream: $!";
close(R) || warn "can't close $dir/bib: $!";
close(MPS) || warn "MPS indexer $mpsindex failed: " . ($! ? $! : "exit status $?");

  ViewVC Help
Powered by ViewVC 1.1.26