/[webpac-proto]/isis2stream.pl
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /isis2stream.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (hide annotations)
Fri Jun 14 18:24:05 2002 UTC (21 years, 9 months ago) by dpavlin
Branch: MAIN
Changes since 1.1: +1 -1 lines
File MIME type: text/plain
izbaceni interpunkcijski znakovi kod indeksiranja

1 dpavlin 1.1 #!/usr/bin/perl -w
2    
3     use strict;
4     use OpenIsis;
5     use Getopt::Std;
6     #use Data::Dumper;
7    
# --- configuration ----------------------------------------------------

# Root directory; one sub-directory per database is created below it.
my $install_dir="/local/index";
# Indexer command line. Per-run options are appended when the pipe is opened.
my $mpsindex="/local/mps-5.3/bin/mpsindex -l 9 -b";
# Directory that holds the ISIS databases.
my $isis_data="/var/autofs/misc/isis_data/";
#my $isis_data="/mnt/20020606/Isis/Data/"; # at home

my %opts;

# Both -d and -D take an argument; only -d is read below.
getopt('dD', \%opts);

die "usage: $0 -d [database_dir] " if (! $opts{d});

my $db_dir = $opts{d};

# All generated files (stream, bib) live in this directory.
my $dir="$install_dir/$db_dir/data";

# Create the output directories, parent first, and stop with a clear
# message if that fails instead of failing later on open().
foreach my $path ("$install_dir/$db_dir", $dir) {
    next if (-e $path);
    mkdir($path, 0777) || die "can't create directory $path: $!";
}

# S   - copy of everything that is sent to the indexer
# R   - bibliographic records; the stream refers to them by byte offset
# MPS - pipe into the indexer itself
open(S, '>', "$dir/stream") || die "can't open output $dir/stream: $!";
open(R, '>', "$dir/bib") || die "can't open output $dir/bib: $!";
open(MPS,"| $mpsindex -d $install_dir/$db_dir -autokey") || die "can't start MPS indexer $mpsindex: $!";
#open(MPS,"> /tmp/mpsindex") || die "mps: $!";

# Stream header. NOTE(review): presumably a version line (V), a language
# line (L) and one field declaration (F) per "tag id name" -- confirm
# against the MPS stream format documentation.
my $s="V 5 3
L hr-HR
F 700+ 1 Autor
F 200+ 2 Naslov
F 210 3 Izdavanje
F 225 4 Nakladnièka cjelina
F 300+ 5 Napomene
F 330 6 Sadr¾aj
F 464 7 Analitièki radovi
F 610 8 Kljuène rijeèi
F 675 9 UDK
F 686 10 CC
F 990 11 Signatura
F 991 12 Inventarni broj
F 10 13 ISBN
";

print S $s;
print MPS $s;
51    
#
# expand(nr,"space separated string"[, ...]);
#
# Returns one "W <word> <nr>" line for every whitespace-separated word of
# every string argument. Each string is converted and folded to plain
# ASCII by c_852_czs() first. Processing stops at the first argument that
# is false (undef, "" or "0").
#
# The folding used to be a private copy of the table in c_852_czs(). That
# copy contained an unescaped "'-\"" in the replacement list, which Perl
# rejects as an invalid range, so the shared helper is used instead. None
# of the folded characters is whitespace, so folding before splitting gives
# the same words as folding each word afterwards.

sub expand {
    my $nr = shift @_;
    my $out = "";
    while (my $fld = shift @_) {
        # FIX: this should be replaced by stemmer!
        foreach my $w (split(/\s+/, c_852_czs($fld))) {
            $out .= "W $w $nr\n";
        }
    }
    return $out;
}
71    
72     #--------------------------------------------------------------------
73    
74     ################### ERASE###############3
75    
# Legacy sub-field helpers. They all work on the file-level %data hash,
# which ex_sf() fills from one ISIS field value.

# sub-fields of the field most recently passed to ex_sf(): code => value
my %data; # FIX

# ex_sf(field_value)
#
# Expand sub-fields from an ISIS field (^a.....^b....) into %data.
# A value that does not start with "^" is stored whole under the key
# "all", followed by the marker text "<-- iz polja bez podpolja"
# ("from a field without sub-fields"). A false value just empties %data.
#
# The sub-field code is the single character after "^". The old pattern
# matched the code greedily, so "aFoo" was stored as "aFo" => "o".
# No character-set conversion is done here; the old code converted a
# copy of the value and then never used it.
sub ex_sf {
    %data = ();
    my $in = $_[0];
    return unless ($in);

    if ($in =~ m/^\^/) {
        foreach my $fld (split(/\^/,$in)) {
            $data{$1} = $2 if ($fld =~ m/^(\w)(.+)$/);
        }
    } else {
        $data{all} = $in."<-- iz polja bez podpolja";
    }
    return;
}

# Keys of %data, sorted ascending, or descending when the flag is true.
sub _sf_keys {
    my ($descending) = @_;
    my @keys = sort keys %data;
    @keys = reverse @keys if ($descending);
    return @keys;
}

# Shared body of all_sf() and all_sf_r().
sub _all_sf {
    my ($nr, $descending) = @_;
    my $out = "";
    foreach my $k (_sf_keys($descending)) {
        $out .= expand($nr,$data{$k});
    }
    return $out;
}

# dump all sub-fields as index words, sub-field codes ascending
sub all_sf {
    my $nr = shift @_;
    return _all_sf($nr, 0);
}

# same as all_sf(), sub-field codes descending
sub all_sf_r {
    my $nr = shift @_;
    return _all_sf($nr, 1);
}

# Shared body of all_sf2bib() and all_sf2bib_r().
#
# Joins the sub-field values with $sep and writes them as "<nr> <values>"
# lines, at most $max_in_line values per line (0 means everything on one
# line). Values are collected in a list, so a value of "0" is kept; the
# old string accumulator lost it.
sub _all_sf2bib {
    my ($nr, $max_in_line, $sep, $descending) = @_;
    $max_in_line ||= 0;
    $sep ||= ' ';

    my $bib = "";
    my @group;
    foreach my $k (_sf_keys($descending)) {
        push @group, $data{$k};
        if (@group == $max_in_line) {
            $bib .= $nr." ".join($sep, @group)."\n";
            @group = ();
        }
    }
    $bib .= $nr." ".join($sep, @group)."\n" if (@group);
    return $bib;
}

# all_sf2bib(nr[, max_in_line[, separator]]) -- sub-field codes ascending
sub all_sf2bib {
    my $nr = shift @_;
    my $max_in_line = shift @_;
    my $sep = shift @_;
    return _all_sf2bib($nr, $max_in_line, $sep, 0);
}

# all_sf2bib_r(nr[, max_in_line[, separator]]) -- sub-field codes descending
sub all_sf2bib_r {
    my $nr = shift @_;
    my $max_in_line = shift @_;
    my $sep = shift @_;
    return _all_sf2bib($nr, $max_in_line, $sep, 1);
}
162    
163     #--------------------------------------------------------------------
164    
# c_852_iso(string)
#
# Converts a byte string from code page 852 (the encoding the function
# name and the old table point to) to ISO-8859-2 and returns the result.
# A false argument (undef, "" or "0") is returned unchanged.
#
# The conversion used to be a literal 128-character tr/// table. That table
# only works while this source file keeps its original 8-bit encoding, and
# it contains unprintable bytes, so the core Encode module is used instead.
# NOTE(review): CP852 characters that have no ISO-8859-2 equivalent (for
# example box-drawing characters) now become "?" rather than the byte the
# old table assigned to them.
sub c_852_iso {
    my $tmp = $_[0];
    return $tmp unless ($tmp);
    require Encode;
    Encode::from_to($tmp, 'cp852', 'iso-8859-2');
    return $tmp;
}

# c_852_czs(string)
#
# Converts a CP852 byte string to ISO-8859-2 (see c_852_iso) and then folds
# the accented ISO-8859-2 letters and spacing accents to plain ASCII;
# sharp s becomes "ss". Returns "" for an undefined argument.
#
# The search lists are written as hex escapes so that they do not depend on
# the encoding of this file. The table is split into several tr/// calls
# only for readability: the search sets are disjoint and every replacement
# is ASCII, so the order of the calls does not matter.
sub c_852_czs {
    my $tmp = c_852_iso($_[0]);
    $tmp = "" unless (defined $tmp);

    $tmp =~ tr/\xC7\xFC\xE9\xE2\xE4\xF9\xE6\xE7\xB3\xEB\xD5\xF5\xEE\xAC\xC4\xC6/CueaauccleOoiZAC/;
    $tmp =~ tr/\xC9\xC5\xE5\xF4\xF6\xA5\xB5\xA6\xB6\xD6\xDC\xAB\xBB\xA3\xE8/ELlooLlSsOUTtLc/;
    $tmp =~ tr/\xE1\xED\xF3\xFA\xA1\xB1\xAE\xBE\xCA\xEA\xBC\xC8\xBA/aiouAaZzEezCs/;
    $tmp =~ tr/\xC1\xC2\xCC\xAA\xAF\xBF\xC3\xE3\xF0\xD0\xCF\xCB\xEF\xD2\xCD\xCE\xEC/AAESZzAadDDEdNIIe/;
    $tmp =~ tr/\xDE\xD9\xD3\xD4\xD1\xF1\xF2\xA9\xB9\xC0\xDA\xE0\xDB\xFD\xDD\xFE/TUOoNnnSsRUrUyYt/;
    # The hyphen must be escaped: the old list had '-" unescaped, which
    # Perl reads as a descending range and refuses to compile.
    $tmp =~ tr/\xB4\xAD\xBD\xB2\xB7\xA2\xB8\xA8\xFF\xD8\xF8/'\-".'',"'Rr/;
    $tmp =~ s/\xDF/ss/g;
    return $tmp;
}
178    
179     #--------------------------------------------------------------------
#
# mps_expand(nr,"space separated string"[, ...]);
#
# Returns one "W <word> <nr>" line per word of every string argument,
# folded to plain ASCII by c_852_czs(). Punctuation (, ; . ! ? ' " < > [ ])
# that directly precedes whitespace is removed; punctuation at the very end
# of a string has no whitespace after it and is therefore kept, as before.
# Processing stops at the first argument that is false (undef, "" or "0").
#
# Empty words are skipped: a string with leading whitespace used to produce
# a "W  <nr>" record without a term.
#

sub mps_expand {
    my $nr = shift @_;
    my $out = "";
    while (my $fld = shift @_) {
        # Strip punctuation in front of whitespace and collapse the
        # whitespace to a single blank. A string without whitespace is left
        # as it is and becomes a single word.
        $fld =~ s/\s*[,;\.!?'"<>\[\]]*\s+/ /g;
        foreach my $w (grep { length } split(/\s+/,$fld)) {
            # FIX: this should be replaced by stemmer!
            $out .= "W $w $nr\n";
        }
    }
    return c_852_czs($out);
}
199    
200     #--------------------------------------------------------------------
201    
# @codes = subfields_str_2_arr("subfields"[, $subfield_hash]);
#
# Turns a sub-field specification into a list of sub-field codes.
#
# subfields options:
# *      - all codes of the hash (no sort)
# >      - all codes of the hash, sort ascending
# <      - all codes of the hash, sort descending
# abc    - the listed codes, in the given order
# abc>   - the listed codes, sort ascending
# abc<   - the listed codes, sort descending
#
# Returns an empty list when the specification is undefined or empty.
# A specification of "0" is a valid single code; it used to be treated as
# "nothing given". The hash is optional; without it the three "all" forms
# return an empty list instead of dying on an undefined reference.

sub subfields_str_2_arr {
    my $subfields = shift @_;
    my $sf_hash = shift @_ || {};
    my @sf_arr;

    return unless (defined $subfields && length $subfields);

    if ($subfields eq "*") {
        @sf_arr = keys %{$sf_hash};
    } elsif ($subfields eq ">") {
        @sf_arr = sort keys %{$sf_hash};
    } elsif ($subfields eq "<") {
        @sf_arr = sort {$b cmp $a} keys %{$sf_hash};
    } elsif ($subfields =~ s/>//) {
        # the sort marker has been removed; what is left are the codes
        @sf_arr = sort split(//,$subfields);
    } elsif ($subfields =~ s/<//) {
        @sf_arr = sort {$b cmp $a} split(//,$subfields);
    } else {
        @sf_arr = split(//,$subfields);
    }
    return @sf_arr;
}
229    
230    
# $mps .= sf_to_mps($subfield_hash,"subfields",mps_id);
#
# Index lines for the selected sub-fields of one field occurrence: the
# selection is resolved by subfields_str_2_arr() and every selected value
# is expanded by mps_expand() under the given MPS field id.
sub sf_to_mps {
    my ($sf_hash,$subfields,$mps_id) = @_;
    return join("",
        map { mps_expand($mps_id,$sf_hash->{$_}) }
            subfields_str_2_arr($subfields,$sf_hash));
}
241    
242     #--------------------------------------------------------------------
243    
# $mps .= isis_to_mps ($row,isis_id,mps_id[,"subfields"])
#
# Index lines ("W <word> <mps_id>") for every occurrence of ISIS field
# isis_id in $row. When the occurrence has sub-fields, only the ones named
# in "subfields" are indexed (all of them when it is omitted); the
# specification is resolved by sf_to_mps(), so the "*", ">" and "<" forms
# are accepted as well. An occurrence without sub-fields is indexed whole.
#
# Dies when row, isis_id or mps_id is missing.
#
# Occurrences are walked as a list. The old index loop stopped at the first
# occurrence that was empty or "0" and created an empty entry in $row for
# every field it looked up.

sub isis_to_mps {
    my $row = shift @_ || die;
    my $isis_id = shift @_ || die;
    my $mps_id = shift @_ || die;
    my $subfields = shift @_;

    my $out = "";

    foreach my $value (@{ $row->{$isis_id} || [] }) {
        next unless (defined $value && length $value);

        my $sf_hash = OpenIsis::subfields($value);

        if (scalar keys %{$sf_hash} > 0) {
            $out .= sf_to_mps($sf_hash, $subfields || '*', $mps_id);
        } else {
            # no subfields, index the whole value
            $out .= mps_expand($mps_id,$value);
        }
    }
    return $out;
}
275    
276     #--------------------------------------------------------------------
277    
# bib_grp($sf_hash, bib_id, separator, code, code, ...)
#
# One "<bib_id> <values>" line built from the listed sub-field codes, in
# the order given; codes that are not defined in the hash are skipped and
# the remaining values are joined with the separator. Returns "" when there
# is no hash or nothing to print.
#
# Used to be declared inside isis_to_bib(); a named sub is global wherever
# it is written, so it is declared at file level where that is visible.
# A leading value of "0" is kept; the old truthiness test overwrote it.
sub bib_grp {
    my $sf_hash = shift @_ || return "";
    my $bib_id = shift @_;
    my $sep = shift @_;

    my $bib_grp = "";
    foreach my $code (@_) {
        next if (! defined $sf_hash->{$code});
        $bib_grp .= $sep if (length $bib_grp);
        $bib_grp .= $sf_hash->{$code};
    }
    return length($bib_grp) ? "$bib_id $bib_grp\n" : "";
}

# $bib .= isis_to_bib ($row,isis_id,bib_id,"subfields",[,group size][,"group sort"][,"separator"])
#
# Bibliographic lines for every occurrence of ISIS field isis_id in $row.
#
# fields:
# * - all (no sort)
# > - all, sort ascending
# < - all, sort descending
#
# Defaults: subfields "*", group size 0 (all selected sub-fields on one
# line), group sort "" and separator " ". With a group size the selected
# codes are cut into groups of that many codes, each group is ordered by
# "group sort" and printed on its own line. An occurrence without
# sub-fields is printed whole. Dies when row, isis_id or bib_id is missing.
#
# Occurrences are walked as a list. The old index loop stopped at the first
# occurrence that was empty or "0".

sub isis_to_bib {
    my $row = shift @_ || die;
    my $isis_id = shift @_ || die;
    my $bib_id = shift @_ || die;
    my $subfields = shift @_ || '*';
    my $group_size = shift @_ || 0;
    my $group_sort = shift @_ || '';
    my $sep = shift @_ || ' ';

    my $bib="";

    foreach my $value (@{ $row->{$isis_id} || [] }) {
        next unless (defined $value && length $value);

        my $sf_hash = OpenIsis::subfields($value);

        if (scalar keys %{$sf_hash} == 0) {
            # no subfields, use just value!
            $bib .= "$bib_id $value\n";
            next;
        }

        my @codes = subfields_str_2_arr($subfields,$sf_hash);

        if ($group_size) {
            # Take the codes group by group. The group sort marker is
            # appended to the group so that subfields_str_2_arr() orders it.
            while (my @grp = splice(@codes, 0, $group_size)) {
                $bib .= bib_grp( $sf_hash, $bib_id, $sep, subfields_str_2_arr(join("",@grp).$group_sort) );
            }
        } else {
            $bib .= bib_grp( $sf_hash, $bib_id, $sep, @codes );
        }
    }
    return $bib;
}
346    
347     #--------------------------------------------------------------------
348    
# isis_sf($row, isis_id[, "subfields"[, prefix[, postfix[, separators...]]]])
#
# Text of one or more sub-fields of the FIRST occurrence of ISIS field
# isis_id, wrapped in prefix and postfix. Returns '' when the field or all
# requested sub-fields are missing.
#
# "subfields" defaults to 'a'. With more than one code the values that are
# present are joined in the given order. The separators are taken in step
# with the codes (first separator belongs to the first code and so on,
# whether or not that code is present), and a missing or empty separator
# means a single blank. The separator of the first value printed is unused.
#
# Sub-field values are tested for presence, not for truth, so a value of
# "0" is returned instead of being treated as missing.
sub isis_sf {
    my $row = shift @_;
    my $isis_id = shift @_;
    my $subfield = shift @_ || 'a';
    my $prefix = shift @_ || '';
    my $postfix = shift @_ || '';

    my @sep = @_; # rest are separators

    # look the field up without creating it in $row
    my $field = $row->{$isis_id} ? $row->{$isis_id}->[0] : undef;
    return '' unless (defined $field && length $field);

    my $sf = OpenIsis::subfields($field);

    if (length($subfield) == 1) {
        my $value = $sf->{$subfield};
        return '' unless (defined $value && length $value);
        return $prefix . $value . $postfix;
    }

    my $out = '';
    foreach my $code (split(//,$subfield)) {
        my $sep = shift @sep || ' ';
        my $value = $sf->{$code};
        next unless (defined $value && length $value);
        $out .= $sep if (length $out);
        $out .= $value;
    }
    return '' unless (length $out);
    return $prefix . $out . $postfix;
}
382    
383     #--------------------------------------------------------------------
384     #--------------------------------------------------------------------
385    
386    
# --- main export loop -------------------------------------------------
#
# Reads every row of the LIBRI database. For each record with a title
# (field 200) it writes
#   - the bibliographic record to R ($dir/bib), and
#   - the index entries for it to S ($dir/stream) and to the indexer (MPS).
# The "T" line of each record points at its byte range inside $dir/bib.

# offset in R at which the next record starts
my $last_tell=0;

my $db = OpenIsis::open( "$isis_data/$db_dir/LIBRI/LIBRI" );

my $max_rowid = OpenIsis::maxRowid( $db );

# last percentage reported, so progress is printed once per percent
my $last_pcnt = 0;

for (my $row_id = 1; $row_id <= $max_rowid; $row_id++ ) {
    my $row = OpenIsis::read( $db, $row_id );

    # records without a title are not exported
    next unless ($row->{'200'}->[0]);

    my $bib = "%MFN $row->{mfn}\n";
    my $mps;

    my $pcnt = int($row->{mfn} * 100 / $max_rowid);
    if ($pcnt != $last_pcnt) {
        printf MPS ("M %5d / %5d -- %-2d %%\n",$row->{mfn},$max_rowid,$pcnt);
        $last_pcnt = $pcnt;
    }

    my $headline;
    $headline .= isis_sf($row,'200','a',"'");
    $headline .= isis_sf($row,'200','e'," : ","'");

    # author
    $bib .= isis_to_bib($row,'700','%700+','*',2,'<');
    $bib .= isis_to_bib($row,'701','%700+','*',2,'<');
    $bib .= isis_to_bib($row,'710','%700+','*',2,'<');
    $bib .= isis_to_bib($row,'711','%700+','*',2,'<');
    $bib .= isis_to_bib($row,'503','%700+','*',2,'<');

    $mps .= isis_to_mps($row,'700',1);
    $mps .= isis_to_mps($row,'701',1);
    $mps .= isis_to_mps($row,'710',1);
    $mps .= isis_to_mps($row,'711',1);
    $mps .= isis_to_mps($row,'503',1);
    $mps .= isis_to_mps($row,'702',1);

    $bib .= isis_to_bib($row,'205','%205');

    # title
    my $sf = OpenIsis::subfields($row->{'200'}->[0]);
    my $book;
    $book .= $sf->{a} if ($sf->{a});
    $book .= " ; ".$sf->{k} if ($sf->{k});
    $book .= " = ".$sf->{d} if ($sf->{d});
    $book .= " : ".$sf->{e} if ($sf->{e});
    $book .= " / ".$sf->{f} if ($sf->{f});
    $book .= " ; ".$sf->{g} if ($sf->{g});
    $book .= ". ".$sf->{c} if ($sf->{c});
    $book .= " / ".$sf->{x} if ($sf->{x});
    $book .= " ; ".$sf->{y} if ($sf->{y});
    $bib .= "%200 $book\n" if ($book);

    $mps .= isis_to_mps($row,'200',2,"akcde");
    $mps .= isis_to_mps($row,'532',2);
    $mps .= isis_to_mps($row,'424',2);

    $headline .= isis_sf($row,'700','b'," ");
    $headline .= isis_sf($row,'700','a'," ");

    # publisher
    $mps .= isis_to_mps($row,'210',3);
    # The line is written only when there is something to print, as the
    # hand-written version this call replaced did ("... if ($tmp)").
    my $publisher = isis_sf($row,'210','acd', '','', ('',' : ',', ') );
    $bib .= "%210 $publisher\n" if ($publisher ne '');

    if (my $year = isis_sf($row,'210','d')) {
        $year =~ s/^\s*cop\.*\s*//i;
        $year =~ s/[\[\]]*//g;
        # only a certain, four-digit year is given to the indexer as date
        $mps .= "D ${year}\n" if ($year !~ m/\?/ && $year =~ /\d{4}/);
        $headline .= " ($year)";
    }

    $bib .= isis_to_bib($row,'215','%215', '*', undef, undef, ', ');

    # $bib .= isis_to_bib($row,'225','%225', 'aehivw');
    # as for %210: no empty "%225 " line when the field is missing
    my $series = isis_sf($row,'225','aevhiw', '(',')', ('',' : ',' ; ','. ',', ',' ; '));
    $bib .= "%225 $series\n" if ($series ne '');
    $mps .= isis_to_mps($row,'225',4);

    $bib .= isis_to_bib($row,'300','%300+');
    $bib .= isis_to_bib($row,'320','%300+');
    $bib .= isis_to_bib($row,'327','%300+');
    $mps .= isis_to_mps($row,'300',5);
    $mps .= isis_to_mps($row,'320',5);
    $mps .= isis_to_mps($row,'327',5);

    $bib .= isis_to_bib($row,'330','%330');
    $mps .= isis_to_mps($row,'330',6);

    $bib .= isis_to_bib($row,'423','%423');
    $bib .= isis_to_bib($row,'464','%464');
    $mps .= isis_to_mps($row,'464',7);
    $bib .= isis_to_bib($row,'610','%610');
    $mps .= isis_to_mps($row,'610',8);

    $bib .= isis_to_bib($row,'675','%675+');
    $mps .= isis_to_mps($row,'675',9);
    $bib .= isis_to_bib($row,'686','%675+');
    $mps .= isis_to_mps($row,'686',10);

    $bib .= isis_to_bib($row,'990','%990');
    $mps .= isis_to_mps($row,'990',11);

    $bib .= isis_to_bib($row,'991','%991');
    $mps .= isis_to_mps($row,'991',12);

    # ISBN: indexed with and without hyphens
    if (my $isbn = $row->{10}->[0]) {
        $isbn =~ s/ +//g; # remove spaces
        $mps .= "W $isbn 13\n";
        $bib .= "%ISBN $isbn\n";
        $isbn =~ s/-//g;
        $mps .= "W $isbn 13\n";
    }
    # Field 10 belongs to index field 13 (declared as "F 10 13 ISBN" in the
    # stream header, and used by the two lines above). It used to be sent
    # to 12, which is the inventory number.
    $mps .= isis_to_mps($row,'10',13);

    # NOTE(review): 532 is also indexed as title (2) above -- confirm that
    # indexing it as author (1) too is intended.
    $mps .= isis_to_mps($row,'532',1);

    $bib .= isis_to_bib($row,'994','%994a','a');

    # headline
    if ($headline) {
        $headline .= " [".$row->{mfn}."]"; ## debug MFN!
        $mps .= "H ".c_852_iso($headline)."\n";
    } else {
        $mps .= "H nepoznato\n";
    }


    #if ($db_dir eq "sf") {
    #    print "MFN: $row->{mfn} ROW ID: $row_id\n";
    #    if ($row->{mfn} >= 146) {
    #        print Dumper($row);
    #    }
    #}

    print R c_852_iso($bib);

    # length, file, start offset and end offset of the record just written
    $mps .= "T document text/plain ".(tell(R) - $last_tell)." $dir/bib $last_tell ".tell(R)."\n";
    $last_tell=tell(R);

    # blank line between records; not part of the byte range above
    print R "\n";

    $mps .= "E\n";


    print S $mps;
    print MPS $mps;
}
print S "M over and out\nX\n";
print MPS "M over and out\nX\n";

# Buffered write errors only show up at close, and closing the pipe waits
# for the indexer, so its failure is reported here. These are warnings,
# not fatal errors, because all output has already been produced.
close(S) || warn "error closing $dir/stream: $!";
close(R) || warn "error closing $dir/bib: $!";
close(MPS) || warn "MPS indexer failed: " . ($! ? $! : "exit status $?");

  ViewVC Help
Powered by ViewVC 1.1.26