/[webpac-proto]/isis2stream.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /isis2stream.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (show annotations)
Mon Jun 24 13:46:53 2002 UTC (21 years, 9 months ago) by dpavlin
Branch: MAIN
Changes since 1.9: +20 -7 lines
File MIME type: text/plain
headline fix

1 #!/usr/bin/perl -w
2
3 use strict;
4 use OpenIsis;
5 use Getopt::Std;
6 use Data::Dumper;
7 use common;
8
my %opts;

# both -d and -m take an argument; getopt() stores the values in %opts
getopt('dm', \%opts);

die "usage: $0 -d [database_dir] -m [database1,database2] " if (! %opts);

my $db_dir = $opts{d};

# create the output tree; report a failed mkdir here instead of letting it
# surface later as a confusing "can't open output" error
foreach my $path ("$common::install_dir/$db_dir", "$common::install_dir/$db_dir/data") {
    next if (-e $path);
    mkdir($path) || die "can't create directory $path: $!";
}

my $dir = "$common::install_dir/$db_dir/data";

# Output handles are kept as barewords because the rest of the script
# refers to them as S, R and MPS.  Three-argument open keeps the mode
# separate from the (interpolated) path.
#   S   - copy of everything sent to the indexer
#   R   - bibliographic records ("bib" file)
#   MPS - pipe into the MPS indexer (the command string is still run by
#         the shell, exactly as before)
open(S, '>', "$dir/stream") || die "can't open output $dir/stream: $!";
open(R, '>', "$dir/bib") || die "can't open output $dir/bib: $!";
open(MPS, '|-', "$common::mpsindex -d $common::install_dir/$db_dir -autokey") || die "can't start MPS indexer $common::mpsindex: $!";
#open(MPS,"> /tmp/mpsindex") || die "mps: $!";

print S $common::mps_header;
print MPS $common::mps_header;

#--------------------------------------------------------------------
# init hash in_mps_header for config checks later
#
# Every "F ..." line of the MPS header is split on single spaces; the
# third token becomes the key and the second token the value.

my %in_mps_header;
foreach my $header_line (split(/\n/, $common::mps_header)) {
    next unless ($header_line =~ /^F /);
    my (undef, $isis, $mps, undef) = split(/ /, $header_line, 4);
    $in_mps_header{$mps} = $isis;
}
require "./search/config.pm";

#--------------------------------------------------------------------
# read database configuration, store database names
#
# Lines of the form "database-name:<name>=<description>" fill
# %DatabaseDescriptions; a leading "##word##" marker is stripped from the
# description.
open(my $cf, '<', $common::database_cf) || die "$common::database_cf: $!";
my %DatabaseDescriptions;
while (my $cf_line = <$cf>) {
    chomp $cf_line;
    if ($cf_line =~ /^database-name:([^=]+)=(.*)$/) {
        my ($db_name, $db_desc) = ($1, $2);
        $db_desc =~ s/^##\w+##//g;
        $DatabaseDescriptions{$db_name} = $db_desc;
    }
}
close($cf);
56
57 #--------------------------------------------------------------------
58 #
59 # expand(nr,"space separated string");
60 #
61
# expand($nr, @fields)
#
# Turns every whitespace-separated word of every field into an MPS word
# record "W <word> <nr>\n" and returns the concatenated records.
#
#   $nr     - MPS field number; must be a key of %in_mps_header, else die
#   @fields - strings to index; undefined and empty values are skipped
#
# Each field is passed through c_852_czs(), which applies the same two
# transliteration tables and the sharp-s replacement that used to be
# duplicated here.
#
# FIX: this should be replaced by stemmer!
sub expand {
    my $nr = shift @_;
    die "$nr is not in mps_header" if (!$in_mps_header{$nr});

    my $out = "";
    foreach my $fld (@_) {
        # A missing or empty value must not end the loop: the old
        # truthiness test also discarded "0" and every argument after
        # the first false one.
        next if (!defined $fld || $fld eq '');

        # split ' ' skips leading whitespace, so no empty word is emitted
        foreach my $w (split(' ', c_852_czs($fld))) {
            $out .= "W $w $nr\n";
        }
    }
    return $out;
}
78
79 #--------------------------------------------------------------------
80
# c_852_iso($string)
#
# Returns a copy of $string with every character listed in the first half
# of the tr/// table replaced by the character at the same position in the
# second half.  The table is skipped when the value is false (undef, ''
# or '0'), so such values are returned unchanged.
# NOTE(review): the name suggests a CP852 -> ISO 8859-2 conversion; the
# table itself has not been verified against those code pages - confirm.
# The table contains non-printable bytes; do not retype it by hand.
81 sub c_852_iso {
82 my $tmp = $_[0];
83 $tmp =~ tr/€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúüýþÿ/Çüéâäùæç³ëÕõî¬ÄÆÉÅåôö¥µ¦¶ÖÜ«»£×èáíóú¡±®¾ÊꔼȺ—„•’ˆƒÁÂ̪‡€œ›¯¿‹Ÿ–˜Ž‘Ã㍐…†“™ž¤ðÐÏËïÒÍÎ슂‰šÞٝÓßÔÑñò©¹ÀÚàÛýÝþ´­½²·¢§÷¸°¨ÿØøŒ / if ($tmp);
84 return $tmp;
85 }
86
# c_852_czs($string)
#
# Returns a copy of $string after three steps:
#   1. the same table as in c_852_iso(), applied unconditionally here
#   2. a second table whose replacement list consists only of ASCII
#      letters and punctuation
#   3. every sharp s replaced by "ss"
# The tables contain non-printable bytes; do not retype them by hand.
87 sub c_852_czs {
88 my $tmp = $_[0];
89 $tmp =~ tr/€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúüýþÿ/Çüéâäùæç³ëÕõî¬ÄÆÉÅåôö¥µ¦¶ÖÜ«»£×èáíóú¡±®¾ÊꔼȺ—„•’ˆƒÁÂ̪‡€œ›¯¿‹Ÿ–˜Ž‘Ã㍐…†“™ž¤ðÐÏËïÒÍÎ슂‰šÞٝÓßÔÑñò©¹ÀÚàÛýÝþ´­½²·¢§÷¸°¨ÿØøŒ /;
90 $tmp =~ tr/Çüéâäùæç³ëÕõî¬ÄÆÉÅåôö¥µ¦¶ÖÜ«»£èáíóú¡±®¾Êê¼ÈºÁÂ̪¯¿ÃãðÐÏËïÒÍÎìÞÙÓÔÑñò©¹ÀÚàÛýÝþ´­½²·¢¸¨ÿØø/CueaauccleOoiZACELlooLlSsOUTtLcaiouAaZzEezCsAAESZzAadDDEdNIIeTUOoNnnSsRUrUyYt'-".'',"'Rr/;
91 $tmp =~ s/ß/ss/g;
92 return $tmp;
93 }
94
95 #--------------------------------------------------------------------
96
97 # $mps .= sf_to_mps($subfield_hash,"subfields",$mps_id)
98 #
99 # subfields options:
100 # * - all (no sort)
101 # > - all, sort ascending
102 # < - all, sort descending
103
# subfields_str_2_arr($subfields, $sf_hash)
#
# Expands a subfield specification into a list of subfield codes.
#
#   $subfields - one of
#                  "*"    all keys of %$sf_hash, unsorted
#                  ">"    all keys of %$sf_hash, sorted ascending
#                  "<"    all keys of %$sf_hash, sorted descending
#                  "abc>" the listed codes, sorted ascending
#                  "abc<" the listed codes, sorted descending
#                  "abc"  the listed codes in the given order
#   $sf_hash   - hash reference of subfields; only consulted for the
#                three "all" forms
#
# Returns the list of codes, or nothing when $subfields is undefined or
# empty.  The specification "0" is a valid single code and is no longer
# mistaken for a missing argument.
sub subfields_str_2_arr {
    my ($subfields, $sf_hash) = @_;
    return unless (defined $subfields && length $subfields);

    my @sf_arr;

    if ($subfields eq "*") {
        @sf_arr = keys %{$sf_hash};
    } elsif ($subfields eq ">") {
        @sf_arr = sort keys %{$sf_hash};
    } elsif ($subfields eq "<") {
        @sf_arr = sort { $b cmp $a } keys %{$sf_hash};
    } elsif ($subfields =~ s/>//) {
        # the sort marker is removed, the remaining characters are codes
        @sf_arr = sort split(//, $subfields);
    } elsif ($subfields =~ s/<//) {
        @sf_arr = sort { $b cmp $a } split(//, $subfields);
    } else {
        @sf_arr = split(//, $subfields);
    }
    return @sf_arr;
}
124
125
# sf_to_mps($sf_hash, $subfields, $mps_id)
#
# Returns the concatenated mps_expand() output for the subfields of
# %$sf_hash selected by the $subfields specification (as understood by
# subfields_str_2_arr()).  Dies when $mps_id is not a key of
# %in_mps_header.
# NOTE(review): mps_expand() is not defined in the visible part of this
# file (only expand() is) - confirm where it comes from.
sub sf_to_mps {
    my ($sf_hash, $subfields, $mps_id) = @_;

    die "$mps_id is not in mps_header" if (!$in_mps_header{$mps_id});

    # scalar() keeps the call context the same as the original ".="
    return join("",
        map { scalar mps_expand($mps_id, $sf_hash->{$_}) }
            subfields_str_2_arr($subfields, $sf_hash));
}
137
138 #--------------------------------------------------------------------
139
140 # $mps .= isis_to_mps ($row,isis_id,mps_id[,"subfields"])
141
# isis_to_mps($row, $isis_id, $mps_id [, $subfields])
#
# Returns mps_expand() output for every repetition of field $isis_id in
# $row; processing stops at the first false repetition.
#
#   $row       - record hash reference (field id => array of repetitions)
#   $isis_id   - field id to read
#   $mps_id    - MPS field number; must be a key of %in_mps_header
#   $subfields - optional string of subfield codes; when omitted, all
#                subfields returned by OpenIsis::subfields() are used
#
# A repetition for which OpenIsis::subfields() yields no keys is passed
# to mps_expand() as a whole.
# NOTE(review): mps_expand() is not defined in the visible part of this
# file (only expand() is) - confirm where it comes from.
sub isis_to_mps {
    my ($row, $isis_id, $mps_id, $subfields) = @_;
    die unless ($row);
    die unless ($isis_id);
    die unless ($mps_id);

    die "$mps_id is not in mps_header" if (!$in_mps_header{$mps_id});

    my $out = "";

    for (my $i = 0; $row->{$isis_id}->[$i]; $i++) {
        my $sf_hash = OpenIsis::subfields($row->{$isis_id}->[$i]);
        my @codes = keys %{$sf_hash};

        if (!@codes) {
            # no subfields at all: index the raw value
            $out .= mps_expand($mps_id, $row->{$isis_id}->[$i]);
            next;
        }

        # an explicit subfield list overrides "everything present"
        @codes = split(//, $subfields) if ($subfields);

        foreach (@codes) {
            $out .= mps_expand($mps_id, $sf_hash->{$_});
        }
    }
    return $out;
}
173
174 #--------------------------------------------------------------------
175
176 # $bib .= isis_to_bib ($row,isis_id,bib_id,"subfields",[,group size][,"group sort"][,"separator"])
177 #
178 # fields:
179 # * - all (no sort)
180 # > - all, sort ascending
181 # < - all, sort descending
182 #
183
# bib_grp($sf_hash, $bib_id, $sep, @codes)
#
# Helper for isis_to_bib(): joins the defined values of the subfields
# @codes of %$sf_hash with $sep and returns "$bib_id <joined>\n", or ""
# when $sf_hash is false or nothing was collected.
#
# While the collected text is still empty, the next value replaces it
# instead of being appended.  The value "0" counts as text (it used to be
# overwritten or dropped because it is false).
#
# Defined at file level: a named sub nested inside another sub is global
# anyway, and nesting only hides that fact.
sub bib_grp {
    my $sf_hash = shift @_ || return "";
    my $bib_id = shift @_;
    my $sep = shift @_;

    my $bib_grp;
    foreach my $code (@_) {
        next if (! defined $sf_hash->{$code});
        if (defined $bib_grp && length $bib_grp) {
            $bib_grp .= $sep . $sf_hash->{$code};
        } else {
            $bib_grp = $sf_hash->{$code};
        }
    }

    return "" unless (defined $bib_grp && length $bib_grp);
    return "$bib_id $bib_grp\n";
}

# isis_to_bib($row, $isis_id, $bib_id [, $subfields [, $group_size
#             [, $group_sort [, $sep]]]])
#
# Returns "bib" lines ("$bib_id <text>\n") for every repetition of field
# $isis_id in $row; processing stops at the first false repetition.
#
#   $row        - record hash reference (field id => array of repetitions)
#   $isis_id    - field id to read
#   $bib_id     - output tag; without its leading "%" it must be a key of
#                 %default::FieldNames, else die
#   $subfields  - specification for subfields_str_2_arr(), default "*"
#   $group_size - when non-zero, the selected codes are cut into groups
#                 of this many codes and each group gives its own line
#   $group_sort - appended to each group before it is expanded again
#                 (e.g. "<" or ">"), default ""
#   $sep        - separator between subfield values, default " "
#
# A repetition for which OpenIsis::subfields() yields no keys is written
# out as a whole.
sub isis_to_bib {
    my $row = shift @_ || die;
    my $isis_id = shift @_ || die;
    my $bib_id = shift @_ || die;
    my $subfields = shift @_ || '*';
    my $group_size = shift @_ || 0;
    my $group_sort = shift @_ || '';
    my $sep = shift @_ || ' ';

    my $fld = $bib_id;
    $fld =~ s/^%//;
    die "$fld is not in FieldNames" if (!$default::FieldNames{$fld});

    my $bib = "";

    for (my $i = 0; $row->{$isis_id}->[$i]; $i++) {
        my $sf_hash = OpenIsis::subfields($row->{$isis_id}->[$i]);

        if (scalar(keys %{$sf_hash}) == 0) {
            # no subfields, use just value!
            $bib .= "$bib_id " . $row->{$isis_id}->[$i] . "\n";
            next;
        }

        if (!$group_size) {
            $bib .= bib_grp( $sf_hash, $bib_id, $sep, subfields_str_2_arr($subfields, $sf_hash) );
            next;
        }

        # the codes are single characters, so the list is handled as a
        # string and consumed $group_size characters at a time; the test
        # is on length so that a remaining "0" is not taken for "done"
        my $tmp_flds = join("", subfields_str_2_arr($subfields, $sf_hash));
        while (length $tmp_flds) {
            my $tmp_fld_grp = substr($tmp_flds, 0, $group_size);
            $bib .= bib_grp( $sf_hash, $bib_id, $sep, subfields_str_2_arr($tmp_fld_grp . $group_sort) );
            last if (length($tmp_flds) <= $group_size);
            $tmp_flds = substr($tmp_flds, $group_size);
        }
    }
    return $bib;
}
247
248 #--------------------------------------------------------------------
249
# isis_sf($row, $isis_id, $subfield [, $prefix [, $postfix [, @sep]]])
#
# Formats the first repetition of field $isis_id in $row.
#
#   $subfield undef or ""  - the whole value
#   $subfield one code     - that subfield
#   $subfield several codes- the true subfields in the given order; one
#                            separator is taken from @sep for every code
#                            (present or not), " " when @sep runs out or
#                            the separator is false
#
# The result is wrapped in $prefix and $postfix (both default to "").
# Returns "" when the field is missing or nothing was selected.
sub isis_sf {
    my ($row, $isis_id, $subfield, $prefix, $postfix, @sep) = @_;
    $prefix  ||= '';
    $postfix ||= '';

    return '' unless ($row->{$isis_id}->[0]);

    my $sf = OpenIsis::subfields($row->{$isis_id}->[0]);

    # subfield list undef or empty: use the value as it is
    if (! defined $subfield || length($subfield) == 0) {
        return $prefix . $row->{$isis_id}->[0] . $postfix;
    }

    if (length($subfield) == 1) {
        return $sf->{$subfield} ? $prefix . $sf->{$subfield} . $postfix : '';
    }

    my $joined;
    foreach my $code (split(//, $subfield)) {
        # a separator is consumed even when the subfield is absent
        my $glue = shift(@sep) || ' ';
        next unless ($sf->{$code});
        $joined = $joined ? $joined . $glue . $sf->{$code} : $sf->{$code};
    }

    return $joined ? $prefix . $joined . $postfix : '';
}
286
287 #--------------------------------------------------------------------
288
# offset in the bib file (handle R) where the previous record ended
my $last_tell = 0;

# sub-directories to scan: the comma separated -m list, or just '.'
# (use dirname as database name)
my @isis_dirs = $opts{m} ? split(/,/, $opts{m}) : ( '.' );

# Collect the databases to read.  For every directory a LIBRI and then a
# PERI database is added when an entry of that name exists under
# $common::isis_data/$db_dir/<directory>.
my @isis_dbs;

foreach my $isis_dir (@isis_dirs) {
    foreach my $base ('LIBRI', 'PERI') {
        my $base_path = "$common::isis_data/$db_dir/$isis_dir/$base";
        push @isis_dbs, "$base_path/$base" if (-e $base_path);
    }
}
307
# isis_isn_to_mps($row, $isis_id, $nr)
#
# Returns MPS word records "W <number> <nr>\n" for every repetition of
# field $isis_id (stopping at the first false one).  Spaces are removed
# from the number; when it contains dashes a second record without the
# dashes is added.
#
# Defined at file level instead of inside the record loop: a named sub is
# global wherever it is written.
sub isis_isn_to_mps {
    my $row = shift @_ || die;
    my $isis_id = shift @_ || die;
    my $nr = shift @_ || die;

    my $mps = '';
    for (my $i = 0; my $isn = $row->{$isis_id}->[$i]; $i++) {
        $isn =~ s/ +//g;	# remove spaces
        $mps .= "W $isn $nr\n";
        if ($isn =~ s/-//g) {
            $mps .= "W $isn $nr\n";
        }
    }
    return $mps;
}

# Main conversion loop.  For every record that has a field 200 two things
# are produced:
#   $bib - the record as "%tag value" lines, appended to the bib file (R)
#   $mps - the matching indexer commands, written to S and to MPS
foreach my $isis_db (@isis_dbs) {

    print MPS "M reading ISIS from '$isis_db'...\n";

    my $db = OpenIsis::open( "$isis_db" );

    my $max_rowid = OpenIsis::maxRowid( $db );

    my $last_pcnt = 0;

    for (my $row_id = 1; $row_id <= $max_rowid; $row_id++ ) {
        my $row = OpenIsis::read( $db, $row_id );

        # records without field 200 are skipped completely
        next if (!$row->{'200'}->[0]);

        my $bib = "%MFN $row->{mfn}\n";
        my $mps = "W $row->{mfn} 14\n";

        # progress message, sent whenever the whole percentage changes
        my $pcnt = int($row->{mfn} * 100 / $max_rowid);
        if ($pcnt != $last_pcnt) {
            printf MPS ("M %5d / %5d -- %-2d %%\n",$row->{mfn},$max_rowid,$pcnt);
            $last_pcnt = $pcnt;
        }

        my $headline;
        $headline .= isis_sf($row,'200','a');
        $headline .= isis_sf($row,'200','e'," : ");
        $headline .= isis_sf($row,'200','f'," / ");
        $headline .= isis_sf($row,'210','d'," , ");

        # author
        foreach my $fld (qw(700 701 710 711 503)) {
            $bib .= isis_to_bib($row,$fld,'%700+','*',2,'<');
        }
        foreach my $fld (qw(700 701 710 711 503 702)) {
            $mps .= isis_to_mps($row,$fld,1);
        }
        $mps .= isis_to_mps($row,'200',1,"fg");

        $bib .= isis_to_bib($row,'205','%205');

        # title
        my $sf = OpenIsis::subfields($row->{'200'}->[0]);
        my $book;
        $book .= $sf->{a} if ($sf->{a});
        $book .= " ; ".$sf->{k} if ($sf->{k});
        $book .= " = ".$sf->{d} if ($sf->{d});
        $book .= " : ".$sf->{e} if ($sf->{e});
        $book .= " / ".$sf->{f} if ($sf->{f});
        $book .= " ; ".$sf->{g} if ($sf->{g});
        $book .= ". ".$sf->{c} if ($sf->{c});
        $book .= " / ".$sf->{x} if ($sf->{x});
        $book .= " ; ".$sf->{y} if ($sf->{y});
        $bib .= "%200+ $book\n" if ($book);

        $mps .= isis_to_mps($row,'200',2,"akcde");
        $mps .= isis_to_mps($row,'532',2);
        $mps .= isis_to_mps($row,'424',2);
        foreach my $fld (qw(230 231 232 233)) {
            $mps .= isis_to_mps($row,$fld,2,"ae");
        }

        # Four parallel field groups 23n/25n/26n/29n (n = 0..3), one
        # "%sv" line each.  Only the first group has no "<br>" in front
        # of subfield v.  The fourth group (233/253/263/293) used to be
        # computed and then thrown away; it is written out like the
        # other three now.
        foreach my $n (0 .. 3) {
            my $sv = isis_sf($row,"23$n",'v', ($n ? '<br>' : '')).
                isis_sf($row,"23$n",'a',' : ').
                isis_sf($row,"25$n",undef,'. - ').
                isis_sf($row,"26$n",undef,'. - ').
                isis_sf($row,"29$n",undef,'<br>ISBN ');
            $bib .= "%sv $sv\n" if ($sv);
        }

        foreach my $fld (qw(270 271 272 273)) {
            $mps .= isis_to_mps($row,$fld,2);
        }

        # publisher
        foreach my $fld (qw(210 250 251 252 253)) {
            $mps .= isis_to_mps($row,$fld,3);
        }
        $bib .= "%210+ ".isis_sf($row,'210','acd', '','', ('',' : ',', ') )."\n";

        # year: strip a leading "cop." and square brackets; only values
        # without "?" that contain four digits in a row are indexed
        if (my $year = isis_sf($row,'210','d')) {
            $year =~ s/^\s*cop\.*\s*//i;
            $year =~ s/[\[\]]*//g;
            $mps .= "D ${year}\n" if ($year !~ m/\?/ && $year =~ /\d{4}/);
        }

        foreach my $fld (qw(215 260 261 262 263)) {
            $mps .= isis_to_mps($row,$fld,15);
        }
        $bib .= isis_to_bib($row,'215','%215+', '*', undef, undef, ', ');

        # $bib .= isis_to_bib($row,'225','%225', 'aehivw');
        $bib .= "%225 ".isis_sf($row,'225','aevhiw', '(',')', ('',' : ',' ; ','. ',', ',' ; '))."\n";
        $mps .= isis_to_mps($row,'225',4);

        foreach my $fld (qw(300 320 327)) {
            $bib .= isis_to_bib($row,$fld,'%300+');
        }
        foreach my $fld (qw(300 320 327 280 281 282 283)) {
            $mps .= isis_to_mps($row,$fld,5);
        }

        $bib .= isis_to_bib($row,'330','%330');
        $mps .= isis_to_mps($row,'330',6);

        $bib .= isis_to_bib($row,'423','%423');
        $bib .= isis_to_bib($row,'464','%464');
        $mps .= isis_to_mps($row,'464',7);
        $bib .= isis_to_bib($row,'610','%610');
        $mps .= isis_to_mps($row,'610',8);

        $bib .= isis_to_bib($row,'675','%675+');
        $mps .= isis_to_mps($row,'675',9);
        $bib .= isis_to_bib($row,'686','%675+');
        $mps .= isis_to_mps($row,'686',10);

        $bib .= isis_to_bib($row,'990','%990');
        $mps .= isis_to_mps($row,'990',11);

        $bib .= isis_to_bib($row,'991','%991');
        $mps .= isis_to_mps($row,'991',12);

        # ISBN
        $bib .= isis_to_bib($row,'10','%ISBN');
        foreach my $fld (qw(10 290 291 292 293)) {
            $mps .= isis_isn_to_mps($row,$fld,13);
        }

        # ISSN
        #store_isn($row->{11}->[0],14,'%ISSN');

        $mps .= isis_to_mps($row,'532',1);

        $bib .= isis_to_bib($row,'994','%994a','a');

        # headline
        if ($headline) {
            $headline .= " <i>(".$DatabaseDescriptions{$db_dir}.", ".$row->{mfn}.")</i>";	## debug MFN!
            # HTML-escape, then let <b>, <i> and their closing tags
            # through again.  "&" must become "&amp;" ("&and;" is the
            # entity of the logical-and sign).
            $headline =~ s/&/&amp;/g;
            $headline =~ s/</&lt;/g;
            $headline =~ s/>/&gt;/g;
            $headline =~ s/&lt;(\/?[bi])&gt;/<$1>/g;
            $mps .= "H ".c_852_iso($headline)."\n";
        } else {
            $mps .= "H nepoznato\n";
        }

        #if ($db_dir eq "sf") {
        #	print "MFN: $row->{mfn} ROW ID: $row_id\n";
        #	if ($row->{mfn} >= 146) {
        #		print Dumper($row);
        #	}
        #}

        print R c_852_iso($bib);

        # check if all fields are defined
        foreach (split(/\n/,$bib)) {
            if (/^%(\w+)\s/ && !$default::FieldNames{$1}) {
                die "field $1 used but not in FieldNames";
            }
        }

        # print R "%perl ".Dumper($row)."\n";

        # document entry: length, file name, start and end offset of
        # what has been written to R since the previous record
        $mps .= "T document text/plain ".(tell(R) - $last_tell)." $dir/bib $last_tell ".tell(R)."\n";
        $last_tell=tell(R);

        print R "\n";

        $mps .= "E\n";

        print S $mps;
        print MPS $mps;
    }
}
# final message and end marker for the stream copy and for the indexer
print S "M over and out\nX\n";
print MPS "M over and out\nX\n";

# Buffered write errors only show up at close, so every output handle is
# closed explicitly and a failure is reported.  warn is used instead of
# die so that a problem with one handle does not prevent the others from
# being closed.
close(S) || warn "error closing $dir/stream: $!";
close(R) || warn "error closing $dir/bib: $!";

# closing a pipe waits for the child; a false result with $! == 0 means
# the command finished with a non-zero status (available in $?)
close(MPS) || warn($! ? "error closing pipe to MPS indexer: $!"
                      : "MPS indexer exited with status $?");

  ViewVC Help
Powered by ViewVC 1.1.26