--- isis2stream.pl 2002/06/13 15:32:10 1.1.1.1 +++ isis2stream.pl 2002/07/17 19:33:19 1.20 @@ -3,58 +3,67 @@ use strict; use OpenIsis; use Getopt::Std; -#use Data::Dumper; - -my $install_dir="/local/index"; -my $mpsindex="/local/mps-5.3/bin/mpsindex -l 9 -b"; -my $isis_data="/var/autofs/misc/isis_data/"; -#my $isis_data="/mnt/20020606/Isis/Data/"; # doma +use Data::Dumper; +use common; my %opts; -getopt('dD', \%opts); +getopts('d:m:q', \%opts); -die "usage: $0 -d [database_dir] " if (! $opts{d}); +die "usage: $0 -d [database_dir] -m [database1,database2] " if (! %opts); my $db_dir = $opts{d}; -mkdir "$install_dir/$db_dir" if (!-e "$install_dir/$db_dir"); -mkdir "$install_dir/$db_dir/data" if (!-e "$install_dir/$db_dir/data"); +mkdir "$common::install_dir/$db_dir" if (!-e "$common::install_dir/$db_dir"); +mkdir "$common::install_dir/$db_dir/data" if (!-e "$common::install_dir/$db_dir/data"); -my $dir="$install_dir/$db_dir/data"; +my $dir="$common::install_dir/$db_dir/data"; -open(S,"> $dir/stream") || die "can't open output $dir/stram: $!"; +open(S,"> $dir/stream") || die "can't open output $dir/stream: $!"; open(R,"> $dir/bib") || die "can't open output $dir/bib: $!"; -open(MPS,"| $mpsindex -d $install_dir/$db_dir -autokey") || die "can't start MPS indexer $mpsindex: $!"; +open(MPS,"| $common::mpsindex -d $common::install_dir/$db_dir -autokey") || die "can't start MPS indexer $common::mpsindex: $!"; #open(MPS,"> /tmp/mpsindex") || die "mps: $!"; -my $s="V 5 3 -L hr-HR -F 700+ 1 Autor -F 200+ 2 Naslov -F 210 3 Izdavanje -F 225 4 Nakladnika cjelina -F 300+ 5 Napomene -F 330 6 Sadraj -F 464 7 Analitiki radovi -F 610 8 Kljune rijei -F 675 9 UDK -F 686 10 CC -F 990 11 Signatura -F 991 12 Inventarni broj -F 10 13 ISBN -"; +print S $common::mps_header; +print MPS $common::mps_header; -print S $s; -print MPS $s; +#-------------------------------------------------------------------- +# init array in_mps_header for config checks later +my %in_mps_header; +foreach (split(/\n/,$common::mps_header)) { + if (/^F /) { + my (undef,$isis,$mps,undef) = split(/ /,$_,4); + $in_mps_header{$mps}=$isis; + } +} +require "./search/config.pm"; + +#-------------------------------------------------------------------- +# read database configuration, store database names +open(CF,$common::database_cf) || die "$common::database_cf: $!"; +my %DatabaseDescriptions; +while() { + chomp; + if (/^database-name:([^=]+)=(.*)$/) { + my ($db_name,$db_desc) = ($1,$2); + $db_desc=~s/^##\w+##//g; + # c_iso_852 is a cludge so that output format would be + # correct 8859-2 again... + $DatabaseDescriptions{$db_name}=c_iso_852($db_desc); + } +} +close(CF); + +#-------------------------------------------------------------------- # # expand(nr,"space separated string"); # sub expand { my $nr = shift @_; + die "$nr is not in mps_header" if (!$in_mps_header{$nr}); my $out = ""; while (my $fld = c_852_iso(shift @_)) { my @words=split(/\s+/,$fld); @@ -71,103 +80,18 @@ #-------------------------------------------------------------------- -################### ERASE###############3 - -# expand sub-fileds from ISIS field -# (^a.....^b....) -my %data; # FIX -sub ex_sf { - %data = (); - my $in = $_[0]; - if (my $tmp = $in) { -# $tmp =~ tr/џ/ƾ/; # ISIS -> iso-8859-2 - $tmp =~ tr//ܫꔼȺ̪㍐슂ٝ/; - if ($in =~ m/^\^/) { - my @sub = split(/\^/,$in); - foreach my $fld (@sub) { - $data{$1} = $2 if ($fld =~ m/^(\w+)(.+)$/) - } - } else { - $data{all} = $in."<-- iz polja bez podpolja"; - } - } -} - -# dump all sub-fields -sub all_sf { - my $nr = shift @_; - my $out=""; - foreach my $k (sort keys %data) { - $out.=expand($nr,$data{$k}); - } - return $out; -} - -sub all_sf_r { - my $nr = shift @_; - my $out=""; - foreach my $k (sort {$b cmp $a} keys %data) { - $out.=expand($nr,$data{$k}); - } - return $out; -} - -sub all_sf2bib { - my $nr = shift @_; - my $max_in_line=shift @_ || 0; - my $sep = shift @_ || ' '; - my $out; - my $i=0; - my $bib = ""; - foreach my $k (sort keys %data) { - if ($out) { - $out.= $sep.$data{$k}; - } else { - $out = $data{$k}; - } - $i++; - if ($i == $max_in_line) { - $bib .= $nr." ".$out."\n" if ($out); - $i=0; - $out=""; - } - } - $bib .= $nr." ".$out."\n" if ($out); - return $bib; -} - -sub all_sf2bib_r { - my $nr = shift @_; - my $max_in_line=shift @_ || 0; - my $sep = shift @_ || ' '; - my $out; - my $i=0; - my $bib = ""; - foreach my $k (sort {$b cmp $a} keys %data) { - if ($out) { - $out.= $sep.$data{$k}; - } else { - $out = $data{$k}; - } - $i++; - if ($i == $max_in_line) { - $bib .= $nr." ".$out."\n" if ($out); - $i=0; - $out=""; - } - } - $bib .= $nr." ".$out."\n" if ($out); - return $bib; -} - -#-------------------------------------------------------------------- - sub c_852_iso { my $tmp = $_[0]; $tmp =~ tr//ܫꔼȺ̪㍐슂ٝ/ if ($tmp); return $tmp; } +sub c_iso_852 { + my $tmp = $_[0]; + $tmp =~ tr/ܫꔼȺ̪㍐슂ٝ// if ($tmp); + return $tmp; +} + sub c_852_czs { my $tmp = $_[0]; $tmp =~ tr//ܫꔼȺ̪㍐슂ٝ/; @@ -177,29 +101,8 @@ } #-------------------------------------------------------------------- -# -# mps_expand(nr,"space separated string"); -# -sub mps_expand { - my $nr = shift @_; - my $out = ""; - while (my $fld = shift @_) { - if ($fld =~ m/\s+/) { - foreach my $w (split(/\s+/,$fld)) { - # FIX: this should be replaced by stemmer! - $out .= "W $w $nr\n"; - } - } else { - $out .= "W $fld $nr\n"; - } - } - return c_852_czs($out); -} - -#-------------------------------------------------------------------- - -# $mps .= sf_to_mps($subfiled_hash,"subfields",mps_id); +# $mps .= sf_to_mps("subfields",$subfiled_hash) # # subfields options: # * - all (no sort) @@ -230,6 +133,7 @@ sub sf_to_mps { my ($sf_hash,$subfields,$mps_id) = @_; + die "$mps_id is not in mps_header" if (!$in_mps_header{$mps_id}); my $out=""; my @sf_arr = subfields_str_2_arr($subfields,$sf_hash); @@ -249,6 +153,8 @@ my $mps_id = shift @_ || die; my $subfields = shift @_; + die "$mps_id is not in mps_header" if (!$in_mps_header{$mps_id}); + my $i=0; my $out = ""; @@ -292,6 +198,9 @@ my $group_sort = shift @_ || ''; my $sep = shift @_ || ' '; + my $fld = $bib_id; $fld =~ s/^%//; + die "$fld is not in FieldNames" if (!$default::FieldNames{$fld}); + my $i=0; my $bib=""; @@ -349,7 +258,7 @@ sub isis_sf { my $row = shift @_; my $isis_id = shift @_; - my $subfield = shift @_ || 'a'; + my $subfield = shift @_; my $prefix = shift @_ || ''; my $postfix = shift @_ || ''; @@ -357,7 +266,10 @@ if ($row->{$isis_id}->[0]) { my $sf = OpenIsis::subfields($row->{$isis_id}->[0]); - if (length($subfield) == 1) { + if (! defined $subfield || length($subfield) == 0) { + # subfield list undef, empty or no defined subfields for this record + return $prefix . $row->{$isis_id}->[0] . $postfix; + } elsif (length($subfield) == 1) { if ($sf->{$subfield}) { return $prefix . $sf->{$subfield} . $postfix; } else { @@ -381,40 +293,99 @@ } #-------------------------------------------------------------------- -#-------------------------------------------------------------------- - my $last_tell=0; -my $db = OpenIsis::open( "$isis_data/$db_dir/LIBRI/LIBRI" ); +my @isis_dirs = ( '.' ); # use dirname as database name -my $max_rowid = OpenIsis::maxRowid( $db ); +if ($opts{m}) { + @isis_dirs = split(/,/,$opts{m}); +} -my $last_pcnt = 0; +my @isis_dbs; -for (my $row_id = 1; $row_id <= $max_rowid; $row_id++ ) { +foreach (@isis_dirs) { + if (-e "$common::isis_data/$db_dir/$_/LIBRI") { + push @isis_dbs,"$common::isis_data/$db_dir/$_/LIBRI/LIBRI"; + } + if (-e "$common::isis_data/$db_dir/$_/PERI") { + push @isis_dbs,"$common::isis_data/$db_dir/$_/PERI/PERI"; + } + if (-e "$common::isis_data/$db_dir/$_/AMS") { + push @isis_dbs,"$common::isis_data/$db_dir/$_/AMS/AMS"; + } + if (-e "$common::isis_data/$db_dir/$_/ARTI") { +# push @isis_dbs,"$common::isis_data/$db_dir/$_/ARTI/ARTI"; + } +} + +foreach my $isis_db (@isis_dbs) { + + print MPS "M reading ISIS from '$isis_db'...\n"; + + my $db = OpenIsis::open( "$isis_db" ); + + if (! defined $db) { + die "can't open '$isis_db'"; + } + + my $tip = $isis_db; $tip =~ s/^.+?\/([^\/]+)$/$1/; + if (defined $default::tip{$tip}) { + $tip=$default::tip{$tip}; + } elsif ($tip eq "AMS") { + $tip=$default::tip{'LIBRI'}; + } else { + die "can't find tip for database '$isis_db'"; + } + $tip = c_iso_852($tip); + + my $max_rowid = OpenIsis::maxRowid( $db ); + + my $last_pcnt = 0; + + for (my $row_id = 1; $row_id <= $max_rowid; $row_id++ ) { my $row = OpenIsis::read( $db, $row_id ); if (my $tmp = $row->{'200'}->[0]) { - my $bib = "%MFN $row->{mfn}\n"; - my $mps; + my $bib; + my $mps = "W $row->{mfn} 14\n"; + + # tip gradje + $mps .= "W ".c_852_czs($tip)." 17\n"; + $bib .= "%tip $tip\n"; my $pcnt = int($row->{mfn} * 100 / $max_rowid); if ($pcnt != $last_pcnt) { - printf MPS ("M %5d / %5d -- %-2d %%\n",$row->{mfn},$max_rowid,$pcnt); + printf MPS ("M %5d / %5d -- %-2d %%\n",$row->{mfn},$max_rowid,$pcnt) if (! $opts{q}); $last_pcnt = $pcnt; } my $headline; - $headline .= isis_sf($row,'200','a',"'"); - $headline .= isis_sf($row,'200','e'," : ","'"); + $headline .= isis_sf($row,'200','a'); + $headline .= isis_sf($row,'200','e'," : "); + $headline .= isis_sf($row,'200','f'," / "); + $headline .= isis_sf($row,'210','d'," , "); + + # remove newlines, compress spaces + $headline =~ s/[\n\r]//g; + $headline =~ s/^\s+//g; + $headline =~ s/\s+$//g; # author - $bib .= isis_to_bib($row,'700','%700+','*',2,'<'); - $bib .= isis_to_bib($row,'701','%700+','*',2,'<'); - $bib .= isis_to_bib($row,'710','%700+','*',2,'<'); - $bib .= isis_to_bib($row,'711','%700+','*',2,'<'); - $bib .= isis_to_bib($row,'503','%700+','*',2,'<'); + $bib .= isis_to_bib($row,'700','%700+','ab',undef,'>',', '); + $bib .= isis_to_bib($row,'701','%700+','ab',undef,'>',', '); + $bib .= isis_to_bib($row,'701','%700+','cd',undef,'>',', '); + + my $tmp; + $tmp = isis_sf($row,'710','abc', '', '', (' : ',', ')); + $tmp .= isis_sf($row,'710','dfe', ' (', ')', ('', ' ; ',' ; ')); + $bib .= "%700+ $tmp\n" if ($tmp); + + $tmp = isis_sf($row,'711','abc', '', '', (' : ',', ')); + $tmp .= isis_sf($row,'711','dfe', ' (', ')', ('', ' ; ',' ; ')); + $bib .= "%700+ $tmp\n" if ($tmp); + + $bib .= isis_to_bib($row,'503','%700+','ab',undef,'>',', '); $mps .= isis_to_mps($row,'700',1); $mps .= isis_to_mps($row,'701',1); @@ -422,6 +393,7 @@ $mps .= isis_to_mps($row,'711',1); $mps .= isis_to_mps($row,'503',1); $mps .= isis_to_mps($row,'702',1); + $mps .= isis_to_mps($row,'200',1,"fg"); $bib .= isis_to_bib($row,'205','%205'); @@ -437,17 +409,56 @@ $book .= ". ".$sf->{c} if ($sf->{c}); $book .= " / ".$sf->{x} if ($sf->{x}); $book .= " ; ".$sf->{y} if ($sf->{y}); - $bib .= "%200 $book\n" if ($book); + $bib .= "%200+ $book\n" if ($book); $mps .= isis_to_mps($row,'200',2,"akcde"); $mps .= isis_to_mps($row,'532',2); $mps .= isis_to_mps($row,'424',2); - $headline .= isis_sf($row,'700','b'," "); - $headline .= isis_sf($row,'700','a'," "); + $mps .= isis_to_mps($row,'230',2,"ae"); + $mps .= isis_to_mps($row,'231',2,"ae"); + $mps .= isis_to_mps($row,'232',2,"ae"); + $mps .= isis_to_mps($row,'233',2,"ae"); + + + $tmp = isis_sf($row,'230','v'). + isis_sf($row,'230','a',' : '). + isis_sf($row,'250',undef,'. - '). + isis_sf($row,'260',undef,'. - '). + isis_sf($row,'290',undef,'
ISBN '); + $bib .= "%sv $tmp\n" if ($tmp); + + $tmp = isis_sf($row,'231','v','
'). + isis_sf($row,'231','a',' : '). + isis_sf($row,'251',undef,'. - '). + isis_sf($row,'261',undef,'. - '). + isis_sf($row,'291',undef,'
ISBN '); + $bib .= "%sv $tmp\n" if ($tmp); + + $tmp = isis_sf($row,'232','v','
'). + isis_sf($row,'232','a',' : '). + isis_sf($row,'252',undef,'. - '). + isis_sf($row,'262',undef,'. - '). + isis_sf($row,'292',undef,'
ISBN '); + $bib .= "%sv $tmp\n" if ($tmp); + + $tmp = isis_sf($row,'233','v','
'). + isis_sf($row,'233','a',' : '). + isis_sf($row,'253',undef,'. - '). + isis_sf($row,'263',undef,'. - '). + isis_sf($row,'293',undef,'
ISBN '); + + $mps .= isis_to_mps($row,'270',2); + $mps .= isis_to_mps($row,'271',2); + $mps .= isis_to_mps($row,'272',2); + $mps .= isis_to_mps($row,'273',2); # izdavac $mps .= isis_to_mps($row,'210',3); + $mps .= isis_to_mps($row,'250',3); + $mps .= isis_to_mps($row,'251',3); + $mps .= isis_to_mps($row,'252',3); + $mps .= isis_to_mps($row,'253',3); # if (my $sf = OpenIsis::subfields($row->{'210'}->[0])) { # my $tmp; # $tmp .= $sf->{a} if ($sf->{a}); @@ -455,16 +466,21 @@ # $tmp .= ", ".$sf->{d} if ($sf->{d}); # $bib .= "%210 $tmp\n" if ($tmp); # } - $bib .= "%210 ".isis_sf($row,'210','acd', '','', ('',' : ',', ') )."\n"; + $bib .= "%210+ ".isis_sf($row,'210','acd', '','', ('',' : ',', ') )."\n"; if (my $year = isis_sf($row,'210','d')) { $year =~ s/^\s*cop\.*\s*//i; $year =~ s/[\[\]]*//g; + $year =~ s/[\n\r]//g; # remove cr $mps .= "D ${year}\n" if ($year !~ m/\?/ && $year =~ /\d{4}/); - $headline .= " ($year)"; } - $bib .= isis_to_bib($row,'215','%215', '*', undef, undef, ', '); + $mps .= isis_to_mps($row,'215',15); + $mps .= isis_to_mps($row,'260',15); + $mps .= isis_to_mps($row,'261',15); + $mps .= isis_to_mps($row,'262',15); + $mps .= isis_to_mps($row,'263',15); + $bib .= isis_to_bib($row,'215','%215+', '*', undef, undef, ', '); # $bib .= isis_to_bib($row,'225','%225', 'aehivw'); $bib .= "%225 ".isis_sf($row,'225','aevhiw', '(',')', ('',' : ',' ; ','. ',', ',' ; '))."\n"; @@ -476,6 +492,10 @@ $mps .= isis_to_mps($row,'300',5); $mps .= isis_to_mps($row,'320',5); $mps .= isis_to_mps($row,'327',5); + $mps .= isis_to_mps($row,'280',5); + $mps .= isis_to_mps($row,'281',5); + $mps .= isis_to_mps($row,'282',5); + $mps .= isis_to_mps($row,'283',5); $bib .= isis_to_bib($row,'330','%330'); $mps .= isis_to_mps($row,'330',6); @@ -486,9 +506,9 @@ $bib .= isis_to_bib($row,'610','%610'); $mps .= isis_to_mps($row,'610',8); - $bib .= isis_to_bib($row,'675','%675+'); + $bib .= isis_to_bib($row,'675','%675+','a'); $mps .= isis_to_mps($row,'675',9); - $bib .= isis_to_bib($row,'686','%675+'); + $bib .= isis_to_bib($row,'686','%675+','a'); $mps .= isis_to_mps($row,'686',10); $bib .= isis_to_bib($row,'990','%990'); @@ -497,23 +517,63 @@ $bib .= isis_to_bib($row,'991','%991'); $mps .= isis_to_mps($row,'991',12); - # ISBN - if (my $isbn = $row->{10}->[0]) { - $isbn =~ s/ +//g; # remove spaces - $mps .= "W $isbn 13\n"; - $bib .= "%ISBN $isbn\n"; - $isbn =~ s/-//g; - $mps .= "W $isbn 13\n"; + # Jezik + $bib .= isis_to_bib($row,'101','%101'); + $mps .= isis_to_mps($row,'101',16); + # Pismo + $bib .= isis_to_bib($row,'998','%101', 'a'); + + sub isis_isn_to_mps { + my $row = shift @_ || die; + my $isis_id = shift @_ || die; + my $nr = shift @_ || die; + my $i=0; + my $mps=''; + while (my $isn=$row->{$isis_id}->[$i]) { + $isn =~ s/ +//g; # remove spaces + $isn =~ s/[\n\r]//g; # remove cr + $mps .= "W $isn $nr\n"; + if ($isn =~ s/-//g) { + $mps .= "W $isn $nr\n"; + } + $i++; + } + return $mps; } - $mps .= isis_to_mps($row,'10',12); + + # ISBN + $bib .= isis_to_bib($row,'10','%ISBN'); + $mps .= isis_isn_to_mps($row,'10',13); + $mps .= isis_isn_to_mps($row,'290',13); + $mps .= isis_isn_to_mps($row,'291',13); + $mps .= isis_isn_to_mps($row,'292',13); + $mps .= isis_isn_to_mps($row,'293',13); + + # ISSN + $bib .= isis_to_bib($row,'11','%ISSN'); + $mps .= isis_isn_to_mps($row,'11',13); $mps .= isis_to_mps($row,'532',1); - $bib .= isis_to_bib($row,'994','%994a','a'); + # Casopisi + $tmp = isis_to_bib($row,'326','%326'); + $tmp =~ s/g1/godinjak/; + $tmp =~ s/g6/dvomjesenik/; + $tmp =~ s/10/godinje 10 brojeva/; + $tmp =~ s/m1/mjesenik/; + $tmp =~ s/m2/polumjesenik/; + $tmp =~ s/nr/neredovito/; + $tmp =~ s/g4/etiri puta godinje/; + $bib .= c_iso_852($tmp); + $bib .= isis_to_bib($row,'992','%992'); + $bib .= '%knjiz '.$DatabaseDescriptions{$db_dir}.", ".$row->{mfn}."\n"; # headline if ($headline) { - $headline .= " [".$row->{mfn}."]"; ## debug MFN! + $headline .= " (".$DatabaseDescriptions{$db_dir}.", ".$row->{mfn}.")"; ## debug MFN! + $headline =~ s/&/∧/g; + $headline =~ s//>/g; $mps .= "H ".c_852_iso($headline)."\n"; } else { $mps .= "H nepoznato\n"; @@ -529,6 +589,15 @@ print R c_852_iso($bib); + # check if all fields are defined + foreach (split(/\n/,$bib)) { + if (/^%(\w+)\s/ && !$default::FieldNames{$1}) { + die "field $1 used but not in FieldNames"; + } + } + +# print R "%perl ".Dumper($row)."\n"; + $mps .= "T document text/plain ".(tell(R) - $last_tell)." $dir/bib $last_tell ".tell(R)."\n"; $last_tell=tell(R); @@ -540,6 +609,7 @@ print S $mps; print MPS $mps; } + } } print S "M over and out\nX\n"; print MPS "M over and out\nX\n";