--- trunk/tools/mods2unimarc.pl 2004/05/14 17:22:39 326 +++ trunk/tools/mods2unimarc.pl 2004/05/15 18:54:41 327 @@ -24,9 +24,15 @@ Feel free to hack this script and convert it to your own needs. -=head1 WARNING +=head1 CAVEAT -This script is in state of flux. +This script will parse input XML twice: once with C<XML::Twig> and +then each entry with C<XML::Simple> to produce in-memory structure. +That's because I wanted to keep node selection logical (and perl-like). + +If you don't like it, you can rewrite this script to use XPATH. I tried +and failed (it seems that MODS is too complicated for my limited knowledge +of XPATH). =cut @@ -40,6 +46,7 @@ my $xml_file = "/data/tehnika/fer/all.xml"; $xml_file = "/data/tehnika/fer/modsFER_1.xml"; +$xml_file = "/data/tehnika/fer/mods-small.xml"; my $marc_file = "fer.marc"; $|=1; @@ -47,33 +54,35 @@ my $marc = MARC->new; +my $ENCODING = 'ISO-8859-2'; + my $twig=XML::Twig->new( - twig_roots => { 'mods' => \&item }, - output_encoding => 'iso-8859-2', + twig_roots => { 'mods' => \&mods }, + output_encoding => 'UTF8', ); -my $utf2iso = Text::Iconv->new("UTF-8", "ISO-8859-2"); +my $utf2iso = Text::Iconv->new("UTF8", $ENCODING); +print "$xml_file: "; $twig->parsefile($xml_file); $twig->purge; +print "$nr\nSaving MARC file...\n"; $marc->output({file=>"> $marc_file",'format'=>"usmarc"}); -sub item { +sub mods { my( $t, $elt)= @_; my $xml=$elt->xml_string; - my $ref = XMLin("".$xml."", + my $ref = XMLin(''.$xml.'', ForceArray => [ 'name', 'classification', 'topic', - 'udc', ], KeyAttr => { 'namePart' => 'type', 'identifier' => 'type', - 'classification' => 'authority', 'namePart' => 'type', 'role' => 'type', }, @@ -88,49 +97,95 @@ my $m_cache; sub marc_add { - my $m_cache = \shift; + my $m_cache = \shift || die "need m_cache"; my $fld = shift || die "need field!"; - my $sf = shift || ''; + my $sf = shift; + my $data = shift || return; - return if (! @_); +#print "add: $fld",($sf ? "^".$sf : ''),": $data\n"; - my @a; - foreach (@_) { - next if (! 
$_); - push @a,$sf if ($sf); -# push @a,$utf2iso->convert($_) || $_; - push @a,$_; + if ($sf) { + push @{$$m_cache->{tmp}->{$fld}}, $sf; } + push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data); + } - return if (! @a); + sub marc_rep { + my $m_cache = \shift || die "need m_cache"; + foreach my $fld (@_) { +#print "marc_rep: $fld\n"; + push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld}); + delete $$m_cache->{tmp}->{$fld}; + } + } -# print "storing $fld: ",join("|",@a),"\n"; + sub marc_single { + my $m_cache = \shift || die "need m_cache"; + foreach my $fld (@_) { +#print "marc_single: $fld\n"; - push @{$$m_cache->{$fld}}, @a; + die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld}); + $$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld}); + delete $$m_cache->{tmp}->{$fld}; + } + } + + sub marc_add_rep { + my $m_cache = \shift || die "need m_cache"; + my $fld = shift || die "need field!"; + my $sf = shift; + my $data = shift || return; + + marc_add($$m_cache,$fld,$sf,$data); + marc_rep($$m_cache,$fld); + } + + sub marc_add_single { + my $m_cache = \shift || die "need m_cache"; + my $fld = shift || die "need field!"; + my $sf = shift; + my $data = shift || return; + + marc_add($$m_cache,$fld,$sf,$data); + marc_single($$m_cache,$fld); } my $journal = 0; + # Journals start with c- in our MODS $journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/); - - marc_add($m_cache,'610','a',@{$ref->{subject}->{topic}}); - my $fld = '700'; + foreach my $t (@{$ref->{subject}->{topic}}) { + marc_add($m_cache,'610','a', $t); + marc_rep($m_cache,'610'); + } + + my $fld_700 = '700'; + my $fld_710 = '710'; foreach my $name (@{$ref->{name}}) { my $role = $name->{role}->{roleTerm}->{content}; next if (! 
$role); if ($role eq "author") { - marc_add($m_cache,$fld,'a',$name->{namePart}->{family}); - marc_add($m_cache,$fld,'b',$name->{namePart}->{given}); - marc_add($m_cache,$fld,'4',$role); + marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family}); + marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given}); + marc_add($m_cache,$fld_700,'4',$role); + + marc_rep($m_cache,$fld_700); # first author goes in 700, others in 701 - $fld = '701'; + $fld_700 = '701'; } elsif ($role eq "editor" or $role eq "illustrator") { marc_add($m_cache,'702','a',$name->{namePart}->{family}); marc_add($m_cache,'702','b',$name->{namePart}->{given}); marc_add($m_cache,'702','4',$role); + marc_rep($m_cache,'702'); + } elsif ($role eq "corporate") { + marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart}); + $fld_710 = '711'; + } elsif ($role eq "conference") { + marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart}); + $fld_710 = '711'; } else { die "FATAL: don't know how to map role '$role'" if ($role); } @@ -141,23 +196,23 @@ if ($note) { foreach my $n (split(/\s*;\s+/, $note)) { if ($n =~ s/bibliogr:\s+//i) { - marc_add($m_cache,'320','a',"Bibliografija: $n"); + marc_add_rep($m_cache,'320','a',"Bibliografija: $n"); } elsif ($n =~ s/ilustr:\s+//i) { marc_add($m_cache,'215','c', $n); } else { - marc_add($m_cache,'320','a',$n); + marc_add_rep($m_cache,'320','a',$n); } } } - + my $type = $ref->{identifier}->{type}; if ($type) { if ($type eq "isbn") { - marc_add($m_cache,'010','a',$ref->{identifier}->{content}); + marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content}); } elsif ($type eq "issn") { - marc_add($m_cache,'011','a',$ref->{identifier}->{content}); + marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content}); } else { die "unknown identifier type $type"; } @@ -181,27 +236,34 @@ marc_add($m_cache,'215','a', $data) if ($data); marc_add($m_cache,'215','d', $tmp->{visina}); } + marc_rep($m_cache,'215'); - 
marc_add($m_cache,'001',undef,$ref->{recordInfo}->{recordIdentifier}); + marc_add_single($m_cache,'001',undef,$ref->{recordInfo}->{recordIdentifier}); marc_add($m_cache,'200','a',$ref->{titleInfo}->{title}); marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle}); + marc_single($m_cache,'200'); - marc_add($m_cache,'675','a',$ref->{classification}->{udc}); + foreach my $c (@{$ref->{classification}}) { + if ($c->{'authority'} eq "udc") { + marc_add_rep($m_cache,'675','a', $c->{'content'}); + } + } my $related = $ref->{relatedItem}->{type}; if ($related) { if ($related eq "series") { marc_add($m_cache,'225','a',$ref->{relatedItem}->{titleInfo}->{title}); marc_add($m_cache,'999','a',$ref->{relatedItem}->{titleInfo}->{partNumber}); + marc_rep($m_cache,'225','999'); } elsif ($related eq "preceding") { - marc_add($m_cache,'430','a',$ref->{relatedItem}->{titleInfo}->{title}); + marc_add_rep($m_cache,'430','a',$ref->{relatedItem}->{titleInfo}->{title}); } else { die "can't parse related item type $related" if ($related); } } - marc_add($m_cache,'205','a',$ref->{originInfo}->{edition}); + marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition}); my $publisher = $ref->{originInfo}->{publisher}; if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) { @@ -211,25 +273,55 @@ marc_add($m_cache,'210','c', $publisher); } - marc_add($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal); - marc_add($m_cache,'210','a',$ref->{originInfo}->{place}); - marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued}); + marc_single($m_cache,'210'); + + marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal); + $nr++; print "$nr " if ($nr % 100 == 0); # dump record - my $m=$marc->createrecord(); - foreach my $fld (keys %{$m_cache}) { -# print "$fld: ",join(" * ",@{$m_cache->{$fld}}),"\n"; + my $m=$marc->createrecord({leader=>"00000nam 2200000 a 4500"}); + + foreach my $fld (keys %{$m_cache->{array}}) { + foreach my $arr (@{$m_cache->{array}->{$fld}}) { 
+#print "array = ",Dumper($arr); + my ($i1,$i2); + # do we have indicators? + if ($fld =~ m/^(.+)\t(.)(.)$/) { + $fld = $1; + ($i1,$i2) = ($2,$3); + } + $marc->addfield({record=>$m, + field=>$fld, + i1=>$i1, + i2=>$i2, + value=>$arr + }); + } + } + + foreach my $fld (keys %{$m_cache->{single}}) { +#print "single = ",Dumper($m_cache->{single}->{$fld}); + my ($i1,$i2); + # do we have indicators? + if ($fld =~ m/^(.+)\t(.)(.)$/) { + $fld = $1; + ($i1,$i2) = ($2,$3); + } $marc->addfield({record=>$m, field=>$fld, - value=>\@{$m_cache->{$fld}} + i1=>$i1, + i2=>$i2, + value=>$m_cache->{single}->{$fld} }); } + $m_cache = {}; + $t->purge; # frees the memory }