--- trunk/tools/mods2unimarc.pl 2004/05/14 16:38:22 325 +++ branches/drustvene/tools/mods2unimarc.pl 2004/06/02 16:13:57 335 @@ -6,7 +6,7 @@ =head1 SYNOPSIS -mods2marc.pl mods.xml export.marc +mods2marc.pl export.marc mods.xml [mods2.xml ... ] =head1 DESCRIPTION @@ -19,11 +19,20 @@ This script B to MODS export from Faculty of Electrical Engineering and Computing -so you might want to edit it +so you might want to edit it (among other thing, it includes a lot +of fields which are in Croatian). -=head1 WARNING +Feel free to hack this script and convert it to your own needs. -This script is in state of flux. +=head1 CAVEAT + +This script will parse imput XML twice: once with C and +then each entry with C to produce in-memory structure. +That's because I wanted to keep node selection logical (and perl-like). + +If you don't like it, you can rewrite this script to use XPATH. I tried +and failed (it seems that MODS is too complicated for my limited knowledge +of XPATH). =cut @@ -35,42 +44,49 @@ use Data::Dumper; -my $xml_file = "/data/tehnika/fer/all.xml"; -$xml_file = "/data/tehnika/fer/modsFER_1.xml"; -my $marc_file = "fer.marc"; +my $marc_file = shift @ARGV || die "$0: need MARC export file"; +die "$0: need at least one MODS XML file" if (! @ARGV); $|=1; my $nr = 0; my $marc = MARC->new; +my $ENCODING = 'ISO-8859-2'; + my $twig=XML::Twig->new( - twig_roots => { 'mods' => \&item }, - output_encoding => 'iso-8859-2', + twig_roots => { 'mods' => \&mods }, + output_encoding => 'UTF8', ); -my $utf2iso = Text::Iconv->new("UTF-8", "ISO-8859-2"); +my $utf2iso = Text::Iconv->new("UTF8", $ENCODING); -$twig->parsefile($xml_file); -$twig->purge; +foreach my $xml_file (@ARGV) { + print "$xml_file: "; + $twig->parsefile($xml_file); + $twig->purge; + print "$nr\n"; +} + +print "Saving MARC file...\n"; $marc->output({file=>"> $marc_file",'format'=>"usmarc"}); -sub item { +sub mods { my( $t, $elt)= @_; my $xml=$elt->xml_string; - my $ref = XMLin("".$xml."", + my $ref = XMLin(''.$xml.'', ForceArray => [ 'name', 'classification', 'topic', - 'udc', + 'relatedItem', + 'partNumber', ], KeyAttr => { 'namePart' => 'type', 'identifier' => 'type', - 'classification' => 'authority', 'namePart' => 'type', 'role' => 'type', }, @@ -82,52 +98,98 @@ ContentKey => '-content', ); - my $m=$marc->createrecord(); + my $m_cache; - sub marc_arr { - my $m = shift || die "no marc record?"; - my $fld = shift || die "no marc field?"; - my $sf = shift || ''; + sub marc_add { + my $m_cache = \shift || die "need m_cache"; + my $fld = shift || die "need field!"; + my $sf = shift; + my $data = shift || return; - return if (! @_); +#print "add: $fld",($sf ? "^".$sf : ''),": $data\n"; - my @a; - foreach (@_) { - next if (! $_); - push @a,$sf; -# push @a,$utf2iso->convert($_) || $_; - push @a,$_; + if ($sf) { + push @{$$m_cache->{tmp}->{$fld}}, $sf; } + push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data); + } - return if (! @a); + sub marc_rep { + my $m_cache = \shift || die "need m_cache"; + foreach my $fld (@_) { +#print "marc_rep: $fld\n"; + push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld}); + delete $$m_cache->{tmp}->{$fld}; + } + } -# print "storing $fld: ",join("|",@a),"\n"; + sub marc_single { + my $m_cache = \shift || die "need m_cache"; + foreach my $fld (@_) { +#print "marc_single: $fld\n"; - $marc->addfield({record=>$m, - field=>$fld, - # i1=>$i1, - # i2=>$i2, - value=>\@a}); + die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld}); + + $$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld}); + delete $$m_cache->{tmp}->{$fld}; + } + } + + sub marc_add_rep { + my $m_cache = \shift || die "need m_cache"; + my $fld = shift || die "need field!"; + my $sf = shift; + my $data = shift || return; + + marc_add($$m_cache,$fld,$sf,$data); + marc_rep($$m_cache,$fld); + } + + sub marc_add_single { + my $m_cache = \shift || die "need m_cache"; + my $fld = shift || die "need field!"; + my $sf = shift; + my $data = shift || return; + + marc_add($$m_cache,$fld,$sf,$data); + marc_single($$m_cache,$fld); } - marc_arr($m,'610','a',@{$ref->{subject}->{topic}}); + my $journal = 0; + # Journals start with c- in our MODS + $journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/); + + foreach my $t (@{$ref->{subject}->{topic}}) { + marc_add($m_cache,'610','a', $t); + marc_rep($m_cache,'610'); + } - my $fld = '700'; + my $fld_700 = '700'; + my $fld_710 = '710'; foreach my $name (@{$ref->{name}}) { my $role = $name->{role}->{roleTerm}->{content}; next if (! $role); if ($role eq "author") { - marc_arr($m,$fld,'a',$name->{namePart}->{family}); - marc_arr($m,$fld,'b',$name->{namePart}->{given}); - marc_arr($m,$fld,'4',$role); + marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family}); + marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given}); + marc_add($m_cache,$fld_700,'4',$role); + + marc_rep($m_cache,$fld_700); # first author goes in 700, others in 701 - $fld = '701'; + $fld_700 = '701'; } elsif ($role eq "editor" or $role eq "illustrator") { - marc_arr($m,'702','a',$name->{namePart}->{family}); - marc_arr($m,'702','b',$name->{namePart}->{given}); - marc_arr($m,'702','4',$role); + marc_add($m_cache,'702','a',$name->{namePart}->{family}); + marc_add($m_cache,'702','b',$name->{namePart}->{given}); + marc_add($m_cache,'702','4',$role); + marc_rep($m_cache,'702'); + } elsif ($role eq "corporate") { + marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart}); + $fld_710 = '711'; + } elsif ($role eq "conference") { + marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart}); + $fld_710 = '711'; } else { die "FATAL: don't know how to map role '$role'" if ($role); } @@ -138,23 +200,25 @@ if ($note) { foreach my $n (split(/\s*;\s+/, $note)) { if ($n =~ s/bibliogr:\s+//i) { - marc_arr($m,'320','a',"Bibliografija: $n"); + marc_add_rep($m_cache,'320','a',"Bibliografija: $n"); } elsif ($n =~ s/ilustr:\s+//i) { - marc_arr($m,'215','c', $n); + marc_add($m_cache,'215','c', $n); } else { - marc_arr($m,'320','a',$n); + marc_add_rep($m_cache,'320','a',$n); } } } - + my $type = $ref->{identifier}->{type}; if ($type) { if ($type eq "isbn") { - marc_arr($m,'010','a',$ref->{identifier}->{content}); + marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content}); } elsif ($type eq "issn") { - marc_arr($m,'011','a',$ref->{identifier}->{content}); + marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content}); + } elsif ($type eq "uri") { + marc_add_rep($m_cache,'856','u',$ref->{identifier}->{content}); } else { die "unknown identifier type $type"; } @@ -167,7 +231,7 @@ if ($t =~ m/([^:]+):\s+(.+)$/) { $tmp->{$1} = $2; } else { - die "can't parse $t"; + print STDERR "can't parse '$t' in ",Dumper($phy_desc); } } my $data = $tmp->{pagin}; @@ -175,163 +239,113 @@ if ($tmp->{str}) { $data .= $tmp->{str}." str"; } - marc_arr($m,'210','a', $data) if ($data); - marc_arr($m,'210','d', $tmp->{visina}); + marc_add($m_cache,'215','a', $data) if ($data); + marc_add($m_cache,'215','d', $tmp->{visina}); } + marc_rep($m_cache,'215'); - marc_arr($m,'001','',$ref->{recordInfo}->{recordIdentifier}); + marc_add_single($m_cache,'001',undef,$ref->{recordInfo}->{recordIdentifier}); - marc_arr($m,'200','a',$ref->{titleInfo}->{title}); - marc_arr($m,'200','e',$ref->{titleInfo}->{subTitle}); + marc_add($m_cache,'200','a',$ref->{titleInfo}->{title}); + marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle}); + marc_single($m_cache,'200'); - marc_arr($m,'675','a',$ref->{classification}->{udc}); + foreach my $c (@{$ref->{classification}}) { + if ($c->{'authority'} eq "udc") { + marc_add_rep($m_cache,'675','a', $c->{'content'}); + } + } - my $related = $ref->{relatedItem}->{type}; - if ($related) { - if ($related eq "series") { - marc_arr($m,'675','a',$ref->{relatedItem}->{titleInfo}->{title}); - marc_arr($m,'999','a',$ref->{relatedItem}->{titleInfo}->{partNumber}); - } elsif ($related eq "preceding") { - marc_arr($m,'430','a',$ref->{relatedItem}->{titleInfo}->{title}); - } else { - die "can't parse related item type $related" if ($related); + foreach my $ri (@{$ref->{relatedItem}}) { + my $related = $ri->{type}; + if ($related) { + if ($related eq "series") { + marc_add_rep($m_cache,'225','a',$ri->{titleInfo}->{title}); + foreach my $pn (@{$ri->{titleInfo}->{partNumber}}) { + if ($journal) { + marc_add_rep($m_cache,'999','a',$pn); + } else { + marc_add_rep($m_cache,'225','v',$pn); + } + } + } elsif ($related eq "preceding") { + marc_add($m_cache,'520','a',$ri->{titleInfo}->{title}); + if ($ri->{identifier}) { + if ($ri->{identifier}->{type} eq "issn") { + marc_add($m_cache,'520','x',$ri->{identifier}->{content}); + } else { + die "can't store identifier type $type"; + } + } + marc_rep($m_cache,'520'); + } else { + die "can't parse related item type $related" if ($related); + } } } - marc_arr($m,'205','a',$ref->{originInfo}->{edition}); + marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition}); + + marc_add($m_cache,'210','a',$ref->{originInfo}->{place}); my $publisher = $ref->{originInfo}->{publisher}; if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) { - marc_arr($m,'210','a', $2); - marc_arr($m,'210','c', $1); + marc_add($m_cache,'210','a', $2); + marc_add($m_cache,'210','c', $1); } else { - marc_arr($m,'210','c', $publisher); + marc_add($m_cache,'210','c', $publisher); } - marc_arr($m,'326','a',$ref->{originInfo}->{frequency}); - marc_arr($m,'326','a',$ref->{originInfo}->{place}); + marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued}); + + marc_single($m_cache,'210'); - marc_arr($m,'210','d',$ref->{originInfo}->{dateIssued}); + marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal); $nr++; print "$nr " if ($nr % 100 == 0); - $t->purge; # frees the memory -} - -__END__ + # dump record + my $bib_level = "m"; + $bib_level = "s" if ($journal); + my $m=$marc->createrecord({leader=>"00000na".$bib_level." 2200000 a 4500"}); + + foreach my $fld (keys %{$m_cache->{array}}) { + foreach my $arr (@{$m_cache->{array}->{$fld}}) { +#print "array = ",Dumper($arr); + my ($i1,$i2); + # do we have indicators? + if ($fld =~ m/^(.+)\t(.)(.)$/) { + $fld = $1; + ($i1,$i2) = ($2,$3); + } + $marc->addfield({record=>$m, + field=>$fld, + i1=>$i1, + i2=>$i2, + value=>$arr + }); + } + } -KNJIGA = { -610a 'subject' => [ - { - 'topic' => [ - 'LIBRARIES-AUTOMATION', - 'ELECTRONIC DATA PROCESSING-LIBRARY SCIENCE' - ] - } - ], - 'name' => [ - { - 'namePart' => { -700b,701a... 'given' => 'Robert M.', -700a,701b... 'family' => 'Hayes' - }, - 'type' => 'personal', - 'role' => { - 'roleTerm' => { -7004,7014... 'content' => 'author', - 'type' => 'text' - } - } - }, - { - 'namePart' => { -702b 'given' => 'Joseph', -702a 'family' => 'Becker' - }, - 'type' => 'personal', - 'role' => { - 'roleTerm' => { -7024 'content' => 'editor', - 'type' => 'text' - } - } - }, - { - 'namePart' => { -702b 'given' => 'Joseph', -702a 'family' => 'Becker' - }, - 'type' => 'personal', - 'role' => { - 'roleTerm' => { -7024 'content' => 'illustrator', - 'type' => 'text' - } - } - } - ], - 'note' => 'bibliogr: 645-647; kazalo; ilustr: ilustr.', - 'identifier' => { -010a 'content' => '0-471-36483-5', - 'type' => 'isbn' - }, -215a;215d 'physicalDescription' => 'str: 688; pagin: xvi; visina: 24. cm', -001 'recordInfo' => { - 'recordIdentifier' => 'k-7996-8073' - }, -200a 'titleInfo' => { - 'title' => 'Handbook of data processing for libraries' - }, - 'typeOfResource' => 'text', -675a 'classification' => { - 'udc' => '=20' - }, -225a 'relatedItem' => { - 'titleInfo' => { - 'title' => 'A WILEY-BECKER & HAYES SERIES BOOK' - }, - 'type' => 'series' - }, - 'originInfo' => { - 'issuance' => 'monographic', -205a 'edition' => '2.', -210c/210a 'publisher' => 'MELVILLE PUBLISHING COMPANY /LOS ANGELES, CALIFORNIA/', -210d 'dateIssued' => '1974' - } - }; - - - - CASOPIS = { - 'identifier' => { -011a 'content' => '1041-5173', - 'type' => 'issn' - }, - 'recordInfo' => { - 'recordIdentifier' => 'c-1' - }, - 'titleInfo' => { - 'title' => 'DBMS - CLIENT/SERVER COMPUTING' - }, - 'typeOfResource' => 'text', -999a 'relatedItem' => { - 'titleInfo' => { - 'partNumber' => 'g. 1990, vol. 137, br. 5' - }, - 'type' => 'series' - }, - 'classification' => { - 'udc' => '=20' - }, - 'originInfo' => { -326a 'frequency' => 'mjeseļæ½no', - 'issuance' => 'continuing', -210a 'place' => 'SAN MATEO, KANADA', -210c 'publisher' => 'M&T PUBLISHING INC.' - } - }; + foreach my $fld (keys %{$m_cache->{single}}) { +#print "single = ",Dumper($m_cache->{single}->{$fld}); + my ($i1,$i2); + # do we have indicators? + if ($fld =~ m/^(.+)\t(.)(.)$/) { + $fld = $1; + ($i1,$i2) = ($2,$3); + } + $marc->addfield({record=>$m, + field=>$fld, + i1=>$i1, + i2=>$i2, + value=>$m_cache->{single}->{$fld} + }); + } + $m_cache = {}; -=cut + $t->purge; # frees the memory +}