--- trunk/tools/mods2unimarc.pl	2004/05/14 16:38:22	325
+++ branches/drustvene/tools/mods2unimarc.pl	2004/06/02 16:13:57	335
@@ -6,7 +6,7 @@
 
 =head1 SYNOPSIS
 
-mods2marc.pl mods.xml export.marc
+mods2marc.pl export.marc mods.xml [mods2.xml ... ]
 
 =head1 DESCRIPTION
 
@@ -19,11 +19,20 @@
 
 This script B<is somewhat specific> to MODS export from
 Faculty of Electrical Engineering and Computing
-so you might want to edit it
+so you might want to edit it (among other thing, it includes a lot
+of fields which are in Croatian).
 
-=head1 WARNING
+Feel free to hack this script and convert it to your own needs.
 
-This script is in state of flux.
+=head1 CAVEAT
+
+This script will parse imput XML twice: once with C<XML::Twig> and
+then each entry with C<XML::Simple> to produce in-memory structure.
+That's because I wanted to keep node selection logical (and perl-like).
+
+If you don't like it, you can rewrite this script to use XPATH. I tried
+and failed (it seems that MODS is too complicated for my limited knowledge
+of XPATH).
 
 =cut
 
@@ -35,42 +44,49 @@
 
 use Data::Dumper;
 
-my $xml_file = "/data/tehnika/fer/all.xml";
-$xml_file = "/data/tehnika/fer/modsFER_1.xml";
-my $marc_file = "fer.marc";
+my $marc_file = shift @ARGV || die "$0: need MARC export file";
+die "$0: need at least one MODS XML file" if (! @ARGV);
 
 $|=1;
 my $nr = 0;
 
 my $marc = MARC->new;
 
+my $ENCODING = 'ISO-8859-2';
+
 my $twig=XML::Twig->new(
-	twig_roots => { 'mods' => \&item },
-	output_encoding => 'iso-8859-2',
+	twig_roots => { 'mods' => \&mods },
+	output_encoding => 'UTF8',
 );
 
-my $utf2iso = Text::Iconv->new("UTF-8", "ISO-8859-2");
+my $utf2iso = Text::Iconv->new("UTF8", $ENCODING);
 
-$twig->parsefile($xml_file);
-$twig->purge;
+foreach my $xml_file (@ARGV) {
+	print "$xml_file: ";
+	$twig->parsefile($xml_file);
+	$twig->purge;
+	print "$nr\n";
+}
+
+print "Saving MARC file...\n";
 
 $marc->output({file=>"> $marc_file",'format'=>"usmarc"});
 
-sub item {
+sub mods {
 	my( $t, $elt)= @_;
 
 	my $xml=$elt->xml_string;
-	my $ref = XMLin("<xml>".$xml."</xml>", 
+	my $ref = XMLin('<xml>'.$xml.'</xml>', 
 		ForceArray => [
 			'name',
 			'classification',
 			'topic',
-			'udc',
+			'relatedItem',
+			'partNumber',
 		],
 		KeyAttr => {
 			'namePart' => 'type',
 			'identifier' => 'type',
-			'classification' => 'authority',
 			'namePart' => 'type',
 			'role' => 'type',
 		},
@@ -82,52 +98,98 @@
 		ContentKey => '-content',
 	);
 
-	my $m=$marc->createrecord();
+	my $m_cache;
 
-	sub marc_arr {
-		my $m = shift || die "no marc record?";
-		my $fld = shift || die "no marc field?";
-		my $sf = shift || '';
+	sub marc_add {
+		my $m_cache = \shift || die "need m_cache";
+		my $fld = shift || die "need field!";
+		my $sf = shift;
+		my $data = shift || return;
 
-		return if (! @_);
+#print "add: $fld",($sf ? "^".$sf : ''),": $data\n";
 
-		my @a;
-		foreach (@_) {
-			next if (! $_);
-			push @a,$sf;
-#			push @a,$utf2iso->convert($_) || $_;
-			push @a,$_;
+		if ($sf) {
+			push @{$$m_cache->{tmp}->{$fld}}, $sf;
 		}
+		push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data);
+	}
 
-		return if (! @a);
+	sub marc_rep {
+		my $m_cache = \shift || die "need m_cache";
+		foreach my $fld (@_) {
+#print "marc_rep: $fld\n";	
+			push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld});
+			delete $$m_cache->{tmp}->{$fld};
+		}
+	}
 
-#		print "storing $fld: ",join("|",@a),"\n";
+	sub marc_single {
+		my $m_cache = \shift || die "need m_cache";
+		foreach my $fld (@_) {
+#print "marc_single: $fld\n";	
 
-		$marc->addfield({record=>$m,
-			field=>$fld,
-	#		i1=>$i1,
-	#		i2=>$i2,
-		value=>\@a});
+			die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld});
+
+			$$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld});
+			delete $$m_cache->{tmp}->{$fld};
+		}
+	}
+
+	sub marc_add_rep {
+		my $m_cache = \shift || die "need m_cache";
+		my $fld = shift || die "need field!";
+		my $sf = shift;
+		my $data = shift || return;
+
+		marc_add($$m_cache,$fld,$sf,$data);
+		marc_rep($$m_cache,$fld);
+	}
+
+	sub marc_add_single {
+		my $m_cache = \shift || die "need m_cache";
+		my $fld = shift || die "need field!";
+		my $sf = shift;
+		my $data = shift || return;
+
+		marc_add($$m_cache,$fld,$sf,$data);
+		marc_single($$m_cache,$fld);
 	}
 
-	marc_arr($m,'610','a',@{$ref->{subject}->{topic}});
+	my $journal = 0;
+	# Journals start with c- in our MODS
+	$journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/);
+
+	foreach my $t (@{$ref->{subject}->{topic}}) {
+		marc_add($m_cache,'610','a', $t);
+		marc_rep($m_cache,'610');
+	}
 
-	my $fld = '700';
+	my $fld_700 = '700';
+	my $fld_710 = '710';
 
 	foreach my $name (@{$ref->{name}}) {
 		my $role = $name->{role}->{roleTerm}->{content};
 		next if (! $role);
 		if ($role eq "author") {
-			marc_arr($m,$fld,'a',$name->{namePart}->{family});
-			marc_arr($m,$fld,'b',$name->{namePart}->{given});
-			marc_arr($m,$fld,'4',$role);
+			marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family});
+			marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given});
+			marc_add($m_cache,$fld_700,'4',$role);
+
+			marc_rep($m_cache,$fld_700);
 
 			# first author goes in 700, others in 701
-			$fld = '701';
+			$fld_700 = '701';
 		} elsif ($role eq "editor" or $role eq "illustrator") {
-			marc_arr($m,'702','a',$name->{namePart}->{family});
-			marc_arr($m,'702','b',$name->{namePart}->{given});
-			marc_arr($m,'702','4',$role);
+			marc_add($m_cache,'702','a',$name->{namePart}->{family});
+			marc_add($m_cache,'702','b',$name->{namePart}->{given});
+			marc_add($m_cache,'702','4',$role);
+			marc_rep($m_cache,'702');
+		} elsif ($role eq "corporate") {
+			marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart});
+			$fld_710 = '711';
+		} elsif ($role eq "conference") {
+			marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart});
+			$fld_710 = '711';
 		} else {
 			die "FATAL: don't know how to map role '$role'" if ($role);
 		}
@@ -138,23 +200,25 @@
 	if ($note) {
 		foreach my $n (split(/\s*;\s+/, $note)) {
 			if ($n =~ s/bibliogr:\s+//i) {
-				marc_arr($m,'320','a',"Bibliografija: $n");
+				marc_add_rep($m_cache,'320','a',"Bibliografija: $n");
 			} elsif ($n =~ s/ilustr:\s+//i) {
-				marc_arr($m,'215','c', $n);
+				marc_add($m_cache,'215','c', $n);
 			} else {
-				marc_arr($m,'320','a',$n);
+				marc_add_rep($m_cache,'320','a',$n);
 			}
 		}
 	}
-			
+
 
 	my $type = $ref->{identifier}->{type};
 
 	if ($type) {
 		if ($type eq "isbn") {
-			marc_arr($m,'010','a',$ref->{identifier}->{content});
+			marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content});
 		} elsif ($type eq "issn") {
-			marc_arr($m,'011','a',$ref->{identifier}->{content});
+			marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content});
+		} elsif ($type eq "uri") {
+			marc_add_rep($m_cache,'856','u',$ref->{identifier}->{content});
 		} else {
 			die "unknown identifier type $type";
 		}
@@ -167,7 +231,7 @@
 			if ($t =~ m/([^:]+):\s+(.+)$/) {
 				$tmp->{$1} = $2;
 			} else {
-				die "can't parse $t";
+				print STDERR "can't parse '$t' in ",Dumper($phy_desc);
 			}
 		}
 		my $data = $tmp->{pagin};
@@ -175,163 +239,113 @@
 		if ($tmp->{str}) {
 			$data .= $tmp->{str}." str";
 		}
-		marc_arr($m,'210','a', $data) if ($data);
-		marc_arr($m,'210','d', $tmp->{visina});
+		marc_add($m_cache,'215','a', $data) if ($data);
+		marc_add($m_cache,'215','d', $tmp->{visina});
 	}
+	marc_rep($m_cache,'215');
 
-	marc_arr($m,'001','',$ref->{recordInfo}->{recordIdentifier});
+	marc_add_single($m_cache,'001',undef,$ref->{recordInfo}->{recordIdentifier});
 
-	marc_arr($m,'200','a',$ref->{titleInfo}->{title});
-	marc_arr($m,'200','e',$ref->{titleInfo}->{subTitle});
+	marc_add($m_cache,'200','a',$ref->{titleInfo}->{title});
+	marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle});
+	marc_single($m_cache,'200');
 
-	marc_arr($m,'675','a',$ref->{classification}->{udc});
+	foreach my $c (@{$ref->{classification}}) {
+		if ($c->{'authority'} eq "udc") {
+			marc_add_rep($m_cache,'675','a', $c->{'content'});
+		}
+	}
 
-	my $related = $ref->{relatedItem}->{type};
-	if ($related) {
-		if ($related eq "series") {
-			marc_arr($m,'675','a',$ref->{relatedItem}->{titleInfo}->{title});
-			marc_arr($m,'999','a',$ref->{relatedItem}->{titleInfo}->{partNumber});
-		} elsif ($related eq "preceding") {
-			marc_arr($m,'430','a',$ref->{relatedItem}->{titleInfo}->{title});
-		} else {
-			die "can't parse related item type $related" if ($related);
+	foreach my $ri (@{$ref->{relatedItem}}) {
+		my $related = $ri->{type};
+		if ($related) {
+			if ($related eq "series") {
+				marc_add_rep($m_cache,'225','a',$ri->{titleInfo}->{title});
+				foreach my $pn (@{$ri->{titleInfo}->{partNumber}}) {
+					if ($journal) {
+						marc_add_rep($m_cache,'999','a',$pn);
+					} else {
+						marc_add_rep($m_cache,'225','v',$pn);
+					}
+				}
+			} elsif ($related eq "preceding") {
+				marc_add($m_cache,'520','a',$ri->{titleInfo}->{title});
+				if ($ri->{identifier}) {
+					if ($ri->{identifier}->{type} eq "issn") {
+						marc_add($m_cache,'520','x',$ri->{identifier}->{content});
+					} else {
+						die "can't store identifier type $type";
+					}
+				}
+				marc_rep($m_cache,'520');
+			} else {
+				die "can't parse related item type $related" if ($related);
+			}
 		}
 	}
 
-	marc_arr($m,'205','a',$ref->{originInfo}->{edition});
+	marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition});
+
+	marc_add($m_cache,'210','a',$ref->{originInfo}->{place});
 
 	my $publisher = $ref->{originInfo}->{publisher};
 	if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) {
-		marc_arr($m,'210','a', $2);
-		marc_arr($m,'210','c', $1);
+		marc_add($m_cache,'210','a', $2);
+		marc_add($m_cache,'210','c', $1);
 	} else {
-		marc_arr($m,'210','c', $publisher);
+		marc_add($m_cache,'210','c', $publisher);
 	}
 
-	marc_arr($m,'326','a',$ref->{originInfo}->{frequency});
-	marc_arr($m,'326','a',$ref->{originInfo}->{place});
+	marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued});
+
+	marc_single($m_cache,'210');
 
-	marc_arr($m,'210','d',$ref->{originInfo}->{dateIssued});
+	marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal);
 
 	$nr++;
 	print "$nr " if ($nr % 100 == 0);
 
-	$t->purge;           # frees the memory
-}
-
-__END__
+	# dump record
+	my $bib_level = "m";
+	$bib_level = "s" if ($journal);
+	my $m=$marc->createrecord({leader=>"00000na".$bib_level."  2200000 a 4500"});
+
+	foreach my $fld (keys %{$m_cache->{array}}) {
+		foreach my $arr (@{$m_cache->{array}->{$fld}}) {
+#print "array = ",Dumper($arr);
+			my ($i1,$i2);
+			# do we have indicators?
+			if ($fld =~ m/^(.+)\t(.)(.)$/) {
+				$fld = $1;
+				($i1,$i2) = ($2,$3);
+			}
+			$marc->addfield({record=>$m,
+				field=>$fld,
+				i1=>$i1,
+				i2=>$i2,
+				value=>$arr
+			});
+		}
+	}
 
-KNJIGA = {
-610a           'subject' => [
-                          {
-                            'topic' => [
-                                             'LIBRARIES-AUTOMATION',
-	                                     'ELECTRONIC DATA PROCESSING-LIBRARY SCIENCE'
-                                       ]
-                          }
-                        ],
-           'name' => [
-                       {
-                         'namePart' => {
-700b,701a...                                 'given' => 'Robert M.',
-700a,701b...                                 'family' => 'Hayes'
-                                       },
-                         'type' => 'personal',
-                         'role' => {
-                                     'roleTerm' => {
-7004,7014...                                         'content' => 'author',
-                                                     'type' => 'text'
-                                                   }
-                                   }
-                       },
-                       {
-                         'namePart' => {
-702b                                     'given' => 'Joseph',
-702a                                     'family' => 'Becker'
-                                       },
-                         'type' => 'personal',
-                         'role' => {
-                                     'roleTerm' => {
-7024                                                 'content' => 'editor',
-                                                     'type' => 'text'
-                                                   }
-                                   }
-                       },
-                       {
-                         'namePart' => {
-702b                                     'given' => 'Joseph',
-702a                                     'family' => 'Becker'
-                                       },
-                         'type' => 'personal',
-                         'role' => {
-                                     'roleTerm' => {
-7024                                                 'content' => 'illustrator',
-                                                     'type' => 'text'
-                                                   }
-                                   }
-                       }
-                     ],
-           'note' => 'bibliogr: 645-647; kazalo; ilustr: ilustr.',
-           'identifier' => {
-010a                         'content' => '0-471-36483-5',
-                             'type' => 'isbn'
-                           },
-215a;215d  'physicalDescription' => 'str: 688; pagin: xvi; visina: 24. cm',
-001        'recordInfo' => {
-                             'recordIdentifier' => 'k-7996-8073'
-                           },
-200a       'titleInfo' => {
-                            'title' => 'Handbook of data processing for libraries'
-                          },
-           'typeOfResource' => 'text',
-675a       'classification' => {
-                                 'udc' => '=20'
-                               },
-225a       'relatedItem' => {
-                              'titleInfo' => {
-                                               'title' => 'A WILEY-BECKER & HAYES  SERIES BOOK'
-                                             },
-                              'type' => 'series'
-                            },
-           'originInfo' => {
-                             'issuance' => 'monographic',
-205a                         'edition' => '2.',
-210c/210a                    'publisher' => 'MELVILLE PUBLISHING COMPANY /LOS ANGELES, CALIFORNIA/',
-210d                         'dateIssued' => '1974'
-                           }
-         };
- 
- 
- 
- CASOPIS = {
-           'identifier' => {
-011a                         'content' => '1041-5173',
-                             'type' => 'issn'
-                           },
-           'recordInfo' => {
-                             'recordIdentifier' => 'c-1'
-                           },
-           'titleInfo' => {
-                            'title' => 'DBMS - CLIENT/SERVER COMPUTING'
-                          },
-           'typeOfResource' => 'text',
-999a       'relatedItem' => {
-                              'titleInfo' => {
-                                               'partNumber' => 'g. 1990, vol. 137, br. 5'
-                                             },
-                              'type' => 'series'
-                            },
-           'classification' => {
-                                 'udc' => '=20'
-                               },
-           'originInfo' => {
-326a                         'frequency' => 'mjese�no',
-                             'issuance' => 'continuing',
-210a                         'place' => 'SAN MATEO, KANADA',
-210c                         'publisher' => 'M&T PUBLISHING INC.'
-                           }
-         };
+	foreach my $fld (keys %{$m_cache->{single}}) {
+#print "single = ",Dumper($m_cache->{single}->{$fld});
+		my ($i1,$i2);
+		# do we have indicators?
+		if ($fld =~ m/^(.+)\t(.)(.)$/) {
+			$fld = $1;
+			($i1,$i2) = ($2,$3);
+		}
+		$marc->addfield({record=>$m,
+			field=>$fld,
+			i1=>$i1,
+			i2=>$i2,
+			value=>$m_cache->{single}->{$fld}
+		});
+	}
 
+	$m_cache = {};
 
-=cut
+	$t->purge;           # frees the memory
+}