Revision 337 (by dpavlin, 2004/06/10 19:22:40) new trunk for webpac v2
#!/usr/bin/perl -w

=head1 NAME

mods2marc.pl - convert MODS XML back to MARC (ISO2709)

=head1 SYNOPSIS

mods2marc.pl export.marc mods.xml [mods2.xml ... ]

=head1 DESCRIPTION

This script will convert MODS format
L<http://www.loc.gov/standards/mods/>
back to MARC (ISO2709) format.

Since conversion back to MARC is not simple, lot of things are hard-coded
in this script.

This script B<is somewhat specific> to MODS export from
Faculty of Electrical Engineering and Computing
so you might want to edit it (among other thing, it includes a lot
of fields which are in Croatian).

Feel free to hack this script and convert it to your own needs.

=head1 CAVEAT

This script will parse imput XML twice: once with C<XML::Twig> and
then each entry with C<XML::Simple> to produce in-memory structure.
That's because I wanted to keep node selection logical (and perl-like).

If you don't like it, you can rewrite this script to use XPATH. I tried
and failed (it seems that MODS is too complicated for my limited knowledge
of XPATH).

=cut

use strict;
use XML::Twig;
use XML::Simple;
use MARC;
use Text::Iconv;

use Data::Dumper;

my $marc_file = shift @ARGV || die "$0: need MARC export file";
die "$0: need at least one MODS XML file" if (! @ARGV);

$|=1;
my $nr = 0;

my $marc = MARC->new;

my $ENCODING = 'ISO-8859-2';
$ENCODING = 'windows-1250';

my $twig=XML::Twig->new(
	twig_roots => { 'mods' => \&mods },
	output_encoding => 'UTF8',
);

my $utf2iso = Text::Iconv->new("UTF8", $ENCODING);

foreach my $xml_file (@ARGV) {
	print "$xml_file: ";
	$twig->parsefile($xml_file);
	$twig->purge;
	print "$nr\n";
}

print "Saving MARC file...\n";

$marc->output({file=>"> $marc_file",'format'=>"usmarc"});

sub mods {
	my( $t, $elt)= @_;

	my $xml=$elt->xml_string;
	my $ref = XMLin('<xml>'.$xml.'</xml>', 
		ForceArray => [
			'name',
			'classification',
			'topic',
			'relatedItem',
			'partNumber',
		],
		KeyAttr => {
			'namePart' => 'type',
			'identifier' => 'type',
			'namePart' => 'type',
			'role' => 'type',
		},
		GroupTags => {
			'place' => 'placeTerm',
			'physicalDescription' => 'extent',
			'roleTerm' => 'content',
		},
		ContentKey => '-content',
	);

	my $m_cache;

	sub marc_add {
		my $m_cache = \shift || die "need m_cache";
		my $fld = shift || die "need field!";
		my $sf = shift;
		my $data = shift || return;

#print "add: $fld",($sf ? "^".$sf : ''),": $data\n";

		if ($sf) {
			push @{$$m_cache->{tmp}->{$fld}}, $sf;
		}
		push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data);
	}

	sub marc_rep {
		my $m_cache = \shift || die "need m_cache";
		foreach my $fld (@_) {
#print "marc_rep: $fld\n";	
			push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld});
			delete $$m_cache->{tmp}->{$fld};
		}
	}

	sub marc_single {
		my $m_cache = \shift || die "need m_cache";
		foreach my $fld (@_) {
#print "marc_single: $fld\n";	

			die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld});

			$$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld});
			delete $$m_cache->{tmp}->{$fld};
		}
	}

	sub marc_add_rep {
		my $m_cache = \shift || die "need m_cache";
		my $fld = shift || die "need field!";
		my $sf = shift;
		my $data = shift || return;

		marc_add($$m_cache,$fld,$sf,$data);
		marc_rep($$m_cache,$fld);
	}

	sub marc_add_single {
		my $m_cache = \shift || die "need m_cache";
		my $fld = shift || die "need field!";
		my $sf = shift;
		my $data = shift || return;

		marc_add($$m_cache,$fld,$sf,$data);
		marc_single($$m_cache,$fld);
	}

	my $journal = 0;
	# Journals start with c- in our MODS
	$journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/);

	foreach my $t (@{$ref->{subject}->{topic}}) {
		marc_add($m_cache,'610','a', $t);
		marc_rep($m_cache,'610');
	}

	my $fld_700 = '700';
	my $fld_710 = '710';

	foreach my $name (@{$ref->{name}}) {
		my $role = $name->{role}->{roleTerm}->{content};
		next if (! $role);
		if ($role eq "author") {
			marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family});
			marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given});
			marc_add($m_cache,$fld_700,'4',$role);

			marc_rep($m_cache,$fld_700);

			# first author goes in 700, others in 701
			$fld_700 = '701';
		} elsif ($role eq "editor" or $role eq "illustrator") {
			marc_add($m_cache,'702','a',$name->{namePart}->{family});
			marc_add($m_cache,'702','b',$name->{namePart}->{given});
			marc_add($m_cache,'702','4',$role);
			marc_rep($m_cache,'702');
		} elsif ($role eq "corporate") {
			marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart});
			$fld_710 = '711';
		} elsif ($role eq "conference") {
			marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart});
			$fld_710 = '711';
		} else {
			die "FATAL: don't know how to map role '$role'" if ($role);
		}
	}

	my $note = $ref->{note};

	if ($note) {
		foreach my $n (split(/\s*;\s+/, $note)) {
			if ($n =~ s/bibliogr:\s+//i) {
				marc_add_rep($m_cache,'320','a',"Bibliografija: $n");
			} elsif ($n =~ s/ilustr:\s+//i) {
				marc_add($m_cache,'215','c', $n);
			} else {
				marc_add_rep($m_cache,'320','a',$n);
			}
		}
	}


	my $type = $ref->{identifier}->{type};

	if ($type) {
		if ($type eq "isbn") {
			marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content});
		} elsif ($type eq "issn") {
			marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content});
		} elsif ($type eq "uri") {
			marc_add_rep($m_cache,'856','u',$ref->{identifier}->{content});
		} else {
			die "unknown identifier type $type";
		}
	}

	my $phy_desc = $ref->{physicalDescription};
	if ($phy_desc) {
		my $tmp;
		foreach my $t (split(/\s*;\s+/, $phy_desc)) {
			if ($t =~ m/([^:]+):\s+(.+)$/) {
				$tmp->{$1} = $2;
			} else {
				print STDERR "can't parse '$t' in ",Dumper($phy_desc);
			}
		}
		my $data = $tmp->{pagin};
		$data .= ", " if ($data);
		if ($tmp->{str}) {
			$data .= $tmp->{str}." str";
		}
		marc_add($m_cache,'215','a', $data) if ($data);
		marc_add($m_cache,'215','d', $tmp->{visina});
	}
	marc_rep($m_cache,'215');

	my $mfn = $ref->{recordInfo}->{recordIdentifier};
	$mfn =~ s/[^0-9]//g;
	marc_add_single($m_cache,'001',undef,$mfn);

	marc_add($m_cache,'200','a',$ref->{titleInfo}->{title});
	marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle});
	marc_single($m_cache,'200');

	foreach my $c (@{$ref->{classification}}) {
		if ($c->{'authority'} eq "udc") {
			marc_add_rep($m_cache,'675','a', $c->{'content'});
		}
	}

	foreach my $ri (@{$ref->{relatedItem}}) {
		my $related = $ri->{type};
		if ($related) {
			if ($related eq "series") {
				marc_add_rep($m_cache,'225','a',$ri->{titleInfo}->{title});
				foreach my $pn (@{$ri->{titleInfo}->{partNumber}}) {
					if ($journal) {
						marc_add_rep($m_cache,'999','a',$pn);
					} else {
						marc_add_rep($m_cache,'225','v',$pn);
					}
				}
			} elsif ($related eq "preceding") {
				marc_add($m_cache,'520','a',$ri->{titleInfo}->{title});
				if ($ri->{identifier}) {
					if ($ri->{identifier}->{type} eq "issn") {
						marc_add($m_cache,'520','x',$ri->{identifier}->{content});
					} else {
						die "can't store identifier type $type";
					}
				}
				marc_rep($m_cache,'520');
			} else {
				die "can't parse related item type $related" if ($related);
			}
		}
	}

	marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition});

	marc_add($m_cache,'210','a',$ref->{originInfo}->{place});

	my $publisher = $ref->{originInfo}->{publisher};
	if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) {
		marc_add($m_cache,'210','a', $2);
		marc_add($m_cache,'210','c', $1);
	} else {
		marc_add($m_cache,'210','c', $publisher);
	}

	marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued});

	marc_single($m_cache,'210');

	marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal);

	$nr++;
	print "$nr " if ($nr % 100 == 0);

	# dump record
	my $bib_level = "m";
	$bib_level = "s" if ($journal);
	my $m=$marc->createrecord({leader=>"00000na".$bib_level."  2200000 a 4500"});

	foreach my $fld (keys %{$m_cache->{array}}) {
		foreach my $arr (@{$m_cache->{array}->{$fld}}) {
#print "array = ",Dumper($arr);
			my ($i1,$i2);
			# do we have indicators?
			if ($fld =~ m/^(.+)\t(.)(.)$/) {
				$fld = $1;
				($i1,$i2) = ($2,$3);
			}
			$marc->addfield({record=>$m,
				field=>$fld,
				i1=>$i1,
				i2=>$i2,
				value=>$arr
			});
		}
	}

	foreach my $fld (keys %{$m_cache->{single}}) {
#print "single = ",Dumper($m_cache->{single}->{$fld});
		my ($i1,$i2);
		# do we have indicators?
		if ($fld =~ m/^(.+)\t(.)(.)$/) {
			$fld = $1;
			($i1,$i2) = ($2,$3);
		}
		$marc->addfield({record=>$m,
			field=>$fld,
			i1=>$i1,
			i2=>$i2,
			value=>$m_cache->{single}->{$fld}
		});
	}

	$m_cache = {};

	$t->purge;           # frees the memory
}