trunk/tools/mods2unimarc.pl

#!/usr/bin/perl -w

=head1 NAME

mods2marc.pl - convert MODS XML back to MARC (ISO2709)

=head1 SYNOPSIS

mods2marc.pl export.marc mods.xml [mods2.xml ... ]

=head1 DESCRIPTION

This script will convert MODS format
L<http://www.loc.gov/standards/mods/>
back to MARC (ISO2709) format.

Since conversion back to MARC is not simple, lot of things are hard-coded
in this script.

This script B<is somewhat specific> to MODS export from
Faculty of Electrical Engineering and Computing
so you might want to edit it (among other thing, it includes a lot
of fields which are in Croatian).

Feel free to hack this script and convert it to your own needs.

=head1 CAVEAT

This script will parse imput XML twice: once with C<XML::Twig> and
then each entry with C<XML::Simple> to produce in-memory structure.
That's because I wanted to keep node selection logical (and perl-like).

If you don't like it, you can rewrite this script to use XPATH. I tried
and failed (it seems that MODS is too complicated for my limited knowledge
of XPATH).

=cut

use strict;
use XML::Twig;
use XML::Simple;
use MARC;
use Text::Iconv;

use Data::Dumper;

my $marc_file = shift @ARGV || die "$0: need MARC export file";
die "$0: need at least one MODS XML file" if (! @ARGV);

$|=1;
my $nr = 0;

my $marc = MARC->new;

my $ENCODING = 'ISO-8859-2';
$ENCODING = 'windows-1250';

my $twig=XML::Twig->new(
        twig_roots => { 'mods' => \&mods },
        output_encoding => 'UTF8',
);

my $utf2iso = Text::Iconv->new("UTF8", $ENCODING);

foreach my $xml_file (@ARGV) {
        print "$xml_file: ";
        $twig->parsefile($xml_file);
        $twig->purge;
        print "$nr\n";
}

print "Saving MARC file...\n";

$marc->output({file=>"> $marc_file",'format'=>"usmarc"});

sub mods {
        my( $t, $elt)= @_;

        my $xml=$elt->xml_string;
        my $ref = XMLin('<xml>'.$xml.'</xml>', 
                ForceArray => [
                        'name',
                        'classification',
                        'topic',
                        'relatedItem',
                        'partNumber',
                ],
                KeyAttr => {
                        'namePart' => 'type',
                        'identifier' => 'type',
                        'namePart' => 'type',
                        'role' => 'type',
                },
                GroupTags => {
                        'place' => 'placeTerm',
                        'physicalDescription' => 'extent',
                        'roleTerm' => 'content',
                },
                ContentKey => '-content',
        );

        my $m_cache;

        sub marc_add {
                my $m_cache = \shift || die "need m_cache";
                my $fld = shift || die "need field!";
                my $sf = shift;
                my $data = shift || return;

#print "add: $fld",($sf ? "^".$sf : ''),": $data\n";

                if ($sf) {
                        push @{$$m_cache->{tmp}->{$fld}}, $sf;
                }
                push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data);
        }

        sub marc_rep {
                my $m_cache = \shift || die "need m_cache";
                foreach my $fld (@_) {
#print "marc_rep: $fld\n";      
                        push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld});
                        delete $$m_cache->{tmp}->{$fld};
                }
        }

        sub marc_single {
                my $m_cache = \shift || die "need m_cache";
                foreach my $fld (@_) {
#print "marc_single: $fld\n";   

                        die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld});

                        $$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld});
                        delete $$m_cache->{tmp}->{$fld};
                }
        }

        sub marc_add_rep {
                my $m_cache = \shift || die "need m_cache";
                my $fld = shift || die "need field!";
                my $sf = shift;
                my $data = shift || return;

                marc_add($$m_cache,$fld,$sf,$data);
                marc_rep($$m_cache,$fld);
        }

        sub marc_add_single {
                my $m_cache = \shift || die "need m_cache";
                my $fld = shift || die "need field!";
                my $sf = shift;
                my $data = shift || return;

                marc_add($$m_cache,$fld,$sf,$data);
                marc_single($$m_cache,$fld);
        }

        my $journal = 0;
        # Journals start with c- in our MODS
        $journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/);

        foreach my $t (@{$ref->{subject}->{topic}}) {
                marc_add($m_cache,'610','a', $t);
                marc_rep($m_cache,'610');
        }

        my $fld_700 = '700';
        my $fld_710 = '710';

        foreach my $name (@{$ref->{name}}) {
                my $role = $name->{role}->{roleTerm}->{content};
                next if (! $role);
                if ($role eq "author") {
                        marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family});
                        marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given});
                        marc_add($m_cache,$fld_700,'4',$role);

                        marc_rep($m_cache,$fld_700);

                        # first author goes in 700, others in 701
                        $fld_700 = '701';
                } elsif ($role eq "editor" or $role eq "illustrator") {
                        marc_add($m_cache,'702','a',$name->{namePart}->{family});
                        marc_add($m_cache,'702','b',$name->{namePart}->{given});
                        marc_add($m_cache,'702','4',$role);
                        marc_rep($m_cache,'702');
                } elsif ($role eq "corporate") {
                        marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart});
                        $fld_710 = '711';
                } elsif ($role eq "conference") {
                        marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart});
                        $fld_710 = '711';
                } else {
                        die "FATAL: don't know how to map role '$role'" if ($role);
                }
        }

        my $note = $ref->{note};

        if ($note) {
                foreach my $n (split(/\s*;\s+/, $note)) {
                        if ($n =~ s/bibliogr:\s+//i) {
                                marc_add_rep($m_cache,'320','a',"Bibliografija: $n");
                        } elsif ($n =~ s/ilustr:\s+//i) {
                                marc_add($m_cache,'215','c', $n);
                        } else {
                                marc_add_rep($m_cache,'320','a',$n);
                        }
                }
        }


        my $type = $ref->{identifier}->{type};

        if ($type) {
                if ($type eq "isbn") {
                        marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content});
                } elsif ($type eq "issn") {
                        marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content});
                } elsif ($type eq "uri") {
                        marc_add_rep($m_cache,'856','u',$ref->{identifier}->{content});
                } else {
                        die "unknown identifier type $type";
                }
        }

        my $phy_desc = $ref->{physicalDescription};
        if ($phy_desc) {
                my $tmp;
                foreach my $t (split(/\s*;\s+/, $phy_desc)) {
                        if ($t =~ m/([^:]+):\s+(.+)$/) {
                                $tmp->{$1} = $2;
                        } else {
                                print STDERR "can't parse '$t' in ",Dumper($phy_desc);
                        }
                }
                my $data = $tmp->{pagin};
                $data .= ", " if ($data);
                if ($tmp->{str}) {
                        $data .= $tmp->{str}." str";
                }
                marc_add($m_cache,'215','a', $data) if ($data);
                marc_add($m_cache,'215','d', $tmp->{visina});
        }
        marc_rep($m_cache,'215');

        my $mfn = $ref->{recordInfo}->{recordIdentifier};
        $mfn =~ s/[^0-9]//g;
        marc_add_single($m_cache,'001',undef,$mfn);

        marc_add($m_cache,'200','a',$ref->{titleInfo}->{title});
        marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle});
        marc_single($m_cache,'200');

        foreach my $c (@{$ref->{classification}}) {
                if ($c->{'authority'} eq "udc") {
                        marc_add_rep($m_cache,'675','a', $c->{'content'});
                }
        }

        foreach my $ri (@{$ref->{relatedItem}}) {
                my $related = $ri->{type};
                if ($related) {
                        if ($related eq "series") {
                                marc_add_rep($m_cache,'225','a',$ri->{titleInfo}->{title});
                                foreach my $pn (@{$ri->{titleInfo}->{partNumber}}) {
                                        if ($journal) {
                                                marc_add_rep($m_cache,'999','a',$pn);
                                        } else {
                                                marc_add_rep($m_cache,'225','v',$pn);
                                        }
                                }
                        } elsif ($related eq "preceding") {
                                marc_add($m_cache,'520','a',$ri->{titleInfo}->{title});
                                if ($ri->{identifier}) {
                                        if ($ri->{identifier}->{type} eq "issn") {
                                                marc_add($m_cache,'520','x',$ri->{identifier}->{content});
                                        } else {
                                                die "can't store identifier type $type";
                                        }
                                }
                                marc_rep($m_cache,'520');
                        } else {
                                die "can't parse related item type $related" if ($related);
                        }
                }
        }

        marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition});

        marc_add($m_cache,'210','a',$ref->{originInfo}->{place});

        my $publisher = $ref->{originInfo}->{publisher};
        if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) {
                marc_add($m_cache,'210','a', $2);
                marc_add($m_cache,'210','c', $1);
        } else {
                marc_add($m_cache,'210','c', $publisher);
        }

        marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued});

        marc_single($m_cache,'210');

        marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal);

        $nr++;
        print "$nr " if ($nr % 100 == 0);

        # dump record
        my $bib_level = "m";
        $bib_level = "s" if ($journal);
        my $m=$marc->createrecord({leader=>"00000na".$bib_level."  2200000 a 4500"});

        foreach my $fld (keys %{$m_cache->{array}}) {
                foreach my $arr (@{$m_cache->{array}->{$fld}}) {
#print "array = ",Dumper($arr);
                        my ($i1,$i2);
                        # do we have indicators?
                        if ($fld =~ m/^(.+)\t(.)(.)$/) {
                                $fld = $1;
                                ($i1,$i2) = ($2,$3);
                        }
                        $marc->addfield({record=>$m,
                                field=>$fld,
                                i1=>$i1,
                                i2=>$i2,
                                value=>$arr
                        });
                }
        }

        foreach my $fld (keys %{$m_cache->{single}}) {
#print "single = ",Dumper($m_cache->{single}->{$fld});
                my ($i1,$i2);
                # do we have indicators?
                if ($fld =~ m/^(.+)\t(.)(.)$/) {
                        $fld = $1;
                        ($i1,$i2) = ($2,$3);
                }
                $marc->addfield({record=>$m,
                        field=>$fld,
                        i1=>$i1,
                        i2=>$i2,
                        value=>$m_cache->{single}->{$fld}
                });
        }

        $m_cache = {};

        $t->purge;           # frees the memory
}

1	#!/usr/bin/perl -w
2
3	=head1 NAME
4
5	mods2marc.pl - convert MODS XML back to MARC (ISO2709)
6
7	=head1 SYNOPSIS
8
9	mods2marc.pl export.marc mods.xml [mods2.xml ... ]
10
11	=head1 DESCRIPTION
12
13	This script will convert MODS format
14	L<http://www.loc.gov/standards/mods/>
15	back to MARC (ISO2709) format.
16
17	Since conversion back to MARC is not simple, lot of things are hard-coded
18	in this script.
19
20	This script B<is somewhat specific> to MODS export from
21	Faculty of Electrical Engineering and Computing
22	so you might want to edit it (among other thing, it includes a lot
23	of fields which are in Croatian).
24
25	Feel free to hack this script and convert it to your own needs.
26
27	=head1 CAVEAT
28
29	This script will parse imput XML twice: once with C<XML::Twig> and
30	then each entry with C<XML::Simple> to produce in-memory structure.
31	That's because I wanted to keep node selection logical (and perl-like).
32
33	If you don't like it, you can rewrite this script to use XPATH. I tried
34	and failed (it seems that MODS is too complicated for my limited knowledge
35	of XPATH).
36
37	=cut
38
39	use strict;
40	use XML::Twig;
41	use XML::Simple;
42	use MARC;
43	use Text::Iconv;
44
45	use Data::Dumper;
46
47	my $marc_file = shift @ARGV \|\| die "$0: need MARC export file";
48	die "$0: need at least one MODS XML file" if (! @ARGV);
49
50	$\|=1;
51	my $nr = 0;
52
53	my $marc = MARC->new;
54
55	my $ENCODING = 'ISO-8859-2';
56	$ENCODING = 'windows-1250';
57
58	my $twig=XML::Twig->new(
59	twig_roots => { 'mods' => \&mods },
60	output_encoding => 'UTF8',
61	);
62
63	my $utf2iso = Text::Iconv->new("UTF8", $ENCODING);
64
65	foreach my $xml_file (@ARGV) {
66	print "$xml_file: ";
67	$twig->parsefile($xml_file);
68	$twig->purge;
69	print "$nr\n";
70	}
71
72	print "Saving MARC file...\n";
73
74	$marc->output({file=>"> $marc_file",'format'=>"usmarc"});
75
76	sub mods {
77	my( $t, $elt)= @_;
78
79	my $xml=$elt->xml_string;
80	my $ref = XMLin('<xml>'.$xml.'</xml>',
81	ForceArray => [
82	'name',
83	'classification',
84	'topic',
85	'relatedItem',
86	'partNumber',
87	],
88	KeyAttr => {
89	'namePart' => 'type',
90	'identifier' => 'type',
91	'namePart' => 'type',
92	'role' => 'type',
93	},
94	GroupTags => {
95	'place' => 'placeTerm',
96	'physicalDescription' => 'extent',
97	'roleTerm' => 'content',
98	},
99	ContentKey => '-content',
100	);
101
102	my $m_cache;
103
104	sub marc_add {
105	my $m_cache = \shift \|\| die "need m_cache";
106	my $fld = shift \|\| die "need field!";
107	my $sf = shift;
108	my $data = shift \|\| return;
109
110	#print "add: $fld",($sf ? "^".$sf : ''),": $data\n";
111
112	if ($sf) {
113	push @{$$m_cache->{tmp}->{$fld}}, $sf;
114	}
115	push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data);
116	}
117
118	sub marc_rep {
119	my $m_cache = \shift \|\| die "need m_cache";
120	foreach my $fld (@_) {
121	#print "marc_rep: $fld\n";
122	push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld});
123	delete $$m_cache->{tmp}->{$fld};
124	}
125	}
126
127	sub marc_single {
128	my $m_cache = \shift \|\| die "need m_cache";
129	foreach my $fld (@_) {
130	#print "marc_single: $fld\n";
131
132	die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld});
133
134	$$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld});
135	delete $$m_cache->{tmp}->{$fld};
136	}
137	}
138
139	sub marc_add_rep {
140	my $m_cache = \shift \|\| die "need m_cache";
141	my $fld = shift \|\| die "need field!";
142	my $sf = shift;
143	my $data = shift \|\| return;
144
145	marc_add($$m_cache,$fld,$sf,$data);
146	marc_rep($$m_cache,$fld);
147	}
148
149	sub marc_add_single {
150	my $m_cache = \shift \|\| die "need m_cache";
151	my $fld = shift \|\| die "need field!";
152	my $sf = shift;
153	my $data = shift \|\| return;
154
155	marc_add($$m_cache,$fld,$sf,$data);
156	marc_single($$m_cache,$fld);
157	}
158
159	my $journal = 0;
160	# Journals start with c- in our MODS
161	$journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/);
162
163	foreach my $t (@{$ref->{subject}->{topic}}) {
164	marc_add($m_cache,'610','a', $t);
165	marc_rep($m_cache,'610');
166	}
167
168	my $fld_700 = '700';
169	my $fld_710 = '710';
170
171	foreach my $name (@{$ref->{name}}) {
172	my $role = $name->{role}->{roleTerm}->{content};
173	next if (! $role);
174	if ($role eq "author") {
175	marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family});
176	marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given});
177	marc_add($m_cache,$fld_700,'4',$role);
178
179	marc_rep($m_cache,$fld_700);
180
181	# first author goes in 700, others in 701
182	$fld_700 = '701';
183	} elsif ($role eq "editor" or $role eq "illustrator") {
184	marc_add($m_cache,'702','a',$name->{namePart}->{family});
185	marc_add($m_cache,'702','b',$name->{namePart}->{given});
186	marc_add($m_cache,'702','4',$role);
187	marc_rep($m_cache,'702');
188	} elsif ($role eq "corporate") {
189	marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart});
190	$fld_710 = '711';
191	} elsif ($role eq "conference") {
192	marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart});
193	$fld_710 = '711';
194	} else {
195	die "FATAL: don't know how to map role '$role'" if ($role);
196	}
197	}
198
199	my $note = $ref->{note};
200
201	if ($note) {
202	foreach my $n (split(/\s*;\s+/, $note)) {
203	if ($n =~ s/bibliogr:\s+//i) {
204	marc_add_rep($m_cache,'320','a',"Bibliografija: $n");
205	} elsif ($n =~ s/ilustr:\s+//i) {
206	marc_add($m_cache,'215','c', $n);
207	} else {
208	marc_add_rep($m_cache,'320','a',$n);
209	}
210	}
211	}
212
213
214	my $type = $ref->{identifier}->{type};
215
216	if ($type) {
217	if ($type eq "isbn") {
218	marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content});
219	} elsif ($type eq "issn") {
220	marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content});
221	} elsif ($type eq "uri") {
222	marc_add_rep($m_cache,'856','u',$ref->{identifier}->{content});
223	} else {
224	die "unknown identifier type $type";
225	}
226	}
227
228	my $phy_desc = $ref->{physicalDescription};
229	if ($phy_desc) {
230	my $tmp;
231	foreach my $t (split(/\s*;\s+/, $phy_desc)) {
232	if ($t =~ m/([^:]+):\s+(.+)$/) {
233	$tmp->{$1} = $2;
234	} else {
235	print STDERR "can't parse '$t' in ",Dumper($phy_desc);
236	}
237	}
238	my $data = $tmp->{pagin};
239	$data .= ", " if ($data);
240	if ($tmp->{str}) {
241	$data .= $tmp->{str}." str";
242	}
243	marc_add($m_cache,'215','a', $data) if ($data);
244	marc_add($m_cache,'215','d', $tmp->{visina});
245	}
246	marc_rep($m_cache,'215');
247
248	my $mfn = $ref->{recordInfo}->{recordIdentifier};
249	$mfn =~ s/[^0-9]//g;
250	marc_add_single($m_cache,'001',undef,$mfn);
251
252	marc_add($m_cache,'200','a',$ref->{titleInfo}->{title});
253	marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle});
254	marc_single($m_cache,'200');
255
256	foreach my $c (@{$ref->{classification}}) {
257	if ($c->{'authority'} eq "udc") {
258	marc_add_rep($m_cache,'675','a', $c->{'content'});
259	}
260	}
261
262	foreach my $ri (@{$ref->{relatedItem}}) {
263	my $related = $ri->{type};
264	if ($related) {
265	if ($related eq "series") {
266	marc_add_rep($m_cache,'225','a',$ri->{titleInfo}->{title});
267	foreach my $pn (@{$ri->{titleInfo}->{partNumber}}) {
268	if ($journal) {
269	marc_add_rep($m_cache,'999','a',$pn);
270	} else {
271	marc_add_rep($m_cache,'225','v',$pn);
272	}
273	}
274	} elsif ($related eq "preceding") {
275	marc_add($m_cache,'520','a',$ri->{titleInfo}->{title});
276	if ($ri->{identifier}) {
277	if ($ri->{identifier}->{type} eq "issn") {
278	marc_add($m_cache,'520','x',$ri->{identifier}->{content});
279	} else {
280	die "can't store identifier type $type";
281	}
282	}
283	marc_rep($m_cache,'520');
284	} else {
285	die "can't parse related item type $related" if ($related);
286	}
287	}
288	}
289
290	marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition});
291
292	marc_add($m_cache,'210','a',$ref->{originInfo}->{place});
293
294	my $publisher = $ref->{originInfo}->{publisher};
295	if ($publisher =~ m,^(.+?)\s/\s(.+)$,) {
296	marc_add($m_cache,'210','a', $2);
297	marc_add($m_cache,'210','c', $1);
298	} else {
299	marc_add($m_cache,'210','c', $publisher);
300	}
301
302	marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued});
303
304	marc_single($m_cache,'210');
305
306	marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal);
307
308	$nr++;
309	print "$nr " if ($nr % 100 == 0);
310
311	# dump record
312	my $bib_level = "m";
313	$bib_level = "s" if ($journal);
314	my $m=$marc->createrecord({leader=>"00000na".$bib_level." 2200000 a 4500"});
315
316	foreach my $fld (keys %{$m_cache->{array}}) {
317	foreach my $arr (@{$m_cache->{array}->{$fld}}) {
318	#print "array = ",Dumper($arr);
319	my ($i1,$i2);
320	# do we have indicators?
321	if ($fld =~ m/^(.+)\t(.)(.)$/) {
322	$fld = $1;
323	($i1,$i2) = ($2,$3);
324	}
325	$marc->addfield({record=>$m,
326	field=>$fld,
327	i1=>$i1,
328	i2=>$i2,
329	value=>$arr
330	});
331	}
332	}
333
334	foreach my $fld (keys %{$m_cache->{single}}) {
335	#print "single = ",Dumper($m_cache->{single}->{$fld});
336	my ($i1,$i2);
337	# do we have indicators?
338	if ($fld =~ m/^(.+)\t(.)(.)$/) {
339	$fld = $1;
340	($i1,$i2) = ($2,$3);
341	}
342	$marc->addfield({record=>$m,
343	field=>$fld,
344	i1=>$i1,
345	i2=>$i2,
346	value=>$m_cache->{single}->{$fld}
347	});
348	}
349
350	$m_cache = {};
351
352	$t->purge; # frees the memory
353	}
354