drustvene/tools/mods2unimarc.pl

#!/usr/bin/perl -w

=head1 NAME

mods2marc.pl - convert MODS XML back to MARC (ISO2709)

=head1 SYNOPSIS

mods2marc.pl export.marc mods.xml [mods2.xml ... ]

=head1 DESCRIPTION

This script will convert MODS format
L<http://www.loc.gov/standards/mods/>
back to MARC (ISO2709) format.

Since conversion back to MARC is not simple, lot of things are hard-coded
in this script.

This script B<is somewhat specific> to MODS export from
Faculty of Electrical Engineering and Computing
so you might want to edit it (among other thing, it includes a lot
of fields which are in Croatian).

Feel free to hack this script and convert it to your own needs.

=head1 CAVEAT

This script will parse imput XML twice: once with C<XML::Twig> and
then each entry with C<XML::Simple> to produce in-memory structure.
That's because I wanted to keep node selection logical (and perl-like).

If you don't like it, you can rewrite this script to use XPATH. I tried
and failed (it seems that MODS is too complicated for my limited knowledge
of XPATH).

=cut

use strict;
use XML::Twig;
use XML::Simple;
use MARC;
use Text::Iconv;

use Data::Dumper;

my $marc_file = shift @ARGV || die "$0: need MARC export file";
die "$0: need at least one MODS XML file" if (! @ARGV);

$|=1;
my $nr = 0;

my $marc = MARC->new;

my $ENCODING = 'ISO-8859-2';

my $twig=XML::Twig->new(
        twig_roots => { 'mods' => \&mods },
        output_encoding => 'UTF8',
);

my $utf2iso = Text::Iconv->new("UTF8", $ENCODING);

foreach my $xml_file (@ARGV) {
        print "$xml_file: ";
        $twig->parsefile($xml_file);
        $twig->purge;
        print "$nr\n";
}

print "Saving MARC file...\n";

$marc->output({file=>"> $marc_file",'format'=>"usmarc"});

sub mods {
        my( $t, $elt)= @_;

        my $xml=$elt->xml_string;
        my $ref = XMLin('<xml>'.$xml.'</xml>', 
                ForceArray => [
                        'name',
                        'classification',
                        'topic',
                        'relatedItem',
                        'partNumber',
                ],
                KeyAttr => {
                        'namePart' => 'type',
                        'identifier' => 'type',
                        'namePart' => 'type',
                        'role' => 'type',
                },
                GroupTags => {
                        'place' => 'placeTerm',
                        'physicalDescription' => 'extent',
                        'roleTerm' => 'content',
                },
                ContentKey => '-content',
        );

        my $m_cache;

        sub marc_add {
                my $m_cache = \shift || die "need m_cache";
                my $fld = shift || die "need field!";
                my $sf = shift;
                my $data = shift || return;

#print "add: $fld",($sf ? "^".$sf : ''),": $data\n";

                if ($sf) {
                        push @{$$m_cache->{tmp}->{$fld}}, $sf;
                }
                push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data);
        }

        sub marc_rep {
                my $m_cache = \shift || die "need m_cache";
                foreach my $fld (@_) {
#print "marc_rep: $fld\n";      
                        push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld});
                        delete $$m_cache->{tmp}->{$fld};
                }
        }

        sub marc_single {
                my $m_cache = \shift || die "need m_cache";
                foreach my $fld (@_) {
#print "marc_single: $fld\n";   

                        die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld});

                        $$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld});
                        delete $$m_cache->{tmp}->{$fld};
                }
        }

        sub marc_add_rep {
                my $m_cache = \shift || die "need m_cache";
                my $fld = shift || die "need field!";
                my $sf = shift;
                my $data = shift || return;

                marc_add($$m_cache,$fld,$sf,$data);
                marc_rep($$m_cache,$fld);
        }

        sub marc_add_single {
                my $m_cache = \shift || die "need m_cache";
                my $fld = shift || die "need field!";
                my $sf = shift;
                my $data = shift || return;

                marc_add($$m_cache,$fld,$sf,$data);
                marc_single($$m_cache,$fld);
        }

        my $journal = 0;
        # Journals start with c- in our MODS
        $journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/);

        foreach my $t (@{$ref->{subject}->{topic}}) {
                marc_add($m_cache,'610','a', $t);
                marc_rep($m_cache,'610');
        }

        my $fld_700 = '700';
        my $fld_710 = '710';

        foreach my $name (@{$ref->{name}}) {
                my $role = $name->{role}->{roleTerm}->{content};
                next if (! $role);
                if ($role eq "author") {
                        marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family});
                        marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given});
                        marc_add($m_cache,$fld_700,'4',$role);

                        marc_rep($m_cache,$fld_700);

                        # first author goes in 700, others in 701
                        $fld_700 = '701';
                } elsif ($role eq "editor" or $role eq "illustrator") {
                        marc_add($m_cache,'702','a',$name->{namePart}->{family});
                        marc_add($m_cache,'702','b',$name->{namePart}->{given});
                        marc_add($m_cache,'702','4',$role);
                        marc_rep($m_cache,'702');
                } elsif ($role eq "corporate") {
                        marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart});
                        $fld_710 = '711';
                } elsif ($role eq "conference") {
                        marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart});
                        $fld_710 = '711';
                } else {
                        die "FATAL: don't know how to map role '$role'" if ($role);
                }
        }

        my $note = $ref->{note};

        if ($note) {
                foreach my $n (split(/\s*;\s+/, $note)) {
                        if ($n =~ s/bibliogr:\s+//i) {
                                marc_add_rep($m_cache,'320','a',"Bibliografija: $n");
                        } elsif ($n =~ s/ilustr:\s+//i) {
                                marc_add($m_cache,'215','c', $n);
                        } else {
                                marc_add_rep($m_cache,'320','a',$n);
                        }
                }
        }


        my $type = $ref->{identifier}->{type};

        if ($type) {
                if ($type eq "isbn") {
                        marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content});
                } elsif ($type eq "issn") {
                        marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content});
                } elsif ($type eq "uri") {
                        marc_add_rep($m_cache,'856','u',$ref->{identifier}->{content});
                } else {
                        die "unknown identifier type $type";
                }
        }

        my $phy_desc = $ref->{physicalDescription};
        if ($phy_desc) {
                my $tmp;
                foreach my $t (split(/\s*;\s+/, $phy_desc)) {
                        if ($t =~ m/([^:]+):\s+(.+)$/) {
                                $tmp->{$1} = $2;
                        } else {
                                print STDERR "can't parse '$t' in ",Dumper($phy_desc);
                        }
                }
                my $data = $tmp->{pagin};
                $data .= ", " if ($data);
                if ($tmp->{str}) {
                        $data .= $tmp->{str}." str";
                }
                marc_add($m_cache,'215','a', $data) if ($data);
                marc_add($m_cache,'215','d', $tmp->{visina});
        }
        marc_rep($m_cache,'215');

        marc_add_single($m_cache,'001',undef,$ref->{recordInfo}->{recordIdentifier});

        marc_add($m_cache,'200','a',$ref->{titleInfo}->{title});
        marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle});
        marc_single($m_cache,'200');

        foreach my $c (@{$ref->{classification}}) {
                if ($c->{'authority'} eq "udc") {
                        marc_add_rep($m_cache,'675','a', $c->{'content'});
                }
        }

        foreach my $ri (@{$ref->{relatedItem}}) {
                my $related = $ri->{type};
                if ($related) {
                        if ($related eq "series") {
                                marc_add_rep($m_cache,'225','a',$ri->{titleInfo}->{title});
                                foreach my $pn (@{$ri->{titleInfo}->{partNumber}}) {
                                        if ($journal) {
                                                marc_add_rep($m_cache,'999','a',$pn);
                                        } else {
                                                marc_add_rep($m_cache,'225','v',$pn);
                                        }
                                }
                        } elsif ($related eq "preceding") {
                                marc_add($m_cache,'520','a',$ri->{titleInfo}->{title});
                                if ($ri->{identifier}) {
                                        if ($ri->{identifier}->{type} eq "issn") {
                                                marc_add($m_cache,'520','x',$ri->{identifier}->{content});
                                        } else {
                                                die "can't store identifier type $type";
                                        }
                                }
                                marc_rep($m_cache,'520');
                        } else {
                                die "can't parse related item type $related" if ($related);
                        }
                }
        }

        marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition});

        marc_add($m_cache,'210','a',$ref->{originInfo}->{place});

        my $publisher = $ref->{originInfo}->{publisher};
        if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) {
                marc_add($m_cache,'210','a', $2);
                marc_add($m_cache,'210','c', $1);
        } else {
                marc_add($m_cache,'210','c', $publisher);
        }

        marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued});

        marc_single($m_cache,'210');

        marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal);

        $nr++;
        print "$nr " if ($nr % 100 == 0);

        # dump record
        my $bib_level = "m";
        $bib_level = "s" if ($journal);
        my $m=$marc->createrecord({leader=>"00000na".$bib_level."  2200000 a 4500"});

        foreach my $fld (keys %{$m_cache->{array}}) {
                foreach my $arr (@{$m_cache->{array}->{$fld}}) {
#print "array = ",Dumper($arr);
                        my ($i1,$i2);
                        # do we have indicators?
                        if ($fld =~ m/^(.+)\t(.)(.)$/) {
                                $fld = $1;
                                ($i1,$i2) = ($2,$3);
                        }
                        $marc->addfield({record=>$m,
                                field=>$fld,
                                i1=>$i1,
                                i2=>$i2,
                                value=>$arr
                        });
                }
        }

        foreach my $fld (keys %{$m_cache->{single}}) {
#print "single = ",Dumper($m_cache->{single}->{$fld});
                my ($i1,$i2);
                # do we have indicators?
                if ($fld =~ m/^(.+)\t(.)(.)$/) {
                        $fld = $1;
                        ($i1,$i2) = ($2,$3);
                }
                $marc->addfield({record=>$m,
                        field=>$fld,
                        i1=>$i1,
                        i2=>$i2,
                        value=>$m_cache->{single}->{$fld}
                });
        }

        $m_cache = {};

        $t->purge;           # frees the memory
}

1	#!/usr/bin/perl -w
2
3	=head1 NAME
4
5	mods2marc.pl - convert MODS XML back to MARC (ISO2709)
6
7	=head1 SYNOPSIS
8
9	mods2marc.pl export.marc mods.xml [mods2.xml ... ]
10
11	=head1 DESCRIPTION
12
13	This script will convert MODS format
14	L<http://www.loc.gov/standards/mods/>
15	back to MARC (ISO2709) format.
16
17	Since conversion back to MARC is not simple, lot of things are hard-coded
18	in this script.
19
20	This script B<is somewhat specific> to MODS export from
21	Faculty of Electrical Engineering and Computing
22	so you might want to edit it (among other thing, it includes a lot
23	of fields which are in Croatian).
24
25	Feel free to hack this script and convert it to your own needs.
26
27	=head1 CAVEAT
28
29	This script will parse imput XML twice: once with C<XML::Twig> and
30	then each entry with C<XML::Simple> to produce in-memory structure.
31	That's because I wanted to keep node selection logical (and perl-like).
32
33	If you don't like it, you can rewrite this script to use XPATH. I tried
34	and failed (it seems that MODS is too complicated for my limited knowledge
35	of XPATH).
36
37	=cut
38
39	use strict;
40	use XML::Twig;
41	use XML::Simple;
42	use MARC;
43	use Text::Iconv;
44
45	use Data::Dumper;
46
47	my $marc_file = shift @ARGV \|\| die "$0: need MARC export file";
48	die "$0: need at least one MODS XML file" if (! @ARGV);
49
50	$\|=1;
51	my $nr = 0;
52
53	my $marc = MARC->new;
54
55	my $ENCODING = 'ISO-8859-2';
56
57	my $twig=XML::Twig->new(
58	twig_roots => { 'mods' => \&mods },
59	output_encoding => 'UTF8',
60	);
61
62	my $utf2iso = Text::Iconv->new("UTF8", $ENCODING);
63
64	foreach my $xml_file (@ARGV) {
65	print "$xml_file: ";
66	$twig->parsefile($xml_file);
67	$twig->purge;
68	print "$nr\n";
69	}
70
71	print "Saving MARC file...\n";
72
73	$marc->output({file=>"> $marc_file",'format'=>"usmarc"});
74
75	sub mods {
76	my( $t, $elt)= @_;
77
78	my $xml=$elt->xml_string;
79	my $ref = XMLin('<xml>'.$xml.'</xml>',
80	ForceArray => [
81	'name',
82	'classification',
83	'topic',
84	'relatedItem',
85	'partNumber',
86	],
87	KeyAttr => {
88	'namePart' => 'type',
89	'identifier' => 'type',
90	'namePart' => 'type',
91	'role' => 'type',
92	},
93	GroupTags => {
94	'place' => 'placeTerm',
95	'physicalDescription' => 'extent',
96	'roleTerm' => 'content',
97	},
98	ContentKey => '-content',
99	);
100
101	my $m_cache;
102
103	sub marc_add {
104	my $m_cache = \shift \|\| die "need m_cache";
105	my $fld = shift \|\| die "need field!";
106	my $sf = shift;
107	my $data = shift \|\| return;
108
109	#print "add: $fld",($sf ? "^".$sf : ''),": $data\n";
110
111	if ($sf) {
112	push @{$$m_cache->{tmp}->{$fld}}, $sf;
113	}
114	push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data);
115	}
116
117	sub marc_rep {
118	my $m_cache = \shift \|\| die "need m_cache";
119	foreach my $fld (@_) {
120	#print "marc_rep: $fld\n";
121	push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld});
122	delete $$m_cache->{tmp}->{$fld};
123	}
124	}
125
126	sub marc_single {
127	my $m_cache = \shift \|\| die "need m_cache";
128	foreach my $fld (@_) {
129	#print "marc_single: $fld\n";
130
131	die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld});
132
133	$$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld});
134	delete $$m_cache->{tmp}->{$fld};
135	}
136	}
137
138	sub marc_add_rep {
139	my $m_cache = \shift \|\| die "need m_cache";
140	my $fld = shift \|\| die "need field!";
141	my $sf = shift;
142	my $data = shift \|\| return;
143
144	marc_add($$m_cache,$fld,$sf,$data);
145	marc_rep($$m_cache,$fld);
146	}
147
148	sub marc_add_single {
149	my $m_cache = \shift \|\| die "need m_cache";
150	my $fld = shift \|\| die "need field!";
151	my $sf = shift;
152	my $data = shift \|\| return;
153
154	marc_add($$m_cache,$fld,$sf,$data);
155	marc_single($$m_cache,$fld);
156	}
157
158	my $journal = 0;
159	# Journals start with c- in our MODS
160	$journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/);
161
162	foreach my $t (@{$ref->{subject}->{topic}}) {
163	marc_add($m_cache,'610','a', $t);
164	marc_rep($m_cache,'610');
165	}
166
167	my $fld_700 = '700';
168	my $fld_710 = '710';
169
170	foreach my $name (@{$ref->{name}}) {
171	my $role = $name->{role}->{roleTerm}->{content};
172	next if (! $role);
173	if ($role eq "author") {
174	marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family});
175	marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given});
176	marc_add($m_cache,$fld_700,'4',$role);
177
178	marc_rep($m_cache,$fld_700);
179
180	# first author goes in 700, others in 701
181	$fld_700 = '701';
182	} elsif ($role eq "editor" or $role eq "illustrator") {
183	marc_add($m_cache,'702','a',$name->{namePart}->{family});
184	marc_add($m_cache,'702','b',$name->{namePart}->{given});
185	marc_add($m_cache,'702','4',$role);
186	marc_rep($m_cache,'702');
187	} elsif ($role eq "corporate") {
188	marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart});
189	$fld_710 = '711';
190	} elsif ($role eq "conference") {
191	marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart});
192	$fld_710 = '711';
193	} else {
194	die "FATAL: don't know how to map role '$role'" if ($role);
195	}
196	}
197
198	my $note = $ref->{note};
199
200	if ($note) {
201	foreach my $n (split(/\s*;\s+/, $note)) {
202	if ($n =~ s/bibliogr:\s+//i) {
203	marc_add_rep($m_cache,'320','a',"Bibliografija: $n");
204	} elsif ($n =~ s/ilustr:\s+//i) {
205	marc_add($m_cache,'215','c', $n);
206	} else {
207	marc_add_rep($m_cache,'320','a',$n);
208	}
209	}
210	}
211
212
213	my $type = $ref->{identifier}->{type};
214
215	if ($type) {
216	if ($type eq "isbn") {
217	marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content});
218	} elsif ($type eq "issn") {
219	marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content});
220	} elsif ($type eq "uri") {
221	marc_add_rep($m_cache,'856','u',$ref->{identifier}->{content});
222	} else {
223	die "unknown identifier type $type";
224	}
225	}
226
227	my $phy_desc = $ref->{physicalDescription};
228	if ($phy_desc) {
229	my $tmp;
230	foreach my $t (split(/\s*;\s+/, $phy_desc)) {
231	if ($t =~ m/([^:]+):\s+(.+)$/) {
232	$tmp->{$1} = $2;
233	} else {
234	print STDERR "can't parse '$t' in ",Dumper($phy_desc);
235	}
236	}
237	my $data = $tmp->{pagin};
238	$data .= ", " if ($data);
239	if ($tmp->{str}) {
240	$data .= $tmp->{str}." str";
241	}
242	marc_add($m_cache,'215','a', $data) if ($data);
243	marc_add($m_cache,'215','d', $tmp->{visina});
244	}
245	marc_rep($m_cache,'215');
246
247	marc_add_single($m_cache,'001',undef,$ref->{recordInfo}->{recordIdentifier});
248
249	marc_add($m_cache,'200','a',$ref->{titleInfo}->{title});
250	marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle});
251	marc_single($m_cache,'200');
252
253	foreach my $c (@{$ref->{classification}}) {
254	if ($c->{'authority'} eq "udc") {
255	marc_add_rep($m_cache,'675','a', $c->{'content'});
256	}
257	}
258
259	foreach my $ri (@{$ref->{relatedItem}}) {
260	my $related = $ri->{type};
261	if ($related) {
262	if ($related eq "series") {
263	marc_add_rep($m_cache,'225','a',$ri->{titleInfo}->{title});
264	foreach my $pn (@{$ri->{titleInfo}->{partNumber}}) {
265	if ($journal) {
266	marc_add_rep($m_cache,'999','a',$pn);
267	} else {
268	marc_add_rep($m_cache,'225','v',$pn);
269	}
270	}
271	} elsif ($related eq "preceding") {
272	marc_add($m_cache,'520','a',$ri->{titleInfo}->{title});
273	if ($ri->{identifier}) {
274	if ($ri->{identifier}->{type} eq "issn") {
275	marc_add($m_cache,'520','x',$ri->{identifier}->{content});
276	} else {
277	die "can't store identifier type $type";
278	}
279	}
280	marc_rep($m_cache,'520');
281	} else {
282	die "can't parse related item type $related" if ($related);
283	}
284	}
285	}
286
287	marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition});
288
289	marc_add($m_cache,'210','a',$ref->{originInfo}->{place});
290
291	my $publisher = $ref->{originInfo}->{publisher};
292	if ($publisher =~ m,^(.+?)\s/\s(.+)$,) {
293	marc_add($m_cache,'210','a', $2);
294	marc_add($m_cache,'210','c', $1);
295	} else {
296	marc_add($m_cache,'210','c', $publisher);
297	}
298
299	marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued});
300
301	marc_single($m_cache,'210');
302
303	marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal);
304
305	$nr++;
306	print "$nr " if ($nr % 100 == 0);
307
308	# dump record
309	my $bib_level = "m";
310	$bib_level = "s" if ($journal);
311	my $m=$marc->createrecord({leader=>"00000na".$bib_level." 2200000 a 4500"});
312
313	foreach my $fld (keys %{$m_cache->{array}}) {
314	foreach my $arr (@{$m_cache->{array}->{$fld}}) {
315	#print "array = ",Dumper($arr);
316	my ($i1,$i2);
317	# do we have indicators?
318	if ($fld =~ m/^(.+)\t(.)(.)$/) {
319	$fld = $1;
320	($i1,$i2) = ($2,$3);
321	}
322	$marc->addfield({record=>$m,
323	field=>$fld,
324	i1=>$i1,
325	i2=>$i2,
326	value=>$arr
327	});
328	}
329	}
330
331	foreach my $fld (keys %{$m_cache->{single}}) {
332	#print "single = ",Dumper($m_cache->{single}->{$fld});
333	my ($i1,$i2);
334	# do we have indicators?
335	if ($fld =~ m/^(.+)\t(.)(.)$/) {
336	$fld = $1;
337	($i1,$i2) = ($2,$3);
338	}
339	$marc->addfield({record=>$m,
340	field=>$fld,
341	i1=>$i1,
342	i2=>$i2,
343	value=>$m_cache->{single}->{$fld}
344	});
345	}
346
347	$m_cache = {};
348
349	$t->purge; # frees the memory
350	}
351