1 |
#!/usr/bin/perl -w |
2 |
|
3 |
=head1 NAME |
4 |
|
5 |
mods2marc.pl - convert MODS XML back to MARC (ISO2709) |
6 |
|
7 |
=head1 SYNOPSIS |
8 |
|
9 |
mods2marc.pl mods.xml export.marc |
10 |
|
11 |
=head1 DESCRIPTION |
12 |
|
13 |
This script will convert MODS format |
14 |
L<http://www.loc.gov/standards/mods/> |
15 |
back to MARC (ISO2709) format. |
16 |
|
17 |
Since conversion back to MARC is not simple, lot of things are hard-coded |
18 |
in this script. |
19 |
|
20 |
This script B<is somewhat specific> to MODS export from |
21 |
Faculty of Electrical Engineering and Computing |
22 |
so you might want to edit it (among other thing, it includes a lot |
23 |
of fields which are in Croatian). |
24 |
|
25 |
Feel free to hack this script and convert it to your own needs. |
26 |
|
27 |
=head1 WARNING |
28 |
|
29 |
This script is in state of flux. |
30 |
|
31 |
=cut |
32 |
|
33 |
use strict; |
34 |
use XML::Twig; |
35 |
use XML::Simple; |
36 |
use MARC; |
37 |
use Text::Iconv; |
38 |
|
39 |
use Data::Dumper; |
40 |
|
41 |
my $xml_file = "/data/tehnika/fer/all.xml"; |
42 |
$xml_file = "/data/tehnika/fer/modsFER_1.xml"; |
43 |
my $marc_file = "fer.marc"; |
44 |
|
45 |
$|=1; |
46 |
my $nr = 0; |
47 |
|
48 |
my $marc = MARC->new; |
49 |
|
50 |
my $twig=XML::Twig->new( |
51 |
twig_roots => { 'mods' => \&item }, |
52 |
output_encoding => 'iso-8859-2', |
53 |
); |
54 |
|
55 |
my $utf2iso = Text::Iconv->new("UTF-8", "ISO-8859-2"); |
56 |
|
57 |
$twig->parsefile($xml_file); |
58 |
$twig->purge; |
59 |
|
60 |
$marc->output({file=>"> $marc_file",'format'=>"usmarc"}); |
61 |
|
62 |
sub item { |
63 |
my( $t, $elt)= @_; |
64 |
|
65 |
my $xml=$elt->xml_string; |
66 |
my $ref = XMLin("<xml>".$xml."</xml>", |
67 |
ForceArray => [ |
68 |
'name', |
69 |
'classification', |
70 |
'topic', |
71 |
'udc', |
72 |
], |
73 |
KeyAttr => { |
74 |
'namePart' => 'type', |
75 |
'identifier' => 'type', |
76 |
'classification' => 'authority', |
77 |
'namePart' => 'type', |
78 |
'role' => 'type', |
79 |
}, |
80 |
GroupTags => { |
81 |
'place' => 'placeTerm', |
82 |
'physicalDescription' => 'extent', |
83 |
'roleTerm' => 'content', |
84 |
}, |
85 |
ContentKey => '-content', |
86 |
); |
87 |
|
88 |
my $m_cache; |
89 |
|
90 |
sub marc_add { |
91 |
my $m_cache = \shift; |
92 |
my $fld = shift || die "need field!"; |
93 |
my $sf = shift || ''; |
94 |
|
95 |
return if (! @_); |
96 |
|
97 |
my @a; |
98 |
foreach (@_) { |
99 |
next if (! $_); |
100 |
push @a,$sf if ($sf); |
101 |
# push @a,$utf2iso->convert($_) || $_; |
102 |
push @a,$_; |
103 |
} |
104 |
|
105 |
return if (! @a); |
106 |
|
107 |
# print "storing $fld: ",join("|",@a),"\n"; |
108 |
|
109 |
push @{$$m_cache->{$fld}}, @a; |
110 |
|
111 |
} |
112 |
|
113 |
my $journal = 0; |
114 |
$journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/); |
115 |
|
116 |
marc_add($m_cache,'610','a',@{$ref->{subject}->{topic}}); |
117 |
|
118 |
my $fld = '700'; |
119 |
|
120 |
foreach my $name (@{$ref->{name}}) { |
121 |
my $role = $name->{role}->{roleTerm}->{content}; |
122 |
next if (! $role); |
123 |
if ($role eq "author") { |
124 |
marc_add($m_cache,$fld,'a',$name->{namePart}->{family}); |
125 |
marc_add($m_cache,$fld,'b',$name->{namePart}->{given}); |
126 |
marc_add($m_cache,$fld,'4',$role); |
127 |
|
128 |
# first author goes in 700, others in 701 |
129 |
$fld = '701'; |
130 |
} elsif ($role eq "editor" or $role eq "illustrator") { |
131 |
marc_add($m_cache,'702','a',$name->{namePart}->{family}); |
132 |
marc_add($m_cache,'702','b',$name->{namePart}->{given}); |
133 |
marc_add($m_cache,'702','4',$role); |
134 |
} else { |
135 |
die "FATAL: don't know how to map role '$role'" if ($role); |
136 |
} |
137 |
} |
138 |
|
139 |
my $note = $ref->{note}; |
140 |
|
141 |
if ($note) { |
142 |
foreach my $n (split(/\s*;\s+/, $note)) { |
143 |
if ($n =~ s/bibliogr:\s+//i) { |
144 |
marc_add($m_cache,'320','a',"Bibliografija: $n"); |
145 |
} elsif ($n =~ s/ilustr:\s+//i) { |
146 |
marc_add($m_cache,'215','c', $n); |
147 |
} else { |
148 |
marc_add($m_cache,'320','a',$n); |
149 |
} |
150 |
} |
151 |
} |
152 |
|
153 |
|
154 |
my $type = $ref->{identifier}->{type}; |
155 |
|
156 |
if ($type) { |
157 |
if ($type eq "isbn") { |
158 |
marc_add($m_cache,'010','a',$ref->{identifier}->{content}); |
159 |
} elsif ($type eq "issn") { |
160 |
marc_add($m_cache,'011','a',$ref->{identifier}->{content}); |
161 |
} else { |
162 |
die "unknown identifier type $type"; |
163 |
} |
164 |
} |
165 |
|
166 |
my $phy_desc = $ref->{physicalDescription}; |
167 |
if ($phy_desc) { |
168 |
my $tmp; |
169 |
foreach my $t (split(/\s*;\s+/, $phy_desc)) { |
170 |
if ($t =~ m/([^:]+):\s+(.+)$/) { |
171 |
$tmp->{$1} = $2; |
172 |
} else { |
173 |
die "can't parse $t"; |
174 |
} |
175 |
} |
176 |
my $data = $tmp->{pagin}; |
177 |
$data .= ", " if ($data); |
178 |
if ($tmp->{str}) { |
179 |
$data .= $tmp->{str}." str"; |
180 |
} |
181 |
marc_add($m_cache,'215','a', $data) if ($data); |
182 |
marc_add($m_cache,'215','d', $tmp->{visina}); |
183 |
} |
184 |
|
185 |
marc_add($m_cache,'001',undef,$ref->{recordInfo}->{recordIdentifier}); |
186 |
|
187 |
marc_add($m_cache,'200','a',$ref->{titleInfo}->{title}); |
188 |
marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle}); |
189 |
|
190 |
marc_add($m_cache,'675','a',$ref->{classification}->{udc}); |
191 |
|
192 |
my $related = $ref->{relatedItem}->{type}; |
193 |
if ($related) { |
194 |
if ($related eq "series") { |
195 |
marc_add($m_cache,'225','a',$ref->{relatedItem}->{titleInfo}->{title}); |
196 |
marc_add($m_cache,'999','a',$ref->{relatedItem}->{titleInfo}->{partNumber}); |
197 |
} elsif ($related eq "preceding") { |
198 |
marc_add($m_cache,'430','a',$ref->{relatedItem}->{titleInfo}->{title}); |
199 |
} else { |
200 |
die "can't parse related item type $related" if ($related); |
201 |
} |
202 |
} |
203 |
|
204 |
marc_add($m_cache,'205','a',$ref->{originInfo}->{edition}); |
205 |
|
206 |
my $publisher = $ref->{originInfo}->{publisher}; |
207 |
if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) { |
208 |
marc_add($m_cache,'210','a', $2); |
209 |
marc_add($m_cache,'210','c', $1); |
210 |
} else { |
211 |
marc_add($m_cache,'210','c', $publisher); |
212 |
} |
213 |
|
214 |
marc_add($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal); |
215 |
|
216 |
marc_add($m_cache,'210','a',$ref->{originInfo}->{place}); |
217 |
|
218 |
marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued}); |
219 |
|
220 |
$nr++; |
221 |
print "$nr " if ($nr % 100 == 0); |
222 |
|
223 |
# dump record |
224 |
my $m=$marc->createrecord(); |
225 |
foreach my $fld (keys %{$m_cache}) { |
226 |
# print "$fld: ",join(" * ",@{$m_cache->{$fld}}),"\n"; |
227 |
$marc->addfield({record=>$m, |
228 |
field=>$fld, |
229 |
value=>\@{$m_cache->{$fld}} |
230 |
}); |
231 |
} |
232 |
|
233 |
$t->purge; # frees the memory |
234 |
} |
235 |
|