/[webpac]/trunk/tools/mods2unimarc.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/tools/mods2unimarc.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 328 - (show annotations)
Sat May 15 19:52:01 2004 UTC (19 years, 10 months ago) by dpavlin
File MIME type: text/plain
File size: 8341 byte(s)
command line parametars are now working (xml files are at end, so that shell
glob will work), misc improvements

1 #!/usr/bin/perl -w
2
3 =head1 NAME
4
5 mods2marc.pl - convert MODS XML back to MARC (ISO2709)
6
7 =head1 SYNOPSIS
8
9 mods2marc.pl export.marc mods.xml [mods2.xml ... ]
10
11 =head1 DESCRIPTION
12
13 This script will convert MODS format
14 L<http://www.loc.gov/standards/mods/>
15 back to MARC (ISO2709) format.
16
17 Since conversion back to MARC is not simple, lot of things are hard-coded
18 in this script.
19
20 This script B<is somewhat specific> to MODS export from
21 Faculty of Electrical Engineering and Computing
22 so you might want to edit it (among other thing, it includes a lot
23 of fields which are in Croatian).
24
25 Feel free to hack this script and convert it to your own needs.
26
27 =head1 CAVEAT
28
29 This script will parse imput XML twice: once with C<XML::Twig> and
30 then each entry with C<XML::Simple> to produce in-memory structure.
31 That's because I wanted to keep node selection logical (and perl-like).
32
33 If you don't like it, you can rewrite this script to use XPATH. I tried
34 and failed (it seems that MODS is too complicated for my limited knowledge
35 of XPATH).
36
37 =cut
38
39 use strict;
40 use XML::Twig;
41 use XML::Simple;
42 use MARC;
43 use Text::Iconv;
44
45 use Data::Dumper;
46
47 my $marc_file = shift @ARGV || die "$0: need MARC export file";
48 die "$0: need at least one MODS XML file" if (! @ARGV);
49
50 $|=1;
51 my $nr = 0;
52
53 my $marc = MARC->new;
54
55 my $ENCODING = 'ISO-8859-2';
56
57 my $twig=XML::Twig->new(
58 twig_roots => { 'mods' => \&mods },
59 output_encoding => 'UTF8',
60 );
61
62 my $utf2iso = Text::Iconv->new("UTF8", $ENCODING);
63
64 foreach my $xml_file (@ARGV) {
65 print "$xml_file: ";
66 $twig->parsefile($xml_file);
67 $twig->purge;
68 print "$nr\n";
69 }
70
71 print "Saving MARC file...\n";
72
73 $marc->output({file=>"> $marc_file",'format'=>"usmarc"});
74
75 sub mods {
76 my( $t, $elt)= @_;
77
78 my $xml=$elt->xml_string;
79 my $ref = XMLin('<xml>'.$xml.'</xml>',
80 ForceArray => [
81 'name',
82 'classification',
83 'topic',
84 'relatedItem',
85 'partNumber',
86 ],
87 KeyAttr => {
88 'namePart' => 'type',
89 'identifier' => 'type',
90 'namePart' => 'type',
91 'role' => 'type',
92 },
93 GroupTags => {
94 'place' => 'placeTerm',
95 'physicalDescription' => 'extent',
96 'roleTerm' => 'content',
97 },
98 ContentKey => '-content',
99 );
100
101 my $m_cache;
102
103 sub marc_add {
104 my $m_cache = \shift || die "need m_cache";
105 my $fld = shift || die "need field!";
106 my $sf = shift;
107 my $data = shift || return;
108
109 #print "add: $fld",($sf ? "^".$sf : ''),": $data\n";
110
111 if ($sf) {
112 push @{$$m_cache->{tmp}->{$fld}}, $sf;
113 }
114 push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data);
115 }
116
117 sub marc_rep {
118 my $m_cache = \shift || die "need m_cache";
119 foreach my $fld (@_) {
120 #print "marc_rep: $fld\n";
121 push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld});
122 delete $$m_cache->{tmp}->{$fld};
123 }
124 }
125
126 sub marc_single {
127 my $m_cache = \shift || die "need m_cache";
128 foreach my $fld (@_) {
129 #print "marc_single: $fld\n";
130
131 die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld});
132
133 $$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld});
134 delete $$m_cache->{tmp}->{$fld};
135 }
136 }
137
138 sub marc_add_rep {
139 my $m_cache = \shift || die "need m_cache";
140 my $fld = shift || die "need field!";
141 my $sf = shift;
142 my $data = shift || return;
143
144 marc_add($$m_cache,$fld,$sf,$data);
145 marc_rep($$m_cache,$fld);
146 }
147
148 sub marc_add_single {
149 my $m_cache = \shift || die "need m_cache";
150 my $fld = shift || die "need field!";
151 my $sf = shift;
152 my $data = shift || return;
153
154 marc_add($$m_cache,$fld,$sf,$data);
155 marc_single($$m_cache,$fld);
156 }
157
158 my $journal = 0;
159 # Journals start with c- in our MODS
160 $journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/);
161
162 foreach my $t (@{$ref->{subject}->{topic}}) {
163 marc_add($m_cache,'610','a', $t);
164 marc_rep($m_cache,'610');
165 }
166
167 my $fld_700 = '700';
168 my $fld_710 = '710';
169
170 foreach my $name (@{$ref->{name}}) {
171 my $role = $name->{role}->{roleTerm}->{content};
172 next if (! $role);
173 if ($role eq "author") {
174 marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family});
175 marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given});
176 marc_add($m_cache,$fld_700,'4',$role);
177
178 marc_rep($m_cache,$fld_700);
179
180 # first author goes in 700, others in 701
181 $fld_700 = '701';
182 } elsif ($role eq "editor" or $role eq "illustrator") {
183 marc_add($m_cache,'702','a',$name->{namePart}->{family});
184 marc_add($m_cache,'702','b',$name->{namePart}->{given});
185 marc_add($m_cache,'702','4',$role);
186 marc_rep($m_cache,'702');
187 } elsif ($role eq "corporate") {
188 marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart});
189 $fld_710 = '711';
190 } elsif ($role eq "conference") {
191 marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart});
192 $fld_710 = '711';
193 } else {
194 die "FATAL: don't know how to map role '$role'" if ($role);
195 }
196 }
197
198 my $note = $ref->{note};
199
200 if ($note) {
201 foreach my $n (split(/\s*;\s+/, $note)) {
202 if ($n =~ s/bibliogr:\s+//i) {
203 marc_add_rep($m_cache,'320','a',"Bibliografija: $n");
204 } elsif ($n =~ s/ilustr:\s+//i) {
205 marc_add($m_cache,'215','c', $n);
206 } else {
207 marc_add_rep($m_cache,'320','a',$n);
208 }
209 }
210 }
211
212
213 my $type = $ref->{identifier}->{type};
214
215 if ($type) {
216 if ($type eq "isbn") {
217 marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content});
218 } elsif ($type eq "issn") {
219 marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content});
220 } elsif ($type eq "uri") {
221 marc_add_rep($m_cache,'856','u',$ref->{identifier}->{content});
222 } else {
223 die "unknown identifier type $type";
224 }
225 }
226
227 my $phy_desc = $ref->{physicalDescription};
228 if ($phy_desc) {
229 my $tmp;
230 foreach my $t (split(/\s*;\s+/, $phy_desc)) {
231 if ($t =~ m/([^:]+):\s+(.+)$/) {
232 $tmp->{$1} = $2;
233 } else {
234 die "can't parse $t";
235 }
236 }
237 my $data = $tmp->{pagin};
238 $data .= ", " if ($data);
239 if ($tmp->{str}) {
240 $data .= $tmp->{str}." str";
241 }
242 marc_add($m_cache,'215','a', $data) if ($data);
243 marc_add($m_cache,'215','d', $tmp->{visina});
244 }
245 marc_rep($m_cache,'215');
246
247 marc_add_single($m_cache,'001',undef,$ref->{recordInfo}->{recordIdentifier});
248
249 marc_add($m_cache,'200','a',$ref->{titleInfo}->{title});
250 marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle});
251 marc_single($m_cache,'200');
252
253 foreach my $c (@{$ref->{classification}}) {
254 if ($c->{'authority'} eq "udc") {
255 marc_add_rep($m_cache,'675','a', $c->{'content'});
256 }
257 }
258
259 foreach my $ri (@{$ref->{relatedItem}}) {
260 my $related = $ri->{type};
261 if ($related) {
262 if ($related eq "series") {
263 marc_add_rep($m_cache,'225','a',$ri->{titleInfo}->{title});
264 foreach my $pn (@{$ri->{titleInfo}->{partNumber}}) {
265 marc_add_rep($m_cache,'999','a',$pn);
266 }
267 } elsif ($related eq "preceding") {
268 marc_add_rep($m_cache,'430','a',$ri->{titleInfo}->{title});
269 } else {
270 die "can't parse related item type $related" if ($related);
271 }
272 }
273 }
274
275 marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition});
276
277 my $publisher = $ref->{originInfo}->{publisher};
278 if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) {
279 marc_add($m_cache,'210','a', $2);
280 marc_add($m_cache,'210','c', $1);
281 } else {
282 marc_add($m_cache,'210','c', $publisher);
283 }
284
285 marc_add($m_cache,'210','a',$ref->{originInfo}->{place});
286 marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued});
287
288 marc_single($m_cache,'210');
289
290 marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal);
291
292 $nr++;
293 print "$nr " if ($nr % 100 == 0);
294
295 # dump record
296 my $bib_level = "m";
297 $bib_level = "s" if ($journal);
298 my $m=$marc->createrecord({leader=>"00000na".$bib_level." 2200000 a 4500"});
299
300 foreach my $fld (keys %{$m_cache->{array}}) {
301 foreach my $arr (@{$m_cache->{array}->{$fld}}) {
302 #print "array = ",Dumper($arr);
303 my ($i1,$i2);
304 # do we have indicators?
305 if ($fld =~ m/^(.+)\t(.)(.)$/) {
306 $fld = $1;
307 ($i1,$i2) = ($2,$3);
308 }
309 $marc->addfield({record=>$m,
310 field=>$fld,
311 i1=>$i1,
312 i2=>$i2,
313 value=>$arr
314 });
315 }
316 }
317
318 foreach my $fld (keys %{$m_cache->{single}}) {
319 #print "single = ",Dumper($m_cache->{single}->{$fld});
320 my ($i1,$i2);
321 # do we have indicators?
322 if ($fld =~ m/^(.+)\t(.)(.)$/) {
323 $fld = $1;
324 ($i1,$i2) = ($2,$3);
325 }
326 $marc->addfield({record=>$m,
327 field=>$fld,
328 i1=>$i1,
329 i2=>$i2,
330 value=>$m_cache->{single}->{$fld}
331 });
332 }
333
334 $m_cache = {};
335
336 $t->purge; # frees the memory
337 }
338

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26