/[webpac]/trunk/tools/mods2unimarc.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/tools/mods2unimarc.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 327 - (show annotations)
Sat May 15 18:54:41 2004 UTC (19 years, 9 months ago) by dpavlin
File MIME type: text/plain
File size: 8121 byte(s)
first really working version: support single and repeating fields correctly,
output encodings implemented, support for indicators

1 #!/usr/bin/perl -w
2
3 =head1 NAME
4
5 mods2marc.pl - convert MODS XML back to MARC (ISO2709)
6
7 =head1 SYNOPSIS
8
9 mods2marc.pl mods.xml export.marc
10
11 =head1 DESCRIPTION
12
13 This script will convert MODS format
14 L<http://www.loc.gov/standards/mods/>
15 back to MARC (ISO2709) format.
16
17 Since conversion back to MARC is not simple, lot of things are hard-coded
18 in this script.
19
20 This script B<is somewhat specific> to MODS export from
21 Faculty of Electrical Engineering and Computing
22 so you might want to edit it (among other thing, it includes a lot
23 of fields which are in Croatian).
24
25 Feel free to hack this script and convert it to your own needs.
26
27 =head1 CAVEAT
28
29 This script will parse imput XML twice: once with C<XML::Twig> and
30 then each entry with C<XML::Simple> to produce in-memory structure.
31 That's because I wanted to keep node selection logical (and perl-like).
32
33 If you don't like it, you can rewrite this script to use XPATH. I tried
34 and failed (it seems that MODS is too complicated for my limited knowledge
35 of XPATH).
36
37 =cut
38
39 use strict;
40 use XML::Twig;
41 use XML::Simple;
42 use MARC;
43 use Text::Iconv;
44
45 use Data::Dumper;
46
47 my $xml_file = "/data/tehnika/fer/all.xml";
48 $xml_file = "/data/tehnika/fer/modsFER_1.xml";
49 $xml_file = "/data/tehnika/fer/mods-small.xml";
50 my $marc_file = "fer.marc";
51
52 $|=1;
53 my $nr = 0;
54
55 my $marc = MARC->new;
56
57 my $ENCODING = 'ISO-8859-2';
58
59 my $twig=XML::Twig->new(
60 twig_roots => { 'mods' => \&mods },
61 output_encoding => 'UTF8',
62 );
63
64 my $utf2iso = Text::Iconv->new("UTF8", $ENCODING);
65
66 print "$xml_file: ";
67 $twig->parsefile($xml_file);
68 $twig->purge;
69 print "$nr\nSaving MARC file...\n";
70
71 $marc->output({file=>"> $marc_file",'format'=>"usmarc"});
72
73 sub mods {
74 my( $t, $elt)= @_;
75
76 my $xml=$elt->xml_string;
77 my $ref = XMLin('<xml>'.$xml.'</xml>',
78 ForceArray => [
79 'name',
80 'classification',
81 'topic',
82 ],
83 KeyAttr => {
84 'namePart' => 'type',
85 'identifier' => 'type',
86 'namePart' => 'type',
87 'role' => 'type',
88 },
89 GroupTags => {
90 'place' => 'placeTerm',
91 'physicalDescription' => 'extent',
92 'roleTerm' => 'content',
93 },
94 ContentKey => '-content',
95 );
96
97 my $m_cache;
98
99 sub marc_add {
100 my $m_cache = \shift || die "need m_cache";
101 my $fld = shift || die "need field!";
102 my $sf = shift;
103 my $data = shift || return;
104
105 #print "add: $fld",($sf ? "^".$sf : ''),": $data\n";
106
107 if ($sf) {
108 push @{$$m_cache->{tmp}->{$fld}}, $sf;
109 }
110 push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data);
111 }
112
113 sub marc_rep {
114 my $m_cache = \shift || die "need m_cache";
115 foreach my $fld (@_) {
116 #print "marc_rep: $fld\n";
117 push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld});
118 delete $$m_cache->{tmp}->{$fld};
119 }
120 }
121
122 sub marc_single {
123 my $m_cache = \shift || die "need m_cache";
124 foreach my $fld (@_) {
125 #print "marc_single: $fld\n";
126
127 die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld});
128
129 $$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld});
130 delete $$m_cache->{tmp}->{$fld};
131 }
132 }
133
134 sub marc_add_rep {
135 my $m_cache = \shift || die "need m_cache";
136 my $fld = shift || die "need field!";
137 my $sf = shift;
138 my $data = shift || return;
139
140 marc_add($$m_cache,$fld,$sf,$data);
141 marc_rep($$m_cache,$fld);
142 }
143
144 sub marc_add_single {
145 my $m_cache = \shift || die "need m_cache";
146 my $fld = shift || die "need field!";
147 my $sf = shift;
148 my $data = shift || return;
149
150 marc_add($$m_cache,$fld,$sf,$data);
151 marc_single($$m_cache,$fld);
152 }
153
154 my $journal = 0;
155 # Journals start with c- in our MODS
156 $journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/);
157
158 foreach my $t (@{$ref->{subject}->{topic}}) {
159 marc_add($m_cache,'610','a', $t);
160 marc_rep($m_cache,'610');
161 }
162
163 my $fld_700 = '700';
164 my $fld_710 = '710';
165
166 foreach my $name (@{$ref->{name}}) {
167 my $role = $name->{role}->{roleTerm}->{content};
168 next if (! $role);
169 if ($role eq "author") {
170 marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family});
171 marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given});
172 marc_add($m_cache,$fld_700,'4',$role);
173
174 marc_rep($m_cache,$fld_700);
175
176 # first author goes in 700, others in 701
177 $fld_700 = '701';
178 } elsif ($role eq "editor" or $role eq "illustrator") {
179 marc_add($m_cache,'702','a',$name->{namePart}->{family});
180 marc_add($m_cache,'702','b',$name->{namePart}->{given});
181 marc_add($m_cache,'702','4',$role);
182 marc_rep($m_cache,'702');
183 } elsif ($role eq "corporate") {
184 marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart});
185 $fld_710 = '711';
186 } elsif ($role eq "conference") {
187 marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart});
188 $fld_710 = '711';
189 } else {
190 die "FATAL: don't know how to map role '$role'" if ($role);
191 }
192 }
193
194 my $note = $ref->{note};
195
196 if ($note) {
197 foreach my $n (split(/\s*;\s+/, $note)) {
198 if ($n =~ s/bibliogr:\s+//i) {
199 marc_add_rep($m_cache,'320','a',"Bibliografija: $n");
200 } elsif ($n =~ s/ilustr:\s+//i) {
201 marc_add($m_cache,'215','c', $n);
202 } else {
203 marc_add_rep($m_cache,'320','a',$n);
204 }
205 }
206 }
207
208
209 my $type = $ref->{identifier}->{type};
210
211 if ($type) {
212 if ($type eq "isbn") {
213 marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content});
214 } elsif ($type eq "issn") {
215 marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content});
216 } else {
217 die "unknown identifier type $type";
218 }
219 }
220
221 my $phy_desc = $ref->{physicalDescription};
222 if ($phy_desc) {
223 my $tmp;
224 foreach my $t (split(/\s*;\s+/, $phy_desc)) {
225 if ($t =~ m/([^:]+):\s+(.+)$/) {
226 $tmp->{$1} = $2;
227 } else {
228 die "can't parse $t";
229 }
230 }
231 my $data = $tmp->{pagin};
232 $data .= ", " if ($data);
233 if ($tmp->{str}) {
234 $data .= $tmp->{str}." str";
235 }
236 marc_add($m_cache,'215','a', $data) if ($data);
237 marc_add($m_cache,'215','d', $tmp->{visina});
238 }
239 marc_rep($m_cache,'215');
240
241 marc_add_single($m_cache,'001',undef,$ref->{recordInfo}->{recordIdentifier});
242
243 marc_add($m_cache,'200','a',$ref->{titleInfo}->{title});
244 marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle});
245 marc_single($m_cache,'200');
246
247 foreach my $c (@{$ref->{classification}}) {
248 if ($c->{'authority'} eq "udc") {
249 marc_add_rep($m_cache,'675','a', $c->{'content'});
250 }
251 }
252
253 my $related = $ref->{relatedItem}->{type};
254 if ($related) {
255 if ($related eq "series") {
256 marc_add($m_cache,'225','a',$ref->{relatedItem}->{titleInfo}->{title});
257 marc_add($m_cache,'999','a',$ref->{relatedItem}->{titleInfo}->{partNumber});
258 marc_rep($m_cache,'225','999');
259 } elsif ($related eq "preceding") {
260 marc_add_rep($m_cache,'430','a',$ref->{relatedItem}->{titleInfo}->{title});
261 } else {
262 die "can't parse related item type $related" if ($related);
263 }
264 }
265
266 marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition});
267
268 my $publisher = $ref->{originInfo}->{publisher};
269 if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) {
270 marc_add($m_cache,'210','a', $2);
271 marc_add($m_cache,'210','c', $1);
272 } else {
273 marc_add($m_cache,'210','c', $publisher);
274 }
275
276 marc_add($m_cache,'210','a',$ref->{originInfo}->{place});
277 marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued});
278
279 marc_single($m_cache,'210');
280
281 marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal);
282
283 $nr++;
284 print "$nr " if ($nr % 100 == 0);
285
286 # dump record
287 my $m=$marc->createrecord({leader=>"00000nam 2200000 a 4500"});
288
289 foreach my $fld (keys %{$m_cache->{array}}) {
290 foreach my $arr (@{$m_cache->{array}->{$fld}}) {
291 #print "array = ",Dumper($arr);
292 my ($i1,$i2);
293 # do we have indicators?
294 if ($fld =~ m/^(.+)\t(.)(.)$/) {
295 $fld = $1;
296 ($i1,$i2) = ($2,$3);
297 }
298 $marc->addfield({record=>$m,
299 field=>$fld,
300 i1=>$i1,
301 i2=>$i2,
302 value=>$arr
303 });
304 }
305 }
306
307 foreach my $fld (keys %{$m_cache->{single}}) {
308 #print "single = ",Dumper($m_cache->{single}->{$fld});
309 my ($i1,$i2);
310 # do we have indicators?
311 if ($fld =~ m/^(.+)\t(.)(.)$/) {
312 $fld = $1;
313 ($i1,$i2) = ($2,$3);
314 }
315 $marc->addfield({record=>$m,
316 field=>$fld,
317 i1=>$i1,
318 i2=>$i2,
319 value=>$m_cache->{single}->{$fld}
320 });
321 }
322
323 $m_cache = {};
324
325 $t->purge; # frees the memory
326 }
327

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26