/[webpac]/trunk/tools/mods2unimarc.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/tools/mods2unimarc.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 329 - (hide annotations)
Sat May 15 20:54:40 2004 UTC (19 years, 10 months ago) by dpavlin
File MIME type: text/plain
File size: 8375 byte(s)
don't die on field which can't be parsed (just print warning), corrected
subfield order for field 210

1 dpavlin 325 #!/usr/bin/perl -w
2    
3     =head1 NAME
4    
5     mods2marc.pl - convert MODS XML back to MARC (ISO2709)
6    
7     =head1 SYNOPSIS
8    
9 dpavlin 328 mods2marc.pl export.marc mods.xml [mods2.xml ... ]
10 dpavlin 325
11     =head1 DESCRIPTION
12    
13     This script will convert MODS format
14     L<http://www.loc.gov/standards/mods/>
15     back to MARC (ISO2709) format.
16    
17     Since conversion back to MARC is not simple, lot of things are hard-coded
18     in this script.
19    
20     This script B<is somewhat specific> to MODS export from
21     Faculty of Electrical Engineering and Computing
22 dpavlin 326 so you might want to edit it (among other thing, it includes a lot
23     of fields which are in Croatian).
24 dpavlin 325
25 dpavlin 326 Feel free to hack this script and convert it to your own needs.
26    
27 dpavlin 327 =head1 CAVEAT
28 dpavlin 325
29 dpavlin 327 This script will parse imput XML twice: once with C<XML::Twig> and
30     then each entry with C<XML::Simple> to produce in-memory structure.
31     That's because I wanted to keep node selection logical (and perl-like).
32 dpavlin 325
33 dpavlin 327 If you don't like it, you can rewrite this script to use XPATH. I tried
34     and failed (it seems that MODS is too complicated for my limited knowledge
35     of XPATH).
36    
37 dpavlin 325 =cut
38    
39     use strict;
40     use XML::Twig;
41     use XML::Simple;
42     use MARC;
43     use Text::Iconv;
44    
45     use Data::Dumper;
46    
47 dpavlin 328 my $marc_file = shift @ARGV || die "$0: need MARC export file";
48     die "$0: need at least one MODS XML file" if (! @ARGV);
49 dpavlin 325
50     $|=1;
51     my $nr = 0;
52    
53     my $marc = MARC->new;
54    
55 dpavlin 327 my $ENCODING = 'ISO-8859-2';
56    
57 dpavlin 325 my $twig=XML::Twig->new(
58 dpavlin 327 twig_roots => { 'mods' => \&mods },
59     output_encoding => 'UTF8',
60 dpavlin 325 );
61    
62 dpavlin 327 my $utf2iso = Text::Iconv->new("UTF8", $ENCODING);
63 dpavlin 325
64 dpavlin 328 foreach my $xml_file (@ARGV) {
65     print "$xml_file: ";
66     $twig->parsefile($xml_file);
67     $twig->purge;
68     print "$nr\n";
69     }
70 dpavlin 325
71 dpavlin 328 print "Saving MARC file...\n";
72    
73 dpavlin 325 $marc->output({file=>"> $marc_file",'format'=>"usmarc"});
74    
75 dpavlin 327 sub mods {
76 dpavlin 325 my( $t, $elt)= @_;
77    
78     my $xml=$elt->xml_string;
79 dpavlin 327 my $ref = XMLin('<xml>'.$xml.'</xml>',
80 dpavlin 325 ForceArray => [
81     'name',
82     'classification',
83     'topic',
84 dpavlin 328 'relatedItem',
85     'partNumber',
86 dpavlin 325 ],
87     KeyAttr => {
88     'namePart' => 'type',
89     'identifier' => 'type',
90     'namePart' => 'type',
91     'role' => 'type',
92     },
93     GroupTags => {
94     'place' => 'placeTerm',
95     'physicalDescription' => 'extent',
96     'roleTerm' => 'content',
97     },
98     ContentKey => '-content',
99     );
100    
101 dpavlin 326 my $m_cache;
102 dpavlin 325
103 dpavlin 326 sub marc_add {
104 dpavlin 327 my $m_cache = \shift || die "need m_cache";
105 dpavlin 326 my $fld = shift || die "need field!";
106 dpavlin 327 my $sf = shift;
107     my $data = shift || return;
108 dpavlin 325
109 dpavlin 327 #print "add: $fld",($sf ? "^".$sf : ''),": $data\n";
110 dpavlin 325
111 dpavlin 327 if ($sf) {
112     push @{$$m_cache->{tmp}->{$fld}}, $sf;
113 dpavlin 325 }
114 dpavlin 327 push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data);
115     }
116 dpavlin 325
117 dpavlin 327 sub marc_rep {
118     my $m_cache = \shift || die "need m_cache";
119     foreach my $fld (@_) {
120     #print "marc_rep: $fld\n";
121     push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld});
122     delete $$m_cache->{tmp}->{$fld};
123     }
124     }
125 dpavlin 325
126 dpavlin 327 sub marc_single {
127     my $m_cache = \shift || die "need m_cache";
128     foreach my $fld (@_) {
129     #print "marc_single: $fld\n";
130 dpavlin 325
131 dpavlin 327 die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld});
132 dpavlin 326
133 dpavlin 327 $$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld});
134     delete $$m_cache->{tmp}->{$fld};
135     }
136 dpavlin 325 }
137    
138 dpavlin 327 sub marc_add_rep {
139     my $m_cache = \shift || die "need m_cache";
140     my $fld = shift || die "need field!";
141     my $sf = shift;
142     my $data = shift || return;
143    
144     marc_add($$m_cache,$fld,$sf,$data);
145     marc_rep($$m_cache,$fld);
146     }
147    
148     sub marc_add_single {
149     my $m_cache = \shift || die "need m_cache";
150     my $fld = shift || die "need field!";
151     my $sf = shift;
152     my $data = shift || return;
153    
154     marc_add($$m_cache,$fld,$sf,$data);
155     marc_single($$m_cache,$fld);
156     }
157    
158 dpavlin 326 my $journal = 0;
159 dpavlin 327 # Journals start with c- in our MODS
160 dpavlin 326 $journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/);
161 dpavlin 325
162 dpavlin 327 foreach my $t (@{$ref->{subject}->{topic}}) {
163     marc_add($m_cache,'610','a', $t);
164     marc_rep($m_cache,'610');
165     }
166 dpavlin 325
167 dpavlin 327 my $fld_700 = '700';
168     my $fld_710 = '710';
169    
170 dpavlin 325 foreach my $name (@{$ref->{name}}) {
171     my $role = $name->{role}->{roleTerm}->{content};
172     next if (! $role);
173     if ($role eq "author") {
174 dpavlin 327 marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family});
175     marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given});
176     marc_add($m_cache,$fld_700,'4',$role);
177 dpavlin 325
178 dpavlin 327 marc_rep($m_cache,$fld_700);
179    
180 dpavlin 325 # first author goes in 700, others in 701
181 dpavlin 327 $fld_700 = '701';
182 dpavlin 325 } elsif ($role eq "editor" or $role eq "illustrator") {
183 dpavlin 326 marc_add($m_cache,'702','a',$name->{namePart}->{family});
184     marc_add($m_cache,'702','b',$name->{namePart}->{given});
185     marc_add($m_cache,'702','4',$role);
186 dpavlin 327 marc_rep($m_cache,'702');
187     } elsif ($role eq "corporate") {
188     marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart});
189     $fld_710 = '711';
190     } elsif ($role eq "conference") {
191     marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart});
192     $fld_710 = '711';
193 dpavlin 325 } else {
194     die "FATAL: don't know how to map role '$role'" if ($role);
195     }
196     }
197    
198     my $note = $ref->{note};
199    
200     if ($note) {
201     foreach my $n (split(/\s*;\s+/, $note)) {
202     if ($n =~ s/bibliogr:\s+//i) {
203 dpavlin 327 marc_add_rep($m_cache,'320','a',"Bibliografija: $n");
204 dpavlin 325 } elsif ($n =~ s/ilustr:\s+//i) {
205 dpavlin 326 marc_add($m_cache,'215','c', $n);
206 dpavlin 325 } else {
207 dpavlin 327 marc_add_rep($m_cache,'320','a',$n);
208 dpavlin 325 }
209     }
210     }
211    
212 dpavlin 327
213 dpavlin 325 my $type = $ref->{identifier}->{type};
214    
215     if ($type) {
216     if ($type eq "isbn") {
217 dpavlin 327 marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content});
218 dpavlin 325 } elsif ($type eq "issn") {
219 dpavlin 327 marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content});
220 dpavlin 328 } elsif ($type eq "uri") {
221     marc_add_rep($m_cache,'856','u',$ref->{identifier}->{content});
222 dpavlin 325 } else {
223     die "unknown identifier type $type";
224     }
225     }
226    
227     my $phy_desc = $ref->{physicalDescription};
228     if ($phy_desc) {
229     my $tmp;
230     foreach my $t (split(/\s*;\s+/, $phy_desc)) {
231     if ($t =~ m/([^:]+):\s+(.+)$/) {
232     $tmp->{$1} = $2;
233     } else {
234 dpavlin 329 print STDERR "can't parse '$t' in ",Dumper($phy_desc);
235 dpavlin 325 }
236     }
237     my $data = $tmp->{pagin};
238     $data .= ", " if ($data);
239     if ($tmp->{str}) {
240     $data .= $tmp->{str}." str";
241     }
242 dpavlin 326 marc_add($m_cache,'215','a', $data) if ($data);
243     marc_add($m_cache,'215','d', $tmp->{visina});
244 dpavlin 325 }
245 dpavlin 327 marc_rep($m_cache,'215');
246 dpavlin 325
247 dpavlin 327 marc_add_single($m_cache,'001',undef,$ref->{recordInfo}->{recordIdentifier});
248 dpavlin 325
249 dpavlin 326 marc_add($m_cache,'200','a',$ref->{titleInfo}->{title});
250     marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle});
251 dpavlin 327 marc_single($m_cache,'200');
252 dpavlin 325
253 dpavlin 327 foreach my $c (@{$ref->{classification}}) {
254     if ($c->{'authority'} eq "udc") {
255     marc_add_rep($m_cache,'675','a', $c->{'content'});
256     }
257     }
258 dpavlin 325
259 dpavlin 328 foreach my $ri (@{$ref->{relatedItem}}) {
260     my $related = $ri->{type};
261     if ($related) {
262     if ($related eq "series") {
263     marc_add_rep($m_cache,'225','a',$ri->{titleInfo}->{title});
264     foreach my $pn (@{$ri->{titleInfo}->{partNumber}}) {
265     marc_add_rep($m_cache,'999','a',$pn);
266     }
267     } elsif ($related eq "preceding") {
268     marc_add_rep($m_cache,'430','a',$ri->{titleInfo}->{title});
269     } else {
270     die "can't parse related item type $related" if ($related);
271     }
272 dpavlin 325 }
273     }
274    
275 dpavlin 327 marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition});
276 dpavlin 325
277 dpavlin 329 marc_add($m_cache,'210','a',$ref->{originInfo}->{place});
278    
279 dpavlin 325 my $publisher = $ref->{originInfo}->{publisher};
280     if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) {
281 dpavlin 326 marc_add($m_cache,'210','a', $2);
282     marc_add($m_cache,'210','c', $1);
283 dpavlin 325 } else {
284 dpavlin 326 marc_add($m_cache,'210','c', $publisher);
285 dpavlin 325 }
286    
287 dpavlin 326 marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued});
288    
289 dpavlin 327 marc_single($m_cache,'210');
290    
291     marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal);
292    
293 dpavlin 325 $nr++;
294     print "$nr " if ($nr % 100 == 0);
295    
296 dpavlin 326 # dump record
297 dpavlin 328 my $bib_level = "m";
298     $bib_level = "s" if ($journal);
299     my $m=$marc->createrecord({leader=>"00000na".$bib_level." 2200000 a 4500"});
300 dpavlin 327
301     foreach my $fld (keys %{$m_cache->{array}}) {
302     foreach my $arr (@{$m_cache->{array}->{$fld}}) {
303     #print "array = ",Dumper($arr);
304     my ($i1,$i2);
305     # do we have indicators?
306     if ($fld =~ m/^(.+)\t(.)(.)$/) {
307     $fld = $1;
308     ($i1,$i2) = ($2,$3);
309     }
310     $marc->addfield({record=>$m,
311     field=>$fld,
312     i1=>$i1,
313     i2=>$i2,
314     value=>$arr
315     });
316     }
317     }
318    
319     foreach my $fld (keys %{$m_cache->{single}}) {
320     #print "single = ",Dumper($m_cache->{single}->{$fld});
321     my ($i1,$i2);
322     # do we have indicators?
323     if ($fld =~ m/^(.+)\t(.)(.)$/) {
324     $fld = $1;
325     ($i1,$i2) = ($2,$3);
326     }
327 dpavlin 326 $marc->addfield({record=>$m,
328     field=>$fld,
329 dpavlin 327 i1=>$i1,
330     i2=>$i2,
331     value=>$m_cache->{single}->{$fld}
332 dpavlin 326 });
333     }
334    
335 dpavlin 327 $m_cache = {};
336    
337 dpavlin 325 $t->purge; # frees the memory
338     }
339    

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26