/[webpac]/trunk/tools/mods2unimarc.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/tools/mods2unimarc.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 336 - (hide annotations)
Thu Jun 10 19:20:05 2004 UTC (19 years, 9 months ago) by dpavlin
File MIME type: text/plain
File size: 8771 byte(s)
save MFN as field 001

1 dpavlin 325 #!/usr/bin/perl -w
2    
3     =head1 NAME
4    
5     mods2marc.pl - convert MODS XML back to MARC (ISO2709)
6    
7     =head1 SYNOPSIS
8    
9 dpavlin 328 mods2marc.pl export.marc mods.xml [mods2.xml ... ]
10 dpavlin 325
11     =head1 DESCRIPTION
12    
13     This script will convert MODS format
14     L<http://www.loc.gov/standards/mods/>
15     back to MARC (ISO2709) format.
16    
17     Since conversion back to MARC is not simple, lot of things are hard-coded
18     in this script.
19    
20     This script B<is somewhat specific> to MODS export from
21     Faculty of Electrical Engineering and Computing
22 dpavlin 326 so you might want to edit it (among other thing, it includes a lot
23     of fields which are in Croatian).
24 dpavlin 325
25 dpavlin 326 Feel free to hack this script and convert it to your own needs.
26    
27 dpavlin 327 =head1 CAVEAT
28 dpavlin 325
29 dpavlin 327 This script will parse imput XML twice: once with C<XML::Twig> and
30     then each entry with C<XML::Simple> to produce in-memory structure.
31     That's because I wanted to keep node selection logical (and perl-like).
32 dpavlin 325
33 dpavlin 327 If you don't like it, you can rewrite this script to use XPATH. I tried
34     and failed (it seems that MODS is too complicated for my limited knowledge
35     of XPATH).
36    
37 dpavlin 325 =cut
38    
39     use strict;
40     use XML::Twig;
41     use XML::Simple;
42     use MARC;
43     use Text::Iconv;
44    
45     use Data::Dumper;
46    
47 dpavlin 328 my $marc_file = shift @ARGV || die "$0: need MARC export file";
48     die "$0: need at least one MODS XML file" if (! @ARGV);
49 dpavlin 325
50     $|=1;
51     my $nr = 0;
52    
53     my $marc = MARC->new;
54    
55 dpavlin 327 my $ENCODING = 'ISO-8859-2';
56 dpavlin 336 $ENCODING = 'windows-1250';
57 dpavlin 327
58 dpavlin 325 my $twig=XML::Twig->new(
59 dpavlin 327 twig_roots => { 'mods' => \&mods },
60     output_encoding => 'UTF8',
61 dpavlin 325 );
62    
63 dpavlin 327 my $utf2iso = Text::Iconv->new("UTF8", $ENCODING);
64 dpavlin 325
65 dpavlin 328 foreach my $xml_file (@ARGV) {
66     print "$xml_file: ";
67     $twig->parsefile($xml_file);
68     $twig->purge;
69     print "$nr\n";
70     }
71 dpavlin 325
72 dpavlin 328 print "Saving MARC file...\n";
73    
74 dpavlin 325 $marc->output({file=>"> $marc_file",'format'=>"usmarc"});
75    
76 dpavlin 327 sub mods {
77 dpavlin 325 my( $t, $elt)= @_;
78    
79     my $xml=$elt->xml_string;
80 dpavlin 327 my $ref = XMLin('<xml>'.$xml.'</xml>',
81 dpavlin 325 ForceArray => [
82     'name',
83     'classification',
84     'topic',
85 dpavlin 328 'relatedItem',
86     'partNumber',
87 dpavlin 325 ],
88     KeyAttr => {
89     'namePart' => 'type',
90     'identifier' => 'type',
91     'namePart' => 'type',
92     'role' => 'type',
93     },
94     GroupTags => {
95     'place' => 'placeTerm',
96     'physicalDescription' => 'extent',
97     'roleTerm' => 'content',
98     },
99     ContentKey => '-content',
100     );
101    
102 dpavlin 326 my $m_cache;
103 dpavlin 325
104 dpavlin 326 sub marc_add {
105 dpavlin 327 my $m_cache = \shift || die "need m_cache";
106 dpavlin 326 my $fld = shift || die "need field!";
107 dpavlin 327 my $sf = shift;
108     my $data = shift || return;
109 dpavlin 325
110 dpavlin 327 #print "add: $fld",($sf ? "^".$sf : ''),": $data\n";
111 dpavlin 325
112 dpavlin 327 if ($sf) {
113     push @{$$m_cache->{tmp}->{$fld}}, $sf;
114 dpavlin 325 }
115 dpavlin 327 push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data);
116     }
117 dpavlin 325
118 dpavlin 327 sub marc_rep {
119     my $m_cache = \shift || die "need m_cache";
120     foreach my $fld (@_) {
121     #print "marc_rep: $fld\n";
122     push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld});
123     delete $$m_cache->{tmp}->{$fld};
124     }
125     }
126 dpavlin 325
127 dpavlin 327 sub marc_single {
128     my $m_cache = \shift || die "need m_cache";
129     foreach my $fld (@_) {
130     #print "marc_single: $fld\n";
131 dpavlin 325
132 dpavlin 327 die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld});
133 dpavlin 326
134 dpavlin 327 $$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld});
135     delete $$m_cache->{tmp}->{$fld};
136     }
137 dpavlin 325 }
138    
139 dpavlin 327 sub marc_add_rep {
140     my $m_cache = \shift || die "need m_cache";
141     my $fld = shift || die "need field!";
142     my $sf = shift;
143     my $data = shift || return;
144    
145     marc_add($$m_cache,$fld,$sf,$data);
146     marc_rep($$m_cache,$fld);
147     }
148    
149     sub marc_add_single {
150     my $m_cache = \shift || die "need m_cache";
151     my $fld = shift || die "need field!";
152     my $sf = shift;
153     my $data = shift || return;
154    
155     marc_add($$m_cache,$fld,$sf,$data);
156     marc_single($$m_cache,$fld);
157     }
158    
159 dpavlin 326 my $journal = 0;
160 dpavlin 327 # Journals start with c- in our MODS
161 dpavlin 326 $journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/);
162 dpavlin 325
163 dpavlin 327 foreach my $t (@{$ref->{subject}->{topic}}) {
164     marc_add($m_cache,'610','a', $t);
165     marc_rep($m_cache,'610');
166     }
167 dpavlin 325
168 dpavlin 327 my $fld_700 = '700';
169     my $fld_710 = '710';
170    
171 dpavlin 325 foreach my $name (@{$ref->{name}}) {
172     my $role = $name->{role}->{roleTerm}->{content};
173     next if (! $role);
174     if ($role eq "author") {
175 dpavlin 327 marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family});
176     marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given});
177     marc_add($m_cache,$fld_700,'4',$role);
178 dpavlin 325
179 dpavlin 327 marc_rep($m_cache,$fld_700);
180    
181 dpavlin 325 # first author goes in 700, others in 701
182 dpavlin 327 $fld_700 = '701';
183 dpavlin 325 } elsif ($role eq "editor" or $role eq "illustrator") {
184 dpavlin 326 marc_add($m_cache,'702','a',$name->{namePart}->{family});
185     marc_add($m_cache,'702','b',$name->{namePart}->{given});
186     marc_add($m_cache,'702','4',$role);
187 dpavlin 327 marc_rep($m_cache,'702');
188     } elsif ($role eq "corporate") {
189     marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart});
190     $fld_710 = '711';
191     } elsif ($role eq "conference") {
192     marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart});
193     $fld_710 = '711';
194 dpavlin 325 } else {
195     die "FATAL: don't know how to map role '$role'" if ($role);
196     }
197     }
198    
199     my $note = $ref->{note};
200    
201     if ($note) {
202     foreach my $n (split(/\s*;\s+/, $note)) {
203     if ($n =~ s/bibliogr:\s+//i) {
204 dpavlin 327 marc_add_rep($m_cache,'320','a',"Bibliografija: $n");
205 dpavlin 325 } elsif ($n =~ s/ilustr:\s+//i) {
206 dpavlin 326 marc_add($m_cache,'215','c', $n);
207 dpavlin 325 } else {
208 dpavlin 327 marc_add_rep($m_cache,'320','a',$n);
209 dpavlin 325 }
210     }
211     }
212    
213 dpavlin 327
214 dpavlin 325 my $type = $ref->{identifier}->{type};
215    
216     if ($type) {
217     if ($type eq "isbn") {
218 dpavlin 327 marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content});
219 dpavlin 325 } elsif ($type eq "issn") {
220 dpavlin 327 marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content});
221 dpavlin 328 } elsif ($type eq "uri") {
222     marc_add_rep($m_cache,'856','u',$ref->{identifier}->{content});
223 dpavlin 325 } else {
224     die "unknown identifier type $type";
225     }
226     }
227    
228     my $phy_desc = $ref->{physicalDescription};
229     if ($phy_desc) {
230     my $tmp;
231     foreach my $t (split(/\s*;\s+/, $phy_desc)) {
232     if ($t =~ m/([^:]+):\s+(.+)$/) {
233     $tmp->{$1} = $2;
234     } else {
235 dpavlin 329 print STDERR "can't parse '$t' in ",Dumper($phy_desc);
236 dpavlin 325 }
237     }
238     my $data = $tmp->{pagin};
239     $data .= ", " if ($data);
240     if ($tmp->{str}) {
241     $data .= $tmp->{str}." str";
242     }
243 dpavlin 326 marc_add($m_cache,'215','a', $data) if ($data);
244     marc_add($m_cache,'215','d', $tmp->{visina});
245 dpavlin 325 }
246 dpavlin 327 marc_rep($m_cache,'215');
247 dpavlin 325
248 dpavlin 336 my $mfn = $ref->{recordInfo}->{recordIdentifier};
249     $mfn =~ s/[^0-9]//g;
250     marc_add_single($m_cache,'001',undef,$mfn);
251 dpavlin 325
252 dpavlin 326 marc_add($m_cache,'200','a',$ref->{titleInfo}->{title});
253     marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle});
254 dpavlin 327 marc_single($m_cache,'200');
255 dpavlin 325
256 dpavlin 327 foreach my $c (@{$ref->{classification}}) {
257     if ($c->{'authority'} eq "udc") {
258     marc_add_rep($m_cache,'675','a', $c->{'content'});
259     }
260     }
261 dpavlin 325
262 dpavlin 328 foreach my $ri (@{$ref->{relatedItem}}) {
263     my $related = $ri->{type};
264     if ($related) {
265     if ($related eq "series") {
266     marc_add_rep($m_cache,'225','a',$ri->{titleInfo}->{title});
267     foreach my $pn (@{$ri->{titleInfo}->{partNumber}}) {
268 dpavlin 330 if ($journal) {
269     marc_add_rep($m_cache,'999','a',$pn);
270     } else {
271     marc_add_rep($m_cache,'225','v',$pn);
272     }
273 dpavlin 328 }
274     } elsif ($related eq "preceding") {
275 dpavlin 330 marc_add($m_cache,'520','a',$ri->{titleInfo}->{title});
276     if ($ri->{identifier}) {
277     if ($ri->{identifier}->{type} eq "issn") {
278     marc_add($m_cache,'520','x',$ri->{identifier}->{content});
279     } else {
280     die "can't store identifier type $type";
281     }
282     }
283     marc_rep($m_cache,'520');
284 dpavlin 328 } else {
285     die "can't parse related item type $related" if ($related);
286     }
287 dpavlin 325 }
288     }
289    
290 dpavlin 327 marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition});
291 dpavlin 325
292 dpavlin 329 marc_add($m_cache,'210','a',$ref->{originInfo}->{place});
293    
294 dpavlin 325 my $publisher = $ref->{originInfo}->{publisher};
295     if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) {
296 dpavlin 326 marc_add($m_cache,'210','a', $2);
297     marc_add($m_cache,'210','c', $1);
298 dpavlin 325 } else {
299 dpavlin 326 marc_add($m_cache,'210','c', $publisher);
300 dpavlin 325 }
301    
302 dpavlin 326 marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued});
303    
304 dpavlin 327 marc_single($m_cache,'210');
305    
306     marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal);
307    
308 dpavlin 325 $nr++;
309     print "$nr " if ($nr % 100 == 0);
310    
311 dpavlin 326 # dump record
312 dpavlin 328 my $bib_level = "m";
313     $bib_level = "s" if ($journal);
314     my $m=$marc->createrecord({leader=>"00000na".$bib_level." 2200000 a 4500"});
315 dpavlin 327
316     foreach my $fld (keys %{$m_cache->{array}}) {
317     foreach my $arr (@{$m_cache->{array}->{$fld}}) {
318     #print "array = ",Dumper($arr);
319     my ($i1,$i2);
320     # do we have indicators?
321     if ($fld =~ m/^(.+)\t(.)(.)$/) {
322     $fld = $1;
323     ($i1,$i2) = ($2,$3);
324     }
325     $marc->addfield({record=>$m,
326     field=>$fld,
327     i1=>$i1,
328     i2=>$i2,
329     value=>$arr
330     });
331     }
332     }
333    
334     foreach my $fld (keys %{$m_cache->{single}}) {
335     #print "single = ",Dumper($m_cache->{single}->{$fld});
336     my ($i1,$i2);
337     # do we have indicators?
338     if ($fld =~ m/^(.+)\t(.)(.)$/) {
339     $fld = $1;
340     ($i1,$i2) = ($2,$3);
341     }
342 dpavlin 326 $marc->addfield({record=>$m,
343     field=>$fld,
344 dpavlin 327 i1=>$i1,
345     i2=>$i2,
346     value=>$m_cache->{single}->{$fld}
347 dpavlin 326 });
348     }
349    
350 dpavlin 327 $m_cache = {};
351    
352 dpavlin 325 $t->purge; # frees the memory
353     }
354    

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26