/[webpac]/trunk2/tools/mods2unimarc.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk2/tools/mods2unimarc.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 327 - (hide annotations)
Sat May 15 18:54:41 2004 UTC (19 years, 11 months ago) by dpavlin
Original Path: trunk/tools/mods2unimarc.pl
File MIME type: text/plain
File size: 8121 byte(s)
first really working version: support single and repeating fields correctly,
output encodings implemented, support for indicators

1 dpavlin 325 #!/usr/bin/perl -w
2    
3     =head1 NAME
4    
5     mods2marc.pl - convert MODS XML back to MARC (ISO2709)
6    
7     =head1 SYNOPSIS
8    
9     mods2marc.pl mods.xml export.marc
10    
11     =head1 DESCRIPTION
12    
13     This script will convert MODS format
14     L<http://www.loc.gov/standards/mods/>
15     back to MARC (ISO2709) format.
16    
17     Since conversion back to MARC is not simple, lot of things are hard-coded
18     in this script.
19    
20     This script B<is somewhat specific> to MODS export from
21     Faculty of Electrical Engineering and Computing
22 dpavlin 326 so you might want to edit it (among other thing, it includes a lot
23     of fields which are in Croatian).
24 dpavlin 325
25 dpavlin 326 Feel free to hack this script and convert it to your own needs.
26    
27 dpavlin 327 =head1 CAVEAT
28 dpavlin 325
29 dpavlin 327 This script will parse imput XML twice: once with C<XML::Twig> and
30     then each entry with C<XML::Simple> to produce in-memory structure.
31     That's because I wanted to keep node selection logical (and perl-like).
32 dpavlin 325
33 dpavlin 327 If you don't like it, you can rewrite this script to use XPATH. I tried
34     and failed (it seems that MODS is too complicated for my limited knowledge
35     of XPATH).
36    
37 dpavlin 325 =cut
38    
39     use strict;
40     use XML::Twig;
41     use XML::Simple;
42     use MARC;
43     use Text::Iconv;
44    
45     use Data::Dumper;
46    
47     my $xml_file = "/data/tehnika/fer/all.xml";
48     $xml_file = "/data/tehnika/fer/modsFER_1.xml";
49 dpavlin 327 $xml_file = "/data/tehnika/fer/mods-small.xml";
50 dpavlin 325 my $marc_file = "fer.marc";
51    
52     $|=1;
53     my $nr = 0;
54    
55     my $marc = MARC->new;
56    
57 dpavlin 327 my $ENCODING = 'ISO-8859-2';
58    
59 dpavlin 325 my $twig=XML::Twig->new(
60 dpavlin 327 twig_roots => { 'mods' => \&mods },
61     output_encoding => 'UTF8',
62 dpavlin 325 );
63    
64 dpavlin 327 my $utf2iso = Text::Iconv->new("UTF8", $ENCODING);
65 dpavlin 325
66 dpavlin 327 print "$xml_file: ";
67 dpavlin 325 $twig->parsefile($xml_file);
68     $twig->purge;
69 dpavlin 327 print "$nr\nSaving MARC file...\n";
70 dpavlin 325
71     $marc->output({file=>"> $marc_file",'format'=>"usmarc"});
72    
73 dpavlin 327 sub mods {
74 dpavlin 325 my( $t, $elt)= @_;
75    
76     my $xml=$elt->xml_string;
77 dpavlin 327 my $ref = XMLin('<xml>'.$xml.'</xml>',
78 dpavlin 325 ForceArray => [
79     'name',
80     'classification',
81     'topic',
82     ],
83     KeyAttr => {
84     'namePart' => 'type',
85     'identifier' => 'type',
86     'namePart' => 'type',
87     'role' => 'type',
88     },
89     GroupTags => {
90     'place' => 'placeTerm',
91     'physicalDescription' => 'extent',
92     'roleTerm' => 'content',
93     },
94     ContentKey => '-content',
95     );
96    
97 dpavlin 326 my $m_cache;
98 dpavlin 325
99 dpavlin 326 sub marc_add {
100 dpavlin 327 my $m_cache = \shift || die "need m_cache";
101 dpavlin 326 my $fld = shift || die "need field!";
102 dpavlin 327 my $sf = shift;
103     my $data = shift || return;
104 dpavlin 325
105 dpavlin 327 #print "add: $fld",($sf ? "^".$sf : ''),": $data\n";
106 dpavlin 325
107 dpavlin 327 if ($sf) {
108     push @{$$m_cache->{tmp}->{$fld}}, $sf;
109 dpavlin 325 }
110 dpavlin 327 push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data);
111     }
112 dpavlin 325
113 dpavlin 327 sub marc_rep {
114     my $m_cache = \shift || die "need m_cache";
115     foreach my $fld (@_) {
116     #print "marc_rep: $fld\n";
117     push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld});
118     delete $$m_cache->{tmp}->{$fld};
119     }
120     }
121 dpavlin 325
122 dpavlin 327 sub marc_single {
123     my $m_cache = \shift || die "need m_cache";
124     foreach my $fld (@_) {
125     #print "marc_single: $fld\n";
126 dpavlin 325
127 dpavlin 327 die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld});
128 dpavlin 326
129 dpavlin 327 $$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld});
130     delete $$m_cache->{tmp}->{$fld};
131     }
132 dpavlin 325 }
133    
134 dpavlin 327 sub marc_add_rep {
135     my $m_cache = \shift || die "need m_cache";
136     my $fld = shift || die "need field!";
137     my $sf = shift;
138     my $data = shift || return;
139    
140     marc_add($$m_cache,$fld,$sf,$data);
141     marc_rep($$m_cache,$fld);
142     }
143    
144     sub marc_add_single {
145     my $m_cache = \shift || die "need m_cache";
146     my $fld = shift || die "need field!";
147     my $sf = shift;
148     my $data = shift || return;
149    
150     marc_add($$m_cache,$fld,$sf,$data);
151     marc_single($$m_cache,$fld);
152     }
153    
154 dpavlin 326 my $journal = 0;
155 dpavlin 327 # Journals start with c- in our MODS
156 dpavlin 326 $journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/);
157 dpavlin 325
158 dpavlin 327 foreach my $t (@{$ref->{subject}->{topic}}) {
159     marc_add($m_cache,'610','a', $t);
160     marc_rep($m_cache,'610');
161     }
162 dpavlin 325
163 dpavlin 327 my $fld_700 = '700';
164     my $fld_710 = '710';
165    
166 dpavlin 325 foreach my $name (@{$ref->{name}}) {
167     my $role = $name->{role}->{roleTerm}->{content};
168     next if (! $role);
169     if ($role eq "author") {
170 dpavlin 327 marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family});
171     marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given});
172     marc_add($m_cache,$fld_700,'4',$role);
173 dpavlin 325
174 dpavlin 327 marc_rep($m_cache,$fld_700);
175    
176 dpavlin 325 # first author goes in 700, others in 701
177 dpavlin 327 $fld_700 = '701';
178 dpavlin 325 } elsif ($role eq "editor" or $role eq "illustrator") {
179 dpavlin 326 marc_add($m_cache,'702','a',$name->{namePart}->{family});
180     marc_add($m_cache,'702','b',$name->{namePart}->{given});
181     marc_add($m_cache,'702','4',$role);
182 dpavlin 327 marc_rep($m_cache,'702');
183     } elsif ($role eq "corporate") {
184     marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart});
185     $fld_710 = '711';
186     } elsif ($role eq "conference") {
187     marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart});
188     $fld_710 = '711';
189 dpavlin 325 } else {
190     die "FATAL: don't know how to map role '$role'" if ($role);
191     }
192     }
193    
194     my $note = $ref->{note};
195    
196     if ($note) {
197     foreach my $n (split(/\s*;\s+/, $note)) {
198     if ($n =~ s/bibliogr:\s+//i) {
199 dpavlin 327 marc_add_rep($m_cache,'320','a',"Bibliografija: $n");
200 dpavlin 325 } elsif ($n =~ s/ilustr:\s+//i) {
201 dpavlin 326 marc_add($m_cache,'215','c', $n);
202 dpavlin 325 } else {
203 dpavlin 327 marc_add_rep($m_cache,'320','a',$n);
204 dpavlin 325 }
205     }
206     }
207    
208 dpavlin 327
209 dpavlin 325 my $type = $ref->{identifier}->{type};
210    
211     if ($type) {
212     if ($type eq "isbn") {
213 dpavlin 327 marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content});
214 dpavlin 325 } elsif ($type eq "issn") {
215 dpavlin 327 marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content});
216 dpavlin 325 } else {
217     die "unknown identifier type $type";
218     }
219     }
220    
221     my $phy_desc = $ref->{physicalDescription};
222     if ($phy_desc) {
223     my $tmp;
224     foreach my $t (split(/\s*;\s+/, $phy_desc)) {
225     if ($t =~ m/([^:]+):\s+(.+)$/) {
226     $tmp->{$1} = $2;
227     } else {
228     die "can't parse $t";
229     }
230     }
231     my $data = $tmp->{pagin};
232     $data .= ", " if ($data);
233     if ($tmp->{str}) {
234     $data .= $tmp->{str}." str";
235     }
236 dpavlin 326 marc_add($m_cache,'215','a', $data) if ($data);
237     marc_add($m_cache,'215','d', $tmp->{visina});
238 dpavlin 325 }
239 dpavlin 327 marc_rep($m_cache,'215');
240 dpavlin 325
241 dpavlin 327 marc_add_single($m_cache,'001',undef,$ref->{recordInfo}->{recordIdentifier});
242 dpavlin 325
243 dpavlin 326 marc_add($m_cache,'200','a',$ref->{titleInfo}->{title});
244     marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle});
245 dpavlin 327 marc_single($m_cache,'200');
246 dpavlin 325
247 dpavlin 327 foreach my $c (@{$ref->{classification}}) {
248     if ($c->{'authority'} eq "udc") {
249     marc_add_rep($m_cache,'675','a', $c->{'content'});
250     }
251     }
252 dpavlin 325
253     my $related = $ref->{relatedItem}->{type};
254     if ($related) {
255     if ($related eq "series") {
256 dpavlin 326 marc_add($m_cache,'225','a',$ref->{relatedItem}->{titleInfo}->{title});
257     marc_add($m_cache,'999','a',$ref->{relatedItem}->{titleInfo}->{partNumber});
258 dpavlin 327 marc_rep($m_cache,'225','999');
259 dpavlin 325 } elsif ($related eq "preceding") {
260 dpavlin 327 marc_add_rep($m_cache,'430','a',$ref->{relatedItem}->{titleInfo}->{title});
261 dpavlin 325 } else {
262     die "can't parse related item type $related" if ($related);
263     }
264     }
265    
266 dpavlin 327 marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition});
267 dpavlin 325
268     my $publisher = $ref->{originInfo}->{publisher};
269     if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) {
270 dpavlin 326 marc_add($m_cache,'210','a', $2);
271     marc_add($m_cache,'210','c', $1);
272 dpavlin 325 } else {
273 dpavlin 326 marc_add($m_cache,'210','c', $publisher);
274 dpavlin 325 }
275    
276 dpavlin 326 marc_add($m_cache,'210','a',$ref->{originInfo}->{place});
277     marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued});
278    
279 dpavlin 327 marc_single($m_cache,'210');
280    
281     marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal);
282    
283 dpavlin 325 $nr++;
284     print "$nr " if ($nr % 100 == 0);
285    
286 dpavlin 326 # dump record
287 dpavlin 327 my $m=$marc->createrecord({leader=>"00000nam 2200000 a 4500"});
288    
289     foreach my $fld (keys %{$m_cache->{array}}) {
290     foreach my $arr (@{$m_cache->{array}->{$fld}}) {
291     #print "array = ",Dumper($arr);
292     my ($i1,$i2);
293     # do we have indicators?
294     if ($fld =~ m/^(.+)\t(.)(.)$/) {
295     $fld = $1;
296     ($i1,$i2) = ($2,$3);
297     }
298     $marc->addfield({record=>$m,
299     field=>$fld,
300     i1=>$i1,
301     i2=>$i2,
302     value=>$arr
303     });
304     }
305     }
306    
307     foreach my $fld (keys %{$m_cache->{single}}) {
308     #print "single = ",Dumper($m_cache->{single}->{$fld});
309     my ($i1,$i2);
310     # do we have indicators?
311     if ($fld =~ m/^(.+)\t(.)(.)$/) {
312     $fld = $1;
313     ($i1,$i2) = ($2,$3);
314     }
315 dpavlin 326 $marc->addfield({record=>$m,
316     field=>$fld,
317 dpavlin 327 i1=>$i1,
318     i2=>$i2,
319     value=>$m_cache->{single}->{$fld}
320 dpavlin 326 });
321     }
322    
323 dpavlin 327 $m_cache = {};
324    
325 dpavlin 325 $t->purge; # frees the memory
326     }
327    

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26