19 |
|
|
20 |
This script B<is somewhat specific> to MODS export from |
This script B<is somewhat specific> to MODS export from |
21 |
Faculty of Electrical Engineering and Computing |
Faculty of Electrical Engineering and Computing |
22 |
so you might want to edit it |
so you might want to edit it (among other thing, it includes a lot |
23 |
|
of fields which are in Croatian). |
24 |
|
|
25 |
=head1 WARNING |
Feel free to hack this script and convert it to your own needs. |
26 |
|
|
27 |
This script is in state of flux. |
=head1 CAVEAT |
28 |
|
|
29 |
|
This script will parse imput XML twice: once with C<XML::Twig> and |
30 |
|
then each entry with C<XML::Simple> to produce in-memory structure. |
31 |
|
That's because I wanted to keep node selection logical (and perl-like). |
32 |
|
|
33 |
|
If you don't like it, you can rewrite this script to use XPATH. I tried |
34 |
|
and failed (it seems that MODS is too complicated for my limited knowledge |
35 |
|
of XPATH). |
36 |
|
|
37 |
=cut |
=cut |
38 |
|
|
46 |
|
|
47 |
my $xml_file = "/data/tehnika/fer/all.xml"; |
my $xml_file = "/data/tehnika/fer/all.xml"; |
48 |
$xml_file = "/data/tehnika/fer/modsFER_1.xml"; |
$xml_file = "/data/tehnika/fer/modsFER_1.xml"; |
49 |
|
$xml_file = "/data/tehnika/fer/mods-small.xml"; |
50 |
my $marc_file = "fer.marc"; |
my $marc_file = "fer.marc"; |
51 |
|
|
52 |
$|=1; |
$|=1; |
54 |
|
|
55 |
my $marc = MARC->new; |
my $marc = MARC->new; |
56 |
|
|
57 |
|
my $ENCODING = 'ISO-8859-2'; |
58 |
|
|
59 |
my $twig=XML::Twig->new( |
my $twig=XML::Twig->new( |
60 |
twig_roots => { 'mods' => \&item }, |
twig_roots => { 'mods' => \&mods }, |
61 |
output_encoding => 'iso-8859-2', |
output_encoding => 'UTF8', |
62 |
); |
); |
63 |
|
|
64 |
my $utf2iso = Text::Iconv->new("UTF-8", "ISO-8859-2"); |
my $utf2iso = Text::Iconv->new("UTF8", $ENCODING); |
65 |
|
|
66 |
|
print "$xml_file: "; |
67 |
$twig->parsefile($xml_file); |
$twig->parsefile($xml_file); |
68 |
$twig->purge; |
$twig->purge; |
69 |
|
print "$nr\nSaving MARC file...\n"; |
70 |
|
|
71 |
$marc->output({file=>"> $marc_file",'format'=>"usmarc"}); |
$marc->output({file=>"> $marc_file",'format'=>"usmarc"}); |
72 |
|
|
73 |
sub item { |
sub mods { |
74 |
my( $t, $elt)= @_; |
my( $t, $elt)= @_; |
75 |
|
|
76 |
my $xml=$elt->xml_string; |
my $xml=$elt->xml_string; |
77 |
my $ref = XMLin("<xml>".$xml."</xml>", |
my $ref = XMLin('<xml>'.$xml.'</xml>', |
78 |
ForceArray => [ |
ForceArray => [ |
79 |
'name', |
'name', |
80 |
'classification', |
'classification', |
81 |
'topic', |
'topic', |
|
'udc', |
|
82 |
], |
], |
83 |
KeyAttr => { |
KeyAttr => { |
84 |
'namePart' => 'type', |
'namePart' => 'type', |
85 |
'identifier' => 'type', |
'identifier' => 'type', |
|
'classification' => 'authority', |
|
86 |
'namePart' => 'type', |
'namePart' => 'type', |
87 |
'role' => 'type', |
'role' => 'type', |
88 |
}, |
}, |
94 |
ContentKey => '-content', |
ContentKey => '-content', |
95 |
); |
); |
96 |
|
|
97 |
my $m=$marc->createrecord(); |
my $m_cache; |
98 |
|
|
99 |
sub marc_arr { |
sub marc_add { |
100 |
my $m = shift || die "no marc record?"; |
my $m_cache = \shift || die "need m_cache"; |
101 |
my $fld = shift || die "no marc field?"; |
my $fld = shift || die "need field!"; |
102 |
my $sf = shift || ''; |
my $sf = shift; |
103 |
|
my $data = shift || return; |
104 |
|
|
105 |
return if (! @_); |
#print "add: $fld",($sf ? "^".$sf : ''),": $data\n"; |
106 |
|
|
107 |
my @a; |
if ($sf) { |
108 |
foreach (@_) { |
push @{$$m_cache->{tmp}->{$fld}}, $sf; |
|
next if (! $_); |
|
|
push @a,$sf; |
|
|
# push @a,$utf2iso->convert($_) || $_; |
|
|
push @a,$_; |
|
109 |
} |
} |
110 |
|
push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data); |
111 |
|
} |
112 |
|
|
113 |
return if (! @a); |
sub marc_rep { |
114 |
|
my $m_cache = \shift || die "need m_cache"; |
115 |
|
foreach my $fld (@_) { |
116 |
|
#print "marc_rep: $fld\n"; |
117 |
|
push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld}); |
118 |
|
delete $$m_cache->{tmp}->{$fld}; |
119 |
|
} |
120 |
|
} |
121 |
|
|
122 |
# print "storing $fld: ",join("|",@a),"\n"; |
sub marc_single { |
123 |
|
my $m_cache = \shift || die "need m_cache"; |
124 |
|
foreach my $fld (@_) { |
125 |
|
#print "marc_single: $fld\n"; |
126 |
|
|
127 |
$marc->addfield({record=>$m, |
die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld}); |
128 |
field=>$fld, |
|
129 |
# i1=>$i1, |
$$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld}); |
130 |
# i2=>$i2, |
delete $$m_cache->{tmp}->{$fld}; |
131 |
value=>\@a}); |
} |
132 |
|
} |
133 |
|
|
134 |
|
sub marc_add_rep { |
135 |
|
my $m_cache = \shift || die "need m_cache"; |
136 |
|
my $fld = shift || die "need field!"; |
137 |
|
my $sf = shift; |
138 |
|
my $data = shift || return; |
139 |
|
|
140 |
|
marc_add($$m_cache,$fld,$sf,$data); |
141 |
|
marc_rep($$m_cache,$fld); |
142 |
} |
} |
143 |
|
|
144 |
marc_arr($m,'610','a',@{$ref->{subject}->{topic}}); |
sub marc_add_single { |
145 |
|
my $m_cache = \shift || die "need m_cache"; |
146 |
|
my $fld = shift || die "need field!"; |
147 |
|
my $sf = shift; |
148 |
|
my $data = shift || return; |
149 |
|
|
150 |
|
marc_add($$m_cache,$fld,$sf,$data); |
151 |
|
marc_single($$m_cache,$fld); |
152 |
|
} |
153 |
|
|
154 |
|
my $journal = 0; |
155 |
|
# Journals start with c- in our MODS |
156 |
|
$journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/); |
157 |
|
|
158 |
|
foreach my $t (@{$ref->{subject}->{topic}}) { |
159 |
|
marc_add($m_cache,'610','a', $t); |
160 |
|
marc_rep($m_cache,'610'); |
161 |
|
} |
162 |
|
|
163 |
my $fld = '700'; |
my $fld_700 = '700'; |
164 |
|
my $fld_710 = '710'; |
165 |
|
|
166 |
foreach my $name (@{$ref->{name}}) { |
foreach my $name (@{$ref->{name}}) { |
167 |
my $role = $name->{role}->{roleTerm}->{content}; |
my $role = $name->{role}->{roleTerm}->{content}; |
168 |
next if (! $role); |
next if (! $role); |
169 |
if ($role eq "author") { |
if ($role eq "author") { |
170 |
marc_arr($m,$fld,'a',$name->{namePart}->{family}); |
marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family}); |
171 |
marc_arr($m,$fld,'b',$name->{namePart}->{given}); |
marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given}); |
172 |
marc_arr($m,$fld,'4',$role); |
marc_add($m_cache,$fld_700,'4',$role); |
173 |
|
|
174 |
|
marc_rep($m_cache,$fld_700); |
175 |
|
|
176 |
# first author goes in 700, others in 701 |
# first author goes in 700, others in 701 |
177 |
$fld = '701'; |
$fld_700 = '701'; |
178 |
} elsif ($role eq "editor" or $role eq "illustrator") { |
} elsif ($role eq "editor" or $role eq "illustrator") { |
179 |
marc_arr($m,'702','a',$name->{namePart}->{family}); |
marc_add($m_cache,'702','a',$name->{namePart}->{family}); |
180 |
marc_arr($m,'702','b',$name->{namePart}->{given}); |
marc_add($m_cache,'702','b',$name->{namePart}->{given}); |
181 |
marc_arr($m,'702','4',$role); |
marc_add($m_cache,'702','4',$role); |
182 |
|
marc_rep($m_cache,'702'); |
183 |
|
} elsif ($role eq "corporate") { |
184 |
|
marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart}); |
185 |
|
$fld_710 = '711'; |
186 |
|
} elsif ($role eq "conference") { |
187 |
|
marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart}); |
188 |
|
$fld_710 = '711'; |
189 |
} else { |
} else { |
190 |
die "FATAL: don't know how to map role '$role'" if ($role); |
die "FATAL: don't know how to map role '$role'" if ($role); |
191 |
} |
} |
196 |
if ($note) { |
if ($note) { |
197 |
foreach my $n (split(/\s*;\s+/, $note)) { |
foreach my $n (split(/\s*;\s+/, $note)) { |
198 |
if ($n =~ s/bibliogr:\s+//i) { |
if ($n =~ s/bibliogr:\s+//i) { |
199 |
marc_arr($m,'320','a',"Bibliografija: $n"); |
marc_add_rep($m_cache,'320','a',"Bibliografija: $n"); |
200 |
} elsif ($n =~ s/ilustr:\s+//i) { |
} elsif ($n =~ s/ilustr:\s+//i) { |
201 |
marc_arr($m,'215','c', $n); |
marc_add($m_cache,'215','c', $n); |
202 |
} else { |
} else { |
203 |
marc_arr($m,'320','a',$n); |
marc_add_rep($m_cache,'320','a',$n); |
204 |
} |
} |
205 |
} |
} |
206 |
} |
} |
207 |
|
|
208 |
|
|
209 |
my $type = $ref->{identifier}->{type}; |
my $type = $ref->{identifier}->{type}; |
210 |
|
|
211 |
if ($type) { |
if ($type) { |
212 |
if ($type eq "isbn") { |
if ($type eq "isbn") { |
213 |
marc_arr($m,'010','a',$ref->{identifier}->{content}); |
marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content}); |
214 |
} elsif ($type eq "issn") { |
} elsif ($type eq "issn") { |
215 |
marc_arr($m,'011','a',$ref->{identifier}->{content}); |
marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content}); |
216 |
} else { |
} else { |
217 |
die "unknown identifier type $type"; |
die "unknown identifier type $type"; |
218 |
} |
} |
233 |
if ($tmp->{str}) { |
if ($tmp->{str}) { |
234 |
$data .= $tmp->{str}." str"; |
$data .= $tmp->{str}." str"; |
235 |
} |
} |
236 |
marc_arr($m,'210','a', $data) if ($data); |
marc_add($m_cache,'215','a', $data) if ($data); |
237 |
marc_arr($m,'210','d', $tmp->{visina}); |
marc_add($m_cache,'215','d', $tmp->{visina}); |
238 |
} |
} |
239 |
|
marc_rep($m_cache,'215'); |
240 |
|
|
241 |
marc_arr($m,'001','',$ref->{recordInfo}->{recordIdentifier}); |
marc_add_single($m_cache,'001',undef,$ref->{recordInfo}->{recordIdentifier}); |
242 |
|
|
243 |
marc_arr($m,'200','a',$ref->{titleInfo}->{title}); |
marc_add($m_cache,'200','a',$ref->{titleInfo}->{title}); |
244 |
marc_arr($m,'200','e',$ref->{titleInfo}->{subTitle}); |
marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle}); |
245 |
|
marc_single($m_cache,'200'); |
246 |
|
|
247 |
marc_arr($m,'675','a',$ref->{classification}->{udc}); |
foreach my $c (@{$ref->{classification}}) { |
248 |
|
if ($c->{'authority'} eq "udc") { |
249 |
|
marc_add_rep($m_cache,'675','a', $c->{'content'}); |
250 |
|
} |
251 |
|
} |
252 |
|
|
253 |
my $related = $ref->{relatedItem}->{type}; |
my $related = $ref->{relatedItem}->{type}; |
254 |
if ($related) { |
if ($related) { |
255 |
if ($related eq "series") { |
if ($related eq "series") { |
256 |
marc_arr($m,'675','a',$ref->{relatedItem}->{titleInfo}->{title}); |
marc_add($m_cache,'225','a',$ref->{relatedItem}->{titleInfo}->{title}); |
257 |
marc_arr($m,'999','a',$ref->{relatedItem}->{titleInfo}->{partNumber}); |
marc_add($m_cache,'999','a',$ref->{relatedItem}->{titleInfo}->{partNumber}); |
258 |
|
marc_rep($m_cache,'225','999'); |
259 |
} elsif ($related eq "preceding") { |
} elsif ($related eq "preceding") { |
260 |
marc_arr($m,'430','a',$ref->{relatedItem}->{titleInfo}->{title}); |
marc_add_rep($m_cache,'430','a',$ref->{relatedItem}->{titleInfo}->{title}); |
261 |
} else { |
} else { |
262 |
die "can't parse related item type $related" if ($related); |
die "can't parse related item type $related" if ($related); |
263 |
} |
} |
264 |
} |
} |
265 |
|
|
266 |
marc_arr($m,'205','a',$ref->{originInfo}->{edition}); |
marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition}); |
267 |
|
|
268 |
my $publisher = $ref->{originInfo}->{publisher}; |
my $publisher = $ref->{originInfo}->{publisher}; |
269 |
if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) { |
if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) { |
270 |
marc_arr($m,'210','a', $2); |
marc_add($m_cache,'210','a', $2); |
271 |
marc_arr($m,'210','c', $1); |
marc_add($m_cache,'210','c', $1); |
272 |
} else { |
} else { |
273 |
marc_arr($m,'210','c', $publisher); |
marc_add($m_cache,'210','c', $publisher); |
274 |
} |
} |
275 |
|
|
276 |
marc_arr($m,'326','a',$ref->{originInfo}->{frequency}); |
marc_add($m_cache,'210','a',$ref->{originInfo}->{place}); |
277 |
marc_arr($m,'326','a',$ref->{originInfo}->{place}); |
marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued}); |
278 |
|
|
279 |
|
marc_single($m_cache,'210'); |
280 |
|
|
281 |
marc_arr($m,'210','d',$ref->{originInfo}->{dateIssued}); |
marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal); |
282 |
|
|
283 |
$nr++; |
$nr++; |
284 |
print "$nr " if ($nr % 100 == 0); |
print "$nr " if ($nr % 100 == 0); |
285 |
|
|
286 |
$t->purge; # frees the memory |
# dump record |
287 |
} |
my $m=$marc->createrecord({leader=>"00000nam 2200000 a 4500"}); |
288 |
|
|
289 |
__END__ |
foreach my $fld (keys %{$m_cache->{array}}) { |
290 |
|
foreach my $arr (@{$m_cache->{array}->{$fld}}) { |
291 |
|
#print "array = ",Dumper($arr); |
292 |
|
my ($i1,$i2); |
293 |
|
# do we have indicators? |
294 |
|
if ($fld =~ m/^(.+)\t(.)(.)$/) { |
295 |
|
$fld = $1; |
296 |
|
($i1,$i2) = ($2,$3); |
297 |
|
} |
298 |
|
$marc->addfield({record=>$m, |
299 |
|
field=>$fld, |
300 |
|
i1=>$i1, |
301 |
|
i2=>$i2, |
302 |
|
value=>$arr |
303 |
|
}); |
304 |
|
} |
305 |
|
} |
306 |
|
|
307 |
KNJIGA = { |
foreach my $fld (keys %{$m_cache->{single}}) { |
308 |
610a 'subject' => [ |
#print "single = ",Dumper($m_cache->{single}->{$fld}); |
309 |
{ |
my ($i1,$i2); |
310 |
'topic' => [ |
# do we have indicators? |
311 |
'LIBRARIES-AUTOMATION', |
if ($fld =~ m/^(.+)\t(.)(.)$/) { |
312 |
'ELECTRONIC DATA PROCESSING-LIBRARY SCIENCE' |
$fld = $1; |
313 |
] |
($i1,$i2) = ($2,$3); |
314 |
} |
} |
315 |
], |
$marc->addfield({record=>$m, |
316 |
'name' => [ |
field=>$fld, |
317 |
{ |
i1=>$i1, |
318 |
'namePart' => { |
i2=>$i2, |
319 |
700b,701a... 'given' => 'Robert M.', |
value=>$m_cache->{single}->{$fld} |
320 |
700a,701b... 'family' => 'Hayes' |
}); |
321 |
}, |
} |
|
'type' => 'personal', |
|
|
'role' => { |
|
|
'roleTerm' => { |
|
|
7004,7014... 'content' => 'author', |
|
|
'type' => 'text' |
|
|
} |
|
|
} |
|
|
}, |
|
|
{ |
|
|
'namePart' => { |
|
|
702b 'given' => 'Joseph', |
|
|
702a 'family' => 'Becker' |
|
|
}, |
|
|
'type' => 'personal', |
|
|
'role' => { |
|
|
'roleTerm' => { |
|
|
7024 'content' => 'editor', |
|
|
'type' => 'text' |
|
|
} |
|
|
} |
|
|
}, |
|
|
{ |
|
|
'namePart' => { |
|
|
702b 'given' => 'Joseph', |
|
|
702a 'family' => 'Becker' |
|
|
}, |
|
|
'type' => 'personal', |
|
|
'role' => { |
|
|
'roleTerm' => { |
|
|
7024 'content' => 'illustrator', |
|
|
'type' => 'text' |
|
|
} |
|
|
} |
|
|
} |
|
|
], |
|
|
'note' => 'bibliogr: 645-647; kazalo; ilustr: ilustr.', |
|
|
'identifier' => { |
|
|
010a 'content' => '0-471-36483-5', |
|
|
'type' => 'isbn' |
|
|
}, |
|
|
215a;215d 'physicalDescription' => 'str: 688; pagin: xvi; visina: 24. cm', |
|
|
001 'recordInfo' => { |
|
|
'recordIdentifier' => 'k-7996-8073' |
|
|
}, |
|
|
200a 'titleInfo' => { |
|
|
'title' => 'Handbook of data processing for libraries' |
|
|
}, |
|
|
'typeOfResource' => 'text', |
|
|
675a 'classification' => { |
|
|
'udc' => '=20' |
|
|
}, |
|
|
225a 'relatedItem' => { |
|
|
'titleInfo' => { |
|
|
'title' => 'A WILEY-BECKER & HAYES SERIES BOOK' |
|
|
}, |
|
|
'type' => 'series' |
|
|
}, |
|
|
'originInfo' => { |
|
|
'issuance' => 'monographic', |
|
|
205a 'edition' => '2.', |
|
|
210c/210a 'publisher' => 'MELVILLE PUBLISHING COMPANY /LOS ANGELES, CALIFORNIA/', |
|
|
210d 'dateIssued' => '1974' |
|
|
} |
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
CASOPIS = { |
|
|
'identifier' => { |
|
|
011a 'content' => '1041-5173', |
|
|
'type' => 'issn' |
|
|
}, |
|
|
'recordInfo' => { |
|
|
'recordIdentifier' => 'c-1' |
|
|
}, |
|
|
'titleInfo' => { |
|
|
'title' => 'DBMS - CLIENT/SERVER COMPUTING' |
|
|
}, |
|
|
'typeOfResource' => 'text', |
|
|
999a 'relatedItem' => { |
|
|
'titleInfo' => { |
|
|
'partNumber' => 'g. 1990, vol. 137, br. 5' |
|
|
}, |
|
|
'type' => 'series' |
|
|
}, |
|
|
'classification' => { |
|
|
'udc' => '=20' |
|
|
}, |
|
|
'originInfo' => { |
|
|
326a 'frequency' => 'mjeseļæ½no', |
|
|
'issuance' => 'continuing', |
|
|
210a 'place' => 'SAN MATEO, KANADA', |
|
|
210c 'publisher' => 'M&T PUBLISHING INC.' |
|
|
} |
|
|
}; |
|
322 |
|
|
323 |
|
$m_cache = {}; |
324 |
|
|
325 |
=cut |
$t->purge; # frees the memory |
326 |
|
} |
327 |
|
|