/[webpac]/trunk/tools/mods2unimarc.pl

This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk/tools/mods2unimarc.pl

Parent Directory | Revision Log | View Patch Patch

-revision 325 by dpavlin,
Fri May 14 16:38:22 2004 UTC
+revision 327 by dpavlin,
Sat May 15 18:54:41 2004 UTC
 Line 19 
 in this script.
  This script B<is somewhat specific> to MODS export from
  Faculty of Electrical Engineering and Computing
- so you might want to edit it
+ so you might want to edit it (among other things, it includes a lot
+ of fields which are in Croatian).
- =head1 WARNING
+ Feel free to hack this script and convert it to your own needs.
- This script is in state of flux.
+ =head1 CAVEAT
+ This script will parse input XML twice: once with C<XML::Twig> and
+ then each entry with C<XML::Simple> to produce in-memory structure.
+ That's because I wanted to keep node selection logical (and perl-like).
+ If you don't like it, you can rewrite this script to use XPATH. I tried
+ and failed (it seems that MODS is too complicated for my limited knowledge
+ of XPATH).
  =cut
-Line 37 
 use Data::Dumper;
+Line 46 
 use Data::Dumper;
  my $xml_file = "/data/tehnika/fer/all.xml";
  $xml_file = "/data/tehnika/fer/modsFER_1.xml";
+ $xml_file = "/data/tehnika/fer/mods-small.xml";
  my $marc_file = "fer.marc";
  $|=1;
-Line 44 
 my $nr = 0;
+Line 54 
 my $nr = 0;
  my $marc = MARC->new;
+ my $ENCODING = 'ISO-8859-2';
  my $twig=XML::Twig->new(
-         twig_roots => { 'mods' => \&item },
+         twig_roots => { 'mods' => \&mods },
-         output_encoding => 'iso-8859-2',
+         output_encoding => 'UTF8',
  );
- my $utf2iso = Text::Iconv->new("UTF-8", "ISO-8859-2");
+ my $utf2iso = Text::Iconv->new("UTF8", $ENCODING);
+ print "$xml_file: ";
  $twig->parsefile($xml_file);
  $twig->purge;
+ print "$nr\nSaving MARC file...\n";
  $marc->output({file=>"> $marc_file",'format'=>"usmarc"});
- sub item {
+ sub mods {
          my( $t, $elt)= @_;
          my $xml=$elt->xml_string;
-         my $ref = XMLin("<xml>".$xml."</xml>",
+         my $ref = XMLin('<xml>'.$xml.'</xml>',
                  ForceArray => [
                          'name',
                          'classification',
                          'topic',
-                         'udc',
                  ],
                  KeyAttr => {
                          'namePart' => 'type',
                          'identifier' => 'type',
-                         'classification' => 'authority',
                          'namePart' => 'type',
                          'role' => 'type',
                  },
-Line 82 
 sub item {
+Line 94 
 sub item {
                  ContentKey => '-content',
          );
-         my $m=$marc->createrecord();
+         my $m_cache;
-         sub marc_arr {
+         sub marc_add {
-                 my $m = shift || die "no marc record?";
+                 my $m_cache = \shift || die "need m_cache";
-                 my $fld = shift || die "no marc field?";
+                 my $fld = shift || die "need field!";
-                 my $sf = shift || '';
+                 my $sf = shift;
+                 my $data = shift || return;
-                 return if (! @_);
+ #print "add: $fld",($sf ? "^".$sf : ''),": $data\n";
-                 my @a;
+                 if ($sf) {
-                 foreach (@_) {
+                         push @{$$m_cache->{tmp}->{$fld}}, $sf;
-                         next if (! $_);
-                         push @a,$sf;
- #                       push @a,$utf2iso->convert($_) || $_;
-                         push @a,$_;
                  }
+                 push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data);
+         }
-                 return if (! @a);
+         sub marc_rep {
+                 my $m_cache = \shift || die "need m_cache";
+                 foreach my $fld (@_) {
+ #print "marc_rep: $fld\n";
+                         push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld});
+                         delete $$m_cache->{tmp}->{$fld};
+                 }
+         }
- #               print "storing $fld: ",join("|",@a),"\n";
+         sub marc_single {
+                 my $m_cache = \shift || die "need m_cache";
+                 foreach my $fld (@_) {
+ #print "marc_single: $fld\n";
-                 $marc->addfield({record=>$m,
+                         die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld});
-                         field=>$fld,
-         #               i1=>$i1,
+                         $$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld});
-         #               i2=>$i2,
+                         delete $$m_cache->{tmp}->{$fld};
-                 value=>\@a});
+                 }
+         }
+         sub marc_add_rep {
+                 my $m_cache = \shift || die "need m_cache";
+                 my $fld = shift || die "need field!";
+                 my $sf = shift;
+                 my $data = shift || return;
+                 marc_add($$m_cache,$fld,$sf,$data);
+                 marc_rep($$m_cache,$fld);
          }
-         marc_arr($m,'610','a',@{$ref->{subject}->{topic}});
+         sub marc_add_single {
+                 my $m_cache = \shift || die "need m_cache";
+                 my $fld = shift || die "need field!";
+                 my $sf = shift;
+                 my $data = shift || return;
+                 marc_add($$m_cache,$fld,$sf,$data);
+                 marc_single($$m_cache,$fld);
+         }
+         my $journal = 0;
+         # Journals start with c- in our MODS
+         $journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/);
+         foreach my $t (@{$ref->{subject}->{topic}}) {
+                 marc_add($m_cache,'610','a', $t);
+                 marc_rep($m_cache,'610');
+         }
-         my $fld = '700';
+         my $fld_700 = '700';
+         my $fld_710 = '710';
          foreach my $name (@{$ref->{name}}) {
                  my $role = $name->{role}->{roleTerm}->{content};
                  next if (! $role);
                  if ($role eq "author") {
-                         marc_arr($m,$fld,'a',$name->{namePart}->{family});
+                         marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family});
-                         marc_arr($m,$fld,'b',$name->{namePart}->{given});
+                         marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given});
-                         marc_arr($m,$fld,'4',$role);
+                         marc_add($m_cache,$fld_700,'4',$role);
+                         marc_rep($m_cache,$fld_700);
                          # first author goes in 700, others in 701
-                         $fld = '701';
+                         $fld_700 = '701';
                  } elsif ($role eq "editor" or $role eq "illustrator") {
-                         marc_arr($m,'702','a',$name->{namePart}->{family});
+                         marc_add($m_cache,'702','a',$name->{namePart}->{family});
-                         marc_arr($m,'702','b',$name->{namePart}->{given});
+                         marc_add($m_cache,'702','b',$name->{namePart}->{given});
-                         marc_arr($m,'702','4',$role);
+                         marc_add($m_cache,'702','4',$role);
+                         marc_rep($m_cache,'702');
+                 } elsif ($role eq "corporate") {
+                         marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart});
+                         $fld_710 = '711';
+                 } elsif ($role eq "conference") {
+                         marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart});
+                         $fld_710 = '711';
                  } else {
                          die "FATAL: don't know how to map role '$role'" if ($role);
                  }
-Line 138 
 sub item {
+Line 196 
 sub item {
          if ($note) {
                  foreach my $n (split(/\s*;\s+/, $note)) {
                          if ($n =~ s/bibliogr:\s+//i) {
-                                 marc_arr($m,'320','a',"Bibliografija: $n");
+                                 marc_add_rep($m_cache,'320','a',"Bibliografija: $n");
                          } elsif ($n =~ s/ilustr:\s+//i) {
-                                 marc_arr($m,'215','c', $n);
+                                 marc_add($m_cache,'215','c', $n);
                          } else {
-                                 marc_arr($m,'320','a',$n);
+                                 marc_add_rep($m_cache,'320','a',$n);
                          }
                  }
          }
          my $type = $ref->{identifier}->{type};
          if ($type) {
                  if ($type eq "isbn") {
-                         marc_arr($m,'010','a',$ref->{identifier}->{content});
+                         marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content});
                  } elsif ($type eq "issn") {
-                         marc_arr($m,'011','a',$ref->{identifier}->{content});
+                         marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content});
                  } else {
                          die "unknown identifier type $type";
                  }
-Line 175 
 sub item {
+Line 233 
 sub item {
                  if ($tmp->{str}) {
                          $data .= $tmp->{str}." str";
                  }
-                 marc_arr($m,'210','a', $data) if ($data);
+                 marc_add($m_cache,'215','a', $data) if ($data);
-                 marc_arr($m,'210','d', $tmp->{visina});
+                 marc_add($m_cache,'215','d', $tmp->{visina});
          }
+         marc_rep($m_cache,'215');
-         marc_arr($m,'001','',$ref->{recordInfo}->{recordIdentifier});
+         marc_add_single($m_cache,'001',undef,$ref->{recordInfo}->{recordIdentifier});
-         marc_arr($m,'200','a',$ref->{titleInfo}->{title});
+         marc_add($m_cache,'200','a',$ref->{titleInfo}->{title});
-         marc_arr($m,'200','e',$ref->{titleInfo}->{subTitle});
+         marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle});
+         marc_single($m_cache,'200');
-         marc_arr($m,'675','a',$ref->{classification}->{udc});
+         foreach my $c (@{$ref->{classification}}) {
+                 if ($c->{'authority'} eq "udc") {
+                         marc_add_rep($m_cache,'675','a', $c->{'content'});
+                 }
+         }
          my $related = $ref->{relatedItem}->{type};
          if ($related) {
                  if ($related eq "series") {
-                         marc_arr($m,'675','a',$ref->{relatedItem}->{titleInfo}->{title});
+                         marc_add($m_cache,'225','a',$ref->{relatedItem}->{titleInfo}->{title});
-                         marc_arr($m,'999','a',$ref->{relatedItem}->{titleInfo}->{partNumber});
+                         marc_add($m_cache,'999','a',$ref->{relatedItem}->{titleInfo}->{partNumber});
+                         marc_rep($m_cache,'225','999');
                  } elsif ($related eq "preceding") {
-                         marc_arr($m,'430','a',$ref->{relatedItem}->{titleInfo}->{title});
+                         marc_add_rep($m_cache,'430','a',$ref->{relatedItem}->{titleInfo}->{title});
                  } else {
                          die "can't parse related item type $related" if ($related);
                  }
          }
-         marc_arr($m,'205','a',$ref->{originInfo}->{edition});
+         marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition});
          my $publisher = $ref->{originInfo}->{publisher};
          if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) {
-                 marc_arr($m,'210','a', $2);
+                 marc_add($m_cache,'210','a', $2);
-                 marc_arr($m,'210','c', $1);
+                 marc_add($m_cache,'210','c', $1);
          } else {
-                 marc_arr($m,'210','c', $publisher);
+                 marc_add($m_cache,'210','c', $publisher);
          }
-         marc_arr($m,'326','a',$ref->{originInfo}->{frequency});
+         marc_add($m_cache,'210','a',$ref->{originInfo}->{place});
-         marc_arr($m,'326','a',$ref->{originInfo}->{place});
+         marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued});
+         marc_single($m_cache,'210');
-         marc_arr($m,'210','d',$ref->{originInfo}->{dateIssued});
+         marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal);
          $nr++;
          print "$nr " if ($nr % 100 == 0);
-         $t->purge;           # frees the memory
+         # dump record
- }
+         my $m=$marc->createrecord({leader=>"00000nam  2200000 a 4500"});
- __END__
+         foreach my $fld (keys %{$m_cache->{array}}) {
+                 foreach my $arr (@{$m_cache->{array}->{$fld}}) {
+ #print "array = ",Dumper($arr);
+                         my ($i1,$i2);
+                         # do we have indicators?
+                         if ($fld =~ m/^(.+)\t(.)(.)$/) {
+                                 $fld = $1;
+                                 ($i1,$i2) = ($2,$3);
+                         }
+                         $marc->addfield({record=>$m,
+                                 field=>$fld,
+                                 i1=>$i1,
+                                 i2=>$i2,
+                                 value=>$arr
+                         });
+                 }
+         }
- KNJIGA = {
+         foreach my $fld (keys %{$m_cache->{single}}) {
-a           'subject' => [
+ #print "single = ",Dumper($m_cache->{single}->{$fld});
-                           {
+                 my ($i1,$i2);
-                             'topic' => [
+                 # do we have indicators?
-                                              'LIBRARIES-AUTOMATION',
+                 if ($fld =~ m/^(.+)\t(.)(.)$/) {
-                                              'ELECTRONIC DATA PROCESSING-LIBRARY SCIENCE'
+                         $fld = $1;
-                                        ]
+                         ($i1,$i2) = ($2,$3);
-                           }
+                 }
-                         ],
+                 $marc->addfield({record=>$m,
-            'name' => [
+                         field=>$fld,
-                        {
+                         i1=>$i1,
-                          'namePart' => {
+                         i2=>$i2,
-b,701a...                                 'given' => 'Robert M.',
+                         value=>$m_cache->{single}->{$fld}
-a,701b...                                 'family' => 'Hayes'
+                 });
-                                        },
+         }
-                          'type' => 'personal',
-                          'role' => {
-                                      'roleTerm' => {
-,7014...                                         'content' => 'author',
-                                                      'type' => 'text'
-                                                    }
-                                    }
-                        },
-                        {
-                          'namePart' => {
-b                                     'given' => 'Joseph',
-a                                     'family' => 'Becker'
-                                        },
-                          'type' => 'personal',
-                          'role' => {
-                                      'roleTerm' => {
-                                                'content' => 'editor',
-                                                      'type' => 'text'
-                                                    }
-                                    }
-                        },
-                        {
-                          'namePart' => {
-b                                     'given' => 'Joseph',
-a                                     'family' => 'Becker'
-                                        },
-                          'type' => 'personal',
-                          'role' => {
-                                      'roleTerm' => {
-                                                'content' => 'illustrator',
-                                                      'type' => 'text'
-                                                    }
-                                    }
-                        }
-                      ],
-            'note' => 'bibliogr: 645-647; kazalo; ilustr: ilustr.',
-            'identifier' => {
-a                         'content' => '0-471-36483-5',
-                              'type' => 'isbn'
-                            },
-a;215d  'physicalDescription' => 'str: 688; pagin: xvi; visina: 24. cm',
-       'recordInfo' => {
-                              'recordIdentifier' => 'k-7996-8073'
-                            },
-a       'titleInfo' => {
-                             'title' => 'Handbook of data processing for libraries'
-                           },
-            'typeOfResource' => 'text',
-a       'classification' => {
-                                  'udc' => '=20'
-                                },
-a       'relatedItem' => {
-                               'titleInfo' => {
-                                                'title' => 'A WILEY-BECKER & HAYES  SERIES BOOK'
-                                              },
-                               'type' => 'series'
-                             },
-            'originInfo' => {
-                              'issuance' => 'monographic',
-a                         'edition' => '2.',
-c/210a                    'publisher' => 'MELVILLE PUBLISHING COMPANY /LOS ANGELES, CALIFORNIA/',
-d                         'dateIssued' => '1974'
-                            }
-          };
-  CASOPIS = {
-            'identifier' => {
-a                         'content' => '1041-5173',
-                              'type' => 'issn'
-                            },
-            'recordInfo' => {
-                              'recordIdentifier' => 'c-1'
-                            },
-            'titleInfo' => {
-                             'title' => 'DBMS - CLIENT/SERVER COMPUTING'
-                           },
-            'typeOfResource' => 'text',
-a       'relatedItem' => {
-                               'titleInfo' => {
-                                                'partNumber' => 'g. 1990, vol. 137, br. 5'
-                                              },
-                               'type' => 'series'
-                             },
-            'classification' => {
-                                  'udc' => '=20'
-                                },
-            'originInfo' => {
-a                         'frequency' => 'mjesečno',
-                              'issuance' => 'continuing',
-a                         'place' => 'SAN MATEO, KANADA',
-c                         'publisher' => 'M&T PUBLISHING INC.'
-                            }
-          };
+         $m_cache = {};
- =cut
+         $t->purge;           # frees the memory
+ }

 Legend:



Removed from v.325
 


changed lines


 
Added in v.327
 Legend:



Removed from v.325
 


changed lines


 
Added in v.327
-Removed from v.325
+Added in v.327

	ViewVC Help
Powered by ViewVC 1.1.26