/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm

This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk/lib/Biblio/Isis.pm

Parent Directory | Revision Log | View Patch Patch

-revision 10 by dpavlin,
Wed Dec 29 16:04:07 2004 UTC
+revision 19 by dpavlin,
Thu Dec 30 23:16:20 2004 UTC
 Line 2 
 package IsisDB;
  use strict;
  use Carp;
+ use File::Glob qw(:globally :nocase);
  use Data::Dumper;
  BEGIN {
          use Exporter ();
          use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
-         $VERSION     = 0.03;
+         $VERSION     = 0.07;
          @ISA         = qw (Exporter);
          #Give a hoot don't pollute, do not export more than needed by default
          @EXPORT      = qw ();
-Line 18 
 BEGIN {
+Line 20 
 BEGIN {
  =head1 NAME
- IsisDB - Read CDS/ISIS database
+ IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database
  =head1 SYNOPSIS
-   use IsisDB
+   use IsisDB;
    my $isis = new IsisDB(
          isisdb => './cds/cds',
    );
+   for(my $mfn = 1; $mfn <= $isis->{'maxmfn'}; $mfn++) {
+         print $isis->to_ascii($mfn),"\n";
+   }
  =head1 DESCRIPTION
- This module will read CDS/ISIS databases and create hash values out of it.
+ This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
- It can be used as perl-only alternative to OpenIsis module.
+ IsisMarc. It can be used as perl-only alternative to OpenIsis module.
+ It can create hash values from data in ISIS database (using C<to_hash>),
+ ASCII dump (using C<to_ascii>) or just hash with field names and packed
+ values (like C<^asomething^belse>).
+ Unique feature of this module is ability to C<include_deleted> records.
+ It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
+ fields which are zero sized will be filled with random junk from memory).
+ It also has support for identifiers (only if ISIS database is created by
+ IsisMarc), see C<to_hash>.
+ This will module will always be slower than OpenIsis module which use C
+ library. However, since it's written in perl, it's platform independent (so
+ you don't need C compiler), and can be easily modified. I hope that it
+ creates data structures which are easier to use than ones created by
+ OpenIsis, so reduced time in other parts of the code should compensate for
+ slower performance of this module (speed of reading ISIS database is
+ rarely an issue).
  =head1 METHODS
-Line 50 
 It can be used as perl-only alternative
+Line 76 
 It can be used as perl-only alternative
  # some binary reads
  #
- sub Read32 {
-         my $self = shift;
-         my $f = shift || die "Read32 needs file handle";
-         read($$f,$b,4) || die "can't read 4 bytes from $$f from position ".tell($f);
-         return unpack("l",$b);
- }
  =head2 new
- Open CDS/ISIS database
+ Open ISIS database
   my $isis = new IsisDB(
          isisdb => './cds/cds',
          read_fdt => 1,
-         debug => 1,
          include_deleted => 1,
+         hash_filter => sub {
+                 my $v = shift;
+                 $v =~ s#foo#bar#g;
+         },
+         debug => 1,
   );
  Options are described below:
-Line 75 
 Options are described below:
+Line 97 
 Options are described below:
  =item isisdb
- Prefix path to CDS/ISIS. It should contain full or relative path to database
+ This is full or relative path to ISIS database files which include
- and common prefix of C<.FDT>, C<.MST>, C<.CNT>, C<.XRF> and C<.MST> files.
+ common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
+ C<read_fdt> option) files.
+ In this example it uses C<./cds/cds.MST> and related files.
  =item read_fdt
  Boolean flag to specify if field definition table should be read. It's off
  by default.
- =item debug
+ =item include_deleted
- Dump a C<lot> of debugging output.
+ Don't skip logically deleted records in ISIS.
- =item include_deleted
+ =item hash_filter
+ Filter code ref which will be used before data is converted to hash.
- Don't skip logically deleted records.
+ =item debug
+ Dump a B<lot> of debugging output.
  =back
-Line 104 
 sub new {
+Line 133 
 sub new {
          croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
-         foreach my $v (qw{isisdb debug include_deleted}) {
+         foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
                  $self->{$v} = {@_}->{$v};
          }
+         my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
+         foreach my $f (@isis_files) {
+                 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
+                 $self->{lc($ext)."_file"} = $f;
+         }
+         my @must_exist = qw(mst xrf);
+         push @must_exist, "fdt" if ($self->{read_fdt});
+         foreach my $ext (@must_exist) {
+                 croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
+         }
+         print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
          # if you want to read .FDT file use read_fdt argument when creating class!
-         if ({@_}->{read_fdt} && -e $self->{isisdb}.".FDT") {
+         if ($self->{read_fdt} && -e $self->{fdt_file}) {
                  # read the $db.FDT file for tags
                  my $fieldzone=0;
-                 open(fileFDT, $self->{isisdb}.".FDT") || croak "can't read '$self->{isisdb}.FDT': $!";
+                 open(fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
                  while (<fileFDT>) {
                          chomp;
-Line 138 
 sub new {
+Line 183 
 sub new {
          # Get the Maximum MFN from $db.MST
-         open(fileMST,$self->{isisdb}.".MST") || croak "can't read '$self->{isisdb}.MST': $!";
+         open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
          # MST format:   (* = 32 bit signed)
          # CTLMFN*       always 0
-Line 146 
 sub new {
+Line 191 
 sub new {
          # NXTMFB*       last block allocated to master file
          # NXTMFP        offset to next available position in last block
          # MFTYPE        always 0 for user db file (1 for system)
-         seek(fileMST,4,0);
+         seek($self->{'fileMST'},4,0);
-         $self->{'NXTMFN'}=$self->Read32(\*fileMST) || carp "NXTNFN is zero";
+         my $buff;
+         read($self->{'fileMST'}, $buff, 4);
+         $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero";
          # save maximum MFN
          $self->{'maxmfn'} = $self->{'NXTMFN'} - 1;
-         close(fileMST);
+         print STDERR Dumper($self),"\n" if ($self->{debug});
+         # open files for later
+         open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
+         $self ? return $self : return undef;
+ }
+ =head2 read_cnt
+ This function is not really used by module, but can be useful to find info
+ about your index (if debugging it for example).
+   print Dumper($isis->read_cnt);
+ =cut
+ sub read_cnt  {
+         my $self = shift;
+         croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
          # Get the index information from $db.CNT
-         open(fileCNT, $self->{isisdb}.".CNT") || croak "can't read '$self->{isisdb}.CNT': $!";
+         open(fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
          # There is two 26 Bytes fixed lenght records
-Line 180 
 sub new {
+Line 252 
 sub new {
                  my $buff = shift || return;
                  my @arr = unpack("ssssssllls", $buff);
-                 print "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
+                 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
                  my $IDTYPE = shift @arr;
                  foreach (@flds) {
-                         $self->{$IDTYPE}->{$_} = abs(shift @arr);
+                         $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
                  }
          }
          my $buff;
          read(fileCNT, $buff, 26);
          $self->unpack_cnt($buff);
          read(fileCNT, $buff, 26);
          $self->unpack_cnt($buff);
          close(fileCNT);
-         print Dumper($self) if ($self->{debug});
+         return $self->{cnt};
-         # open files for later
-         open($self->{'fileXRF'}, $self->{isisdb}.".XRF") || croak "can't open '$self->{isisdb}.XRF': $!";
-         open($self->{'fileMST'}, $self->{isisdb}.".MST") || croak "can't open '$self->{isisdb}.MST': $!";
-         $self ? return $self : return undef;
  }
  =head2 fetch
-Line 215 
 Read record with selected MFN
+Line 280 
 Read record with selected MFN
    my $rec = $isis->fetch(55);
  Returns hash with keys which are field names and values are unpacked values
- for that field.
+ for that field like this:
+   $rec = {
+     '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
+     '990' => [ '2140', '88', 'HAY' ],
+   };
  =cut
-Line 224 
 sub fetch {
+Line 294 
 sub fetch {
          my $mfn = shift || croak "fetch needs MFN as argument!";
-         print "fetch: $mfn\n" if ($self->{debug});
+         # is mfn allready in memory?
+         my $old_mfn = $self->{'current_mfn'} || -1;
+         return if ($mfn == $old_mfn);
+         print STDERR "## fetch: $mfn\n" if ($self->{debug});
          # XXX check this?
          my $mfnpos=($mfn+int(($mfn-1)/127))*4;
-         print "seeking to $mfnpos in file '$self->{isisdb}.XRF'\n" if ($self->{debug});
+         print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
          seek($self->{'fileXRF'},$mfnpos,0);
+         my $buff;
          # read XRFMFB abd XRFMFP
-         my $pointer=$self->Read32(\*{$self->{'fileXRF'}});
+         read($self->{'fileXRF'}, $buff, 4);
+         my $pointer=unpack("l",$buff) || carp "pointer is null";
          my $XRFMFB = int($pointer/2048);
          my $XRFMFP = $pointer - ($XRFMFB*2048);
-         print "XRFMFB: $XRFMFB XRFMFP: $XRFMFP\n" if ($self->{debug});
-         # XXX fix this to be more readable!!
+         # (XRFMFB - 1) * 512 + XRFMFP
-         # e.g. (XRFMFB - 1) * 512 + XRFMFP
+         # why do i have to do XRFMFP % 1024 ?
-         my $offset = $pointer;
+         my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 1024);
-         my $offset2=int($offset/2048)-1;
-         my $offset22=int($offset/4096);
-         my $offset3=$offset-($offset22*4096);
-         if ($offset3>512) {
-                 $offset3=$offset3-2048;
-         }
-         my $offset4=($offset2*512)+$offset3;
-         print "$offset - $offset2 - $offset3 - $offset4\n" if ($self->{debug});
+         print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
          # Get Record Information
-         seek($self->{'fileMST'},$offset4,0);
+         seek($self->{'fileMST'},$blk_off,0);
-         my $value=$self->Read32(\*{$self->{'fileMST'}});
+         read($self->{'fileMST'}, $buff, 4);
+         my $value=unpack("l",$buff);
+         print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
          if ($value!=$mfn) {
- print ("Error: The MFN:".$mfn." is not found in MST(".$value.")");
+                 carp "Error: MFN ".$mfn." not found in MST(".$value.")";
-                 return -1;      # XXX deleted record?
+                 #return;                # XXX deleted record?
          }
  #       $MFRL=$self->Read16($fileMST);
-Line 272 
 print ("Error: The MFN:".$mfn." is not f
+Line 344 
 print ("Error: The MFN:".$mfn." is not f
  #       $NVF=$self->Read16($fileMST);
  #       $STATUS=$self->Read16($fileMST);
-         my $buff;
          read($self->{'fileMST'}, $buff, 14);
          my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);
-         print "MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
+         print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
          # delete old record
          delete $self->{record};
+         ## FIXME this is a bug
          if (! $self->{'include_deleted'} && $MFRL < 0) {
                  print "## logically deleted record $mfn, skipping...\n" if ($self->{debug});
                  return;
          }
+         warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
          # Get Directory Format
          my @FieldPOS;
-Line 295 
 print ("Error: The MFN:".$mfn." is not f
+Line 369 
 print ("Error: The MFN:".$mfn." is not f
          read($self->{'fileMST'}, $buff, 6 * $NVF);
-         my $fld_len = 0;
+         my $rec_len = 0;
          for (my $i = 0 ; $i < $NVF ; $i++) {
-Line 305 
 print ("Error: The MFN:".$mfn." is not f
+Line 379 
 print ("Error: The MFN:".$mfn." is not f
                  my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));
-                 print "TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
+                 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
                  # The TAG does not exists in .FDT so we set it to 0.
                  #
-Line 321 
 print ("Error: The MFN:".$mfn." is not f
+Line 395 
 print ("Error: The MFN:".$mfn." is not f
                  push @FieldPOS,$POS;
                  push @FieldLEN,$LEN;
-                 $fld_len += $LEN;
+                 $rec_len += $LEN;
          }
          # Get Variable Fields
-         read($self->{'fileMST'},$buff,$fld_len);
+         read($self->{'fileMST'},$buff,$rec_len);
+         print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
          for (my $i = 0 ; $i < $NVF ; $i++) {
                  # skip zero-sized fields
-Line 334 
 print ("Error: The MFN:".$mfn." is not f
+Line 410 
 print ("Error: The MFN:".$mfn." is not f
                  push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
          }
-         close(fileMST);
-         print Dumper($self) if ($self->{debug});
+         $self->{'current_mfn'} = $mfn;
+         print Dumper($self),"\n" if ($self->{debug});
          return $self->{'record'};
  }
  =head2 to_ascii
- Dump ascii output of selected MFN
+ Dump ASCII output of record with specified MFN
+   print $isis->to_ascii(42);
-   print $isis->to_ascii(55);
+ It outputs something like this:
+  ^aNew York^cNew York University press^dcop. 1988
+  2140
+  88
+  HAY
+ If C<read_fdt> is specified when calling C<new> it will display field names
+ from C<.FDT> file instead of numeric tags.
  =cut
-Line 359 
 sub to_ascii {
+Line 446 
 sub to_ascii {
          my $out = "0\t$mfn";
          foreach my $f (sort keys %{$rec}) {
-                 $out .= "\n$f\t".join("\n$f\t",@{$self->{record}->{$f}});
+                 my $fn = $self->tag_name($f);
+                 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
          }
          $out .= "\n";
-Line 367 
 sub to_ascii {
+Line 455 
 sub to_ascii {
          return $out;
  }
- #
+ =head2 to_hash
- # XXX porting from php left-over:
- #
+ Read record with specified MFN and convert it to hash
- # do I *REALLY* need those methods, or should I use
- # $self->{something} directly?
+   my $hash = $isis->to_hash($mfn);
- #
- # Probably direct usage is better!
- #
- sub TagName {
+ It has ability to convert characters (using C<hash_filter> from ISIS
+ database before creating structures enabling character re-mapping or quick
+ fix-up of data.
+ This function returns hash which is like this:
+   $hash = {
+     '210' => [
+                {
+                  'c' => 'New York University press',
+                  'a' => 'New York',
+                  'd' => 'cop. 1988'
+                }
+              ],
+     '990' => [
+                '2140',
+                '88',
+                'HAY'
+              ],
+   };
+ You can later use that hash to produce any output from ISIS data.
+ If database is created using IsisMarc, it will also have to special fields
+ which will be used for identifiers, C<i1> and C<i2> like this:
+   '200' => [
+              {
+                'i1' => '1',
+                'i2' => ' '
+                'a' => 'Goa',
+                'f' => 'Valdo D\'Arienzo',
+                'e' => 'tipografie e tipografi nel XVI secolo',
+              }
+            ],
+ This method will also create additional field C<000> with MFN.
+ =cut
+ sub to_hash {
          my $self = shift;
-         return $self->{TagName};
+         my $mfn = shift || confess "need mfn!";
+         # init record to include MFN as field 000
+         my $rec = { '000' => [ $mfn ] };
+         my $row = $self->fetch($mfn);
+         foreach my $k (keys %{$row}) {
+                 foreach my $l (@{$row->{$k}}) {
+                         # filter output
+                         $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
+                         my $val;
+                         # has identifiers?
+                         ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])//);
+                         # has subfields?
+                         if ($l =~ m/\^/) {
+                                 foreach my $t (split(/\^/,$l)) {
+                                         next if (! $t);
+                                         $val->{substr($t,0,1)} = substr($t,1);
+                                 }
+                         } else {
+                                 $val = $l;
+                         }
+                         push @{$rec->{$k}}, $val;
+                 }
+         }
+         return $rec;
  }
- sub NextMFN {
+ =head2 tag_name
+ Return name of selected tag
+  print $isis->tag_name('200');
+ =cut
+ sub tag_name {
          my $self = shift;
-         return $self->{NXTMFN};
+         my $tag = shift || return;
+         return $self->{'TagName'}->{$tag} || $tag;
  }
 ;
-Line 399 
 This module has been very lightly tested
+Line 566 
 This module has been very lightly tested
          dpavlin@rot13.org
          http://www.rot13.org/~dpavlin/
- This module is based heavily on code from LIBISIS.PHP - Library to read ISIS files V0.1.1
+ This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
- written in php and (c) 2000 Franck Martin - <franck@sopac.org> released under LGPL.
+ written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
  =head1 COPYRIGHT
-Line 413 
 LICENSE file included with this module.
+Line 580 
 LICENSE file included with this module.
  =head1 SEE ALSO
- L<http://www.openisis.org|OpenIsis>, perl(1).
+ OpenIsis web site L<http://www.openisis.org>
+ perl4lib site L<http://perl4lib.perl.org>

 Legend:



Removed from v.10
 


changed lines


 
Added in v.19
 Legend:



Removed from v.10
 


changed lines


 
Added in v.19
-Removed from v.10
+Added in v.19

	ViewVC Help
Powered by ViewVC 1.1.26