/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm

This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk/lib/Biblio/Isis.pm

Parent Directory | Revision Log | View Patch Patch

-trunk/IsisDB.pm
revision 15 by dpavlin,
Wed Dec 29 22:46:40 2004 UTC
+trunk/lib/Biblio/Isis.pm
revision 45 by dpavlin,
Thu Jul  6 20:31:46 2006 UTC
 Line 1
- package IsisDB;
+ package Biblio::Isis;
  use strict;
  use Carp;
- use Data::Dumper;
+ use File::Glob qw(:globally :nocase);
  BEGIN {
          use Exporter ();
          use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
-         $VERSION     = 0.05;
+         $VERSION     = 0.14;
          @ISA         = qw (Exporter);
          #Give a hoot don't pollute, do not export more than needed by default
          @EXPORT      = qw ();
 Line 18 
 BEGIN {
  =head1 NAME
- IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database
+ Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
  =head1 SYNOPSIS
-   use IsisDB;
+   use Biblio::Isis;
-   my $isis = new IsisDB(
+   my $isis = new Biblio::Isis(
          isisdb => './cds/cds',
    );
-   for(my $mfn = 1; $mfn <= $isis->{'maxmfn'}; $mfn++) {
+   for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
          print $isis->to_ascii($mfn),"\n";
    }
  =head1 DESCRIPTION
  This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
- IsisMarc. It can be used as perl-only alternative to OpenIsis module.
+ IsisMarc. It can be used as perl-only alternative to OpenIsis module which
+ seems to deprecate its old C<XS> bindings for perl.
  It can create hash values from data in ISIS database (using C<to_hash>),
  ASCII dump (using C<to_ascii>) or just hash with field names and packed
-Line 48 
 fields which are zero sized will be fill
+Line 49 
 fields which are zero sized will be fill
  It also has support for identifiers (only if ISIS database is created by
  IsisMarc), see C<to_hash>.
- This will module will always be slower than OpenIsis module which use C
+ This module will always be slower than OpenIsis module which uses C
  library. However, since it's written in perl, it's platform independent (so
  you don't need C compiler), and can be easily modified. I hope that it
  creates data structures which are easier to use than ones created by
-Line 78 
 rarely an issue).
+Line 79 
 rarely an issue).
  Open ISIS database
-  my $isis = new IsisDB(
+  my $isis = new Biblio::Isis(
          isisdb => './cds/cds',
          read_fdt => 1,
          include_deleted => 1,
-Line 96 
 Options are described below:
+Line 97 
 Options are described below:
  =item isisdb
  This is full or relative path to ISIS database files which include
- common prefix of C<.FDT>, C<.MST>, C<.CNT>, C<.XRF> and C<.MST> files.
+ common prefix of C<.MST> and C<.XRF> and, optionally, C<.FDT> (if using
+ C<read_fdt> option) files.
  In this example it uses C<./cds/cds.MST> and related files.
-Line 119 
 Dump a B<lot> of debugging output.
+Line 121 
 Dump a B<lot> of debugging output.
  =back
- It will also set C<$isis-E<gt>{'maxmfn'}> which is maximum MFN stored in database.
  =cut
  sub new {
 Line 134 
 sub new {
                  $self->{$v} = {@_}->{$v};
          }
+         my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
+         foreach my $f (@isis_files) {
+                 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
+                 $self->{lc($ext)."_file"} = $f;
+         }
+         my @must_exist = qw(mst xrf);
+         push @must_exist, "fdt" if ($self->{read_fdt});
+         foreach my $ext (@must_exist) {
+                 unless ($self->{$ext."_file"}) {
+                         carp "missing ",uc($ext)," file in ",$self->{isisdb};
+                         return;
+                 }
+         }
+         if ($self->{debug}) {
+                 print STDERR "## using files: ",join(" ",@isis_files),"\n";
+                 eval "use Data::Dump";
+                 if (! $@) {
+                         *Dumper = *Data::Dump::dump;
+                 } else {
+                         use Data::Dumper;
+                 }
+         }
          # if you want to read .FDT file use read_fdt argument when creating class!
-         if ({@_}->{read_fdt} && -e $self->{isisdb}.".FDT") {
+         if ($self->{read_fdt} && -e $self->{fdt_file}) {
                  # read the $db.FDT file for tags
                  my $fieldzone=0;
-                 open(fileFDT, $self->{isisdb}.".FDT") || croak "can't read '$self->{isisdb}.FDT': $!";
+                 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
+                 binmode($fileFDT);
-                 while (<fileFDT>) {
+                 while (<$fileFDT>) {
                          chomp;
                          if ($fieldzone) {
                                  my $name=substr($_,0,30);
-Line 159 
 sub new {
+Line 188 
 sub new {
                          }
                  }
-                 close(fileFDT);
+                 close($fileFDT);
          }
          # Get the Maximum MFN from $db.MST
-         open(fileMST,$self->{isisdb}.".MST") || croak "can't read '$self->{isisdb}.MST': $!";
+         open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
+         binmode($self->{'fileMST'});
          # MST format:   (* = 32 bit signed)
          # CTLMFN*       always 0
-Line 172 
 sub new {
+Line 202 
 sub new {
          # NXTMFB*       last block allocated to master file
          # NXTMFP        offset to next available position in last block
          # MFTYPE        always 0 for user db file (1 for system)
-         seek(fileMST,4,0);
+         seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
          my $buff;
-         read(fileMST, $buff, 4);
+         read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
-         $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero";
+         $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
-         # save maximum MFN
-         $self->{'maxmfn'} = $self->{'NXTMFN'} - 1;
-         close(fileMST);
+         print STDERR "## self ",Dumper($self),"\n" if ($self->{debug});
-         # Get the index information from $db.CNT
+         # open files for later
+         open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
-         open(fileCNT, $self->{isisdb}.".CNT") || croak "can't read '$self->{isisdb}.CNT': $!";
+         binmode($self->{'fileXRF'});
-         # There is two 26 Bytes fixed lenght records
-         #  0: IDTYPE    BTree type                              16
-         #  2: ORDN      Nodes Order                             16
-         #  4: ORDF      Leafs Order                             16
-         #  6: N         Number of Memory buffers for nodes      16
-         #  8: K         Number of buffers for first level index 16
-         # 10: LIV       Current number of Index Levels          16
-         # 12: POSRX*    Pointer to Root Record in N0x           32
-         # 16: NMAXPOS*  Next Available position in N0x          32
-         # 20: FMAXPOS*  Next available position in L0x          32
-         # 24: ABNORMAL  Formal BTree normality indicator        16
-         # length: 26 bytes
-         sub unpack_cnt {
-                 my $self = shift;
-                 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
-                 my $buff = shift || return;
-                 my @arr = unpack("ssssssllls", $buff);
-                 print "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
-                 my $IDTYPE = shift @arr;
-                 foreach (@flds) {
-                         $self->{$IDTYPE}->{$_} = abs(shift @arr);
-                 }
-         }
-         read(fileCNT, $buff, 26);
-         $self->unpack_cnt($buff);
-         read(fileCNT, $buff, 26);
-         $self->unpack_cnt($buff);
+         $self ? return $self : return undef;
+ }
-         close(fileCNT);
+ =head2 count
-         print Dumper($self),"\n" if ($self->{debug});
+ Return number of records in database
-         # open files for later
+   print $isis->count;
-         open($self->{'fileXRF'}, $self->{isisdb}.".XRF") || croak "can't open '$self->{isisdb}.XRF': $!";
-         open($self->{'fileMST'}, $self->{isisdb}.".MST") || croak "can't open '$self->{isisdb}.MST': $!";
+ =cut
-         $self ? return $self : return undef;
+ sub count {
+         my $self = shift;
+         return $self->{'NXTMFN'} - 1;
  }
  =head2 fetch
-Line 258 
 sub fetch {
+Line 252 
 sub fetch {
          my $mfn = shift || croak "fetch needs MFN as argument!";
-         print "fetch: $mfn\n" if ($self->{debug});
+         # is mfn already in memory?
+         my $old_mfn = $self->{'current_mfn'} || -1;
+         return $self->{record} if ($mfn == $old_mfn);
+         print STDERR "## fetch: $mfn\n" if ($self->{debug});
          # XXX check this?
          my $mfnpos=($mfn+int(($mfn-1)/127))*4;
-         print "seeking to $mfnpos in file '$self->{isisdb}.XRF'\n" if ($self->{debug});
+         print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
          seek($self->{'fileXRF'},$mfnpos,0);
          my $buff;
+         # delete old record
+         delete $self->{record};
          # read XRFMFB and XRFMFP
          read($self->{'fileXRF'}, $buff, 4);
-         my $pointer=unpack("l",$buff) || carp "pointer is null";
+         my $pointer=unpack("V",$buff);
+         if (! $pointer) {
+                 if ($self->{include_deleted}) {
+                         return;
+                 } else {
+                         warn "pointer for MFN $mfn is null\n";
+                         return;
+                 }
+         }
+         # check for logically deleted record
+         if ($pointer & 0x80000000) {
+                 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
+                 $self->{deleted} = $mfn;
+                 return unless $self->{include_deleted};
+                 # abs
+                 $pointer = ($pointer ^ 0xffffffff) + 1;
+         }
          my $XRFMFB = int($pointer/2048);
          my $XRFMFP = $pointer - ($XRFMFB*2048);
-         print "XRFMFB: $XRFMFB XRFMFP: $XRFMFP\n" if ($self->{debug});
+         # (XRFMFB - 1) * 512 + XRFMFP
+         # why do i have to do XRFMFP % 1024 ?
-         # XXX fix this to be more readable!!
-         # e.g. (XRFMFB - 1) * 512 + XRFMFP
-         my $offset = $pointer;
+         my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
-         my $offset2=int($offset/2048)-1;
-         my $offset22=int($offset/4096);
-         my $offset3=$offset-($offset22*4096);
-         if ($offset3>512) {
-                 $offset3=$offset3-2048;
-         }
-         my $offset4=($offset2*512)+$offset3;
-         print "$offset - $offset2 - $offset3 - $offset4\n" if ($self->{debug});
+         print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
          # Get Record Information
-         seek($self->{'fileMST'},$offset4,0);
+         seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
-         read($self->{'fileMST'}, $buff, 4);
+         read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
-         my $value=unpack("l",$buff);
+         my $value=unpack("V",$buff);
+         print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
          if ($value!=$mfn) {
- print ("Error: The MFN:".$mfn." is not found in MST(".$value.")");
+                 if ($value == 0) {
-                 return -1;      # XXX deleted record?
+                         print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
-         }
+                         $self->{deleted} = $mfn;
+                         return;
+                 }
- #       $MFRL=$self->Read16($fileMST);
+                 carp "Error: MFN ".$mfn." not found in MST file, found $value";
- #       $MFBWB=$self->Read32($fileMST);
+                 return;
- #       $MFBWP=$self->Read16($fileMST);
+         }
- #       $BASE=$self->Read16($fileMST);
- #       $NVF=$self->Read16($fileMST);
- #       $STATUS=$self->Read16($fileMST);
          read($self->{'fileMST'}, $buff, 14);
-         my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);
+         my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
-         print "MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
+         print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
-         # delete old record
+         warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
-         delete $self->{record};
-         if (! $self->{'include_deleted'} && $MFRL < 0) {
+         warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
-                 print "## logically deleted record $mfn, skipping...\n" if ($self->{debug});
-                 return;
-         }
          # Get Directory Format
-Line 332 
 print ("Error: The MFN:".$mfn." is not f
+Line 340 
 print ("Error: The MFN:".$mfn." is not f
          read($self->{'fileMST'}, $buff, 6 * $NVF);
-         my $fld_len = 0;
+         my $rec_len = 0;
          for (my $i = 0 ; $i < $NVF ; $i++) {
- #               $TAG=$self->Read16($fileMST);
+                 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
- #               $POS=$self->Read16($fileMST);
- #               $LEN=$self->Read16($fileMST);
-                 my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));
-                 print "TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
+                 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
                  # The TAG does not exists in .FDT so we set it to 0.
                  #
-Line 358 
 print ("Error: The MFN:".$mfn." is not f
+Line 362 
 print ("Error: The MFN:".$mfn." is not f
                  push @FieldPOS,$POS;
                  push @FieldLEN,$LEN;
-                 $fld_len += $LEN;
+                 $rec_len += $LEN;
          }
          # Get Variable Fields
-         read($self->{'fileMST'},$buff,$fld_len);
+         read($self->{'fileMST'},$buff,$rec_len);
+         print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
          for (my $i = 0 ; $i < $NVF ; $i++) {
                  # skip zero-sized fields
-Line 371 
 print ("Error: The MFN:".$mfn." is not f
+Line 377 
 print ("Error: The MFN:".$mfn." is not f
                  push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
          }
-         close(fileMST);
-         print Dumper($self),"\n" if ($self->{debug});
+         $self->{'current_mfn'} = $mfn;
+         print STDERR Dumper($self),"\n" if ($self->{debug});
          return $self->{'record'};
  }
  =head2 to_ascii
- Dump ASCII output of record with specified MFN
+ Returns ASCII output of record with specified MFN
    print $isis->to_ascii(42);
- It outputs something like this:
+ This outputs something like this:
   ^aNew York^cNew York University press^dcop. 1988
   2140
-Line 401 
 sub to_ascii {
+Line 408 
 sub to_ascii {
          my $mfn = shift || croak "need MFN";
-         my $rec = $self->fetch($mfn);
+         my $rec = $self->fetch($mfn) || return;
          my $out = "0\t$mfn";
-Line 421 
 Read record with specified MFN and conve
+Line 428 
 Read record with specified MFN and conve
    my $hash = $isis->to_hash($mfn);
- It has ability to convert characters (using C<hash_filter> from ISIS
+ It has ability to convert characters (using C<hash_filter>) from ISIS
  database before creating structures enabling character re-mapping or quick
  fix-up of data.
-Line 467 
 sub to_hash {
+Line 474 
 sub to_hash {
          my $mfn = shift || confess "need mfn!";
          # init record to include MFN as field 000
-         my $rec = { '000' => $mfn };
+         my $rec = { '000' => [ $mfn ] };
-         my $row = $self->fetch($mfn);
+         my $row = $self->fetch($mfn) || return;
          foreach my $k (keys %{$row}) {
                  foreach my $l (@{$row->{$k}}) {
                          # filter output
-                         $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
+                         if ($self->{'hash_filter'}) {
+                                 $l = $self->{'hash_filter'}->($l);
+                                 next unless defined($l);
+                         }
                          my $val;
                          # has identifiers?
-                         ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])//);
+                         ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
                          # has subfields?
                          if ($l =~ m/\^/) {
-Line 513 
 sub tag_name {
+Line 523 
 sub tag_name {
          return $self->{'TagName'}->{$tag} || $tag;
  }
+ =head2 read_cnt
+ Read content of C<.CNT> file and return hash containing it.
+   print Dumper($isis->read_cnt);
+ This function is not used by module (C<.CNT> files are not required for this
+ module to work), but it can be useful to examine your index (while debugging
+ for example).
+ =cut
+ sub read_cnt  {
+         my $self = shift;
+         croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
+         # Get the index information from $db.CNT
+         open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
+         binmode($fileCNT);
+         my $buff;
+         read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
+         $self->unpack_cnt($buff);
+         read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
+         $self->unpack_cnt($buff);
+         close($fileCNT);
+         return $self->{cnt};
+ }
+ =head2 unpack_cnt
+ Unpack one of the two 26-byte fixed-length records in the C<.CNT> file.
+ Here is definition of record:
+  off key        description                             size
  0: IDTYPE     BTree type                              s
  2: ORDN       Nodes Order                             s
  4: ORDF       Leafs Order                             s
  6: N          Number of Memory buffers for nodes      s
  8: K          Number of buffers for first level index s
 10: LIV        Current number of Index Levels          s
 12: POSRX      Pointer to Root Record in N0x           l
 16: NMAXPOS    Next Available position in N0x          l
 20: FMAXPOS    Next available position in L0x          l
 24: ABNORMAL   Formal BTree normality indicator        s
+  length: 26 bytes
+ This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
+ =cut
+ sub unpack_cnt {
+         my $self = shift;
+         my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
+         my $buff = shift || return;
+         my @arr = unpack("vvvvvvVVVv", $buff);
+         print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
+         my $IDTYPE = shift @arr;
+         foreach (@flds) {
+                 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
+         }
+ }
 ;
  =head1 BUGS
- This module has been very lightly tested. Use with caution and report bugs.
+ Some parts of CDS/ISIS documentation are not detailed enough to explain
+ some variations in input databases which have been tested with this module.
+ When I was in doubt, I assumed that OpenIsis's implementation was right
+ (except for obvious bugs).
+ However, every effort has been made to test this module with as many
+ databases (and programs that create them) as possible.
+ I would be very grateful for success or failure reports about usage of this
+ module with databases from programs other than WinIsis and IsisMarc. I have
+ tested this against output of one C<isis.dll>-based application, but I don't
+ know any details about its version.
  =head1 AUTHOR

 Legend:



Removed from v.15
 


changed lines


 
Added in v.45
 Legend:



Removed from v.15
 


changed lines


 
Added in v.45
-Removed from v.15
+Added in v.45

	ViewVC Help
Powered by ViewVC 1.1.26