/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm

This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk/lib/Biblio/Isis.pm

Parent Directory | Revision Log | View Patch Patch

-trunk/IsisDB.pm
revision 2 by dpavlin,
Tue Dec 28 01:41:45 2004 UTC
+trunk/lib/Biblio/Isis.pm
revision 62 by dpavlin,
Mon Jul 10 12:01:04 2006 UTC
 Line 1
- package IsisDB;
+ package Biblio::Isis;
  use strict;
  use Carp;
- use Data::Dumper;
+ use File::Glob qw(:globally :nocase);
  BEGIN {
          use Exporter ();
          use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
-         $VERSION     = 0.01;
+         $VERSION     = 0.22_2;
          @ISA         = qw (Exporter);
          #Give a hoot don't pollute, do not export more than needed by default
          @EXPORT      = qw ();
 Line 18 
 BEGIN {
  =head1 NAME
- IsisDB - Read CDS/ISIS database
+ Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
  =head1 SYNOPSIS
-   use IsisDB
+   use Biblio::Isis;
-   my $isis = new IsisDB(
+   my $isis = new Biblio::Isis(
          isisdb => './cds/cds',
    );
+   for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
+         print $isis->to_ascii($mfn),"\n";
+   }
  =head1 DESCRIPTION
- This module will read CDS/ISIS databases and create hash values out of it.
+ This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
- It can be used as perl-only alternative to OpenIsis module.
+ IsisMarc. It can be used as perl-only alternative to OpenIsis module which
+ seems to deprecate its old C<XS> bindings for perl.
+ It can create hash values from data in ISIS database (using C<to_hash>),
+ ASCII dump (using C<to_ascii>) or just hash with field names and packed
+ values (like C<^asomething^belse>).
+ A unique feature of this module is the ability to C<include_deleted> records.
+ It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
+ fields which are zero sized will be filled with random junk from memory).
+ It also has support for identifiers (only if ISIS database is created by
+ IsisMarc), see C<to_hash>.
+ This module will always be slower than the OpenIsis module, which uses a C
+ library. However, since it's written in perl, it's platform independent (so
+ you don't need C compiler), and can be easily modified. I hope that it
+ creates data structures which are easier to use than ones created by
+ OpenIsis, so reduced time in other parts of the code should compensate for
+ slower performance of this module (speed of reading ISIS database is
+ rarely an issue).
  =head1 METHODS
-Line 50 
 It can be used as perl-only alternative
+Line 75 
 It can be used as perl-only alternative
  # some binary reads
  #
- sub Read32 {
-         my $self = shift;
-         my $f = shift || die "Read32 needs file handle";
-         read($$f,$b,4) || die "can't read 4 bytes from $$f from position ".tell($f);
-         return unpack("l",$b);
- }
  =head2 new
- Open CDS/ISIS database
+ Open ISIS database
-  my $isis = new IsisDB(
+  my $isis = new Biblio::Isis(
          isisdb => './cds/cds',
          read_fdt => 1,
+         include_deleted => 1,
+         hash_filter => sub {
+                 my $v = shift;
+                 $v =~ s#foo#bar#g;
+         },
          debug => 1,
+         join_subfields_with => ' ; ',
+         regexps => [
+                 's/something/else/g',
+         ],
   );
  Options are described below:
-Line 74 
 Options are described below:
+Line 100 
 Options are described below:
  =item isisdb
- Prefix path to CDS/ISIS. It should contain full or relative path to database
+ This is full or relative path to ISIS database files which include
- and common prefix of C<.FDT>, C<.MST>, C<.CNT>, C<.XRF> and C<.MST> files.
+ common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
+ C<read_fdt> option) files.
+ In this example it uses C<./cds/cds.MST> and related files.
  =item read_fdt
  Boolean flag to specify if field definition table should be read. It's off
  by default.
+ =item include_deleted
+ Don't skip logically deleted records in ISIS.
+ =item hash_filter
+ Filter code ref which will be used before data is converted to hash.
  =item debug
- Dump a C<lot> of debugging output.
+ Dump a B<lot> of debugging output even at level 1. For even more, increase the level.
- =back
+ =item join_subfields_with
+ Define delimiter which will be used to join repeatable subfields. This
+ option is included to support legacy applications written against versions
+ older than 0.21 of this module. By default, it is disabled. See L</to_hash>.
+ =item regexps
- It will also set C<$isis-E<gt>{'maxmfn'}> which is maximum MFN stored in database.
+ Define any number of regular expressions to apply to field values before
+ they are split into subfields. This is a great place to split subfields in
+ the input into multiple subfields if needed, or to rename subfields.
+ =back
  =cut
-Line 97 
 sub new {
+Line 144 
 sub new {
          my $self = {};
          bless($self, $class);
-         $self->{isisdb} = {@_}->{isisdb} || croak "new needs database name as argument!";
+         croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
+         foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
+                 $self->{$v} = {@_}->{$v};
+         }
+         my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
+         foreach my $f (@isis_files) {
+                 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
+                 $self->{lc($ext)."_file"} = $f;
+         }
-         $self->{debug} = {@_}->{debug};
+         my @must_exist = qw(mst xrf);
+         push @must_exist, "fdt" if ($self->{read_fdt});
+         foreach my $ext (@must_exist) {
+                 unless ($self->{$ext."_file"}) {
+                         carp "missing ",uc($ext)," file in ",$self->{isisdb};
+                         return;
+                 }
+         }
+         if ($self->{debug}) {
+                 print STDERR "## using files: ",join(" ",@isis_files),"\n";
+                 eval "use Data::Dump";
+                 if (! $@) {
+                         *Dumper = *Data::Dump::dump;
+                 } else {
+                         use Data::Dumper;
+                 }
+         }
          # if you want to read .FDT file use read_fdt argument when creating class!
-         if ({@_}->{read_fdt} && -e $self->{isisdb}.".FDT") {
+         if ($self->{read_fdt} && -e $self->{fdt_file}) {
                  # read the $db.FDT file for tags
                  my $fieldzone=0;
-                 open(fileFDT, $self->{isisdb}.".FDT") || croak "can't read '$self->{isisdb}.FDT': $!";
+                 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
+                 binmode($fileFDT);
-                 while (<fileFDT>) {
+                 while (<$fileFDT>) {
                          chomp;
                          if ($fieldzone) {
                                  my $name=substr($_,0,30);
-Line 126 
 sub new {
+Line 204 
 sub new {
                          }
                  }
-                 close(fileFDT);
+                 close($fileFDT);
          }
          # Get the Maximum MFN from $db.MST
-         open(fileMST,$self->{isisdb}.".MST") || croak "can't read '$self->{isisdb}.MST': $!";
+         open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
+         binmode($self->{'fileMST'});
          # MST format:   (* = 32 bit signed)
          # CTLMFN*       always 0
-Line 139 
 sub new {
+Line 218 
 sub new {
          # NXTMFB*       last block allocated to master file
          # NXTMFP        offset to next available position in last block
          # MFTYPE        always 0 for user db file (1 for system)
-         seek(fileMST,4,0);
+         seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
-         $self->{'NXTMFN'}=$self->Read32(\*fileMST) || carp "NXTNFN is zero";
-         # save maximum MFN
-         $self->{'maxmfn'} = $self->{'NXTMFN'} - 1;
-         close(fileMST);
+         my $buff;
-         # Get the index information from $db.CNT
+         read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
+         $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
-         open(fileCNT, $self->{isisdb}.".CNT") || croak "can't read '$self->{isisdb}.CNT': $!";
-         # There is two 26 Bytes fixed lenght records
+         print STDERR "## self ",Dumper($self),"\n" if ($self->{debug});
-         #  0: IDTYPE    BTree type                              16
+         # open files for later
-         #  2: ORDN      Nodes Order                             16
+         open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
-         #  4: ORDF      Leafs Order                             16
+         binmode($self->{'fileXRF'});
-         #  6: N         Number of Memory buffers for nodes      16
-         #  8: K         Number of buffers for first level index 16
-         # 10: LIV       Current number of Index Levels          16
-         # 12: POSRX*    Pointer to Root Record in N0x           32
-         # 16: NMAXPOS*  Next Available position in N0x          32
-         # 20: FMAXPOS*  Next available position in L0x          32
-         # 24: ABNORMAL  Formal BTree normality indicator        16
-         # length: 26 bytes
-         sub unpack_cnt {
-                 my $self = shift;
-                 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
-                 my $buff = shift || return;
-                 my @arr = unpack("ssssssllls", $buff);
-                 my $IDTYPE = shift @arr;
-                 foreach (@flds) {
-                         $self->{$IDTYPE}->{$_} = abs(shift @arr);
-                 }
-         }
-         my $buff;
+         $self ? return $self : return undef;
-         read(fileCNT, $buff, 26);
+ }
-         $self->unpack_cnt($buff);
-         read(fileCNT, $buff, 26);
+ =head2 count
-         $self->unpack_cnt($buff);
+ Return number of records in database
-         close(fileCNT);
+   print $isis->count;
-         print Dumper($self) if ($self->{debug});
+ =cut
-         $self ? return $self : return undef;
+ sub count {
+         my $self = shift;
+         return $self->{'NXTMFN'} - 1;
  }
- =head2 GetMFN
+ =head2 fetch
  Read record with selected MFN
-   my $rec = $isis->GetMFN(55);
+   my $rec = $isis->fetch(55);
  Returns hash with keys which are field names and values are unpacked values
- for that field.
+ for that field like this:
+   $rec = {
+     '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
+     '990' => [ '2140', '88', 'HAY' ],
+   };
  =cut
- sub GetMFN {
+ sub fetch {
          my $self = shift;
-         my $mfn = shift || croak "GetMFN needs MFN as argument!";
+         my $mfn = shift || croak "fetch needs MFN as argument!";
-         print "GetMFN: $mfn\n" if ($self->{debug});
+         # is mfn allready in memory?
+         my $old_mfn = $self->{'current_mfn'} || -1;
+         return $self->{record} if ($mfn == $old_mfn);
-         open(fileXRF, $self->{isisdb}.".XRF") || croak "can't open '$self->{isisdb}.XRF': $!";
+         print STDERR "## fetch: $mfn\n" if ($self->{debug});
          # XXX check this?
          my $mfnpos=($mfn+int(($mfn-1)/127))*4;
-         print "seeking to $mfnpos in file '$self->{isisdb}.XRF'\n" if ($self->{debug});
+         print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
-         seek(fileXRF,$mfnpos,0);
+         seek($self->{'fileXRF'},$mfnpos,0);
-         # read XRFMFB abd XRFMFP
+         my $buff;
-         my $pointer=$self->Read32(\*fileXRF);
-         my $XRFMFB = int($pointer/2048);
+         # delete old record
-         my $XRFMFP = $pointer - ($XRFMFB*2048);
+         delete $self->{record};
+         # read XRFMFB abd XRFMFP
+         read($self->{'fileXRF'}, $buff, 4);
+         my $pointer=unpack("V",$buff);
+         if (! $pointer) {
+                 if ($self->{include_deleted}) {
+                         return;
+                 } else {
+                         warn "pointer for MFN $mfn is null\n";
+                         return;
+                 }
+         }
-         print "XRFMFB: $XRFMFB XRFMFP: $XRFMFP\n" if ($self->{debug});
+         # check for logically deleted record
+         if ($pointer & 0x80000000) {
+                 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
+                 $self->{deleted} = $mfn;
-         # XXX fix this to be more readable!!
+                 return unless $self->{include_deleted};
-         # e.g. (XRFMFB - 1) * 512 + XRFMFP
-         my $offset = $pointer;
+                 # abs
-         my $offset2=int($offset/2048)-1;
+                 $pointer = ($pointer ^ 0xffffffff) + 1;
-         my $offset22=int($offset/4096);
-         my $offset3=$offset-($offset22*4096);
-         if ($offset3>512) {
-                 $offset3=$offset3-2048;
          }
-         my $offset4=($offset2*512)+$offset3;
-         print "$offset - $offset2 - $offset3 - $offset4\n" if ($self->{debug});
+         my $XRFMFB = int($pointer/2048);
+         my $XRFMFP = $pointer - ($XRFMFB*2048);
+         # (XRFMFB - 1) * 512 + XRFMFP
+         # why do i have to do XRFMFP % 1024 ?
+         my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
-         close(fileXRF);
+         print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
          # Get Record Information
-         open(fileMST, $self->{isisdb}.".MST") || croak "can't open '$self->{isisdb}.MST': $!";
+         seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
-         seek(fileMST,$offset4,0);
+         read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
+         my $value=unpack("V",$buff);
-         my $value=$self->Read32(\*fileMST);
+         print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
          if ($value!=$mfn) {
- print ("Error: The MFN:".$mfn." is not found in MST(".$value.")");
+                 if ($value == 0) {
-                 return -1;      # XXX deleted record?
+                         print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
+                         $self->{deleted} = $mfn;
+                         return;
+                 }
+                 carp "Error: MFN ".$mfn." not found in MST file, found $value";
+                 return;
          }
- #       $MFRL=$self->Read16($fileMST);
+         read($self->{'fileMST'}, $buff, 14);
- #       $MFBWB=$self->Read32($fileMST);
- #       $MFBWP=$self->Read16($fileMST);
- #       $BASE=$self->Read16($fileMST);
- #       $NVF=$self->Read16($fileMST);
- #       $STATUS=$self->Read16($fileMST);
-         my $buff;
+         my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
-         read(fileMST, $buff, 14);
-         my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);
+         print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
-         print "MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
+         warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
+         warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
          # Get Directory Format
-Line 277 
 print ("Error: The MFN:".$mfn." is not f
+Line 354 
 print ("Error: The MFN:".$mfn." is not f
          my @FieldLEN;
          my @FieldTAG;
-         for (my $i = 0 ; $i < $NVF ; $i++) {
+         read($self->{'fileMST'}, $buff, 6 * $NVF);
+         my $rec_len = 0;
- #               $TAG=$self->Read16($fileMST);
+         for (my $i = 0 ; $i < $NVF ; $i++) {
- #               $POS=$self->Read16($fileMST);
- #               $LEN=$self->Read16($fileMST);
-                 read(fileMST, $buff, 6);
+                 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
-                 my ($TAG,$POS,$LEN) = unpack("sss", $buff);
-                 print "TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
+                 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
                  # The TAG does not exists in .FDT so we set it to 0.
                  #
-Line 301 
 print ("Error: The MFN:".$mfn." is not f
+Line 377 
 print ("Error: The MFN:".$mfn." is not f
                  push @FieldTAG,$TAG;
                  push @FieldPOS,$POS;
                  push @FieldLEN,$LEN;
+                 $rec_len += $LEN;
          }
          # Get Variable Fields
-         delete $self->{record};
+         read($self->{'fileMST'},$buff,$rec_len);
+         print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
          for (my $i = 0 ; $i < $NVF ; $i++) {
-                 my $rec;
+                 # skip zero-sized fields
-                 read(fileMST,$rec,$FieldLEN[$i]);
+                 next if ($FieldLEN[$i] == 0);
-                 push @{$self->{record}->{$FieldTAG[$i]}}, $rec;
-         }
-         close(fileMST);
-         # The record is marked for deletion
+                 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
-         if ($STATUS==1) {
-                 return -1;
          }
-         print Dumper($self) if ($self->{debug});
+         $self->{'current_mfn'} = $mfn;
+         print STDERR Dumper($self),"\n" if ($self->{debug});
          return $self->{'record'};
  }
+ =head2 mfn
+ Returns current MFN position
+   my $mfn = $isis->mfn;
+ =cut
+ # This function simply returns $self->{current_mfn}.
+ # NOTE(review): a setter mode enabled by passing _hack_mfn to new was
+ # described here (useful in tests when setting $isis->{record} directly),
+ # but this body only reads the value -- confirm whether it was removed.
+ sub mfn {
+         my $self = shift;
+         return $self->{current_mfn};
+ };
  =head2 to_ascii
- Dump ascii output of selected MFN
+ Returns ASCII output of record with specified MFN
-   print $isis->to_ascii(55);
+   print $isis->to_ascii(42);
+ This outputs something like this:
+  ^aNew York^cNew York University press^dcop. 1988
+  2140
+  88
+  HAY
+ If C<read_fdt> is specified when calling C<new> it will display field names
+ from C<.FDT> file instead of numeric tags.
  =cut
-Line 337 
 sub to_ascii {
+Line 442 
 sub to_ascii {
          my $mfn = shift || croak "need MFN";
-         my $rec = $self->GetMFN($mfn);
+         my $rec = $self->fetch($mfn) || return;
- print STDERR Dumper($rec);
          my $out = "0\t$mfn";
          foreach my $f (sort keys %{$rec}) {
-                 $out .= "\n$f\t".join("\n$f\t",@{$self->{record}->{$f}});
+                 my $fn = $self->tag_name($f);
+                 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
          }
          $out .= "\n";
-Line 352 
 print STDERR Dumper($rec);
+Line 456 
 print STDERR Dumper($rec);
          return $out;
  }
- ################# old cruft which is not ported from php to perl
+ =head2 to_hash
- =begin php
+ Read record with specified MFN and convert it to hash
-   # Load the dictionary from the $db.L0x files.
+   my $hash = $isis->to_hash($mfn);
-   # Not usefull Yet
-   sub LoadDictionary()
-   {
-     $fileL01=fopen($self->{isisdb}.".L01","r");
-     rewind($fileL01);
-     do
-     {
-       $POS=$self->Read32($fileL01);
-       $OCK=$self->Read16($fileL01);
-       $IT=$self->Read16($fileL01);
-       $PS=$self->Read32($fileL01);
- print "<br>PS:".$PS." ".$self->{ORDF}->{1}." ";
-       for ($i=0;$i<$OCK;$i++)
-       {
-         $KEY=fread($fileL01,10);
-         print $KEY." ### ";
-         $INFO1=$self->Read32($fileL01);
-         $INFO2=$self->Read32($fileL01);
-         #L01Key->{$key}=array($INFO1,$INFO2);
-       }
-       rewind($fileL01);
-       $offset=($PS-1)*(12+$self->{ORDF}->{1}*18*2);
-       fseek($fileL01,$offset);
-     } While (!feof($fileL01));
+ It has ability to convert characters (using C<hash_filter>) from ISIS
+ database before creating structures enabling character re-mapping or quick
+ fix-up of data.
-     fclose($fileL01);
+ This function returns hash which is like this:
-   }
-   # self function search through the tree and returns an array of pointers to IFP
+   $hash = {
-   # The function must be recursive
+     '210' => [
+                {
+                  'c' => 'New York University press',
+                  'a' => 'New York',
+                  'd' => 'cop. 1988'
+                }
+              ],
+     '990' => [
+                '2140',
+                '88',
+                'HAY'
+              ],
+   };
-   sub SearchTree($search,$fileNB,$PUNT)
+ You can later use that hash to produce any output from ISIS data.
-   {
-       $offset=(($PUNT-1)*(8+2*$self->{ORDN}->{1}*14));
-         rewind($fileNB1);
-         fseek($fileNB,$offset);
-         $POS=$self->Read32($fileNB);
-         $OCK=$self->Read16($fileNB);
-         $IT=$self->Read16($fileNB);
- #print "<br>".$POS." - ".$OCK." - ".$IT;
-         $OLDPUNT=$POS;
-         $j=0;
-         for ($i=0;$i<$OCK;$i++)
-         {
-           $KEY=fread($fileNB,10);
-           $PUNT=$self->Read32($fileNB);
- #print " ## ".chop($KEY)."(".$PUNT."-".$OLDPUNT.") ## ";
-           If (strcmp($search,chop($KEY))<0)
-           {
-             break;
-           }
-           $OLDPUNT=$PUNT;
-         }
- #print $OLDPUNT;
-         Return $OLDPUNT;
-   }
-   # Search ISIS for record containing search
+ If database is created using IsisMarc, it will also have two special fields
-   # Return a sorted array of MFN
+ which will be used for identifiers, C<i1> and C<i2> like this:
-   sub Search($search)
+   '200' => [
-   {
+              {
+                'i1' => '1',
+                'i2' => ' ',
+                'a' => 'Goa',
+                'f' => 'Valdo D\'Arienzo',
+                'e' => 'tipografie e tipografi nel XVI secolo',
+              }
+            ],
-   $search=strtoupper($search);
+ In case there are repeatable subfields in record, this will create
- #print "Searching....".$search." - ".$self->{POSRX}->{1}."<br>";
+ following structure:
-     # first search .x01
-     # Search in .N01
-     $fileN01=fopen($self->{isisdb}.".N01","r");
-     $offset=(($self->{POSRX}->{1}-1)*(8+2*$self->{ORDN}->{1}*14));
-       do
-       {
-         rewind($fileN01);
-         fseek($fileN01,$offset);
-         $POS=$self->Read32($fileN01);
-         $OCK=$self->Read16($fileN01);
-         $IT=$self->Read16($fileN01);
- #print "<br>".$POS." - ".$OCK." - ".$IT;
-         $OLDPUNT=$POS;
-         for ($i=0;$i<$OCK;$i++)
-         {
-           $KEY=fread($fileN01,10);
-           $PUNT=$self->Read32($fileN01);
- #print " ## ".chop($KEY)."(".$PUNT."-".$OLDPUNT.") ## ";
-           If (strcmp($search,chop($KEY))<0)
-           {
-             break;
-           }
-           $OLDPUNT=$PUNT;
-         }
-         $offset=(($OLDPUNT-1)*(8+2*$self->{ORDN}->{1}*14));
-       } while ($OLDPUNT>0);
- #print $OLDPUNT;
-     fclose($fileN01);
-     # Now look for records in .L01 file
-     $fileL01=fopen($self->{isisdb}.".L01","r");
-     rewind($fileL01);
-     $offset=(-$OLDPUNT-1)*(12+$self->{ORDF}->{1}*18*2);
-     fseek($fileL01,$offset);
-     $POS=$self->Read32($fileL01);
-     $OCK=$self->Read16($fileL01);
-     $IT=$self->Read16($fileL01);
-     $PS=$self->Read32($fileL01);
- #print "<br>POS:".$POS." ".$self->{ORDF}->{1}." ";
-     for ($i=0;$i<$OCK;$i++)
-     {
-       $KEY=fread($fileL01,10);
- #print $KEY." ### ";
-       $INFO1=$self->Read32($fileL01);
-       $INFO2=$self->Read32($fileL01);
-       If (strcmp($search,chop($KEY))==0)
-       {
-         break;
-       }
-     }
-     fclose($fileL01);
- #print $INFO1."--".$INFO2;
-     # Now look in .IFP for the MFN
-     $fileIFP=fopen($self->{isisdb}.".IFP","r");
-     rewind($fileIFP);
-     $offset=($INFO1-1)*512+($INFO2*4);
-     fseek($fileIFP,$offset);
-     $IFPBLK=$self->Read32($fileIFP);
-     $IFPNXTB=$self->Read32($fileIFP);
-     $IFPNXTP=$self->Read32($fileIFP);
-     $IFPTOTP=$self->Read32($fileIFP);
-     $IFPSEGP=$self->Read32($fileIFP);
-     $IFPSEGC=$self->Read32($fileIFP);
- #print "<br>IFP:".$IFPBLK." # ".$IFPNXTB." - ".$IFPNXTP." - ".$IFPTOTP." - ".$IFPSEGP." - ".$IFPSEGC;
-     rewind($fileIFP);
-     $offset=($INFO1-1)*512+24+($INFO2*4);
-     fseek($fileIFP,$offset);
-     $j=24+($INFO2*4);
-     $k=0;
-     $l=1;
-     $OLDPMFN="";
-     for ($i=0;$i<$IFPSEGP;$i++)
-     {
-       $B1=$self->Read8($fileIFP);
-       $B2=$self->Read8($fileIFP);
-       $B3=$self->Read8($fileIFP);
-       $B4=$self->Read8($fileIFP);
-       $B5=$self->Read8($fileIFP);
-       $B6=$self->Read8($fileIFP);
-       $B7=$self->Read8($fileIFP);
-       $B8=$self->Read8($fileIFP);
-       $PMFN=$B1*65536+$B2*256+$B3;
-       $PTAG=$B4*256+$B5;
-       $POCC=$B6;
-       $PCNT=$B7*256+$B8;
-       if ($OLDPMFN!=$PMFN)
-       {
-         if ($PMFN!=0)
-         {
-           $self->{MFNArray}->{$l}=$PMFN;
-           $OLDPMFN=$PMFN;
-           $l+=1;
-         }
-       }
-       $j=$j+8;
- #print "<br>".$PMFN."-".$PTAG." - ".$POCC." - ".$PCNT;
- #print "@@".$j."@@@@";
-       if ($j>=504)
-       {
-         if ($IFPNXTB==0 && $IFPNXTP==0)
-         {
-           $k=$k+1;
-           rewind($fileIFP);
-           $offset=($INFO1-1+$k)*512;
-           fseek($fileIFP,$offset);
-           $B=$self->Read32($fileIFP);
- #print "<br>-".$B."-<br>";
-           $j=0;
-         } else
-         {
-           rewind($fileIFP);
-           $offset=($IFPNXTB-1)*512;
-           fseek($fileIFP,$offset);
-           $OLDIFPNXTB=$IFPNXTB;
-           $OLDIFPNXTP=$IFPNXTP;
-           $IFPBLK=$self->Read32($fileIFP);
-           $IFPNXTB=$self->Read32($fileIFP);
-           $IFPNXTP=$self->Read32($fileIFP);
-           $IFPTOTP=$self->Read32($fileIFP);
-           $IFPSEGP=$self->Read32($fileIFP);
-           $IFPSEGC=$self->Read32($fileIFP);
-           rewind($fileIFP);
-           $offset=($OLDIFPNXTB-1)*512+24+($OLDIFPNXTP*4);
-           fseek($fileIFP,$offset);
-           $j=24+($OLDIFPNXTP*4);
-           $k=0;
-           $j=0;
-         }
-       }
-     }
-     fclose($fileIFP);
-     return $l-1;
-   }
- =cut
+   '900' => [ {
+         'a' => [ 'foo', 'bar', 'baz' ],
+   }]
- #
+ Or in more complex example of
- # XXX porting from php left-over:
- #
- # do I *REALLY* need those methods, or should I use
- # $self->{something} directly?
- #
- # Probably direct usage is better!
- #
- sub GetFieldName {
+  ^aa1^aa2^aa3^bb1^aa4^bb2^cc1^aa5
-         my $self = shift;
-         return $self->{FieldName};
- }
- sub GetTagName {
+ it will create
-         my $self = shift;
-         return $self->{TagName};
- }
- sub GetFieldTag {
+  => [
-         my $self = shift;
+         { a => ["a1", "a2", "a3", "a4", "a5"], b => ["b1", "b2"], c => "c1" },
-         return $self->{FieldTAG};
+   ],
- }
+ This behaviour can be changed using C<join_subfields_with> option to L</new>,
+ in which case C<to_hash> will always create single value for each subfield.
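+ This will change result to (for example, with C<join_subfields_with> set to C<' ; '>):
+  => [
+         { a => "a1 ; a2 ; a3 ; a4 ; a5", b => "b1 ; b2", c => "c1" },
+   ],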
+ This method will also create additional field C<000> with MFN.
+ There is also a more elaborate way to call C<to_hash>, like this:
+   my $hash = $isis->to_hash({
+         mfn => 42,
+         include_subfields => 1,
+         regexps => {
+                 200 => [ 's/something/else/g' ],
+         },
+   });
+ Each option controls creation of the hash:
+ =over 4
+ =item mfn
+ Specify MFN number of record
+ =item include_subfields
- sub GetNextMFN {
+ This option will create additional key in hash called C<subfields> which will
+ have original record subfield order and index to that subfield like this:
+  => [ {
+         a => ["a1", "a2", "a3", "a4", "a5"],
+         b => ["b1", "b2"],
+         c => "c1",
+         subfields => ["a", 0, "a", 1, "a", 2, "b", 0, "a", 3, "b", 1, "c", 0, "a", 4],
+   } ],
+ =item join_subfields_with
+ Define delimiter which will be used to join repeatable subfields. You can
+ specify option here instead of in L</new> if you want to have per-record control.
+ =item regexps
+ Override C<regexps> specified in L</new>.
+ =back
+ =cut
+ sub to_hash {
          my $self = shift;
-         return $self->{NXTMFN};
+         my $mfn = shift || confess "need mfn!";
+         my $arg;
+         if (ref($mfn) eq 'HASH') {
+                 $arg = $mfn;
+                 $mfn = $arg->{mfn} || confess "need mfn in arguments";
+         }
+         $arg->{regexpes} ||= $self->{regexpes};
+         confess "regexps must be HASH" if ($arg->{regexps} && ref($arg->{regexps}) ne 'HASH');
+         # init record to include MFN as field 000
+         my $rec = { '000' => [ $mfn ] };
+         my $row = $self->fetch($mfn) || return;
+         my $j_rs = $arg->{join_subfields_with};
+         $j_rs = $self->{join_subfields_with} unless(defined($j_rs));
+         my $i_sf = $arg->{include_subfields};
+         foreach my $f_nr (keys %{$row}) {
+                 foreach my $l (@{$row->{$f_nr}}) {
+                         # filter output
+                         if ($self->{'hash_filter'}) {
+                                 $l = $self->{'hash_filter'}->($l);
+                                 next unless defined($l);
+                         }
+                         # apply regexps
+                         if ($arg->{regexps} && defined($arg->{regexps}->{$f_nr})) {
+                                 confess "regexps->{$f_nr} must be ARRAY" if (ref($arg->{regexps}->{$f_nr}) ne 'ARRAY');
+                                 my $c = 0;
+                                 foreach my $r (@{ $arg->{regexps}->{$f_nr} }) {
+                                         while ( eval '$l =~ ' . $r ) { $c++ };
+                                 }
+                                 warn "## field $f_nr triggered $c regexpes\n" if ($c && $self->{debug});
+                         }
+                         my $val;
+                         my $r_sf;       # repeatable subfields in this record
+                         # has identifiers?
+                         ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
+                         # has subfields?
+                         if ($l =~ m/\^/) {
+                                 foreach my $t (split(/\^/,$l)) {
+                                         next if (! $t);
+                                         my ($sf,$v) = (substr($t,0,1), substr($t,1));
+                                         # XXX this might be option, but why?
+                                         next unless ($v);
+ #                                       warn "### $f_nr^$sf:$v",$/ if ($self->{debug} > 1);
+                                         if (ref( $val->{$sf} ) eq 'ARRAY') {
+                                                 push @{ $val->{$sf} }, $v;
+                                                 # record repeatable subfield it it's offset
+                                                 push @{ $val->{subfields} }, ( $sf, $#{ $val->{$sf} } ) if (! $j_rs && $i_sf);
+                                                 $r_sf->{$sf}++;
+                                         } elsif (defined( $val->{$sf} )) {
+                                                 # convert scalar field to array
+                                                 $val->{$sf} = [ $val->{$sf}, $v ];
+                                                 push @{ $val->{subfields} }, ( $sf, 1 ) if (! $j_rs && $i_sf);
+                                                 $r_sf->{$sf}++;
+                                         } else {
+                                                 $val->{$sf} = $v;
+                                                 push @{ $val->{subfields} }, ( $sf, 0 ) if ($i_sf);
+                                         }
+                                 }
+                         } else {
+                                 $val = $l;
+                         }
+                         if ($j_rs) {
+                                 map {
+                                         $val->{$_} = join($j_rs, @{ $val->{$_} });
+                                 } keys %$r_sf
+                         }
+                         push @{$rec->{$f_nr}}, $val;
+                 }
+         }
+         return $rec;
  }
- sub GetMFNArray {
+ =head2 tag_name
+ Return name of selected tag
+  print $isis->tag_name('200');
+ =cut
+ sub tag_name {
          my $self = shift;
-         return $self->{MFNArray};
+         my $tag = shift || return;
+         return $self->{'TagName'}->{$tag} || $tag;
  }
- =begin php
-   sub Read32($fileNB)
-   {
-     $B1=ord(fread($fileNB,1));
-     $B2=ord(fread($fileNB,1));
-     $B3=ord(fread($fileNB,1));
-     $B4=ord(fread($fileNB,1));
-     if ($B4<=128)
-     {
-       $value=$B1+$B2*256+$B3*65536+$B4*16777216;
-     } else
-     {
-       $value=$self->Not8($B1)+$self->Not8($B2)*256+$self->Not8($B3)*65536+$self->Not8($B4)*16777216;
-       $value=-($value+1);
-     }
- #    print "(".$B1.",".$B2.",".$B3.",".$B4.":".$value.")";
-     return $value;
+ =head2 read_cnt
-   }
-   sub Read24($fileNB)
+ Read content of C<.CNT> file and return hash containing it.
-   {
-     $B1=ord(fread($fileNB,1));
-     $B2=ord(fread($fileNB,1));
-     $B3=ord(fread($fileNB,1));
-     $value=$B1+$B2*256+$B3*65536;
+   print Dumper($isis->read_cnt);
- #    print "(".$B1.",".$B2.",".$B3.":".$value.")";
+ This function is not used by the module (C<.CNT> files are not required for this
+ module to work), but it can be useful for examining your index (while debugging,
+ for example).
-     return $value;
+ =cut
-   }
-   sub Read16($fileNB)
+ sub read_cnt  {
-   {
+         my $self = shift;
-     $B1=ord(fread($fileNB,1));
-     $B2=ord(fread($fileNB,1));
-     $value=$B1+$B2*256;
+         croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
- #    print "(".$B1.",".$B2.":".$value.")";
-     return $value;
+         # Get the index information from $db.CNT
-   }
+         open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
+         binmode($fileCNT);
-   sub Read8($fileNB)
+         my $buff;
-   {
-     $B1=ord(fread($fileNB,1));
-     $value=$B1;
+         read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
- #    print "(".$value.")";
+         $self->unpack_cnt($buff);
-     return $value;
+         read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
-   }
+         $self->unpack_cnt($buff);
-   sub Not8($value)
+         close($fileCNT);
-   {
-     $value=decbin($value);
+         return $self->{cnt};
-     if (strlen($value)<8)
-     {
-       $buffer="";
-       for($i=0;$i<(8-strlen($value));$i++)
-       {
-         $buffer.="0";
-       }
-       $value=$buffer.$value;
-     }
-     $value=ereg_replace("0","3",$value);
-     $value=ereg_replace("1","0",$value);
-     $value=ereg_replace("3","1",$value);
-     $value=bindec($value);
-     return $value;
-   }
  }
+ =head2 unpack_cnt
+ Unpack one of the two 26-byte fixed-length records in a C<.CNT> file.
+ Here is definition of record:
+  off key        description                             size
+   0: IDTYPE     BTree type                              s
+   2: ORDN       Nodes Order                             s
+   4: ORDF       Leafs Order                             s
+   6: N          Number of Memory buffers for nodes      s
+   8: K          Number of buffers for first level index s
+  10: LIV        Current number of Index Levels          s
+  12: POSRX      Pointer to Root Record in N0x           l
+  16: NMAXPOS    Next Available position in N0x          l
+  20: FMAXPOS    Next available position in L0x          l
+  24: ABNORMAL   Formal BTree normality indicator        s
+  length: 26 bytes
+ This fills the C<cnt> key of the C<$self> object with a hash. It's used by C<read_cnt>.
  =cut
+ sub unpack_cnt {
+         my $self = shift;
+         my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
+         my $buff = shift || return;
+         my @arr = unpack("vvvvvvVVVv", $buff);
+         print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
+         my $IDTYPE = shift @arr;
+         foreach (@flds) {
+                 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
+         }
+ }
 ;
- __END__
  =head1 BUGS
- This module has been very lightly tested. Use with caution and report bugs.
+ Some parts of the CDS/ISIS documentation are not detailed enough to explain
+ some variations in the input databases which have been tested with this module.
+ When I was in doubt, I assumed that OpenIsis's implementation was right
+ (except for obvious bugs).
+ However, every effort has been made to test this module with as many
+ databases (and programs that create them) as possible.
+ I would be very grateful for success or failure reports about usage of this
+ module with databases from programs other than WinIsis and IsisMarc. I have
+ tested this against the output of one C<isis.dll>-based application, but I don't
+ know any details about its version.
+ =head1 VERSIONS
+ As this is a young module, new features are added in subsequent versions. It's
+ a good idea to specify the version when using this module like this:
+   use Biblio::Isis 0.21
+ Below is a list of changes in specific versions of the module (so you can target
+ older versions if you really have to):
+ =over 8
+ =item 0.21
+ Added C<join_subfields_with> to L</new> and L</to_hash>.
+ Added C<include_subfields> to L</to_hash>.
+ =item 0.20
+ Added C<< $isis->mfn >>, support for repeatable subfields and
+ C<< $isis->to_hash({ mfn => 42, ... }) >> calling convention
+ =back
  =head1 AUTHOR
-Line 736 
 This module has been very lightly tested
+Line 801 
 This module has been very lightly tested
          dpavlin@rot13.org
          http://www.rot13.org/~dpavlin/
- This module is based heavily on code from LIBISIS.PHP - Library to read ISIS files V0.1.1
+ This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
- written in php and (c) 2000 Franck Martin - <franck@sopac.org> released under LGPL.
+ written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
  =head1 COPYRIGHT
-Line 750 
 LICENSE file included with this module.
+Line 815 
 LICENSE file included with this module.
  =head1 SEE ALSO
- L<http://www.openisis.org|OpenIsis>, perl(1).
+ L<Biblio::Isis::Manual> for CDS/ISIS manual appendix F, G and H which describe file format
+ OpenIsis web site L<http://www.openisis.org>
+ perl4lib site L<http://perl4lib.perl.org>

 Legend:



Removed from v.2
 


changed lines


 
Added in v.62
 Legend:



Removed from v.2
 


changed lines


 
Added in v.62
-Removed from v.2
+Added in v.62

	ViewVC Help
Powered by ViewVC 1.1.26