lib/Biblio/Isis.pm

package IsisDB;
use strict;

use Carp;
use File::Glob qw(:globally :nocase);

use Data::Dumper;

BEGIN {
        use Exporter ();
        use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
        $VERSION     = 0.08;
        @ISA         = qw (Exporter);
        #Give a hoot don't pollute, do not export more than needed by default
        @EXPORT      = qw ();
        @EXPORT_OK   = qw ();
        %EXPORT_TAGS = ();

}

=head1 NAME

IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database

=head1 SYNOPSIS

  use IsisDB;

  my $isis = new IsisDB(
        isisdb => './cds/cds',
  );

  for(my $mfn = 1; $mfn <= $isis->{'maxmfn'}; $mfn++) {
        print $isis->to_ascii($mfn),"\n";
  }

=head1 DESCRIPTION

This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
IsisMarc. It can be used as perl-only alternative to OpenIsis module which
seems to depriciate it's old C<XS> bindings for perl.

It can create hash values from data in ISIS database (using C<to_hash>),
ASCII dump (using C<to_ascii>) or just hash with field names and packed
values (like C<^asomething^belse>).

Unique feature of this module is ability to C<include_deleted> records.
It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
fields which are zero sized will be filled with random junk from memory).

It also has support for identifiers (only if ISIS database is created by
IsisMarc), see C<to_hash>.

This module will always be slower than OpenIsis module which use C
library. However, since it's written in perl, it's platform independent (so
you don't need C compiler), and can be easily modified. I hope that it
creates data structures which are easier to use than ones created by
OpenIsis, so reduced time in other parts of the code should compensate for
slower performance of this module (speed of reading ISIS database is
rarely an issue).

=head1 METHODS

=cut

#  my $ORDN;            # Nodes Order
#  my $ORDF;            # Leafs Order
#  my $N;               # Number of Memory buffers for nodes
#  my $K;               # Number of buffers for first level index
#  my $LIV;             # Current number of Index Levels
#  my $POSRX;           # Pointer to Root Record in N0x
#  my $NMAXPOS;         # Next Available position in N0x
#  my $FMAXPOS;         # Next available position in L0x
#  my $ABNORMAL;        # Formal BTree normality indicator

#
# some binary reads
#

=head2 new

Open ISIS database

 my $isis = new IsisDB(
        isisdb => './cds/cds',
        read_fdt => 1,
        include_deleted => 1,
        hash_filter => sub {
                my $v = shift;
                $v =~ s#foo#bar#g;
        },
        debug => 1,
 );

Options are described below:

=over 5

=item isisdb

This is full or relative path to ISIS database files which include
common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
C<read_fdt> option) files.

In this example it uses C<./cds/cds.MST> and related files.

=item read_fdt

Boolean flag to specify if field definition table should be read. It's off
by default.

=item include_deleted

Don't skip logically deleted records in ISIS.

=item hash_filter

Filter code ref which will be used before data is converted to hash.

=item debug

Dump a B<lot> of debugging output.

=back

It will also set C<$isis-E<gt>{'maxmfn'}> which is maximum MFN stored in database.

=cut

sub new {
        my $class = shift;
        my $self = {};
        bless($self, $class);

        croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});

        foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
                $self->{$v} = {@_}->{$v};
        }

        my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));

        foreach my $f (@isis_files) {
                my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
                $self->{lc($ext)."_file"} = $f;
        }

        my @must_exist = qw(mst xrf);
        push @must_exist, "fdt" if ($self->{read_fdt});

        foreach my $ext (@must_exist) {
                croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
        }

        print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});

        # if you want to read .FDT file use read_fdt argument when creating class!
        if ($self->{read_fdt} && -e $self->{fdt_file}) {

                # read the $db.FDT file for tags
                my $fieldzone=0;

                open(fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";

                while (<fileFDT>) {
                        chomp;
                        if ($fieldzone) {
                                my $name=substr($_,0,30);
                                my $tag=substr($_,50,3);

                                $name =~ s/\s+$//;
                                $tag =~ s/\s+$//;

                                $self->{'TagName'}->{$tag}=$name;  
                        }

                        if (/^\*\*\*/) {
                                $fieldzone=1;
                        }
                }
                
                close(fileFDT);
        }

        # Get the Maximum MFN from $db.MST

        open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";

        # MST format:   (* = 32 bit signed)
        # CTLMFN*       always 0
        # NXTMFN*       MFN to be assigned to the next record created
        # NXTMFB*       last block allocated to master file
        # NXTMFP        offset to next available position in last block
        # MFTYPE        always 0 for user db file (1 for system)
        seek($self->{'fileMST'},4,0);

        my $buff;

        read($self->{'fileMST'}, $buff, 4);
        $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero";

        # save maximum MFN
        $self->{'maxmfn'} = $self->{'NXTMFN'} - 1;


        print STDERR Dumper($self),"\n" if ($self->{debug});

        # open files for later
        open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";

        $self ? return $self : return undef;
}

=head2 read_cnt

Read content of C<.CNT> file and return hash containing it.

  print Dumper($isis->read_cnt);

This function is not used by module (C<.CNT> files are not required for this
module to work), but it can be useful to examine your index (while debugging
for example).

=cut

sub read_cnt  {
        my $self = shift;

        croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});

        # Get the index information from $db.CNT
   
        open(fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";

        # There is two 26 Bytes fixed lenght records

        #  0: IDTYPE    BTree type                              16
        #  2: ORDN      Nodes Order                             16
        #  4: ORDF      Leafs Order                             16
        #  6: N         Number of Memory buffers for nodes      16
        #  8: K         Number of buffers for first level index 16
        # 10: LIV       Current number of Index Levels          16
        # 12: POSRX*    Pointer to Root Record in N0x           32
        # 16: NMAXPOS*  Next Available position in N0x          32
        # 20: FMAXPOS*  Next available position in L0x          32
        # 24: ABNORMAL  Formal BTree normality indicator        16
        # length: 26 bytes

        sub unpack_cnt {
                my $self = shift;

                my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);

                my $buff = shift || return;
                my @arr = unpack("ssssssllls", $buff);

                print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});

                my $IDTYPE = shift @arr;
                foreach (@flds) {
                        $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
                }
        }

        my $buff;

        read(fileCNT, $buff, 26);
        $self->unpack_cnt($buff);

        read(fileCNT, $buff, 26);
        $self->unpack_cnt($buff);

        close(fileCNT);

        return $self->{cnt};
}

=head2 fetch

Read record with selected MFN

  my $rec = $isis->fetch(55);

Returns hash with keys which are field names and values are unpacked values
for that field like this:

  $rec = {
    '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
    '990' => [ '2140', '88', 'HAY' ],
  };

=cut

sub fetch {
        my $self = shift;

        my $mfn = shift || croak "fetch needs MFN as argument!";

        # is mfn allready in memory?
        my $old_mfn = $self->{'current_mfn'} || -1;
        return $self->{record} if ($mfn == $old_mfn);

        print STDERR "## fetch: $mfn\n" if ($self->{debug});

        # XXX check this?
        my $mfnpos=($mfn+int(($mfn-1)/127))*4;

        print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
        seek($self->{'fileXRF'},$mfnpos,0);

        my $buff;

        # delete old record
        delete $self->{record};

        # read XRFMFB abd XRFMFP
        read($self->{'fileXRF'}, $buff, 4);
        my $pointer=unpack("l",$buff) || carp "pointer is null";

        # check for logically deleted record
        if ($pointer < 0) {
                print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
                $self->{deleted} = $mfn;

                return unless $self->{include_deleted};

                $pointer = abs($pointer);
        }

        my $XRFMFB = int($pointer/2048);
        my $XRFMFP = $pointer - ($XRFMFB*2048);

        # (XRFMFB - 1) * 512 + XRFMFP
        # why do i have to do XRFMFP % 1024 ?

        my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);

        print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});

        # Get Record Information

        seek($self->{'fileMST'},$blk_off,0);

        read($self->{'fileMST'}, $buff, 4);
        my $value=unpack("l",$buff);

        print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});

        if ($value!=$mfn) {
                if ($value == 0) {
                        print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
                        $self->{deleted} = $mfn;
                        return;
                }

                carp "Error: MFN ".$mfn." not found in MST file, found $value";    
                return;
        }

        read($self->{'fileMST'}, $buff, 14);

        my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);

        print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});

        warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);

        warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);

        # Get Directory Format

        my @FieldPOS;
        my @FieldLEN;
        my @FieldTAG;

        read($self->{'fileMST'}, $buff, 6 * $NVF);

        my $rec_len = 0;

        for (my $i = 0 ; $i < $NVF ; $i++) {

                my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));

                print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});

                # The TAG does not exists in .FDT so we set it to 0.
                #
                # XXX This is removed from perl version; .FDT file is updated manually, so
                # you will often have fields in .MST file which aren't in .FDT. On the other
                # hand, IsisMarc doesn't use .FDT files at all!

                #if (! $self->{TagName}->{$TAG}) {
                #       $TAG=0;
                #}

                push @FieldTAG,$TAG;
                push @FieldPOS,$POS;
                push @FieldLEN,$LEN;

                $rec_len += $LEN;
        }

        # Get Variable Fields

        read($self->{'fileMST'},$buff,$rec_len);

        print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});

        for (my $i = 0 ; $i < $NVF ; $i++) {
                # skip zero-sized fields
                next if ($FieldLEN[$i] == 0);

                push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
        }

        $self->{'current_mfn'} = $mfn;

        print STDERR Dumper($self),"\n" if ($self->{debug});

        return $self->{'record'};
}

=head2 to_ascii

Returns ASCII output of record with specified MFN

  print $isis->to_ascii(42);

This outputs something like this:

  210   ^aNew York^cNew York University press^dcop. 1988
  990   2140
  990   88
  990   HAY

If C<read_fdt> is specified when calling C<new> it will display field names
from C<.FDT> file instead of numeric tags.

=cut

sub to_ascii {
        my $self = shift;

        my $mfn = shift || croak "need MFN";

        my $rec = $self->fetch($mfn);

        my $out = "0\t$mfn";

        foreach my $f (sort keys %{$rec}) {
                my $fn = $self->tag_name($f);
                $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
        }

        $out .= "\n";

        return $out;
}

=head2 to_hash

Read record with specified MFN and convert it to hash

  my $hash = $isis->to_hash($mfn);

It has ability to convert characters (using C<hash_filter>) from ISIS
database before creating structures enabling character re-mapping or quick
fix-up of data.

This function returns hash which is like this:

  $hash = {
    '210' => [
               {
                 'c' => 'New York University press',
                 'a' => 'New York',
                 'd' => 'cop. 1988'
               }
             ],
    '990' => [
               '2140',
               '88',
               'HAY'
             ],
  };

You can later use that hash to produce any output from ISIS data.

If database is created using IsisMarc, it will also have to special fields
which will be used for identifiers, C<i1> and C<i2> like this:

  '200' => [
             {
               'i1' => '1',
               'i2' => ' '
               'a' => 'Goa',
               'f' => 'Valdo D\'Arienzo',
               'e' => 'tipografie e tipografi nel XVI secolo',
             }
           ],

This method will also create additional field C<000> with MFN.

=cut

sub to_hash {
        my $self = shift;

        my $mfn = shift || confess "need mfn!";

        # init record to include MFN as field 000
        my $rec = { '000' => [ $mfn ] };

        my $row = $self->fetch($mfn);

        foreach my $k (keys %{$row}) {
                foreach my $l (@{$row->{$k}}) {

                        # filter output
                        $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});

                        my $val;

                        # has identifiers?
                        ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);

                        # has subfields?
                        if ($l =~ m/\^/) {
                                foreach my $t (split(/\^/,$l)) {
                                        next if (! $t);
                                        $val->{substr($t,0,1)} = substr($t,1);
                                }
                        } else {
                                $val = $l;
                        }

                        push @{$rec->{$k}}, $val;
                }
        }

        return $rec;
}

=head2 tag_name

Return name of selected tag

 print $isis->tag_name('200');

=cut

sub tag_name {
        my $self = shift;
        my $tag = shift || return;
        return $self->{'TagName'}->{$tag} || $tag;
}

1;

=head1 BUGS

Some parts of CDS/ISIS documentation are not detailed enough to exmplain
some variations in input databases which has been tested with this module.
When I was in doubt, I assumed that OpenIsis's implementation was right
(except for obvious bugs).

However, every effort has been made to test this module with as much
databases (and programs that create them) as possible.

I would be very greatful for success or failure reports about usage of this
module with databases from programs other than WinIsis and IsisMarc. I had
tested this against ouput of one C<isis.dll>-based application, but I don't
know any details about it's version.

=head1 AUTHOR

        Dobrica Pavlinusic
        CPAN ID: DPAVLIN
        dpavlin@rot13.org
        http://www.rot13.org/~dpavlin/

This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.

=head1 COPYRIGHT

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.


=head1 SEE ALSO

OpenIsis web site L<http://www.openisis.org>

perl4lib site L<http://perl4lib.perl.org>

1	dpavlin	1	package IsisDB;
2			use strict;
3
4			use Carp;
5	dpavlin	18	use File::Glob qw(:globally :nocase);
6
7	dpavlin	1	use Data::Dumper;
8
9			BEGIN {
10			use Exporter ();
11			use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12	dpavlin	25	$VERSION = 0.08;
13	dpavlin	1	@ISA = qw (Exporter);
14			#Give a hoot don't pollute, do not export more than needed by default
15			@EXPORT = qw ();
16			@EXPORT_OK = qw ();
17			%EXPORT_TAGS = ();
18
19			}
20
21			=head1 NAME
22
23	dpavlin	15	IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database
24	dpavlin	1
25			=head1 SYNOPSIS
26
27	dpavlin	11	use IsisDB;
28
29	dpavlin	1	my $isis = new IsisDB(
30			isisdb => './cds/cds',
31			);
32
33	dpavlin	11	for(my $mfn = 1; $mfn <= $isis->{'maxmfn'}; $mfn++) {
34			print $isis->to_ascii($mfn),"\n";
35			}
36
37	dpavlin	1	=head1 DESCRIPTION
38
39	dpavlin	15	This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40	dpavlin	27	IsisMarc. It can be used as perl-only alternative to OpenIsis module which
41			seems to depriciate it's old C<XS> bindings for perl.
42	dpavlin	1
43	dpavlin	15	It can create hash values from data in ISIS database (using C<to_hash>),
44			ASCII dump (using C<to_ascii>) or just hash with field names and packed
45			values (like C<^asomething^belse>).
46	dpavlin	11
47			Unique feature of this module is ability to C<include_deleted> records.
48			It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
49			fields which are zero sized will be filled with random junk from memory).
50
51	dpavlin	15	It also has support for identifiers (only if ISIS database is created by
52			IsisMarc), see C<to_hash>.
53
54	dpavlin	27	This module will always be slower than OpenIsis module which use C
55	dpavlin	15	library. However, since it's written in perl, it's platform independent (so
56			you don't need C compiler), and can be easily modified. I hope that it
57			creates data structures which are easier to use than ones created by
58			OpenIsis, so reduced time in other parts of the code should compensate for
59			slower performance of this module (speed of reading ISIS database is
60			rarely an issue).
61
62	dpavlin	1	=head1 METHODS
63
64			=cut
65
66			# my $ORDN; # Nodes Order
67			# my $ORDF; # Leafs Order
68			# my $N; # Number of Memory buffers for nodes
69			# my $K; # Number of buffers for first level index
70			# my $LIV; # Current number of Index Levels
71			# my $POSRX; # Pointer to Root Record in N0x
72			# my $NMAXPOS; # Next Available position in N0x
73			# my $FMAXPOS; # Next available position in L0x
74			# my $ABNORMAL; # Formal BTree normality indicator
75
76			#
77			# some binary reads
78			#
79
80			=head2 new
81
82	dpavlin	15	Open ISIS database
83	dpavlin	1
84			my $isis = new IsisDB(
85			isisdb => './cds/cds',
86			read_fdt => 1,
87	dpavlin	12	include_deleted => 1,
88			hash_filter => sub {
89			my $v = shift;
90			$v =~ s#foo#bar#g;
91			},
92	dpavlin	1	debug => 1,
93			);
94
95	dpavlin	2	Options are described below:
96
97			=over 5
98
99	dpavlin	1	=item isisdb
100
101	dpavlin	15	This is full or relative path to ISIS database files which include
102	dpavlin	18	common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
103			C<read_fdt> option) files.
104	dpavlin	1
105	dpavlin	15	In this example it uses C<./cds/cds.MST> and related files.
106
107	dpavlin	1	=item read_fdt
108
109			Boolean flag to specify if field definition table should be read. It's off
110			by default.
111
112	dpavlin	9	=item include_deleted
113
114	dpavlin	11	Don't skip logically deleted records in ISIS.
115	dpavlin	9
116	dpavlin	12	=item hash_filter
117
118			Filter code ref which will be used before data is converted to hash.
119
120			=item debug
121
122			Dump a B<lot> of debugging output.
123
124	dpavlin	2	=back
125
126			It will also set C<$isis-E<gt>{'maxmfn'}> which is maximum MFN stored in database.
127
128	dpavlin	1	=cut
129
130			sub new {
131			my $class = shift;
132			my $self = {};
133			bless($self, $class);
134
135	dpavlin	9	croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
136	dpavlin	1
137	dpavlin	12	foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
138	dpavlin	9	$self->{$v} = {@_}->{$v};
139			}
140	dpavlin	1
141	dpavlin	18	my @isis_files = grep(/\.(FDT\|MST\|XRF\|CNT)$/i,glob($self->{isisdb}."*"));
142
143			foreach my $f (@isis_files) {
144			my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
145			$self->{lc($ext)."_file"} = $f;
146			}
147
148			my @must_exist = qw(mst xrf);
149			push @must_exist, "fdt" if ($self->{read_fdt});
150
151			foreach my $ext (@must_exist) {
152	dpavlin	19	croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
153	dpavlin	18	}
154
155			print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
156
157	dpavlin	1	# if you want to read .FDT file use read_fdt argument when creating class!
158	dpavlin	18	if ($self->{read_fdt} && -e $self->{fdt_file}) {
159	dpavlin	1
160			# read the $db.FDT file for tags
161			my $fieldzone=0;
162
163	dpavlin	18	open(fileFDT, $self->{fdt_file}) \|\| croak "can't read '$self->{fdt_file}': $!";
164	dpavlin	1
165			while (<fileFDT>) {
166			chomp;
167			if ($fieldzone) {
168			my $name=substr($_,0,30);
169			my $tag=substr($_,50,3);
170
171			$name =~ s/\s+$//;
172			$tag =~ s/\s+$//;
173
174			$self->{'TagName'}->{$tag}=$name;
175			}
176
177			if (/^\\\*/) {
178			$fieldzone=1;
179			}
180			}
181
182			close(fileFDT);
183			}
184
185			# Get the Maximum MFN from $db.MST
186
187	dpavlin	18	open($self->{'fileMST'}, $self->{mst_file}) \|\| croak "can't open '$self->{mst_file}': $!";
188	dpavlin	1
189			# MST format: (* = 32 bit signed)
190			# CTLMFN* always 0
191			# NXTMFN* MFN to be assigned to the next record created
192			# NXTMFB* last block allocated to master file
193			# NXTMFP offset to next available position in last block
194			# MFTYPE always 0 for user db file (1 for system)
195	dpavlin	18	seek($self->{'fileMST'},4,0);
196	dpavlin	1
197	dpavlin	11	my $buff;
198
199	dpavlin	18	read($self->{'fileMST'}, $buff, 4);
200	dpavlin	11	$self->{'NXTMFN'}=unpack("l",$buff) \|\| carp "NXTNFN is zero";
201
202	dpavlin	2	# save maximum MFN
203			$self->{'maxmfn'} = $self->{'NXTMFN'} - 1;
204
205	dpavlin	1
206	dpavlin	18
207
208			print STDERR Dumper($self),"\n" if ($self->{debug});
209
210			# open files for later
211			open($self->{'fileXRF'}, $self->{xrf_file}) \|\| croak "can't open '$self->{xrf_file}': $!";
212
213			$self ? return $self : return undef;
214			}
215
216			=head2 read_cnt
217
218	dpavlin	27	Read content of C<.CNT> file and return hash containing it.
219	dpavlin	18
220			print Dumper($isis->read_cnt);
221
222	dpavlin	27	This function is not used by module (C<.CNT> files are not required for this
223			module to work), but it can be useful to examine your index (while debugging
224			for example).
225
226	dpavlin	18	=cut
227
228			sub read_cnt {
229			my $self = shift;
230
231	dpavlin	19	croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
232	dpavlin	18
233	dpavlin	1	# Get the index information from $db.CNT
234
235	dpavlin	18	open(fileCNT, $self->{cnt_file}) \|\| croak "can't read '$self->{cnt_file}': $!";
236	dpavlin	1
237			# There is two 26 Bytes fixed lenght records
238
239			# 0: IDTYPE BTree type 16
240			# 2: ORDN Nodes Order 16
241			# 4: ORDF Leafs Order 16
242			# 6: N Number of Memory buffers for nodes 16
243			# 8: K Number of buffers for first level index 16
244			# 10: LIV Current number of Index Levels 16
245			# 12: POSRX* Pointer to Root Record in N0x 32
246			# 16: NMAXPOS* Next Available position in N0x 32
247			# 20: FMAXPOS* Next available position in L0x 32
248			# 24: ABNORMAL Formal BTree normality indicator 16
249			# length: 26 bytes
250
251			sub unpack_cnt {
252			my $self = shift;
253
254			my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
255
256			my $buff = shift \|\| return;
257			my @arr = unpack("ssssssllls", $buff);
258
259	dpavlin	16	print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
260	dpavlin	9
261	dpavlin	1	my $IDTYPE = shift @arr;
262			foreach (@flds) {
263	dpavlin	18	$self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
264	dpavlin	1	}
265			}
266
267	dpavlin	18	my $buff;
268
269	dpavlin	1	read(fileCNT, $buff, 26);
270			$self->unpack_cnt($buff);
271
272			read(fileCNT, $buff, 26);
273			$self->unpack_cnt($buff);
274
275			close(fileCNT);
276
277	dpavlin	18	return $self->{cnt};
278	dpavlin	1	}
279
280	dpavlin	7	=head2 fetch
281	dpavlin	1
282	dpavlin	2	Read record with selected MFN
283	dpavlin	1
284	dpavlin	7	my $rec = $isis->fetch(55);
285	dpavlin	2
286			Returns hash with keys which are field names and values are unpacked values
287	dpavlin	15	for that field like this:
288	dpavlin	2
289	dpavlin	15	$rec = {
290			'210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
291			'990' => [ '2140', '88', 'HAY' ],
292			};
293
294	dpavlin	2	=cut
295
296	dpavlin	7	sub fetch {
297	dpavlin	1	my $self = shift;
298
299	dpavlin	7	my $mfn = shift \|\| croak "fetch needs MFN as argument!";
300	dpavlin	1
301	dpavlin	16	# is mfn allready in memory?
302			my $old_mfn = $self->{'current_mfn'} \|\| -1;
303	dpavlin	25	return $self->{record} if ($mfn == $old_mfn);
304	dpavlin	1
305	dpavlin	16	print STDERR "## fetch: $mfn\n" if ($self->{debug});
306
307	dpavlin	1	# XXX check this?
308			my $mfnpos=($mfn+int(($mfn-1)/127))*4;
309
310	dpavlin	18	print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
311	dpavlin	7	seek($self->{'fileXRF'},$mfnpos,0);
312	dpavlin	1
313	dpavlin	11	my $buff;
314
315	dpavlin	25	# delete old record
316			delete $self->{record};
317
318	dpavlin	1	# read XRFMFB abd XRFMFP
319	dpavlin	11	read($self->{'fileXRF'}, $buff, 4);
320			my $pointer=unpack("l",$buff) \|\| carp "pointer is null";
321	dpavlin	1
322	dpavlin	25	# check for logically deleted record
323			if ($pointer < 0) {
324			print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
325			$self->{deleted} = $mfn;
326
327			return unless $self->{include_deleted};
328
329			$pointer = abs($pointer);
330			}
331
332	dpavlin	1	my $XRFMFB = int($pointer/2048);
333			my $XRFMFP = $pointer - ($XRFMFB*2048);
334
335	dpavlin	16	# (XRFMFB - 1) * 512 + XRFMFP
336			# why do i have to do XRFMFP % 1024 ?
337	dpavlin	1
338	dpavlin	26	my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
339	dpavlin	1
340	dpavlin	16	print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
341	dpavlin	1
342			# Get Record Information
343
344	dpavlin	16	seek($self->{'fileMST'},$blk_off,0);
345	dpavlin	1
346	dpavlin	11	read($self->{'fileMST'}, $buff, 4);
347			my $value=unpack("l",$buff);
348	dpavlin	1
349	dpavlin	16	print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
350
351	dpavlin	1	if ($value!=$mfn) {
352	dpavlin	26	if ($value == 0) {
353			print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
354			$self->{deleted} = $mfn;
355			return;
356			}
357
358			carp "Error: MFN ".$mfn." not found in MST file, found $value";
359			return;
360	dpavlin	1	}
361
362	dpavlin	7	read($self->{'fileMST'}, $buff, 14);
363	dpavlin	1
364			my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);
365
366	dpavlin	16	print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
367	dpavlin	1
368	dpavlin	25	warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
369	dpavlin	9
370	dpavlin	16	warn "BASE is not 18+6NVF" unless ($BASE == 18 + 6 $NVF);
371
372	dpavlin	1	# Get Directory Format
373
374			my @FieldPOS;
375			my @FieldLEN;
376			my @FieldTAG;
377
378	dpavlin	8	read($self->{'fileMST'}, $buff, 6 * $NVF);
379
380	dpavlin	16	my $rec_len = 0;
381	dpavlin	8
382	dpavlin	1	for (my $i = 0 ; $i < $NVF ; $i++) {
383
384	dpavlin	8	my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));
385	dpavlin	1
386	dpavlin	16	print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
387	dpavlin	1
388			# The TAG does not exists in .FDT so we set it to 0.
389			#
390			# XXX This is removed from perl version; .FDT file is updated manually, so
391			# you will often have fields in .MST file which aren't in .FDT. On the other
392			# hand, IsisMarc doesn't use .FDT files at all!
393
394			#if (! $self->{TagName}->{$TAG}) {
395			# $TAG=0;
396			#}
397
398			push @FieldTAG,$TAG;
399			push @FieldPOS,$POS;
400			push @FieldLEN,$LEN;
401	dpavlin	8
402	dpavlin	16	$rec_len += $LEN;
403	dpavlin	1	}
404
405			# Get Variable Fields
406
407	dpavlin	16	read($self->{'fileMST'},$buff,$rec_len);
408	dpavlin	8
409	dpavlin	16	print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
410
411	dpavlin	1	for (my $i = 0 ; $i < $NVF ; $i++) {
412	dpavlin	10	# skip zero-sized fields
413			next if ($FieldLEN[$i] == 0);
414
415	dpavlin	8	push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
416	dpavlin	1	}
417
418	dpavlin	16	$self->{'current_mfn'} = $mfn;
419
420	dpavlin	25	print STDERR Dumper($self),"\n" if ($self->{debug});
421	dpavlin	1
422	dpavlin	2	return $self->{'record'};
423	dpavlin	1	}
424
425	dpavlin	2	=head2 to_ascii
426
427	dpavlin	27	Returns ASCII output of record with specified MFN
428	dpavlin	2
429	dpavlin	15	print $isis->to_ascii(42);
430	dpavlin	2
431	dpavlin	27	This outputs something like this:
432	dpavlin	15
433			210 ^aNew York^cNew York University press^dcop. 1988
434			990 2140
435			990 88
436			990 HAY
437
438			If C<read_fdt> is specified when calling C<new> it will display field names
439			from C<.FDT> file instead of numeric tags.
440
441	dpavlin	2	=cut
442
443			sub to_ascii {
444			my $self = shift;
445
446			my $mfn = shift \|\| croak "need MFN";
447
448	dpavlin	7	my $rec = $self->fetch($mfn);
449	dpavlin	2
450			my $out = "0\t$mfn";
451
452			foreach my $f (sort keys %{$rec}) {
453	dpavlin	15	my $fn = $self->tag_name($f);
454			$out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
455	dpavlin	2	}
456
457			$out .= "\n";
458
459			return $out;
460			}
461
462	dpavlin	12	=head2 to_hash
463
464	dpavlin	15	Read record with specified MFN and convert it to hash
465	dpavlin	12
466			my $hash = $isis->to_hash($mfn);
467
468	dpavlin	27	It has ability to convert characters (using C<hash_filter>) from ISIS
469	dpavlin	15	database before creating structures enabling character re-mapping or quick
470			fix-up of data.
471	dpavlin	12
472			This function returns hash which is like this:
473
474			$hash = {
475			'210' => [
476			{
477			'c' => 'New York University press',
478			'a' => 'New York',
479			'd' => 'cop. 1988'
480			}
481			],
482			'990' => [
483			'2140',
484			'88',
485			'HAY'
486			],
487			};
488
489	dpavlin	15	You can later use that hash to produce any output from ISIS data.
490	dpavlin	12
491	dpavlin	15	If database is created using IsisMarc, it will also have to special fields
492			which will be used for identifiers, C<i1> and C<i2> like this:
493
494			'200' => [
495			{
496			'i1' => '1',
497			'i2' => ' '
498			'a' => 'Goa',
499			'f' => 'Valdo D\'Arienzo',
500			'e' => 'tipografie e tipografi nel XVI secolo',
501			}
502			],
503
504			This method will also create additional field C<000> with MFN.
505
506	dpavlin	12	=cut
507
508			sub to_hash {
509			my $self = shift;
510
511			my $mfn = shift \|\| confess "need mfn!";
512
513	dpavlin	15	# init record to include MFN as field 000
514	dpavlin	16	my $rec = { '000' => [ $mfn ] };
515	dpavlin	15
516	dpavlin	12	my $row = $self->fetch($mfn);
517
518			foreach my $k (keys %{$row}) {
519			foreach my $l (@{$row->{$k}}) {
520
521			# filter output
522			$l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
523
524	dpavlin	15	my $val;
525
526			# has identifiers?
527	dpavlin	23	($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
528	dpavlin	15
529	dpavlin	12	# has subfields?
530			if ($l =~ m/\^/) {
531			foreach my $t (split(/\^/,$l)) {
532			next if (! $t);
533			$val->{substr($t,0,1)} = substr($t,1);
534			}
535			} else {
536			$val = $l;
537			}
538
539			push @{$rec->{$k}}, $val;
540			}
541			}
542
543			return $rec;
544			}
545
546	dpavlin	15	=head2 tag_name
547	dpavlin	1
548	dpavlin	15	Return name of selected tag
549	dpavlin	1
550	dpavlin	15	print $isis->tag_name('200');
551
552			=cut
553
554			sub tag_name {
555	dpavlin	1	my $self = shift;
556	dpavlin	15	my $tag = shift \|\| return;
557			return $self->{'TagName'}->{$tag} \|\| $tag;
558	dpavlin	1	}
559
560			1;
561
562			=head1 BUGS
563
564	dpavlin	27	Some parts of CDS/ISIS documentation are not detailed enough to exmplain
565			some variations in input databases which has been tested with this module.
566			When I was in doubt, I assumed that OpenIsis's implementation was right
567			(except for obvious bugs).
568	dpavlin	1
569	dpavlin	27	However, every effort has been made to test this module with as much
570			databases (and programs that create them) as possible.
571
572			I would be very greatful for success or failure reports about usage of this
573			module with databases from programs other than WinIsis and IsisMarc. I had
574			tested this against ouput of one C<isis.dll>-based application, but I don't
575			know any details about it's version.
576
577	dpavlin	1	=head1 AUTHOR
578
579			Dobrica Pavlinusic
580			CPAN ID: DPAVLIN
581			dpavlin@rot13.org
582			http://www.rot13.org/~dpavlin/
583
584	dpavlin	15	This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
585			written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
586	dpavlin	1
587			=head1 COPYRIGHT
588
589			This program is free software; you can redistribute
590			it and/or modify it under the same terms as Perl itself.
591
592			The full text of the license can be found in the
593			LICENSE file included with this module.
594
595
596			=head1 SEE ALSO
597
598	dpavlin	15	OpenIsis web site L<http://www.openisis.org>
599	dpavlin	1
600	dpavlin	15	perl4lib site L<http://perl4lib.perl.org>
601