# Biblio-Isis/trunk/Isis.pm

package IsisDB;
use strict;

use Carp;
use File::Glob qw(:globally :nocase);

use Data::Dumper;

BEGIN {
        use Exporter ();
        use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

        # module release number
        $VERSION = 0.09;

        # inherit from Exporter, but export nothing at all: the module is
        # used through its object interface, so the caller's namespace
        # stays clean
        @ISA = ('Exporter');
        @EXPORT = @EXPORT_OK = ();
        %EXPORT_TAGS = ();
}

=head1 NAME

IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database

=head1 SYNOPSIS

  use IsisDB;

  my $isis = new IsisDB(
        isisdb => './cds/cds',
  );

  for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
        print $isis->to_ascii($mfn),"\n";
  }

=head1 DESCRIPTION

This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
IsisMarc. It can be used as perl-only alternative to OpenIsis module which
seems to have deprecated its old C<XS> bindings for perl.

It can create hash values from data in ISIS database (using C<to_hash>),
ASCII dump (using C<to_ascii>) or just hash with field names and packed
values (like C<^asomething^belse>).

Unique feature of this module is ability to C<include_deleted> records.
It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
fields which are zero sized will be filled with random junk from memory).

It also has support for identifiers (only if ISIS database is created by
IsisMarc), see C<to_hash>.

This module will always be slower than OpenIsis module which uses a C
library. However, since it's written in perl, it's platform independent (so
you don't need C compiler), and can be easily modified. I hope that it
creates data structures which are easier to use than ones created by
OpenIsis, so reduced time in other parts of the code should compensate for
slower performance of this module (speed of reading ISIS database is
rarely an issue).

=head1 METHODS

=cut

#  my $ORDN;            # Nodes Order
#  my $ORDF;            # Leafs Order
#  my $N;               # Number of Memory buffers for nodes
#  my $K;               # Number of buffers for first level index
#  my $LIV;             # Current number of Index Levels
#  my $POSRX;           # Pointer to Root Record in N0x
#  my $NMAXPOS;         # Next Available position in N0x
#  my $FMAXPOS;         # Next available position in L0x
#  my $ABNORMAL;        # Formal BTree normality indicator

#
# some binary reads
#

=head2 new

Open ISIS database

 my $isis = new IsisDB(
        isisdb => './cds/cds',
        read_fdt => 1,
        include_deleted => 1,
        hash_filter => sub {
                my $v = shift;
                $v =~ s#foo#bar#g;
                return $v;
        },
        debug => 1,
 );

Options are described below:

=over 5

=item isisdb

This is full or relative path to ISIS database files which include
common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
C<read_fdt> option) files.

In this example it uses C<./cds/cds.MST> and related files.

=item read_fdt

Boolean flag to specify if field definition table should be read. It's off
by default.

=item include_deleted

Don't skip logically deleted records in ISIS.

=item hash_filter

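Filter code ref which will be used before data is converted to hash. It is
called with the field value as its only argument and must return the
(possibly modified) value.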

=item debug

Dump a B<lot> of debugging output.

=back

=cut

# Constructor. Locates the database files by globbing "<isisdb>*", optionally
# loads tag names from the .FDT file, reads NXTMFN from the .MST file and
# keeps the .MST and .XRF handles open in the object for later fetch() calls.
#
# Arguments (key => value list):
#   isisdb          - path prefix of the database files (required)
#   read_fdt        - true to load field names from the .FDT file
#   include_deleted - stored for fetch()
#   hash_filter     - stored for to_hash()
#   debug           - true to print diagnostics on STDERR
#
# Returns the blessed object. Croaks when isisdb is missing, when a required
# file is not found or cannot be opened, or when NXTMFN cannot be read.
sub new {
        my $class = shift;
        my %args = @_;

        croak "new needs database name (isisdb) as argument!" unless ($args{isisdb});

        my $self = {};
        bless($self, $class);

        # read_fdt has to be stored too, otherwise the .FDT handling below
        # could never be switched on
        foreach my $v (qw{isisdb debug include_deleted hash_filter read_fdt}) {
                $self->{$v} = $args{$v};
        }

        my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));

        # remember each file under "<lowercase extension>_file"
        foreach my $f (@isis_files) {
                next unless ($f =~ m/\.(\w\w\w)$/);
                $self->{lc($1)."_file"} = $f;
        }

        my @must_exist = qw(mst xrf);
        push @must_exist, "fdt" if ($self->{read_fdt});

        foreach my $ext (@must_exist) {
                croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
        }

        print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});

        # if you want to read .FDT file use read_fdt argument when creating class!
        if ($self->{read_fdt} && -e $self->{fdt_file}) {

                # read the $db.FDT file for tags
                my $fieldzone=0;

                open(my $fileFDT, '<', $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
                binmode($fileFDT);

                while (<$fileFDT>) {
                        chomp;

                        # field definitions start on the line after the
                        # one beginning with "***"
                        if (! $fieldzone) {
                                $fieldzone=1 if (/^\*\*\*/);
                                next;
                        }

                        # too short to hold a tag at column 50
                        next unless (length($_) > 50);

                        my $name=substr($_,0,30);
                        my $tag=substr($_,50,3);

                        $name =~ s/\s+$//;
                        $tag =~ s/\s+$//;

                        $self->{'TagName'}->{$tag}=$name;
                }

                close($fileFDT);
        }

        # Get the Maximum MFN from $db.MST

        open($self->{'fileMST'}, '<', $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
        binmode($self->{'fileMST'});

        # MST format:   (* = 32 bit signed)
        # CTLMFN*       always 0
        # NXTMFN*       MFN to be assigned to the next record created
        # NXTMFB*       last block allocated to master file
        # NXTMFP        offset to next available position in last block
        # MFTYPE        always 0 for user db file (1 for system)

        # skip CTLMFN, NXTMFN is the second 32 bit value
        seek($self->{'fileMST'},4,0) || croak "can't seek to offset 4 in MST: $!";

        my $buff;

        my $got = read($self->{'fileMST'}, $buff, 4);
        croak "can't read NXTMFN from MST: $!" unless ($got && $got == 4);
        $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTMFN is zero";

        print STDERR Dumper($self),"\n" if ($self->{debug});

        # open files for later
        open($self->{'fileXRF'}, '<', $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
        binmode($self->{'fileXRF'});

        return $self;
}

=head2 count

Return number of records in database

  print $isis->count;

=cut

# Number of records: NXTMFN is the MFN that would be given to the next new
# record, so the count is one less.
sub count {
        my ($self) = @_;
        my $next_mfn = $self->{'NXTMFN'};
        return $next_mfn - 1;
}

=head2 fetch

Read record with selected MFN

  my $rec = $isis->fetch(55);

Returns hash with keys which are field names and values are unpacked values
for that field like this:

  $rec = {
    '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
    '990' => [ '2140', '88', 'HAY' ],
  };

=cut

# Read the record with the given MFN from the already opened .XRF and .MST
# handles and cache it in $self->{record}.
#
# Returns a hash ref (tag => array ref of raw field values), or nothing when
# the record is logically deleted (and include_deleted is off), physically
# deleted, or the MFN found in the .MST file does not match. In the deleted
# cases $self->{deleted} is set to the MFN.
#
# Croaks when no MFN is given, when the XRF pointer is zero, or when the
# record header cannot be read completely.
sub fetch {
        my $self = shift;

        my $mfn = shift || croak "fetch needs MFN as argument!";

        # is mfn already in memory?
        my $old_mfn = $self->{'current_mfn'} || -1;
        return $self->{record} if ($mfn == $old_mfn);

        print STDERR "## fetch: $mfn\n" if ($self->{debug});

        # XXX check this?
        # 4 bytes per MFN, plus one extra 4 byte slot for every 127 MFNs
        my $mfnpos=($mfn+int(($mfn-1)/127))*4;

        print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
        seek($self->{'fileXRF'},$mfnpos,0);

        my $buff;

        # delete old record together with its MFN; if this fetch returns
        # early, a later fetch of the old MFN must re-read it instead of
        # handing back the (now deleted) cached record
        delete $self->{record};
        delete $self->{'current_mfn'};

        # read XRFMFB and XRFMFP
        read($self->{'fileXRF'}, $buff, 4);
        my $pointer=unpack("V",$buff) || croak "pointer is null";

        # check for logically deleted record
        if ($pointer & 0x80000000) {
                print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
                $self->{deleted} = $mfn;

                return unless $self->{include_deleted};

                # abs
                $pointer = ($pointer ^ 0xffffffff) + 1;
        }

        my $XRFMFB = int($pointer/2048);
        my $XRFMFP = $pointer - ($XRFMFB*2048);

        # (XRFMFB - 1) * 512 + XRFMFP
        # why do i have to do XRFMFP % 1024 ?

        my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);

        print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});

        # Get Record Information

        seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";

        read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
        my $value=unpack("V",$buff);

        print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});

        if ($value!=$mfn) {
                if ($value == 0) {
                        print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
                        $self->{deleted} = $mfn;
                        return;
                }

                carp "Error: MFN ".$mfn." not found in MST file, found $value";    
                return;
        }

        # rest of the record leader: 14 bytes following the MFN
        ((read($self->{'fileMST'}, $buff, 14) || 0) == 14)
                || croak "can't read record leader for MFN $mfn from MST file: $!";

        my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);

        print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});

        warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);

        warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);

        # Get Directory Format

        my @FieldPOS;
        my @FieldLEN;
        my @FieldTAG;

        # one 6 byte directory entry (TAG, POS, LEN) per field
        my $dir_len = 6 * $NVF;
        if ($dir_len) {
                ((read($self->{'fileMST'}, $buff, $dir_len) || 0) == $dir_len)
                        || croak "can't read directory for MFN $mfn from MST file: $!";
        }

        my $rec_len = 0;

        for (my $i = 0 ; $i < $NVF ; $i++) {

                my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));

                print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});

                # The TAG does not exists in .FDT so we set it to 0.
                #
                # XXX This is removed from perl version; .FDT file is updated manually, so
                # you will often have fields in .MST file which aren't in .FDT. On the other
                # hand, IsisMarc doesn't use .FDT files at all!

                #if (! $self->{TagName}->{$TAG}) {
                #       $TAG=0;
                #}

                push @FieldTAG,$TAG;
                push @FieldPOS,$POS;
                push @FieldLEN,$LEN;

                $rec_len += $LEN;
        }

        # Get Variable Fields

        read($self->{'fileMST'},$buff,$rec_len);

        print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});

        for (my $i = 0 ; $i < $NVF ; $i++) {
                # skip zero-sized fields
                next if ($FieldLEN[$i] == 0);

                push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
        }

        $self->{'current_mfn'} = $mfn;

        print STDERR Dumper($self),"\n" if ($self->{debug});

        return $self->{'record'};
}

=head2 to_ascii

Returns ASCII output of record with specified MFN

  print $isis->to_ascii(42);

This outputs something like this:

  210   ^aNew York^cNew York University press^dcop. 1988
  990   2140
  990   88
  990   HAY

If C<read_fdt> is specified when calling C<new> it will display field names
from C<.FDT> file instead of numeric tags.

=cut

# Build a tab separated text dump of one record: a "0<TAB>mfn" line followed
# by one "name<TAB>value" line per field value, tags in string sort order.
# Croaks when no MFN is given.
sub to_ascii {
        my ($self, $mfn) = @_;
        $mfn || croak "need MFN";

        my $rec = $self->fetch($mfn);

        my @chunks = ("0\t$mfn");

        foreach my $tag (sort keys %{$rec}) {
                # tag_name gives the name from .FDT, or the tag itself
                my $label = $self->tag_name($tag);
                my @values = @{$self->{record}->{$tag}};
                push @chunks, "$label\t" . join("\n$label\t", @values);
        }

        return join("\n", @chunks) . "\n";
}

=head2 to_hash

Read record with specified MFN and convert it to hash

  my $hash = $isis->to_hash($mfn);

It has ability to convert characters (using C<hash_filter>) from ISIS
database before creating structures enabling character re-mapping or quick
fix-up of data.

This function returns hash which is like this:

  $hash = {
    '210' => [
               {
                 'c' => 'New York University press',
                 'a' => 'New York',
                 'd' => 'cop. 1988'
               }
             ],
    '990' => [
               '2140',
               '88',
               'HAY'
             ],
  };

You can later use that hash to produce any output from ISIS data.

If database is created using IsisMarc, it will also have two special fields
which will be used for identifiers, C<i1> and C<i2> like this:

  '200' => [
             {
               'i1' => '1',
               'i2' => ' ',
               'a' => 'Goa',
               'f' => 'Valdo D\'Arienzo',
               'e' => 'tipografie e tipografi nel XVI secolo',
             }
           ],

This method will also create additional field C<000> with MFN.

=cut

# Fetch the record with the given MFN and return it as a hash ref:
# tag => array ref with one element per field occurrence. An occurrence
# with subfields becomes a hash ref (subfield letter => value, plus i1/i2
# when the value starts with two indicator characters); one without
# subfields stays a plain string. Field '000' holds the MFN.
#
# hash_filter, when set, is applied to every raw value first.
# Confesses when no MFN is given.
sub to_hash {
        my $self = shift;

        my $mfn = shift || confess "need mfn!";

        # init record to include MFN as field 000
        my $rec = { '000' => [ $mfn ] };

        my $row = $self->fetch($mfn);

        foreach my $k (keys %{$row}) {
                foreach my $packed (@{$row->{$k}}) {

                        # work on a copy: the loop variable is an alias
                        # into the record cached by fetch, so filtering or
                        # stripping indicators in place would corrupt it
                        # for every later fetch/to_ascii/to_hash call
                        my $l = $packed;

                        # filter output
                        $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});

                        my $val;

                        # has identifiers?
                        ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);

                        # has subfields?
                        if ($l =~ m/\^/) {
                                foreach my $t (split(/\^/,$l)) {
                                        next if (! $t);
                                        # first character is the subfield letter
                                        $val->{substr($t,0,1)} = substr($t,1);
                                }
                        } else {
                                $val = $l;
                        }

                        push @{$rec->{$k}}, $val;
                }
        }

        return $rec;
}

=head2 tag_name

Return name of selected tag

 print $isis->tag_name('200');

=cut

# Look up the name stored for a tag in $self->{TagName}; fall back to the
# tag itself when there is none. Returns nothing for a false tag.
sub tag_name {
        my ($self, $tag) = @_;
        return unless $tag;

        my $name = $self->{'TagName'}->{$tag};
        return $name ? $name : $tag;
}


=head2 read_cnt

Read content of C<.CNT> file and return hash containing it.

  print Dumper($isis->read_cnt);

This function is not used by module (C<.CNT> files are not required for this
module to work), but it can be useful to examine your index (while debugging
for example).

=cut

# Read the two 26 byte tables from the .CNT file, hand each of them to
# unpack_cnt and return $self->{cnt}.
#
# Croaks when the object has no cnt_file, when the file cannot be opened,
# or when either table cannot be read in full (truncated file).
sub read_cnt  {
        my $self = shift;

        croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});

        # Get the index information from $db.CNT

        open(my $fileCNT, '<', $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
        binmode($fileCNT);

        foreach my $table (qw(first second)) {
                my $buff;

                # a partial table would unpack into garbage, so insist on
                # all 26 bytes
                my $got = read($fileCNT, $buff, 26);
                croak "can't read $table table from CNT: $!" unless ($got && $got == 26);

                $self->unpack_cnt($buff);
        }

        close($fileCNT);

        return $self->{cnt};
}

=head2 unpack_cnt

Unpack one of two 26 bytes fixed length records in C<.CNT> file.

Here is definition of record:

 off key        description                             size
  0: IDTYPE     BTree type                              s
  2: ORDN       Nodes Order                             s
  4: ORDF       Leafs Order                             s
  6: N          Number of Memory buffers for nodes      s
  8: K          Number of buffers for first level index s
 10: LIV        Current number of Index Levels          s
 12: POSRX      Pointer to Root Record in N0x           l
 16: NMAXPOS    Next Available position in N0x          l
 20: FMAXPOS    Next available position in L0x          l
 24: ABNORMAL   Formal BTree normality indicator        s
 length: 26 bytes

This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.

=cut

# Decode one 26 byte .CNT table and store it in $self->{cnt}, keyed by the
# table's IDTYPE (its first value) and then by field name. Does nothing for
# a false buffer.
sub unpack_cnt {
        my ($self, $buff) = @_;

        my @names = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);

        return unless $buff;

        my @unpacked = unpack("vvvvvvVVVv", $buff);

        print STDERR "unpack_cnt: ",join(" ",@unpacked),"\n" if ($self->{'debug'});

        # first value selects the table, the rest follow the order of @names
        my ($IDTYPE, @values) = @unpacked;
        foreach my $i (0 .. $#names) {
                $self->{cnt}->{$IDTYPE}->{$names[$i]} = abs($values[$i]);
        }
}

1;

=head1 BUGS

Some parts of CDS/ISIS documentation are not detailed enough to explain
some variations in input databases which have been tested with this module.
When I was in doubt, I assumed that OpenIsis's implementation was right
(except for obvious bugs).

However, every effort has been made to test this module with as many
databases (and programs that create them) as possible.

I would be very grateful for success or failure reports about usage of this
module with databases from programs other than WinIsis and IsisMarc. I had
tested this against output of one C<isis.dll>-based application, but I don't
know any details about its version.

=head1 AUTHOR

        Dobrica Pavlinusic
        CPAN ID: DPAVLIN
        dpavlin@rot13.org
        http://www.rot13.org/~dpavlin/

This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.

=head1 COPYRIGHT

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.


=head1 SEE ALSO

OpenIsis web site L<http://www.openisis.org>

perl4lib site L<http://perl4lib.perl.org>

1	package IsisDB;
2	use strict;
3
4	use Carp;
5	use File::Glob qw(:globally :nocase);
6
7	use Data::Dumper;
8
9	BEGIN {
10	use Exporter ();
11	use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12	$VERSION = 0.09;
13	@ISA = qw (Exporter);
14	#Give a hoot don't pollute, do not export more than needed by default
15	@EXPORT = qw ();
16	@EXPORT_OK = qw ();
17	%EXPORT_TAGS = ();
18
19	}
20
21	=head1 NAME
22
23	IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database
24
25	=head1 SYNOPSIS
26
27	use IsisDB;
28
29	my $isis = new IsisDB(
30	isisdb => './cds/cds',
31	);
32
33	for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
34	print $isis->to_ascii($mfn),"\n";
35	}
36
37	=head1 DESCRIPTION
38
39	This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40	IsisMarc. It can be used as perl-only alternative to OpenIsis module which
41	seems to depriciate it's old C<XS> bindings for perl.
42
43	It can create hash values from data in ISIS database (using C<to_hash>),
44	ASCII dump (using C<to_ascii>) or just hash with field names and packed
45	values (like C<^asomething^belse>).
46
47	Unique feature of this module is ability to C<include_deleted> records.
48	It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
49	fields which are zero sized will be filled with random junk from memory).
50
51	It also has support for identifiers (only if ISIS database is created by
52	IsisMarc), see C<to_hash>.
53
54	This module will always be slower than OpenIsis module which use C
55	library. However, since it's written in perl, it's platform independent (so
56	you don't need C compiler), and can be easily modified. I hope that it
57	creates data structures which are easier to use than ones created by
58	OpenIsis, so reduced time in other parts of the code should compensate for
59	slower performance of this module (speed of reading ISIS database is
60	rarely an issue).
61
62	=head1 METHODS
63
64	=cut
65
66	# my $ORDN; # Nodes Order
67	# my $ORDF; # Leafs Order
68	# my $N; # Number of Memory buffers for nodes
69	# my $K; # Number of buffers for first level index
70	# my $LIV; # Current number of Index Levels
71	# my $POSRX; # Pointer to Root Record in N0x
72	# my $NMAXPOS; # Next Available position in N0x
73	# my $FMAXPOS; # Next available position in L0x
74	# my $ABNORMAL; # Formal BTree normality indicator
75
76	#
77	# some binary reads
78	#
79
80	=head2 new
81
82	Open ISIS database
83
84	my $isis = new IsisDB(
85	isisdb => './cds/cds',
86	read_fdt => 1,
87	include_deleted => 1,
88	hash_filter => sub {
89	my $v = shift;
90	$v =~ s#foo#bar#g;
91	},
92	debug => 1,
93	);
94
95	Options are described below:
96
97	=over 5
98
99	=item isisdb
100
101	This is full or relative path to ISIS database files which include
102	common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
103	C<read_fdt> option) files.
104
105	In this example it uses C<./cds/cds.MST> and related files.
106
107	=item read_fdt
108
109	Boolean flag to specify if field definition table should be read. It's off
110	by default.
111
112	=item include_deleted
113
114	Don't skip logically deleted records in ISIS.
115
116	=item hash_filter
117
118	Filter code ref which will be used before data is converted to hash.
119
120	=item debug
121
122	Dump a B<lot> of debugging output.
123
124	=back
125
126	=cut
127
128	sub new {
129	my $class = shift;
130	my $self = {};
131	bless($self, $class);
132
133	croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
134
135	foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
136	$self->{$v} = {@_}->{$v};
137	}
138
139	my @isis_files = grep(/\.(FDT\|MST\|XRF\|CNT)$/i,glob($self->{isisdb}."*"));
140
141	foreach my $f (@isis_files) {
142	my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
143	$self->{lc($ext)."_file"} = $f;
144	}
145
146	my @must_exist = qw(mst xrf);
147	push @must_exist, "fdt" if ($self->{read_fdt});
148
149	foreach my $ext (@must_exist) {
150	croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
151	}
152
153	print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
154
155	# if you want to read .FDT file use read_fdt argument when creating class!
156	if ($self->{read_fdt} && -e $self->{fdt_file}) {
157
158	# read the $db.FDT file for tags
159	my $fieldzone=0;
160
161	open(my $fileFDT, $self->{fdt_file}) \|\| croak "can't read '$self->{fdt_file}': $!";
162	binmode($fileFDT);
163
164	while (<$fileFDT>) {
165	chomp;
166	if ($fieldzone) {
167	my $name=substr($_,0,30);
168	my $tag=substr($_,50,3);
169
170	$name =~ s/\s+$//;
171	$tag =~ s/\s+$//;
172
173	$self->{'TagName'}->{$tag}=$name;
174	}
175
176	if (/^\\\*/) {
177	$fieldzone=1;
178	}
179	}
180
181	close($fileFDT);
182	}
183
184	# Get the Maximum MFN from $db.MST
185
186	open($self->{'fileMST'}, $self->{mst_file}) \|\| croak "can't open '$self->{mst_file}': $!";
187	binmode($self->{'fileMST'});
188
189	# MST format: (* = 32 bit signed)
190	# CTLMFN* always 0
191	# NXTMFN* MFN to be assigned to the next record created
192	# NXTMFB* last block allocated to master file
193	# NXTMFP offset to next available position in last block
194	# MFTYPE always 0 for user db file (1 for system)
195	seek($self->{'fileMST'},4,0) \|\| croak "can't seek to offset 0 in MST: $!";
196
197	my $buff;
198
199	read($self->{'fileMST'}, $buff, 4) \|\| croak "can't read NXTMFN from MST: $!";
200	$self->{'NXTMFN'}=unpack("V",$buff) \|\| croak "NXTNFN is zero";
201
202	print STDERR Dumper($self),"\n" if ($self->{debug});
203
204	# open files for later
205	open($self->{'fileXRF'}, $self->{xrf_file}) \|\| croak "can't open '$self->{xrf_file}': $!";
206	binmode($self->{'fileXRF'});
207
208	$self ? return $self : return undef;
209	}
210
211	=head2 count
212
213	Return number of records in database
214
215	print $isis->count;
216
217	=cut
218
219	sub count {
220	my $self = shift;
221	return $self->{'NXTMFN'} - 1;
222	}
223
224	=head2 fetch
225
226	Read record with selected MFN
227
228	my $rec = $isis->fetch(55);
229
230	Returns hash with keys which are field names and values are unpacked values
231	for that field like this:
232
233	$rec = {
234	'210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
235	'990' => [ '2140', '88', 'HAY' ],
236	};
237
238	=cut
239
240	sub fetch {
241	my $self = shift;
242
243	my $mfn = shift \|\| croak "fetch needs MFN as argument!";
244
245	# is mfn allready in memory?
246	my $old_mfn = $self->{'current_mfn'} \|\| -1;
247	return $self->{record} if ($mfn == $old_mfn);
248
249	print STDERR "## fetch: $mfn\n" if ($self->{debug});
250
251	# XXX check this?
252	my $mfnpos=($mfn+int(($mfn-1)/127))*4;
253
254	print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
255	seek($self->{'fileXRF'},$mfnpos,0);
256
257	my $buff;
258
259	# delete old record
260	delete $self->{record};
261
262	# read XRFMFB abd XRFMFP
263	read($self->{'fileXRF'}, $buff, 4);
264	my $pointer=unpack("V",$buff) \|\| croak "pointer is null";
265
266	# check for logically deleted record
267	if ($pointer & 0x80000000) {
268	print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
269	$self->{deleted} = $mfn;
270
271	return unless $self->{include_deleted};
272
273	# abs
274	$pointer = ($pointer ^ 0xffffffff) + 1;
275	}
276
277	my $XRFMFB = int($pointer/2048);
278	my $XRFMFP = $pointer - ($XRFMFB*2048);
279
280	# (XRFMFB - 1) * 512 + XRFMFP
281	# why do i have to do XRFMFP % 1024 ?
282
283	my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
284
285	print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
286
287	# Get Record Information
288
289	seek($self->{'fileMST'},$blk_off,0) \|\| croak "can't seek to $blk_off: $!";
290
291	read($self->{'fileMST'}, $buff, 4) \|\| croak "can't read 4 bytes at offset $blk_off from MST file: $!";
292	my $value=unpack("V",$buff);
293
294	print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
295
296	if ($value!=$mfn) {
297	if ($value == 0) {
298	print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
299	$self->{deleted} = $mfn;
300	return;
301	}
302
303	carp "Error: MFN ".$mfn." not found in MST file, found $value";
304	return;
305	}
306
307	read($self->{'fileMST'}, $buff, 14);
308
309	my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
310
311	print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
312
313	warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
314
315	warn "BASE is not 18+6NVF" unless ($BASE == 18 + 6 $NVF);
316
317	# Get Directory Format
318
319	my @FieldPOS;
320	my @FieldLEN;
321	my @FieldTAG;
322
323	read($self->{'fileMST'}, $buff, 6 * $NVF);
324
325	my $rec_len = 0;
326
327	for (my $i = 0 ; $i < $NVF ; $i++) {
328
329	my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
330
331	print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
332
333	# The TAG does not exists in .FDT so we set it to 0.
334	#
335	# XXX This is removed from perl version; .FDT file is updated manually, so
336	# you will often have fields in .MST file which aren't in .FDT. On the other
337	# hand, IsisMarc doesn't use .FDT files at all!
338
339	#if (! $self->{TagName}->{$TAG}) {
340	# $TAG=0;
341	#}
342
343	push @FieldTAG,$TAG;
344	push @FieldPOS,$POS;
345	push @FieldLEN,$LEN;
346
347	$rec_len += $LEN;
348	}
349
350	# Get Variable Fields
351
352	read($self->{'fileMST'},$buff,$rec_len);
353
354	print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
355
356	for (my $i = 0 ; $i < $NVF ; $i++) {
357	# skip zero-sized fields
358	next if ($FieldLEN[$i] == 0);
359
360	push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
361	}
362
363	$self->{'current_mfn'} = $mfn;
364
365	print STDERR Dumper($self),"\n" if ($self->{debug});
366
367	return $self->{'record'};
368	}
369
370	=head2 to_ascii
371
372	Returns ASCII output of record with specified MFN
373
374	print $isis->to_ascii(42);
375
376	This outputs something like this:
377
378	210 ^aNew York^cNew York University press^dcop. 1988
379	990 2140
380	990 88
381	990 HAY
382
383	If C<read_fdt> is specified when calling C<new> it will display field names
384	from C<.FDT> file instead of numeric tags.
385
386	=cut
387
388	sub to_ascii {
389	my $self = shift;
390
391	my $mfn = shift \|\| croak "need MFN";
392
393	my $rec = $self->fetch($mfn);
394
395	my $out = "0\t$mfn";
396
397	foreach my $f (sort keys %{$rec}) {
398	my $fn = $self->tag_name($f);
399	$out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
400	}
401
402	$out .= "\n";
403
404	return $out;
405	}
406
407	=head2 to_hash
408
409	Read record with specified MFN and convert it to hash
410
411	my $hash = $isis->to_hash($mfn);
412
413	It has ability to convert characters (using C<hash_filter>) from ISIS
414	database before creating structures enabling character re-mapping or quick
415	fix-up of data.
416
417	This function returns hash which is like this:
418
419	$hash = {
420	'210' => [
421	{
422	'c' => 'New York University press',
423	'a' => 'New York',
424	'd' => 'cop. 1988'
425	}
426	],
427	'990' => [
428	'2140',
429	'88',
430	'HAY'
431	],
432	};
433
434	You can later use that hash to produce any output from ISIS data.
435
436	If database is created using IsisMarc, it will also have to special fields
437	which will be used for identifiers, C<i1> and C<i2> like this:
438
439	'200' => [
440	{
441	'i1' => '1',
442	'i2' => ' '
443	'a' => 'Goa',
444	'f' => 'Valdo D\'Arienzo',
445	'e' => 'tipografie e tipografi nel XVI secolo',
446	}
447	],
448
449	This method will also create additional field C<000> with MFN.
450
451	=cut
452
453	sub to_hash {
454	my $self = shift;
455
456	my $mfn = shift \|\| confess "need mfn!";
457
458	# init record to include MFN as field 000
459	my $rec = { '000' => [ $mfn ] };
460
461	my $row = $self->fetch($mfn);
462
463	foreach my $k (keys %{$row}) {
464	foreach my $l (@{$row->{$k}}) {
465
466	# filter output
467	$l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
468
469	my $val;
470
471	# has identifiers?
472	($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
473
474	# has subfields?
475	if ($l =~ m/\^/) {
476	foreach my $t (split(/\^/,$l)) {
477	next if (! $t);
478	$val->{substr($t,0,1)} = substr($t,1);
479	}
480	} else {
481	$val = $l;
482	}
483
484	push @{$rec->{$k}}, $val;
485	}
486	}
487
488	return $rec;
489	}
490
491	=head2 tag_name
492
493	Return name of selected tag
494
495	print $isis->tag_name('200');
496
497	=cut
498
499	sub tag_name {
500	my $self = shift;
501	my $tag = shift \|\| return;
502	return $self->{'TagName'}->{$tag} \|\| $tag;
503	}
504
505
506	=head2 read_cnt
507
508	Read content of C<.CNT> file and return hash containing it.
509
510	print Dumper($isis->read_cnt);
511
512	This function is not used by module (C<.CNT> files are not required for this
513	module to work), but it can be useful to examine your index (while debugging
514	for example).
515
516	=cut
517
518	sub read_cnt {
519	my $self = shift;
520
521	croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
522
523	# Get the index information from $db.CNT
524
525	open(my $fileCNT, $self->{cnt_file}) \|\| croak "can't read '$self->{cnt_file}': $!";
526	binmode($fileCNT);
527
528	my $buff;
529
530	read($fileCNT, $buff, 26) \|\| croak "can't read first table from CNT: $!";
531	$self->unpack_cnt($buff);
532
533	read($fileCNT, $buff, 26) \|\| croak "can't read second table from CNT: $!";
534	$self->unpack_cnt($buff);
535
536	close($fileCNT);
537
538	return $self->{cnt};
539	}
540
541	=head2 unpack_cnt
542
543	Unpack one of two 26 bytes fixed length record in C<.CNT> file.
544
545	Here is definition of record:
546
547	off key description size
548	0: IDTYPE BTree type s
549	2: ORDN Nodes Order s
550	4: ORDF Leafs Order s
551	6: N Number of Memory buffers for nodes s
552	8: K Number of buffers for first level index s
553	10: LIV Current number of Index Levels s
554	12: POSRX Pointer to Root Record in N0x l
555	16: NMAXPOS Next Available position in N0x l
556	20: FMAXPOS Next available position in L0x l
557	24: ABNORMAL Formal BTree normality indicator s
558	length: 26 bytes
559
560	This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
561
562	=cut
563
564	sub unpack_cnt {
565	my $self = shift;
566
567	my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
568
569	my $buff = shift \|\| return;
570	my @arr = unpack("vvvvvvVVVv", $buff);
571
572	print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
573
574	my $IDTYPE = shift @arr;
575	foreach (@flds) {
576	$self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
577	}
578	}
579
580	1;
581
582	=head1 BUGS
583
584	Some parts of CDS/ISIS documentation are not detailed enough to exmplain
585	some variations in input databases which has been tested with this module.
586	When I was in doubt, I assumed that OpenIsis's implementation was right
587	(except for obvious bugs).
588
589	However, every effort has been made to test this module with as much
590	databases (and programs that create them) as possible.
591
592	I would be very greatful for success or failure reports about usage of this
593	module with databases from programs other than WinIsis and IsisMarc. I had
594	tested this against ouput of one C<isis.dll>-based application, but I don't
595	know any details about it's version.
596
597	=head1 AUTHOR
598
599	Dobrica Pavlinusic
600	CPAN ID: DPAVLIN
601	dpavlin@rot13.org
602	http://www.rot13.org/~dpavlin/
603
604	This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
605	written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
606
607	=head1 COPYRIGHT
608
609	This program is free software; you can redistribute
610	it and/or modify it under the same terms as Perl itself.
611
612	The full text of the license can be found in the
613	LICENSE file included with this module.
614
615
616	=head1 SEE ALSO
617
618	OpenIsis web site L<http://www.openisis.org>
619
620	perl4lib site L<http://perl4lib.perl.org>
621