/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/Biblio/Isis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 35 - (hide annotations)
Thu Jan 6 16:27:07 2005 UTC (19 years, 2 months ago) by dpavlin
Original Path: trunk/IsisDB.pm
File size: 14836 byte(s)
moved *_cnt function to end of module (so that documetation ends up at end)

1 dpavlin 1 package IsisDB;
2     use strict;
3    
4     use Carp;
5 dpavlin 18 use File::Glob qw(:globally :nocase);
6    
7 dpavlin 1 use Data::Dumper;
8    
9     BEGIN {
10     use Exporter ();
11     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 dpavlin 32 $VERSION = 0.09;
13 dpavlin 1 @ISA = qw (Exporter);
14     #Give a hoot don't pollute, do not export more than needed by default
15     @EXPORT = qw ();
16     @EXPORT_OK = qw ();
17     %EXPORT_TAGS = ();
18    
19     }
20    
21     =head1 NAME
22    
23 dpavlin 15 IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 11 use IsisDB;
28    
29 dpavlin 1 my $isis = new IsisDB(
30     isisdb => './cds/cds',
31     );
32    
33 dpavlin 32 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
34 dpavlin 11 print $isis->to_ascii($mfn),"\n";
35     }
36    
37 dpavlin 1 =head1 DESCRIPTION
38    
39 dpavlin 15 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40 dpavlin 27 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
41     seems to depriciate it's old C<XS> bindings for perl.
42 dpavlin 1
43 dpavlin 15 It can create hash values from data in ISIS database (using C<to_hash>),
44     ASCII dump (using C<to_ascii>) or just hash with field names and packed
45     values (like C<^asomething^belse>).
46 dpavlin 11
47     Unique feature of this module is ability to C<include_deleted> records.
48     It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
49     fields which are zero sized will be filled with random junk from memory).
50    
51 dpavlin 15 It also has support for identifiers (only if ISIS database is created by
52     IsisMarc), see C<to_hash>.
53    
54 dpavlin 27 This module will always be slower than OpenIsis module which use C
55 dpavlin 15 library. However, since it's written in perl, it's platform independent (so
56     you don't need C compiler), and can be easily modified. I hope that it
57     creates data structures which are easier to use than ones created by
58     OpenIsis, so reduced time in other parts of the code should compensate for
59     slower performance of this module (speed of reading ISIS database is
60     rarely an issue).
61    
62 dpavlin 1 =head1 METHODS
63    
64     =cut
65    
66     # my $ORDN; # Nodes Order
67     # my $ORDF; # Leafs Order
68     # my $N; # Number of Memory buffers for nodes
69     # my $K; # Number of buffers for first level index
70     # my $LIV; # Current number of Index Levels
71     # my $POSRX; # Pointer to Root Record in N0x
72     # my $NMAXPOS; # Next Available position in N0x
73     # my $FMAXPOS; # Next available position in L0x
74     # my $ABNORMAL; # Formal BTree normality indicator
75    
76     #
77     # some binary reads
78     #
79    
80     =head2 new
81    
82 dpavlin 15 Open ISIS database
83 dpavlin 1
84     my $isis = new IsisDB(
85     isisdb => './cds/cds',
86     read_fdt => 1,
87 dpavlin 12 include_deleted => 1,
88     hash_filter => sub {
89     my $v = shift;
90     $v =~ s#foo#bar#g;
91     },
92 dpavlin 1 debug => 1,
93     );
94    
95 dpavlin 2 Options are described below:
96    
97     =over 5
98    
99 dpavlin 1 =item isisdb
100    
101 dpavlin 15 This is full or relative path to ISIS database files which include
102 dpavlin 18 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
103     C<read_fdt> option) files.
104 dpavlin 1
105 dpavlin 15 In this example it uses C<./cds/cds.MST> and related files.
106    
107 dpavlin 1 =item read_fdt
108    
109     Boolean flag to specify if field definition table should be read. It's off
110     by default.
111    
112 dpavlin 9 =item include_deleted
113    
114 dpavlin 11 Don't skip logically deleted records in ISIS.
115 dpavlin 9
116 dpavlin 12 =item hash_filter
117    
118     Filter code ref which will be used before data is converted to hash.
119    
120     =item debug
121    
122     Dump a B<lot> of debugging output.
123    
124 dpavlin 2 =back
125    
126 dpavlin 1 =cut
127    
128     sub new {
129     my $class = shift;
130     my $self = {};
131     bless($self, $class);
132    
133 dpavlin 9 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
134 dpavlin 1
135 dpavlin 12 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
136 dpavlin 9 $self->{$v} = {@_}->{$v};
137     }
138 dpavlin 1
139 dpavlin 18 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
140    
141     foreach my $f (@isis_files) {
142     my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
143     $self->{lc($ext)."_file"} = $f;
144     }
145    
146     my @must_exist = qw(mst xrf);
147     push @must_exist, "fdt" if ($self->{read_fdt});
148    
149     foreach my $ext (@must_exist) {
150 dpavlin 19 croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
151 dpavlin 18 }
152    
153     print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
154    
155 dpavlin 1 # if you want to read .FDT file use read_fdt argument when creating class!
156 dpavlin 18 if ($self->{read_fdt} && -e $self->{fdt_file}) {
157 dpavlin 1
158     # read the $db.FDT file for tags
159     my $fieldzone=0;
160    
161 dpavlin 33 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
162     binmode($fileFDT);
163 dpavlin 1
164 dpavlin 33 while (<$fileFDT>) {
165 dpavlin 1 chomp;
166     if ($fieldzone) {
167     my $name=substr($_,0,30);
168     my $tag=substr($_,50,3);
169    
170     $name =~ s/\s+$//;
171     $tag =~ s/\s+$//;
172    
173     $self->{'TagName'}->{$tag}=$name;
174     }
175    
176     if (/^\*\*\*/) {
177     $fieldzone=1;
178     }
179     }
180    
181 dpavlin 33 close($fileFDT);
182 dpavlin 1 }
183    
184     # Get the Maximum MFN from $db.MST
185    
186 dpavlin 18 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
187 dpavlin 33 binmode($self->{'fileMST'});
188 dpavlin 1
189     # MST format: (* = 32 bit signed)
190     # CTLMFN* always 0
191     # NXTMFN* MFN to be assigned to the next record created
192     # NXTMFB* last block allocated to master file
193     # NXTMFP offset to next available position in last block
194     # MFTYPE always 0 for user db file (1 for system)
195 dpavlin 34 seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
196 dpavlin 1
197 dpavlin 11 my $buff;
198    
199 dpavlin 34 read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
200     $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
201 dpavlin 11
202 dpavlin 18 print STDERR Dumper($self),"\n" if ($self->{debug});
203    
204     # open files for later
205     open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
206 dpavlin 33 binmode($self->{'fileXRF'});
207 dpavlin 18
208     $self ? return $self : return undef;
209     }
210    
211 dpavlin 32 =head2 count
212    
213     Return number of records in database
214    
215     print $isis->count;
216    
217     =cut
218    
219     sub count {
220     my $self = shift;
221     return $self->{'NXTMFN'} - 1;
222     }
223    
224 dpavlin 7 =head2 fetch
225 dpavlin 1
226 dpavlin 2 Read record with selected MFN
227 dpavlin 1
228 dpavlin 7 my $rec = $isis->fetch(55);
229 dpavlin 2
230     Returns hash with keys which are field names and values are unpacked values
231 dpavlin 15 for that field like this:
232 dpavlin 2
233 dpavlin 15 $rec = {
234     '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
235     '990' => [ '2140', '88', 'HAY' ],
236     };
237    
238 dpavlin 2 =cut
239    
240 dpavlin 7 sub fetch {
241 dpavlin 1 my $self = shift;
242    
243 dpavlin 7 my $mfn = shift || croak "fetch needs MFN as argument!";
244 dpavlin 1
245 dpavlin 16 # is mfn allready in memory?
246     my $old_mfn = $self->{'current_mfn'} || -1;
247 dpavlin 25 return $self->{record} if ($mfn == $old_mfn);
248 dpavlin 1
249 dpavlin 16 print STDERR "## fetch: $mfn\n" if ($self->{debug});
250    
251 dpavlin 1 # XXX check this?
252     my $mfnpos=($mfn+int(($mfn-1)/127))*4;
253    
254 dpavlin 18 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
255 dpavlin 7 seek($self->{'fileXRF'},$mfnpos,0);
256 dpavlin 1
257 dpavlin 11 my $buff;
258    
259 dpavlin 25 # delete old record
260     delete $self->{record};
261    
262 dpavlin 1 # read XRFMFB abd XRFMFP
263 dpavlin 11 read($self->{'fileXRF'}, $buff, 4);
264 dpavlin 34 my $pointer=unpack("V",$buff) || croak "pointer is null";
265 dpavlin 1
266 dpavlin 25 # check for logically deleted record
267 dpavlin 33 if ($pointer & 0x80000000) {
268 dpavlin 25 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
269     $self->{deleted} = $mfn;
270    
271     return unless $self->{include_deleted};
272    
273 dpavlin 33 # abs
274     $pointer = ($pointer ^ 0xffffffff) + 1;
275 dpavlin 25 }
276    
277 dpavlin 1 my $XRFMFB = int($pointer/2048);
278     my $XRFMFP = $pointer - ($XRFMFB*2048);
279    
280 dpavlin 16 # (XRFMFB - 1) * 512 + XRFMFP
281     # why do i have to do XRFMFP % 1024 ?
282 dpavlin 1
283 dpavlin 26 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
284 dpavlin 1
285 dpavlin 16 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
286 dpavlin 1
287     # Get Record Information
288    
289 dpavlin 33 seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
290 dpavlin 1
291 dpavlin 33 read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
292     my $value=unpack("V",$buff);
293 dpavlin 1
294 dpavlin 16 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
295    
296 dpavlin 1 if ($value!=$mfn) {
297 dpavlin 26 if ($value == 0) {
298     print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
299     $self->{deleted} = $mfn;
300     return;
301     }
302    
303     carp "Error: MFN ".$mfn." not found in MST file, found $value";
304     return;
305 dpavlin 1 }
306    
307 dpavlin 7 read($self->{'fileMST'}, $buff, 14);
308 dpavlin 1
309 dpavlin 33 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
310 dpavlin 1
311 dpavlin 16 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
312 dpavlin 1
313 dpavlin 25 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
314 dpavlin 9
315 dpavlin 16 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
316    
317 dpavlin 1 # Get Directory Format
318    
319     my @FieldPOS;
320     my @FieldLEN;
321     my @FieldTAG;
322    
323 dpavlin 8 read($self->{'fileMST'}, $buff, 6 * $NVF);
324    
325 dpavlin 16 my $rec_len = 0;
326 dpavlin 8
327 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
328    
329 dpavlin 33 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
330 dpavlin 1
331 dpavlin 16 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
332 dpavlin 1
333     # The TAG does not exists in .FDT so we set it to 0.
334     #
335     # XXX This is removed from perl version; .FDT file is updated manually, so
336     # you will often have fields in .MST file which aren't in .FDT. On the other
337     # hand, IsisMarc doesn't use .FDT files at all!
338    
339     #if (! $self->{TagName}->{$TAG}) {
340     # $TAG=0;
341     #}
342    
343     push @FieldTAG,$TAG;
344     push @FieldPOS,$POS;
345     push @FieldLEN,$LEN;
346 dpavlin 8
347 dpavlin 16 $rec_len += $LEN;
348 dpavlin 1 }
349    
350     # Get Variable Fields
351    
352 dpavlin 16 read($self->{'fileMST'},$buff,$rec_len);
353 dpavlin 8
354 dpavlin 16 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
355    
356 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
357 dpavlin 10 # skip zero-sized fields
358     next if ($FieldLEN[$i] == 0);
359    
360 dpavlin 8 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
361 dpavlin 1 }
362    
363 dpavlin 16 $self->{'current_mfn'} = $mfn;
364    
365 dpavlin 25 print STDERR Dumper($self),"\n" if ($self->{debug});
366 dpavlin 1
367 dpavlin 2 return $self->{'record'};
368 dpavlin 1 }
369    
370 dpavlin 2 =head2 to_ascii
371    
372 dpavlin 27 Returns ASCII output of record with specified MFN
373 dpavlin 2
374 dpavlin 15 print $isis->to_ascii(42);
375 dpavlin 2
376 dpavlin 27 This outputs something like this:
377 dpavlin 15
378     210 ^aNew York^cNew York University press^dcop. 1988
379     990 2140
380     990 88
381     990 HAY
382    
383     If C<read_fdt> is specified when calling C<new> it will display field names
384     from C<.FDT> file instead of numeric tags.
385    
386 dpavlin 2 =cut
387    
388     sub to_ascii {
389     my $self = shift;
390    
391     my $mfn = shift || croak "need MFN";
392    
393 dpavlin 7 my $rec = $self->fetch($mfn);
394 dpavlin 2
395     my $out = "0\t$mfn";
396    
397     foreach my $f (sort keys %{$rec}) {
398 dpavlin 15 my $fn = $self->tag_name($f);
399     $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
400 dpavlin 2 }
401    
402     $out .= "\n";
403    
404     return $out;
405     }
406    
407 dpavlin 12 =head2 to_hash
408    
409 dpavlin 15 Read record with specified MFN and convert it to hash
410 dpavlin 12
411     my $hash = $isis->to_hash($mfn);
412    
413 dpavlin 27 It has ability to convert characters (using C<hash_filter>) from ISIS
414 dpavlin 15 database before creating structures enabling character re-mapping or quick
415     fix-up of data.
416 dpavlin 12
417     This function returns hash which is like this:
418    
419     $hash = {
420     '210' => [
421     {
422     'c' => 'New York University press',
423     'a' => 'New York',
424     'd' => 'cop. 1988'
425     }
426     ],
427     '990' => [
428     '2140',
429     '88',
430     'HAY'
431     ],
432     };
433    
434 dpavlin 15 You can later use that hash to produce any output from ISIS data.
435 dpavlin 12
436 dpavlin 15 If database is created using IsisMarc, it will also have to special fields
437     which will be used for identifiers, C<i1> and C<i2> like this:
438    
439     '200' => [
440     {
441     'i1' => '1',
442     'i2' => ' '
443     'a' => 'Goa',
444     'f' => 'Valdo D\'Arienzo',
445     'e' => 'tipografie e tipografi nel XVI secolo',
446     }
447     ],
448    
449     This method will also create additional field C<000> with MFN.
450    
451 dpavlin 12 =cut
452    
453     sub to_hash {
454     my $self = shift;
455    
456     my $mfn = shift || confess "need mfn!";
457    
458 dpavlin 15 # init record to include MFN as field 000
459 dpavlin 16 my $rec = { '000' => [ $mfn ] };
460 dpavlin 15
461 dpavlin 12 my $row = $self->fetch($mfn);
462    
463     foreach my $k (keys %{$row}) {
464     foreach my $l (@{$row->{$k}}) {
465    
466     # filter output
467     $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
468    
469 dpavlin 15 my $val;
470    
471     # has identifiers?
472 dpavlin 23 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
473 dpavlin 15
474 dpavlin 12 # has subfields?
475     if ($l =~ m/\^/) {
476     foreach my $t (split(/\^/,$l)) {
477     next if (! $t);
478     $val->{substr($t,0,1)} = substr($t,1);
479     }
480     } else {
481     $val = $l;
482     }
483    
484     push @{$rec->{$k}}, $val;
485     }
486     }
487    
488     return $rec;
489     }
490    
491 dpavlin 15 =head2 tag_name
492 dpavlin 1
493 dpavlin 15 Return name of selected tag
494 dpavlin 1
495 dpavlin 15 print $isis->tag_name('200');
496    
497     =cut
498    
499     sub tag_name {
500 dpavlin 1 my $self = shift;
501 dpavlin 15 my $tag = shift || return;
502     return $self->{'TagName'}->{$tag} || $tag;
503 dpavlin 1 }
504    
505 dpavlin 35
506     =head2 read_cnt
507    
508     Read content of C<.CNT> file and return hash containing it.
509    
510     print Dumper($isis->read_cnt);
511    
512     This function is not used by module (C<.CNT> files are not required for this
513     module to work), but it can be useful to examine your index (while debugging
514     for example).
515    
516     =cut
517    
518     sub read_cnt {
519     my $self = shift;
520    
521     croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
522    
523     # Get the index information from $db.CNT
524    
525     open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
526     binmode($fileCNT);
527    
528     my $buff;
529    
530     read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
531     $self->unpack_cnt($buff);
532    
533     read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
534     $self->unpack_cnt($buff);
535    
536     close($fileCNT);
537    
538     return $self->{cnt};
539     }
540    
541     =head2 unpack_cnt
542    
543     Unpack one of two 26 bytes fixed length record in C<.CNT> file.
544    
545     Here is definition of record:
546    
547     off key description size
548     0: IDTYPE BTree type s
549     2: ORDN Nodes Order s
550     4: ORDF Leafs Order s
551     6: N Number of Memory buffers for nodes s
552     8: K Number of buffers for first level index s
553     10: LIV Current number of Index Levels s
554     12: POSRX Pointer to Root Record in N0x l
555     16: NMAXPOS Next Available position in N0x l
556     20: FMAXPOS Next available position in L0x l
557     24: ABNORMAL Formal BTree normality indicator s
558     length: 26 bytes
559    
560     This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
561    
562     =cut
563    
564     sub unpack_cnt {
565     my $self = shift;
566    
567     my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
568    
569     my $buff = shift || return;
570     my @arr = unpack("vvvvvvVVVv", $buff);
571    
572     print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
573    
574     my $IDTYPE = shift @arr;
575     foreach (@flds) {
576     $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
577     }
578     }
579    
580 dpavlin 1 1;
581    
582     =head1 BUGS
583    
584 dpavlin 27 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
585     some variations in input databases which has been tested with this module.
586     When I was in doubt, I assumed that OpenIsis's implementation was right
587     (except for obvious bugs).
588 dpavlin 1
589 dpavlin 27 However, every effort has been made to test this module with as much
590     databases (and programs that create them) as possible.
591    
592     I would be very greatful for success or failure reports about usage of this
593     module with databases from programs other than WinIsis and IsisMarc. I had
594     tested this against ouput of one C<isis.dll>-based application, but I don't
595     know any details about it's version.
596    
597 dpavlin 1 =head1 AUTHOR
598    
599     Dobrica Pavlinusic
600     CPAN ID: DPAVLIN
601     dpavlin@rot13.org
602     http://www.rot13.org/~dpavlin/
603    
604 dpavlin 15 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
605     written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
606 dpavlin 1
607     =head1 COPYRIGHT
608    
609     This program is free software; you can redistribute
610     it and/or modify it under the same terms as Perl itself.
611    
612     The full text of the license can be found in the
613     LICENSE file included with this module.
614    
615    
616     =head1 SEE ALSO
617    
618 dpavlin 15 OpenIsis web site L<http://www.openisis.org>
619 dpavlin 1
620 dpavlin 15 perl4lib site L<http://perl4lib.perl.org>
621    

  ViewVC Help
Powered by ViewVC 1.1.26