/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/Biblio/Isis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 41 - (hide annotations)
Sat Mar 12 21:05:29 2005 UTC (19 years ago) by dpavlin
File size: 15013 byte(s)
better support for ISIS files with null pointers (it will warn and not die)

1 dpavlin 36 package Biblio::Isis;
2 dpavlin 1 use strict;
3    
4     use Carp;
5 dpavlin 18 use File::Glob qw(:globally :nocase);
6    
7 dpavlin 1 use Data::Dumper;
8    
9     BEGIN {
10     use Exporter ();
11     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 dpavlin 41 $VERSION = 0.13;
13 dpavlin 1 @ISA = qw (Exporter);
14     #Give a hoot don't pollute, do not export more than needed by default
15     @EXPORT = qw ();
16     @EXPORT_OK = qw ();
17     %EXPORT_TAGS = ();
18    
19     }
20    
21     =head1 NAME
22    
23 dpavlin 36 Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 36 use Biblio::Isis;
28 dpavlin 11
29 dpavlin 36 my $isis = new Biblio::Isis(
30 dpavlin 1 isisdb => './cds/cds',
31     );
32    
33 dpavlin 32 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
34 dpavlin 11 print $isis->to_ascii($mfn),"\n";
35     }
36    
37 dpavlin 1 =head1 DESCRIPTION
38    
39 dpavlin 15 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40 dpavlin 27 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
41     seems to depriciate it's old C<XS> bindings for perl.
42 dpavlin 1
43 dpavlin 15 It can create hash values from data in ISIS database (using C<to_hash>),
44     ASCII dump (using C<to_ascii>) or just hash with field names and packed
45     values (like C<^asomething^belse>).
46 dpavlin 11
47     Unique feature of this module is ability to C<include_deleted> records.
48     It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
49     fields which are zero sized will be filled with random junk from memory).
50    
51 dpavlin 15 It also has support for identifiers (only if ISIS database is created by
52     IsisMarc), see C<to_hash>.
53    
54 dpavlin 27 This module will always be slower than OpenIsis module which use C
55 dpavlin 15 library. However, since it's written in perl, it's platform independent (so
56     you don't need C compiler), and can be easily modified. I hope that it
57     creates data structures which are easier to use than ones created by
58     OpenIsis, so reduced time in other parts of the code should compensate for
59     slower performance of this module (speed of reading ISIS database is
60     rarely an issue).
61    
62 dpavlin 1 =head1 METHODS
63    
64     =cut
65    
66     # my $ORDN; # Nodes Order
67     # my $ORDF; # Leafs Order
68     # my $N; # Number of Memory buffers for nodes
69     # my $K; # Number of buffers for first level index
70     # my $LIV; # Current number of Index Levels
71     # my $POSRX; # Pointer to Root Record in N0x
72     # my $NMAXPOS; # Next Available position in N0x
73     # my $FMAXPOS; # Next available position in L0x
74     # my $ABNORMAL; # Formal BTree normality indicator
75    
76     #
77     # some binary reads
78     #
79    
80     =head2 new
81    
82 dpavlin 15 Open ISIS database
83 dpavlin 1
84 dpavlin 36 my $isis = new Biblio::Isis(
85 dpavlin 1 isisdb => './cds/cds',
86     read_fdt => 1,
87 dpavlin 12 include_deleted => 1,
88     hash_filter => sub {
89     my $v = shift;
90     $v =~ s#foo#bar#g;
91     },
92 dpavlin 1 debug => 1,
93     );
94    
95 dpavlin 2 Options are described below:
96    
97     =over 5
98    
99 dpavlin 1 =item isisdb
100    
101 dpavlin 15 This is full or relative path to ISIS database files which include
102 dpavlin 18 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
103     C<read_fdt> option) files.
104 dpavlin 1
105 dpavlin 15 In this example it uses C<./cds/cds.MST> and related files.
106    
107 dpavlin 1 =item read_fdt
108    
109     Boolean flag to specify if field definition table should be read. It's off
110     by default.
111    
112 dpavlin 9 =item include_deleted
113    
114 dpavlin 11 Don't skip logically deleted records in ISIS.
115 dpavlin 9
116 dpavlin 12 =item hash_filter
117    
118     Filter code ref which will be used before data is converted to hash.
119    
120     =item debug
121    
122     Dump a B<lot> of debugging output.
123    
124 dpavlin 2 =back
125    
126 dpavlin 1 =cut
127    
128     sub new {
129     my $class = shift;
130     my $self = {};
131     bless($self, $class);
132    
133 dpavlin 9 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
134 dpavlin 1
135 dpavlin 12 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
136 dpavlin 9 $self->{$v} = {@_}->{$v};
137     }
138 dpavlin 1
139 dpavlin 18 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
140    
141     foreach my $f (@isis_files) {
142     my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
143     $self->{lc($ext)."_file"} = $f;
144     }
145    
146     my @must_exist = qw(mst xrf);
147     push @must_exist, "fdt" if ($self->{read_fdt});
148    
149     foreach my $ext (@must_exist) {
150 dpavlin 39 unless ($self->{$ext."_file"}) {
151     carp "missing ",uc($ext)," file in ",$self->{isisdb};
152     return;
153     }
154 dpavlin 18 }
155    
156     print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
157    
158 dpavlin 1 # if you want to read .FDT file use read_fdt argument when creating class!
159 dpavlin 18 if ($self->{read_fdt} && -e $self->{fdt_file}) {
160 dpavlin 1
161     # read the $db.FDT file for tags
162     my $fieldzone=0;
163    
164 dpavlin 33 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
165     binmode($fileFDT);
166 dpavlin 1
167 dpavlin 33 while (<$fileFDT>) {
168 dpavlin 1 chomp;
169     if ($fieldzone) {
170     my $name=substr($_,0,30);
171     my $tag=substr($_,50,3);
172    
173     $name =~ s/\s+$//;
174     $tag =~ s/\s+$//;
175    
176     $self->{'TagName'}->{$tag}=$name;
177     }
178    
179     if (/^\*\*\*/) {
180     $fieldzone=1;
181     }
182     }
183    
184 dpavlin 33 close($fileFDT);
185 dpavlin 1 }
186    
187     # Get the Maximum MFN from $db.MST
188    
189 dpavlin 18 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
190 dpavlin 33 binmode($self->{'fileMST'});
191 dpavlin 1
192     # MST format: (* = 32 bit signed)
193     # CTLMFN* always 0
194     # NXTMFN* MFN to be assigned to the next record created
195     # NXTMFB* last block allocated to master file
196     # NXTMFP offset to next available position in last block
197     # MFTYPE always 0 for user db file (1 for system)
198 dpavlin 34 seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
199 dpavlin 1
200 dpavlin 11 my $buff;
201    
202 dpavlin 34 read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
203     $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
204 dpavlin 11
205 dpavlin 18 print STDERR Dumper($self),"\n" if ($self->{debug});
206    
207     # open files for later
208     open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
209 dpavlin 33 binmode($self->{'fileXRF'});
210 dpavlin 18
211     $self ? return $self : return undef;
212     }
213    
214 dpavlin 32 =head2 count
215    
216     Return number of records in database
217    
218     print $isis->count;
219    
220     =cut
221    
222     sub count {
223     my $self = shift;
224     return $self->{'NXTMFN'} - 1;
225     }
226    
227 dpavlin 7 =head2 fetch
228 dpavlin 1
229 dpavlin 2 Read record with selected MFN
230 dpavlin 1
231 dpavlin 7 my $rec = $isis->fetch(55);
232 dpavlin 2
233     Returns hash with keys which are field names and values are unpacked values
234 dpavlin 15 for that field like this:
235 dpavlin 2
236 dpavlin 15 $rec = {
237     '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
238     '990' => [ '2140', '88', 'HAY' ],
239     };
240    
241 dpavlin 2 =cut
242    
243 dpavlin 7 sub fetch {
244 dpavlin 1 my $self = shift;
245    
246 dpavlin 7 my $mfn = shift || croak "fetch needs MFN as argument!";
247 dpavlin 1
248 dpavlin 16 # is mfn allready in memory?
249     my $old_mfn = $self->{'current_mfn'} || -1;
250 dpavlin 25 return $self->{record} if ($mfn == $old_mfn);
251 dpavlin 1
252 dpavlin 16 print STDERR "## fetch: $mfn\n" if ($self->{debug});
253    
254 dpavlin 1 # XXX check this?
255     my $mfnpos=($mfn+int(($mfn-1)/127))*4;
256    
257 dpavlin 18 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
258 dpavlin 7 seek($self->{'fileXRF'},$mfnpos,0);
259 dpavlin 1
260 dpavlin 11 my $buff;
261    
262 dpavlin 25 # delete old record
263     delete $self->{record};
264    
265 dpavlin 1 # read XRFMFB abd XRFMFP
266 dpavlin 11 read($self->{'fileXRF'}, $buff, 4);
267 dpavlin 41 my $pointer=unpack("V",$buff);
268     if (! $pointer) {
269     if ($self->{include_deleted}) {
270     return;
271     } else {
272     warn "pointer for MFN $mfn is null\n";
273     return;
274     }
275     }
276 dpavlin 1
277 dpavlin 25 # check for logically deleted record
278 dpavlin 33 if ($pointer & 0x80000000) {
279 dpavlin 25 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
280     $self->{deleted} = $mfn;
281    
282     return unless $self->{include_deleted};
283    
284 dpavlin 33 # abs
285     $pointer = ($pointer ^ 0xffffffff) + 1;
286 dpavlin 25 }
287    
288 dpavlin 1 my $XRFMFB = int($pointer/2048);
289     my $XRFMFP = $pointer - ($XRFMFB*2048);
290    
291 dpavlin 16 # (XRFMFB - 1) * 512 + XRFMFP
292     # why do i have to do XRFMFP % 1024 ?
293 dpavlin 1
294 dpavlin 26 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
295 dpavlin 1
296 dpavlin 16 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
297 dpavlin 1
298     # Get Record Information
299    
300 dpavlin 33 seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
301 dpavlin 1
302 dpavlin 33 read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
303     my $value=unpack("V",$buff);
304 dpavlin 1
305 dpavlin 16 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
306    
307 dpavlin 1 if ($value!=$mfn) {
308 dpavlin 26 if ($value == 0) {
309     print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
310     $self->{deleted} = $mfn;
311     return;
312     }
313    
314     carp "Error: MFN ".$mfn." not found in MST file, found $value";
315     return;
316 dpavlin 1 }
317    
318 dpavlin 7 read($self->{'fileMST'}, $buff, 14);
319 dpavlin 1
320 dpavlin 33 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
321 dpavlin 1
322 dpavlin 16 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
323 dpavlin 1
324 dpavlin 25 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
325 dpavlin 9
326 dpavlin 16 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
327    
328 dpavlin 1 # Get Directory Format
329    
330     my @FieldPOS;
331     my @FieldLEN;
332     my @FieldTAG;
333    
334 dpavlin 8 read($self->{'fileMST'}, $buff, 6 * $NVF);
335    
336 dpavlin 16 my $rec_len = 0;
337 dpavlin 8
338 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
339    
340 dpavlin 33 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
341 dpavlin 1
342 dpavlin 16 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
343 dpavlin 1
344     # The TAG does not exists in .FDT so we set it to 0.
345     #
346     # XXX This is removed from perl version; .FDT file is updated manually, so
347     # you will often have fields in .MST file which aren't in .FDT. On the other
348     # hand, IsisMarc doesn't use .FDT files at all!
349    
350     #if (! $self->{TagName}->{$TAG}) {
351     # $TAG=0;
352     #}
353    
354     push @FieldTAG,$TAG;
355     push @FieldPOS,$POS;
356     push @FieldLEN,$LEN;
357 dpavlin 8
358 dpavlin 16 $rec_len += $LEN;
359 dpavlin 1 }
360    
361     # Get Variable Fields
362    
363 dpavlin 16 read($self->{'fileMST'},$buff,$rec_len);
364 dpavlin 8
365 dpavlin 16 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
366    
367 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
368 dpavlin 10 # skip zero-sized fields
369     next if ($FieldLEN[$i] == 0);
370    
371 dpavlin 8 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
372 dpavlin 1 }
373    
374 dpavlin 16 $self->{'current_mfn'} = $mfn;
375    
376 dpavlin 25 print STDERR Dumper($self),"\n" if ($self->{debug});
377 dpavlin 1
378 dpavlin 2 return $self->{'record'};
379 dpavlin 1 }
380    
381 dpavlin 2 =head2 to_ascii
382    
383 dpavlin 27 Returns ASCII output of record with specified MFN
384 dpavlin 2
385 dpavlin 15 print $isis->to_ascii(42);
386 dpavlin 2
387 dpavlin 27 This outputs something like this:
388 dpavlin 15
389     210 ^aNew York^cNew York University press^dcop. 1988
390     990 2140
391     990 88
392     990 HAY
393    
394     If C<read_fdt> is specified when calling C<new> it will display field names
395     from C<.FDT> file instead of numeric tags.
396    
397 dpavlin 2 =cut
398    
399     sub to_ascii {
400     my $self = shift;
401    
402     my $mfn = shift || croak "need MFN";
403    
404 dpavlin 41 my $rec = $self->fetch($mfn) || return;
405 dpavlin 2
406     my $out = "0\t$mfn";
407    
408     foreach my $f (sort keys %{$rec}) {
409 dpavlin 15 my $fn = $self->tag_name($f);
410     $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
411 dpavlin 2 }
412    
413     $out .= "\n";
414    
415     return $out;
416     }
417    
418 dpavlin 12 =head2 to_hash
419    
420 dpavlin 15 Read record with specified MFN and convert it to hash
421 dpavlin 12
422     my $hash = $isis->to_hash($mfn);
423    
424 dpavlin 27 It has ability to convert characters (using C<hash_filter>) from ISIS
425 dpavlin 15 database before creating structures enabling character re-mapping or quick
426     fix-up of data.
427 dpavlin 12
428     This function returns hash which is like this:
429    
430     $hash = {
431     '210' => [
432     {
433     'c' => 'New York University press',
434     'a' => 'New York',
435     'd' => 'cop. 1988'
436     }
437     ],
438     '990' => [
439     '2140',
440     '88',
441     'HAY'
442     ],
443     };
444    
445 dpavlin 15 You can later use that hash to produce any output from ISIS data.
446 dpavlin 12
447 dpavlin 15 If database is created using IsisMarc, it will also have to special fields
448     which will be used for identifiers, C<i1> and C<i2> like this:
449    
450     '200' => [
451     {
452     'i1' => '1',
453     'i2' => ' '
454     'a' => 'Goa',
455     'f' => 'Valdo D\'Arienzo',
456     'e' => 'tipografie e tipografi nel XVI secolo',
457     }
458     ],
459    
460     This method will also create additional field C<000> with MFN.
461    
462 dpavlin 12 =cut
463    
464     sub to_hash {
465     my $self = shift;
466    
467     my $mfn = shift || confess "need mfn!";
468    
469 dpavlin 15 # init record to include MFN as field 000
470 dpavlin 16 my $rec = { '000' => [ $mfn ] };
471 dpavlin 15
472 dpavlin 41 my $row = $self->fetch($mfn) || return;
473 dpavlin 12
474     foreach my $k (keys %{$row}) {
475     foreach my $l (@{$row->{$k}}) {
476    
477     # filter output
478     $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
479    
480 dpavlin 15 my $val;
481    
482     # has identifiers?
483 dpavlin 23 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
484 dpavlin 15
485 dpavlin 12 # has subfields?
486     if ($l =~ m/\^/) {
487     foreach my $t (split(/\^/,$l)) {
488     next if (! $t);
489     $val->{substr($t,0,1)} = substr($t,1);
490     }
491     } else {
492     $val = $l;
493     }
494    
495     push @{$rec->{$k}}, $val;
496     }
497     }
498    
499     return $rec;
500     }
501    
502 dpavlin 15 =head2 tag_name
503 dpavlin 1
504 dpavlin 15 Return name of selected tag
505 dpavlin 1
506 dpavlin 15 print $isis->tag_name('200');
507    
508     =cut
509    
510     sub tag_name {
511 dpavlin 1 my $self = shift;
512 dpavlin 15 my $tag = shift || return;
513     return $self->{'TagName'}->{$tag} || $tag;
514 dpavlin 1 }
515    
516 dpavlin 35
517     =head2 read_cnt
518    
519     Read content of C<.CNT> file and return hash containing it.
520    
521     print Dumper($isis->read_cnt);
522    
523     This function is not used by module (C<.CNT> files are not required for this
524     module to work), but it can be useful to examine your index (while debugging
525     for example).
526    
527     =cut
528    
529     sub read_cnt {
530     my $self = shift;
531    
532     croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
533    
534     # Get the index information from $db.CNT
535    
536     open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
537     binmode($fileCNT);
538    
539     my $buff;
540    
541     read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
542     $self->unpack_cnt($buff);
543    
544     read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
545     $self->unpack_cnt($buff);
546    
547     close($fileCNT);
548    
549     return $self->{cnt};
550     }
551    
552     =head2 unpack_cnt
553    
554     Unpack one of two 26 bytes fixed length record in C<.CNT> file.
555    
556     Here is definition of record:
557    
558     off key description size
559     0: IDTYPE BTree type s
560     2: ORDN Nodes Order s
561     4: ORDF Leafs Order s
562     6: N Number of Memory buffers for nodes s
563     8: K Number of buffers for first level index s
564     10: LIV Current number of Index Levels s
565     12: POSRX Pointer to Root Record in N0x l
566     16: NMAXPOS Next Available position in N0x l
567     20: FMAXPOS Next available position in L0x l
568     24: ABNORMAL Formal BTree normality indicator s
569     length: 26 bytes
570    
571     This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
572    
573     =cut
574    
575     sub unpack_cnt {
576     my $self = shift;
577    
578     my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
579    
580     my $buff = shift || return;
581     my @arr = unpack("vvvvvvVVVv", $buff);
582    
583     print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
584    
585     my $IDTYPE = shift @arr;
586     foreach (@flds) {
587     $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
588     }
589     }
590    
591 dpavlin 1 1;
592    
593     =head1 BUGS
594    
595 dpavlin 27 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
596     some variations in input databases which has been tested with this module.
597     When I was in doubt, I assumed that OpenIsis's implementation was right
598     (except for obvious bugs).
599 dpavlin 1
600 dpavlin 27 However, every effort has been made to test this module with as much
601     databases (and programs that create them) as possible.
602    
603     I would be very greatful for success or failure reports about usage of this
604     module with databases from programs other than WinIsis and IsisMarc. I had
605     tested this against ouput of one C<isis.dll>-based application, but I don't
606     know any details about it's version.
607    
608 dpavlin 1 =head1 AUTHOR
609    
610     Dobrica Pavlinusic
611     CPAN ID: DPAVLIN
612     dpavlin@rot13.org
613     http://www.rot13.org/~dpavlin/
614    
615 dpavlin 15 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
616     written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
617 dpavlin 1
618     =head1 COPYRIGHT
619    
620     This program is free software; you can redistribute
621     it and/or modify it under the same terms as Perl itself.
622    
623     The full text of the license can be found in the
624     LICENSE file included with this module.
625    
626    
627     =head1 SEE ALSO
628    
629 dpavlin 15 OpenIsis web site L<http://www.openisis.org>
630 dpavlin 1
631 dpavlin 15 perl4lib site L<http://perl4lib.perl.org>
632    

  ViewVC Help
Powered by ViewVC 1.1.26