/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/Biblio/Isis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 39 - (hide annotations)
Thu Jan 27 22:01:17 2005 UTC (19 years, 2 months ago) by dpavlin
File size: 14885 byte(s)
carp and not croak if MST or XRF file isn't found (calling program will
receive undef from new and warning will be issued).

1 dpavlin 36 package Biblio::Isis;
2 dpavlin 1 use strict;
3    
4     use Carp;
5 dpavlin 18 use File::Glob qw(:globally :nocase);
6    
7 dpavlin 1 use Data::Dumper;
8    
9     BEGIN {
10     use Exporter ();
11     use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 dpavlin 39 $VERSION = 0.12;
13 dpavlin 1 @ISA = qw (Exporter);
14     #Give a hoot don't pollute, do not export more than needed by default
15     @EXPORT = qw ();
16     @EXPORT_OK = qw ();
17     %EXPORT_TAGS = ();
18    
19     }
20    
21     =head1 NAME
22    
23 dpavlin 36 Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 36 use Biblio::Isis;
28 dpavlin 11
29 dpavlin 36 my $isis = new Biblio::Isis(
30 dpavlin 1 isisdb => './cds/cds',
31     );
32    
33 dpavlin 32 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
34 dpavlin 11 print $isis->to_ascii($mfn),"\n";
35     }
36    
37 dpavlin 1 =head1 DESCRIPTION
38    
39 dpavlin 15 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40 dpavlin 27 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
41     seems to depriciate it's old C<XS> bindings for perl.
42 dpavlin 1
43 dpavlin 15 It can create hash values from data in ISIS database (using C<to_hash>),
44     ASCII dump (using C<to_ascii>) or just hash with field names and packed
45     values (like C<^asomething^belse>).
46 dpavlin 11
47     Unique feature of this module is ability to C<include_deleted> records.
48     It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
49     fields which are zero sized will be filled with random junk from memory).
50    
51 dpavlin 15 It also has support for identifiers (only if ISIS database is created by
52     IsisMarc), see C<to_hash>.
53    
54 dpavlin 27 This module will always be slower than OpenIsis module which use C
55 dpavlin 15 library. However, since it's written in perl, it's platform independent (so
56     you don't need C compiler), and can be easily modified. I hope that it
57     creates data structures which are easier to use than ones created by
58     OpenIsis, so reduced time in other parts of the code should compensate for
59     slower performance of this module (speed of reading ISIS database is
60     rarely an issue).
61    
62 dpavlin 1 =head1 METHODS
63    
64     =cut
65    
66     # my $ORDN; # Nodes Order
67     # my $ORDF; # Leafs Order
68     # my $N; # Number of Memory buffers for nodes
69     # my $K; # Number of buffers for first level index
70     # my $LIV; # Current number of Index Levels
71     # my $POSRX; # Pointer to Root Record in N0x
72     # my $NMAXPOS; # Next Available position in N0x
73     # my $FMAXPOS; # Next available position in L0x
74     # my $ABNORMAL; # Formal BTree normality indicator
75    
76     #
77     # some binary reads
78     #
79    
80     =head2 new
81    
82 dpavlin 15 Open ISIS database
83 dpavlin 1
84 dpavlin 36 my $isis = new Biblio::Isis(
85 dpavlin 1 isisdb => './cds/cds',
86     read_fdt => 1,
87 dpavlin 12 include_deleted => 1,
88     hash_filter => sub {
89     my $v = shift;
90     $v =~ s#foo#bar#g;
91     },
92 dpavlin 1 debug => 1,
93     );
94    
95 dpavlin 2 Options are described below:
96    
97     =over 5
98    
99 dpavlin 1 =item isisdb
100    
101 dpavlin 15 This is full or relative path to ISIS database files which include
102 dpavlin 18 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
103     C<read_fdt> option) files.
104 dpavlin 1
105 dpavlin 15 In this example it uses C<./cds/cds.MST> and related files.
106    
107 dpavlin 1 =item read_fdt
108    
109     Boolean flag to specify if field definition table should be read. It's off
110     by default.
111    
112 dpavlin 9 =item include_deleted
113    
114 dpavlin 11 Don't skip logically deleted records in ISIS.
115 dpavlin 9
116 dpavlin 12 =item hash_filter
117    
118     Filter code ref which will be used before data is converted to hash.
119    
120     =item debug
121    
122     Dump a B<lot> of debugging output.
123    
124 dpavlin 2 =back
125    
126 dpavlin 1 =cut
127    
128     sub new {
129     my $class = shift;
130     my $self = {};
131     bless($self, $class);
132    
133 dpavlin 9 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
134 dpavlin 1
135 dpavlin 12 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
136 dpavlin 9 $self->{$v} = {@_}->{$v};
137     }
138 dpavlin 1
139 dpavlin 18 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
140    
141     foreach my $f (@isis_files) {
142     my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
143     $self->{lc($ext)."_file"} = $f;
144     }
145    
146     my @must_exist = qw(mst xrf);
147     push @must_exist, "fdt" if ($self->{read_fdt});
148    
149     foreach my $ext (@must_exist) {
150 dpavlin 39 unless ($self->{$ext."_file"}) {
151     carp "missing ",uc($ext)," file in ",$self->{isisdb};
152     return;
153     }
154 dpavlin 18 }
155    
156     print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
157    
158 dpavlin 1 # if you want to read .FDT file use read_fdt argument when creating class!
159 dpavlin 18 if ($self->{read_fdt} && -e $self->{fdt_file}) {
160 dpavlin 1
161     # read the $db.FDT file for tags
162     my $fieldzone=0;
163    
164 dpavlin 33 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
165     binmode($fileFDT);
166 dpavlin 1
167 dpavlin 33 while (<$fileFDT>) {
168 dpavlin 1 chomp;
169     if ($fieldzone) {
170     my $name=substr($_,0,30);
171     my $tag=substr($_,50,3);
172    
173     $name =~ s/\s+$//;
174     $tag =~ s/\s+$//;
175    
176     $self->{'TagName'}->{$tag}=$name;
177     }
178    
179     if (/^\*\*\*/) {
180     $fieldzone=1;
181     }
182     }
183    
184 dpavlin 33 close($fileFDT);
185 dpavlin 1 }
186    
187     # Get the Maximum MFN from $db.MST
188    
189 dpavlin 18 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
190 dpavlin 33 binmode($self->{'fileMST'});
191 dpavlin 1
192     # MST format: (* = 32 bit signed)
193     # CTLMFN* always 0
194     # NXTMFN* MFN to be assigned to the next record created
195     # NXTMFB* last block allocated to master file
196     # NXTMFP offset to next available position in last block
197     # MFTYPE always 0 for user db file (1 for system)
198 dpavlin 34 seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
199 dpavlin 1
200 dpavlin 11 my $buff;
201    
202 dpavlin 34 read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
203     $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
204 dpavlin 11
205 dpavlin 18 print STDERR Dumper($self),"\n" if ($self->{debug});
206    
207     # open files for later
208     open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
209 dpavlin 33 binmode($self->{'fileXRF'});
210 dpavlin 18
211     $self ? return $self : return undef;
212     }
213    
214 dpavlin 32 =head2 count
215    
216     Return number of records in database
217    
218     print $isis->count;
219    
220     =cut
221    
222     sub count {
223     my $self = shift;
224     return $self->{'NXTMFN'} - 1;
225     }
226    
227 dpavlin 7 =head2 fetch
228 dpavlin 1
229 dpavlin 2 Read record with selected MFN
230 dpavlin 1
231 dpavlin 7 my $rec = $isis->fetch(55);
232 dpavlin 2
233     Returns hash with keys which are field names and values are unpacked values
234 dpavlin 15 for that field like this:
235 dpavlin 2
236 dpavlin 15 $rec = {
237     '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
238     '990' => [ '2140', '88', 'HAY' ],
239     };
240    
241 dpavlin 2 =cut
242    
243 dpavlin 7 sub fetch {
244 dpavlin 1 my $self = shift;
245    
246 dpavlin 7 my $mfn = shift || croak "fetch needs MFN as argument!";
247 dpavlin 1
248 dpavlin 16 # is mfn allready in memory?
249     my $old_mfn = $self->{'current_mfn'} || -1;
250 dpavlin 25 return $self->{record} if ($mfn == $old_mfn);
251 dpavlin 1
252 dpavlin 16 print STDERR "## fetch: $mfn\n" if ($self->{debug});
253    
254 dpavlin 1 # XXX check this?
255     my $mfnpos=($mfn+int(($mfn-1)/127))*4;
256    
257 dpavlin 18 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
258 dpavlin 7 seek($self->{'fileXRF'},$mfnpos,0);
259 dpavlin 1
260 dpavlin 11 my $buff;
261    
262 dpavlin 25 # delete old record
263     delete $self->{record};
264    
265 dpavlin 1 # read XRFMFB abd XRFMFP
266 dpavlin 11 read($self->{'fileXRF'}, $buff, 4);
267 dpavlin 34 my $pointer=unpack("V",$buff) || croak "pointer is null";
268 dpavlin 1
269 dpavlin 25 # check for logically deleted record
270 dpavlin 33 if ($pointer & 0x80000000) {
271 dpavlin 25 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
272     $self->{deleted} = $mfn;
273    
274     return unless $self->{include_deleted};
275    
276 dpavlin 33 # abs
277     $pointer = ($pointer ^ 0xffffffff) + 1;
278 dpavlin 25 }
279    
280 dpavlin 1 my $XRFMFB = int($pointer/2048);
281     my $XRFMFP = $pointer - ($XRFMFB*2048);
282    
283 dpavlin 16 # (XRFMFB - 1) * 512 + XRFMFP
284     # why do i have to do XRFMFP % 1024 ?
285 dpavlin 1
286 dpavlin 26 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
287 dpavlin 1
288 dpavlin 16 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
289 dpavlin 1
290     # Get Record Information
291    
292 dpavlin 33 seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
293 dpavlin 1
294 dpavlin 33 read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
295     my $value=unpack("V",$buff);
296 dpavlin 1
297 dpavlin 16 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
298    
299 dpavlin 1 if ($value!=$mfn) {
300 dpavlin 26 if ($value == 0) {
301     print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
302     $self->{deleted} = $mfn;
303     return;
304     }
305    
306     carp "Error: MFN ".$mfn." not found in MST file, found $value";
307     return;
308 dpavlin 1 }
309    
310 dpavlin 7 read($self->{'fileMST'}, $buff, 14);
311 dpavlin 1
312 dpavlin 33 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
313 dpavlin 1
314 dpavlin 16 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
315 dpavlin 1
316 dpavlin 25 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
317 dpavlin 9
318 dpavlin 16 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
319    
320 dpavlin 1 # Get Directory Format
321    
322     my @FieldPOS;
323     my @FieldLEN;
324     my @FieldTAG;
325    
326 dpavlin 8 read($self->{'fileMST'}, $buff, 6 * $NVF);
327    
328 dpavlin 16 my $rec_len = 0;
329 dpavlin 8
330 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
331    
332 dpavlin 33 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
333 dpavlin 1
334 dpavlin 16 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
335 dpavlin 1
336     # The TAG does not exists in .FDT so we set it to 0.
337     #
338     # XXX This is removed from perl version; .FDT file is updated manually, so
339     # you will often have fields in .MST file which aren't in .FDT. On the other
340     # hand, IsisMarc doesn't use .FDT files at all!
341    
342     #if (! $self->{TagName}->{$TAG}) {
343     # $TAG=0;
344     #}
345    
346     push @FieldTAG,$TAG;
347     push @FieldPOS,$POS;
348     push @FieldLEN,$LEN;
349 dpavlin 8
350 dpavlin 16 $rec_len += $LEN;
351 dpavlin 1 }
352    
353     # Get Variable Fields
354    
355 dpavlin 16 read($self->{'fileMST'},$buff,$rec_len);
356 dpavlin 8
357 dpavlin 16 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
358    
359 dpavlin 1 for (my $i = 0 ; $i < $NVF ; $i++) {
360 dpavlin 10 # skip zero-sized fields
361     next if ($FieldLEN[$i] == 0);
362    
363 dpavlin 8 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
364 dpavlin 1 }
365    
366 dpavlin 16 $self->{'current_mfn'} = $mfn;
367    
368 dpavlin 25 print STDERR Dumper($self),"\n" if ($self->{debug});
369 dpavlin 1
370 dpavlin 2 return $self->{'record'};
371 dpavlin 1 }
372    
373 dpavlin 2 =head2 to_ascii
374    
375 dpavlin 27 Returns ASCII output of record with specified MFN
376 dpavlin 2
377 dpavlin 15 print $isis->to_ascii(42);
378 dpavlin 2
379 dpavlin 27 This outputs something like this:
380 dpavlin 15
381     210 ^aNew York^cNew York University press^dcop. 1988
382     990 2140
383     990 88
384     990 HAY
385    
386     If C<read_fdt> is specified when calling C<new> it will display field names
387     from C<.FDT> file instead of numeric tags.
388    
389 dpavlin 2 =cut
390    
391     sub to_ascii {
392     my $self = shift;
393    
394     my $mfn = shift || croak "need MFN";
395    
396 dpavlin 7 my $rec = $self->fetch($mfn);
397 dpavlin 2
398     my $out = "0\t$mfn";
399    
400     foreach my $f (sort keys %{$rec}) {
401 dpavlin 15 my $fn = $self->tag_name($f);
402     $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
403 dpavlin 2 }
404    
405     $out .= "\n";
406    
407     return $out;
408     }
409    
410 dpavlin 12 =head2 to_hash
411    
412 dpavlin 15 Read record with specified MFN and convert it to hash
413 dpavlin 12
414     my $hash = $isis->to_hash($mfn);
415    
416 dpavlin 27 It has ability to convert characters (using C<hash_filter>) from ISIS
417 dpavlin 15 database before creating structures enabling character re-mapping or quick
418     fix-up of data.
419 dpavlin 12
420     This function returns hash which is like this:
421    
422     $hash = {
423     '210' => [
424     {
425     'c' => 'New York University press',
426     'a' => 'New York',
427     'd' => 'cop. 1988'
428     }
429     ],
430     '990' => [
431     '2140',
432     '88',
433     'HAY'
434     ],
435     };
436    
437 dpavlin 15 You can later use that hash to produce any output from ISIS data.
438 dpavlin 12
439 dpavlin 15 If database is created using IsisMarc, it will also have to special fields
440     which will be used for identifiers, C<i1> and C<i2> like this:
441    
442     '200' => [
443     {
444     'i1' => '1',
445     'i2' => ' '
446     'a' => 'Goa',
447     'f' => 'Valdo D\'Arienzo',
448     'e' => 'tipografie e tipografi nel XVI secolo',
449     }
450     ],
451    
452     This method will also create additional field C<000> with MFN.
453    
454 dpavlin 12 =cut
455    
456     sub to_hash {
457     my $self = shift;
458    
459     my $mfn = shift || confess "need mfn!";
460    
461 dpavlin 15 # init record to include MFN as field 000
462 dpavlin 16 my $rec = { '000' => [ $mfn ] };
463 dpavlin 15
464 dpavlin 12 my $row = $self->fetch($mfn);
465    
466     foreach my $k (keys %{$row}) {
467     foreach my $l (@{$row->{$k}}) {
468    
469     # filter output
470     $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
471    
472 dpavlin 15 my $val;
473    
474     # has identifiers?
475 dpavlin 23 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
476 dpavlin 15
477 dpavlin 12 # has subfields?
478     if ($l =~ m/\^/) {
479     foreach my $t (split(/\^/,$l)) {
480     next if (! $t);
481     $val->{substr($t,0,1)} = substr($t,1);
482     }
483     } else {
484     $val = $l;
485     }
486    
487     push @{$rec->{$k}}, $val;
488     }
489     }
490    
491     return $rec;
492     }
493    
494 dpavlin 15 =head2 tag_name
495 dpavlin 1
496 dpavlin 15 Return name of selected tag
497 dpavlin 1
498 dpavlin 15 print $isis->tag_name('200');
499    
500     =cut
501    
502     sub tag_name {
503 dpavlin 1 my $self = shift;
504 dpavlin 15 my $tag = shift || return;
505     return $self->{'TagName'}->{$tag} || $tag;
506 dpavlin 1 }
507    
508 dpavlin 35
509     =head2 read_cnt
510    
511     Read content of C<.CNT> file and return hash containing it.
512    
513     print Dumper($isis->read_cnt);
514    
515     This function is not used by module (C<.CNT> files are not required for this
516     module to work), but it can be useful to examine your index (while debugging
517     for example).
518    
519     =cut
520    
521     sub read_cnt {
522     my $self = shift;
523    
524     croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
525    
526     # Get the index information from $db.CNT
527    
528     open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
529     binmode($fileCNT);
530    
531     my $buff;
532    
533     read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
534     $self->unpack_cnt($buff);
535    
536     read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
537     $self->unpack_cnt($buff);
538    
539     close($fileCNT);
540    
541     return $self->{cnt};
542     }
543    
544     =head2 unpack_cnt
545    
546     Unpack one of two 26 bytes fixed length record in C<.CNT> file.
547    
548     Here is definition of record:
549    
550     off key description size
551     0: IDTYPE BTree type s
552     2: ORDN Nodes Order s
553     4: ORDF Leafs Order s
554     6: N Number of Memory buffers for nodes s
555     8: K Number of buffers for first level index s
556     10: LIV Current number of Index Levels s
557     12: POSRX Pointer to Root Record in N0x l
558     16: NMAXPOS Next Available position in N0x l
559     20: FMAXPOS Next available position in L0x l
560     24: ABNORMAL Formal BTree normality indicator s
561     length: 26 bytes
562    
563     This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
564    
565     =cut
566    
567     sub unpack_cnt {
568     my $self = shift;
569    
570     my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
571    
572     my $buff = shift || return;
573     my @arr = unpack("vvvvvvVVVv", $buff);
574    
575     print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
576    
577     my $IDTYPE = shift @arr;
578     foreach (@flds) {
579     $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
580     }
581     }
582    
583 dpavlin 1 1;
584    
585     =head1 BUGS
586    
587 dpavlin 27 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
588     some variations in input databases which has been tested with this module.
589     When I was in doubt, I assumed that OpenIsis's implementation was right
590     (except for obvious bugs).
591 dpavlin 1
592 dpavlin 27 However, every effort has been made to test this module with as much
593     databases (and programs that create them) as possible.
594    
595     I would be very greatful for success or failure reports about usage of this
596     module with databases from programs other than WinIsis and IsisMarc. I had
597     tested this against ouput of one C<isis.dll>-based application, but I don't
598     know any details about it's version.
599    
600 dpavlin 1 =head1 AUTHOR
601    
602     Dobrica Pavlinusic
603     CPAN ID: DPAVLIN
604     dpavlin@rot13.org
605     http://www.rot13.org/~dpavlin/
606    
607 dpavlin 15 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
608     written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
609 dpavlin 1
610     =head1 COPYRIGHT
611    
612     This program is free software; you can redistribute
613     it and/or modify it under the same terms as Perl itself.
614    
615     The full text of the license can be found in the
616     LICENSE file included with this module.
617    
618    
619     =head1 SEE ALSO
620    
621 dpavlin 15 OpenIsis web site L<http://www.openisis.org>
622 dpavlin 1
623 dpavlin 15 perl4lib site L<http://perl4lib.perl.org>
624    

  ViewVC Help
Powered by ViewVC 1.1.26