/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/Biblio/Isis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 58 - (show annotations)
Sun Jul 9 12:18:44 2006 UTC (17 years, 8 months ago) by dpavlin
File size: 18743 byte(s)
test and fix join_subfields_with
1 package Biblio::Isis;
2 use strict;
3
4 use Carp;
5 use File::Glob qw(:globally :nocase);
6
7 BEGIN {
8 use Exporter ();
9 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 $VERSION = 0.21;
11 @ISA = qw (Exporter);
12 #Give a hoot don't pollute, do not export more than needed by default
13 @EXPORT = qw ();
14 @EXPORT_OK = qw ();
15 %EXPORT_TAGS = ();
16
17 }
18
19 =head1 NAME
20
21 Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
22
23 =head1 SYNOPSIS
24
25 use Biblio::Isis;
26
27 my $isis = new Biblio::Isis(
28 isisdb => './cds/cds',
29 );
30
31 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
32 print $isis->to_ascii($mfn),"\n";
33 }
34
35 =head1 DESCRIPTION
36
37 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
38 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
39 seems to depriciate it's old C<XS> bindings for perl.
40
41 It can create hash values from data in ISIS database (using C<to_hash>),
42 ASCII dump (using C<to_ascii>) or just hash with field names and packed
43 values (like C<^asomething^belse>).
44
45 Unique feature of this module is ability to C<include_deleted> records.
46 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
47 fields which are zero sized will be filled with random junk from memory).
48
49 It also has support for identifiers (only if ISIS database is created by
50 IsisMarc), see C<to_hash>.
51
52 This module will always be slower than OpenIsis module which use C
53 library. However, since it's written in perl, it's platform independent (so
54 you don't need C compiler), and can be easily modified. I hope that it
55 creates data structures which are easier to use than ones created by
56 OpenIsis, so reduced time in other parts of the code should compensate for
57 slower performance of this module (speed of reading ISIS database is
58 rarely an issue).
59
60 =head1 METHODS
61
62 =cut
63
64 # my $ORDN; # Nodes Order
65 # my $ORDF; # Leafs Order
66 # my $N; # Number of Memory buffers for nodes
67 # my $K; # Number of buffers for first level index
68 # my $LIV; # Current number of Index Levels
69 # my $POSRX; # Pointer to Root Record in N0x
70 # my $NMAXPOS; # Next Available position in N0x
71 # my $FMAXPOS; # Next available position in L0x
72 # my $ABNORMAL; # Formal BTree normality indicator
73
74 #
75 # some binary reads
76 #
77
78 =head2 new
79
80 Open ISIS database
81
82 my $isis = new Biblio::Isis(
83 isisdb => './cds/cds',
84 read_fdt => 1,
85 include_deleted => 1,
86 hash_filter => sub {
87 my $v = shift;
88 $v =~ s#foo#bar#g;
89 },
90 debug => 1,
91 join_subfields_with => ' ; ',
92 );
93
94 Options are described below:
95
96 =over 5
97
98 =item isisdb
99
100 This is full or relative path to ISIS database files which include
101 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
102 C<read_fdt> option) files.
103
104 In this example it uses C<./cds/cds.MST> and related files.
105
106 =item read_fdt
107
108 Boolean flag to specify if field definition table should be read. It's off
109 by default.
110
111 =item include_deleted
112
113 Don't skip logically deleted records in ISIS.
114
115 =item hash_filter
116
117 Filter code ref which will be used before data is converted to hash.
118
119 =item debug
120
121 Dump a B<lot> of debugging output even at level 1. For even more increase level.
122
123 =item join_subfields_with
124
125 Define delimiter which will be used to join repeatable subfields. This
126 option is included to support lagacy application written against version
127 older than 0.21 of this module. By default, it disabled. See L</to_hash>.
128
129 =back
130
131 =cut
132
133 sub new {
134 my $class = shift;
135 my $self = {};
136 bless($self, $class);
137
138 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
139
140 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
141 $self->{$v} = {@_}->{$v};
142 }
143
144 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
145
146 foreach my $f (@isis_files) {
147 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
148 $self->{lc($ext)."_file"} = $f;
149 }
150
151 my @must_exist = qw(mst xrf);
152 push @must_exist, "fdt" if ($self->{read_fdt});
153
154 foreach my $ext (@must_exist) {
155 unless ($self->{$ext."_file"}) {
156 carp "missing ",uc($ext)," file in ",$self->{isisdb};
157 return;
158 }
159 }
160
161 if ($self->{debug}) {
162 print STDERR "## using files: ",join(" ",@isis_files),"\n";
163 eval "use Data::Dump";
164
165 if (! $@) {
166 *Dumper = *Data::Dump::dump;
167 } else {
168 use Data::Dumper;
169 }
170 }
171
172 # if you want to read .FDT file use read_fdt argument when creating class!
173 if ($self->{read_fdt} && -e $self->{fdt_file}) {
174
175 # read the $db.FDT file for tags
176 my $fieldzone=0;
177
178 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
179 binmode($fileFDT);
180
181 while (<$fileFDT>) {
182 chomp;
183 if ($fieldzone) {
184 my $name=substr($_,0,30);
185 my $tag=substr($_,50,3);
186
187 $name =~ s/\s+$//;
188 $tag =~ s/\s+$//;
189
190 $self->{'TagName'}->{$tag}=$name;
191 }
192
193 if (/^\*\*\*/) {
194 $fieldzone=1;
195 }
196 }
197
198 close($fileFDT);
199 }
200
201 # Get the Maximum MFN from $db.MST
202
203 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
204 binmode($self->{'fileMST'});
205
206 # MST format: (* = 32 bit signed)
207 # CTLMFN* always 0
208 # NXTMFN* MFN to be assigned to the next record created
209 # NXTMFB* last block allocated to master file
210 # NXTMFP offset to next available position in last block
211 # MFTYPE always 0 for user db file (1 for system)
212 seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
213
214 my $buff;
215
216 read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
217 $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
218
219 print STDERR "## self ",Dumper($self),"\n" if ($self->{debug});
220
221 # open files for later
222 open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
223 binmode($self->{'fileXRF'});
224
225 $self ? return $self : return undef;
226 }
227
228 =head2 count
229
230 Return number of records in database
231
232 print $isis->count;
233
234 =cut
235
236 sub count {
237 my $self = shift;
238 return $self->{'NXTMFN'} - 1;
239 }
240
241 =head2 fetch
242
243 Read record with selected MFN
244
245 my $rec = $isis->fetch(55);
246
247 Returns hash with keys which are field names and values are unpacked values
248 for that field like this:
249
250 $rec = {
251 '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
252 '990' => [ '2140', '88', 'HAY' ],
253 };
254
255 =cut
256
257 sub fetch {
258 my $self = shift;
259
260 my $mfn = shift || croak "fetch needs MFN as argument!";
261
262 # is mfn allready in memory?
263 my $old_mfn = $self->{'current_mfn'} || -1;
264 return $self->{record} if ($mfn == $old_mfn);
265
266 print STDERR "## fetch: $mfn\n" if ($self->{debug});
267
268 # XXX check this?
269 my $mfnpos=($mfn+int(($mfn-1)/127))*4;
270
271 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
272 seek($self->{'fileXRF'},$mfnpos,0);
273
274 my $buff;
275
276 # delete old record
277 delete $self->{record};
278
279 # read XRFMFB abd XRFMFP
280 read($self->{'fileXRF'}, $buff, 4);
281 my $pointer=unpack("V",$buff);
282 if (! $pointer) {
283 if ($self->{include_deleted}) {
284 return;
285 } else {
286 warn "pointer for MFN $mfn is null\n";
287 return;
288 }
289 }
290
291 # check for logically deleted record
292 if ($pointer & 0x80000000) {
293 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
294 $self->{deleted} = $mfn;
295
296 return unless $self->{include_deleted};
297
298 # abs
299 $pointer = ($pointer ^ 0xffffffff) + 1;
300 }
301
302 my $XRFMFB = int($pointer/2048);
303 my $XRFMFP = $pointer - ($XRFMFB*2048);
304
305 # (XRFMFB - 1) * 512 + XRFMFP
306 # why do i have to do XRFMFP % 1024 ?
307
308 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
309
310 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
311
312 # Get Record Information
313
314 seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
315
316 read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
317 my $value=unpack("V",$buff);
318
319 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
320
321 if ($value!=$mfn) {
322 if ($value == 0) {
323 print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
324 $self->{deleted} = $mfn;
325 return;
326 }
327
328 carp "Error: MFN ".$mfn." not found in MST file, found $value";
329 return;
330 }
331
332 read($self->{'fileMST'}, $buff, 14);
333
334 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
335
336 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
337
338 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
339
340 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
341
342 # Get Directory Format
343
344 my @FieldPOS;
345 my @FieldLEN;
346 my @FieldTAG;
347
348 read($self->{'fileMST'}, $buff, 6 * $NVF);
349
350 my $rec_len = 0;
351
352 for (my $i = 0 ; $i < $NVF ; $i++) {
353
354 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
355
356 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
357
358 # The TAG does not exists in .FDT so we set it to 0.
359 #
360 # XXX This is removed from perl version; .FDT file is updated manually, so
361 # you will often have fields in .MST file which aren't in .FDT. On the other
362 # hand, IsisMarc doesn't use .FDT files at all!
363
364 #if (! $self->{TagName}->{$TAG}) {
365 # $TAG=0;
366 #}
367
368 push @FieldTAG,$TAG;
369 push @FieldPOS,$POS;
370 push @FieldLEN,$LEN;
371
372 $rec_len += $LEN;
373 }
374
375 # Get Variable Fields
376
377 read($self->{'fileMST'},$buff,$rec_len);
378
379 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
380
381 for (my $i = 0 ; $i < $NVF ; $i++) {
382 # skip zero-sized fields
383 next if ($FieldLEN[$i] == 0);
384
385 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
386 }
387
388 $self->{'current_mfn'} = $mfn;
389
390 print STDERR Dumper($self),"\n" if ($self->{debug});
391
392 return $self->{'record'};
393 }
394
395 =head2 mfn
396
397 Returns current MFN position
398
399 my $mfn = $isis->mfn;
400
401 =cut
402
403 # This function should be simple return $self->{current_mfn},
404 # but if new is called with _hack_mfn it becomes setter.
405 # It's useful in tests when setting $isis->{record} directly
406
407 sub mfn {
408 my $self = shift;
409 return $self->{current_mfn};
410 };
411
412
413 =head2 to_ascii
414
415 Returns ASCII output of record with specified MFN
416
417 print $isis->to_ascii(42);
418
419 This outputs something like this:
420
421 210 ^aNew York^cNew York University press^dcop. 1988
422 990 2140
423 990 88
424 990 HAY
425
426 If C<read_fdt> is specified when calling C<new> it will display field names
427 from C<.FDT> file instead of numeric tags.
428
429 =cut
430
431 sub to_ascii {
432 my $self = shift;
433
434 my $mfn = shift || croak "need MFN";
435
436 my $rec = $self->fetch($mfn) || return;
437
438 my $out = "0\t$mfn";
439
440 foreach my $f (sort keys %{$rec}) {
441 my $fn = $self->tag_name($f);
442 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
443 }
444
445 $out .= "\n";
446
447 return $out;
448 }
449
450 =head2 to_hash
451
452 Read record with specified MFN and convert it to hash
453
454 my $hash = $isis->to_hash($mfn);
455
456 It has ability to convert characters (using C<hash_filter>) from ISIS
457 database before creating structures enabling character re-mapping or quick
458 fix-up of data.
459
460 This function returns hash which is like this:
461
462 $hash = {
463 '210' => [
464 {
465 'c' => 'New York University press',
466 'a' => 'New York',
467 'd' => 'cop. 1988'
468 }
469 ],
470 '990' => [
471 '2140',
472 '88',
473 'HAY'
474 ],
475 };
476
477 You can later use that hash to produce any output from ISIS data.
478
479 If database is created using IsisMarc, it will also have to special fields
480 which will be used for identifiers, C<i1> and C<i2> like this:
481
482 '200' => [
483 {
484 'i1' => '1',
485 'i2' => ' '
486 'a' => 'Goa',
487 'f' => 'Valdo D\'Arienzo',
488 'e' => 'tipografie e tipografi nel XVI secolo',
489 }
490 ],
491
492 In case there are repeatable subfields in record, this will create
493 following structure:
494
495 '900' => [ {
496 'a' => [ 'foo', 'bar', 'baz' ],
497 }]
498
499 Or in more complex example of
500
501 902 ^aa1^aa2^aa3^bb1^aa4^bb2^cc1^aa5
502
503 it will create
504
505 902 => [
506 { a => ["a1", "a2", "a3", "a4", "a5"], b => ["b1", "b2"], c => "c1" },
507 ],
508
509 This behaviour can be changed using C<join_subfields_with> option to L</new>,
510 in which case C<to_hash> will always create single value for each subfield.
511 This will change result to:
512
513
514
515 This method will also create additional field C<000> with MFN.
516
517 There is also more elaborative way to call C<to_hash> like this:
518
519 my $hash = $isis->to_hash({
520 mfn => 42,
521 include_subfields => 1,
522 });
523
524 Each option controll creation of hash:
525
526 =over 4
527
528 =item mfn
529
530 Specify MFN number of record
531
532 =item include_subfields
533
534 This option will create additional key in hash called C<subfields> which will
535 have original record subfield order and index to that subfield like this:
536
537 902 => [ {
538 a => ["a1", "a2", "a3", "a4", "a5"],
539 b => ["b1", "b2"],
540 c => "c1",
541 subfields => ["a", 0, "a", 1, "a", 2, "b", 0, "a", 3, "b", 1, "c", 0, "a", 4],
542 } ],
543
544 =item join_subfields_with
545
546 Define delimiter which will be used to join repeatable subfields. You can
547 specify option here instead in L</new> if you want to have per-record control.
548
549 =back
550
551 =cut
552
553 sub to_hash {
554 my $self = shift;
555
556
557 my $mfn = shift || confess "need mfn!";
558 my $arg;
559
560 if (ref($mfn) eq 'HASH') {
561 $arg = $mfn;
562 $mfn = $arg->{mfn} || confess "need mfn in arguments";
563 }
564
565 # init record to include MFN as field 000
566 my $rec = { '000' => [ $mfn ] };
567
568 my $row = $self->fetch($mfn) || return;
569
570 my $j_rs = $arg->{join_subfields_with};
571 $j_rs = $self->{join_subfields_with} unless(defined($j_rs));
572 my $i_sf = $arg->{include_subfields};
573
574 foreach my $f_nr (keys %{$row}) {
575 foreach my $l (@{$row->{$f_nr}}) {
576
577 # filter output
578 if ($self->{'hash_filter'}) {
579 $l = $self->{'hash_filter'}->($l);
580 next unless defined($l);
581 }
582
583 my $val;
584 my $r_sf; # repeatable subfields in this record
585
586 # has identifiers?
587 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
588
589 # has subfields?
590 if ($l =~ m/\^/) {
591 foreach my $t (split(/\^/,$l)) {
592 next if (! $t);
593 my ($sf,$v) = (substr($t,0,1), substr($t,1));
594 # XXX this might be option, but why?
595 next unless ($v);
596 # warn "### $f_nr^$sf:$v",$/ if ($self->{debug} > 1);
597
598 if (ref( $val->{$sf} ) eq 'ARRAY') {
599
600 push @{ $val->{$sf} }, $v;
601
602 # record repeatable subfield it it's offset
603 push @{ $val->{subfields} }, ( $sf, $#{ $val->{$sf} } ) if (! $j_rs && $i_sf);
604 $r_sf->{$sf}++;
605
606 } elsif (defined( $val->{$sf} )) {
607
608 # convert scalar field to array
609 $val->{$sf} = [ $val->{$sf}, $v ];
610
611 push @{ $val->{subfields} }, ( $sf, 1 ) if (! $j_rs && $i_sf);
612 $r_sf->{$sf}++;
613
614 } else {
615 $val->{$sf} = $v;
616 push @{ $val->{subfields} }, ( $sf, 0 ) if ($i_sf);
617 }
618 }
619 } else {
620 $val = $l;
621 }
622
623 if ($j_rs) {
624 map {
625 $val->{$_} = join($j_rs, @{ $val->{$_} });
626 } keys %$r_sf
627 }
628
629 push @{$rec->{$f_nr}}, $val;
630 }
631 }
632
633 return $rec;
634 }
635
636 =head2 tag_name
637
638 Return name of selected tag
639
640 print $isis->tag_name('200');
641
642 =cut
643
644 sub tag_name {
645 my $self = shift;
646 my $tag = shift || return;
647 return $self->{'TagName'}->{$tag} || $tag;
648 }
649
650
651 =head2 read_cnt
652
653 Read content of C<.CNT> file and return hash containing it.
654
655 print Dumper($isis->read_cnt);
656
657 This function is not used by module (C<.CNT> files are not required for this
658 module to work), but it can be useful to examine your index (while debugging
659 for example).
660
661 =cut
662
663 sub read_cnt {
664 my $self = shift;
665
666 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
667
668 # Get the index information from $db.CNT
669
670 open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
671 binmode($fileCNT);
672
673 my $buff;
674
675 read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
676 $self->unpack_cnt($buff);
677
678 read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
679 $self->unpack_cnt($buff);
680
681 close($fileCNT);
682
683 return $self->{cnt};
684 }
685
686 =head2 unpack_cnt
687
688 Unpack one of two 26 bytes fixed length record in C<.CNT> file.
689
690 Here is definition of record:
691
692 off key description size
693 0: IDTYPE BTree type s
694 2: ORDN Nodes Order s
695 4: ORDF Leafs Order s
696 6: N Number of Memory buffers for nodes s
697 8: K Number of buffers for first level index s
698 10: LIV Current number of Index Levels s
699 12: POSRX Pointer to Root Record in N0x l
700 16: NMAXPOS Next Available position in N0x l
701 20: FMAXPOS Next available position in L0x l
702 24: ABNORMAL Formal BTree normality indicator s
703 length: 26 bytes
704
705 This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
706
707 =cut
708
709 sub unpack_cnt {
710 my $self = shift;
711
712 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
713
714 my $buff = shift || return;
715 my @arr = unpack("vvvvvvVVVv", $buff);
716
717 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
718
719 my $IDTYPE = shift @arr;
720 foreach (@flds) {
721 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
722 }
723 }
724
725 1;
726
727 =head1 BUGS
728
729 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
730 some variations in input databases which has been tested with this module.
731 When I was in doubt, I assumed that OpenIsis's implementation was right
732 (except for obvious bugs).
733
734 However, every effort has been made to test this module with as much
735 databases (and programs that create them) as possible.
736
737 I would be very greatful for success or failure reports about usage of this
738 module with databases from programs other than WinIsis and IsisMarc. I had
739 tested this against ouput of one C<isis.dll>-based application, but I don't
740 know any details about it's version.
741
742 =head1 VERSIONS
743
744 As this is young module, new features are added in subsequent version. It's
745 a good idea to specify version when using this module like this:
746
747 use Biblio::Isis 0.21
748
749 Below is list of changes in specific version of module (so you can target
750 older versions if you really have to):
751
752 =over 8
753
754 =item 0.21
755
756 Added C<join_subfields_with> to L</new> and L</to_hash>.
757
758 Added C<include_subfields> to L</to_hash>.
759
760 =item 0.20
761
762 Added C<< $isis->mfn >>, support for repeatable subfields and
763 C<< $isis->to_hash({ mfn => 42, ... }) >> calling convention
764
765 =back
766
767 =head1 AUTHOR
768
769 Dobrica Pavlinusic
770 CPAN ID: DPAVLIN
771 dpavlin@rot13.org
772 http://www.rot13.org/~dpavlin/
773
774 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
775 written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
776
777 =head1 COPYRIGHT
778
779 This program is free software; you can redistribute
780 it and/or modify it under the same terms as Perl itself.
781
782 The full text of the license can be found in the
783 LICENSE file included with this module.
784
785
786 =head1 SEE ALSO
787
788 OpenIsis web site L<http://www.openisis.org>
789
790 perl4lib site L<http://perl4lib.perl.org>
791

  ViewVC Help
Powered by ViewVC 1.1.26