/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/Biblio/Isis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 45 - (show annotations)
Thu Jul 6 20:31:46 2006 UTC (16 years, 5 months ago) by dpavlin
File size: 15160 byte(s)
better logging, use Data::Dump if available [0.14]
1 package Biblio::Isis;
2 use strict;
3
4 use Carp;
5 use File::Glob qw(:globally :nocase);
6
7 BEGIN {
8 use Exporter ();
9 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 $VERSION = 0.14;
11 @ISA = qw (Exporter);
12 #Give a hoot don't pollute, do not export more than needed by default
13 @EXPORT = qw ();
14 @EXPORT_OK = qw ();
15 %EXPORT_TAGS = ();
16
17 }
18
19 =head1 NAME
20
21 Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
22
23 =head1 SYNOPSIS
24
25 use Biblio::Isis;
26
27 my $isis = new Biblio::Isis(
28 isisdb => './cds/cds',
29 );
30
31 for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
32 print $isis->to_ascii($mfn),"\n";
33 }
34
35 =head1 DESCRIPTION
36
37 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
38 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
39 seems to depriciate it's old C<XS> bindings for perl.
40
41 It can create hash values from data in ISIS database (using C<to_hash>),
42 ASCII dump (using C<to_ascii>) or just hash with field names and packed
43 values (like C<^asomething^belse>).
44
45 Unique feature of this module is ability to C<include_deleted> records.
46 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
47 fields which are zero sized will be filled with random junk from memory).
48
49 It also has support for identifiers (only if ISIS database is created by
50 IsisMarc), see C<to_hash>.
51
52 This module will always be slower than OpenIsis module which use C
53 library. However, since it's written in perl, it's platform independent (so
54 you don't need C compiler), and can be easily modified. I hope that it
55 creates data structures which are easier to use than ones created by
56 OpenIsis, so reduced time in other parts of the code should compensate for
57 slower performance of this module (speed of reading ISIS database is
58 rarely an issue).
59
60 =head1 METHODS
61
62 =cut
63
64 # my $ORDN; # Nodes Order
65 # my $ORDF; # Leafs Order
66 # my $N; # Number of Memory buffers for nodes
67 # my $K; # Number of buffers for first level index
68 # my $LIV; # Current number of Index Levels
69 # my $POSRX; # Pointer to Root Record in N0x
70 # my $NMAXPOS; # Next Available position in N0x
71 # my $FMAXPOS; # Next available position in L0x
72 # my $ABNORMAL; # Formal BTree normality indicator
73
74 #
75 # some binary reads
76 #
77
78 =head2 new
79
80 Open ISIS database
81
82 my $isis = new Biblio::Isis(
83 isisdb => './cds/cds',
84 read_fdt => 1,
85 include_deleted => 1,
86 hash_filter => sub {
87 my $v = shift;
88 $v =~ s#foo#bar#g;
89 },
90 debug => 1,
91 );
92
93 Options are described below:
94
95 =over 5
96
97 =item isisdb
98
99 This is full or relative path to ISIS database files which include
100 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
101 C<read_fdt> option) files.
102
103 In this example it uses C<./cds/cds.MST> and related files.
104
105 =item read_fdt
106
107 Boolean flag to specify if field definition table should be read. It's off
108 by default.
109
110 =item include_deleted
111
112 Don't skip logically deleted records in ISIS.
113
114 =item hash_filter
115
116 Filter code ref which will be used before data is converted to hash.
117
118 =item debug
119
120 Dump a B<lot> of debugging output.
121
122 =back
123
124 =cut
125
126 sub new {
127 my $class = shift;
128 my $self = {};
129 bless($self, $class);
130
131 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
132
133 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
134 $self->{$v} = {@_}->{$v};
135 }
136
137 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
138
139 foreach my $f (@isis_files) {
140 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
141 $self->{lc($ext)."_file"} = $f;
142 }
143
144 my @must_exist = qw(mst xrf);
145 push @must_exist, "fdt" if ($self->{read_fdt});
146
147 foreach my $ext (@must_exist) {
148 unless ($self->{$ext."_file"}) {
149 carp "missing ",uc($ext)," file in ",$self->{isisdb};
150 return;
151 }
152 }
153
154 if ($self->{debug}) {
155 print STDERR "## using files: ",join(" ",@isis_files),"\n";
156 eval "use Data::Dump";
157
158 if (! $@) {
159 *Dumper = *Data::Dump::dump;
160 } else {
161 use Data::Dumper;
162 }
163 }
164
165 # if you want to read .FDT file use read_fdt argument when creating class!
166 if ($self->{read_fdt} && -e $self->{fdt_file}) {
167
168 # read the $db.FDT file for tags
169 my $fieldzone=0;
170
171 open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
172 binmode($fileFDT);
173
174 while (<$fileFDT>) {
175 chomp;
176 if ($fieldzone) {
177 my $name=substr($_,0,30);
178 my $tag=substr($_,50,3);
179
180 $name =~ s/\s+$//;
181 $tag =~ s/\s+$//;
182
183 $self->{'TagName'}->{$tag}=$name;
184 }
185
186 if (/^\*\*\*/) {
187 $fieldzone=1;
188 }
189 }
190
191 close($fileFDT);
192 }
193
194 # Get the Maximum MFN from $db.MST
195
196 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
197 binmode($self->{'fileMST'});
198
199 # MST format: (* = 32 bit signed)
200 # CTLMFN* always 0
201 # NXTMFN* MFN to be assigned to the next record created
202 # NXTMFB* last block allocated to master file
203 # NXTMFP offset to next available position in last block
204 # MFTYPE always 0 for user db file (1 for system)
205 seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
206
207 my $buff;
208
209 read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
210 $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
211
212 print STDERR "## self ",Dumper($self),"\n" if ($self->{debug});
213
214 # open files for later
215 open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
216 binmode($self->{'fileXRF'});
217
218 $self ? return $self : return undef;
219 }
220
221 =head2 count
222
223 Return number of records in database
224
225 print $isis->count;
226
227 =cut
228
229 sub count {
230 my $self = shift;
231 return $self->{'NXTMFN'} - 1;
232 }
233
234 =head2 fetch
235
236 Read record with selected MFN
237
238 my $rec = $isis->fetch(55);
239
240 Returns hash with keys which are field names and values are unpacked values
241 for that field like this:
242
243 $rec = {
244 '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
245 '990' => [ '2140', '88', 'HAY' ],
246 };
247
248 =cut
249
250 sub fetch {
251 my $self = shift;
252
253 my $mfn = shift || croak "fetch needs MFN as argument!";
254
255 # is mfn allready in memory?
256 my $old_mfn = $self->{'current_mfn'} || -1;
257 return $self->{record} if ($mfn == $old_mfn);
258
259 print STDERR "## fetch: $mfn\n" if ($self->{debug});
260
261 # XXX check this?
262 my $mfnpos=($mfn+int(($mfn-1)/127))*4;
263
264 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
265 seek($self->{'fileXRF'},$mfnpos,0);
266
267 my $buff;
268
269 # delete old record
270 delete $self->{record};
271
272 # read XRFMFB abd XRFMFP
273 read($self->{'fileXRF'}, $buff, 4);
274 my $pointer=unpack("V",$buff);
275 if (! $pointer) {
276 if ($self->{include_deleted}) {
277 return;
278 } else {
279 warn "pointer for MFN $mfn is null\n";
280 return;
281 }
282 }
283
284 # check for logically deleted record
285 if ($pointer & 0x80000000) {
286 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
287 $self->{deleted} = $mfn;
288
289 return unless $self->{include_deleted};
290
291 # abs
292 $pointer = ($pointer ^ 0xffffffff) + 1;
293 }
294
295 my $XRFMFB = int($pointer/2048);
296 my $XRFMFP = $pointer - ($XRFMFB*2048);
297
298 # (XRFMFB - 1) * 512 + XRFMFP
299 # why do i have to do XRFMFP % 1024 ?
300
301 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
302
303 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
304
305 # Get Record Information
306
307 seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
308
309 read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
310 my $value=unpack("V",$buff);
311
312 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
313
314 if ($value!=$mfn) {
315 if ($value == 0) {
316 print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
317 $self->{deleted} = $mfn;
318 return;
319 }
320
321 carp "Error: MFN ".$mfn." not found in MST file, found $value";
322 return;
323 }
324
325 read($self->{'fileMST'}, $buff, 14);
326
327 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
328
329 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
330
331 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
332
333 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
334
335 # Get Directory Format
336
337 my @FieldPOS;
338 my @FieldLEN;
339 my @FieldTAG;
340
341 read($self->{'fileMST'}, $buff, 6 * $NVF);
342
343 my $rec_len = 0;
344
345 for (my $i = 0 ; $i < $NVF ; $i++) {
346
347 my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
348
349 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
350
351 # The TAG does not exists in .FDT so we set it to 0.
352 #
353 # XXX This is removed from perl version; .FDT file is updated manually, so
354 # you will often have fields in .MST file which aren't in .FDT. On the other
355 # hand, IsisMarc doesn't use .FDT files at all!
356
357 #if (! $self->{TagName}->{$TAG}) {
358 # $TAG=0;
359 #}
360
361 push @FieldTAG,$TAG;
362 push @FieldPOS,$POS;
363 push @FieldLEN,$LEN;
364
365 $rec_len += $LEN;
366 }
367
368 # Get Variable Fields
369
370 read($self->{'fileMST'},$buff,$rec_len);
371
372 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
373
374 for (my $i = 0 ; $i < $NVF ; $i++) {
375 # skip zero-sized fields
376 next if ($FieldLEN[$i] == 0);
377
378 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
379 }
380
381 $self->{'current_mfn'} = $mfn;
382
383 print STDERR Dumper($self),"\n" if ($self->{debug});
384
385 return $self->{'record'};
386 }
387
388 =head2 to_ascii
389
390 Returns ASCII output of record with specified MFN
391
392 print $isis->to_ascii(42);
393
394 This outputs something like this:
395
396 210 ^aNew York^cNew York University press^dcop. 1988
397 990 2140
398 990 88
399 990 HAY
400
401 If C<read_fdt> is specified when calling C<new> it will display field names
402 from C<.FDT> file instead of numeric tags.
403
404 =cut
405
406 sub to_ascii {
407 my $self = shift;
408
409 my $mfn = shift || croak "need MFN";
410
411 my $rec = $self->fetch($mfn) || return;
412
413 my $out = "0\t$mfn";
414
415 foreach my $f (sort keys %{$rec}) {
416 my $fn = $self->tag_name($f);
417 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
418 }
419
420 $out .= "\n";
421
422 return $out;
423 }
424
425 =head2 to_hash
426
427 Read record with specified MFN and convert it to hash
428
429 my $hash = $isis->to_hash($mfn);
430
431 It has ability to convert characters (using C<hash_filter>) from ISIS
432 database before creating structures enabling character re-mapping or quick
433 fix-up of data.
434
435 This function returns hash which is like this:
436
437 $hash = {
438 '210' => [
439 {
440 'c' => 'New York University press',
441 'a' => 'New York',
442 'd' => 'cop. 1988'
443 }
444 ],
445 '990' => [
446 '2140',
447 '88',
448 'HAY'
449 ],
450 };
451
452 You can later use that hash to produce any output from ISIS data.
453
454 If database is created using IsisMarc, it will also have to special fields
455 which will be used for identifiers, C<i1> and C<i2> like this:
456
457 '200' => [
458 {
459 'i1' => '1',
460 'i2' => ' '
461 'a' => 'Goa',
462 'f' => 'Valdo D\'Arienzo',
463 'e' => 'tipografie e tipografi nel XVI secolo',
464 }
465 ],
466
467 This method will also create additional field C<000> with MFN.
468
469 =cut
470
471 sub to_hash {
472 my $self = shift;
473
474 my $mfn = shift || confess "need mfn!";
475
476 # init record to include MFN as field 000
477 my $rec = { '000' => [ $mfn ] };
478
479 my $row = $self->fetch($mfn) || return;
480
481 foreach my $k (keys %{$row}) {
482 foreach my $l (@{$row->{$k}}) {
483
484 # filter output
485 if ($self->{'hash_filter'}) {
486 $l = $self->{'hash_filter'}->($l);
487 next unless defined($l);
488 }
489
490 my $val;
491
492 # has identifiers?
493 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
494
495 # has subfields?
496 if ($l =~ m/\^/) {
497 foreach my $t (split(/\^/,$l)) {
498 next if (! $t);
499 $val->{substr($t,0,1)} = substr($t,1);
500 }
501 } else {
502 $val = $l;
503 }
504
505 push @{$rec->{$k}}, $val;
506 }
507 }
508
509 return $rec;
510 }
511
512 =head2 tag_name
513
514 Return name of selected tag
515
516 print $isis->tag_name('200');
517
518 =cut
519
520 sub tag_name {
521 my $self = shift;
522 my $tag = shift || return;
523 return $self->{'TagName'}->{$tag} || $tag;
524 }
525
526
527 =head2 read_cnt
528
529 Read content of C<.CNT> file and return hash containing it.
530
531 print Dumper($isis->read_cnt);
532
533 This function is not used by module (C<.CNT> files are not required for this
534 module to work), but it can be useful to examine your index (while debugging
535 for example).
536
537 =cut
538
539 sub read_cnt {
540 my $self = shift;
541
542 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
543
544 # Get the index information from $db.CNT
545
546 open(my $fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
547 binmode($fileCNT);
548
549 my $buff;
550
551 read($fileCNT, $buff, 26) || croak "can't read first table from CNT: $!";
552 $self->unpack_cnt($buff);
553
554 read($fileCNT, $buff, 26) || croak "can't read second table from CNT: $!";
555 $self->unpack_cnt($buff);
556
557 close($fileCNT);
558
559 return $self->{cnt};
560 }
561
562 =head2 unpack_cnt
563
564 Unpack one of two 26 bytes fixed length record in C<.CNT> file.
565
566 Here is definition of record:
567
568 off key description size
569 0: IDTYPE BTree type s
570 2: ORDN Nodes Order s
571 4: ORDF Leafs Order s
572 6: N Number of Memory buffers for nodes s
573 8: K Number of buffers for first level index s
574 10: LIV Current number of Index Levels s
575 12: POSRX Pointer to Root Record in N0x l
576 16: NMAXPOS Next Available position in N0x l
577 20: FMAXPOS Next available position in L0x l
578 24: ABNORMAL Formal BTree normality indicator s
579 length: 26 bytes
580
581 This will fill C<$self> object under C<cnt> with hash. It's used by C<read_cnt>.
582
583 =cut
584
585 sub unpack_cnt {
586 my $self = shift;
587
588 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
589
590 my $buff = shift || return;
591 my @arr = unpack("vvvvvvVVVv", $buff);
592
593 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
594
595 my $IDTYPE = shift @arr;
596 foreach (@flds) {
597 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
598 }
599 }
600
601 1;
602
603 =head1 BUGS
604
605 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
606 some variations in input databases which has been tested with this module.
607 When I was in doubt, I assumed that OpenIsis's implementation was right
608 (except for obvious bugs).
609
610 However, every effort has been made to test this module with as much
611 databases (and programs that create them) as possible.
612
613 I would be very greatful for success or failure reports about usage of this
614 module with databases from programs other than WinIsis and IsisMarc. I had
615 tested this against ouput of one C<isis.dll>-based application, but I don't
616 know any details about it's version.
617
618 =head1 AUTHOR
619
620 Dobrica Pavlinusic
621 CPAN ID: DPAVLIN
622 dpavlin@rot13.org
623 http://www.rot13.org/~dpavlin/
624
625 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
626 written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
627
628 =head1 COPYRIGHT
629
630 This program is free software; you can redistribute
631 it and/or modify it under the same terms as Perl itself.
632
633 The full text of the license can be found in the
634 LICENSE file included with this module.
635
636
637 =head1 SEE ALSO
638
639 OpenIsis web site L<http://www.openisis.org>
640
641 perl4lib site L<http://perl4lib.perl.org>
642

  ViewVC Help
Powered by ViewVC 1.1.26