/[Biblio-Isis]/trunk/lib/Biblio/Isis.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/Biblio/Isis.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 27 - (show annotations)
Sat Jan 1 22:29:35 2005 UTC (19 years, 2 months ago) by dpavlin
Original Path: trunk/IsisDB.pm
File size: 14273 byte(s)
documentation improvement

1 package IsisDB;
2 use strict;
3
4 use Carp;
5 use File::Glob qw(:globally :nocase);
6
7 use Data::Dumper;
8
9 BEGIN {
10 use Exporter ();
11 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 $VERSION = 0.08;
13 @ISA = qw (Exporter);
14 #Give a hoot don't pollute, do not export more than needed by default
15 @EXPORT = qw ();
16 @EXPORT_OK = qw ();
17 %EXPORT_TAGS = ();
18
19 }
20
21 =head1 NAME
22
23 IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database
24
25 =head1 SYNOPSIS
26
27 use IsisDB;
28
29 my $isis = new IsisDB(
30 isisdb => './cds/cds',
31 );
32
33 for(my $mfn = 1; $mfn <= $isis->{'maxmfn'}; $mfn++) {
34 print $isis->to_ascii($mfn),"\n";
35 }
36
37 =head1 DESCRIPTION
38
39 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40 IsisMarc. It can be used as perl-only alternative to OpenIsis module which
41 seems to depriciate it's old C<XS> bindings for perl.
42
43 It can create hash values from data in ISIS database (using C<to_hash>),
44 ASCII dump (using C<to_ascii>) or just hash with field names and packed
45 values (like C<^asomething^belse>).
46
47 Unique feature of this module is ability to C<include_deleted> records.
48 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
49 fields which are zero sized will be filled with random junk from memory).
50
51 It also has support for identifiers (only if ISIS database is created by
52 IsisMarc), see C<to_hash>.
53
54 This module will always be slower than OpenIsis module which use C
55 library. However, since it's written in perl, it's platform independent (so
56 you don't need C compiler), and can be easily modified. I hope that it
57 creates data structures which are easier to use than ones created by
58 OpenIsis, so reduced time in other parts of the code should compensate for
59 slower performance of this module (speed of reading ISIS database is
60 rarely an issue).
61
62 =head1 METHODS
63
64 =cut
65
66 # my $ORDN; # Nodes Order
67 # my $ORDF; # Leafs Order
68 # my $N; # Number of Memory buffers for nodes
69 # my $K; # Number of buffers for first level index
70 # my $LIV; # Current number of Index Levels
71 # my $POSRX; # Pointer to Root Record in N0x
72 # my $NMAXPOS; # Next Available position in N0x
73 # my $FMAXPOS; # Next available position in L0x
74 # my $ABNORMAL; # Formal BTree normality indicator
75
76 #
77 # some binary reads
78 #
79
80 =head2 new
81
82 Open ISIS database
83
84 my $isis = new IsisDB(
85 isisdb => './cds/cds',
86 read_fdt => 1,
87 include_deleted => 1,
88 hash_filter => sub {
89 my $v = shift;
90 $v =~ s#foo#bar#g;
91 },
92 debug => 1,
93 );
94
95 Options are described below:
96
97 =over 5
98
99 =item isisdb
100
101 This is full or relative path to ISIS database files which include
102 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
103 C<read_fdt> option) files.
104
105 In this example it uses C<./cds/cds.MST> and related files.
106
107 =item read_fdt
108
109 Boolean flag to specify if field definition table should be read. It's off
110 by default.
111
112 =item include_deleted
113
114 Don't skip logically deleted records in ISIS.
115
116 =item hash_filter
117
118 Filter code ref which will be used before data is converted to hash.
119
120 =item debug
121
122 Dump a B<lot> of debugging output.
123
124 =back
125
126 It will also set C<$isis-E<gt>{'maxmfn'}> which is maximum MFN stored in database.
127
128 =cut
129
130 sub new {
131 my $class = shift;
132 my $self = {};
133 bless($self, $class);
134
135 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
136
137 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
138 $self->{$v} = {@_}->{$v};
139 }
140
141 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
142
143 foreach my $f (@isis_files) {
144 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
145 $self->{lc($ext)."_file"} = $f;
146 }
147
148 my @must_exist = qw(mst xrf);
149 push @must_exist, "fdt" if ($self->{read_fdt});
150
151 foreach my $ext (@must_exist) {
152 croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
153 }
154
155 print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
156
157 # if you want to read .FDT file use read_fdt argument when creating class!
158 if ($self->{read_fdt} && -e $self->{fdt_file}) {
159
160 # read the $db.FDT file for tags
161 my $fieldzone=0;
162
163 open(fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
164
165 while (<fileFDT>) {
166 chomp;
167 if ($fieldzone) {
168 my $name=substr($_,0,30);
169 my $tag=substr($_,50,3);
170
171 $name =~ s/\s+$//;
172 $tag =~ s/\s+$//;
173
174 $self->{'TagName'}->{$tag}=$name;
175 }
176
177 if (/^\*\*\*/) {
178 $fieldzone=1;
179 }
180 }
181
182 close(fileFDT);
183 }
184
185 # Get the Maximum MFN from $db.MST
186
187 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
188
189 # MST format: (* = 32 bit signed)
190 # CTLMFN* always 0
191 # NXTMFN* MFN to be assigned to the next record created
192 # NXTMFB* last block allocated to master file
193 # NXTMFP offset to next available position in last block
194 # MFTYPE always 0 for user db file (1 for system)
195 seek($self->{'fileMST'},4,0);
196
197 my $buff;
198
199 read($self->{'fileMST'}, $buff, 4);
200 $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero";
201
202 # save maximum MFN
203 $self->{'maxmfn'} = $self->{'NXTMFN'} - 1;
204
205
206
207
208 print STDERR Dumper($self),"\n" if ($self->{debug});
209
210 # open files for later
211 open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
212
213 $self ? return $self : return undef;
214 }
215
216 =head2 read_cnt
217
218 Read content of C<.CNT> file and return hash containing it.
219
220 print Dumper($isis->read_cnt);
221
222 This function is not used by module (C<.CNT> files are not required for this
223 module to work), but it can be useful to examine your index (while debugging
224 for example).
225
226 =cut
227
228 sub read_cnt {
229 my $self = shift;
230
231 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
232
233 # Get the index information from $db.CNT
234
235 open(fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
236
237 # There is two 26 Bytes fixed lenght records
238
239 # 0: IDTYPE BTree type 16
240 # 2: ORDN Nodes Order 16
241 # 4: ORDF Leafs Order 16
242 # 6: N Number of Memory buffers for nodes 16
243 # 8: K Number of buffers for first level index 16
244 # 10: LIV Current number of Index Levels 16
245 # 12: POSRX* Pointer to Root Record in N0x 32
246 # 16: NMAXPOS* Next Available position in N0x 32
247 # 20: FMAXPOS* Next available position in L0x 32
248 # 24: ABNORMAL Formal BTree normality indicator 16
249 # length: 26 bytes
250
251 sub unpack_cnt {
252 my $self = shift;
253
254 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
255
256 my $buff = shift || return;
257 my @arr = unpack("ssssssllls", $buff);
258
259 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
260
261 my $IDTYPE = shift @arr;
262 foreach (@flds) {
263 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
264 }
265 }
266
267 my $buff;
268
269 read(fileCNT, $buff, 26);
270 $self->unpack_cnt($buff);
271
272 read(fileCNT, $buff, 26);
273 $self->unpack_cnt($buff);
274
275 close(fileCNT);
276
277 return $self->{cnt};
278 }
279
280 =head2 fetch
281
282 Read record with selected MFN
283
284 my $rec = $isis->fetch(55);
285
286 Returns hash with keys which are field names and values are unpacked values
287 for that field like this:
288
289 $rec = {
290 '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
291 '990' => [ '2140', '88', 'HAY' ],
292 };
293
294 =cut
295
296 sub fetch {
297 my $self = shift;
298
299 my $mfn = shift || croak "fetch needs MFN as argument!";
300
301 # is mfn allready in memory?
302 my $old_mfn = $self->{'current_mfn'} || -1;
303 return $self->{record} if ($mfn == $old_mfn);
304
305 print STDERR "## fetch: $mfn\n" if ($self->{debug});
306
307 # XXX check this?
308 my $mfnpos=($mfn+int(($mfn-1)/127))*4;
309
310 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
311 seek($self->{'fileXRF'},$mfnpos,0);
312
313 my $buff;
314
315 # delete old record
316 delete $self->{record};
317
318 # read XRFMFB abd XRFMFP
319 read($self->{'fileXRF'}, $buff, 4);
320 my $pointer=unpack("l",$buff) || carp "pointer is null";
321
322 # check for logically deleted record
323 if ($pointer < 0) {
324 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
325 $self->{deleted} = $mfn;
326
327 return unless $self->{include_deleted};
328
329 $pointer = abs($pointer);
330 }
331
332 my $XRFMFB = int($pointer/2048);
333 my $XRFMFP = $pointer - ($XRFMFB*2048);
334
335 # (XRFMFB - 1) * 512 + XRFMFP
336 # why do i have to do XRFMFP % 1024 ?
337
338 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
339
340 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
341
342 # Get Record Information
343
344 seek($self->{'fileMST'},$blk_off,0);
345
346 read($self->{'fileMST'}, $buff, 4);
347 my $value=unpack("l",$buff);
348
349 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
350
351 if ($value!=$mfn) {
352 if ($value == 0) {
353 print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
354 $self->{deleted} = $mfn;
355 return;
356 }
357
358 carp "Error: MFN ".$mfn." not found in MST file, found $value";
359 return;
360 }
361
362 read($self->{'fileMST'}, $buff, 14);
363
364 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);
365
366 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
367
368 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
369
370 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
371
372 # Get Directory Format
373
374 my @FieldPOS;
375 my @FieldLEN;
376 my @FieldTAG;
377
378 read($self->{'fileMST'}, $buff, 6 * $NVF);
379
380 my $rec_len = 0;
381
382 for (my $i = 0 ; $i < $NVF ; $i++) {
383
384 my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));
385
386 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
387
388 # The TAG does not exists in .FDT so we set it to 0.
389 #
390 # XXX This is removed from perl version; .FDT file is updated manually, so
391 # you will often have fields in .MST file which aren't in .FDT. On the other
392 # hand, IsisMarc doesn't use .FDT files at all!
393
394 #if (! $self->{TagName}->{$TAG}) {
395 # $TAG=0;
396 #}
397
398 push @FieldTAG,$TAG;
399 push @FieldPOS,$POS;
400 push @FieldLEN,$LEN;
401
402 $rec_len += $LEN;
403 }
404
405 # Get Variable Fields
406
407 read($self->{'fileMST'},$buff,$rec_len);
408
409 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
410
411 for (my $i = 0 ; $i < $NVF ; $i++) {
412 # skip zero-sized fields
413 next if ($FieldLEN[$i] == 0);
414
415 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
416 }
417
418 $self->{'current_mfn'} = $mfn;
419
420 print STDERR Dumper($self),"\n" if ($self->{debug});
421
422 return $self->{'record'};
423 }
424
425 =head2 to_ascii
426
427 Returns ASCII output of record with specified MFN
428
429 print $isis->to_ascii(42);
430
431 This outputs something like this:
432
433 210 ^aNew York^cNew York University press^dcop. 1988
434 990 2140
435 990 88
436 990 HAY
437
438 If C<read_fdt> is specified when calling C<new> it will display field names
439 from C<.FDT> file instead of numeric tags.
440
441 =cut
442
443 sub to_ascii {
444 my $self = shift;
445
446 my $mfn = shift || croak "need MFN";
447
448 my $rec = $self->fetch($mfn);
449
450 my $out = "0\t$mfn";
451
452 foreach my $f (sort keys %{$rec}) {
453 my $fn = $self->tag_name($f);
454 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
455 }
456
457 $out .= "\n";
458
459 return $out;
460 }
461
462 =head2 to_hash
463
464 Read record with specified MFN and convert it to hash
465
466 my $hash = $isis->to_hash($mfn);
467
468 It has ability to convert characters (using C<hash_filter>) from ISIS
469 database before creating structures enabling character re-mapping or quick
470 fix-up of data.
471
472 This function returns hash which is like this:
473
474 $hash = {
475 '210' => [
476 {
477 'c' => 'New York University press',
478 'a' => 'New York',
479 'd' => 'cop. 1988'
480 }
481 ],
482 '990' => [
483 '2140',
484 '88',
485 'HAY'
486 ],
487 };
488
489 You can later use that hash to produce any output from ISIS data.
490
491 If database is created using IsisMarc, it will also have to special fields
492 which will be used for identifiers, C<i1> and C<i2> like this:
493
494 '200' => [
495 {
496 'i1' => '1',
497 'i2' => ' '
498 'a' => 'Goa',
499 'f' => 'Valdo D\'Arienzo',
500 'e' => 'tipografie e tipografi nel XVI secolo',
501 }
502 ],
503
504 This method will also create additional field C<000> with MFN.
505
506 =cut
507
508 sub to_hash {
509 my $self = shift;
510
511 my $mfn = shift || confess "need mfn!";
512
513 # init record to include MFN as field 000
514 my $rec = { '000' => [ $mfn ] };
515
516 my $row = $self->fetch($mfn);
517
518 foreach my $k (keys %{$row}) {
519 foreach my $l (@{$row->{$k}}) {
520
521 # filter output
522 $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
523
524 my $val;
525
526 # has identifiers?
527 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
528
529 # has subfields?
530 if ($l =~ m/\^/) {
531 foreach my $t (split(/\^/,$l)) {
532 next if (! $t);
533 $val->{substr($t,0,1)} = substr($t,1);
534 }
535 } else {
536 $val = $l;
537 }
538
539 push @{$rec->{$k}}, $val;
540 }
541 }
542
543 return $rec;
544 }
545
546 =head2 tag_name
547
548 Return name of selected tag
549
550 print $isis->tag_name('200');
551
552 =cut
553
554 sub tag_name {
555 my $self = shift;
556 my $tag = shift || return;
557 return $self->{'TagName'}->{$tag} || $tag;
558 }
559
560 1;
561
562 =head1 BUGS
563
564 Some parts of CDS/ISIS documentation are not detailed enough to exmplain
565 some variations in input databases which has been tested with this module.
566 When I was in doubt, I assumed that OpenIsis's implementation was right
567 (except for obvious bugs).
568
569 However, every effort has been made to test this module with as much
570 databases (and programs that create them) as possible.
571
572 I would be very greatful for success or failure reports about usage of this
573 module with databases from programs other than WinIsis and IsisMarc. I had
574 tested this against ouput of one C<isis.dll>-based application, but I don't
575 know any details about it's version.
576
577 =head1 AUTHOR
578
579 Dobrica Pavlinusic
580 CPAN ID: DPAVLIN
581 dpavlin@rot13.org
582 http://www.rot13.org/~dpavlin/
583
584 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
585 written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
586
587 =head1 COPYRIGHT
588
589 This program is free software; you can redistribute
590 it and/or modify it under the same terms as Perl itself.
591
592 The full text of the license can be found in the
593 LICENSE file included with this module.
594
595
596 =head1 SEE ALSO
597
598 OpenIsis web site L<http://www.openisis.org>
599
600 perl4lib site L<http://perl4lib.perl.org>
601

  ViewVC Help
Powered by ViewVC 1.1.26