/[Biblio-Isis]/trunk/IsisDB.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/IsisDB.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 25 - (show annotations)
Fri Dec 31 05:43:20 2004 UTC (19 years, 3 months ago) by dpavlin
File size: 13398 byte(s)
major improvments and new version:
- implement logically deleted records (really!)
- re-ordered values tests using cmp_ok so that reporting is correct,
- return record in fetch even if it's in memory (bugfix)
- removed some obsolete code

1 package IsisDB;
2 use strict;
3
4 use Carp;
5 use File::Glob qw(:globally :nocase);
6
7 use Data::Dumper;
8
9 BEGIN {
10 use Exporter ();
11 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
12 $VERSION = 0.08;
13 @ISA = qw (Exporter);
14 #Give a hoot don't pollute, do not export more than needed by default
15 @EXPORT = qw ();
16 @EXPORT_OK = qw ();
17 %EXPORT_TAGS = ();
18
19 }
20
21 =head1 NAME
22
23 IsisDB - Read CDS/ISIS, WinISIS and IsisMarc database
24
25 =head1 SYNOPSIS
26
27 use IsisDB;
28
29 my $isis = new IsisDB(
30 isisdb => './cds/cds',
31 );
32
33 for(my $mfn = 1; $mfn <= $isis->{'maxmfn'}; $mfn++) {
34 print $isis->to_ascii($mfn),"\n";
35 }
36
37 =head1 DESCRIPTION
38
39 This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
40 IsisMarc. It can be used as perl-only alternative to OpenIsis module.
41
42 It can create hash values from data in ISIS database (using C<to_hash>),
43 ASCII dump (using C<to_ascii>) or just hash with field names and packed
44 values (like C<^asomething^belse>).
45
46 Unique feature of this module is ability to C<include_deleted> records.
47 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
48 fields which are zero sized will be filled with random junk from memory).
49
50 It also has support for identifiers (only if ISIS database is created by
51 IsisMarc), see C<to_hash>.
52
53 This will module will always be slower than OpenIsis module which use C
54 library. However, since it's written in perl, it's platform independent (so
55 you don't need C compiler), and can be easily modified. I hope that it
56 creates data structures which are easier to use than ones created by
57 OpenIsis, so reduced time in other parts of the code should compensate for
58 slower performance of this module (speed of reading ISIS database is
59 rarely an issue).
60
61 =head1 METHODS
62
63 =cut
64
65 # my $ORDN; # Nodes Order
66 # my $ORDF; # Leafs Order
67 # my $N; # Number of Memory buffers for nodes
68 # my $K; # Number of buffers for first level index
69 # my $LIV; # Current number of Index Levels
70 # my $POSRX; # Pointer to Root Record in N0x
71 # my $NMAXPOS; # Next Available position in N0x
72 # my $FMAXPOS; # Next available position in L0x
73 # my $ABNORMAL; # Formal BTree normality indicator
74
75 #
76 # some binary reads
77 #
78
79 =head2 new
80
81 Open ISIS database
82
83 my $isis = new IsisDB(
84 isisdb => './cds/cds',
85 read_fdt => 1,
86 include_deleted => 1,
87 hash_filter => sub {
88 my $v = shift;
89 $v =~ s#foo#bar#g;
90 },
91 debug => 1,
92 );
93
94 Options are described below:
95
96 =over 5
97
98 =item isisdb
99
100 This is full or relative path to ISIS database files which include
101 common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
102 C<read_fdt> option) files.
103
104 In this example it uses C<./cds/cds.MST> and related files.
105
106 =item read_fdt
107
108 Boolean flag to specify if field definition table should be read. It's off
109 by default.
110
111 =item include_deleted
112
113 Don't skip logically deleted records in ISIS.
114
115 =item hash_filter
116
117 Filter code ref which will be used before data is converted to hash.
118
119 =item debug
120
121 Dump a B<lot> of debugging output.
122
123 =back
124
125 It will also set C<$isis-E<gt>{'maxmfn'}> which is maximum MFN stored in database.
126
127 =cut
128
129 sub new {
130 my $class = shift;
131 my $self = {};
132 bless($self, $class);
133
134 croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
135
136 foreach my $v (qw{isisdb debug include_deleted hash_filter}) {
137 $self->{$v} = {@_}->{$v};
138 }
139
140 my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
141
142 foreach my $f (@isis_files) {
143 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
144 $self->{lc($ext)."_file"} = $f;
145 }
146
147 my @must_exist = qw(mst xrf);
148 push @must_exist, "fdt" if ($self->{read_fdt});
149
150 foreach my $ext (@must_exist) {
151 croak "missing ",uc($ext)," file in ",$self->{isisdb} unless ($self->{$ext."_file"});
152 }
153
154 print STDERR "## using files: ",join(" ",@isis_files),"\n" if ($self->{debug});
155
156 # if you want to read .FDT file use read_fdt argument when creating class!
157 if ($self->{read_fdt} && -e $self->{fdt_file}) {
158
159 # read the $db.FDT file for tags
160 my $fieldzone=0;
161
162 open(fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
163
164 while (<fileFDT>) {
165 chomp;
166 if ($fieldzone) {
167 my $name=substr($_,0,30);
168 my $tag=substr($_,50,3);
169
170 $name =~ s/\s+$//;
171 $tag =~ s/\s+$//;
172
173 $self->{'TagName'}->{$tag}=$name;
174 }
175
176 if (/^\*\*\*/) {
177 $fieldzone=1;
178 }
179 }
180
181 close(fileFDT);
182 }
183
184 # Get the Maximum MFN from $db.MST
185
186 open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
187
188 # MST format: (* = 32 bit signed)
189 # CTLMFN* always 0
190 # NXTMFN* MFN to be assigned to the next record created
191 # NXTMFB* last block allocated to master file
192 # NXTMFP offset to next available position in last block
193 # MFTYPE always 0 for user db file (1 for system)
194 seek($self->{'fileMST'},4,0);
195
196 my $buff;
197
198 read($self->{'fileMST'}, $buff, 4);
199 $self->{'NXTMFN'}=unpack("l",$buff) || carp "NXTNFN is zero";
200
201 # save maximum MFN
202 $self->{'maxmfn'} = $self->{'NXTMFN'} - 1;
203
204
205
206
207 print STDERR Dumper($self),"\n" if ($self->{debug});
208
209 # open files for later
210 open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
211
212 $self ? return $self : return undef;
213 }
214
215 =head2 read_cnt
216
217 This function is not really used by module, but can be useful to find info
218 about your index (if debugging it for example).
219
220 print Dumper($isis->read_cnt);
221
222 =cut
223
224 sub read_cnt {
225 my $self = shift;
226
227 croak "missing CNT file in ",$self->{isisdb} unless ($self->{cnt_file});
228
229 # Get the index information from $db.CNT
230
231 open(fileCNT, $self->{cnt_file}) || croak "can't read '$self->{cnt_file}': $!";
232
233 # There is two 26 Bytes fixed lenght records
234
235 # 0: IDTYPE BTree type 16
236 # 2: ORDN Nodes Order 16
237 # 4: ORDF Leafs Order 16
238 # 6: N Number of Memory buffers for nodes 16
239 # 8: K Number of buffers for first level index 16
240 # 10: LIV Current number of Index Levels 16
241 # 12: POSRX* Pointer to Root Record in N0x 32
242 # 16: NMAXPOS* Next Available position in N0x 32
243 # 20: FMAXPOS* Next available position in L0x 32
244 # 24: ABNORMAL Formal BTree normality indicator 16
245 # length: 26 bytes
246
247 sub unpack_cnt {
248 my $self = shift;
249
250 my @flds = qw(ORDN ORDF N K LIV POSRX NMAXPOS FMAXPOS ABNORMAL);
251
252 my $buff = shift || return;
253 my @arr = unpack("ssssssllls", $buff);
254
255 print STDERR "unpack_cnt: ",join(" ",@arr),"\n" if ($self->{'debug'});
256
257 my $IDTYPE = shift @arr;
258 foreach (@flds) {
259 $self->{cnt}->{$IDTYPE}->{$_} = abs(shift @arr);
260 }
261 }
262
263 my $buff;
264
265 read(fileCNT, $buff, 26);
266 $self->unpack_cnt($buff);
267
268 read(fileCNT, $buff, 26);
269 $self->unpack_cnt($buff);
270
271 close(fileCNT);
272
273 return $self->{cnt};
274 }
275
276 =head2 fetch
277
278 Read record with selected MFN
279
280 my $rec = $isis->fetch(55);
281
282 Returns hash with keys which are field names and values are unpacked values
283 for that field like this:
284
285 $rec = {
286 '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
287 '990' => [ '2140', '88', 'HAY' ],
288 };
289
290 =cut
291
292 sub fetch {
293 my $self = shift;
294
295 my $mfn = shift || croak "fetch needs MFN as argument!";
296
297 # is mfn allready in memory?
298 my $old_mfn = $self->{'current_mfn'} || -1;
299 return $self->{record} if ($mfn == $old_mfn);
300
301 print STDERR "## fetch: $mfn\n" if ($self->{debug});
302
303 # XXX check this?
304 my $mfnpos=($mfn+int(($mfn-1)/127))*4;
305
306 print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
307 seek($self->{'fileXRF'},$mfnpos,0);
308
309 my $buff;
310
311 # delete old record
312 delete $self->{record};
313
314 # read XRFMFB abd XRFMFP
315 read($self->{'fileXRF'}, $buff, 4);
316 my $pointer=unpack("l",$buff) || carp "pointer is null";
317
318 # check for logically deleted record
319 if ($pointer < 0) {
320 print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
321 $self->{deleted} = $mfn;
322
323 return unless $self->{include_deleted};
324
325 $pointer = abs($pointer);
326 }
327
328 my $XRFMFB = int($pointer/2048);
329 my $XRFMFP = $pointer - ($XRFMFB*2048);
330
331 # (XRFMFB - 1) * 512 + XRFMFP
332 # why do i have to do XRFMFP % 1024 ?
333
334 my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 1024);
335
336 print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
337
338 # Get Record Information
339
340 seek($self->{'fileMST'},$blk_off,0);
341
342 read($self->{'fileMST'}, $buff, 4);
343 my $value=unpack("l",$buff);
344
345 print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
346
347 if ($value!=$mfn) {
348 carp "Error: MFN ".$mfn." not found in MST(".$value.")";
349 #return;
350 }
351
352 read($self->{'fileMST'}, $buff, 14);
353
354 my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("slssss", $buff);
355
356 print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
357
358 warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
359
360 warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
361
362 # Get Directory Format
363
364 my @FieldPOS;
365 my @FieldLEN;
366 my @FieldTAG;
367
368 read($self->{'fileMST'}, $buff, 6 * $NVF);
369
370 my $rec_len = 0;
371
372 for (my $i = 0 ; $i < $NVF ; $i++) {
373
374 my ($TAG,$POS,$LEN) = unpack("sss", substr($buff,$i * 6, 6));
375
376 print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
377
378 # The TAG does not exists in .FDT so we set it to 0.
379 #
380 # XXX This is removed from perl version; .FDT file is updated manually, so
381 # you will often have fields in .MST file which aren't in .FDT. On the other
382 # hand, IsisMarc doesn't use .FDT files at all!
383
384 #if (! $self->{TagName}->{$TAG}) {
385 # $TAG=0;
386 #}
387
388 push @FieldTAG,$TAG;
389 push @FieldPOS,$POS;
390 push @FieldLEN,$LEN;
391
392 $rec_len += $LEN;
393 }
394
395 # Get Variable Fields
396
397 read($self->{'fileMST'},$buff,$rec_len);
398
399 print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
400
401 for (my $i = 0 ; $i < $NVF ; $i++) {
402 # skip zero-sized fields
403 next if ($FieldLEN[$i] == 0);
404
405 push @{$self->{record}->{$FieldTAG[$i]}}, substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
406 }
407
408 $self->{'current_mfn'} = $mfn;
409
410 print STDERR Dumper($self),"\n" if ($self->{debug});
411
412 return $self->{'record'};
413 }
414
415 =head2 to_ascii
416
417 Dump ASCII output of record with specified MFN
418
419 print $isis->to_ascii(42);
420
421 It outputs something like this:
422
423 210 ^aNew York^cNew York University press^dcop. 1988
424 990 2140
425 990 88
426 990 HAY
427
428 If C<read_fdt> is specified when calling C<new> it will display field names
429 from C<.FDT> file instead of numeric tags.
430
431 =cut
432
433 sub to_ascii {
434 my $self = shift;
435
436 my $mfn = shift || croak "need MFN";
437
438 my $rec = $self->fetch($mfn);
439
440 my $out = "0\t$mfn";
441
442 foreach my $f (sort keys %{$rec}) {
443 my $fn = $self->tag_name($f);
444 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
445 }
446
447 $out .= "\n";
448
449 return $out;
450 }
451
452 =head2 to_hash
453
454 Read record with specified MFN and convert it to hash
455
456 my $hash = $isis->to_hash($mfn);
457
458 It has ability to convert characters (using C<hash_filter> from ISIS
459 database before creating structures enabling character re-mapping or quick
460 fix-up of data.
461
462 This function returns hash which is like this:
463
464 $hash = {
465 '210' => [
466 {
467 'c' => 'New York University press',
468 'a' => 'New York',
469 'd' => 'cop. 1988'
470 }
471 ],
472 '990' => [
473 '2140',
474 '88',
475 'HAY'
476 ],
477 };
478
479 You can later use that hash to produce any output from ISIS data.
480
481 If database is created using IsisMarc, it will also have to special fields
482 which will be used for identifiers, C<i1> and C<i2> like this:
483
484 '200' => [
485 {
486 'i1' => '1',
487 'i2' => ' '
488 'a' => 'Goa',
489 'f' => 'Valdo D\'Arienzo',
490 'e' => 'tipografie e tipografi nel XVI secolo',
491 }
492 ],
493
494 This method will also create additional field C<000> with MFN.
495
496 =cut
497
498 sub to_hash {
499 my $self = shift;
500
501 my $mfn = shift || confess "need mfn!";
502
503 # init record to include MFN as field 000
504 my $rec = { '000' => [ $mfn ] };
505
506 my $row = $self->fetch($mfn);
507
508 foreach my $k (keys %{$row}) {
509 foreach my $l (@{$row->{$k}}) {
510
511 # filter output
512 $l = $self->{'hash_filter'}->($l) if ($self->{'hash_filter'});
513
514 my $val;
515
516 # has identifiers?
517 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
518
519 # has subfields?
520 if ($l =~ m/\^/) {
521 foreach my $t (split(/\^/,$l)) {
522 next if (! $t);
523 $val->{substr($t,0,1)} = substr($t,1);
524 }
525 } else {
526 $val = $l;
527 }
528
529 push @{$rec->{$k}}, $val;
530 }
531 }
532
533 return $rec;
534 }
535
536 =head2 tag_name
537
538 Return name of selected tag
539
540 print $isis->tag_name('200');
541
542 =cut
543
544 sub tag_name {
545 my $self = shift;
546 my $tag = shift || return;
547 return $self->{'TagName'}->{$tag} || $tag;
548 }
549
550 1;
551
552 =head1 BUGS
553
554 This module has been very lightly tested. Use with caution and report bugs.
555
556 =head1 AUTHOR
557
558 Dobrica Pavlinusic
559 CPAN ID: DPAVLIN
560 dpavlin@rot13.org
561 http://www.rot13.org/~dpavlin/
562
563 This module is based heavily on code from C<LIBISIS.PHP> library to read ISIS files V0.1.1
564 written in php and (c) 2000 Franck Martin <franck@sopac.org> and released under LGPL.
565
566 =head1 COPYRIGHT
567
568 This program is free software; you can redistribute
569 it and/or modify it under the same terms as Perl itself.
570
571 The full text of the license can be found in the
572 LICENSE file included with this module.
573
574
575 =head1 SEE ALSO
576
577 OpenIsis web site L<http://www.openisis.org>
578
579 perl4lib site L<http://perl4lib.perl.org>
580

  ViewVC Help
Powered by ViewVC 1.1.26