Line # Revision Author
1 36 dpavlin package Biblio::Isis;
2 1 dpavlin use strict;
3
4 use Carp;
5 18 dpavlin use File::Glob qw(:globally :nocase);
6
7 1 dpavlin BEGIN {
8 use Exporter ();
9 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 72 dpavlin $VERSION = 0.24;
11 1 dpavlin @ISA = qw (Exporter);
12 #Give a hoot don't pollute, do not export more than needed by default
13 @EXPORT = qw ();
14 @EXPORT_OK = qw ();
15 %EXPORT_TAGS = ();
16
17 }
18
19 =head1 NAME
20
21 36 dpavlin Biblio::Isis - Read CDS/ISIS, WinISIS and IsisMarc database
22 1 dpavlin
23 =head1 SYNOPSIS
24
25 36 dpavlin use Biblio::Isis;
26 11 dpavlin
27 36 dpavlin my $isis = new Biblio::Isis(
28 1 dpavlin isisdb => './cds/cds',
29 );
30
31 32 dpavlin for(my $mfn = 1; $mfn <= $isis->count; $mfn++) {
32 11 dpavlin print $isis->to_ascii($mfn),"\n";
33 }
34
35 1 dpavlin =head1 DESCRIPTION
36
37 15 dpavlin This module will read ISIS databases created by DOS CDS/ISIS, WinIsis or
38 27 dpavlin IsisMarc. It can be used as perl-only alternative to OpenIsis module which
39 seems to depriciate it's old C<XS> bindings for perl.
40 1 dpavlin
41 15 dpavlin It can create hash values from data in ISIS database (using C<to_hash>),
42 ASCII dump (using C<to_ascii>) or just hash with field names and packed
43 values (like C<^asomething^belse>).
44 11 dpavlin
45 Unique feature of this module is ability to C<include_deleted> records.
46 It will also skip zero sized fields (OpenIsis has a bug in XS bindings, so
47 fields which are zero sized will be filled with random junk from memory).
48
49 15 dpavlin It also has support for identifiers (only if ISIS database is created by
50 IsisMarc), see C<to_hash>.
51
52 27 dpavlin This module will always be slower than OpenIsis module which use C
53 15 dpavlin library. However, since it's written in perl, it's platform independent (so
54 you don't need C compiler), and can be easily modified. I hope that it
55 creates data structures which are easier to use than ones created by
56 OpenIsis, so reduced time in other parts of the code should compensate for
57 slower performance of this module (speed of reading ISIS database is
58 rarely an issue).
59
60 1 dpavlin =head1 METHODS
61
62 =cut
63
64 # my $ORDN; # Nodes Order
65 # my $ORDF; # Leafs Order
66 # my $N; # Number of Memory buffers for nodes
67 # my $K; # Number of buffers for first level index
68 # my $LIV; # Current number of Index Levels
69 # my $POSRX; # Pointer to Root Record in N0x
70 # my $NMAXPOS; # Next Available position in N0x
71 # my $FMAXPOS; # Next available position in L0x
72 # my $ABNORMAL; # Formal BTree normality indicator
73
74 #
75 # some binary reads
76 #
77
78 =head2 new
79
80 15 dpavlin Open ISIS database
81 1 dpavlin
82 36 dpavlin my $isis = new Biblio::Isis(
83 1 dpavlin isisdb => './cds/cds',
84 read_fdt => 1,
85 12 dpavlin include_deleted => 1,
86 hash_filter => sub {
87 64 dpavlin my ($v,$field_number) = @_;
88 12 dpavlin $v =~ s#foo#bar#g;
89 },
90 1 dpavlin debug => 1,
91 57 dpavlin join_subfields_with => ' ; ',
92 1 dpavlin );
93
94 2 dpavlin Options are described below:
95
96 =over 5
97
98 1 dpavlin =item isisdb
99
100 15 dpavlin This is full or relative path to ISIS database files which include
101 18 dpavlin common prefix of C<.MST>, and C<.XRF> and optionally C<.FDT> (if using
102 C<read_fdt> option) files.
103 1 dpavlin
104 15 dpavlin In this example it uses C<./cds/cds.MST> and related files.
105
106 1 dpavlin =item read_fdt
107
108 Boolean flag to specify if field definition table should be read. It's off
109 by default.
110
111 9 dpavlin =item include_deleted
112
113 11 dpavlin Don't skip logically deleted records in ISIS.
114 9 dpavlin
115 12 dpavlin =item hash_filter
116
117 65 dpavlin Filter code ref which will be used before data is converted to hash. It will
118 receive two arguments, whole line from current field (in C<< $_[0] >>) and
119 field number (in C<< $_[1] >>).
120 12 dpavlin
121 =item debug
122
123 54 dpavlin Dump a B<lot> of debugging output even at level 1. For even more increase level.
124 12 dpavlin
125 57 dpavlin =item join_subfields_with
126
127 Define delimiter which will be used to join repeatable subfields. This
128 option is included to support lagacy application written against version
129 older than 0.21 of this module. By default, it disabled. See L</to_hash>.
130
131 70 dpavlin =item ignore_empty_subfields
132
133 Remove all empty subfields while reading from ISIS file.
134
135 2 dpavlin =back
136
137 1 dpavlin =cut
138
139 sub new {
140 my $class = shift;
141 my $self = {};
142 bless($self, $class);
143
144 9 dpavlin croak "new needs database name (isisdb) as argument!" unless ({@_}->{isisdb});
145 1 dpavlin
146 70 dpavlin foreach my $v (qw{isisdb debug include_deleted hash_filter join_subfields_with ignore_empty_subfields}) {
147 69 dpavlin $self->{$v} = {@_}->{$v} if defined({@_}->{$v});
148 9 dpavlin }
149 1 dpavlin
150 18 dpavlin my @isis_files = grep(/\.(FDT|MST|XRF|CNT)$/i,glob($self->{isisdb}."*"));
151
152 foreach my $f (@isis_files) {
153 my $ext = $1 if ($f =~ m/\.(\w\w\w)$/);
154 $self->{lc($ext)."_file"} = $f;
155 }
156
157 my @must_exist = qw(mst xrf);
158 push @must_exist, "fdt" if ($self->{read_fdt});
159
160 foreach my $ext (@must_exist) {
161 39 dpavlin unless ($self->{$ext."_file"}) {
162 carp "missing ",uc($ext)," file in ",$self->{isisdb};
163 return;
164 }
165 18 dpavlin }
166
167 45 dpavlin if ($self->{debug}) {
168 print STDERR "## using files: ",join(" ",@isis_files),"\n";
169 eval "use Data::Dump";
170 18 dpavlin
171 45 dpavlin if (! $@) {
172 *Dumper = *Data::Dump::dump;
173 } else {
174 use Data::Dumper;
175 }
176 }
177
178 1 dpavlin # if you want to read .FDT file use read_fdt argument when creating class!
179 18 dpavlin if ($self->{read_fdt} && -e $self->{fdt_file}) {
180 1 dpavlin
181 # read the $db.FDT file for tags
182 my $fieldzone=0;
183
184 33 dpavlin open(my $fileFDT, $self->{fdt_file}) || croak "can't read '$self->{fdt_file}': $!";
185 binmode($fileFDT);
186 1 dpavlin
187 33 dpavlin while (<$fileFDT>) {
188 1 dpavlin chomp;
189 if ($fieldzone) {
190 my $name=substr($_,0,30);
191 my $tag=substr($_,50,3);
192
193 $name =~ s/\s+$//;
194 $tag =~ s/\s+$//;
195
196 $self->{'TagName'}->{$tag}=$name;
197 }
198
199 if (/^\*\*\*/) {
200 $fieldzone=1;
201 }
202 }
203
204 33 dpavlin close($fileFDT);
205 1 dpavlin }
206
207 # Get the Maximum MFN from $db.MST
208
209 18 dpavlin open($self->{'fileMST'}, $self->{mst_file}) || croak "can't open '$self->{mst_file}': $!";
210 33 dpavlin binmode($self->{'fileMST'});
211 1 dpavlin
212 # MST format: (* = 32 bit signed)
213 # CTLMFN* always 0
214 # NXTMFN* MFN to be assigned to the next record created
215 # NXTMFB* last block allocated to master file
216 # NXTMFP offset to next available position in last block
217 # MFTYPE always 0 for user db file (1 for system)
218 34 dpavlin seek($self->{'fileMST'},4,0) || croak "can't seek to offset 0 in MST: $!";
219 1 dpavlin
220 11 dpavlin my $buff;
221
222 34 dpavlin read($self->{'fileMST'}, $buff, 4) || croak "can't read NXTMFN from MST: $!";
223 $self->{'NXTMFN'}=unpack("V",$buff) || croak "NXTNFN is zero";
224 11 dpavlin
225 45 dpavlin print STDERR "## self ",Dumper($self),"\n" if ($self->{debug});
226 18 dpavlin
227 # open files for later
228 open($self->{'fileXRF'}, $self->{xrf_file}) || croak "can't open '$self->{xrf_file}': $!";
229 33 dpavlin binmode($self->{'fileXRF'});
230 18 dpavlin
231 $self ? return $self : return undef;
232 }
233
234 32 dpavlin =head2 count
235
236 Return number of records in database
237
238 print $isis->count;
239
240 =cut
241
242 sub count {
243 my $self = shift;
244 return $self->{'NXTMFN'} - 1;
245 }
246
247 7 dpavlin =head2 fetch
248 1 dpavlin
249 2 dpavlin Read record with selected MFN
250 1 dpavlin
251 7 dpavlin my $rec = $isis->fetch(55);
252 2 dpavlin
253 Returns hash with keys which are field names and values are unpacked values
254 15 dpavlin for that field like this:
255 2 dpavlin
256 15 dpavlin $rec = {
257 '210' => [ '^aNew York^cNew York University press^dcop. 1988' ],
258 '990' => [ '2140', '88', 'HAY' ],
259 };
260
261 2 dpavlin =cut
262
263 7 dpavlin sub fetch {
264 1 dpavlin my $self = shift;
265
266 7 dpavlin my $mfn = shift || croak "fetch needs MFN as argument!";
267 1 dpavlin
268 16 dpavlin # is mfn allready in memory?
269 my $old_mfn = $self->{'current_mfn'} || -1;
270 25 dpavlin return $self->{record} if ($mfn == $old_mfn);
271 1 dpavlin
272 16 dpavlin print STDERR "## fetch: $mfn\n" if ($self->{debug});
273
274 1 dpavlin # XXX check this?
275 my $mfnpos=($mfn+int(($mfn-1)/127))*4;
276
277 18 dpavlin print STDERR "## seeking to $mfnpos in file '$self->{xrf_file}'\n" if ($self->{debug});
278 7 dpavlin seek($self->{'fileXRF'},$mfnpos,0);
279 1 dpavlin
280 11 dpavlin my $buff;
281
282 25 dpavlin # delete old record
283 delete $self->{record};
284
285 1 dpavlin # read XRFMFB abd XRFMFP
286 11 dpavlin read($self->{'fileXRF'}, $buff, 4);
287 41 dpavlin my $pointer=unpack("V",$buff);
288 if (! $pointer) {
289 if ($self->{include_deleted}) {
290 return;
291 } else {
292 warn "pointer for MFN $mfn is null\n";
293 return;
294 }
295 }
296 1 dpavlin
297 25 dpavlin # check for logically deleted record
298 33 dpavlin if ($pointer & 0x80000000) {
299 25 dpavlin print STDERR "## record $mfn is logically deleted\n" if ($self->{debug});
300 $self->{deleted} = $mfn;
301
302 return unless $self->{include_deleted};
303
304 33 dpavlin # abs
305 $pointer = ($pointer ^ 0xffffffff) + 1;
306 25 dpavlin }
307
308 1 dpavlin my $XRFMFB = int($pointer/2048);
309 my $XRFMFP = $pointer - ($XRFMFB*2048);
310
311 16 dpavlin # (XRFMFB - 1) * 512 + XRFMFP
312 # why do i have to do XRFMFP % 1024 ?
313 1 dpavlin
314 26 dpavlin my $blk_off = (($XRFMFB - 1) * 512) + ($XRFMFP % 512);
315 1 dpavlin
316 16 dpavlin print STDERR "## pointer: $pointer XRFMFB: $XRFMFB XRFMFP: $XRFMFP offset: $blk_off\n" if ($self->{'debug'});
317 1 dpavlin
318 # Get Record Information
319
320 33 dpavlin seek($self->{'fileMST'},$blk_off,0) || croak "can't seek to $blk_off: $!";
321 1 dpavlin
322 33 dpavlin read($self->{'fileMST'}, $buff, 4) || croak "can't read 4 bytes at offset $blk_off from MST file: $!";
323 my $value=unpack("V",$buff);
324 1 dpavlin
325 16 dpavlin print STDERR "## offset for rowid $value is $blk_off (blk $XRFMFB off $XRFMFP)\n" if ($self->{debug});
326
327 1 dpavlin if ($value!=$mfn) {
328 26 dpavlin if ($value == 0) {
329 print STDERR "## record $mfn is physically deleted\n" if ($self->{debug});
330 $self->{deleted} = $mfn;
331 return;
332 }
333
334 carp "Error: MFN ".$mfn." not found in MST file, found $value";
335 return;
336 1 dpavlin }
337
338 7 dpavlin read($self->{'fileMST'}, $buff, 14);
339 1 dpavlin
340 33 dpavlin my ($MFRL,$MFBWB,$MFBWP,$BASE,$NVF,$STATUS) = unpack("vVvvvv", $buff);
341 1 dpavlin
342 16 dpavlin print STDERR "## MFRL: $MFRL MFBWB: $MFBWB MFBWP: $MFBWP BASE: $BASE NVF: $NVF STATUS: $STATUS\n" if ($self->{debug});
343 1 dpavlin
344 25 dpavlin warn "MFRL $MFRL is not even number" unless ($MFRL % 2 == 0);
345 9 dpavlin
346 16 dpavlin warn "BASE is not 18+6*NVF" unless ($BASE == 18 + 6 * $NVF);
347
348 1 dpavlin # Get Directory Format
349
350 my @FieldPOS;
351 my @FieldLEN;
352 my @FieldTAG;
353
354 8 dpavlin read($self->{'fileMST'}, $buff, 6 * $NVF);
355
356 16 dpavlin my $rec_len = 0;
357 8 dpavlin
358 1 dpavlin for (my $i = 0 ; $i < $NVF ; $i++) {
359
360 33 dpavlin my ($TAG,$POS,$LEN) = unpack("vvv", substr($buff,$i * 6, 6));
361 1 dpavlin
362 16 dpavlin print STDERR "## TAG: $TAG POS: $POS LEN: $LEN\n" if ($self->{debug});
363 1 dpavlin
364 # The TAG does not exists in .FDT so we set it to 0.
365 #
366 # XXX This is removed from perl version; .FDT file is updated manually, so
367 # you will often have fields in .MST file which aren't in .FDT. On the other
368 # hand, IsisMarc doesn't use .FDT files at all!
369
370 #if (! $self->{TagName}->{$TAG}) {
371 # $TAG=0;
372 #}
373
374 push @FieldTAG,$TAG;
375 push @FieldPOS,$POS;
376 push @FieldLEN,$LEN;
377 8 dpavlin
378 16 dpavlin $rec_len += $LEN;
379 1 dpavlin }
380
381 # Get Variable Fields
382
383 16 dpavlin read($self->{'fileMST'},$buff,$rec_len);
384 8 dpavlin
385 16 dpavlin print STDERR "## rec_len: $rec_len poc: ",tell($self->{'fileMST'})."\n" if ($self->{debug});
386
387 1 dpavlin for (my $i = 0 ; $i < $NVF ; $i++) {
388 10 dpavlin # skip zero-sized fields
389 next if ($FieldLEN[$i] == 0);
390
391 70 dpavlin my $v = substr($buff,$FieldPOS[$i],$FieldLEN[$i]);
392
393 if ( $self->{ignore_empty_subfields} ) {
394 $v =~ s/(\^\w)+(\^\w)/$2/g;
395 $v =~ s/\^\w$//; # last on line?
396 next if ($v eq '');
397 }
398
399 push @{$self->{record}->{$FieldTAG[$i]}}, $v;
400 1 dpavlin }
401
402 16 dpavlin $self->{'current_mfn'} = $mfn;
403
404 25 dpavlin print STDERR Dumper($self),"\n" if ($self->{debug});
405 1 dpavlin
406 2 dpavlin return $self->{'record'};
407 1 dpavlin }
408
409 54 dpavlin =head2 mfn
410
411 Returns current MFN position
412
413 my $mfn = $isis->mfn;
414
415 =cut
416
417 # This function should be simple return $self->{current_mfn},
418 # but if new is called with _hack_mfn it becomes setter.
419 # It's useful in tests when setting $isis->{record} directly
420
421 sub mfn {
422 my $self = shift;
423 return $self->{current_mfn};
424 };
425
426
427 2 dpavlin =head2 to_ascii
428
429 27 dpavlin Returns ASCII output of record with specified MFN
430 2 dpavlin
431 15 dpavlin print $isis->to_ascii(42);
432 2 dpavlin
433 27 dpavlin This outputs something like this:
434 15 dpavlin
435 210 ^aNew York^cNew York University press^dcop. 1988
436 990 2140
437 990 88
438 990 HAY
439
440 If C<read_fdt> is specified when calling C<new> it will display field names
441 from C<.FDT> file instead of numeric tags.
442
443 2 dpavlin =cut
444
445 sub to_ascii {
446 my $self = shift;
447
448 my $mfn = shift || croak "need MFN";
449
450 41 dpavlin my $rec = $self->fetch($mfn) || return;
451 2 dpavlin
452 my $out = "0\t$mfn";
453
454 foreach my $f (sort keys %{$rec}) {
455 15 dpavlin my $fn = $self->tag_name($f);
456 $out .= "\n$fn\t".join("\n$fn\t",@{$self->{record}->{$f}});
457 2 dpavlin }
458
459 $out .= "\n";
460
461 return $out;
462 }
463
464 12 dpavlin =head2 to_hash
465
466 15 dpavlin Read record with specified MFN and convert it to hash
467 12 dpavlin
468 my $hash = $isis->to_hash($mfn);
469
470 27 dpavlin It has ability to convert characters (using C<hash_filter>) from ISIS
471 15 dpavlin database before creating structures enabling character re-mapping or quick
472 fix-up of data.
473 12 dpavlin
474 This function returns hash which is like this:
475
476 $hash = {
477 '210' => [
478 {
479 'c' => 'New York University press',
480 'a' => 'New York',
481 'd' => 'cop. 1988'
482 }
483 ],
484 '990' => [
485 '2140',
486 '88',
487 'HAY'
488 ],
489 };
490
491 15 dpavlin You can later use that hash to produce any output from ISIS data.
492 12 dpavlin
493 15 dpavlin If database is created using IsisMarc, it will also have to special fields
494 which will be used for identifiers, C<i1> and C<i2> like this:
495
496 '200' => [
497 {
498 'i1' => '1',
499 'i2' => ' '
500 'a' => 'Goa',
501 'f' => 'Valdo D\'Arienzo',
502 'e' => 'tipografie e tipografi nel XVI secolo',
503 }
504 ],
505
506 50 dpavlin In case there are repeatable subfields in record, this will create
507 following structure:
508
509 '900' => [ {
510 'a' => [ 'foo', 'bar', 'baz' ],
511 }]
512
513 57 dpavlin Or in more complex example of
514
515 902 ^aa1^aa2^aa3^bb1^aa4^bb2^cc1^aa5
516
517 it will create
518
519 902 => [
520 { a => ["a1", "a2", "a3", "a4", "a5"], b => ["b1", "b2"], c => "c1" },
521 ],
522
523 This behaviour can be changed using C<join_subfields_with> option to L</new>,
524 in which case C<to_hash> will always create single value for each subfield.
525 This will change result to:
526
527
528
529 15 dpavlin This method will also create additional field C<000> with MFN.
530
531 56 dpavlin There is also more elaborative way to call C<to_hash> like this:
532
533 my $hash = $isis->to_hash({
534 mfn => 42,
535 57 dpavlin include_subfields => 1,
536 56 dpavlin });
537
538 57 dpavlin Each option controll creation of hash:
539
540 =over 4
541
542 =item mfn
543
544 Specify MFN number of record
545
546 =item include_subfields
547
548 This option will create additional key in hash called C<subfields> which will
549 have original record subfield order and index to that subfield like this:
550
551