Line # Revision Author
1 11 dpavlin package MARC::Fast;
2 1 dpavlin
3 use strict;
4 use Carp;
5 26 dpavlin use Data::Dump qw/dump/;
6 1 dpavlin
7 BEGIN {
8 use Exporter ();
9 use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10 23 dpavlin $VERSION = 0.09;
11 1 dpavlin @ISA = qw (Exporter);
12 #Give a hoot don't pollute, do not export more than needed by default
13 @EXPORT = qw ();
14 @EXPORT_OK = qw ();
15 %EXPORT_TAGS = ();
16 }
17
18 =head1 NAME
19
20 MARC::Fast - Very fast implementation of MARC database reader
21
22 =head1 SYNOPSIS
23
24 use MARC::Fast;
25
26 18 dpavlin my $marc = new MARC::Fast(
27 marcdb => 'unimarc.iso',
28 );
29 1 dpavlin
30 18 dpavlin foreach my $mfn ( 1 .. $marc->count ) {
31 print $marc->to_ascii( $mfn );
32 }
33
34 For longer example with command line options look at L<scripts/dump_fastmarc.pl>
35
36 1 dpavlin =head1 DESCRIPTION
37
38 This is very fast alternative to C<MARC> and C<MARC::Record> modules.
39
40 18 dpavlin It's is also very subtable for random access to MARC records (as opposed to
41 1 dpavlin sequential one).
42
43 =head1 METHODS
44
45 =head2 new
46
47 Read MARC database
48
49 my $marc = new MARC::Fast(
50 marcdb => 'unimarc.iso',
51 quiet => 0,
52 debug => 0,
53 assert => 0,
54 8 dpavlin hash_filter => sub {
55 9 dpavlin my ($t, $record_number) = @_;
56 8 dpavlin $t =~ s/foo/bar/;
57 return $t;
58 },
59 1 dpavlin );
60
61 =cut
62
63 ################################################## subroutine header end ##
64
65
66 sub new {
67 my $class = shift;
68 my $self = {@_};
69 bless ($self, $class);
70
71 croak "need marcdb parametar" unless ($self->{marcdb});
72
73 print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
74
75 open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
76 6 dpavlin binmode($self->{fh});
77 1 dpavlin
78 $self->{count} = 0;
79
80 while (! eof($self->{fh})) {
81 $self->{count}++;
82
83 # save record position
84 push @{$self->{fh_offset}}, tell($self->{fh});
85
86 my $leader;
87 6 dpavlin my $len = read($self->{fh}, $leader, 24);
88 1 dpavlin
89 6 dpavlin if ($len < 24) {
90 carp "short read of leader, aborting\n";
91 last;
92 }
93
94 1 dpavlin # Byte Name
95 # ---- ----
96 # 0-4 Record Length
97 # 5 Status (n=new, c=corrected and d=deleted)
98 # 6 Type of Record (a=printed material)
99 # 7 Bibliographic Level (m=monograph)
100 # 8-9 Blanks
101 # 10 Indictator count (2 for monographs)
102 # 11 Subfield code count (2 - 0x1F+subfield code itself)
103 # 12-16 Base address of data
104 # 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
105 # 3=sublevel 3)
106 # 18 Descriptive Cataloguing Form (blank=record is full ISBD,
107 # n=record is in non-ISBD format, i=record is in
108 # an incomplete ISBD format)
109 # 19 Blank
110 # 20 Length of length field in directory (always 4 in UNIMARC)
111 # 21 Length of Starting Character Position in directory (always
112 # 5 in UNIMARC)
113 # 22 Length of implementation defined portion in directory (always
114 # 0 in UNIMARC)
115 # 23 Blank
116 #
117 # |0 45 89 |12 16|1n 450 |
118 # |xxxxxnam 22(.....) 45 <---
119
120 print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
121
122 # store leader for later
123 18 dpavlin push @{$self->{leader}}, $leader;
124 1 dpavlin
125 # skip to next record
126 6 dpavlin my $o = substr($leader,0,5);
127 26 dpavlin warn "# in record ", $self->{count}," record length isn't number but: ",dump($o),"\n" unless $o =~ m/^\d+$/;
128 6 dpavlin if ($o > 24) {
129 seek($self->{fh},$o-24,1) if ($o);
130 } else {
131 last;
132 }
133 1 dpavlin
134 }
135
136 return $self;
137 }
138
139 =head2 count
140
141 Return number of records in database
142
143 print $marc->count;
144
145 =cut
146
147 sub count {
148 my $self = shift;
149 return $self->{count};
150 }
151
152 =head2 fetch
153
154 Fetch record from database
155
156 my $hash = $marc->fetch(42);
157
158 18 dpavlin First record number is C<1>
159
160 1 dpavlin =cut
161
162 sub fetch {
163 my $self = shift;
164
165 18 dpavlin my $rec_nr = shift;
166 1 dpavlin
167 18 dpavlin if ( ! $rec_nr ) {
168 $self->{last_leader} = undef;
169 return;
170 }
171
172 my $leader = $self->{leader}->[$rec_nr - 1];
173 $self->{last_leader} = $leader;
174 1 dpavlin unless ($leader) {
175 carp "can't find record $rec_nr";
176 return;
177 };
178 my $offset = $self->{fh_offset}->[$rec_nr - 1];
179 unless (defined($offset)) {
180 carp "can't find offset for record $rec_nr";
181 return;
182 };
183
184 my $reclen = substr($leader,0,5);
185 my $base_addr = substr($leader,12,5);
186
187 print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
188
189 my $skip = 0;
190
191 print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
192
193 if ( ! seek($self->{fh}, $offset+24, 0) ) {
194 carp "can't seek to $offset: $!";
195 return;
196 }
197
198 print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
199
200 my $directory;
201 if( ! read($self->{fh},$directory,$base_addr-24) ) {
202 carp "can't read directory: $!";
203 $skip = 1;
204 } else {
205 print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
206 }
207
208 print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
209
210 my $fields;
211 if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
212 carp "can't read fields: $!";
213 $skip = 1;
214 } else {
215 print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
216 }
217
218 my $row;
219
220 while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
221 my ($tag,$len,$addr) = ($1,$2,$3);
222
223 if (($addr+$len) > length($fields)) {
224 print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
225 $skip = 1;
226 next;
227 }
228
229 # take field
230 my $f = substr($fields,$addr,$len);
231 print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
232
233 6 dpavlin push @{ $row->{$tag} }, $f;
234 1 dpavlin
235 my $del = substr($fields,$addr+$len-1,1);
236
237 # check field delimiters...
238 if ($self->{assert} && $del ne chr(30)) {
239 print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
240 $skip = 1;
241 next;
242 }
243
244 if ($self->{assert} && length($f) < 2) {
245 print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
246 next;
247 }
248
249 }
250
251 return $row;
252 }
253
254 6 dpavlin
255 18 dpavlin =head2 last_leader
256
257 Returns leader of last record L<fetch>ed
258
259 print $marc->last_leader;
260
261 Added in version 0.08 of this module, so if you need it use:
262
263 use MARC::Fast 0.08;
264
265 to be sure that it's supported.
266
267 =cut
268
269 sub last_leader {
270 my $self = shift;
271 return $self->{last_leader};
272 }
273
274
275 6 dpavlin =head2 to_hash
276
277 Read record with specified MFN and convert it to hash
278
279 23 dpavlin my $hash = $marc->to_hash( $mfn, include_subfields => 1, );
280 6 dpavlin
281 It has ability to convert characters (using C<hash_filter>) from MARC
282 database before creating structures enabling character re-mapping or quick
283 fix-up of data.
284
285 This function returns hash which is like this:
286
287 '200' => [
288 {
289 'i1' => '1',
290 'i2' => ' '
291 'a' => 'Goa',
292 'f' => 'Valdo D\'Arienzo',
293 'e' => 'tipografie e tipografi nel XVI secolo',
294 }
295 ],
296
297 This method will also create additional field C<000> with MFN.
298
299 =cut
300
301 sub to_hash {
302 my $self = shift;
303
304 my $mfn = shift || confess "need mfn!";
305
306 23 dpavlin my $args = {@_};
307
308 6 dpavlin # init record to include MFN as field 000
309 my $rec = { '000' => [ $mfn ] };
310
311 my $row = $self->fetch($mfn) || return;
312
313 9 dpavlin foreach my $rec_nr (keys %{$row}) {
314 foreach my $l (@{$row->{$rec_nr}}) {
315 6 dpavlin
316 # remove end marker
317 $l =~ s/\x1E$//;
318
319 # filter output
320 9 dpavlin $l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});
321 6 dpavlin
322 my $val;
323
324 # has identifiers?
325 ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
326
327 23 dpavlin my $sf_usage;
328 my @subfields;
329
330 6 dpavlin # has subfields?
331 if ($l =~ m/\x1F/) {
332 foreach my $t (split(/\x1F/,$l)) {
333 next if (! $t);
334 8 dpavlin my $f = substr($t,0,1);
335 23 dpavlin
336 push @subfields, ( $f, $sf_usage->{$f}++ || 0 );
337
338 # repeatable subfiled -- convert it to array
339 8 dpavlin if ($val->{$f}) {
340 24 dpavlin if ( ref($val->{$f}) ne 'ARRAY' ) {
341 23 dpavlin $val->{$f} = [ $val->{$f}, $val ];
342 } else {
343 push @{$val->{$f}}, $val;
344 }
345 8 dpavlin }
346 6 dpavlin $val->{substr($t,0,1)} = substr($t,1);
347 }
348 23 dpavlin $val->{subfields} = [ @subfields ] if $args->{include_subfields};
349 6 dpavlin } else {
350 $val = $l;
351 }
352
353 9 dpavlin push @{$rec->{$rec_nr}}, $val;
354 6 dpavlin }
355 }
356
357 return $rec;
358 }
359
360 11 dpavlin =head2 to_ascii
361 6 dpavlin
362 11 dpavlin print $marc->to_ascii( 42 );
363 1 dpavlin
364 11 dpavlin =cut
365 1 dpavlin
366 11 dpavlin sub to_ascii {
367 my $self = shift;
368 1 dpavlin
369 11 dpavlin my $mfn = shift || confess "need mfn";
370 my $row = $self->fetch($mfn) || return;
371 1 dpavlin
372 11 dpavlin my $out;
373 1 dpavlin
374 11 dpavlin foreach my $f (sort keys %{$row}) {
375 my $dump = join('', @{ $row->{$f} });
376 $dump =~ s/\x1e$//;
377 $dump =~ s/\x1f/\$/g;
378 $out .= "$f\t$dump\n";
379 }
380 1 dpavlin
381 11 dpavlin return $out;
382 }
383 1 dpavlin
384 11 dpavlin 1;
385 __END__
386
387 1 dpavlin =head1 AUTHOR
388
389 Dobrica Pavlinusic
390 CPAN ID: DPAVLIN
391 dpavlin@rot13.org
392 http://www.rot13.org/~dpavlin/
393
394 =head1 COPYRIGHT
395
396 This program is free software; you can redistribute
397 it and/or modify it under the same terms as Perl itself.
398
399 The full text of the license can be found in the
400 LICENSE file included with this module.
401
402
403 =head1 SEE ALSO
404
405 11 dpavlin L<Biblio::Isis>, perl(1).
406 1 dpavlin
407 =cut