MARC-Fast/trunk/Fast.pm

package MARC::Fast;

use strict;
use Carp;
use Data::Dumper;

BEGIN {
        use Exporter ();
        use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
        $VERSION     = 0.09;
        @ISA         = qw (Exporter);
        #Give a hoot don't pollute, do not export more than needed by default
        @EXPORT      = qw ();
        @EXPORT_OK   = qw ();
        %EXPORT_TAGS = ();
}

=head1 NAME

MARC::Fast - Very fast implementation of MARC database reader

=head1 SYNOPSIS

  use MARC::Fast;

  my $marc = new MARC::Fast(
        marcdb => 'unimarc.iso',
  );

  foreach my $mfn ( 1 .. $marc->count ) {
        print $marc->to_ascii( $mfn );
  }

For longer example with command line options look at L<scripts/dump_fastmarc.pl>

=head1 DESCRIPTION

This is very fast alternative to C<MARC> and C<MARC::Record> modules.

It's is also very subtable for random access to MARC records (as opposed to
sequential one).

=head1 METHODS

=head2 new

Read MARC database

  my $marc = new MARC::Fast(
        marcdb => 'unimarc.iso',
        quiet => 0,
        debug => 0,
        assert => 0,
        hash_filter => sub {
                my ($t, $record_number) = @_;
                $t =~ s/foo/bar/;
                return $t;
        },
  );

=cut

################################################## subroutine header end ##


sub new {
        my $class = shift;
        my $self = {@_};
        bless ($self, $class);

        croak "need marcdb parametar" unless ($self->{marcdb});

        print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});

        open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
        binmode($self->{fh});

        $self->{count} = 0;

        while (! eof($self->{fh})) {
                $self->{count}++;

                # save record position
                push @{$self->{fh_offset}}, tell($self->{fh});

                my $leader;
                my $len = read($self->{fh}, $leader, 24);

                if ($len < 24) {
                        carp "short read of leader, aborting\n";
                        last;
                }

                # Byte        Name
                # ----        ----
                # 0-4         Record Length
                # 5           Status (n=new, c=corrected and d=deleted)
                # 6           Type of Record (a=printed material)
                # 7           Bibliographic Level (m=monograph)
                # 8-9         Blanks
                # 10          Indictator count (2 for monographs)
                # 11          Subfield code count (2 - 0x1F+subfield code itself)
                # 12-16       Base address of data
                # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
                #               3=sublevel 3)
                # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
                #               n=record is in non-ISBD format, i=record is in
                #               an incomplete ISBD format)
                # 19          Blank
                # 20          Length of length field in directory (always 4 in UNIMARC)
                # 21          Length of Starting Character Position in directory (always
                #               5 in UNIMARC)
                # 22          Length of implementation defined portion in directory (always
                #               0 in UNIMARC)
                # 23          Blank
                #
                #           |0   45  89  |12 16|1n 450 |
                #           |xxxxxnam  22(.....)   45 <---

                print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});

                # store leader for later
                push @{$self->{leader}}, $leader;

                # skip to next record
                my $o = substr($leader,0,5);
                if ($o > 24) {
                        seek($self->{fh},$o-24,1) if ($o);
                } else {
                        last;
                }

        }

        return $self;
}

=head2 count

Return number of records in database

  print $marc->count;

=cut

sub count {
        my $self = shift;
        return $self->{count};
}

=head2 fetch

Fetch record from database

  my $hash = $marc->fetch(42);

First record number is C<1>

=cut

sub fetch {
        my $self = shift;

        my $rec_nr = shift;

        if ( ! $rec_nr ) {
                $self->{last_leader} = undef;
                return;
        }

        my $leader = $self->{leader}->[$rec_nr - 1];
        $self->{last_leader} = $leader;
        unless ($leader) {
                carp "can't find record $rec_nr";
                return;
        };
        my $offset = $self->{fh_offset}->[$rec_nr - 1];
        unless (defined($offset)) {
                carp "can't find offset for record $rec_nr";
                return;
        };

        my $reclen = substr($leader,0,5);
        my $base_addr = substr($leader,12,5);

        print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});

        my $skip = 0;

        print STDERR "# seeking to $offset + 24\n" if ($self->{debug});

        if ( ! seek($self->{fh}, $offset+24, 0) ) {
                carp "can't seek to $offset: $!";
                return;
        }

        print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});

        my $directory;
        if( ! read($self->{fh},$directory,$base_addr-24) ) {
                carp "can't read directory: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
        }

        print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});

        my $fields;
        if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
                carp "can't read fields: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
        }

        my $row;

        while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
                my ($tag,$len,$addr) = ($1,$2,$3);

                if (($addr+$len) > length($fields)) {
                        print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                # take field
                my $f = substr($fields,$addr,$len);
                print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});

                push @{ $row->{$tag} }, $f;

                my $del = substr($fields,$addr+$len-1,1);

                # check field delimiters...
                if ($self->{assert} && $del ne chr(30)) {
                        print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                if ($self->{assert} && length($f) < 2) {
                        print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
                        next;
                }

        }

        return $row;
}


=head2 last_leader

Returns leader of last record L<fetch>ed

  print $marc->last_leader;

Added in version 0.08 of this module, so if you need it use:

  use MARC::Fast 0.08;

to be sure that it's supported.

=cut

sub last_leader {
        my $self = shift;
        return $self->{last_leader};
}


=head2 to_hash

Read record with specified MFN and convert it to hash

  my $hash = $marc->to_hash( $mfn, include_subfields => 1, );

It has ability to convert characters (using C<hash_filter>) from MARC
database before creating structures enabling character re-mapping or quick
fix-up of data.

This function returns hash which is like this:

  '200' => [
             {
               'i1' => '1',
               'i2' => ' '
               'a' => 'Goa',
               'f' => 'Valdo D\'Arienzo',
               'e' => 'tipografie e tipografi nel XVI secolo',
             }
           ],

This method will also create additional field C<000> with MFN.

=cut

sub to_hash {
        my $self = shift;

        my $mfn = shift || confess "need mfn!";

        my $args = {@_};

        # init record to include MFN as field 000
        my $rec = { '000' => [ $mfn ] };

        my $row = $self->fetch($mfn) || return;

        foreach my $rec_nr (keys %{$row}) {
                foreach my $l (@{$row->{$rec_nr}}) {

                        # remove end marker
                        $l =~ s/\x1E$//;

                        # filter output
                        $l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});

                        my $val;

                        # has identifiers?
                        ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);

                        my $sf_usage;
                        my @subfields;

                        # has subfields?
                        if ($l =~ m/\x1F/) {
                                foreach my $t (split(/\x1F/,$l)) {
                                        next if (! $t);
                                        my $f = substr($t,0,1);

                                        push @subfields, ( $f, $sf_usage->{$f}++ || 0 );

                                        # repeatable subfiled -- convert it to array
                                        if ($val->{$f}) {
                                                if ( ref($val->{$f}) ne 'ARRAY' ) {
                                                        $val->{$f} = [ $val->{$f}, $val ];
                                                } else {
                                                        push @{$val->{$f}}, $val;
                                                }
                                        }
                                        $val->{substr($t,0,1)} = substr($t,1);
                                }
                                $val->{subfields} = [ @subfields ] if $args->{include_subfields};
                        } else {
                                $val = $l;
                        }

                        push @{$rec->{$rec_nr}}, $val;
                }
        }

        return $rec;
}

=head2 to_ascii

  print $marc->to_ascii( 42 );

=cut

sub to_ascii {
        my $self = shift;

        my $mfn = shift || confess "need mfn";
        my $row = $self->fetch($mfn) || return;

        my $out;

        foreach my $f (sort keys %{$row}) {
                my $dump = join('', @{ $row->{$f} });
                $dump =~ s/\x1e$//;
                $dump =~ s/\x1f/\$/g;
                $out .= "$f\t$dump\n";
        }

        return $out;
}

1;
__END__

=head1 AUTHOR

        Dobrica Pavlinusic
        CPAN ID: DPAVLIN
        dpavlin@rot13.org
        https://www.rot13.org/~dpavlin/

=head1 COPYRIGHT

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.


=head1 SEE ALSO

L<Biblio::Isis>, perl(1).

=cut
1	package MARC::Fast;
2
3	use strict;
4	use Carp;
5	use Data::Dumper;
6
7	BEGIN {
8	use Exporter ();
9	use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10	$VERSION = 0.09;
11	@ISA = qw (Exporter);
12	#Give a hoot don't pollute, do not export more than needed by default
13	@EXPORT = qw ();
14	@EXPORT_OK = qw ();
15	%EXPORT_TAGS = ();
16	}
17
18	=head1 NAME
19
20	MARC::Fast - Very fast implementation of MARC database reader
21
22	=head1 SYNOPSIS
23
24	use MARC::Fast;
25
26	my $marc = new MARC::Fast(
27	marcdb => 'unimarc.iso',
28	);
29
30	foreach my $mfn ( 1 .. $marc->count ) {
31	print $marc->to_ascii( $mfn );
32	}
33
34	For longer example with command line options look at L<scripts/dump_fastmarc.pl>
35
36	=head1 DESCRIPTION
37
38	This is very fast alternative to C<MARC> and C<MARC::Record> modules.
39
40	It's is also very subtable for random access to MARC records (as opposed to
41	sequential one).
42
43	=head1 METHODS
44
45	=head2 new
46
47	Read MARC database
48
49	my $marc = new MARC::Fast(
50	marcdb => 'unimarc.iso',
51	quiet => 0,
52	debug => 0,
53	assert => 0,
54	hash_filter => sub {
55	my ($t, $record_number) = @_;
56	$t =~ s/foo/bar/;
57	return $t;
58	},
59	);
60
61	=cut
62
63	################################################## subroutine header end ##
64
65
66	sub new {
67	my $class = shift;
68	my $self = {@_};
69	bless ($self, $class);
70
71	croak "need marcdb parametar" unless ($self->{marcdb});
72
73	print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
74
75	open($self->{fh}, $self->{marcdb}) \|\| croak "can't open ",$self->{marcdb},": $!";
76	binmode($self->{fh});
77
78	$self->{count} = 0;
79
80	while (! eof($self->{fh})) {
81	$self->{count}++;
82
83	# save record position
84	push @{$self->{fh_offset}}, tell($self->{fh});
85
86	my $leader;
87	my $len = read($self->{fh}, $leader, 24);
88
89	if ($len < 24) {
90	carp "short read of leader, aborting\n";
91	last;
92	}
93
94	# Byte Name
95	# ---- ----
96	# 0-4 Record Length
97	# 5 Status (n=new, c=corrected and d=deleted)
98	# 6 Type of Record (a=printed material)
99	# 7 Bibliographic Level (m=monograph)
100	# 8-9 Blanks
101	# 10 Indictator count (2 for monographs)
102	# 11 Subfield code count (2 - 0x1F+subfield code itself)
103	# 12-16 Base address of data
104	# 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
105	# 3=sublevel 3)
106	# 18 Descriptive Cataloguing Form (blank=record is full ISBD,
107	# n=record is in non-ISBD format, i=record is in
108	# an incomplete ISBD format)
109	# 19 Blank
110	# 20 Length of length field in directory (always 4 in UNIMARC)
111	# 21 Length of Starting Character Position in directory (always
112	# 5 in UNIMARC)
113	# 22 Length of implementation defined portion in directory (always
114	# 0 in UNIMARC)
115	# 23 Blank
116	#
117	# \|0 45 89 \|12 16\|1n 450 \|
118	# \|xxxxxnam 22(.....) 45 <---
119
120	print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
121
122	# store leader for later
123	push @{$self->{leader}}, $leader;
124
125	# skip to next record
126	my $o = substr($leader,0,5);
127	if ($o > 24) {
128	seek($self->{fh},$o-24,1) if ($o);
129	} else {
130	last;
131	}
132
133	}
134
135	return $self;
136	}
137
138	=head2 count
139
140	Return number of records in database
141
142	print $marc->count;
143
144	=cut
145
146	sub count {
147	my $self = shift;
148	return $self->{count};
149	}
150
151	=head2 fetch
152
153	Fetch record from database
154
155	my $hash = $marc->fetch(42);
156
157	First record number is C<1>
158
159	=cut
160
161	sub fetch {
162	my $self = shift;
163
164	my $rec_nr = shift;
165
166	if ( ! $rec_nr ) {
167	$self->{last_leader} = undef;
168	return;
169	}
170
171	my $leader = $self->{leader}->[$rec_nr - 1];
172	$self->{last_leader} = $leader;
173	unless ($leader) {
174	carp "can't find record $rec_nr";
175	return;
176	};
177	my $offset = $self->{fh_offset}->[$rec_nr - 1];
178	unless (defined($offset)) {
179	carp "can't find offset for record $rec_nr";
180	return;
181	};
182
183	my $reclen = substr($leader,0,5);
184	my $base_addr = substr($leader,12,5);
185
186	print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
187
188	my $skip = 0;
189
190	print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
191
192	if ( ! seek($self->{fh}, $offset+24, 0) ) {
193	carp "can't seek to $offset: $!";
194	return;
195	}
196
197	print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
198
199	my $directory;
200	if( ! read($self->{fh},$directory,$base_addr-24) ) {
201	carp "can't read directory: $!";
202	$skip = 1;
203	} else {
204	print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
205	}
206
207	print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
208
209	my $fields;
210	if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
211	carp "can't read fields: $!";
212	$skip = 1;
213	} else {
214	print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
215	}
216
217	my $row;
218
219	while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
220	my ($tag,$len,$addr) = ($1,$2,$3);
221
222	if (($addr+$len) > length($fields)) {
223	print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
224	$skip = 1;
225	next;
226	}
227
228	# take field
229	my $f = substr($fields,$addr,$len);
230	print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
231
232	push @{ $row->{$tag} }, $f;
233
234	my $del = substr($fields,$addr+$len-1,1);
235
236	# check field delimiters...
237	if ($self->{assert} && $del ne chr(30)) {
238	print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
239	$skip = 1;
240	next;
241	}
242
243	if ($self->{assert} && length($f) < 2) {
244	print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
245	next;
246	}
247
248	}
249
250	return $row;
251	}
252
253
254	=head2 last_leader
255
256	Returns leader of last record L<fetch>ed
257
258	print $marc->last_leader;
259
260	Added in version 0.08 of this module, so if you need it use:
261
262	use MARC::Fast 0.08;
263
264	to be sure that it's supported.
265
266	=cut
267
268	sub last_leader {
269	my $self = shift;
270	return $self->{last_leader};
271	}
272
273
274	=head2 to_hash
275
276	Read record with specified MFN and convert it to hash
277
278	my $hash = $marc->to_hash( $mfn, include_subfields => 1, );
279
280	It has ability to convert characters (using C<hash_filter>) from MARC
281	database before creating structures enabling character re-mapping or quick
282	fix-up of data.
283
284	This function returns hash which is like this:
285
286	'200' => [
287	{
288	'i1' => '1',
289	'i2' => ' '
290	'a' => 'Goa',
291	'f' => 'Valdo D\'Arienzo',
292	'e' => 'tipografie e tipografi nel XVI secolo',
293	}
294	],
295
296	This method will also create additional field C<000> with MFN.
297
298	=cut
299
300	sub to_hash {
301	my $self = shift;
302
303	my $mfn = shift \|\| confess "need mfn!";
304
305	my $args = {@_};
306
307	# init record to include MFN as field 000
308	my $rec = { '000' => [ $mfn ] };
309
310	my $row = $self->fetch($mfn) \|\| return;
311
312	foreach my $rec_nr (keys %{$row}) {
313	foreach my $l (@{$row->{$rec_nr}}) {
314
315	# remove end marker
316	$l =~ s/\x1E$//;
317
318	# filter output
319	$l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});
320
321	my $val;
322
323	# has identifiers?
324	($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
325
326	my $sf_usage;
327	my @subfields;
328
329	# has subfields?
330	if ($l =~ m/\x1F/) {
331	foreach my $t (split(/\x1F/,$l)) {
332	next if (! $t);
333	my $f = substr($t,0,1);
334
335	push @subfields, ( $f, $sf_usage->{$f}++ \|\| 0 );
336
337	# repeatable subfiled -- convert it to array
338	if ($val->{$f}) {
339	if ( ref($val->{$f}) ne 'ARRAY' ) {
340	$val->{$f} = [ $val->{$f}, $val ];
341	} else {
342	push @{$val->{$f}}, $val;
343	}
344	}
345	$val->{substr($t,0,1)} = substr($t,1);
346	}
347	$val->{subfields} = [ @subfields ] if $args->{include_subfields};
348	} else {
349	$val = $l;
350	}
351
352	push @{$rec->{$rec_nr}}, $val;
353	}
354	}
355
356	return $rec;
357	}
358
359	=head2 to_ascii
360
361	print $marc->to_ascii( 42 );
362
363	=cut
364
365	sub to_ascii {
366	my $self = shift;
367
368	my $mfn = shift \|\| confess "need mfn";
369	my $row = $self->fetch($mfn) \|\| return;
370
371	my $out;
372
373	foreach my $f (sort keys %{$row}) {
374	my $dump = join('', @{ $row->{$f} });
375	$dump =~ s/\x1e$//;
376	$dump =~ s/\x1f/\$/g;
377	$out .= "$f\t$dump\n";
378	}
379
380	return $out;
381	}
382
383	1;
384	__END__
385
386	=head1 AUTHOR
387
388	Dobrica Pavlinusic
389	CPAN ID: DPAVLIN
390	dpavlin@rot13.org
391	https://www.rot13.org/~dpavlin/
392
393	=head1 COPYRIGHT
394
395	This program is free software; you can redistribute
396	it and/or modify it under the same terms as Perl itself.
397
398	The full text of the license can be found in the
399	LICENSE file included with this module.
400
401
402	=head1 SEE ALSO
403
404	L<Biblio::Isis>, perl(1).
405
406	=cut