MARC-Fast/trunk/Fast.pm

package MARC::Fast;

use strict;
use Carp;
use Data::Dumper;

BEGIN {
        # Load Exporter without importing any symbols from it.
        require Exporter;

        # Package globals: the module version plus the variables Exporter reads.
        use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

        $VERSION = 0.07;
        @ISA     = ('Exporter');

        # Nothing is exported, neither by default nor on request, so the
        # caller's namespace is never polluted.
        @EXPORT      = ();
        @EXPORT_OK   = ();
        %EXPORT_TAGS = ();
}

=head1 NAME

MARC::Fast - Very fast implementation of MARC database reader

=head1 SYNOPSIS

  use MARC::Fast;


=head1 DESCRIPTION

This is a very fast alternative to the C<MARC> and C<MARC::Record> modules.

It is also very suitable for random access to MARC records (as opposed to
sequential access).

=head1 METHODS

=head2 new

Read MARC database

  my $marc = new MARC::Fast(
        marcdb => 'unimarc.iso',
        quiet => 0,
        debug => 0,
        assert => 0,
        hash_filter => sub {
                my ($t, $tag) = @_;     # field content and its field tag
                $t =~ s/foo/bar/;
                return $t;
        },
  );

=cut

################################################## subroutine header end ##


# Constructor: open the MARC file and build an in-memory index of its records.
#
# Arguments are a key => value list that becomes the object hash:
#   marcdb - path of the file to read (required, croaks when missing)
#   debug  - when true, progress is traced to STDERR
#   any other key (quiet, assert, hash_filter, ...) is stored untouched for
#   use by the other methods.
#
# The file is scanned once, reading only the 24 byte leader of every record.
# For each complete leader the following is stored in the object:
#   fh_offset - file offset at which the record starts
#   leaders   - the leader itself
#   count     - number of indexed records
# Scanning stops at end of file, at a truncated leader (which is NOT
# indexed), or after a leader whose record length is not greater than 24.
#
# Returns the new object; croaks if the file can't be opened.
sub new {
        my $class = shift;
        my $self = {@_};
        bless ($self, $class);

        croak "need marcdb parametar" unless ($self->{marcdb});

        print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});

        # three-argument open, so that the file name is never interpreted
        # as an open mode or as a command to pipe from
        open($self->{fh}, '<', $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
        binmode($self->{fh});

        $self->{count} = 0;

        while (! eof($self->{fh})) {

                # remember record position, it is stored only when the
                # whole leader could be read
                my $offset = tell($self->{fh});

                my $leader;
                my $len = read($self->{fh}, $leader, 24);

                if (! defined($len) || $len < 24) {
                        carp "short read of leader, aborting\n";
                        last;
                }

                # leader is complete: now the record really exists
                $self->{count}++;
                push @{$self->{fh_offset}}, $offset;

                # Byte        Name
                # ----        ----
                # 0-4         Record Length
                # 5           Status (n=new, c=corrected and d=deleted)
                # 6           Type of Record (a=printed material)
                # 7           Bibliographic Level (m=monograph)
                # 8-9         Blanks
                # 10          Indicator count (2 for monographs)
                # 11          Subfield code count (2 - 0x1F+subfield code itself)
                # 12-16       Base address of data
                # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
                #               3=sublevel 3)
                # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
                #               n=record is in non-ISBD format, i=record is in
                #               an incomplete ISBD format)
                # 19          Blank
                # 20          Length of length field in directory (always 4 in UNIMARC)
                # 21          Length of Starting Character Position in directory (always
                #               5 in UNIMARC)
                # 22          Length of implementation defined portion in directory (always
                #               0 in UNIMARC)
                # 23          Blank
                #
                #           |0   45  89  |12 16|1n 450 |
                #           |xxxxxnam  22(.....)   45 <---

                print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});

                # store leader for later
                push @{$self->{leaders}}, $leader;

                # skip to next record: the record length includes the 24
                # bytes of leader which are already consumed
                my $o = substr($leader,0,5);
                if ($o > 24) {
                        seek($self->{fh},$o-24,1);
                } else {
                        # length can't be right, there is no way to find
                        # the next record
                        last;
                }

        }

        return $self;
}

=head2 count

Return number of records in database

  print $marc->count;

=cut

# Accessor for the number of records stored in the object's C<count> slot.
sub count {
        my ($self) = @_;

        return $self->{count};
}

=head2 fetch

Fetch record from database

  my $hash = $marc->fetch(42);

=cut

# Read one record from the file and split it into raw fields.
#
# Argument:
#   $rec_nr - record number, first record is 1
#
# Returns a hash reference in which each key is a field tag and each value
# is an array reference of raw field strings (indicators, subfield
# delimiters and the trailing field terminator are all still present), one
# element for every occurrence of the tag.
#
# Returns nothing when $rec_nr is false, when the record isn't in the index,
# when the leader holds impossible lengths or when seeking fails. When the
# directory or the field data can't be read, or the first directory entry is
# bad, no field is collected and the result is undef. A bad directory entry
# in the middle of a record stops processing and the fields collected up to
# that point are returned.
#
# With C<assert> set, a field which doesn't end in the field terminator
# (character 30) stops processing of the record, and a field shorter than
# two characters is left out; in both cases the offending field is not
# stored in the result.
sub fetch {
        my $self = shift;

        my $rec_nr = shift || return;

        my $leader = $self->{leaders}->[$rec_nr - 1];
        unless ($leader) {
                carp "can't find record $rec_nr";
                return;
        };
        my $offset = $self->{fh_offset}->[$rec_nr - 1];
        unless (defined($offset)) {
                carp "can't find offset for record $rec_nr";
                return;
        };

        my $reclen = substr($leader,0,5);
        my $base_addr = substr($leader,12,5);

        # directory sits between the leader and the base address, field
        # data between the base address and the end of the record
        my $dir_len = $base_addr - 24;
        my $fields_len = $reclen - $base_addr;

        print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$dir_len,"]\n" if ($self->{debug});

        # read() dies on negative length, so refuse broken leaders here
        if ($dir_len < 0 || $fields_len < 0) {
                carp "invalid lengths in leader of record $rec_nr";
                return;
        }

        my $skip = 0;

        print STDERR "# seeking to $offset + 24\n" if ($self->{debug});

        if ( ! seek($self->{fh}, $offset+24, 0) ) {
                carp "can't seek to $offset: $!";
                return;
        }

        print STDERR "# reading ",$dir_len," bytes of dictionary\n" if ($self->{debug});

        my $directory;
        if( ! read($self->{fh},$directory,$dir_len) ) {
                carp "can't read directory: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
        }

        print STDERR "# reading ",$fields_len," bytes of fields\n" if ($self->{debug});

        my $fields;
        if( ! read($self->{fh},$fields,$fields_len) ) {
                carp "can't read fields: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
        }

        my $row;

        # each directory entry is tag (3 digits), field length (4 digits)
        # and field position relative to the base address (5 digits);
        # entries are consumed from the directory one by one
        while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
                my ($tag,$len,$addr) = ($1,$2,$3);

                if (($addr+$len) > length($fields)) {
                        print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                # take field
                my $f = substr($fields,$addr,$len);
                print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});

                my $del = substr($fields,$addr+$len-1,1);

                # check field delimiters...
                if ($self->{assert} && $del ne chr(30)) {
                        print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                if ($self->{assert} && length($f) < 2) {
                        print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
                        next;
                }

                # store field only after it passed all checks, so that a
                # field reported as skipped is really left out
                push @{ $row->{$tag} }, $f;

        }

        return $row;
}


=head2 to_hash

Read record with specified MFN and convert it to hash

  my $hash = $marc->to_hash($mfn);

It has ability to convert characters (using C<hash_filter>) from MARC
database before creating structures enabling character re-mapping or quick
fix-up of data.

This function returns hash which is like this:

  '200' => [
             {
               'i1' => '1',
               'i2' => ' ',
               'a' => 'Goa',
               'f' => 'Valdo D\'Arienzo',
               'e' => 'tipografie e tipografi nel XVI secolo',
             }
           ],

This method will also create additional field C<000> with MFN.

=cut

# Fetch record $mfn and convert its raw fields into a nested structure.
#
# Argument:
#   $mfn - record number, confesses when it is missing (or false)
#
# Returns a hash reference keyed by field tag, every value being an array
# reference with one element for each field occurrence:
#   - a plain string for a field without subfields
#   - a hash with one key per subfield code (and i1/i2 when the field starts
#     with two indicator characters) for a field with subfields
# Key '000' holds the MFN itself. Returns nothing when fetch gives no record.
#
# When a subfield code repeats inside one field, the hash built so far is
# stored and a new one (with the same indicators) is started for the
# repeated subfield.
#
# If C<hash_filter> is set in the object it is called for every field as
# $filter->($field_content, $tag) and its return value replaces the content.
sub to_hash {
        my $self = shift;

        my $mfn = shift || confess "need mfn!";

        # init record to include MFN as field 000
        my $rec = { '000' => [ $mfn ] };

        my $row = $self->fetch($mfn) || return;

        foreach my $tag (keys %{$row}) {
                foreach my $l (@{$row->{$tag}}) {

                        # remove end marker
                        $l =~ s/\x1E$//;

                        # filter output
                        $l = $self->{'hash_filter'}->($l, $tag) if ($self->{'hash_filter'});

                        my $val;

                        # has identifiers?
                        # NOTE(review): only 0, 1, blank and # are accepted as
                        # indicators, other values fall through to the
                        # subfield split below -- confirm this is intended
                        ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);

                        # has subfields?
                        if ($l =~ m/\x1F/) {
                                foreach my $t (split(/\x1F/,$l)) {
                                        # skip empty fragments only; "0" is
                                        # subfield 0 with empty value
                                        next if (! length($t));
                                        my $f = substr($t,0,1);
                                        # repeatable subfields. When we hit first one,
                                        # store CURRENT (up to that) in first repetition
                                        # of this record. Then, new record with same
                                        # identifiers will be created.
                                        # exists is used because value of first
                                        # occurrence can be "0" or empty
                                        if ($val && exists($val->{$f})) {
                                                push @{$rec->{$tag}}, $val;
                                                $val = {
                                                        i1 => $val->{i1},
                                                        i2 => $val->{i2},
                                                };
                                        }
                                        $val->{$f} = substr($t,1);
                                }
                        } else {
                                $val = $l;
                        }

                        push @{$rec->{$tag}}, $val;
                }
        }

        return $rec;
}

=head2 to_ascii

  print $marc->to_ascii( 42 );

=cut

# Produce a plain text dump of record $mfn: one line per tag (sorted), made
# of the tag, a tab and the content of all occurrences of that tag joined
# together. The trailing field terminator is removed and every subfield
# delimiter is shown as a dollar sign.
#
# Confesses when $mfn is missing (or false), returns nothing when fetch
# gives no record.
sub to_ascii {
        my ($self, $mfn) = @_;

        confess "need mfn" unless $mfn;

        my $row = $self->fetch($mfn);
        return unless $row;

        my $out;

        for my $tag (sort keys %$row) {
                my $line = join('', @{ $row->{$tag} });

                # drop the final terminator, make subfield delimiters visible
                $line =~ s/\x1e$//;
                $line =~ s/\x1f/\$/g;

                $out .= join("\t", $tag, $line) . "\n";
        }

        return $out;
}

1;
__END__

=head1 AUTHOR

        Dobrica Pavlinusic
        CPAN ID: DPAVLIN
        dpavlin@rot13.org
        http://www.rot13.org/~dpavlin/

=head1 COPYRIGHT

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.


=head1 SEE ALSO

L<Biblio::Isis>, perl(1).

=cut
1	package MARC::Fast;
2
3	use strict;
4	use Carp;
5	use Data::Dumper;
6
7	BEGIN {
8	use Exporter ();
9	use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10	$VERSION = 0.07;
11	@ISA = qw (Exporter);
12	#Give a hoot don't pollute, do not export more than needed by default
13	@EXPORT = qw ();
14	@EXPORT_OK = qw ();
15	%EXPORT_TAGS = ();
16	}
17
18	=head1 NAME
19
20	MARC::Fast - Very fast implementation of MARC database reader
21
22	=head1 SYNOPSIS
23
24	use MARC::Fast;
25
26
27	=head1 DESCRIPTION
28
29	This is a very fast alternative to the C<MARC> and C<MARC::Record> modules.
30
31	It is also very suitable for random access to MARC records (as opposed to
32	sequential access).
33
34	=head1 METHODS
35
36	=head2 new
37
38	Read MARC database
39
40	my $marc = new MARC::Fast(
41	marcdb => 'unimarc.iso',
42	quiet => 0,
43	debug => 0,
44	assert => 0,
45	hash_filter => sub {
46	my ($t, $record_number) = @_;
47	$t =~ s/foo/bar/;
48	return $t;
49	},
50	);
51
52	=cut
53
54	################################################## subroutine header end ##
55
56
57	sub new {
58	my $class = shift;
59	my $self = {@_};
60	bless ($self, $class);
61
62	croak "need marcdb parametar" unless ($self->{marcdb});
63
64	print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
65
66	open($self->{fh}, $self->{marcdb}) \|\| croak "can't open ",$self->{marcdb},": $!";
67	binmode($self->{fh});
68
69	$self->{count} = 0;
70
71	while (! eof($self->{fh})) {
72	$self->{count}++;
73
74	# save record position
75	push @{$self->{fh_offset}}, tell($self->{fh});
76
77	my $leader;
78	my $len = read($self->{fh}, $leader, 24);
79
80	if ($len < 24) {
81	carp "short read of leader, aborting\n";
82	last;
83	}
84
85	# Byte Name
86	# ---- ----
87	# 0-4 Record Length
88	# 5 Status (n=new, c=corrected and d=deleted)
89	# 6 Type of Record (a=printed material)
90	# 7 Bibliographic Level (m=monograph)
91	# 8-9 Blanks
92	# 10 Indictator count (2 for monographs)
93	# 11 Subfield code count (2 - 0x1F+subfield code itself)
94	# 12-16 Base address of data
95	# 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
96	# 3=sublevel 3)
97	# 18 Descriptive Cataloguing Form (blank=record is full ISBD,
98	# n=record is in non-ISBD format, i=record is in
99	# an incomplete ISBD format)
100	# 19 Blank
101	# 20 Length of length field in directory (always 4 in UNIMARC)
102	# 21 Length of Starting Character Position in directory (always
103	# 5 in UNIMARC)
104	# 22 Length of implementation defined portion in directory (always
105	# 0 in UNIMARC)
106	# 23 Blank
107	#
108	# \|0 45 89 \|12 16\|1n 450 \|
109	# \|xxxxxnam 22(.....) 45 <---
110
111	print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
112
113	# store leader for later
114	push @{$self->{leaders}}, $leader;
115
116	# skip to next record
117	my $o = substr($leader,0,5);
118	if ($o > 24) {
119	seek($self->{fh},$o-24,1) if ($o);
120	} else {
121	last;
122	}
123
124	}
125
126	return $self;
127	}
128
129	=head2 count
130
131	Return number of records in database
132
133	print $marc->count;
134
135	=cut
136
137	sub count {
138	my $self = shift;
139	return $self->{count};
140	}
141
142	=head2 fetch
143
144	Fetch record from database
145
146	my $hash = $marc->fetch(42);
147
148	=cut
149
150	sub fetch {
151	my $self = shift;
152
153	my $rec_nr = shift \|\| return;
154
155	my $leader = $self->{leaders}->[$rec_nr - 1];
156	unless ($leader) {
157	carp "can't find record $rec_nr";
158	return;
159	};
160	my $offset = $self->{fh_offset}->[$rec_nr - 1];
161	unless (defined($offset)) {
162	carp "can't find offset for record $rec_nr";
163	return;
164	};
165
166	my $reclen = substr($leader,0,5);
167	my $base_addr = substr($leader,12,5);
168
169	print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
170
171	my $skip = 0;
172
173	print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
174
175	if ( ! seek($self->{fh}, $offset+24, 0) ) {
176	carp "can't seek to $offset: $!";
177	return;
178	}
179
180	print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
181
182	my $directory;
183	if( ! read($self->{fh},$directory,$base_addr-24) ) {
184	carp "can't read directory: $!";
185	$skip = 1;
186	} else {
187	print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
188	}
189
190	print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
191
192	my $fields;
193	if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
194	carp "can't read fields: $!";
195	$skip = 1;
196	} else {
197	print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
198	}
199
200	my $row;
201
202	while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
203	my ($tag,$len,$addr) = ($1,$2,$3);
204
205	if (($addr+$len) > length($fields)) {
206	print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
207	$skip = 1;
208	next;
209	}
210
211	# take field
212	my $f = substr($fields,$addr,$len);
213	print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
214
215	push @{ $row->{$tag} }, $f;
216
217	my $del = substr($fields,$addr+$len-1,1);
218
219	# check field delimiters...
220	if ($self->{assert} && $del ne chr(30)) {
221	print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
222	$skip = 1;
223	next;
224	}
225
226	if ($self->{assert} && length($f) < 2) {
227	print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
228	next;
229	}
230
231	}
232
233	return $row;
234	}
235
236
237	=head2 to_hash
238
239	Read record with specified MFN and convert it to hash
240
241	my $hash = $marc->to_hash($mfn);
242
243	It has ability to convert characters (using C<hash_filter>) from MARC
244	database before creating structures enabling character re-mapping or quick
245	fix-up of data.
246
247	This function returns hash which is like this:
248
249	'200' => [
250	{
251	'i1' => '1',
252	'i2' => ' '
253	'a' => 'Goa',
254	'f' => 'Valdo D\'Arienzo',
255	'e' => 'tipografie e tipografi nel XVI secolo',
256	}
257	],
258
259	This method will also create additional field C<000> with MFN.
260
261	=cut
262
263	sub to_hash {
264	my $self = shift;
265
266	my $mfn = shift \|\| confess "need mfn!";
267
268	# init record to include MFN as field 000
269	my $rec = { '000' => [ $mfn ] };
270
271	my $row = $self->fetch($mfn) \|\| return;
272
273	foreach my $rec_nr (keys %{$row}) {
274	foreach my $l (@{$row->{$rec_nr}}) {
275
276	# remove end marker
277	$l =~ s/\x1E$//;
278
279	# filter output
280	$l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});
281
282	my $val;
283
284	# has identifiers?
285	($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
286
287	# has subfields?
288	if ($l =~ m/\x1F/) {
289	foreach my $t (split(/\x1F/,$l)) {
290	next if (! $t);
291	my $f = substr($t,0,1);
292	# repeatable subfileds. When we hit first one,
293	# store CURRENT (up to that) in first repetition
294	# of this record. Then, new record with same
295	# identifiers will be created.
296	if ($val->{$f}) {
297	push @{$rec->{$rec_nr}}, $val;
298	$val = {
299	i1 => $val->{i1},
300	i2 => $val->{i2},
301	};
302	}
303	$val->{substr($t,0,1)} = substr($t,1);
304	}
305	} else {
306	$val = $l;
307	}
308
309	push @{$rec->{$rec_nr}}, $val;
310	}
311	}
312
313	return $rec;
314	}
315
316	=head2 to_ascii
317
318	print $marc->to_ascii( 42 );
319
320	=cut
321
322	sub to_ascii {
323	my $self = shift;
324
325	my $mfn = shift \|\| confess "need mfn";
326	my $row = $self->fetch($mfn) \|\| return;
327
328	my $out;
329
330	foreach my $f (sort keys %{$row}) {
331	my $dump = join('', @{ $row->{$f} });
332	$dump =~ s/\x1e$//;
333	$dump =~ s/\x1f/\$/g;
334	$out .= "$f\t$dump\n";
335	}
336
337	return $out;
338	}
339
340	1;
341	__END__
342
343	=head1 AUTHOR
344
345	Dobrica Pavlinusic
346	CPAN ID: DPAVLIN
347	dpavlin@rot13.org
348	http://www.rot13.org/~dpavlin/
349
350	=head1 COPYRIGHT
351
352	This program is free software; you can redistribute
353	it and/or modify it under the same terms as Perl itself.
354
355	The full text of the license can be found in the
356	LICENSE file included with this module.
357
358
359	=head1 SEE ALSO
360
361	L<Biblio::Isis>, perl(1).
362
363	=cut