MARC-Fast/trunk/Fast.pm


package MARC::Fast;
use strict;
use Carp;
use Data::Dumper;

BEGIN {
        use Exporter ();
        use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
        $VERSION     = 0.04;
        @ISA         = qw (Exporter);
        #Give a hoot don't pollute, do not export more than needed by default
        @EXPORT      = qw ();
        @EXPORT_OK   = qw ();
        %EXPORT_TAGS = ();
}

=head1 NAME

MARC::Fast - Very fast implementation of MARC database reader

=head1 SYNOPSIS

  use MARC::Fast;


=head1 DESCRIPTION

This is very fast alternative to C<MARC> and C<MARC::Record> modules.

It's is also very sutable for random access to MARC records (as opposed to
sequential one).

=head1 METHODS

=head2 new

Read MARC database

  my $marc = new MARC::Fast(
        marcdb => 'unimarc.iso',
        quiet => 0,
        debug => 0,
        assert => 0,
        hash_filter => sub {
                my ($t, $record_number) = @_;
                $t =~ s/foo/bar/;
                return $t;
        },
  );

=cut

################################################## subroutine header end ##


sub new {
        my $class = shift;
        my $self = {@_};
        bless ($self, $class);

        croak "need marcdb parametar" unless ($self->{marcdb});

        print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});

        open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
        binmode($self->{fh});

        $self->{count} = 0;

        while (! eof($self->{fh})) {
                $self->{count}++;

                # save record position
                push @{$self->{fh_offset}}, tell($self->{fh});

                my $leader;
                my $len = read($self->{fh}, $leader, 24);

                if ($len < 24) {
                        carp "short read of leader, aborting\n";
                        last;
                }

                # Byte        Name
                # ----        ----
                # 0-4         Record Length
                # 5           Status (n=new, c=corrected and d=deleted)
                # 6           Type of Record (a=printed material)
                # 7           Bibliographic Level (m=monograph)
                # 8-9         Blanks
                # 10          Indictator count (2 for monographs)
                # 11          Subfield code count (2 - 0x1F+subfield code itself)
                # 12-16       Base address of data
                # 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
                #               3=sublevel 3)
                # 18          Descriptive Cataloguing Form (blank=record is full ISBD,
                #               n=record is in non-ISBD format, i=record is in
                #               an incomplete ISBD format)
                # 19          Blank
                # 20          Length of length field in directory (always 4 in UNIMARC)
                # 21          Length of Starting Character Position in directory (always
                #               5 in UNIMARC)
                # 22          Length of implementation defined portion in directory (always
                #               0 in UNIMARC)
                # 23          Blank
                #
                #           |0   45  89  |12 16|1n 450 |
                #           |xxxxxnam  22(.....)   45 <---

                print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});

                # store leader for later
                push @{$self->{leaders}}, $leader;

                # skip to next record
                my $o = substr($leader,0,5);
                if ($o > 24) {
                        seek($self->{fh},$o-24,1) if ($o);
                } else {
                        last;
                }

        }

        return $self;
}

=head2 count

Return number of records in database

  print $marc->count;

=cut

sub count {
        my $self = shift;
        return $self->{count};
}

=head2 fetch

Fetch record from database

  my $hash = $marc->fetch(42);

=cut

sub fetch {
        my $self = shift;

        my $rec_nr = shift || return;

        my $leader = $self->{leaders}->[$rec_nr - 1];
        unless ($leader) {
                carp "can't find record $rec_nr";
                return;
        };
        my $offset = $self->{fh_offset}->[$rec_nr - 1];
        unless (defined($offset)) {
                carp "can't find offset for record $rec_nr";
                return;
        };

        my $reclen = substr($leader,0,5);
        my $base_addr = substr($leader,12,5);

        print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});

        my $skip = 0;

        print STDERR "# seeking to $offset + 24\n" if ($self->{debug});

        if ( ! seek($self->{fh}, $offset+24, 0) ) {
                carp "can't seek to $offset: $!";
                return;
        }

        print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});

        my $directory;
        if( ! read($self->{fh},$directory,$base_addr-24) ) {
                carp "can't read directory: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
        }

        print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});

        my $fields;
        if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
                carp "can't read fields: $!";
                $skip = 1;
        } else {
                print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
        }

        my $row;

        while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
                my ($tag,$len,$addr) = ($1,$2,$3);

                if (($addr+$len) > length($fields)) {
                        print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                # take field
                my $f = substr($fields,$addr,$len);
                print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});

                push @{ $row->{$tag} }, $f;

                my $del = substr($fields,$addr+$len-1,1);

                # check field delimiters...
                if ($self->{assert} && $del ne chr(30)) {
                        print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
                        $skip = 1;
                        next;
                }

                if ($self->{assert} && length($f) < 2) {
                        print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
                        next;
                }

        }

        return $row;
}


=head2 to_hash

Read record with specified MFN and convert it to hash

  my $hash = $marc->to_hash($mfn);

It has ability to convert characters (using C<hash_filter>) from MARC
database before creating structures enabling character re-mapping or quick
fix-up of data.

This function returns hash which is like this:

  '200' => [
             {
               'i1' => '1',
               'i2' => ' '
               'a' => 'Goa',
               'f' => 'Valdo D\'Arienzo',
               'e' => 'tipografie e tipografi nel XVI secolo',
             }
           ],

This method will also create additional field C<000> with MFN.

=cut

sub to_hash {
        my $self = shift;

        my $mfn = shift || confess "need mfn!";

        # init record to include MFN as field 000
        my $rec = { '000' => [ $mfn ] };

        my $row = $self->fetch($mfn) || return;

        foreach my $rec_nr (keys %{$row}) {
                foreach my $l (@{$row->{$rec_nr}}) {

                        # remove end marker
                        $l =~ s/\x1E$//;

                        # filter output
                        $l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});

                        my $val;

                        # has identifiers?
                        ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);

                        # has subfields?
                        if ($l =~ m/\x1F/) {
                                foreach my $t (split(/\x1F/,$l)) {
                                        next if (! $t);
                                        my $f = substr($t,0,1);
                                        # repeatable subfileds. When we hit first one,
                                        # store CURRENT (up to that) in first repetition
                                        # of this record. Then, new record with same
                                        # identifiers will be created.
                                        if ($val->{$f}) {
                                                push @{$rec->{$rec_nr}}, $val;
                                                $val = {
                                                        i1 => $val->{i1},
                                                        i2 => $val->{i2},
                                                };
                                        }
                                        $val->{substr($t,0,1)} = substr($t,1);
                                }
                        } else {
                                $val = $l;
                        }

                        push @{$rec->{$rec_nr}}, $val;
                }
        }

        return $rec;
}


1;
__END__

=head1 BUGS


=head1 SUPPORT


=head1 AUTHOR

        Dobrica Pavlinusic
        CPAN ID: DPAVLIN
        dpavlin@rot13.org
        https://www.rot13.org/~dpavlin/

=head1 COPYRIGHT

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.


=head1 SEE ALSO

perl(1).

=cut
1	dpavlin	1
2			package MARC::Fast;
3			use strict;
4			use Carp;
5			use Data::Dumper;
6
7			BEGIN {
8			use Exporter ();
9			use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
10	dpavlin	9	$VERSION = 0.04;
11	dpavlin	1	@ISA = qw (Exporter);
12			#Give a hoot don't pollute, do not export more than needed by default
13			@EXPORT = qw ();
14			@EXPORT_OK = qw ();
15			%EXPORT_TAGS = ();
16			}
17
18			=head1 NAME
19
20			MARC::Fast - Very fast implementation of MARC database reader
21
22			=head1 SYNOPSIS
23
24			use MARC::Fast;
25
26
27			=head1 DESCRIPTION
28
29			This is very fast alternative to C<MARC> and C<MARC::Record> modules.
30
31			It's is also very sutable for random access to MARC records (as opposed to
32			sequential one).
33
34			=head1 METHODS
35
36			=head2 new
37
38			Read MARC database
39
40			my $marc = new MARC::Fast(
41			marcdb => 'unimarc.iso',
42			quiet => 0,
43			debug => 0,
44			assert => 0,
45	dpavlin	8	hash_filter => sub {
46	dpavlin	9	my ($t, $record_number) = @_;
47	dpavlin	8	$t =~ s/foo/bar/;
48			return $t;
49			},
50	dpavlin	1	);
51
52			=cut
53
54			################################################## subroutine header end ##
55
56
57			sub new {
58			my $class = shift;
59			my $self = {@_};
60			bless ($self, $class);
61
62			croak "need marcdb parametar" unless ($self->{marcdb});
63
64			print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
65
66			open($self->{fh}, $self->{marcdb}) \|\| croak "can't open ",$self->{marcdb},": $!";
67	dpavlin	6	binmode($self->{fh});
68	dpavlin	1
69			$self->{count} = 0;
70
71			while (! eof($self->{fh})) {
72			$self->{count}++;
73
74			# save record position
75			push @{$self->{fh_offset}}, tell($self->{fh});
76
77			my $leader;
78	dpavlin	6	my $len = read($self->{fh}, $leader, 24);
79	dpavlin	1
80	dpavlin	6	if ($len < 24) {
81			carp "short read of leader, aborting\n";
82			last;
83			}
84
85	dpavlin	1	# Byte Name
86			# ---- ----
87			# 0-4 Record Length
88			# 5 Status (n=new, c=corrected and d=deleted)
89			# 6 Type of Record (a=printed material)
90			# 7 Bibliographic Level (m=monograph)
91			# 8-9 Blanks
92			# 10 Indictator count (2 for monographs)
93			# 11 Subfield code count (2 - 0x1F+subfield code itself)
94			# 12-16 Base address of data
95			# 17 Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
96			# 3=sublevel 3)
97			# 18 Descriptive Cataloguing Form (blank=record is full ISBD,
98			# n=record is in non-ISBD format, i=record is in
99			# an incomplete ISBD format)
100			# 19 Blank
101			# 20 Length of length field in directory (always 4 in UNIMARC)
102			# 21 Length of Starting Character Position in directory (always
103			# 5 in UNIMARC)
104			# 22 Length of implementation defined portion in directory (always
105			# 0 in UNIMARC)
106			# 23 Blank
107			#
108			# \|0 45 89 \|12 16\|1n 450 \|
109			# \|xxxxxnam 22(.....) 45 <---
110
111			print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
112
113			# store leader for later
114			push @{$self->{leaders}}, $leader;
115
116			# skip to next record
117	dpavlin	6	my $o = substr($leader,0,5);
118			if ($o > 24) {
119			seek($self->{fh},$o-24,1) if ($o);
120			} else {
121			last;
122			}
123	dpavlin	1
124			}
125
126			return $self;
127			}
128
129			=head2 count
130
131			Return number of records in database
132
133			print $marc->count;
134
135			=cut
136
137			sub count {
138			my $self = shift;
139			return $self->{count};
140			}
141
142			=head2 fetch
143
144			Fetch record from database
145
146			my $hash = $marc->fetch(42);
147
148			=cut
149
150			sub fetch {
151			my $self = shift;
152
153			my $rec_nr = shift \|\| return;
154
155			my $leader = $self->{leaders}->[$rec_nr - 1];
156			unless ($leader) {
157			carp "can't find record $rec_nr";
158			return;
159			};
160			my $offset = $self->{fh_offset}->[$rec_nr - 1];
161			unless (defined($offset)) {
162			carp "can't find offset for record $rec_nr";
163			return;
164			};
165
166			my $reclen = substr($leader,0,5);
167			my $base_addr = substr($leader,12,5);
168
169			print STDERR "# $rec_nr leader: '$leader' reclen: $reclen base addr: $base_addr [dir: ",$base_addr - 24,"]\n" if ($self->{debug});
170
171			my $skip = 0;
172
173			print STDERR "# seeking to $offset + 24\n" if ($self->{debug});
174
175			if ( ! seek($self->{fh}, $offset+24, 0) ) {
176			carp "can't seek to $offset: $!";
177			return;
178			}
179
180			print STDERR "# reading ",$base_addr-24," bytes of dictionary\n" if ($self->{debug});
181
182			my $directory;
183			if( ! read($self->{fh},$directory,$base_addr-24) ) {
184			carp "can't read directory: $!";
185			$skip = 1;
186			} else {
187			print STDERR "# $rec_nr directory: [",length($directory),"] '$directory'\n" if ($self->{debug});
188			}
189
190			print STDERR "# reading ",$reclen-$base_addr," bytes of fields\n" if ($self->{debug});
191
192			my $fields;
193			if( ! read($self->{fh},$fields,$reclen-$base_addr) ) {
194			carp "can't read fields: $!";
195			$skip = 1;
196			} else {
197			print STDERR "# $rec_nr fields: '$fields'\n" if ($self->{debug});
198			}
199
200			my $row;
201
202			while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
203			my ($tag,$len,$addr) = ($1,$2,$3);
204
205			if (($addr+$len) > length($fields)) {
206			print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $self->{quiet});
207			$skip = 1;
208			next;
209			}
210
211			# take field
212			my $f = substr($fields,$addr,$len);
213			print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
214
215	dpavlin	6	push @{ $row->{$tag} }, $f;
216	dpavlin	1
217			my $del = substr($fields,$addr+$len-1,1);
218
219			# check field delimiters...
220			if ($self->{assert} && $del ne chr(30)) {
221			print STDERR "WARNING: skipping record $rec_nr, can't find delimiter 30 got: '$del'\n" if (! $self->{quiet});
222			$skip = 1;
223			next;
224			}
225
226			if ($self->{assert} && length($f) < 2) {
227			print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $self->{quiet});
228			next;
229			}
230
231			}
232
233			return $row;
234			}
235
236	dpavlin	6
237			=head2 to_hash
238
239			Read record with specified MFN and convert it to hash
240
241			my $hash = $marc->to_hash($mfn);
242
243			It has ability to convert characters (using C<hash_filter>) from MARC
244			database before creating structures enabling character re-mapping or quick
245			fix-up of data.
246
247			This function returns hash which is like this:
248
249			'200' => [
250			{
251			'i1' => '1',
252			'i2' => ' '
253			'a' => 'Goa',
254			'f' => 'Valdo D\'Arienzo',
255			'e' => 'tipografie e tipografi nel XVI secolo',
256			}
257			],
258
259			This method will also create additional field C<000> with MFN.
260
261			=cut
262
263			sub to_hash {
264			my $self = shift;
265
266			my $mfn = shift \|\| confess "need mfn!";
267
268			# init record to include MFN as field 000
269			my $rec = { '000' => [ $mfn ] };
270
271			my $row = $self->fetch($mfn) \|\| return;
272
273	dpavlin	9	foreach my $rec_nr (keys %{$row}) {
274			foreach my $l (@{$row->{$rec_nr}}) {
275	dpavlin	6
276			# remove end marker
277			$l =~ s/\x1E$//;
278
279			# filter output
280	dpavlin	9	$l = $self->{'hash_filter'}->($l, $rec_nr) if ($self->{'hash_filter'});
281	dpavlin	6
282			my $val;
283
284			# has identifiers?
285			($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
286
287			# has subfields?
288			if ($l =~ m/\x1F/) {
289			foreach my $t (split(/\x1F/,$l)) {
290			next if (! $t);
291	dpavlin	8	my $f = substr($t,0,1);
292			# repeatable subfileds. When we hit first one,
293			# store CURRENT (up to that) in first repetition
294			# of this record. Then, new record with same
295			# identifiers will be created.
296			if ($val->{$f}) {
297	dpavlin	9	push @{$rec->{$rec_nr}}, $val;
298	dpavlin	8	$val = {
299			i1 => $val->{i1},
300			i2 => $val->{i2},
301			};
302			}
303	dpavlin	6	$val->{substr($t,0,1)} = substr($t,1);
304			}
305			} else {
306			$val = $l;
307			}
308
309	dpavlin	9	push @{$rec->{$rec_nr}}, $val;
310	dpavlin	6	}
311			}
312
313			return $rec;
314			}
315
316
317	dpavlin	1	1;
318			__END__
319
320			=head1 BUGS
321
322
323
324			=head1 SUPPORT
325
326
327
328			=head1 AUTHOR
329
330			Dobrica Pavlinusic
331			CPAN ID: DPAVLIN
332			dpavlin@rot13.org
333			https://www.rot13.org/~dpavlin/
334
335			=head1 COPYRIGHT
336
337			This program is free software; you can redistribute
338			it and/or modify it under the same terms as Perl itself.
339
340			The full text of the license can be found in the
341			LICENSE file included with this module.
342
343
344			=head1 SEE ALSO
345
346			perl(1).
347
348			=cut