Revision 217 (by dpavlin, 2004/02/01 23:28:27) bugfix and improvements:
- debug and quiet switches
- skip fields which are too short (just subfield without data)
- calculate directory address
- calculate base address in leader
#!/usr/bin/perl -w
#
# This script will try (hard) to convert database from
# PhpMyLibrary (http://phpmylibrary.sourceforge.net/) back
# to MARC format (ISO 2709)
#
# 2003-01-31 Dobrica Pavlinusic <dpavlin@rot13.org>
#
# This script is written for WebPac project available at
# http://webpac.sourceforge.net/
#
# MARC file format documentation is taken from
# http://www.ariadne.ac.uk/issue7/marc/
#
# 

use DBI;
use strict;
use Getopt::Long;

my $database = "postnuke";
my $host = "localhost";
my $user = "root";
my $passwd = "";

my $usage = 0;
my $debug = 0;
my $quiet = 0;

my $result = GetOptions(
	"database=s" => \$database,
	"host=s" => \$host,
	"user=s" => \$user,
	"password=s" => \$passwd,
	"debug!" => \$debug,
	"verbose!" => \$debug,
	"quiet!" => \$quiet,
	"help!" => \$usage,
	);

if ($usage) {
	print qq{usage: $0 [--database="$database" --host="$host" --user="$user" --password="$passwd"] > file.marc\n
This script will convert PhpMyLibrary database to standard UNIMARC format\n};
	exit 1;
}

my $dsn = "DBI:mysql:database=$database;host=$host";
my $dbh = DBI->connect($dsn, $user, $passwd, {'RaiseError' => 1});

# UNIMARC leader format
#my $leader_fmt = qq{%05diam0 22%05d   45  };
# MARC leader format
my $leader_fmt = qq{%05dcas  22%05d a 4500};


my $sth = $dbh->prepare("SELECT marc FROM tblbib");
$sth->execute();

my $count = 0;
my $rec_nr = 0;

while (my $row = $sth->fetchrow_hashref()) {
	my $marc = $row->{'marc'};
	$rec_nr++;

	my $real_len = length($marc);

	my $skip = 0;	# skip this record?

	# fix PhpMyLibrary MARC (why do I have to do this? It's MARC,
	# for gaddem sake!!!
	
	# Byte        Name
	# ----        ----
	# 0-4         Record Length
	# 5           Status (n=new, c=corrected and d=deleted)
	# 6           Type of Record (a=printed material)
	# 7           Bibliographic Level (m=monograph)
	# 8-9         Blanks
	# 10          Indictator count (2 for monographs)
	# 11          Subfield code count (2 - 0x1F+subfield code itself)
	# 12-16       Base address of data
	# 17          Encoding level (blank=full level, 1=sublevel 1, 2=sublevel 2,
	# 		3=sublevel 3)
	# 18          Descriptive Cataloguing Form (blank=record is full ISBD,
	#		n=record is in non-ISBD format, i=record is in
	#		an incomplete ISBD format)
	# 19          Blank
	# 20          Length of length field in directory (always 4 in UNIMARC)
	# 21          Length of Starting Character Position in directory (always
	# 		5 in UNIMARC)
	# 22          Length of implementation defined portion in directory (always
	# 		0 in UNIMARC)
	# 23          Blank
	#
	#           |0   4|5  89  |12 16|1n 450 |
	#           (xxxxx)nam  22(.....)   450 <---
	$marc =~ m/^(.....)......(.....)polerioj/ || die "record: '$marc' unparsable!";
	my ($reclen,$base_addr) = ($1,$2);

	my $directory = substr($marc,24,$base_addr-24);
	my $fields = substr($marc,$base_addr-1);

	print STDERR "# $rec_nr fields: '$fields'\n" if ($debug);
	print STDERR "# $rec_nr directory: [",length($directory),"]\n" if ($debug);

	# PhpMyLibrary MARC records don't have indicators, so we'll add them

	my $o = 0;	# offset
	my $new_dictionary;
	my $new_fields;

	while (!$skip && $directory =~ s/(\d{3})(\d{4})(\d{5})//) {
		my ($tag,$len,$addr) = ($1,$2,$3);

		sub check_field($) {
			my $f = shift;
			my $del = substr($f,0,1);
		
			die "expected 0x1e, got '$del' (".ord($del)."): '$f'" if (ord($del) != 30);
		}

		if (($addr+$len) > length($fields)) {
			print STDERR "WARNING: error in dictionary on record $rec_nr skipping...\n" if (! $quiet);
			$skip = 1;
			next;
		}

		# take field
		my $f = substr($fields,$addr,$len);
		print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($debug);

		my $del = substr($fields,$addr+$len,1);

		# check field delimiters...
		if ($del ne chr(30)) {
			print STDERR "WARNING: skipping record $rec_nr, can't find delimiters got: '$del'\n" if (! $quiet);
			$skip = 1;
			next;
		}

		check_field($f);

		if (length($f) < 2) {
			print STDERR "WARNING: skipping field $tag from record $rec_nr because it's too short!\n" if (! $quiet);
			next;
		}


		if ($tag =~ m/^00/) {
			# fields 001-008 doesn't have indicators
			$new_dictionary .= sprintf("%03d%04d%05d",$tag,$len,length($new_fields || ''));
			$new_fields.=$f;
		} else {
			$new_dictionary .= sprintf("%03d%04d%05d",$tag,($len+2),length($new_fields || ''));
			$new_fields.=chr(30)."  ".substr($f,1);
			$o += 2;
		}
	}

	if (! $skip) {
		my $new_leader = sprintf($leader_fmt,24+length($new_dictionary.$new_fields)+2,length($new_dictionary)+25);
		my $new_marc = $new_leader . $new_dictionary . $new_fields . chr(30);
		$new_marc .= chr(29);	# end of record

		print STDERR "original and new marc: [$rec_nr/$count]\n$marc\n$new_marc\n\n" if ($debug);
		print "$new_marc";
		$count++;
	}

#	last if ($count > 100);

}
$sth->finish();
$dbh->disconnect();

print STDERR "$count records from database $database converted...\n";