Revision 338 (by dpavlin, 2004/06/10 22:05:38) fixed parsing for science direct html with more than one <a href=> per one <tr>
support to force input file (if you uncomment filename in script)
#!/usr/bin/perl -w

# This script will fatch list of articles on which you have access
# (using IP authorisation) from ScienceDirect
#
# This version requires CSV dumps from ScienceDirect for Holdings data
# and categories, but can output much more data about each record

use LWP::UserAgent;
use HTML::TreeBuilder;
require Text::CSV;
use Text::Unaccent;
use strict;

my $debug=1;

my $file;

# uncomment following line if you want to use file instead of http connection
#$file="list.html";

# configure ScienceDirect CVS files location
my $csv_dir="/data/isis_data/sciencedirect";
my $j_holdings="sd_JournalHoldingsRpt.txt";
my $j_category="sd_Journal_Category.txt";

# URL to list of subscribed journals
my $url = 'http://www.sciencedirect.com/science?_ob=JournalListURL&_type=subscribed&_stype=title&subjColl=all&_auth=y&_update=y&_frameSeg=M&_title=all&_acct=C000050661&_version=1&_urlVersion=0&_userid=1034703&md5=6d4b6e263318a1d7d2a3b523d861f920';
my $html_codepage="iso-8859-1";

my $csv = Text::CSV->new(); 
my $journal;
my $c_wo_h = 0;		# category without holding record
my $c_nr = 0;		# number of categories assigned

my $j_basic = 0;
my $j_detailed = 0;

print STDERR "unrolling $j_holdings\n";

sub nuc {
	# normalizing UC
	my $s=shift @_ || return "";
	$s=unac_string($html_codepage,$s);
	$s=~s/[^\w]/ /g;
	$s=~s/  +/ /g;
	return uc($s);
}

open(H,"$csv_dir/$j_holdings") || die "can't open $csv_dir/$j_holdings: $!";
my $line = <H>;		# skip header line
while(<H>) {
	chomp;
	$csv->parse($_) || warn "can't parse '$_': ".$csv->error_input;
	my @data = $csv->fields;
	my $key = nuc($data[0]);
	push @data,"";			# for categories later...
	$journal->{$key} = \@data;
}
close(H);

print STDERR "unrolling $j_category\n";

open(C,"$csv_dir/$j_category") || die "can't open $csv_dir/$j_category: $!";
$line = <C>;		# skip header line
while(<C>) {
	chomp;
	$csv->parse($_) || warn "can't parse '$_': ".$csv->error_input;
	my @data = $csv->fields;
	my $key = nuc($data[1]);
	if (! $journal->{$key}) {
		$c_wo_h++;
		next;
	}

	foreach my $i (4, 6, 8, 10) {
		push @{$journal->{$key}},$data[$i] || "";
		if ($data[$i]) {
			$c_nr++;
		}
	}
}
close(C);

print STDERR "$c_nr categories assigned, $c_wo_h categories with holdings\n";

$debug++ if (lc($ARGV[0]) eq "-d");


my $res;
if (! $file) {
	my $ua = new LWP::UserAgent;
	$ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0");
	$ua->timeout(60);
	#$ua->env_proxy();
	#$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/');

	print STDERR "getting '$url'...\n" if ($debug);
	my $req = HTTP::Request->new(GET => $url);

	$res = $ua->request($req);
} elsif (! -e $file) {
	die "can't find feed file '$file'";
}

if ($file || $res->is_success) {
	print STDERR "parsing html...\n" if ($debug);
	my $tree = HTML::TreeBuilder->new;
	if ($file) {
		$tree->parse_file("list.html");
	} else {
		$tree->parse($res->content);
	}

	foreach my $tr ($tree->look_down('_tag', 'tr')) {
		my $link;
		foreach my $link ($tr->look_down('_tag','a')) {
			if ($link->attr('href') =~ m{/science\?_ob=JournalURL}) {
				my $j=nuc($link->as_text);
				if ($journal->{$j}) {
					my $i=0;
					foreach my $line (@{$journal->{$j}}) {
						print $i++,": $line\n";
					}
					$j_detailed++;
				} else {
					print "0: ",$link->as_text."\n";
					print "7: http://www.sciencedirect.com",$link->attr('href')."\n";
					$j_basic++;
					print STDERR "can't find details for $j\n" if ($debug);
				}

				print "\n";
			}
		}
	}

	$tree->delete; # clear memory!

} else {
    warn "can't fetch web page from '$url'";
}

print STDERR "Processed ",($j_basic+$j_detailed)," journals, $j_basic with basic data and $j_detailed detailed\n";