Revision 74 (by dpavlin, 2003/07/05 22:37:30) support for the new feed format, in which each line begins with the decimal field number,
a colon and a space (like: 0: data)
#!/usr/bin/perl -w

# This script will fetch the list of articles to which you have access
# (using IP authorisation) from ScienceDirect

use LWP::UserAgent;
use HTML::TreeBuilder;
use strict;

# Fetch the ScienceDirect "subscribed journals" page and print one record per
# matching journal link on STDOUT, in the line-oriented feed format:
#
#   0: <text of the link>
#   7: <$base_url followed by the link's href>
#   <empty line>
#
# Progress messages are written to STDERR while $debug is true.  If the page
# cannot be fetched the script dies with the HTTP status line.

# NOTE(review): debugging is enabled by default, and every check below only
# tests $debug for truth, so -d currently makes no visible difference --
# confirm whether the default was meant to be 0.
my $debug=1;

my $base_url = 'http://www.sciencedirect.com';
my $url = $base_url . '/science?_ob=JournalListURL&_type=subscribed&_stype=title&subjColl=all&_auth=y&_update=y&_frameSeg=M&_title=all&_acct=C000050661&_version=1&_urlVersion=0&_userid=1034703&md5=6d4b6e263318a1d7d2a3b523d861f920';

# Check that an argument exists first, so lc() is never handed undef
# (which would warn under -w when the script is run without arguments).
$debug++ if (@ARGV && lc($ARGV[0]) eq "-d");

my $ua = LWP::UserAgent->new;
$ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0");
$ua->timeout(60);
#$ua->env_proxy();
#$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/');

print STDERR "getting '$url'...\n" if ($debug);
my $req = HTTP::Request->new(GET => $url);

my @out;	# not used anywhere in the visible part of the script

my $res = $ua->request($req);
if ($res->is_success) {
	print STDERR "parsing html...\n" if ($debug);
	my $tree = HTML::TreeBuilder->new;
#	$tree->parse_file("list.html");   # !
	$tree->parse($res->content);
	# parse() only feeds a chunk of the document; eof() signals that the
	# input is complete so the tree is finished before it is searched.
	$tree->eof;

	foreach my $tr ($tree->look_down('_tag', 'tr')) {
		# look_down in scalar context yields only the first match, so just
		# the first <a> inside each table row is examined.
		my $link = $tr->look_down('_tag','a') or next;

		# Anchors without an href (e.g. <a name="...">) return undef here;
		# skip them instead of matching a regex against undef.
		my $href = $link->attr('href');
		next unless (defined $href && $href =~ m{/science\?_ob=JournalURL});

		print "0: ",$link->as_text."\n";
		print "7: ",$base_url.$href."\n";
		print "\n";
	}

	$tree->delete; # clear memory!

} else {
	# Report why the request failed and exit with a non-zero status so that
	# callers can tell the feed was not produced.
	die "can't fetch web page from '$url': " . $res->status_line . "\n";
}