#!/usr/bin/perl -w

# This script will fetch the list of articles to which you have access
# (using IP authorisation) from ScienceDirect.
#
# Output: for each subscribed journal, three lines on STDOUT —
# the absolute journal URL, the journal title, and a blank separator.
# Progress messages go to STDERR when $debug is set.
#
# Usage: script.pl [-d]   (-d increases debug verbosity)

use strict;
use LWP::UserAgent;
use HTML::TreeBuilder;

my $debug = 1;

my $base_url = 'http://www.sciencedirect.com';
my $url = $base_url . '/science?_ob=JournalListURL&_type=subscribed&_stype=title&subjColl=all&_auth=y&_update=y&_frameSeg=M&_title=all&_acct=C000050661&_version=1&_urlVersion=0&_userid=1034703&md5=6d4b6e263318a1d7d2a3b523d861f920';

# Guard with @ARGV: lc(undef) would emit an "uninitialized value"
# warning under -w when the script is run without arguments.
$debug++ if (@ARGV and lc($ARGV[0]) eq "-d");

my $ua = LWP::UserAgent->new;
$ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0");
$ua->timeout(60);
#$ua->env_proxy();
#$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/');

print STDERR "getting '$url'...\n" if ($debug);
my $req = HTTP::Request->new(GET => $url);

my @out;

my $res = $ua->request($req);
if ($res->is_success) {
	print STDERR "parsing html...\n" if ($debug);
	my $tree = HTML::TreeBuilder->new;
	# $tree->parse_file("list.html"); # ! (offline testing hook)

	# decoded_content (not content): handles Content-Encoding
	# (gzip) and charset conversion before the HTML parser sees it.
	$tree->parse($res->decoded_content);

	# Each journal row is a <tr> whose first <a> links to the
	# journal page; filter on the JournalURL handler to skip
	# navigation links.
	foreach my $tr ($tree->look_down('_tag', 'tr')) {
		my $link = $tr->look_down('_tag', 'a');
		next unless $link;

		# Anchors without an href attribute would warn under -w.
		my $href = $link->attr('href');
		next unless defined $href;

		if ($href =~ m{/science\?_ob=JournalURL}) {
			print $base_url . $href . "\n";
			print $link->as_text . "\n";
			print "\n";
		}
	}

	$tree->delete; # clear memory!

} else {
	warn "can't fetch web page from '$url'";
}