14 |
|
|
15 |
my $debug=1; |
my $debug=1; |
16 |
|
|
17 |
|
my $file; |
18 |
|
|
19 |
|
# uncomment the following line if you want to use a file instead of an HTTP connection |
20 |
|
#$file="list.html"; |
21 |
|
|
22 |
# configure ScienceDirect CSV files location |
# configure ScienceDirect CSV files location |
23 |
my $csv_dir="/data/isis_data/sciencedirect"; |
my $csv_dir="/data/isis_data/sciencedirect"; |
24 |
my $j_holdings="sd_JournalHoldingsRpt.txt"; |
my $j_holdings="sd_JournalHoldingsRpt.txt"; |
86 |
|
|
87 |
$debug++ if (lc($ARGV[0]) eq "-d"); |
$debug++ if (lc($ARGV[0]) eq "-d"); |
88 |
|
|
|
my $ua = new LWP::UserAgent; |
|
|
$ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0"); |
|
|
$ua->timeout(60); |
|
|
#$ua->env_proxy(); |
|
|
#$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/'); |
|
|
|
|
|
print STDERR "getting '$url'...\n" if ($debug); |
|
|
my $req = HTTP::Request->new(GET => $url); |
|
89 |
|
|
90 |
my @out; |
my $res; |
91 |
|
if (! $file) { |
92 |
|
my $ua = new LWP::UserAgent; |
93 |
|
$ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0"); |
94 |
|
$ua->timeout(60); |
95 |
|
#$ua->env_proxy(); |
96 |
|
#$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/'); |
97 |
|
|
98 |
|
print STDERR "getting '$url'...\n" if ($debug); |
99 |
|
my $req = HTTP::Request->new(GET => $url); |
100 |
|
|
101 |
|
$res = $ua->request($req); |
102 |
|
} elsif (! -e $file) { |
103 |
|
die "can't find feed file '$file'"; |
104 |
|
} |
105 |
|
|
106 |
my $res = $ua->request($req); |
if ($file || $res->is_success) { |
|
if ($res->is_success) { |
|
107 |
print STDERR "parsing html...\n" if ($debug); |
print STDERR "parsing html...\n" if ($debug); |
108 |
my $tree = HTML::TreeBuilder->new; |
my $tree = HTML::TreeBuilder->new; |
109 |
# $tree->parse_file("list.html"); # ! |
if ($file) { |
110 |
$tree->parse($res->content); |
$tree->parse_file("list.html"); |
111 |
|
} else { |
112 |
|
$tree->parse($res->content); |
113 |
|
} |
114 |
|
|
115 |
foreach my $tr ($tree->look_down('_tag', 'tr')) { |
foreach my $tr ($tree->look_down('_tag', 'tr')) { |
116 |
my $link; |
my $link; |
117 |
if ($link = $tr->look_down('_tag','a')) { |
foreach my $link ($tr->look_down('_tag','a')) { |
118 |
if ($link->attr('href') =~ m{/science\?_ob=JournalURL}) { |
if ($link->attr('href') =~ m{/science\?_ob=JournalURL}) { |
119 |
my $j=nuc($link->as_text); |
my $j=nuc($link->as_text); |
120 |
if ($journal->{$j}) { |
if ($journal->{$j}) { |
124 |
} |
} |
125 |
$j_detailed++; |
$j_detailed++; |
126 |
} else { |
} else { |
127 |
print $link->attr('href')."\n"; |
print "0: ",$link->as_text."\n"; |
128 |
print $link->as_text."\n"; |
print "7: http://www.sciencedirect.com",$link->attr('href')."\n"; |
129 |
$j_basic++; |
$j_basic++; |
130 |
print STDERR "can't find details for $j\n" if ($debug); |
print STDERR "can't find details for $j\n" if ($debug); |
131 |
} |
} |