1 |
dpavlin |
70 |
#!/usr/bin/perl -w |
2 |
|
|
|
3 |
|
|
# This script will fetch list of articles on which you have access
4 |
|
|
# (using IP authorisation) from ScienceDirect |
5 |
|
|
# |
6 |
|
|
# This version requires CSV dumps from ScienceDirect for Holdings data |
7 |
|
|
# and categories, but can output much more data about each record |
8 |
|
|
|
9 |
|
|
use LWP::UserAgent; |
10 |
|
|
use HTML::TreeBuilder; |
11 |
|
|
require Text::CSV; |
12 |
|
|
use Text::Unaccent; |
13 |
|
|
use strict; |
14 |
|
|
|
15 |
|
|
my $debug=1; |
16 |
|
|
|
17 |
dpavlin |
339 |
my $file; |
18 |
|
|
|
19 |
|
|
# uncomment following line if you want to use file instead of http connection |
20 |
|
|
#$file="list.html"; |
21 |
|
|
|
22 |
dpavlin |
70 |
# configure ScienceDirect CVS files location |
23 |
|
|
my $csv_dir="/data/isis_data/sciencedirect"; |
24 |
|
|
my $j_holdings="sd_JournalHoldingsRpt.txt"; |
25 |
|
|
my $j_category="sd_Journal_Category.txt"; |
26 |
|
|
|
27 |
|
|
# URL to list of subscribed journals |
28 |
|
|
my $url = 'http://www.sciencedirect.com/science?_ob=JournalListURL&_type=subscribed&_stype=title&subjColl=all&_auth=y&_update=y&_frameSeg=M&_title=all&_acct=C000050661&_version=1&_urlVersion=0&_userid=1034703&md5=6d4b6e263318a1d7d2a3b523d861f920'; |
29 |
|
|
my $html_codepage="iso-8859-1"; |
30 |
|
|
|
31 |
|
|
my $csv = Text::CSV->new(); |
32 |
|
|
my $journal; |
33 |
|
|
my $c_wo_h = 0; # category without holding record |
34 |
|
|
my $c_nr = 0; # number of categories assigned |
35 |
|
|
|
36 |
|
|
my $j_basic = 0; |
37 |
|
|
my $j_detailed = 0; |
38 |
|
|
|
39 |
|
|
print STDERR "unrolling $j_holdings\n"; |
40 |
|
|
|
41 |
|
|
sub nuc { |
42 |
|
|
# normalizing UC |
43 |
|
|
my $s=shift @_ || return ""; |
44 |
|
|
$s=unac_string($html_codepage,$s); |
45 |
|
|
$s=~s/[^\w]/ /g; |
46 |
|
|
$s=~s/ +/ /g; |
47 |
|
|
return uc($s); |
48 |
|
|
} |
49 |
|
|
|
50 |
|
|
open(H,"$csv_dir/$j_holdings") || die "can't open $csv_dir/$j_holdings: $!"; |
51 |
|
|
my $line = <H>; # skip header line |
52 |
|
|
while(<H>) { |
53 |
|
|
chomp; |
54 |
|
|
$csv->parse($_) || warn "can't parse '$_': ".$csv->error_input; |
55 |
|
|
my @data = $csv->fields; |
56 |
|
|
my $key = nuc($data[0]); |
57 |
|
|
push @data,""; # for categories later... |
58 |
|
|
$journal->{$key} = \@data; |
59 |
|
|
} |
60 |
|
|
close(H); |
61 |
|
|
|
62 |
|
|
print STDERR "unrolling $j_category\n"; |
63 |
|
|
|
64 |
|
|
open(C,"$csv_dir/$j_category") || die "can't open $csv_dir/$j_category: $!"; |
65 |
|
|
$line = <C>; # skip header line |
66 |
|
|
while(<C>) { |
67 |
|
|
chomp; |
68 |
|
|
$csv->parse($_) || warn "can't parse '$_': ".$csv->error_input; |
69 |
|
|
my @data = $csv->fields; |
70 |
|
|
my $key = nuc($data[1]); |
71 |
|
|
if (! $journal->{$key}) { |
72 |
|
|
$c_wo_h++; |
73 |
|
|
next; |
74 |
|
|
} |
75 |
|
|
|
76 |
|
|
foreach my $i (4, 6, 8, 10) { |
77 |
dpavlin |
74 |
push @{$journal->{$key}},$data[$i] || ""; |
78 |
dpavlin |
70 |
if ($data[$i]) { |
79 |
|
|
$c_nr++; |
80 |
|
|
} |
81 |
|
|
} |
82 |
|
|
} |
83 |
|
|
close(C); |
84 |
|
|
|
85 |
|
|
print STDERR "$c_nr categories assigned, $c_wo_h categories with holdings\n"; |
86 |
|
|
|
87 |
|
|
$debug++ if (lc($ARGV[0]) eq "-d"); |
88 |
|
|
|
89 |
|
|
|
90 |
dpavlin |
339 |
my $res; |
91 |
|
|
if (! $file) { |
92 |
|
|
my $ua = new LWP::UserAgent; |
93 |
|
|
$ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0"); |
94 |
|
|
$ua->timeout(60); |
95 |
|
|
#$ua->env_proxy(); |
96 |
|
|
#$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/'); |
97 |
dpavlin |
70 |
|
98 |
dpavlin |
339 |
print STDERR "getting '$url'...\n" if ($debug); |
99 |
|
|
my $req = HTTP::Request->new(GET => $url); |
100 |
dpavlin |
70 |
|
101 |
dpavlin |
339 |
$res = $ua->request($req); |
102 |
|
|
} elsif (! -e $file) { |
103 |
|
|
die "can't find feed file '$file'"; |
104 |
|
|
} |
105 |
|
|
|
106 |
|
|
if ($file || $res->is_success) { |
107 |
dpavlin |
70 |
print STDERR "parsing html...\n" if ($debug); |
108 |
|
|
my $tree = HTML::TreeBuilder->new; |
109 |
dpavlin |
339 |
if ($file) { |
110 |
|
|
$tree->parse_file("list.html"); |
111 |
|
|
} else { |
112 |
|
|
$tree->parse($res->content); |
113 |
|
|
} |
114 |
dpavlin |
70 |
|
115 |
|
|
foreach my $tr ($tree->look_down('_tag', 'tr')) { |
116 |
|
|
my $link; |
117 |
dpavlin |
339 |
foreach my $link ($tr->look_down('_tag','a')) { |
118 |
dpavlin |
70 |
if ($link->attr('href') =~ m{/science\?_ob=JournalURL}) { |
119 |
|
|
my $j=nuc($link->as_text); |
120 |
|
|
if ($journal->{$j}) { |
121 |
dpavlin |
74 |
my $i=0; |
122 |
|
|
foreach my $line (@{$journal->{$j}}) { |
123 |
|
|
print $i++,": $line\n"; |
124 |
|
|
} |
125 |
dpavlin |
70 |
$j_detailed++; |
126 |
|
|
} else { |
127 |
dpavlin |
77 |
print "0: ",$link->as_text."\n"; |
128 |
|
|
print "7: http://www.sciencedirect.com",$link->attr('href')."\n"; |
129 |
dpavlin |
70 |
$j_basic++; |
130 |
|
|
print STDERR "can't find details for $j\n" if ($debug); |
131 |
|
|
} |
132 |
|
|
|
133 |
|
|
print "\n"; |
134 |
|
|
} |
135 |
|
|
} |
136 |
|
|
} |
137 |
|
|
|
138 |
|
|
$tree->delete; # clear memory! |
139 |
|
|
|
140 |
|
|
} else { |
141 |
|
|
warn "can't fetch web page from '$url'"; |
142 |
|
|
} |
143 |
|
|
|
144 |
|
|
print STDERR "Processed ",($j_basic+$j_detailed)," journals, $j_basic with basic data and $j_detailed detailed\n"; |
145 |
|
|
|