1 |
#!/usr/bin/perl -w |
2 |
|
3 |
# This script will fetch the list of articles to which you have access
# (using IP authorisation) from ScienceDirect
#
# This version requires CSV dumps from ScienceDirect for Holdings data
# and categories, but can output much more data about each record
8 |
|
9 |
use LWP::UserAgent; |
10 |
use HTML::TreeBuilder; |
11 |
require Text::CSV; |
12 |
use Text::Unaccent; |
13 |
use strict; |
14 |
|
15 |
# Progress messages go to STDERR whenever this is non-zero.
my $debug = 1;

# configure ScienceDirect CSV files location
my $csv_dir    = "/data/isis_data/sciencedirect";
my $j_holdings = "sd_JournalHoldingsRpt.txt";
my $j_category = "sd_Journal_Category.txt";

# URL to list of subscribed journals
my $url = 'http://www.sciencedirect.com/science?_ob=JournalListURL&_type=subscribed&_stype=title&subjColl=all&_auth=y&_update=y&_frameSeg=M&_title=all&_acct=C000050661&_version=1&_urlVersion=0&_userid=1034703&md5=6d4b6e263318a1d7d2a3b523d861f920';

# Character set handed to unac_string() when titles are normalized.
my $html_codepage = "iso-8859-1";

# binary => 1 is needed so that fields containing characters outside the
# printable ASCII range (e.g. accented journal titles) are accepted by the
# parser instead of making parse() fail for the whole row.
my $csv = Text::CSV->new({ binary => 1 })
    or die "can't create CSV parser: " . Text::CSV->error_diag;

# Hash reference of journal records keyed by normalized title (see nuc());
# each value is an array reference with the fields of one holdings row,
# followed by the category columns appended later.
my $journal;
my $c_wo_h = 0;    # category without holding record
my $c_nr   = 0;    # number of categories assigned

my $j_basic    = 0;    # journals printed with link and title only
my $j_detailed = 0;    # journals printed with their full record

print STDERR "unrolling $j_holdings\n";
# nuc($string) -- build the normalized, upper-case lookup key for a title.
#
# The same function is applied to titles from the CSV dumps and to link
# text from the web page, so that both sides produce comparable keys.
#
# Steps: remove accents (unac_string, using $html_codepage as the source
# character set), turn every non-word character into a blank, squeeze runs
# of blanks, drop leading/trailing blanks and upper-case the result.
#
# Returns "" for undefined or empty input. A defined value such as "0" is
# normalized like any other string instead of being treated as empty.
sub nuc {
    my ($s) = @_;
    return "" unless defined $s && length $s;

    $s = unac_string($html_codepage, $s);
    $s =~ s/\W/ /g;     # punctuation and other non-word characters
    $s =~ s/ +/ /g;     # collapse runs of blanks
    # Without trimming, "Title." and "Title" would yield different keys
    # ("TITLE " vs "TITLE") and never match each other.
    $s =~ s/^ //;
    $s =~ s/ $//;
    return uc($s);
}
# Load the holdings dump into $journal: one record per row, keyed by the
# normalized title taken from the first column.
open(my $holdings_fh, '<', "$csv_dir/$j_holdings")
    or die "can't open $csv_dir/$j_holdings: $!";

# $line is declared here and assigned again when the category file's
# header is skipped further down, so the declaration must stay.
my $line = <$holdings_fh>;    # skip header line

while (my $row = <$holdings_fh>) {
    chomp $row;

    # A row that fails to parse has no usable fields; storing it would
    # create a bogus record under the empty key, so skip it.
    if (!$csv->parse($row)) {
        warn "can't parse '$row': " . $csv->error_input;
        next;
    }

    my @data = $csv->fields;
    my $key  = nuc($data[0]);
    next if $key eq "";    # no title to key the record on

    push @data, "";        # for categories later...
    $journal->{$key} = \@data;
}
close($holdings_fh);
print STDERR "unrolling $j_category\n";

# Append category columns to the journal records loaded from the holdings
# file. Rows are matched on the normalized title in the second column.
open(my $category_fh, '<', "$csv_dir/$j_category")
    or die "can't open $csv_dir/$j_category: $!";

# $line was declared when the holdings header was skipped; reuse it.
$line = <$category_fh>;    # skip header line

while (my $row = <$category_fh>) {
    chomp $row;

    # Skip rows that cannot be parsed instead of working on stale fields.
    if (!$csv->parse($row)) {
        warn "can't parse '$row': " . $csv->error_input;
        next;
    }

    my @data = $csv->fields;
    my $key  = nuc($data[1]);

    # Category row for a journal that has no holdings record.
    if (!$journal->{$key}) {
        $c_wo_h++;
        next;
    }

    # Columns 4, 6, 8 and 10 are appended to the record; missing or empty
    # ones are stored as "" and are not counted as assigned.
    foreach my $i (4, 6, 8, 10) {
        push @{ $journal->{$key} }, $data[$i] || "";
        $c_nr++ if $data[$i];
    }
}
close($category_fh);

# $c_wo_h counts category rows WITHOUT a holdings record.
print STDERR "$c_nr categories assigned, $c_wo_h categories without holdings\n";
# A first argument of "-d" (any case) raises the debug level. The @ARGV
# check avoids an "uninitialized value" warning when no argument is given.
$debug++ if (@ARGV && lc($ARGV[0]) eq "-d");

# HTTP client used to download the journal list.
my $ua = LWP::UserAgent->new;
$ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0");
$ua->timeout(60);
#$ua->env_proxy();
#$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/');

print STDERR "getting '$url'...\n" if ($debug);
my $req = HTTP::Request->new(GET => $url);

# NOTE(review): @out is not referenced anywhere in the visible script.
my @out;
# Download the journal list, print one record per journal link found in a
# table row, then report totals on STDERR.
my $res = $ua->request($req);
if ($res->is_success) {
    print STDERR "parsing html...\n" if ($debug);
    my $tree = HTML::TreeBuilder->new;
#   $tree->parse_file("list.html");    # !
    $tree->parse($res->content);
    # Unlike parse_file(), parse() does not finish the tree on its own;
    # eof() must be called once all content has been fed in.
    $tree->eof;

    foreach my $tr ($tree->look_down('_tag', 'tr')) {
        # Only the first link of each row is considered.
        my $link = $tr->look_down('_tag', 'a') or next;

        # Anchors without an href (e.g. <a name="...">) return undef.
        my $href = $link->attr('href');
        next unless defined $href && $href =~ m{/science\?_ob=JournalURL};

        my $j = nuc($link->as_text);
        if ($journal->{$j}) {
            # Detailed output: every field of the record, numbered from 0.
            my $i = 0;
            foreach my $field (@{ $journal->{$j} }) {
                print "$i: $field\n";
                $i++;
            }
            $j_detailed++;
        }
        else {
            # Basic output: only the link target and the title as shown.
            print "$href\n";
            print $link->as_text . "\n";
            $j_basic++;
            print STDERR "can't find details for $j\n" if ($debug);
        }

        print "\n";
    }

    $tree->delete;    # clear memory!
}
else {
    warn "can't fetch web page from '$url': " . $res->status_line;
}

print STDERR "Processed ",($j_basic+$j_detailed)," journals, $j_basic with basic data and $j_detailed detailed\n";