1 |
dpavlin |
70 |
#!/usr/bin/perl -w |
2 |
|
|
|
3 |
|
|
# This script will fetch the list of articles to which you have access
4 |
|
|
# (using IP authorisation) from ScienceDirect |
5 |
|
|
# |
6 |
|
|
# This version requires CSV dumps from ScienceDirect for Holdings data |
7 |
|
|
# and categories, but can output much more data about each record |
8 |
|
|
|
9 |
|
|
use LWP::UserAgent; |
10 |
|
|
use HTML::TreeBuilder; |
11 |
|
|
require Text::CSV; |
12 |
|
|
use Text::Unaccent; |
13 |
|
|
use strict; |
14 |
|
|
|
15 |
|
|
my $debug=1;

# configure ScienceDirect CSV files location
my $csv_dir="/data/isis_data/sciencedirect";
my $j_holdings="sd_JournalHoldingsRpt.txt";
my $j_category="sd_Journal_Category.txt";

# URL to list of subscribed journals
my $url = 'http://www.sciencedirect.com/science?_ob=JournalListURL&_type=subscribed&_stype=title&subjColl=all&_auth=y&_update=y&_frameSeg=M&_title=all&_acct=C000050661&_version=1&_urlVersion=0&_userid=1034703&md5=6d4b6e263318a1d7d2a3b523d861f920';
my $html_codepage="iso-8859-1";

my $csv = Text::CSV->new();
my $journal;            # hashref: normalized journal title => arrayref of CSV fields
my $c_wo_h = 0; # category without holding record
my $c_nr = 0; # number of categories assigned

# counters for the final summary: journals emitted with only
# title+URL vs. journals matched against the holdings dump
my $j_basic = 0;
my $j_detailed = 0;

print STDERR "unrolling $j_holdings\n";
35 |
|
|
|
36 |
|
|
sub nuc {
	# Normalize a string for use as a hash key ("normalizing UC"):
	# strip accents, turn every non-word character into a space,
	# collapse runs of spaces, and upper-case the result.
	# Returns "" for undef or empty input.
	my $s = shift @_;
	# explicit defined/empty test so a title of "0" is not lost
	# (the old `shift @_ || return ""` treated any falsy value as empty)
	return "" unless defined $s && $s ne "";
	$s = unac_string($html_codepage, $s);
	$s =~ s/[^\w]/ /g;
	$s =~ s/ +/ /g;
	return uc($s);
}
44 |
|
|
|
45 |
|
|
# Load the holdings report: one CSV line per journal, keyed by the
# normalized journal title (column 0) so records can be matched
# against category rows and against titles scraped from the web page.
open(my $h_fh, '<', "$csv_dir/$j_holdings") or die "can't open $csv_dir/$j_holdings: $!";
my $line = <$h_fh>; # skip header line
while (<$h_fh>) {
	chomp;
	unless ($csv->parse($_)) {
		# skip malformed lines; the old code warned but then read
		# $csv->fields anyway, picking up the previous line's data
		warn "can't parse '$_': " . $csv->error_input;
		next;
	}
	my @data = $csv->fields;
	my $key = nuc($data[0]);
	push @data, ""; # placeholder slot for categories added later
	$journal->{$key} = \@data;
}
close($h_fh);
56 |
|
|
|
57 |
|
|
print STDERR "unrolling $j_category\n";

# Load the category report and attach up to four category fields
# (columns 4, 6, 8, 10) to the matching holdings record.
# Category rows with no holdings record are counted and skipped.
open(my $c_fh, '<', "$csv_dir/$j_category") or die "can't open $csv_dir/$j_category: $!";
$line = <$c_fh>; # skip header line
while (<$c_fh>) {
	chomp;
	unless ($csv->parse($_)) {
		# skip malformed lines instead of reusing stale fields
		warn "can't parse '$_': " . $csv->error_input;
		next;
	}
	my @data = $csv->fields;
	my $key = nuc($data[1]);
	if (! $journal->{$key}) {
		# category row without a matching holdings record
		$c_wo_h++;
		next;
	}

	foreach my $i (4, 6, 8, 10) {
		push @{ $journal->{$key} }, $data[$i] || "";
		$c_nr++ if ($data[$i]);
	}
}
close($c_fh);
79 |
|
|
|
80 |
|
|
# NOTE: $c_wo_h counts category rows WITHOUT a holdings record
# (the previous message said "with holdings", which contradicted
# the variable's own declaration comment).
print STDERR "$c_nr categories assigned, $c_wo_h categories without holdings\n";

# guard against an empty @ARGV: lc(undef) warns under -w
$debug++ if (@ARGV && lc($ARGV[0]) eq "-d");
83 |
|
|
|
84 |
|
|
# Polite harvester user agent with a contact address; direct method
# call replaces the old indirect "new LWP::UserAgent" syntax, which
# is fragile (parsed differently depending on what's in scope).
my $ua = LWP::UserAgent->new;
$ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0");
$ua->timeout(60);
#$ua->env_proxy();
#$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/');
89 |
|
|
|
90 |
|
|
print STDERR "getting '$url'...\n" if ($debug);
my $req = HTTP::Request->new(GET => $url);

my $res = $ua->request($req);
if ($res->is_success) {
	print STDERR "parsing html...\n" if ($debug);
	my $tree = HTML::TreeBuilder->new;
#	$tree->parse_file("list.html"); # !
	$tree->parse($res->content);

	# Each subscribed journal appears as a table row containing a
	# JournalURL link whose text is the journal title.
	foreach my $tr ($tree->look_down('_tag', 'tr')) {
		my $link = $tr->look_down('_tag', 'a');
		next unless $link;
		my $href = $link->attr('href');
		next unless defined $href; # anchors without href would warn under -w
		if ($href =~ m{/science\?_ob=JournalURL}) {
			my $j = nuc($link->as_text);
			if ($journal->{$j}) {
				# full holdings record found: dump every field, numbered
				my $i = 0;
				foreach my $line (@{ $journal->{$j} }) {
					print $i++, ": $line\n";
				}
				$j_detailed++;
			} else {
				# no holdings record: emit title (field 0) and URL (field 7) only
				print "0: ", $link->as_text . "\n";
				print "7: http://www.sciencedirect.com", $href . "\n";
				$j_basic++;
				print STDERR "can't find details for $j\n" if ($debug);
			}

			print "\n"; # blank line separates records
		}
	}

	$tree->delete; # clear memory!

} else {
	warn "can't fetch web page from '$url'";
}
130 |
|
|
|
131 |
|
|
# Final summary on STDERR: total journals processed plus the
# basic/detailed split accumulated during the page walk.
my $journal_total = $j_basic + $j_detailed;
print STDERR "Processed $journal_total journals, $j_basic with basic data and $j_detailed detailed\n";
132 |
|
|
|