/[webpac]/branches/ecas/feeds/sciencedirect2.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /branches/ecas/feeds/sciencedirect2.pl

Parent Directory | Revision Log


Revision 763 - (show annotations)
Sun Nov 5 11:26:37 2006 UTC (17 years, 5 months ago) by dpavlin
File MIME type: text/plain
File size: 3693 byte(s)
local changes
1 #!/usr/bin/perl -w
2
3 # This script will fetch the list of articles to which you have access
4 # (using IP authorisation) from ScienceDirect.
5 #
6 # This version requires CSV dumps from ScienceDirect for holdings data
7 # and categories, but can output much more data about each record.
8
9 use LWP::UserAgent;
10 use HTML::TreeBuilder;
11 require Text::CSV;
12 use Text::Unaccent;
13 use strict;
14
# Debug switch: any true value enables progress messages on STDERR.
my $debug = 1;

# Optional path to a saved copy of the journal list page. While it is
# left undefined the page is fetched over HTTP from $url instead.
my $file;

# uncomment following line if you want to use file instead of http connection
#$file="list.html";

# configure ScienceDirect CSV files location
my $csv_dir    = "/data/isis_data/sciencedirect";
my $j_holdings = "sd_JournalHoldingsRpt.txt";
my $j_category = "sd_Journal_Category.txt";

# URL to list of subscribed journals

my $url = 'http://www.sciencedirect.com/science?_ob=JournalListURL&_type=subscribed&_stype=title&subjColl=all&_auth=y&_update=y&_frameSeg=M&_title=all&_acct=C000050661&_version=1&_urlVersion=0&_userid=1034703&md5=6d4b6e263318a1d7d2a3b523d861f920';

# Code page name handed to unac_string() when titles are normalized.
my $html_codepage = "iso-8859-1";

# binary => 1 is required for the parser to accept bytes outside the
# 0x20-0x7E range; without it every row holding an accented title is
# rejected by parse().
my $csv = Text::CSV->new({ binary => 1 })
    || die "can't create Text::CSV parser";

# normalized journal title => array ref with the fields of its record
my $journal = {};
my $c_wo_h = 0;    # category without holding record
my $c_nr   = 0;    # number of categories assigned

my $j_basic    = 0;    # journals printed with title and URL only
my $j_detailed = 0;    # journals printed with their full record

print STDERR "unrolling $j_holdings\n";
42
# Normalize a title into an upper-case lookup key.
#
# The string is passed through unac_string() (Text::Unaccent) using
# $html_codepage, every non-word character becomes a space, runs of
# spaces are squeezed into one, the ends are trimmed and the result is
# upper-cased.
#
# Takes the string to normalize. Returns the key, or "" when the
# argument is undefined or empty.
sub nuc {
    my $s = shift @_;

    # Test definedness and length rather than truth, so that the
    # string "0" is normalized instead of being treated as missing.
    return "" unless (defined($s) && length($s));

    $s = unac_string($html_codepage, $s);
    $s =~ s/\W/ /g;
    $s =~ s/ +/ /g;

    # Leading or trailing punctuation leaves a padding space behind;
    # drop it so such titles still produce the same key.
    $s =~ s/^ //;
    $s =~ s/ $//;

    return uc($s);
}
51
# Load the holdings report into $journal: one record per row, stored
# under the normalized form of the row's first column.
open(my $holdings_fh, '<', "$csv_dir/$j_holdings") || die "can't open $csv_dir/$j_holdings: $!";
my $line = <$holdings_fh>;    # skip header line
while (my $row = <$holdings_fh>) {
    chomp($row);
    if (! $csv->parse($row)) {
        # a failed parse yields no usable fields, so skip the row
        # instead of storing a bogus record under an empty key
        warn "can't parse '$row': ".$csv->error_input;
        next;
    }
    my @data = $csv->fields;
    my $key = nuc($data[0]);
    next if ($key eq "");    # no title to look the record up by
    push @data, "";          # for categories later...
    $journal->{$key} = \@data;
}
close($holdings_fh);
63
print STDERR "unrolling $j_category\n";

# Append category columns to the holdings records loaded earlier. Rows
# are matched on the normalized form of their second column; rows with
# no matching holdings record are only counted.
open(my $category_fh, '<', "$csv_dir/$j_category") || die "can't open $csv_dir/$j_category: $!";
$line = <$category_fh>;    # skip header line
while (my $row = <$category_fh>) {
    chomp($row);
    if (! $csv->parse($row)) {
        # a failed parse yields no usable fields, so skip the row
        warn "can't parse '$row': ".$csv->error_input;
        next;
    }
    my @data = $csv->fields;
    my $key = nuc($data[1]);
    if (! $journal->{$key}) {
        $c_wo_h++;
        next;
    }

    # columns 4, 6, 8 and 10 are the ones copied into the record; an
    # empty string keeps the record layout fixed when one is missing
    foreach my $i (4, 6, 8, 10) {
        push @{$journal->{$key}}, $data[$i] || "";
        $c_nr++ if ($data[$i]);
    }
}
close($category_fh);

# $c_wo_h counts category rows that had NO holdings record
print STDERR "$c_nr categories assigned, $c_wo_h categories without holdings\n";
88
# A first argument of -d (any case) raises the debug level. @ARGV is
# checked first so that running without arguments does not warn.
$debug++ if (@ARGV && lc($ARGV[0]) eq "-d");


# Response of the HTTP request; stays undefined when a local file is used.
my $res;
if (! $file) {
    my $ua = LWP::UserAgent->new;
    $ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0");
    $ua->timeout(60);
    #$ua->env_proxy();
    #$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/');

    print STDERR "getting '$url'...\n" if ($debug);
    my $req = HTTP::Request->new(GET => $url);

    $res = $ua->request($req);
} elsif (! -e $file) {
    die "can't find feed file '$file'";
}
107
# Parse the journal list (local file or fetched page) and print one
# record per journal link: the full holdings record when the normalized
# link text is a key of $journal, otherwise just title and URL.
if ($file || $res->is_success) {
    print STDERR "parsing html...\n" if ($debug);
    my $tree = HTML::TreeBuilder->new;
    if ($file) {
        # parse the configured file rather than a hard-coded name
        $tree->parse_file($file) || die "can't parse feed file '$file': $!";
    } else {
        $tree->parse($res->content);
        $tree->eof;    # signal end of document so the tree is completed
    }

    # NOTE(review): look_down() runs on every <tr>; if the page nests
    # tables a link may be visited once per enclosing row -- confirm
    # against the real page.
    foreach my $tr ($tree->look_down('_tag', 'tr')) {
        foreach my $link ($tr->look_down('_tag','a')) {
            # anchors without href (e.g. named anchors) are not journal links
            my $href = $link->attr('href');
            next unless (defined($href) && $href =~ m{/science\?_ob=JournalURL});

            my $j = nuc($link->as_text);
            if ($journal->{$j}) {
                # detailed record: every field prefixed with its index
                my $i = 0;
                foreach my $field (@{$journal->{$j}}) {
                    print $i++,": $field\n";
                }
                $j_detailed++;
            } else {
                # basic record: title as field 0, absolute URL as field 7
                print "0: ",$link->as_text."\n";
                print "7: http://www.sciencedirect.com",$href."\n";
                $j_basic++;
                print STDERR "can't find details for $j\n" if ($debug);
            }

            print "\n";
        }
    }

    $tree->delete;    # clear memory!

} else {
    # only reached when no file is configured, so $res is defined here
    warn "can't fetch web page from '$url': ".$res->status_line;
}

print STDERR "Processed ",($j_basic+$j_detailed)," journals, $j_basic with basic data and $j_detailed detailed\n";
147

Properties

Name Value
cvs2svn:cvs-rev 1.3
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26