/[webpac]/trunk/feeds/sciencedirect2.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/feeds/sciencedirect2.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 74 - (hide annotations)
Sat Jul 5 22:37:30 2003 UTC (16 years, 11 months ago) by dpavlin
File MIME type: text/plain
File size: 3421 byte(s)
support for the new feed format, which has the decimal field number, a colon
and a space at the beginning of each line (like: 0: data)

1 dpavlin 70 #!/usr/bin/perl -w
2    
3     # This script will fetch the list of articles to which you have access
4     # (using IP authorisation) from ScienceDirect
5     #
6     # This version requires CSV dumps from ScienceDirect for Holdings data
7     # and categories, but can output much more data about each record
8    
9     use LWP::UserAgent;
10     use HTML::TreeBuilder;
11     require Text::CSV;
12     use Text::Unaccent;
13     use strict;
14    
# Diagnostics on STDERR are switched on from the start; the -d command
# line switch (checked further down in the script) only raises the level.
my $debug = 1;

# Where the ScienceDirect CSV dumps live, and the two files read from there:
# the journal holdings report and the journal-to-category mapping.
my $csv_dir = "/data/isis_data/sciencedirect";
my ($j_holdings, $j_category) = (
    "sd_JournalHoldingsRpt.txt",
    "sd_Journal_Category.txt",
);

# URL of the list of subscribed journals, and the character set the
# title normalization assumes.
my $url = 'http://www.sciencedirect.com/science?_ob=JournalListURL&_type=subscribed&_stype=title&subjColl=all&_auth=y&_update=y&_frameSeg=M&_title=all&_acct=C000050661&_version=1&_urlVersion=0&_userid=1034703&md5=6d4b6e263318a1d7d2a3b523d861f920';
my $html_codepage = "iso-8859-1";

# One CSV parser is shared by both dump files.
my $csv = Text::CSV->new;

# Hash reference: normalized journal title => array reference of fields.
my $journal;

# $c_wo_h: category rows for which no holdings record exists
# $c_nr:   number of categories assigned to journals
my ($c_wo_h, $c_nr) = (0, 0);

# Journals printed with link data only / with the full holdings record.
my ($j_basic, $j_detailed) = (0, 0);

print STDERR "unrolling ", $j_holdings, "\n";
35    
# Normalize a journal title into a lookup key.
#
# The title is passed through unac_string() (Text::Unaccent) using
# $html_codepage as its character set, every non-word character becomes a
# space, runs of spaces are collapsed, leading/trailing spaces are removed
# and the result is upper-cased.
#
# Takes:   the title (may be undef or empty)
# Returns: the normalized key, or "" for undef/empty input
sub nuc {
    my $s = shift @_;

    # Test for "no value" explicitly: a plain truth test would also
    # discard the perfectly valid title "0".
    return "" unless defined $s && length $s;

    $s = unac_string($html_codepage, $s);
    $s =~ s/\W/ /g;
    $s =~ s/ +/ /g;

    # Trim, so that titles differing only in surrounding punctuation
    # ("Title." vs "Title") end up with the same key.
    $s =~ s/^ //;
    $s =~ s/ $//;

    return uc($s);
}
44    
# ---------------------------------------------------------------------
# 1. Holdings dump: one record per journal, keyed by normalized title.
# ---------------------------------------------------------------------
open(my $holdings_fh, '<', "$csv_dir/$j_holdings")
    or die "can't open $csv_dir/$j_holdings: $!";
my $holdings_header = <$holdings_fh>;    # skip header line
while (my $row = <$holdings_fh>) {
    chomp $row;
    if (! $csv->parse($row)) {
        # Do not store anything for a row that could not be parsed.
        warn "can't parse '$row': ".$csv->error_input;
        next;
    }
    my @data = $csv->fields;
    my $key = nuc($data[0]);
    next if $key eq "";     # no usable title, nothing to key the record on
    push @data, "";         # for categories later...
    $journal->{$key} = \@data;
}
close($holdings_fh);

print STDERR "unrolling $j_category\n";

# ---------------------------------------------------------------------
# 2. Category dump: append the category columns to the holdings record
#    of the journal named in column 1.
# ---------------------------------------------------------------------
open(my $category_fh, '<', "$csv_dir/$j_category")
    or die "can't open $csv_dir/$j_category: $!";
my $category_header = <$category_fh>;    # skip header line
while (my $row = <$category_fh>) {
    chomp $row;
    if (! $csv->parse($row)) {
        warn "can't parse '$row': ".$csv->error_input;
        next;
    }
    my @data = $csv->fields;
    my $key = nuc($data[1]);
    if (! $journal->{$key}) {
        $c_wo_h++;
        next;
    }

    # Columns 4, 6, 8 and 10 are appended as categories; a missing or
    # empty column is stored as "" so every record gets four entries.
    foreach my $i (4, 6, 8, 10) {
        my $category = $data[$i] || "";
        push @{ $journal->{$key} }, $category;
        $c_nr++ if $category;
    }
}
close($category_fh);

# $c_wo_h counts category rows that had NO matching holdings record.
print STDERR "$c_nr categories assigned, $c_wo_h categories without holdings\n";

# Guard against an empty @ARGV so that -w does not warn about undef.
$debug++ if (@ARGV && lc($ARGV[0]) eq "-d");

# ---------------------------------------------------------------------
# 3. Fetch the list of subscribed journals and print one record per
#    journal link found in the HTML table.
# ---------------------------------------------------------------------
my $ua = LWP::UserAgent->new;
$ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0");
$ua->timeout(60);
#$ua->env_proxy();
#$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/');

print STDERR "getting '$url'...\n" if ($debug);
my $req = HTTP::Request->new(GET => $url);

my $res = $ua->request($req);
if ($res->is_success) {
    print STDERR "parsing html...\n" if ($debug);
    my $tree = HTML::TreeBuilder->new;
    # $tree->parse_file("list.html"); # !
    $tree->parse($res->content);
    $tree->eof;     # parse() must be followed by eof() to finish the tree

    foreach my $tr ($tree->look_down('_tag', 'tr')) {
        # Only table rows whose first link points to a journal page count.
        my $link = $tr->look_down('_tag', 'a') or next;
        my $href = $link->attr('href');
        next unless defined $href && $href =~ m{/science\?_ob=JournalURL};

        my $j = nuc($link->as_text);
        if ($journal->{$j}) {
            # Detailed record: every field on its own line, prefixed
            # with its decimal index ("0: data").
            my $i = 0;
            foreach my $field (@{ $journal->{$j} }) {
                print $i++, ": $field\n";
            }
            $j_detailed++;
        } else {
            # Basic record: only what the HTML page itself provides.
            print $href."\n";
            print $link->as_text."\n";
            $j_basic++;
            print STDERR "can't find details for $j\n" if ($debug);
        }

        print "\n";     # blank line separates records
    }

    $tree->delete;  # clear memory!

} else {
    warn "can't fetch web page from '$url': ".$res->status_line;
}

print STDERR "Processed ",($j_basic+$j_detailed)," journals, $j_basic with basic data and $j_detailed detailed\n";

Properties

Name Value
cvs2svn:cvs-rev 1.2
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26