1 |
dpavlin |
70 |
#!/usr/bin/perl -w |
2 |
|
|
|
3 |
|
|
# This script will fetch list of articles on which you have access
4 |
|
|
# (using IP authorisation) from ScienceDirect |
5 |
|
|
# |
6 |
|
|
# This version requires CSV dumps from ScienceDirect for Holdings data |
7 |
|
|
# and categories, but can output much more data about each record |
8 |
|
|
|
9 |
|
|
use LWP::UserAgent; |
10 |
|
|
use HTML::TreeBuilder; |
11 |
|
|
require Text::CSV; |
12 |
|
|
use Text::Unaccent; |
13 |
|
|
use strict; |
14 |
|
|
|
15 |
|
|
my $debug=1; |
16 |
|
|
|
17 |
dpavlin |
339 |
my $file; |
18 |
|
|
|
19 |
|
|
# uncomment following line if you want to use file instead of http connection |
20 |
|
|
#$file="list.html"; |
21 |
|
|
|
22 |
dpavlin |
70 |
# configure ScienceDirect CVS files location |
23 |
|
|
my $csv_dir="/data/isis_data/sciencedirect"; |
24 |
|
|
my $j_holdings="sd_JournalHoldingsRpt.txt"; |
25 |
|
|
my $j_category="sd_Journal_Category.txt"; |
26 |
|
|
|
27 |
|
|
# URL to list of subscribed journals |
28 |
|
|
my $url = 'http://www.sciencedirect.com/science?_ob=JournalListURL&_type=subscribed&_stype=title&subjColl=all&_auth=y&_update=y&_frameSeg=M&_title=all&_acct=C000050661&_version=1&_urlVersion=0&_userid=1034703&md5=6d4b6e263318a1d7d2a3b523d861f920'; |
29 |
|
|
my $html_codepage="iso-8859-1"; |
30 |
|
|
|
31 |
|
|
my $csv = Text::CSV->new(); |
32 |
|
|
my $journal; |
33 |
|
|
my $c_wo_h = 0; # category without holding record |
34 |
|
|
my $c_nr = 0; # number of categories assigned |
35 |
|
|
|
36 |
|
|
my $j_basic = 0; |
37 |
|
|
my $j_detailed = 0; |
38 |
|
|
|
39 |
|
|
print STDERR "unrolling $j_holdings\n"; |
40 |
|
|
|
41 |
|
|
sub nuc { |
42 |
|
|
# normalizing UC |
43 |
|
|
my $s=shift @_ || return ""; |
44 |
|
|
$s=unac_string($html_codepage,$s); |
45 |
|
|
$s=~s/[^\w]/ /g; |
46 |
|
|
$s=~s/ +/ /g; |
47 |
|
|
return uc($s); |
48 |
|
|
} |
49 |
|
|
|
50 |
|
|
open(H,"$csv_dir/$j_holdings") || die "can't open $csv_dir/$j_holdings: $!"; |
51 |
|
|
my $line = <H>; # skip header line |
52 |
|
|
while(<H>) { |
53 |
|
|
chomp; |
54 |
|
|
$csv->parse($_) || warn "can't parse '$_': ".$csv->error_input; |
55 |
|
|
my @data = $csv->fields; |
56 |
|
|
my $key = nuc($data[0]); |
57 |
|
|
push @data,""; # for categories later... |
58 |
|
|
$journal->{$key} = \@data; |
59 |
|
|
} |
60 |
|
|
close(H); |
61 |
|
|
|
62 |
|
|
print STDERR "unrolling $j_category\n"; |
63 |
|
|
|
64 |
|
|
open(C,"$csv_dir/$j_category") || die "can't open $csv_dir/$j_category: $!"; |
65 |
|
|
$line = <C>; # skip header line |
66 |
|
|
while(<C>) { |
67 |
|
|
chomp; |
68 |
|
|
$csv->parse($_) || warn "can't parse '$_': ".$csv->error_input; |
69 |
|
|
my @data = $csv->fields; |
70 |
|
|
my $key = nuc($data[1]); |
71 |
|
|
if (! $journal->{$key}) { |
72 |
|
|
$c_wo_h++; |
73 |
|
|
next; |
74 |
|
|
} |
75 |
|
|
|
76 |
|
|
foreach my $i (4, 6, 8, 10) { |
77 |
dpavlin |
74 |
push @{$journal->{$key}},$data[$i] || ""; |
78 |
dpavlin |
70 |
if ($data[$i]) { |
79 |
|
|
$c_nr++; |
80 |
|
|
} |
81 |
|
|
} |
82 |
|
|
} |
83 |
|
|
close(C); |
84 |
|
|
|
85 |
|
|
print STDERR "$c_nr categories assigned, $c_wo_h categories with holdings\n"; |
86 |
|
|
|
87 |
|
|
$debug++ if (lc($ARGV[0]) eq "-d"); |
88 |
|
|
|
89 |
|
|
|
90 |
dpavlin |
339 |
my $res; |
91 |
|
|
if (! $file) { |
92 |
|
|
my $ua = new LWP::UserAgent; |
93 |
|
|
$ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0"); |
94 |
|
|
$ua->timeout(60); |
95 |
|
|
#$ua->env_proxy(); |
96 |
|
|
#$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/'); |
97 |
dpavlin |
70 |
|
98 |
dpavlin |
339 |
print STDERR "getting '$url'...\n" if ($debug); |
99 |
|
|
my $req = HTTP::Request->new(GET => $url); |
100 |
dpavlin |
70 |
|
101 |
dpavlin |
339 |
$res = $ua->request($req); |
102 |
|
|
} elsif (! -e $file) { |
103 |
|
|
die "can't find feed file '$file'"; |
104 |
|
|
} |
105 |
|
|
|
106 |
|
|
if ($file || $res->is_success) { |
107 |
dpavlin |
70 |
print STDERR "parsing html...\n" if ($debug); |
108 |
|
|
my $tree = HTML::TreeBuilder->new; |
109 |
dpavlin |
339 |
if ($file) { |
110 |
|
|
$tree->parse_file("list.html"); |
111 |
|
|
} else { |
112 |
|
|
$tree->parse($res->content); |
113 |
|
|
} |
114 |
dpavlin |
70 |
|
115 |
|
|
foreach my $tr ($tree->look_down('_tag', 'tr')) { |
116 |
|
|
my $link; |
117 |
dpavlin |
339 |
foreach my $link ($tr->look_down('_tag','a')) { |
118 |
dpavlin |
70 |
if ($link->attr('href') =~ m{/science\?_ob=JournalURL}) { |
119 |
|
|
my $j=nuc($link->as_text); |
120 |
|
|
if ($journal->{$j}) { |
121 |
dpavlin |
74 |
my $i=0; |
122 |
|
|
foreach my $line (@{$journal->{$j}}) { |
123 |
|
|
print $i++,": $line\n"; |
124 |
|
|
} |
125 |
dpavlin |
70 |
$j_detailed++; |
126 |
|
|
} else { |
127 |
dpavlin |
77 |
print "0: ",$link->as_text."\n"; |
128 |
|
|
print "7: http://www.sciencedirect.com",$link->attr('href')."\n"; |
129 |
dpavlin |
70 |
$j_basic++; |
130 |
|
|
print STDERR "can't find details for $j\n" if ($debug); |
131 |
|
|
} |
132 |
|
|
|
133 |
|
|
print "\n"; |
134 |
|
|
} |
135 |
|
|
} |
136 |
|
|
} |
137 |
|
|
|
138 |
|
|
$tree->delete; # clear memory! |
139 |
|
|
|
140 |
|
|
} else { |
141 |
|
|
warn "can't fetch web page from '$url'"; |
142 |
|
|
} |
143 |
|
|
|
144 |
|
|
print STDERR "Processed ",($j_basic+$j_detailed)," journals, $j_basic with basic data and $j_detailed detailed\n"; |
145 |
|
|
|