Allow the journal list to be parsed from a local HTML file ($file) instead
of being fetched over HTTP, always push the holdings columns (empty ones
as "") so every printed field keeps its position, and print each field as
"index: value".

--- trunk/feeds/sciencedirect2.pl	2003/07/05 18:26:40	70
+++ trunk/feeds/sciencedirect2.pl	2004/06/10 22:05:38	338
@@ -14,6 +14,11 @@
 
 my $debug=1;
 
+my $file;
+
+# uncomment following line if you want to use file instead of http connection
+#$file="list.html";
+
 # configure ScienceDirect CVS files location
 my $csv_dir="/data/isis_data/sciencedirect";
 my $j_holdings="sd_JournalHoldingsRpt.txt";
@@ -69,8 +74,8 @@
 	}
 
 	foreach my $i (4, 6, 8, 10) {
+		push @{$journal->{$key}},$data[$i] || "";
 		if ($data[$i]) {
-			push @{$journal->{$key}},$data[$i];
 			$c_nr++;
 		}
 	}
@@ -81,35 +86,47 @@
 
 $debug++ if (lc($ARGV[0]) eq "-d");
 
-my $ua = new LWP::UserAgent;
-$ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0");
-$ua->timeout(60);
-#$ua->env_proxy();
-#$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/');
-
-print STDERR "getting '$url'...\n" if ($debug);
-my $req = HTTP::Request->new(GET => $url);
-my @out;
+my $res;
+if (! $file) {
+	my $ua = LWP::UserAgent->new;
+	$ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0");
+	$ua->timeout(60);
+	#$ua->env_proxy();
+	#$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/');
+
+	print STDERR "getting '$url'...\n" if ($debug);
+	my $req = HTTP::Request->new(GET => $url);
+
+	$res = $ua->request($req);
+} elsif (! -e $file) {
+	die "can't find feed file '$file'";
+}
 
-my $res = $ua->request($req);
-if ($res->is_success) {
+if ($file || $res->is_success) {
 	print STDERR "parsing html...\n" if ($debug);
 	my $tree = HTML::TreeBuilder->new;
-#	$tree->parse_file("list.html");	# !
-	$tree->parse($res->content);
+	if ($file) {
+		$tree->parse_file($file);	# use the configured file, not a fixed name
+	} else {
+		$tree->parse($res->content);
+		$tree->eof;	# parse() leaves the tree unfinished until eof() is called
+	}
 
 	foreach my $tr ($tree->look_down('_tag', 'tr')) {
 		my $link;
-		if ($link = $tr->look_down('_tag','a')) {
+		foreach my $link ($tr->look_down('_tag','a')) {
 			if ($link->attr('href') =~ m{/science\?_ob=JournalURL}) {
 				my $j=nuc($link->as_text);
 				if ($journal->{$j}) {
-					print join("\n",@{$journal->{$j}});
+					my $i=0;
+					foreach my $line (@{$journal->{$j}}) {
+						print $i++,": $line\n";
+					}
 					$j_detailed++;
 				} else {
-					print $link->attr('href')."\n";
-					print $link->as_text."\n";
+					print "0: ",$link->as_text."\n";
+					print "7: http://www.sciencedirect.com",$link->attr('href')."\n";
 					$j_basic++;
 					print STDERR "can't find details for $j\n" if ($debug);
 				}