/[swish]/trunk/spider/swishspider
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/spider/swishspider

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1 - (hide annotations)
Tue Jun 4 06:39:53 2002 UTC (18 years, 4 months ago) by dpavlin
File size: 2899 byte(s)
Initial revision

1 dpavlin 1 #!/usr/local/bin/perl -w
2     use strict;
3    
4     use LWP::UserAgent;
5     use HTTP::Status;
6     use HTML::LinkExtor;
7    
8     if (scalar(@ARGV) != 2) {
9     print STDERR "Usage: SwishSpider localpath url\n";
10     exit(1);
11     }
12    
13     my $ua = new LWP::UserAgent;
14     $ua->agent( "SwishSpider http://swish-e.org" );
15    
16     my $localpath = shift;
17     my $url = shift;
18    
19     my $no_parent_url;
20     if ($url =~ m/\s/) {
21     ($url,$no_parent_url) = split(/\s/,$url,2);
22     }
23    
24     my $request = new HTTP::Request( "GET", $url );
25     my $response = $ua->simple_request( $request );
26    
27     #
28     # Write out important meta-data. This includes the HTTP code. Depending on the
29     # code, we write out other data. Redirects have the location printed, everything
30     # else gets the content-type.
31     #
32     open( RESP, ">$localpath.response" ) || die( "Could not open response file $localpath.response" );
33    
34     print RESP $response->code() . "\n";
35     if( $response->code() == RC_OK ) {
36     print RESP $response->header( "content-type" ) . "\n";
37     } elsif( $response->is_redirect() ) {
38     my $link = $response->header( "location" );
39     if ($no_parent_url) {
40     if ($link =~ m/$no_parent_url/) {
41     # if this URL is below parent URL o.k....
42     print RESP "$link $no_parent_url\n";
43     } else {
44     # if not, crawl just this page!
45     print RESP "$link $link\n";
46     }
47     } else {
48     print RESP "$link\n";
49     }
50     }
51     close( RESP );
52    
53     #
54     # Write out the actual data assuming the retrieval was succesful. Also, if
55     # we have actual data and it's of type text/html, write out all the links it
56     # refers to
57     #
58     if( $response->code() == RC_OK ) {
59     my $contents = $response->content();
60    
61     open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
62     $contents =~ s,<noindex>.+?</noindex>,,isg;
63     $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;
64     $contents =~ s,<script>.+?</script>,,isg;
65     $contents =~ tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
66     print CONTENTS $contents;
67     close( CONTENTS );
68    
69     if( $response->header("content-type") =~ "text/html" ) {
70     open( LINKS, ">$localpath.links" ) || die( "Could not open links file $localpath.links\n" );
71     my $p = HTML::LinkExtor->new( \&linkcb, $url );
72     $p->parse( $contents );
73    
74     close( LINKS );
75     }
76     }
77    
78    
79     sub linkcb {
80     my($tag, %links) = @_;
81     if (($tag eq "a") && ($links{"href"})) {
82     my $link = $links{"href"};
83    
84     #
85     # Remove fragments
86     #
87     $link =~ s/(.*)#.*/$1/;
88    
89     #
90     # Remove ../ This is important because the abs() function
91     # can leave these in and cause never ending loops.
92     #
93     $link =~ s/\.\.\///g;
94    
95     # hack for apostrophe -- changes URL, but should work for most clients.
96     $link =~ s/'/%27/g;
97    
98     # hack for Apache directory listings
99     $link =~ s,/\?[NMSD]=[AD]$,/,g;
100    
101     if ($no_parent_url) {
102     if ($link =~ m/$no_parent_url/) {
103     print LINKS "$link $no_parent_url\n";
104     # print STDERR "using $link\n";
105     # } else {
106     # print STDERR "skipping $link\n";
107     }
108     } else {
109     print LINKS "$link\n";
110     }
111     }
112     }
113    

Properties

Name Value
cvs2svn:cvs-rev 1.1
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26