/[swish]/trunk/spider/swishspider
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/spider/swishspider

Parent Directory Parent Directory | Revision Log Revision Log


Revision 46 - (hide annotations)
Sat Jan 17 23:57:55 2004 UTC (16 years, 9 months ago) by dpavlin
File size: 3462 byte(s)
- moved text/html content filtering to filter.pm to faciliate code re-use
- added progspider which can be used with -S prog to crawl files and
  use filtering subroutines

1 dpavlin 1 #!/usr/local/bin/perl -w
2     use strict;
3    
4     use LWP::UserAgent;
5     use HTTP::Status;
6     use HTML::LinkExtor;
7    
8 dpavlin 46 my $basedir = $0;
9     $basedir =~ s,/[^/]+$,/,;
10     require "$basedir/filter.pm";
11    
12 dpavlin 1 if (scalar(@ARGV) != 2) {
13     print STDERR "Usage: SwishSpider localpath url\n";
14     exit(1);
15     }
16    
17     my $ua = new LWP::UserAgent;
18     $ua->agent( "SwishSpider http://swish-e.org" );
19    
20     my $localpath = shift;
21     my $url = shift;
22    
23     my $no_parent_url;
24     if ($url =~ m/\s/) {
25 dpavlin 40 ($no_parent_url,$url) = split(/\s/,$url,2);
26     # old scheme had URL, no parent and new is reverse
27 dpavlin 42 ($url,$no_parent_url) = ($no_parent_url,$url) if ($no_parent_url =~ m/\Q$url\E/);
28 dpavlin 1 }
29    
30     my $request = new HTTP::Request( "GET", $url );
31     my $response = $ua->simple_request( $request );
32 dpavlin 45 my $urlbase = $response->base;
33     $urlbase =~ s,/[^/]*$,/,; # remove filename
34 dpavlin 1
35     #
36     # Write out important meta-data. This includes the HTTP code. Depending on the
37     # code, we write out other data. Redirects have the location printed, everything
38     # else gets the content-type.
39     #
40     open( RESP, ">$localpath.response" ) || die( "Could not open response file $localpath.response" );
41    
42     print RESP $response->code() . "\n";
43     if( $response->code() == RC_OK ) {
44     print RESP $response->header( "content-type" ) . "\n";
45     } elsif( $response->is_redirect() ) {
46     my $link = $response->header( "location" );
47     if ($no_parent_url) {
48     if ($link =~ m/$no_parent_url/) {
49     # if this URL is below parent URL o.k....
50 dpavlin 40 print RESP "$no_parent_url $link\n";
51 dpavlin 1 } else {
52     # if not, crawl just this page!
53     print RESP "$link $link\n";
54     }
55     } else {
56     print RESP "$link\n";
57     }
58     }
59     close( RESP );
60    
61     #
62     # Write out the actual data assuming the retrieval was succesful. Also, if
63     # we have actual data and it's of type text/html, write out all the links it
64     # refers to
65     #
66     if( $response->code() == RC_OK ) {
67     my $contents = $response->content();
68    
69     open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
70 dpavlin 40 # fixup just HTML files
71     if ($response->header("content-type") =~ "text/html") {
72 dpavlin 46 $contents = filter($contents);
73 dpavlin 40 }
74 dpavlin 1 print CONTENTS $contents;
75     close( CONTENTS );
76    
77     if( $response->header("content-type") =~ "text/html" ) {
78     open( LINKS, ">$localpath.links" ) || die( "Could not open links file $localpath.links\n" );
79     my $p = HTML::LinkExtor->new( \&linkcb, $url );
80     $p->parse( $contents );
81    
82 dpavlin 45
83    
84 dpavlin 1 close( LINKS );
85     }
86     }
87    
88    
89     sub linkcb {
90     my($tag, %links) = @_;
91 dpavlin 40 if (($tag eq "a" || $tag eq "area") && ($links{"href"}) || ($tag eq "frame" && $links{"src"})) {
92     my $link = $links{"href"} || $links{"src"};
93 dpavlin 1
94     #
95     # Remove fragments
96     #
97     $link =~ s/(.*)#.*/$1/;
98    
99     #
100     # Remove ../ This is important because the abs() function
101     # can leave these in and cause never ending loops.
102     #
103     $link =~ s/\.\.\///g;
104    
105 dpavlin 45 if ($link =~ m,javascript:displayWindow\((.+)\),i) {
106     my $arg = $1;
107     $arg =~ s/%([a-f0-9][a-f][0-9])/chr(hex($1))/eg;
108     ($link,undef) = split(',',$arg,2);
109     $link =~ s/^['"]//;
110     $link =~ s/['"]$//;
111     $link = $urlbase.$link;
112     }
113    
114 dpavlin 1 # hack for apostrophe -- changes URL, but should work for most clients.
115     $link =~ s/'/%27/g;
116    
117     # hack for Apache directory listings
118     $link =~ s,/\?[NMSD]=[AD]$,/,g;
119    
120 dpavlin 15 # speedup, skip pictures
121     return if ($link =~ m/\.(gif|jpg|png)/);
122    
123 dpavlin 1 if ($no_parent_url) {
124     if ($link =~ m/$no_parent_url/) {
125 dpavlin 40 print LINKS "$no_parent_url $link\n";
126 dpavlin 1 # print STDERR "using $link\n";
127     # } else {
128     # print STDERR "skipping $link\n";
129     }
130     } else {
131     print LINKS "$link\n";
132     }
133     }
134     }
135    

Properties

Name Value
cvs2svn:cvs-rev 1.8
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26