/[swish]/trunk/spider/swishspider
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/spider/swishspider

Parent Directory Parent Directory | Revision Log Revision Log


Revision 15 - (show annotations)
Sun Mar 16 21:31:55 2003 UTC (21 years ago) by dpavlin
File size: 2986 byte(s)
support for image map and skip pictures (speedup)

1 #!/usr/local/bin/perl -w
2 use strict;
3
4 use LWP::UserAgent;
5 use HTTP::Status;
6 use HTML::LinkExtor;
7
8 if (scalar(@ARGV) != 2) {
9 print STDERR "Usage: SwishSpider localpath url\n";
10 exit(1);
11 }
12
13 my $ua = new LWP::UserAgent;
14 $ua->agent( "SwishSpider http://swish-e.org" );
15
16 my $localpath = shift;
17 my $url = shift;
18
19 my $no_parent_url;
20 if ($url =~ m/\s/) {
21 ($url,$no_parent_url) = split(/\s/,$url,2);
22 }
23
24 my $request = new HTTP::Request( "GET", $url );
25 my $response = $ua->simple_request( $request );
26
27 #
28 # Write out important meta-data. This includes the HTTP code. Depending on the
29 # code, we write out other data. Redirects have the location printed, everything
30 # else gets the content-type.
31 #
32 open( RESP, ">$localpath.response" ) || die( "Could not open response file $localpath.response" );
33
34 print RESP $response->code() . "\n";
35 if( $response->code() == RC_OK ) {
36 print RESP $response->header( "content-type" ) . "\n";
37 } elsif( $response->is_redirect() ) {
38 my $link = $response->header( "location" );
39 if ($no_parent_url) {
40 if ($link =~ m/$no_parent_url/) {
41 # if this URL is below parent URL o.k....
42 print RESP "$link $no_parent_url\n";
43 } else {
44 # if not, crawl just this page!
45 print RESP "$link $link\n";
46 }
47 } else {
48 print RESP "$link\n";
49 }
50 }
51 close( RESP );
52
53 #
54 # Write out the actual data assuming the retrieval was succesful. Also, if
55 # we have actual data and it's of type text/html, write out all the links it
56 # refers to
57 #
58 if( $response->code() == RC_OK ) {
59 my $contents = $response->content();
60
61 open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
62 $contents =~ s,<noindex>.+?</noindex>,,isg;
63 $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;
64 $contents =~ s,<script>.+?</script>,,isg;
65 $contents =~ tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
66 print CONTENTS $contents;
67 close( CONTENTS );
68
69 if( $response->header("content-type") =~ "text/html" ) {
70 open( LINKS, ">$localpath.links" ) || die( "Could not open links file $localpath.links\n" );
71 my $p = HTML::LinkExtor->new( \&linkcb, $url );
72 $p->parse( $contents );
73
74 close( LINKS );
75 }
76 }
77
78
79 sub linkcb {
80 my($tag, %links) = @_;
81 if (($tag eq "a" || $tag eq "area") && ($links{"href"})) {
82 my $link = $links{"href"};
83
84 #
85 # Remove fragments
86 #
87 $link =~ s/(.*)#.*/$1/;
88
89 #
90 # Remove ../ This is important because the abs() function
91 # can leave these in and cause never ending loops.
92 #
93 $link =~ s/\.\.\///g;
94
95 # hack for apostrophe -- changes URL, but should work for most clients.
96 $link =~ s/'/%27/g;
97
98 # hack for Apache directory listings
99 $link =~ s,/\?[NMSD]=[AD]$,/,g;
100
101 # speedup, skip pictures
102 return if ($link =~ m/\.(gif|jpg|png)/);
103
104 if ($no_parent_url) {
105 if ($link =~ m/$no_parent_url/) {
106 print LINKS "$link $no_parent_url\n";
107 # print STDERR "using $link\n";
108 # } else {
109 # print STDERR "skipping $link\n";
110 }
111 } else {
112 print LINKS "$link\n";
113 }
114 }
115 }
116

Properties

Name Value
cvs2svn:cvs-rev 1.2
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26