/[swish]/trunk/spider/swishspider
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/spider/swishspider

Parent Directory Parent Directory | Revision Log Revision Log


Revision 42 - (show annotations)
Tue Jul 29 10:40:58 2003 UTC (17 years, 2 months ago) by dpavlin
File size: 4033 byte(s)
better handling of chars in URL, support for
<!-- noindex -->, <!-- index --> which is supported natively in swish 2.4

1 #!/usr/local/bin/perl -w
2 use strict;
3
4 use LWP::UserAgent;
5 use HTTP::Status;
6 use HTML::LinkExtor;
7
8 if (scalar(@ARGV) != 2) {
9 print STDERR "Usage: SwishSpider localpath url\n";
10 exit(1);
11 }
12
13 my $ua = new LWP::UserAgent;
14 $ua->agent( "SwishSpider http://swish-e.org" );
15
16 my $localpath = shift;
17 my $url = shift;
18
19 my $no_parent_url;
20 if ($url =~ m/\s/) {
21 ($no_parent_url,$url) = split(/\s/,$url,2);
22 # old scheme had URL, no parent and new is reverse
23 ($url,$no_parent_url) = ($no_parent_url,$url) if ($no_parent_url =~ m/\Q$url\E/);
24 }
25
26 my $request = new HTTP::Request( "GET", $url );
27 my $response = $ua->simple_request( $request );
28
29 #
30 # Write out important meta-data. This includes the HTTP code. Depending on the
31 # code, we write out other data. Redirects have the location printed, everything
32 # else gets the content-type.
33 #
34 open( RESP, ">$localpath.response" ) || die( "Could not open response file $localpath.response" );
35
36 print RESP $response->code() . "\n";
37 if( $response->code() == RC_OK ) {
38 print RESP $response->header( "content-type" ) . "\n";
39 } elsif( $response->is_redirect() ) {
40 my $link = $response->header( "location" );
41 if ($no_parent_url) {
42 if ($link =~ m/$no_parent_url/) {
43 # if this URL is below parent URL o.k....
44 print RESP "$no_parent_url $link\n";
45 } else {
46 # if not, crawl just this page!
47 print RESP "$link $link\n";
48 }
49 } else {
50 print RESP "$link\n";
51 }
52 }
53 close( RESP );
54
55 #
56 # Write out the actual data assuming the retrieval was succesful. Also, if
57 # we have actual data and it's of type text/html, write out all the links it
58 # refers to
59 #
60 if( $response->code() == RC_OK ) {
61 my $contents = $response->content();
62
63 open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
64 # fixup just HTML files
65 if ($response->header("content-type") =~ "text/html") {
66 # if you don't want content to be indexed, include it in
67 # <noindex> foobar </noindex> tags or surround it with comments
68 # <!-- noindex --> foobar <!-- /noindex -->
69 # <!-- noindex --> foobar <!-- index --> (also supported by swish)
70 $contents =~ s,<noindex>.+?</noindex>,,isg;
71 $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;
72 $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*index\s*-->,,isg;
73 # this will remove all script from indexing content
74 $contents =~ s,<script>.+?</script>,,isg;
75 # remap Windows charset to ISO-8859-2
76 $contents =~ tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
77 # this will fix badly formatted html in form:
78 # <head><title>some text</title
79 # ></head>
80 # which will confuse indexer (or libxml2?)
81 $contents =~ s/[\n\r]^(>)/$1\n/msg;
82 # remove comments between <html> and <head> texi2html inserts them
83 # there and swish can't find document title then (libxml or swish bug?)
84 while ($contents =~ s/(<html>.*)<!--.*?-->(.*<head>)/$1$2/msi) { };
85 }
86 print CONTENTS $contents;
87 close( CONTENTS );
88
89 if( $response->header("content-type") =~ "text/html" ) {
90 open( LINKS, ">$localpath.links" ) || die( "Could not open links file $localpath.links\n" );
91 my $p = HTML::LinkExtor->new( \&linkcb, $url );
92 $p->parse( $contents );
93
94 close( LINKS );
95 }
96 }
97
98
99 sub linkcb {
100 my($tag, %links) = @_;
101 if (($tag eq "a" || $tag eq "area") && ($links{"href"}) || ($tag eq "frame" && $links{"src"})) {
102 my $link = $links{"href"} || $links{"src"};
103
104 #
105 # Remove fragments
106 #
107 $link =~ s/(.*)#.*/$1/;
108
109 #
110 # Remove ../ This is important because the abs() function
111 # can leave these in and cause never ending loops.
112 #
113 $link =~ s/\.\.\///g;
114
115 # hack for apostrophe -- changes URL, but should work for most clients.
116 $link =~ s/'/%27/g;
117
118 # hack for Apache directory listings
119 $link =~ s,/\?[NMSD]=[AD]$,/,g;
120
121 # speedup, skip pictures
122 return if ($link =~ m/\.(gif|jpg|png)/);
123
124 if ($no_parent_url) {
125 if ($link =~ m/$no_parent_url/) {
126 print LINKS "$no_parent_url $link\n";
127 # print STDERR "using $link\n";
128 # } else {
129 # print STDERR "skipping $link\n";
130 }
131 } else {
132 print LINKS "$link\n";
133 }
134 }
135 }
136

Properties

Name Value
cvs2svn:cvs-rev 1.6
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26