/[swish]/trunk/spider/swishspider
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/spider/swishspider

Parent Directory Parent Directory | Revision Log Revision Log


Revision 45 - (hide annotations)
Wed Nov 19 12:07:07 2003 UTC (16 years, 11 months ago) by dpavlin
File size: 4871 byte(s)
fixes and improvements

1 dpavlin 1 #!/usr/local/bin/perl -w
2     use strict;
3    
4     use LWP::UserAgent;
5     use HTTP::Status;
6     use HTML::LinkExtor;
7    
8     if (scalar(@ARGV) != 2) {
9     print STDERR "Usage: SwishSpider localpath url\n";
10     exit(1);
11     }
12    
13     my $ua = new LWP::UserAgent;
14     $ua->agent( "SwishSpider http://swish-e.org" );
15    
16     my $localpath = shift;
17     my $url = shift;
18    
19     my $no_parent_url;
20     if ($url =~ m/\s/) {
21 dpavlin 40 ($no_parent_url,$url) = split(/\s/,$url,2);
22     # old scheme had URL, no parent and new is reverse
23 dpavlin 42 ($url,$no_parent_url) = ($no_parent_url,$url) if ($no_parent_url =~ m/\Q$url\E/);
24 dpavlin 1 }
25    
26     my $request = new HTTP::Request( "GET", $url );
27     my $response = $ua->simple_request( $request );
28 dpavlin 45 my $urlbase = $response->base;
29     $urlbase =~ s,/[^/]*$,/,; # remove filename
30 dpavlin 1
31     #
32     # Write out important meta-data. This includes the HTTP code. Depending on the
33     # code, we write out other data. Redirects have the location printed, everything
34     # else gets the content-type.
35     #
36     open( RESP, ">$localpath.response" ) || die( "Could not open response file $localpath.response" );
37    
38     print RESP $response->code() . "\n";
39     if( $response->code() == RC_OK ) {
40     print RESP $response->header( "content-type" ) . "\n";
41     } elsif( $response->is_redirect() ) {
42     my $link = $response->header( "location" );
43     if ($no_parent_url) {
44     if ($link =~ m/$no_parent_url/) {
45     # if this URL is below parent URL o.k....
46 dpavlin 40 print RESP "$no_parent_url $link\n";
47 dpavlin 1 } else {
48     # if not, crawl just this page!
49     print RESP "$link $link\n";
50     }
51     } else {
52     print RESP "$link\n";
53     }
54     }
55     close( RESP );
56    
57     #
58     # Write out the actual data assuming the retrieval was succesful. Also, if
59     # we have actual data and it's of type text/html, write out all the links it
60     # refers to
61     #
62     if( $response->code() == RC_OK ) {
63     my $contents = $response->content();
64    
65     open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
66 dpavlin 40 # fixup just HTML files
67     if ($response->header("content-type") =~ "text/html") {
68     # if you don't want content to be indexed, include it in
69     # <noindex> foobar </noindex> tags or surround it with comments
70     # <!-- noindex --> foobar <!-- /noindex -->
71 dpavlin 42 # <!-- noindex --> foobar <!-- index --> (also supported by swish)
72 dpavlin 40 $contents =~ s,<noindex>.+?</noindex>,,isg;
73     $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;
74 dpavlin 42 $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*index\s*-->,,isg;
75 dpavlin 40 # this will remove all script from indexing content
76     $contents =~ s,<script>.+?</script>,,isg;
77     # remap Windows charset to ISO-8859-2
78     $contents =~ tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
79     # this will fix badly formatted html in form:
80     # <head><title>some text</title
81     # ></head>
82     # which will confuse indexer (or libxml2?)
83     $contents =~ s/[\n\r]^(>)/$1\n/msg;
84     # remove comments between <html> and <head> texi2html inserts them
85     # there and swish can't find document title then (libxml or swish bug?)
86     while ($contents =~ s/(<html>.*)<!--.*?-->(.*<head>)/$1$2/msi) { };
87 dpavlin 45
88     # remote TPJ left column
89     if ($contents =~ s,<!-- BEGIN LEFT SIDE BAR CELL -->.+?<!-- END LEFT SIDE BAR CELL -->,,isg) {
90     my $title;
91     # extract title and add to title
92     if ($contents =~ m,<!-- the article goes here -->\s*<h2[^>]*>(.+?)</h2>,si) {
93     $title = $1;
94     } elsif ($contents =~ m,<h1[^>]*>(.+?)</h1>,is) {
95     $title = $1;
96     } elsif ($contents =~ m,<h2[^>]*>(.+?)</h2>,is) {
97     $title = $1;
98     } else {
99     $title = "no detail title";
100     }
101     $contents =~ s,(<title>)([^<]+)(</title>),$1$2: $title$3,gsi if ($title);
102    
103     }
104 dpavlin 40 }
105 dpavlin 1 print CONTENTS $contents;
106     close( CONTENTS );
107    
108     if( $response->header("content-type") =~ "text/html" ) {
109     open( LINKS, ">$localpath.links" ) || die( "Could not open links file $localpath.links\n" );
110     my $p = HTML::LinkExtor->new( \&linkcb, $url );
111     $p->parse( $contents );
112    
113 dpavlin 45
114    
115 dpavlin 1 close( LINKS );
116     }
117     }
118    
119    
120     sub linkcb {
121     my($tag, %links) = @_;
122 dpavlin 40 if (($tag eq "a" || $tag eq "area") && ($links{"href"}) || ($tag eq "frame" && $links{"src"})) {
123     my $link = $links{"href"} || $links{"src"};
124 dpavlin 1
125     #
126     # Remove fragments
127     #
128     $link =~ s/(.*)#.*/$1/;
129    
130     #
131     # Remove ../ This is important because the abs() function
132     # can leave these in and cause never ending loops.
133     #
134     $link =~ s/\.\.\///g;
135    
136 dpavlin 45 if ($link =~ m,javascript:displayWindow\((.+)\),i) {
137     my $arg = $1;
138     $arg =~ s/%([a-f0-9][a-f][0-9])/chr(hex($1))/eg;
139     ($link,undef) = split(',',$arg,2);
140     $link =~ s/^['"]//;
141     $link =~ s/['"]$//;
142     $link = $urlbase.$link;
143     }
144    
145 dpavlin 1 # hack for apostrophe -- changes URL, but should work for most clients.
146     $link =~ s/'/%27/g;
147    
148     # hack for Apache directory listings
149     $link =~ s,/\?[NMSD]=[AD]$,/,g;
150    
151 dpavlin 15 # speedup, skip pictures
152     return if ($link =~ m/\.(gif|jpg|png)/);
153    
154 dpavlin 1 if ($no_parent_url) {
155     if ($link =~ m/$no_parent_url/) {
156 dpavlin 40 print LINKS "$no_parent_url $link\n";
157 dpavlin 1 # print STDERR "using $link\n";
158     # } else {
159     # print STDERR "skipping $link\n";
160     }
161     } else {
162     print LINKS "$link\n";
163     }
164     }
165     }
166    

Properties

Name Value
cvs2svn:cvs-rev 1.7
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26