Contents of /trunk/spider/swishspider

#!/usr/local/bin/perl -w
use strict;

use LWP::UserAgent;
use HTTP::Status;
use HTML::LinkExtor;

if (scalar(@ARGV) != 2) {
    print STDERR "Usage: SwishSpider localpath url\n";
    exit(1);
}

my $ua = new LWP::UserAgent;
$ua->agent( "SwishSpider http://swish-e.org" );

my $localpath = shift;
my $url = shift;

my $no_parent_url;
if ($url =~ m/\s/) {
        ($no_parent_url,$url) = split(/\s/,$url,2);
        # old scheme had URL, no parent and new is reverse
        ($url,$no_parent_url) = ($no_parent_url,$url) if ($no_parent_url =~ m/\Q$url\E/);
}

my $request = new HTTP::Request( "GET", $url );
my $response = $ua->simple_request( $request );
my $urlbase = $response->base;
$urlbase =~ s,/[^/]*$,/,;       # remove filename

#
# Write out important meta-data.  This includes the HTTP code.  Depending on the
# code, we write out other data.  Redirects have the location printed, everything
# else gets the content-type.
#
open( RESP, ">$localpath.response" ) || die( "Could not open response file $localpath.response" );

print RESP $response->code() . "\n";
if( $response->code() == RC_OK ) {
    print RESP $response->header( "content-type" ) . "\n";
} elsif( $response->is_redirect() ) {
        my $link = $response->header( "location" );
        if ($no_parent_url) {
                if ($link =~ m/$no_parent_url/) {
                        # if this URL is below parent URL o.k....
                        print RESP "$no_parent_url $link\n";
                } else {
                        # if not, crawl just this page!
                        print RESP "$link $link\n";
                }
        } else {
                print RESP "$link\n";
        }
}
close( RESP );

#
# Write out the actual data assuming the retrieval was succesful.  Also, if
# we have actual data and it's of type text/html, write out all the links it
# refers to
#
if( $response->code() == RC_OK ) {
    my $contents = $response->content();

    open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
    # fixup just HTML files
    if ($response->header("content-type") =~ "text/html") {
        # if you don't want content to be indexed, include it in
        # <noindex> foobar </noindex> tags or surround it with comments
        # <!-- noindex --> foobar <!-- /noindex -->
        # <!-- noindex --> foobar <!-- index --> (also supported by swish)
        $contents =~ s,<noindex>.+?</noindex>,,isg;
        $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;
        $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*index\s*-->,,isg;
        # this will remove all script from indexing content
        $contents =~ s,<script>.+?</script>,,isg;
        # remap Windows charset to ISO-8859-2
        $contents =~ tr/ðèæÐÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
        # this will fix badly formatted html in form:
        # <head><title>some text</title
        # ></head>
        # which will confuse indexer (or libxml2?)
        $contents =~ s/[\n\r]^(>)/$1\n/msg;
        # remove comments between <html> and <head> texi2html inserts them
        # there and swish can't find document title then (libxml or swish bug?)
        while ($contents =~ s/(<html>.*)<!--.*?-->(.*<head>)/$1$2/msi) { };

        # remote TPJ left column
        if ($contents =~ s,<!-- BEGIN LEFT SIDE BAR CELL -->.+?<!-- END LEFT SIDE BAR CELL -->,,isg) {
                my $title;
                # extract title and add to title
                if ($contents =~ m,<!-- the article goes here -->\s*<h2[^>]*>(.+?)</h2>,si) {
                        $title = $1;
                } elsif ($contents =~ m,<h1[^>]*>(.+?)</h1>,is) {
                        $title = $1;
                } elsif ($contents =~ m,<h2[^>]*>(.+?)</h2>,is) {
                        $title = $1;
                } else {
                        $title = "no detail title";
                }
                $contents =~ s,(<title>)([^<]+)(</title>),$1$2: $title$3,gsi if ($title);

        }
    }
    print CONTENTS $contents;
    close( CONTENTS );

    if( $response->header("content-type") =~ "text/html" ) {
        open( LINKS, ">$localpath.links" ) || die( "Could not open links file $localpath.links\n" );
        my $p = HTML::LinkExtor->new( \&linkcb, $url );
        $p->parse( $contents );


        close( LINKS );
    }
}


sub linkcb {
    my($tag, %links) = @_;
    if (($tag eq "a" || $tag eq "area") && ($links{"href"}) || ($tag eq "frame" && $links{"src"})) {
        my $link = $links{"href"} || $links{"src"};

        #
        # Remove fragments
        #
        $link =~ s/(.*)#.*/$1/;

        #
        # Remove ../  This is important because the abs() function
        # can leave these in and cause never ending loops.
        #
        $link =~ s/\.\.\///g;

        if ($link =~ m,javascript:displayWindow\((.+)\),i) {
                my $arg = $1;
                $arg =~ s/%([a-f0-9][a-f][0-9])/chr(hex($1))/eg;
                ($link,undef) = split(',',$arg,2);
                $link =~ s/^['"]//;
                $link =~ s/['"]$//;
                $link = $urlbase.$link;
        }

        # hack for apostrophe -- changes URL, but should work for most clients.
        $link =~ s/'/%27/g;

        # hack for Apache directory listings
        $link =~ s,/\?[NMSD]=[AD]$,/,g;

        # speedup, skip pictures
        return if ($link =~ m/\.(gif|jpg|png)/);

        if ($no_parent_url) {
                        if ($link =~ m/$no_parent_url/) {
                                print LINKS "$no_parent_url $link\n";
#                               print STDERR "using $link\n";
#                       } else {
#                               print STDERR "skipping $link\n";
                        }
        } else {
                print LINKS "$link\n";
        }
    }
}

1	#!/usr/local/bin/perl -w
2	use strict;
3
4	use LWP::UserAgent;
5	use HTTP::Status;
6	use HTML::LinkExtor;
7
8	if (scalar(@ARGV) != 2) {
9	print STDERR "Usage: SwishSpider localpath url\n";
10	exit(1);
11	}
12
13	my $ua = new LWP::UserAgent;
14	$ua->agent( "SwishSpider http://swish-e.org" );
15
16	my $localpath = shift;
17	my $url = shift;
18
19	my $no_parent_url;
20	if ($url =~ m/\s/) {
21	($no_parent_url,$url) = split(/\s/,$url,2);
22	# old scheme had URL, no parent and new is reverse
23	($url,$no_parent_url) = ($no_parent_url,$url) if ($no_parent_url =~ m/\Q$url\E/);
24	}
25
26	my $request = new HTTP::Request( "GET", $url );
27	my $response = $ua->simple_request( $request );
28	my $urlbase = $response->base;
29	$urlbase =~ s,/[^/]*$,/,; # remove filename
30
31	#
32	# Write out important meta-data. This includes the HTTP code. Depending on the
33	# code, we write out other data. Redirects have the location printed, everything
34	# else gets the content-type.
35	#
36	open( RESP, ">$localpath.response" ) \|\| die( "Could not open response file $localpath.response" );
37
38	print RESP $response->code() . "\n";
39	if( $response->code() == RC_OK ) {
40	print RESP $response->header( "content-type" ) . "\n";
41	} elsif( $response->is_redirect() ) {
42	my $link = $response->header( "location" );
43	if ($no_parent_url) {
44	if ($link =~ m/$no_parent_url/) {
45	# if this URL is below parent URL o.k....
46	print RESP "$no_parent_url $link\n";
47	} else {
48	# if not, crawl just this page!
49	print RESP "$link $link\n";
50	}
51	} else {
52	print RESP "$link\n";
53	}
54	}
55	close( RESP );
56
57	#
58	# Write out the actual data assuming the retrieval was succesful. Also, if
59	# we have actual data and it's of type text/html, write out all the links it
60	# refers to
61	#
62	if( $response->code() == RC_OK ) {
63	my $contents = $response->content();
64
65	open( CONTENTS, ">$localpath.contents" ) \|\| die( "Could not open contents file $localpath.contents\n" );
66	# fixup just HTML files
67	if ($response->header("content-type") =~ "text/html") {
68	# if you don't want content to be indexed, include it in
69	# <noindex> foobar </noindex> tags or surround it with comments
70	# <!-- noindex --> foobar <!-- /noindex -->
71	# <!-- noindex --> foobar <!-- index --> (also supported by swish)
72	$contents =~ s,<noindex>.+?</noindex>,,isg;
73	$contents =~ s,<!--\snoindex\s-->.+?<!--\s/noindex\s-->,,isg;
74	$contents =~ s,<!--\snoindex\s-->.+?<!--\sindex\s-->,,isg;
75	# this will remove all script from indexing content
76	$contents =~ s,<script>.+?</script>,,isg;
77	# remap Windows charset to ISO-8859-2
78	$contents =~ tr/ðèæÐÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
79	# this will fix badly formatted html in form:
80	# <head><title>some text</title
81	# ></head>
82	# which will confuse indexer (or libxml2?)
83	$contents =~ s/[\n\r]^(>)/$1\n/msg;
84	# remove comments between <html> and <head> texi2html inserts them
85	# there and swish can't find document title then (libxml or swish bug?)
86	while ($contents =~ s/(<html>.)<!--.?-->(.*<head>)/$1$2/msi) { };
87
88	# remote TPJ left column
89	if ($contents =~ s,<!-- BEGIN LEFT SIDE BAR CELL -->.+?<!-- END LEFT SIDE BAR CELL -->,,isg) {
90	my $title;
91	# extract title and add to title
92	if ($contents =~ m,<!-- the article goes here -->\s<h2[^>]>(.+?)</h2>,si) {
93	$title = $1;
94	} elsif ($contents =~ m,<h1[^>]*>(.+?)</h1>,is) {
95	$title = $1;
96	} elsif ($contents =~ m,<h2[^>]*>(.+?)</h2>,is) {
97	$title = $1;
98	} else {
99	$title = "no detail title";
100	}
101	$contents =~ s,(<title>)([^<]+)(</title>),$1$2: $title$3,gsi if ($title);
102
103	}
104	}
105	print CONTENTS $contents;
106	close( CONTENTS );
107
108	if( $response->header("content-type") =~ "text/html" ) {
109	open( LINKS, ">$localpath.links" ) \|\| die( "Could not open links file $localpath.links\n" );
110	my $p = HTML::LinkExtor->new( \&linkcb, $url );
111	$p->parse( $contents );
112
113
114
115	close( LINKS );
116	}
117	}
118
119
120	sub linkcb {
121	my($tag, %links) = @_;
122	if (($tag eq "a" \|\| $tag eq "area") && ($links{"href"}) \|\| ($tag eq "frame" && $links{"src"})) {
123	my $link = $links{"href"} \|\| $links{"src"};
124
125	#
126	# Remove fragments
127	#
128	$link =~ s/(.)#./$1/;
129
130	#
131	# Remove ../ This is important because the abs() function
132	# can leave these in and cause never ending loops.
133	#
134	$link =~ s/\.\.\///g;
135
136	if ($link =~ m,javascript:displayWindow\((.+)\),i) {
137	my $arg = $1;
138	$arg =~ s/%([a-f0-9][a-f][0-9])/chr(hex($1))/eg;
139	($link,undef) = split(',',$arg,2);
140	$link =~ s/^['"]//;
141	$link =~ s/['"]$//;
142	$link = $urlbase.$link;
143	}
144
145	# hack for apostrophe -- changes URL, but should work for most clients.
146	$link =~ s/'/%27/g;
147
148	# hack for Apache directory listings
149	$link =~ s,/\?[NMSD]=[AD]$,/,g;
150
151	# speedup, skip pictures
152	return if ($link =~ m/\.(gif\|jpg\|png)/);
153
154	if ($no_parent_url) {
155	if ($link =~ m/$no_parent_url/) {
156	print LINKS "$no_parent_url $link\n";
157	# print STDERR "using $link\n";
158	# } else {
159	# print STDERR "skipping $link\n";
160	}
161	} else {
162	print LINKS "$link\n";
163	}
164	}
165	}
166