Annotation of /trunk/spider/swishspider

#!/usr/local/bin/perl -w
use strict;

use LWP::UserAgent;
use HTTP::Status;
use HTML::LinkExtor;

#
# Usage: SwishSpider localpath url
#
# Fetches a single URL (without following redirects) and writes up to three
# files next to each other:
#
#   localpath.response  - HTTP status code on the first line, followed by the
#                         content-type (on 200) or the redirect target (on 3xx)
#   localpath.contents  - the filtered document body (only on 200)
#   localpath.links     - links found in the document (only on 200, text/html)
#
# The url argument may carry a second, whitespace separated value.  When it
# is present it is kept in $no_parent_url and used to restrict which links
# and redirect targets are reported.
#
if (scalar(@ARGV) != 2) {
    print STDERR "Usage: SwishSpider localpath url\n";
    exit(1);
}

my $ua = LWP::UserAgent->new;
$ua->agent( "SwishSpider http://swish-e.org" );

my $localpath = shift;
my $url = shift;

# File-scoped on purpose: linkcb() reads $no_parent_url as well.
my $no_parent_url;
if ($url =~ m/\s/) {
    ($url, $no_parent_url) = split(/\s/, $url, 2);
}

# simple_request() is used so that a redirect is reported to the caller
# (via the response file) rather than being followed here.
my $request = HTTP::Request->new( "GET", $url );
my $response = $ua->simple_request( $request );

# A server may omit Content-Type; use an empty string so the output format
# stays the same and no "uninitialized" warning is raised under -w.
my $content_type = $response->header( "content-type" );
$content_type = "" unless defined $content_type;

#
# Write out important meta-data.  This includes the HTTP code.  Depending on the
# code, we write out other data.  Redirects have the location printed, everything
# else gets the content-type.
#
# Three-argument open keeps characters in $localpath from being interpreted
# as part of the open mode.
open( my $resp_fh, '>', "$localpath.response" )
    or die( "Could not open response file $localpath.response: $!\n" );

print $resp_fh $response->code() . "\n";
if( $response->code() == RC_OK ) {
    print $resp_fh $content_type . "\n";
} elsif( $response->is_redirect() ) {
    # A redirect without a Location header is treated as an empty target.
    my $link = $response->header( "location" );
    $link = "" unless defined $link;

    if ($no_parent_url) {
        # \Q...\E: the parent URL is literal text, not a pattern; without
        # quoting, characters such as "." and "?" would act as regex operators.
        if ($link =~ m/\Q$no_parent_url\E/) {
            # if this URL is below parent URL o.k....
            print $resp_fh "$link $no_parent_url\n";
        } else {
            # if not, crawl just this page!
            print $resp_fh "$link $link\n";
        }
    } else {
        print $resp_fh "$link\n";
    }
}
# Buffered write errors only surface when the handle is closed.
close( $resp_fh )
    or die( "Could not close response file $localpath.response: $!\n" );

#
# Write out the actual data assuming the retrieval was successful.  Also, if
# we have actual data and it's of type text/html, write out all the links it
# refers to
#
if( $response->code() == RC_OK ) {
    my $contents = $response->content();

    open( my $contents_fh, '>', "$localpath.contents" )
        or die( "Could not open contents file $localpath.contents: $!\n" );

    # Drop regions the page author excluded from indexing.
    $contents =~ s,<noindex>.+?</noindex>,,isg;
    $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;

    # Drop scripts, including <script ...> tags that carry attributes and
    # script elements with an empty body.
    $contents =~ s,<script\b[^>]*>.*?</script>,,isg;

    # 1250 -> iso8859-2.  Only four letter pairs differ between the two
    # code pages (s/z with caron, lower and upper case); they are written
    # as byte escapes so the mapping does not depend on how this source
    # file itself is encoded:
    #   0x9a -> 0xb9, 0x9e -> 0xbe, 0x8a -> 0xa9, 0x8e -> 0xae
    $contents =~ tr/\x9a\x9e\x8a\x8e/\xb9\xbe\xa9\xae/;

    print $contents_fh $contents;
    close( $contents_fh )
        or die( "Could not close contents file $localpath.contents: $!\n" );

    if( $content_type =~ m{text/html}i ) {
        # LINKS stays a bareword handle because linkcb() prints to it.
        open( LINKS, '>', "$localpath.links" )
            or die( "Could not open links file $localpath.links: $!\n" );

        # Links are resolved against $url and handed to linkcb().
        my $p = HTML::LinkExtor->new( \&linkcb, $url );
        $p->parse( $contents );
        $p->eof();      # signal end of document so buffered input is processed

        close( LINKS )
            or die( "Could not close links file $localpath.links: $!\n" );
    }
}


#
# linkcb( $tag, %links )
#
# Link callback handed to HTML::LinkExtor.  For every <a href="..."> it
# normalises the link and prints it to the LINKS file handle, one per line.
# When the file-level $no_parent_url is set, only links that contain that
# URL are printed, and the URL is appended to the line after a space; all
# other links are silently skipped.
#
#   $tag    - lower-case tag name reported by the parser
#   %links  - attribute name => link value pairs for that tag
#
# Relies on LINKS being open for writing and on $no_parent_url from the
# enclosing file.
#
sub linkcb {
    my($tag, %links) = @_;

    # Only anchors with a usable href are of interest.
    return unless ($tag eq "a") && ($links{"href"});

    my $link = $links{"href"};

    #
    # Remove fragments (everything from the first '#' onwards)
    #
    $link =~ s/#.*//s;

    #
    # Remove ../  This is important because the abs() function
    # can leave these in and cause never ending loops.
    #
    $link =~ s/\.\.\///g;

    # hack for apostrophe -- changes URL, but should work for most clients.
    $link =~ s/'/%27/g;

    # hack for Apache directory listings
    $link =~ s,/\?[NMSD]=[AD]$,/,g;

    if ($no_parent_url) {
        # \Q...\E: the parent URL is literal text, not a pattern; without
        # quoting, characters such as "." and "?" would act as regex operators.
        if ($link =~ m/\Q$no_parent_url\E/) {
            print LINKS "$link $no_parent_url\n";
        }
    } else {
        print LINKS "$link\n";
    }

    return;
}

1	dpavlin	1	#!/usr/local/bin/perl -w
2			use strict;
3
4			use LWP::UserAgent;
5			use HTTP::Status;
6			use HTML::LinkExtor;
7
8			if (scalar(@ARGV) != 2) {
9			print STDERR "Usage: SwishSpider localpath url\n";
10			exit(1);
11			}
12
13			my $ua = new LWP::UserAgent;
14			$ua->agent( "SwishSpider http://swish-e.org" );
15
16			my $localpath = shift;
17			my $url = shift;
18
19			my $no_parent_url;
20			if ($url =~ m/\s/) {
21			($url,$no_parent_url) = split(/\s/,$url,2);
22			}
23
24			my $request = new HTTP::Request( "GET", $url );
25			my $response = $ua->simple_request( $request );
26
27			#
28			# Write out important meta-data. This includes the HTTP code. Depending on the
29			# code, we write out other data. Redirects have the location printed, everything
30			# else gets the content-type.
31			#
32			open( RESP, ">$localpath.response" ) || die( "Could not open response file $localpath.response" );
33
34			print RESP $response->code() . "\n";
35			if( $response->code() == RC_OK ) {
36			print RESP $response->header( "content-type" ) . "\n";
37			} elsif( $response->is_redirect() ) {
38			my $link = $response->header( "location" );
39			if ($no_parent_url) {
40			if ($link =~ m/$no_parent_url/) {
41			# if this URL is below parent URL o.k....
42			print RESP "$link $no_parent_url\n";
43			} else {
44			# if not, crawl just this page!
45			print RESP "$link $link\n";
46			}
47			} else {
48			print RESP "$link\n";
49			}
50			}
51			close( RESP );
52
53			#
54			# Write out the actual data assuming the retrieval was succesful. Also, if
55			# we have actual data and it's of type text/html, write out all the links it
56			# refers to
57			#
58			if( $response->code() == RC_OK ) {
59			my $contents = $response->content();
60
61			open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
62			$contents =~ s,<noindex>.+?</noindex>,,isg;
63			$contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;
64			$contents =~ s,<script>.+?</script>,,isg;
65			$contents =~ tr/ðèæÐÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
66			print CONTENTS $contents;
67			close( CONTENTS );
68
69			if( $response->header("content-type") =~ "text/html" ) {
70			open( LINKS, ">$localpath.links" ) || die( "Could not open links file $localpath.links\n" );
71			my $p = HTML::LinkExtor->new( \&linkcb, $url );
72			$p->parse( $contents );
73
74			close( LINKS );
75			}
76			}
77
78
79			sub linkcb {
80			my($tag, %links) = @_;
81			if (($tag eq "a") && ($links{"href"})) {
82			my $link = $links{"href"};
83
84			#
85			# Remove fragments
86			#
87			$link =~ s/(.*)#.*/$1/;
88
89			#
90			# Remove ../ This is important because the abs() function
91			# can leave these in and cause never ending loops.
92			#
93			$link =~ s/\.\.\///g;
94
95			# hack for apostrophe -- changes URL, but should work for most clients.
96			$link =~ s/'/%27/g;
97
98			# hack for Apache directory listings
99			$link =~ s,/\?[NMSD]=[AD]$,/,g;
100
101			if ($no_parent_url) {
102			if ($link =~ m/$no_parent_url/) {
103			print LINKS "$link $no_parent_url\n";
104			# print STDERR "using $link\n";
105			# } else {
106			# print STDERR "skipping $link\n";
107			}
108			} else {
109			print LINKS "$link\n";
110			}
111			}
112			}
113