Annotation of /trunk/spider/swishspider

#!/usr/local/bin/perl -w
use strict;

use LWP::UserAgent;
use HTTP::Status;
use HTML::LinkExtor;

# Work out the directory this script was started from, so that filter.pm can
# be loaded from the same directory.  filter.pm presumably provides the
# filter() routine called further down -- it is not defined in this file.
my $basedir = $0;

# Strip the script name, keeping the trailing slash.  When $0 contains no
# slash at all (e.g. started as "perl swishspider") the substitution does not
# match; the script then lives in the current directory, so use "./" -- an
# explicit "./" path also keeps require from searching @INC.
$basedir =~ s,/[^/]+$,/, or $basedir = "./";

# $basedir always ends in a slash at this point.
require "${basedir}filter.pm";

# Exactly two arguments are required: the path prefix for the output files
# (.response, .contents, .links) and the URL specification to fetch.
if (scalar(@ARGV) != 2) {
    print STDERR "Usage: SwishSpider localpath url\n";
    exit(1);
}

# Method-call syntax instead of the ambiguous indirect-object "new Class".
my $ua = LWP::UserAgent->new;
$ua->agent( "SwishSpider http://swish-e.org" );

my $localpath = shift;
my $url = shift;

# The URL argument may hold two whitespace-separated URLs: the "no parent"
# URL (links are only reported when they match it) and the URL to fetch.
my $no_parent_url;
if ($url =~ m/\s/) {
    # Split on a run of whitespace so that doubled blanks between the two
    # URLs do not end up as leading whitespace in the second one.
    ($no_parent_url, $url) = split(/\s+/, $url, 2);

    # old scheme had URL, no parent and new is reverse: if what we took for
    # the no-parent URL contains the other one, the order was the old one.
    ($url, $no_parent_url) = ($no_parent_url, $url)
        if ($no_parent_url =~ m/\Q$url\E/);
}

# Issue a single GET for the URL.  simple_request() hands back the response
# as received, so redirects are reported by the code below.
my $response = $ua->simple_request( HTTP::Request->new( "GET", $url ) );

# Base URL of the response with the trailing file name cut off, i.e.
# everything up to and including the last slash.
( my $urlbase = $response->base ) =~ s{/[^/]*$}{/};

#
# Write out important meta-data to "$localpath.response".  The first line is
# always the HTTP status code.  The second line depends on the code: a 200
# response gets its content-type, a redirect gets its location (prefixed with
# the no-parent URL when one is in effect).  Other codes get no second line.
#
# Three-argument open with a lexical handle: $localpath comes straight from
# the command line and must not be able to influence the open mode.
open( my $resp_fh, '>', "$localpath.response" )
    || die( "Could not open response file $localpath.response: $!" );

print {$resp_fh} $response->code() . "\n";
if( $response->code() == RC_OK ) {
    # A missing Content-Type header is written as an empty line.
    my $type = $response->header( "content-type" );
    $type = "" unless defined $type;
    print {$resp_fh} "$type\n";
} elsif( $response->is_redirect() ) {
    # A redirect without a Location header is written as an empty target.
    my $link = $response->header( "location" );
    $link = "" unless defined $link;

    if ($no_parent_url) {
        # \Q..\E: the no-parent URL is literal text, not a pattern; without
        # quoting, "." and "?" in it would act as regex metacharacters.
        if ($link =~ m/\Q$no_parent_url\E/) {
            # if this URL is below parent URL o.k....
            print {$resp_fh} "$no_parent_url $link\n";
        } else {
            # if not, crawl just this page!
            print {$resp_fh} "$link $link\n";
        }
    } else {
        print {$resp_fh} "$link\n";
    }
}
# Buffered write errors only show up at close time.
close( $resp_fh )
    || die( "Could not write response file $localpath.response: $!" );

#
# Write out the actual data, assuming the retrieval was successful, to
# "$localpath.contents".  If the data is of type text/html it is first passed
# through filter(), and all the links it refers to are written to
# "$localpath.links" by the linkcb() callback.
#
if( $response->code() == RC_OK ) {
    my $contents = $response->content();

    # A missing Content-Type header is treated as "not HTML".
    my $type = $response->header("content-type");
    $type = "" unless defined $type;
    my $is_html = ( $type =~ m{text/html} );

    # Three-argument open: $localpath comes from the command line.
    open( my $contents_fh, '>', "$localpath.contents" )
        || die( "Could not open contents file $localpath.contents: $!\n" );
    # fixup just HTML files
    if ($is_html) {
        $contents = filter($contents);
    }
    print {$contents_fh} $contents;
    # Buffered write errors only show up at close time.
    close( $contents_fh )
        || die( "Could not write contents file $localpath.contents: $!\n" );

    if ($is_html) {
        # LINKS stays a package-level handle on purpose: linkcb() prints to
        # it by name.
        open( LINKS, '>', "$localpath.links" )
            || die( "Could not open links file $localpath.links: $!\n" );
        my $p = HTML::LinkExtor->new( \&linkcb, $url );
        $p->parse( $contents );
        # Tell the parser the document is complete so that anything it is
        # still buffering gets processed before the links file is closed.
        $p->eof;

        close( LINKS )
            || die( "Could not write links file $localpath.links: $!\n" );
    }
}


#
# linkcb( $tag, %links )
#
# Link callback handed to HTML::LinkExtor.  $tag is the element name and
# %links maps attribute names to link values.  Only <a href>, <area href>
# and <frame src> are considered.  The link is normalised (see the steps
# below) and written, one per line, to the LINKS file handle.  When
# $no_parent_url is set, only links containing it are written, each prefixed
# with "$no_parent_url ".
#
# Reads the file-level variables $no_parent_url and $urlbase and prints to
# the file-level handle LINKS.  The return value is not meaningful.
#
sub linkcb {
    my($tag, %links) = @_;

    my $is_anchor = ($tag eq "a" || $tag eq "area") && $links{"href"};
    my $is_frame  = ($tag eq "frame" && $links{"src"});
    return unless ($is_anchor || $is_frame);

    # Force a plain string; the values handed in may be objects that merely
    # stringify to the URL.
    my $link = "" . ($links{"href"} || $links{"src"});

    #
    # Remove fragments.  The fragment starts at the FIRST '#', so cut from
    # there (a greedy "(.*)#" would only cut from the last one).
    #
    $link =~ s/#.*//s;

    #
    # Remove ../  This is important because the abs() function
    # can leave these in and cause never ending loops.
    #
    $link =~ s/\.\.\///g;

    # javascript:displayWindow('page.html', ...) pop-up links: take the first
    # argument as a page name relative to $urlbase.
    if ($link =~ m,javascript:displayWindow\((.+)\),i) {
        my $arg = $1;
        # Undo %XX escapes: exactly two hex digits, in either case.
        $arg =~ s/%([0-9a-fA-F]{2})/chr(hex($1))/eg;
        ($link) = split(/,/, $arg, 2);
        $link = "" unless defined $link;    # argument list was only commas
        $link =~ s/^['"]//;
        $link =~ s/['"]$//;
        $link = $urlbase.$link;
    }

    # hack for apostrophe -- changes URL, but should work for most clients.
    $link =~ s/'/%27/g;

    # hack for Apache directory listings
    $link =~ s,/\?[NMSD]=[AD]$,/,;

    # speedup, skip pictures.  Only an extension at the end of the URL (or
    # right before a query string) counts, in any letter case, so that e.g.
    # "/pics.giffy/index.html" is still crawled.
    return if ($link =~ m/\.(?:gif|jpg|png)(?:\?|$)/i);

    if ($no_parent_url) {
        # \Q..\E: match the no-parent URL as literal text, not as a pattern.
        if ($link =~ m/\Q$no_parent_url\E/) {
            print LINKS "$no_parent_url $link\n";
        }
    } else {
        print LINKS "$link\n";
    }
}

1	dpavlin	1	#!/usr/local/bin/perl -w
2			use strict;
3
4			use LWP::UserAgent;
5			use HTTP::Status;
6			use HTML::LinkExtor;
7
8	dpavlin	46	my $basedir = $0;
9			$basedir =~ s,/[^/]+$,/,;
10			require "$basedir/filter.pm";
11
12	dpavlin	1	if (scalar(@ARGV) != 2) {
13			print STDERR "Usage: SwishSpider localpath url\n";
14			exit(1);
15			}
16
17			my $ua = new LWP::UserAgent;
18			$ua->agent( "SwishSpider http://swish-e.org" );
19
20			my $localpath = shift;
21			my $url = shift;
22
23			my $no_parent_url;
24			if ($url =~ m/\s/) {
25	dpavlin	40	($no_parent_url,$url) = split(/\s/,$url,2);
26			# old scheme had URL, no parent and new is reverse
27	dpavlin	42	($url,$no_parent_url) = ($no_parent_url,$url) if ($no_parent_url =~ m/\Q$url\E/);
28	dpavlin	1	}
29
30			my $request = new HTTP::Request( "GET", $url );
31			my $response = $ua->simple_request( $request );
32	dpavlin	45	my $urlbase = $response->base;
33			$urlbase =~ s,/[^/]*$,/,; # remove filename
34	dpavlin	1
35			#
36			# Write out important meta-data. This includes the HTTP code. Depending on the
37			# code, we write out other data. Redirects have the location printed, everything
38			# else gets the content-type.
39			#
40			open( RESP, ">$localpath.response" ) || die( "Could not open response file $localpath.response" );
41
42			print RESP $response->code() . "\n";
43			if( $response->code() == RC_OK ) {
44			print RESP $response->header( "content-type" ) . "\n";
45			} elsif( $response->is_redirect() ) {
46			my $link = $response->header( "location" );
47			if ($no_parent_url) {
48			if ($link =~ m/$no_parent_url/) {
49			# if this URL is below parent URL o.k....
50	dpavlin	40	print RESP "$no_parent_url $link\n";
51	dpavlin	1	} else {
52			# if not, crawl just this page!
53			print RESP "$link $link\n";
54			}
55			} else {
56			print RESP "$link\n";
57			}
58			}
59			close( RESP );
60
61			#
62			# Write out the actual data assuming the retrieval was succesful. Also, if
63			# we have actual data and it's of type text/html, write out all the links it
64			# refers to
65			#
66			if( $response->code() == RC_OK ) {
67			my $contents = $response->content();
68
69			open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
70	dpavlin	40	# fixup just HTML files
71			if ($response->header("content-type") =~ "text/html") {
72	dpavlin	46	$contents = filter($contents);
73	dpavlin	40	}
74	dpavlin	1	print CONTENTS $contents;
75			close( CONTENTS );
76
77			if( $response->header("content-type") =~ "text/html" ) {
78			open( LINKS, ">$localpath.links" ) || die( "Could not open links file $localpath.links\n" );
79			my $p = HTML::LinkExtor->new( \&linkcb, $url );
80			$p->parse( $contents );
81
82	dpavlin	45
83
84	dpavlin	1	close( LINKS );
85			}
86			}
87
88
89			sub linkcb {
90			my($tag, %links) = @_;
91	dpavlin	40	if (($tag eq "a" || $tag eq "area") && ($links{"href"}) || ($tag eq "frame" && $links{"src"})) {
92			my $link = $links{"href"} || $links{"src"};
93	dpavlin	1
94			#
95			# Remove fragments
96			#
97			$link =~ s/(.*)#.*/$1/;
98
99			#
100			# Remove ../ This is important because the abs() function
101			# can leave these in and cause never ending loops.
102			#
103			$link =~ s/\.\.\///g;
104
105	dpavlin	45	if ($link =~ m,javascript:displayWindow\((.+)\),i) {
106			my $arg = $1;
107			$arg =~ s/%([a-f0-9][a-f][0-9])/chr(hex($1))/eg;
108			($link,undef) = split(',',$arg,2);
109			$link =~ s/^['"]//;
110			$link =~ s/['"]$//;
111			$link = $urlbase.$link;
112			}
113
114	dpavlin	1	# hack for apostrophe -- changes URL, but should work for most clients.
115			$link =~ s/'/%27/g;
116
117			# hack for Apache directory listings
118			$link =~ s,/\?[NMSD]=[AD]$,/,g;
119
120	dpavlin	15	# speedup, skip pictures
121			return if ($link =~ m/\.(gif|jpg|png)/);
122
123	dpavlin	1	if ($no_parent_url) {
124			if ($link =~ m/$no_parent_url/) {
125	dpavlin	40	print LINKS "$no_parent_url $link\n";
126	dpavlin	1	# print STDERR "using $link\n";
127			# } else {
128			# print STDERR "skipping $link\n";
129			}
130			} else {
131			print LINKS "$link\n";
132			}
133			}
134			}
135