--- trunk/spider/swishspider 2003/04/30 12:40:09 32
+++ trunk/spider/swishspider 2004/01/17 23:57:55 46
@@ -5,6 +5,10 @@
use HTTP::Status;
use HTML::LinkExtor;
+my $basedir = $0; # locate filter.pm next to this script
+$basedir =~ s,/[^/]+$,/, or $basedir = "./"; # strip script name; if $0 has no directory part, use the current directory
+require "${basedir}filter.pm"; # $basedir always ends in "/" here, so no extra separator is needed
+
if (scalar(@ARGV) != 2) {
print STDERR "Usage: SwishSpider localpath url\n";
exit(1);
@@ -18,11 +22,15 @@
my $no_parent_url;
if ($url =~ m/\s/) {
- ($url,$no_parent_url) = split(/\s/,$url,2);
+ ($no_parent_url,$url) = split(/\s/,$url,2);
+ # old scheme had URL, no parent and new is reverse
+ ($url,$no_parent_url) = ($no_parent_url,$url) if ($no_parent_url =~ m/\Q$url\E/);
}
my $request = new HTTP::Request( "GET", $url );
my $response = $ua->simple_request( $request );
+my $urlbase = $response->base;
+$urlbase =~ s,/[^/]*$,/,; # remove filename
#
# Write out important meta-data. This includes the HTTP code. Depending on the
@@ -39,7 +47,7 @@
if ($no_parent_url) {
if ($link =~ m/$no_parent_url/) {
# if this URL is below parent URL o.k....
- print RESP "$link $no_parent_url\n";
+ print RESP "$no_parent_url $link\n";
} else {
# if not, crawl just this page!
print RESP "$link $link\n";
@@ -59,23 +67,10 @@
my $contents = $response->content();
open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
- # if you don't want content to be indexed, include it in
- # foobar tags or surround it with comments
- # foobar
- $contents =~ s,.+?,,isg;
- $contents =~ s,.+?,,isg;
- # this will remove all script from indexing content
- $contents =~ s,,,isg;
- # remap Windows charset to ISO-8859-2
- $contents =~ tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
- # this will fix badly formatted html in form:
- #
some text
- # which will confuse indexer (or libxml2?)
- $contents =~ s/[\n\r]^(>)/$1\n/msg;
- # remove comments between and texi2html inserts them
- # there and swish can't find document title then (libxml or swish bug?)
- while ($contents =~ s/(.*)(.*)/$1$2/msi) { };
+ # fixup just HTML files
+ if ($response->header("content-type") =~ "text/html") {
+ $contents = filter($contents);
+ }
print CONTENTS $contents;
close( CONTENTS );
@@ -84,6 +79,8 @@
my $p = HTML::LinkExtor->new( \&linkcb, $url );
$p->parse( $contents );
+
+
close( LINKS );
}
}
@@ -91,8 +88,8 @@
sub linkcb {
my($tag, %links) = @_;
- if (($tag eq "a" || $tag eq "area") && ($links{"href"})) {
- my $link = $links{"href"};
+ if (($tag eq "a" || $tag eq "area") && ($links{"href"}) || ($tag eq "frame" && $links{"src"})) {
+ my $link = $links{"href"} || $links{"src"};
#
# Remove fragments
@@ -105,6 +102,15 @@
#
$link =~ s/\.\.\///g;
+ if ($link =~ m,javascript:displayWindow\((.+)\),i) { # javascript popup link: pull the target out of the call's arguments
+ my $arg = $1;
+ $arg =~ s/%([0-9a-f]{2})/chr(hex($1))/egi; # decode %XX escapes: exactly two hex digits, either case
+ ($link,undef) = split(',',$arg,2); # keep only the first argument and use it as the link
+ $link =~ s/^['"]//;
+ $link =~ s/['"]$//;
+ $link = $urlbase.$link; # prefix with the response base URL (filename already removed)
+ }
+
# hack for apostrophe -- changes URL, but should work for most clients.
$link =~ s/'/%27/g;
@@ -116,7 +122,7 @@
if ($no_parent_url) {
if ($link =~ m/$no_parent_url/) {
- print LINKS "$link $no_parent_url\n";
+ print LINKS "$no_parent_url $link\n";
# print STDERR "using $link\n";
# } else {
# print STDERR "skipping $link\n";