--- trunk/spider/swishspider 2002/06/04 06:39:53 1
+++ trunk/spider/swishspider 2003/03/24 09:57:44 30
@@ -59,10 +59,20 @@
my $contents = $response->content();
open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
+ # if you don't want content to be indexed, include it in
+ # foobar tags or surround it with comments
+ # foobar
$contents =~ s,.+?,,isg;
$contents =~ s,.+?,,isg;
+ # this removes all script blocks from the content to be indexed
$contents =~ s,,,isg;
+ # remap Windows charset to ISO-8859-2
$contents =~ tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
+ # this will fix badly formatted HTML of the form:
+ # <tag attr="value"
+ # >some text
+ # (closing ">" at the start of a line), which confuses the indexer (or libxml2?)
+ $contents =~ s/[\n\r]^(>)/$1\n/msg;
print CONTENTS $contents;
close( CONTENTS );
@@ -78,7 +88,7 @@
sub linkcb {
my($tag, %links) = @_;
- if (($tag eq "a") && ($links{"href"})) {
+ if (($tag eq "a" || $tag eq "area") && ($links{"href"})) {
my $link = $links{"href"};
#
@@ -98,6 +108,9 @@
# hack for Apache directory listings
$link =~ s,/\?[NMSD]=[AD]$,/,g;
+ # speedup, skip pictures
+ return if ($link =~ m/\.(gif|jpg|png)/);
+
if ($no_parent_url) {
if ($link =~ m/$no_parent_url/) {
print LINKS "$link $no_parent_url\n";