--- trunk/spider/swishspider 2003/03/16 21:31:55 15
+++ trunk/spider/swishspider 2003/03/24 09:57:44 30
@@ -59,10 +59,20 @@
my $contents = $response->content();
open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
+ # if you don't want content to be indexed, include it in
+ # <nonindex> foobar </nonindex> tags or surround it with comments
+ # <!-- nonindex --> foobar <!-- /nonindex -->
$contents =~ s,<nonindex>.+?</nonindex>,,isg;
$contents =~ s,<!-- nonindex -->.+?<!-- /nonindex -->,,isg;
+ # this will remove all script from indexing content
$contents =~ s,<script.+?</script>,,isg;
+ # remap Windows charset to ISO-8859-2
$contents =~ tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
+ # this will fix badly formatted html in form:
+ # <a href="foobar"
+ # >some text
+ # which will confuse indexer (or libxml2?)
+ $contents =~ s/[\n\r]^(>)/$1\n/msg;
print CONTENTS $contents;
close( CONTENTS );