--- trunk/spider/swishspider 2003/11/19 12:07:07 45
+++ trunk/spider/swishspider 2004/01/17 23:57:55 46
@@ -5,6 +5,10 @@
use HTTP::Status;
use HTML::LinkExtor;
+require File::Basename;                            # core module; dirname() copes with a $0 that has no "/"
+my $basedir = File::Basename::dirname($0) . '/';   # script directory with trailing slash; "./" for `perl swishspider`
+require "${basedir}filter.pm";                     # expected to define filter(), which is applied to HTML content below
+
if (scalar(@ARGV) != 2) {
print STDERR "Usage: SwishSpider localpath url\n";
exit(1);
@@ -65,42 +69,7 @@
open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
# fixup just HTML files
if ($response->header("content-type") =~ "text/html") {
- # if you don't want content to be indexed, include it in
- # foobar tags or surround it with comments
- # foobar
- # foobar (also supported by swish)
- $contents =~ s,.+? ,,isg;
- $contents =~ s,.+?,,isg;
- $contents =~ s,.+?,,isg;
- # this will remove all script from indexing content
- $contents =~ s,,,isg;
- # remap Windows charset to ISO-8859-2
- $contents =~ tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
- # this will fix badly formatted html in form:
- # <tag
- # >some text
- # which will confuse indexer (or libxml2?)
- $contents =~ s/[\n\r]^(>)/$1\n/msg;
- # remove comments between and texi2html inserts them
- # there and swish can't find document title then (libxml or swish bug?)
- while ($contents =~ s/(.*)(.*)/$1$2/msi) { };
-
- # remote TPJ left column
- if ($contents =~ s,.+?,,isg) {
- my $title;
- # extract title and add to title
- if ($contents =~ m,\s*]*>(.+?) ,si) {
- $title = $1;
- } elsif ($contents =~ m,]*>(.+?) ,is) {
- $title = $1;
- } elsif ($contents =~ m,]*>(.+?) ,is) {
- $title = $1;
- } else {
- $title = "no detail title";
- }
- $contents =~ s,()([^<]+)( ),$1$2: $title$3,gsi if ($title);
-
- }
+ $contents = filter($contents);
}
print CONTENTS $contents;
close( CONTENTS );