--- trunk/spider/swishspider 2003/11/19 12:07:07 45 +++ trunk/spider/swishspider 2004/01/17 23:57:55 46 @@ -5,6 +5,10 @@ use HTTP::Status; use HTML::LinkExtor; +my $basedir = $0; +$basedir =~ s,/[^/]+$,/,; +require "$basedir/filter.pm"; + if (scalar(@ARGV) != 2) { print STDERR "Usage: SwishSpider localpath url\n"; exit(1); @@ -65,42 +69,7 @@ open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" ); # fixup just HTML files if ($response->header("content-type") =~ "text/html") { - # if you don't want content to be indexed, include it in - # foobar tags or surround it with comments - # foobar - # foobar (also supported by swish) - $contents =~ s,.+?,,isg; - $contents =~ s,.+?,,isg; - $contents =~ s,.+?,,isg; - # this will remove all script from indexing content - $contents =~ s,,,isg; - # remap Windows charset to ISO-8859-2 - $contents =~ tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2 - # this will fix badly formatted html in form: - # some text - # which will confuse indexer (or libxml2?) - $contents =~ s/[\n\r]^(>)/$1\n/msg; - # remove comments between and texi2html inserts them - # there and swish can't find document title then (libxml or swish bug?) - while ($contents =~ s/(.*)(.*)/$1$2/msi) { }; - - # remote TPJ left column - if ($contents =~ s,.+?,,isg) { - my $title; - # extract title and add to title - if ($contents =~ m,\s*]*>(.+?),si) { - $title = $1; - } elsif ($contents =~ m,]*>(.+?),is) { - $title = $1; - } elsif ($contents =~ m,]*>(.+?),is) { - $title = $1; - } else { - $title = "no detail title"; - } - $contents =~ s,()([^<]+)(),$1$2: $title$3,gsi if ($title); - - } + $contents = filter($contents); } print CONTENTS $contents; close( CONTENTS );