--- trunk/spider/filter.pm 2004/01/29 18:25:55 60
+++ trunk/spider/filter.pm 2004/01/29 18:26:19 61
@@ -1,7 +1,7 @@
sub filter {
my $contents = shift || return;
-# my $verbose = 1;
+ my $verbose = 0;
# if you don't want content to be indexed, include it in
# foobar tags or surround it with comments
@@ -48,11 +48,15 @@
if ($contents =~ m,,is) {
$new_title = $1;
- } elsif ($contents =~ m,]*>([^<]+),is) {
+ } elsif ($contents =~ m,]*>(.+?),is) {
$new_title = $1;
- } elsif ($contents =~ m,]*>([^<]+),is) {
+ } elsif ($contents =~ m,]*>(.+?),is) {
$new_title = $1;
- } elsif ($contents =~ m,]*>([^<]+),is) {
+ } elsif ($contents =~ m,]*>(.+?),is) {
+ $new_title = $1;
+ } elsif ($contents =~ m,]*>(.+?),is) {
+ $new_title = $1;
+ } elsif ($contents =~ m,]*>(.+?),is) {
$new_title = $1;
} else {
if ($contents =~ m,
([^<]+),is) {
@@ -63,6 +67,10 @@
}
if ($new_title) {
+ # nuke html in title
+ $new_title =~ s/ \s+/: /gs;
+ $new_title =~ s/<\/*[^>]+>//gs;
+
# check if new title is same as collection name
my ($a,$b) = ($new_title,$collection);
$a =~ s/([^a-zA-Z])+/ /gs;