--- trunk/spider/filter.pm 2004/01/29 18:25:55 60 +++ trunk/spider/filter.pm 2004/01/29 18:26:19 61 @@ -1,7 +1,7 @@ sub filter { my $contents = shift || return; -# my $verbose = 1; + my $verbose = 0; # if you don't want content to be indexed, include it in # foobar tags or surround it with comments @@ -48,11 +48,15 @@ if ($contents =~ m,,is) { $new_title = $1; - } elsif ($contents =~ m,]*>([^<]+),is) { + } elsif ($contents =~ m,]*>(.+?),is) { $new_title = $1; - } elsif ($contents =~ m,]*>([^<]+),is) { + } elsif ($contents =~ m,]*>(.+?),is) { $new_title = $1; - } elsif ($contents =~ m,]*>([^<]+),is) { + } elsif ($contents =~ m,]*>(.+?),is) { + $new_title = $1; + } elsif ($contents =~ m,]*>(.+?),is) { + $new_title = $1; + } elsif ($contents =~ m,]*>(.+?),is) { $new_title = $1; } else { if ($contents =~ m,([^<]+),is) { @@ -63,6 +67,10 @@ } if ($new_title) { + # nuke html in title + $new_title =~ s/
\s+/: /gs; + $new_title =~ s/<\/*[^>]+>//gs; + # check if new title is same as collection name my ($a,$b) = ($new_title,$collection); $a =~ s/([^a-zA-Z])+/ /gs;