--- trunk/spider/filter.pm 2004/01/20 18:40:06 51 +++ trunk/spider/filter.pm 2004/03/18 23:07:21 69 @@ -1,5 +1,8 @@ sub filter { my $contents = shift || return; + + my $verbose = 0; + # if you don't want content to be indexed, include it in # foobar tags or surround it with comments # foobar @@ -37,6 +40,67 @@ } + # is second argument collection? + my $collection = shift || return $contents; + + # construct new title (from various parts of DocBook if available) + my $new_title; + + if ($contents =~ m,,is) { + $new_title = $1; + } elsif ($contents =~ m,<(h\d)\sclass="docPartTitle"[^>]*>(.+?)<\1>,is) { + $new_title = $2; + } elsif ($contents =~ m,<(h\d)\sclass="docChapterTitle"[^>]*>(.+?)<\1>,is) { + $new_title = $2; + } elsif ($contents =~ m,<(h\d)\sclass="docSection1Title"[^>]*>(.+?)<\1>,is) { + $new_title = $2; + } elsif ($contents =~ m,<(h\d)\sclass="chapter"[^>]*>(.+?)<\1>,is) { + $new_title = $2; + } elsif ($contents =~ m,<(h\d)\sclass="sect1"[^>]*>(.+?)<\1>,is) { + $new_title = $2; + } else { + if ($contents =~ m,([^<]+),is) { + $new_title = $1; + } elsif ($contents =~ m,]*>([^<]+),is) { + $new_title = $1; + } + } + + if ($new_title) { + # nuke html in title + $new_title =~ s/
\s+/: /gs; + $new_title =~ s/<\/*[^>]+>//gs; + + # check if new title is same as collection name + my ($a,$b) = ($new_title,$collection); + $a =~ s/([^a-zA-Z])+/ /gs; + $b =~ s/([^a-zA-Z])+/ /gs; + if ($a =~ m/$b/i) { + $new_title = $collection; + } else { + $new_title = "$collection :: $new_title"; + } + } else { + # fall-back to collection title + $new_title = $collection; + } + + $new_title =~ s/\s\s+/ /g; + + if ($contents =~ s,(.*),$new_title,is) { + print STDERR "replace title '$1' with '$new_title'\n" if ($verbose); + } elsif ($contents =~ s,(),$1$new_title,is) { + print STDERR "adding title '$new_title' after \n" if ($verbose); + + } elsif ($contents =~ s,(),$1$new_title,is) { + print STDERR "adding title '$new_title' after \n" if ($verbose); + + } elsif ($contents =~ s,^,$new_title,) { + print STDERR "adding new title '$new_title'\n" if ($verbose); + } else { + print STDERR "WARNING: filter couldn't add new title '$new_title' anywhere!"; + } + return $contents; }