--- trunk/spider/filter.pm 2004/01/20 18:40:06 51
+++ trunk/spider/filter.pm 2004/03/18 23:07:21 69
@@ -1,5 +1,8 @@
# filter($contents [, $collection])
#
# The part of this sub visible here builds a new document title and writes
# it back into $contents. Returns nothing when $contents is missing or
# false. When no (true) $collection is passed, returns $contents as it
# stands at that point. Otherwise a title is picked from the first heading
# pattern that matches, combined with the collection name, written into
# $contents, and $contents is returned.
#
# NOTE(review): this listing is a two-hunk diff, so the statements between
# the hunks are not shown here. Several patterns and messages below are
# empty or lack markup (e.g. "m,,is", "after \n"); that markup looks to have
# been lost in this copy - TODO confirm against the repository.
sub filter {
my $contents = shift || return; # bare return when the first argument is missing or false
+
+ my $verbose = 0; # set to a true value to get the title diagnostics on STDERR
+
# if you don't want content to be indexed, include it in
# foobar tags or surround it with comments
# foobar
@@ -37,6 +40,67 @@
}
+ # second argument is the collection name; when it is missing or false, stop here and return the contents as they are
+ my $collection = shift || return $contents;
+
+ # construct new title (from various parts of DocBook if available); patterns are tried in order and the first match wins
+ my $new_title;
+
+ if ($contents =~ m,,is) { # NOTE(review): pattern is empty in this copy, yet $1 is read below - markup presumably lost; TODO confirm
+ $new_title = $1;
+ } elsif ($contents =~ m,<(h\d)\sclass="docPartTitle"[^>]*>(.+?)<\1>,is) { # NOTE(review): "<\1>" has no slash, so as written it is not a closing tag - confirm (same in the four branches below)
+ $new_title = $2;
+ } elsif ($contents =~ m,<(h\d)\sclass="docChapterTitle"[^>]*>(.+?)<\1>,is) {
+ $new_title = $2;
+ } elsif ($contents =~ m,<(h\d)\sclass="docSection1Title"[^>]*>(.+?)<\1>,is) {
+ $new_title = $2;
+ } elsif ($contents =~ m,<(h\d)\sclass="chapter"[^>]*>(.+?)<\1>,is) {
+ $new_title = $2;
+ } elsif ($contents =~ m,<(h\d)\sclass="sect1"[^>]*>(.+?)<\1>,is) {
+ $new_title = $2;
+ } else { # last resort: two more patterns whose leading markup looks to be missing in this copy - TODO confirm
+ if ($contents =~ m,
([^<]+),is) {
+ $new_title = $1;
+ } elsif ($contents =~ m,]*>([^<]+),is) {
+ $new_title = $1;
+ }
+ }
+
+ if ($new_title) {
+ # nuke html in title: the first substitution puts ": " in place of each match, the second deletes any remaining <...> tags
+ $new_title =~ s/
\s+/: /gs;
+ $new_title =~ s/<\/*[^>]+>//gs;
+
+ # check if new title is same as collection name: both are compared case-insensitively with every run of non-letters collapsed to one space, and the title only has to contain the collection name
+ my ($a,$b) = ($new_title,$collection); # NOTE(review): lexical $a/$b shadow the variables sort() uses - consider renaming
+ $a =~ s/([^a-zA-Z])+/ /gs;
+ $b =~ s/([^a-zA-Z])+/ /gs;
+ if ($a =~ m/$b/i) {
+ $new_title = $collection;
+ } else {
+ $new_title = "$collection :: $new_title";
+ }
+ } else {
+ # no title found in the contents: fall back to the collection name
+ $new_title = $collection;
+ }
+
+ $new_title =~ s/\s\s+/ /g; # collapse each run of two or more whitespace characters to a single space
+
+ if ($contents =~ s,(.*),$new_title,is) { # insertion points are tried in order; NOTE(review): markup in these patterns and messages looks lost in this copy - TODO confirm
+ print STDERR "replace title '$1' with '$new_title'\n" if ($verbose);
+ } elsif ($contents =~ s,(),$1$new_title,is) {
+ print STDERR "adding title '$new_title' after \n" if ($verbose);
+
+ } elsif ($contents =~ s,(),$1$new_title,is) {
+ print STDERR "adding title '$new_title' after \n" if ($verbose);
+
+ } elsif ($contents =~ s,^,$new_title,) { # "^" always matches, so this substitution succeeds whenever it is reached
+ print STDERR "adding new title '$new_title'\n" if ($verbose);
+ } else { # unreachable as written, given the branch above
+ print STDERR "WARNING: filter couldn't add new title '$new_title' anywhere!"; # NOTE(review): not gated by $verbose and has no trailing newline
+ }
+
return $contents;
}