--- trunk/spider/filter.pm 2004/03/17 12:19:14 65 +++ trunk/spider/filter.pm 2004/04/07 12:54:21 74 @@ -22,6 +22,16 @@ # remove comments between and texi2html inserts them # there and swish can't find document title then (libxml or swish bug?) while ($contents =~ s/(.*)(.*)/$1$2/msi) { }; + + # remove empty lines before/after + $contents =~ s/^\s+()/$1/is; + $contents =~ s/(<\/html>)\s+$/$1/is; + # remove cr + $contents =~ s/\r//gs; + + # remove SQL Magazine header and footer + $contents =~ s/.+?/<\/table>/is; + $contents =~ s/.+?<\/table>/<\/table>/is; # remote TPJ left column if ($contents =~ s,.+?,,isg) { @@ -48,21 +58,32 @@ if ($contents =~ m,,is) { $new_title = $1; - } elsif ($contents =~ m,<(h\d)\sclass="docPartTitle"[^>]*>(.+?)<\1>,is) { + print STDERR "using title '$new_title' from \n" if ($verbose); + } elsif ($contents =~ m,<(h\d)\s+class="docPartTitle"[^>]*>(.+?),is) { + $new_title = $2; + print STDERR "using title '$new_title' from docPartTitle\n" if ($verbose); + } elsif ($contents =~ m,<(h\d)\s+class="docChapterTitle"[^>]*>(.+?),is) { $new_title = $2; - } elsif ($contents =~ m,<(h\d)\sclass="docChapterTitle"[^>]*>(.+?)<\1>,is) { + print STDERR "using title '$new_title' from docChapterTitle\n" if ($verbose); + } elsif ($contents =~ m,<(h\d)\s+class="docSection1Title"[^>]*>(.+?),is) { $new_title = $2; - } elsif ($contents =~ m,<(h\d)\sclass="docSection1Title"[^>]*>(.+?)<\1>,is) { + print STDERR "using title '$new_title' from docSection1Title\n" if ($verbose); + } elsif ($contents =~ m,<(h\d)\s+class="doc[^"]*Title"[^>]*>(.+?),is) { $new_title = $2; - } elsif ($contents =~ m,<(h\d)\sclass="chapter"[^>]*>(.+?)<\1>,is) { + print STDERR "using title '$new_title' from doc.+Title\n" if ($verbose); + } elsif ($contents =~ m,<(h\d)\s+class="chapter"[^>]*>(.+?),is) { $new_title = $2; - } elsif ($contents =~ m,<(h\d)\sclass="sect1"[^>]*>(.+?)<\1>,is) { + print STDERR "using title '$new_title' from chapter\n" if ($verbose); + } elsif ($contents =~ m,<(h\d)\s+class="sect1"[^>]*>(.+?),is) { $new_title = $2; + print STDERR "using title '$new_title' from sect1\n" if ($verbose); } else { if ($contents =~ m,([^<]+),is) { $new_title = $1; + print STDERR "using title '$new_title' from \n" if ($verbose); } elsif ($contents =~ m,<h\d[^>]*>([^<]+)</h\d>,is) { $new_title = $1; + print STDERR "using title '$new_title' from <h_>\n" if ($verbose); } } @@ -77,6 +98,7 @@ $b =~ s/([^a-zA-Z])+/ /gs; if ($a =~ m/$b/i) { $new_title = $collection; + print STDERR "new_title and collection are same! [$new_title]\n" if ($verbose); } else { $new_title = "$collection :: $new_title"; } @@ -89,12 +111,16 @@ if ($contents =~ s,<title>(.*),$new_title,is) { print STDERR "replace title '$1' with '$new_title'\n" if ($verbose); - } elsif ( - # try to insert after , or at top - $contents =~ s,(),$1$new_title,is || - $contents =~ s,(),$1$new_title,is || - $contents =~ s,^,$new_title, ) { + } elsif ($contents =~ s,(),$1$new_title,is) { + print STDERR "adding title '$new_title' after \n" if ($verbose); + + } elsif ($contents =~ s,(),$1$new_title,is) { + print STDERR "adding title '$new_title' after \n" if ($verbose); + + } elsif ($contents =~ s,^,$new_title,) { print STDERR "adding new title '$new_title'\n" if ($verbose); + } else { + print STDERR "WARNING: filter couldn't add new title '$new_title' anywhere!"; } return $contents;