--- trunk/spider/filter.pm 2004/03/17 12:19:14 65 +++ trunk/spider/filter.pm 2004/04/07 12:54:21 74 @@ -22,6 +22,16 @@ # remove comments between and
texi2html inserts them # there and swish can't find document title then (libxml or swish bug?) while ($contents =~ s/(.*)(.*)/$1$2/msi) { }; + + # remove empty lines before/after + $contents =~ s/^\s+()/$1/is; + $contents =~ s/(<\/html>)\s+$/$1/is; + # remove cr + $contents =~ s/\r//gs; + + # remove SQL Magazine header and footer + $contents =~ s/.+?/<\/table>/is; + $contents =~ s/.+?<\/table>/<\/table>/is; # remote TPJ left column if ($contents =~ s,.+?,,isg) { @@ -48,21 +58,32 @@ if ($contents =~ m,,is) { $new_title = $1; - } elsif ($contents =~ m,<(h\d)\sclass="docPartTitle"[^>]*>(.+?)<\1>,is) { + print STDERR "using title '$new_title' from \n" if ($verbose); + } elsif ($contents =~ m,<(h\d)\s+class="docPartTitle"[^>]*>(.+?)\1>,is) { + $new_title = $2; + print STDERR "using title '$new_title' from docPartTitle\n" if ($verbose); + } elsif ($contents =~ m,<(h\d)\s+class="docChapterTitle"[^>]*>(.+?)\1>,is) { $new_title = $2; - } elsif ($contents =~ m,<(h\d)\sclass="docChapterTitle"[^>]*>(.+?)<\1>,is) { + print STDERR "using title '$new_title' from docChapterTitle\n" if ($verbose); + } elsif ($contents =~ m,<(h\d)\s+class="docSection1Title"[^>]*>(.+?)\1>,is) { $new_title = $2; - } elsif ($contents =~ m,<(h\d)\sclass="docSection1Title"[^>]*>(.+?)<\1>,is) { + print STDERR "using title '$new_title' from docSection1Title\n" if ($verbose); + } elsif ($contents =~ m,<(h\d)\s+class="doc[^"]*Title"[^>]*>(.+?)\1>,is) { $new_title = $2; - } elsif ($contents =~ m,<(h\d)\sclass="chapter"[^>]*>(.+?)<\1>,is) { + print STDERR "using title '$new_title' from doc.+Title\n" if ($verbose); + } elsif ($contents =~ m,<(h\d)\s+class="chapter"[^>]*>(.+?)\1>,is) { $new_title = $2; - } elsif ($contents =~ m,<(h\d)\sclass="sect1"[^>]*>(.+?)<\1>,is) { + print STDERR "using title '$new_title' from chapter\n" if ($verbose); + } elsif ($contents =~ m,<(h\d)\s+class="sect1"[^>]*>(.+?)\1>,is) { $new_title = $2; + print STDERR "using title '$new_title' from sect1\n" if ($verbose); } else { if ($contents =~ m,