--- filter.pl	2003/12/14 19:11:30	1.1
+++ filter.pl	2005/02/01 14:34:55	1.6
@@ -9,9 +9,9 @@
 	$pre |= '';
 	$post |= '';
 	my $isbn;
-	if ($href =~ m/xmlid=([^&]+)&/) {
+	if ($href =~ m/xmlid=([^&]+)&/i) {
 		$isbn = $1;
-	} elsif ($href =~ m/xmlid=([^&]+)$/) {
+	} elsif ($href =~ m/xmlid=([^&]+)$/i) {
 		$isbn = $1;
 	} else {
 		print STDERR "skipping $href\n";
@@ -29,7 +29,7 @@
 	$mode .= "_";
 
 	my $view;
-	if ($href =~ m/view=([^&]+)&/) {
+	if ($isbn =~ m/_index$/ && $href =~ m/view=([^&]+)&/) {
 		$isbn .= "_".$1;
 	}
 
@@ -39,7 +39,7 @@
 	$isbn .= ".html";
 
 	# anchor
-	if ($href =~ m/(#.+)$/) {
+	if ($href =~ m/(#[^&]+)/) {
 		$isbn .= $1;
 	}
 
@@ -58,23 +58,32 @@
 }
 close(IN);
 
-$html =~ s,()O'Reilly Network Safari Bookshelf\s+-\s+,$1,gsi || die "$infile: title";
+$html =~ s!(<title>)O'Reilly Network Safari Bookshelf\s+-\s+!$1!gsi || die "$infile: title";
 
-$html =~ s,<body leftmargin="0" topmargin="0" marginwidth="0" marginheight="0">,<body leftmargin="10" topmargin="10" marginwidth="10" marginheight="10">,s || die "$infile: margins";
-$html =~ s,<a name="toppage">.*<!--Copyright.*?-->,,s || die "$infile: surround layout";
+$html =~ s!<body leftmargin="0" topmargin="0" marginwidth="0" marginheight="0">!<body leftmargin="10" topmargin="10" marginwidth="10" marginheight="10">!s || die "$infile: margins";
+$html =~ s;<a name="toppage">.*<!--Copyright.*?-->;<a name="toppage"></a>;s || die "$infile: surround layout";
 
-$html =~ s,<td valign="top" class="v2">.*?(<td valign="top" class="v2" align="right">),$1,s || die "$infile: top buttons";
-$html =~ s,<td valign="top" class="v2"><a target="_new".*?(<td valign="top" class="v2" align="right">),$1,s || warn "bottom buttons";
+$html =~ s!<td valign="top" class="v2">.*?(<td valign="top" class="v2" align="right">)!$1!si || warn "$infile: top buttons";
+$html =~ s!<td valign="top" class="v2"><a target="_new".*?(<td valign="top" class="v2" align="right">)!$1!si || warn "bottom buttons";
 
-$html =~ s,<p><b>URL</b>.*$,</body></html>,s || die "$infile: footer";
+$html =~ s!<p><b>URL</b>.*$!</body></html>!si || die "$infile: footer";
 
-$html =~ s,<!--.+?-->,,gs;
+$html =~ s;<!--.+?-->;;gs;
 
 $html =~ s!(<a\s+[^>]*href=")([^"]+)("[^>]*>)!xmlid2file("$2","$1","$3")!iegs || die "$infile: links";
 
 $html =~ s!<a target="_new"[^>]*href="http://[^>]+>(.+?)</a>!$1!gs;
 
+$html =~ s!<img[^>]+Buy Print Version[^>]+>!!gs;
+$html =~ s!<a[^>]+onclick="OpenWin[^>]+mode=downloadPDF[^>]+>\s*<img[^>]+Download this chapter[^>]+>\s*</a>!!gs;
+
 open(OUT,"> $outfile") || die "$outfile: $!";
 print "$outfile\n";
 print OUT $html;
 close(OUT);
+
+
+# fix timestamp
+# atime = 8, ctime = 9
+my @s = stat($infile) || die "stat $infile: $!";
+utime $s[8],$s[9], $outfile || die "touch $outfile: $1";