| Revision 45 (by dpavlin, 2004/07/21 12:19:26) |
more clean-up for ooo 1.1.x
|
#!/usr/bin/perl -w
use strict;
use Text::Iconv;
# Filter that tidies HTML exported from OpenOffice.org: reads the document
# from the files named on the command line (or STDIN), strips redundant
# markup, converts it to the target code page and prints it to STDOUT.

# Target code page of the cleaned-up document.
my $cp = "ISO-8859-2";
Text::Iconv->raise_error(1); # die on bad encoding!

# Slurp all input. join() yields an empty string (not undef) when there is
# no input, so the substitutions below do not warn under -w.
my $html = join '', <>;

# If the document declares its charset in a META tag, prepare a converter
# from that charset to $cp; otherwise $c stays undefined and the document
# is passed through without conversion.
my $c;
if ($html =~ m!CONTENT="text/html; charset=([^"]+)"!i) {
    $c = Text::Iconv->new($1, $cp);
}

# Drop LANG attributes and all opening/closing FONT tags.
$html =~ s/\s+LANG="[^"]+"//gi;
$html =~ s/<\/*FONT[^>]*>//gi;

# Decode numeric character references that fit into a single byte. Code
# points above 255 are left as entities: chr() would return a wide
# character, which would upgrade the whole byte string and corrupt the
# data handed to iconv and print.
$html =~ s/&#(\d+);/$1 < 256 ? chr($1) : "&#$1;"/gie;

# Remove STYLE attributes that consist only of these exact values.
$html =~ s/\s+STYLE="margin-bottom: 0in"//gi;
$html =~ s/\s+STYLE="line-height: 100%"//gi;

# Remove empty SDFIELD elements.
$html =~ s/<(SDFIELD)[^>]*><\/\1>//gi;

# Strip individual properties from inside STYLE attributes, keeping
# whatever precedes them in the attribute value.
$html =~ s/(STYLE="[^"]*)text-indent:\s+\d+cm(;\s+)*/$1/gi;
$html =~ s/(STYLE="[^"]*)line-height:\s+\d+%;*/$1/gi;
$html =~ s/(STYLE="[^"]*)widows:\s+\d+;*/$1/gi;
$html =~ s/(STYLE="[^"]*)orphans:\s+\d+;*/$1/gi;

# Remove STYLE attributes that ended up empty.
$html =~ s/STYLE="\s*"\s*//gi;

# remove excessive empty lines
$html =~ s,<p[^>]*>(?:\s*<br>\s*)*</p>,,gi;

# Drop numeric WIDTH attributes from TABLE and TD tags.
$html =~ s,(<(?:table|td)[^>]*)(width="*\d+"*),$1,gi;

# Convert to the target code page. The result is tested for definedness
# rather than truth, so a legitimately false result ("" or "0") is not
# reported as a conversion failure.
if ($c && length $html) {
    my $converted = $c->convert($html);
    die "can't convert codepage!" unless defined $converted;
    $html = $converted;
}

# Map Windows-1250 s-caron bytes (0x9A, 0x8A) that survived conversion to
# their ISO-8859-2 positions (0xB9, 0xA9). Hex escapes keep this mapping
# independent of the encoding this script file is saved in.
# NOTE(review): reconstructed from the literal characters in the previous
# revision (cp1252 rendering of bytes 9A/B9/8A/A9) -- confirm.
$html =~ s/\x9a/\xb9/g;
$html =~ s/\x8a/\xa9/g;

# Make the META content-type declaration agree with the output code page.
$html =~ s,"(text/html;\s+charset=)\S+?","$1$cp",gi;

print $html;