#!/usr/bin/perl -w

# Clean up an HTML document and convert it to the $cp code page.
#
# Input is read from the files named on the command line (or from STDIN when
# there are none); the cleaned document is printed to STDOUT.
#
# Steps, in order:
#   1. find the charset declared in the Content-Type META tag,
#   2. strip LANG attributes, FONT tags, empty SDFIELD elements and a fixed
#      set of layout-only STYLE declarations,
#   3. drop paragraphs holding nothing but <br>, and numeric WIDTH attributes
#      on <table>/<td>,
#   4. convert the text from the declared charset to $cp and rewrite the
#      declaration to match.
#
# Dies if the conversion cannot be set up or fails.

use strict;
use Text::Iconv;

# Code page of the generated document.
my $cp = "ISO-8859-2";

Text::Iconv->raise_error(1);    # die on bad encoding!

# Slurp every input line. join() yields '' (not undef) for empty input, so
# the pattern matches below never operate on an uninitialized value.
my $html = join '', <>;

# Set up a converter only when the document declares its charset. The
# whitespace after "text/html;" is optional, and the value pattern is the
# same one used when the declaration is rewritten further down, so a document
# is never relabelled without also being converted.
my $c;
if ($html =~ m!CONTENT="text/html;\s*charset=([^"]+)"!i) {
    my $from = $1;
    $c = Text::Iconv->new($from, $cp)
        or die "unsupported conversion from $from to $cp";
}

$html =~ s/\s+LANG="[^"]+"//gi;
$html =~ s/<\/*FONT[^>]*>//gi;

# Decode numeric character references into single bytes, except:
#   - " & < > stay escaped, because decoding them would create live markup;
#   - code points above 255 stay as references, because chr() would turn
#     $html into a wide-character string, which is not what the byte-oriented
#     converter and the final print expect.
# NOTE(review): decoding 128..255 to a raw byte assumes the source charset
# agrees with Latin-1 for that code point -- confirm against real input.
my %keep_entity = map { $_ => 1 } (34, 38, 60, 62);
$html =~ s{&#(\d+);}{
    ($1 > 255 || $keep_entity{$1 + 0}) ? "&#$1;" : chr($1)
}ge;

$html =~ s/\s+STYLE="margin-bottom: 0in"//gi;
$html =~ s/\s+STYLE="line-height: 100%"//gi;
$html =~ s/<(SDFIELD)[^>]*><\/\1>//gi;

# Remove individual declarations from inside STYLE attributes, then drop any
# STYLE attribute that ended up empty.
$html =~ s/(STYLE="[^"]*)text-indent:\s+\d+cm(;\s+)*/$1/gi;
$html =~ s/(STYLE="[^"]*)line-height:\s+\d+%;*/$1/gi;
$html =~ s/(STYLE="[^"]*)widows:\s+\d+;*/$1/gi;
$html =~ s/(STYLE="[^"]*)orphans:\s+\d+;*/$1/gi;
$html =~ s/STYLE="\s*"\s*//gi;

# remove excessive empty lines
$html =~ s{<p[^>]*>(?:\s*<br>\s*)*</p>}{}gi;

# Strip numeric WIDTH attributes from tables and cells. The optional '%'
# makes percentage widths disappear completely instead of leaving a stray
# %" inside the tag; the leading whitespace is removed with the attribute.
$html =~ s{(<(?:table|td)[^>]*?)\s+width="?\d+%?"?}{$1}gi;

if ($c) {
    $html = $c->convert($html);
    die "can't convert codepage!" unless defined $html;

    # The text is now in $cp, so make the declaration say so.
    $html =~ s{"(text/html;\s*charset=)[^"]+"}{"$1$cp"}gi;
}

# Move bytes 0x9A / 0x8A to 0xB9 / 0xA9. Written as hex escapes so the script
# does not depend on the encoding its own source file is saved in.
# NOTE(review): presumably these are Windows-1250 s-caron / S-caron bytes
# being moved to their ISO-8859-2 positions -- confirm.
$html =~ tr/\x9A\x8A/\xB9\xA9/;

print $html;