#!/usr/bin/perl -w

# Clean up HTML exported by an office suite and recode it to $cp.
#
# Reads HTML from the files named on the command line (or STDIN), strips
# LANG attributes, FONT tags, empty SDFIELD elements and a handful of
# STYLE properties, removes empty paragraphs and table/cell widths,
# converts the text from the charset declared in the document's
# CONTENT="text/html; charset=..." attribute to $cp, rewrites that
# declaration to name $cp, and prints the result to STDOUT.
#
# Dies if the declared charset is unsupported or the text cannot be
# converted (Text::Iconv->raise_error(1)).
#
# Provenance: recovered from a repository annotate view; the per-line
# "line / revision / author" columns (revisions 42 and 45, dpavlin) were
# removed so the script is runnable again.

use strict;
use Text::Iconv;

# Target code page; also written back into the charset declaration.
my $cp = "ISO-8859-2";

Text::Iconv->raise_error(1);	# die on bad encoding!

# Slurp all input. join() yields '' (not undef) when there is no input,
# and list-context <> still concatenates every file in @ARGV.
my $html = join '', <>;

# Build a converter only if the document declares its charset.
my $c;
if ($html =~ m!CONTENT="text/html; charset=([^"]+)"!is) {
	$c = Text::Iconv->new($1, $cp);
}

$html =~ s/\s+LANG="[^"]+"//gi;
$html =~ s/<\/*FONT[^>]*>//gi;

# Decode numeric character references to single characters, except:
#  - " & < > (34, 38, 60, 62): decoding them would inject raw markup;
#  - code points above 255: they do not fit in one byte and would turn
#    $html into a wide-character string, so they stay as entities.
# NOTE(review): values 128-255 are still emitted as raw bytes that are
# then interpreted in the *source* charset, as before -- confirm that is
# what the exporting application intends.
my %markup_char = map { $_ => 1 } (34, 38, 60, 62);
$html =~ s/&#(\d+);/($1 > 255 || $markup_char{$1 + 0}) ? "&#$1;" : chr($1)/ge;

$html =~ s/\s+STYLE="margin-bottom: 0in"//gi;
$html =~ s/\s+STYLE="line-height: 100%"//gi;
$html =~ s/<(SDFIELD)[^>]*><\/\1>//gi;

# Drop individual properties from inside STYLE attributes, then remove
# STYLE attributes that ended up empty.
$html =~ s/(STYLE="[^"]*)text-indent:\s+\d+cm(;\s+)*/$1/gi;
$html =~ s/(STYLE="[^"]*)line-height:\s+\d+%;*/$1/gi;
$html =~ s/(STYLE="[^"]*)widows:\s+\d+;*/$1/gi;
$html =~ s/(STYLE="[^"]*)orphans:\s+\d+;*/$1/gi;
$html =~ s/STYLE="\s*"\s*//gi;

# remove excessive empty lines
$html =~ s,<p[^>]*>(?:\s*<br>\s*)*</p>,,gi;

# Remove width attributes from tables and cells. The optional '%' makes
# percentage widths (width="50%") disappear completely instead of leaving
# a stray %" behind in the tag.
$html =~ s,(<(?:table|td)[^>]*)width="*\d+%?"*,$1,gi;

if ($c) {
	# Test definedness, not truth: "" and "0" are valid conversion results.
	my $converted = $c->convert($html);
	die "can't convert codepage!" unless defined $converted;
	$html = $converted;
}

# Replace the caron entities with their ISO-8859-2 bytes (s-caron is
# 0xB9, S-caron is 0xA9). Escapes are used instead of literal characters
# so the output does not depend on the encoding this file is saved in.
$html =~ s/&scaron;/\xB9/g;
$html =~ s/&Scaron;/\xA9/g;

# Make the charset declaration match the new encoding.
$html =~ s,"(text/html;\s+charset=)\S+?","$1$cp",gi;

print $html;