1 |
#!/usr/bin/perl |
2 |
# -*- Mode: Perl -*- |
3 |
# $Basename: HTML.pm $ |
4 |
# $Revision: 1.11 $ |
5 |
# Author : Ulrich Pfeifer with Andreas König |
6 |
# Created On : Sat Nov 1 1997 |
7 |
# Last Modified By: Ulrich Pfeifer |
8 |
# Last Modified On: Fri Jan 4 16:06:14 2002 |
9 |
# Language : CPerl |
10 |
# Update Count : 14 |
11 |
# Status : Unknown, Use with caution! |
12 |
# |
13 |
# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved. |
14 |
# |
15 |
# |
16 |
|
17 |
package WAIT::Parse::Ora; |
18 |
use base qw(WAIT::Parse::Base); |
19 |
|
20 |
use HTML::Parser; |
21 |
use Encode; |
22 |
use strict; |
23 |
|
24 |
|
25 |
=pod |
26 |
|
27 |
Text from 2002-03-05 is structured with <div> tags as follows: |
28 |
|
29 |
index.html: |
30 |
<div id="biblio"> BIBLIOGRAPHISCHE ANGABEN |
31 |
<div id="short_desc"> KURZE BESCHREIBUNG |
32 |
|
33 |
desc.html: |
34 |
<div id="long_desc"> AUSFUEHRLICHE BESCHREIBUNG |
35 |
|
36 |
author.html: |
37 |
<div id="author_bio"> BIOGRAPHIE DES AUTOREN |
38 |
|
39 |
translator.html: |
40 |
<div id="translator_bio"> BIOGRAPHIE DES UEBERSETZERS |
41 |
|
42 |
=cut |
43 |
|
44 |
my $debug = 0; |
45 |
|
46 |
my $globalp = HTML::Parser->new( |
47 |
api_version => 3, |
48 |
start_h => [\&handle_start, "tagname, attr"], |
49 |
end_h => [\&handle_end, "tagname"], |
50 |
text_h => [\&handle_text, "dtext"], |
51 |
marked_sections => 1, |
52 |
); |
53 |
my %result; |
54 |
my $text; |
55 |
my $open; |
56 |
my $div; |
57 |
|
58 |
sub initialize_text { |
59 |
if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) { |
60 |
$text = "\x{100}"; |
61 |
} else { |
62 |
$text = ""; |
63 |
} |
64 |
} |
65 |
|
66 |
sub finished_text { |
67 |
if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) { |
68 |
$text =~ s/^\x{100}//; |
69 |
} |
70 |
$text =~ s/^\s+//; |
71 |
$text =~ s/\s+$//; |
72 |
$text =~ s/\s+/ /g; |
73 |
$text; |
74 |
} |
75 |
|
76 |
sub handle_start { |
77 |
my $tag = shift; |
78 |
my $attr = shift; |
79 |
|
80 |
return unless $tag eq "div"; |
81 |
$div = $attr->{id}; |
82 |
utf8::upgrade($div) if oreilly_de_catalog::config::UPGRADE_DIV(); |
83 |
$open++; |
84 |
print ">" x $open, $tag, "\n" if $debug; |
85 |
} |
86 |
|
87 |
sub handle_end { |
88 |
my $tag = shift; |
89 |
|
90 |
return unless $tag eq "div"; |
91 |
print "<" x $open, $tag, "\n" if $debug; |
92 |
$open--; |
93 |
return unless $div; |
94 |
if (defined $result{$div}){ |
95 |
$result{$div} .= " " . finished_text(); |
96 |
} else { |
97 |
$result{$div} = finished_text(); |
98 |
} |
99 |
initialize_text(); |
100 |
} |
101 |
|
102 |
|
103 |
sub handle_text { |
104 |
my $c = shift; |
105 |
$text .= $c if $open; |
106 |
} |
107 |
|
108 |
|
109 |
# WAIT::Parse::Ora::my_parse |
110 |
sub my_parse ($) { |
111 |
my($s) = @_; |
112 |
my $ls; |
113 |
if (oreilly_de_catalog::config::CHARACTER_STATS() ) { |
114 |
use charnames ":full"; |
115 |
my %seen; |
116 |
while ( $s =~ /([^\000-\177])/g ) { |
117 |
my $c = $1; |
118 |
$seen{$c}++; |
119 |
} |
120 |
for my $c (sort {$seen{$a} <=> $seen{$b} } keys %seen) { |
121 |
my $ord = ord $c; |
122 |
my $charname = charnames::viacode($ord); |
123 |
# printf "CS: %5d %4d %s %s\n", $ord, $seen{$c}, $c, $charname; |
124 |
printf "CS: %5x %4d %s\n", $ord, $seen{$c}, $charname||"[undef]"; |
125 |
} |
126 |
} |
127 |
if (oreilly_de_catalog::config::ENTITY_STATS() ) { |
128 |
use charnames ":full"; |
129 |
my %seen; |
130 |
while ( $s =~ /(&#(\d+|x[\da-f]+);?)/ig ) { |
131 |
my($full) = $1; |
132 |
$seen{$full}++; |
133 |
} |
134 |
for my $full (sort {$seen{$a} <=> $seen{$b} } keys %seen) { |
135 |
my($n) = $full =~ /&#(\d+|x[\da-f]+)/i; |
136 |
# die "full[$full]" unless defined $n; |
137 |
my $ord = $n =~ s/^x//i ? hex $n : $n; |
138 |
my $charname = charnames::viacode($ord); |
139 |
# printf "CS: %5d %4d %s %s\n", $ord, $seen{$c}, $c, $charname; |
140 |
printf "ES: %-12s %5x %4d %s\n", $full, $ord, $seen{$full}, $charname||"[undef]"; |
141 |
} |
142 |
} |
143 |
|
144 |
if (oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE()) { |
145 |
warn "Warning: this HTML::Parser has Unicode support on" |
146 |
if HTML::Entities::UNICODE_SUPPORT(); |
147 |
$ls = Encode::encode("ISO-8859-1", $s, 1); |
148 |
# HTML::Parser 3.25, 3.26 returns **mostly** LATIN for entities. |
149 |
# We reduce the amount of mixed-encoding content to just a few |
150 |
# punctuation characters when we work with Latin here. |
151 |
|
152 |
} elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER() ) { |
153 |
$ls = Encode::encode_utf8($s); |
154 |
} elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) { |
155 |
$ls = $s; |
156 |
$ls =~ s/([^\000-\177])/ "&#" . ord($1) . ";" /ge; |
157 |
utf8::downgrade($ls); # didn't improve the coredumpness |
158 |
die "ls[$ls] not 7bit clean" unless $ls =~ /^[\000-\177]*$/; |
159 |
# warn "ls[$ls]"; |
160 |
} else { |
161 |
$ls = $s; |
162 |
} |
163 |
%result = (); |
164 |
initialize_text(); |
165 |
$open = 0; |
166 |
$div = undef; |
167 |
if (0) { # XXX probieren ueber probieren wg Entities und UTF-8 |
168 |
# code that directly deals with $s because it doesn't want the |
169 |
# conversion to $ls (latin1) |
170 |
|
171 |
$s =~ s/\s+/ /g; # die CR nerven noch mehr als die LF |
172 |
my $pre_s = $s; |
173 |
# $s =~ s/™//g; |
174 |
# $s =~ s/\302\255//g; # 2.8 pounds in |
175 |
if ( 0 && $HTML::Parser::VERSION == 3.26 ) { |
176 |
# Should be handled by HTML::Entities, numeric entities and |
177 |
# HTML::Entities and -DUNICODE_ENTITIES => core dump |
178 |
|
179 |
my $saidinfo = 0; |
180 |
local $| = 1; |
181 |
while ( $s =~ s/\&\#(\d+)\;/chr($1)/e ) { |
182 |
print "Info:" unless $saidinfo++; |
183 |
print " &#$1;"; |
184 |
} |
185 |
print "\n" if $saidinfo; |
186 |
|
187 |
} |
188 |
} |
189 |
if (oreilly_de_catalog::config::DISPOSE_PARSER_EACH_TIME() ) { |
190 |
my $p = HTML::Parser->new( |
191 |
api_version => 3, |
192 |
start_h => [\&handle_start, "tagname, attr"], |
193 |
end_h => [\&handle_end, "tagname"], |
194 |
text_h => [\&handle_text, "dtext"], |
195 |
marked_sections => 1, |
196 |
); |
197 |
$p->parse($ls); |
198 |
$p->eof; |
199 |
} else { |
200 |
$globalp->parse($ls); |
201 |
$globalp->eof; |
202 |
} |
203 |
if (0) { # XXX |
204 |
# code that tries to postprocess the nonsense resulting from the above |
205 |
while (my($k,$v) = each %result) { |
206 |
next unless defined($v) && length($v); |
207 |
next if Encode::is_utf8($v); |
208 |
next unless $v =~ /[^\040-\177]/; |
209 |
# Wenn UTF-8 und nicht-UTF-8 gemischt sind, sind wir erledigt |
210 |
my $utf8v; |
211 |
if (HTML::Entities::UNICODE_SUPPORT()) { |
212 |
if (0) { |
213 |
# klappt nicht == 2002-04-02 |
214 |
$utf8v = Encode::decode("ISO-8859-1",$v); |
215 |
} elsif (1) { |
216 |
$utf8v = $v; |
217 |
# fuehrt zu "unexpected downgraded strings" und die haben dann |
218 |
# noch ein UTF-8 Teile, die nicht als solche markiert sind |
219 |
} |
220 |
} else { |
221 |
# Want to find out which condition we need to watch |
222 |
if ($HTML::Parser::VERSION != 3.26) { |
223 |
# klappt nicht == 2002-04-02 |
224 |
$utf8v = Encode::decode("ISO-8859-1",$v); |
225 |
} else { |
226 |
# klappt nicht == 2002-04-02 |
227 |
$utf8v = $v; |
228 |
Encode::_utf8_on($utf8v); |
229 |
Encode::is_utf8($utf8v, 1) or die "Not UTF8 [$utf8v]"; |
230 |
} |
231 |
} |
232 |
$result{$k} = $utf8v; |
233 |
} |
234 |
} |
235 |
if ( oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE() ) { |
236 |
while (my($k,$v) = each %result) { |
237 |
next unless defined($v) && length($v); |
238 |
my $utf8v = Encode::decode("ISO-8859-1",$v); |
239 |
$result{$k} = $utf8v; |
240 |
} |
241 |
} elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER()) { |
242 |
while (my($k,$v) = each %result) { |
243 |
next unless defined($v) && length($v); |
244 |
my $utf8v = Encode::decode_utf8($v); |
245 |
$result{$k} = $utf8v; |
246 |
} |
247 |
} elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) { |
248 |
while (my($k,$v) = each %result) { |
249 |
next unless defined($v) && length($v); |
250 |
utf8::upgrade($v); |
251 |
$result{$k} = $v; |
252 |
} |
253 |
} |
254 |
} |
255 |
|
256 |
sub split { |
257 |
my ($self, $doc) = @_; |
258 |
my %doc = map { $_ => "" } qw(isbn author aboutauthor chapter |
259 |
translator abouttranslator colophon |
260 |
abstract title subtitle title_orig toc inx); |
261 |
|
262 |
if ($doc->{author}) { |
263 |
my_parse($doc->{author}); |
264 |
$doc{aboutauthor} = $result{author_bio}; |
265 |
} |
266 |
if ($doc->{translator}) { |
267 |
my_parse($doc->{translator}); |
268 |
$doc{abouttranslator} = $result{translator_bio}; |
269 |
} |
270 |
if ($doc->{index}) { |
271 |
my_parse($doc->{index}); |
272 |
$doc{abstract} = $result{short_desc}; |
273 |
$doc{isbn} = $result{isbn}; |
274 |
$doc{author} = $result{author_names} || ""; |
275 |
$doc{translator} = $result{translator_names}; |
276 |
$doc{title} = $result{title}; |
277 |
$doc{subtitle} = $result{subtitle}; |
278 |
$doc{title_orig} = $result{title_orig}; |
279 |
} |
280 |
if ($doc->{chapter}) { |
281 |
my $content = $doc->{chapter}; |
282 |
my $bs; |
283 |
$bs++ if $content =~ s/^.*?<!--\s*sample chapter (begins (here )?)?-->//si; |
284 |
my $es; |
285 |
$es++ if $content =~ s/<!--\s*(End of )?sample chapter (ends here )?-->.*//si; |
286 |
unless ($bs){ |
287 |
$content =~ s/^.*?<h1/<h1/si; |
288 |
} |
289 |
unless ($es){ |
290 |
$content =~ s/<HR.*//si; |
291 |
} |
292 |
$content =~ s/^/<div id="chapter">/; |
293 |
$content .= "</div>\n"; |
294 |
my_parse($content); |
295 |
$doc{chapter} = $result{chapter}; |
296 |
} |
297 |
if ($doc->{colophon}) { |
298 |
my_parse($doc->{colophon}); |
299 |
my $s = $doc{colophon} = $result{colophon}; |
300 |
# use Devel::Peek; |
301 |
# Devel::Peek::Dump($s); |
302 |
} |
303 |
if ($doc->{toc}) { |
304 |
my_parse($doc->{toc}); |
305 |
if (my $s = $result{book_toc}) { |
306 |
# $s =~ s/<BR>/ /ig; # very wrong! if we have <BR> here, it was <BR> |
307 |
$s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl |
308 |
$s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einleitung 1.1 Zugriff |
309 |
$s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi |
310 |
$doc{toc} = $s; |
311 |
} else { |
312 |
warn "toc contains no \$s?"; |
313 |
} |
314 |
} |
315 |
if ($doc->{inx}) { |
316 |
my_parse($doc->{inx}); |
317 |
my $s = $result{book_inx} || ""; |
318 |
# $s =~ s/<BR>/ /ig; # wrong!, see above |
319 |
$s =~ s/&#(8211);/-/g; |
320 |
$s =~ s/&#(8220);/"/g; |
321 |
$s =~ s/&#(8222);/"/g; |
322 |
$s =~ s/&#(8217);/'/g; |
323 |
$s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl |
324 |
$s =~ s/\s*,\s+/ /g; # Komma |
325 |
1 while $s =~ s/\s\d+-\d+\s/ /g; # Seitenangaben (nicht aber das 234 aus ê) |
326 |
1 while $s =~ s/\s\d+\s/ /g; # Seitenangaben |
327 |
$s =~ s/(\w+)\( \)/$1()/g; # functions in the index |
328 |
$doc{inx} = $s; |
329 |
} |
330 |
if ($doc->{desc}) { |
331 |
my_parse($doc->{desc}); |
332 |
$doc{desc} = $result{long_desc}; |
333 |
} |
334 |
|
335 |
if (0) { |
336 |
# we did really convert the stuff we just read in to UTF8 |
337 |
# (although WAIT::Document::Ora::conv_getline converts to UTF8 |
338 |
# itself), because my_parse did the conversion back to latin1. |
339 |
# This nonsense must stop. All routines must get and give UTF-8. |
340 |
# If they want to process something else internally, they must |
341 |
# convert twice |
342 |
while (my($k,$v) = each %doc) { |
343 |
next unless defined($v) && length($v); |
344 |
my $utf8v = Encode::decode("ISO-8859-1",$v); |
345 |
$doc{$k} = $utf8v; |
346 |
} |
347 |
} |
348 |
# warn "ALERT: No author" unless $doc{author}; |
349 |
|
350 |
return \%doc; |
351 |
} |
352 |
|
353 |
1; |