/[wait]/branches/unido/lib/WAIT/Parse/Ora.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /branches/unido/lib/WAIT/Parse/Ora.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 106 - (show annotations)
Tue Jul 13 12:22:09 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 10573 byte(s)
Changes made by Andreas J. Koenig <andreas.koenig(at)anima.de> for Unido project

1 #!/usr/bin/perl
2 # -*- Mode: Perl -*-
3 # $Basename: HTML.pm $
4 # $Revision: 1.11 $
5 # Author : Ulrich Pfeifer with Andreas König
6 # Created On : Sat Nov 1 1997
7 # Last Modified By: Ulrich Pfeifer
8 # Last Modified On: Fri Jan 4 16:06:14 2002
9 # Language : CPerl
10 # Update Count : 14
11 # Status : Unknown, Use with caution!
12 #
13 # (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
14 #
15 #
16
17 package WAIT::Parse::Ora;
18 use base qw(WAIT::Parse::Base);
19
20 use HTML::Parser;
21 use Encode;
22 use strict;
23
24
25 =pod
26
27 Text from 2002-03-05 is structured with <div> tags as follows:
28
29 index.html:
30 <div id="biblio"> BIBLIOGRAPHISCHE ANGABEN
31 <div id="short_desc"> KURZE BESCHREIBUNG
32
33 desc.html:
34 <div id="long_desc"> AUSFUEHRLICHE BESCHREIBUNG
35
36 author.html:
37 <div id="author_bio"> BIOGRAPHIE DES AUTOREN
38
39 translator.html:
40 <div id="translator_bio"> BIOGRAPHIE DES UEBERSETZERS
41
42 =cut
43
44 my $debug = 0;
45
46 my $globalp = HTML::Parser->new(
47 api_version => 3,
48 start_h => [\&handle_start, "tagname, attr"],
49 end_h => [\&handle_end, "tagname"],
50 text_h => [\&handle_text, "dtext"],
51 marked_sections => 1,
52 );
53 my %result;
54 my $text;
55 my $open;
56 my $div;
57
58 sub initialize_text {
59 if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) {
60 $text = "\x{100}";
61 } else {
62 $text = "";
63 }
64 }
65
66 sub finished_text {
67 if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) {
68 $text =~ s/^\x{100}//;
69 }
70 $text =~ s/^\s+//;
71 $text =~ s/\s+$//;
72 $text =~ s/\s+/ /g;
73 $text;
74 }
75
76 sub handle_start {
77 my $tag = shift;
78 my $attr = shift;
79
80 return unless $tag eq "div";
81 $div = $attr->{id};
82 utf8::upgrade($div) if oreilly_de_catalog::config::UPGRADE_DIV();
83 $open++;
84 print ">" x $open, $tag, "\n" if $debug;
85 }
86
87 sub handle_end {
88 my $tag = shift;
89
90 return unless $tag eq "div";
91 print "<" x $open, $tag, "\n" if $debug;
92 $open--;
93 return unless $div;
94 if (defined $result{$div}){
95 $result{$div} .= " " . finished_text();
96 } else {
97 $result{$div} = finished_text();
98 }
99 initialize_text();
100 }
101
102
103 sub handle_text {
104 my $c = shift;
105 $text .= $c if $open;
106 }
107
108
109 # WAIT::Parse::Ora::my_parse
110 sub my_parse ($) {
111 my($s) = @_;
112 my $ls;
113 if (oreilly_de_catalog::config::CHARACTER_STATS() ) {
114 use charnames ":full";
115 my %seen;
116 while ( $s =~ /([^\000-\177])/g ) {
117 my $c = $1;
118 $seen{$c}++;
119 }
120 for my $c (sort {$seen{$a} <=> $seen{$b} } keys %seen) {
121 my $ord = ord $c;
122 my $charname = charnames::viacode($ord);
123 # printf "CS: %5d %4d %s %s\n", $ord, $seen{$c}, $c, $charname;
124 printf "CS: %5x %4d %s\n", $ord, $seen{$c}, $charname||"[undef]";
125 }
126 }
127 if (oreilly_de_catalog::config::ENTITY_STATS() ) {
128 use charnames ":full";
129 my %seen;
130 while ( $s =~ /(&#(\d+|x[\da-f]+);?)/ig ) {
131 my($full) = $1;
132 $seen{$full}++;
133 }
134 for my $full (sort {$seen{$a} <=> $seen{$b} } keys %seen) {
135 my($n) = $full =~ /&#(\d+|x[\da-f]+)/i;
136 # die "full[$full]" unless defined $n;
137 my $ord = $n =~ s/^x//i ? hex $n : $n;
138 my $charname = charnames::viacode($ord);
139 # printf "CS: %5d %4d %s %s\n", $ord, $seen{$c}, $c, $charname;
140 printf "ES: %-12s %5x %4d %s\n", $full, $ord, $seen{$full}, $charname||"[undef]";
141 }
142 }
143
144 if (oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE()) {
145 warn "Warning: this HTML::Parser has Unicode support on"
146 if HTML::Entities::UNICODE_SUPPORT();
147 $ls = Encode::encode("ISO-8859-1", $s, 1);
148 # HTML::Parser 3.25, 3.26 returns **mostly** LATIN for entities.
149 # We reduce the amount of mixed-encoding content to just a few
150 # punctuation characters when we work with Latin here.
151
152 } elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER() ) {
153 $ls = Encode::encode_utf8($s);
154 } elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) {
155 $ls = $s;
156 $ls =~ s/([^\000-\177])/ "&#" . ord($1) . ";" /ge;
157 utf8::downgrade($ls); # didn't improve the coredumpness
158 die "ls[$ls] not 7bit clean" unless $ls =~ /^[\000-\177]*$/;
159 # warn "ls[$ls]";
160 } else {
161 $ls = $s;
162 }
163 %result = ();
164 initialize_text();
165 $open = 0;
166 $div = undef;
167 if (0) { # XXX probieren ueber probieren wg Entities und UTF-8
168 # code that directly deals with $s because it doesn't want the
169 # conversion to $ls (latin1)
170
171 $s =~ s/\s+/ /g; # die CR nerven noch mehr als die LF
172 my $pre_s = $s;
173 # $s =~ s/&#153;//g;
174 # $s =~ s/\302\255//g; # 2.8 pounds in
175 if ( 0 && $HTML::Parser::VERSION == 3.26 ) {
176 # Should be handled by HTML::Entities, numeric entities and
177 # HTML::Entities and -DUNICODE_ENTITIES => core dump
178
179 my $saidinfo = 0;
180 local $| = 1;
181 while ( $s =~ s/\&\#(\d+)\;/chr($1)/e ) {
182 print "Info:" unless $saidinfo++;
183 print " &#$1;";
184 }
185 print "\n" if $saidinfo;
186
187 }
188 }
189 if (oreilly_de_catalog::config::DISPOSE_PARSER_EACH_TIME() ) {
190 my $p = HTML::Parser->new(
191 api_version => 3,
192 start_h => [\&handle_start, "tagname, attr"],
193 end_h => [\&handle_end, "tagname"],
194 text_h => [\&handle_text, "dtext"],
195 marked_sections => 1,
196 );
197 $p->parse($ls);
198 $p->eof;
199 } else {
200 $globalp->parse($ls);
201 $globalp->eof;
202 }
203 if (0) { # XXX
204 # code that tries to postprocess the nonsense resulting from the above
205 while (my($k,$v) = each %result) {
206 next unless defined($v) && length($v);
207 next if Encode::is_utf8($v);
208 next unless $v =~ /[^\040-\177]/;
209 # Wenn UTF-8 und nicht-UTF-8 gemischt sind, sind wir erledigt
210 my $utf8v;
211 if (HTML::Entities::UNICODE_SUPPORT()) {
212 if (0) {
213 # klappt nicht == 2002-04-02
214 $utf8v = Encode::decode("ISO-8859-1",$v);
215 } elsif (1) {
216 $utf8v = $v;
217 # fuehrt zu "unexpected downgraded strings" und die haben dann
218 # noch ein UTF-8 Teile, die nicht als solche markiert sind
219 }
220 } else {
221 # Want to find out which condition we need to watch
222 if ($HTML::Parser::VERSION != 3.26) {
223 # klappt nicht == 2002-04-02
224 $utf8v = Encode::decode("ISO-8859-1",$v);
225 } else {
226 # klappt nicht == 2002-04-02
227 $utf8v = $v;
228 Encode::_utf8_on($utf8v);
229 Encode::is_utf8($utf8v, 1) or die "Not UTF8 [$utf8v]";
230 }
231 }
232 $result{$k} = $utf8v;
233 }
234 }
235 if ( oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE() ) {
236 while (my($k,$v) = each %result) {
237 next unless defined($v) && length($v);
238 my $utf8v = Encode::decode("ISO-8859-1",$v);
239 $result{$k} = $utf8v;
240 }
241 } elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER()) {
242 while (my($k,$v) = each %result) {
243 next unless defined($v) && length($v);
244 my $utf8v = Encode::decode_utf8($v);
245 $result{$k} = $utf8v;
246 }
247 } elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) {
248 while (my($k,$v) = each %result) {
249 next unless defined($v) && length($v);
250 utf8::upgrade($v);
251 $result{$k} = $v;
252 }
253 }
254 }
255
256 sub split {
257 my ($self, $doc) = @_;
258 my %doc = map { $_ => "" } qw(isbn author aboutauthor chapter
259 translator abouttranslator colophon
260 abstract title subtitle title_orig toc inx);
261
262 if ($doc->{author}) {
263 my_parse($doc->{author});
264 $doc{aboutauthor} = $result{author_bio};
265 }
266 if ($doc->{translator}) {
267 my_parse($doc->{translator});
268 $doc{abouttranslator} = $result{translator_bio};
269 }
270 if ($doc->{index}) {
271 my_parse($doc->{index});
272 $doc{abstract} = $result{short_desc};
273 $doc{isbn} = $result{isbn};
274 $doc{author} = $result{author_names} || "";
275 $doc{translator} = $result{translator_names};
276 $doc{title} = $result{title};
277 $doc{subtitle} = $result{subtitle};
278 $doc{title_orig} = $result{title_orig};
279 }
280 if ($doc->{chapter}) {
281 my $content = $doc->{chapter};
282 my $bs;
283 $bs++ if $content =~ s/^.*?<!--\s*sample chapter (begins (here )?)?-->//si;
284 my $es;
285 $es++ if $content =~ s/<!--\s*(End of )?sample chapter (ends here )?-->.*//si;
286 unless ($bs){
287 $content =~ s/^.*?<h1/<h1/si;
288 }
289 unless ($es){
290 $content =~ s/<HR.*//si;
291 }
292 $content =~ s/^/<div id="chapter">/;
293 $content .= "</div>\n";
294 my_parse($content);
295 $doc{chapter} = $result{chapter};
296 }
297 if ($doc->{colophon}) {
298 my_parse($doc->{colophon});
299 my $s = $doc{colophon} = $result{colophon};
300 # use Devel::Peek;
301 # Devel::Peek::Dump($s);
302 }
303 if ($doc->{toc}) {
304 my_parse($doc->{toc});
305 if (my $s = $result{book_toc}) {
306 # $s =~ s/<BR>/ /ig; # very wrong! if we have <BR> here, it was &lt;BR&gt;
307 $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
308 $s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einleitung 1.1 Zugriff
309 $s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi
310 $doc{toc} = $s;
311 } else {
312 warn "toc contains no \$s?";
313 }
314 }
315 if ($doc->{inx}) {
316 my_parse($doc->{inx});
317 my $s = $result{book_inx} || "";
318 # $s =~ s/<BR>/ /ig; # wrong!, see above
319 $s =~ s/&#(8211);/-/g;
320 $s =~ s/&#(8220);/"/g;
321 $s =~ s/&#(8222);/"/g;
322 $s =~ s/&#(8217);/'/g;
323 $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
324 $s =~ s/\s*,\s+/ /g; # Komma
325 1 while $s =~ s/\s\d+-\d+\s/ /g; # Seitenangaben (nicht aber das 234 aus &#234;)
326 1 while $s =~ s/\s\d+\s/ /g; # Seitenangaben
327 $s =~ s/(\w+)\( \)/$1()/g; # functions in the index
328 $doc{inx} = $s;
329 }
330 if ($doc->{desc}) {
331 my_parse($doc->{desc});
332 $doc{desc} = $result{long_desc};
333 }
334
335 if (0) {
336 # we did really convert the stuff we just read in to UTF8
337 # (although WAIT::Document::Ora::conv_getline converts to UTF8
338 # itself), because my_parse did the conversion back to latin1.
339 # This nonsense must stop. All routines must get and give UTF-8.
340 # If they want to process something else internally, they must
341 # convert twice
342 while (my($k,$v) = each %doc) {
343 next unless defined($v) && length($v);
344 my $utf8v = Encode::decode("ISO-8859-1",$v);
345 $doc{$k} = $utf8v;
346 }
347 }
348 # warn "ALERT: No author" unless $doc{author};
349
350 return \%doc;
351 }
352
353 1;

  ViewVC Help
Powered by ViewVC 1.1.26