WAIT/Parse/Ora.pm

#!/usr/bin/perl
#                              -*- Mode: Perl -*- 
# $Basename: HTML.pm $
# $Revision: 1.11 $
# Author          : Ulrich Pfeifer with Andreas König
# Created On      : Sat Nov 1 1997
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Fri Jan  4 16:06:14 2002
# Language        : CPerl
# Update Count    : 14
# Status          : Unknown, Use with caution!
# 
# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
# 
# 

package WAIT::Parse::Ora;
use base qw(WAIT::Parse::Base);

use HTML::Parser;
use Encode;
use strict;


=pod

Text from 2002-03-05 is structured with <div> tags as follows:

  index.html:
  <div id="biblio"> BIBLIOGRAPHISCHE ANGABEN
  <div id="short_desc"> KURZE BESCHREIBUNG

  desc.html:
  <div id="long_desc"> AUSFUEHRLICHE BESCHREIBUNG

  author.html:
  <div id="author_bio"> BIOGRAPHIE DES AUTOREN

  translator.html:
  <div id="translator_bio"> BIOGRAPHIE DES UEBERSETZERS

=cut

my $debug = 0;

my $globalp = HTML::Parser->new(
                                api_version => 3,
                                start_h => [\&handle_start, "tagname, attr"],
                                end_h   => [\&handle_end,   "tagname"],
                                text_h  => [\&handle_text,  "dtext"],
                                marked_sections => 1,
                               );
my %result;
my $text;
my $open;
my $div;

sub initialize_text {
  if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) {
    $text = "\x{100}";
  } else {
    $text = "";
  }
}

sub finished_text {
  if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) {
    $text =~ s/^\x{100}//;
  }
  $text =~ s/^\s+//;
  $text =~ s/\s+$//;
  $text =~ s/\s+/ /g;
  $text;
}

sub handle_start {
  my $tag = shift;
  my $attr = shift;

  return unless $tag eq "div";
  $div = $attr->{id};
  utf8::upgrade($div) if oreilly_de_catalog::config::UPGRADE_DIV();
  $open++;
  print ">" x $open, $tag,  "\n" if $debug;
}

sub handle_end {
  my $tag = shift;

  return unless $tag eq "div";
  print "<" x $open, $tag,  "\n" if $debug;
  $open--;
  return unless $div;
  if (defined $result{$div}){
    $result{$div} .= " " . finished_text();
  } else {
    $result{$div} = finished_text();
  }
  initialize_text();
}


sub handle_text {
  my $c = shift;
  $text .= $c if $open;
}


# WAIT::Parse::Ora::my_parse
sub my_parse ($) {
  my($s) = @_;
  my $ls;
  if (oreilly_de_catalog::config::CHARACTER_STATS() ) {
    use charnames ":full";
    my %seen;
    while ( $s =~ /([^\000-\177])/g ) {
      my $c = $1;
      $seen{$c}++;
    }
    for my $c (sort {$seen{$a} <=> $seen{$b} } keys %seen) {
      my $ord = ord $c;
      my $charname = charnames::viacode($ord);
      # printf "CS: %5d %4d %s %s\n", $ord, $seen{$c}, $c, $charname;
      printf "CS: %5x %4d %s\n", $ord, $seen{$c}, $charname||"[undef]";
    }
  }
  if (oreilly_de_catalog::config::ENTITY_STATS() ) {
    use charnames ":full";
    my %seen;
    while ( $s =~ /(&#(\d+|x[\da-f]+);?)/ig ) {
      my($full) = $1;
      $seen{$full}++;
    }
    for my $full (sort {$seen{$a} <=> $seen{$b} } keys %seen) {
      my($n) = $full =~ /&#(\d+|x[\da-f]+)/i;
      # die "full[$full]" unless defined $n;
      my $ord = $n =~ s/^x//i ? hex $n : $n;
      my $charname = charnames::viacode($ord);
      # printf "CS: %5d %4d %s %s\n", $ord, $seen{$c}, $c, $charname;
      printf "ES: %-12s %5x %4d %s\n", $full, $ord, $seen{$full}, $charname||"[undef]";
    }
  }

  if (oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE()) {
    warn "Warning: this HTML::Parser has Unicode support on"
        if HTML::Entities::UNICODE_SUPPORT();
    $ls = Encode::encode("ISO-8859-1", $s, 1);
    # HTML::Parser 3.25, 3.26 returns **mostly** LATIN for entities.
    # We reduce the amount of mixed-encoding content to just a few
    # punctuation characters when we work with Latin here.

  } elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER() ) {
    $ls = Encode::encode_utf8($s);
  } elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) {
    $ls = $s;
    $ls =~ s/([^\000-\177])/ "&#" . ord($1) . ";" /ge;
    utf8::downgrade($ls); # didn't improve the coredumpness
    die "ls[$ls] not 7bit clean" unless $ls =~ /^[\000-\177]*$/;
    # warn "ls[$ls]";
  } else {
    $ls = $s;
  }
  %result = ();
  initialize_text();
  $open = 0;
  $div = undef;
  if (0) { # XXX probieren ueber probieren wg Entities und UTF-8
    # code that directly deals with $s because it doesn't want the
    # conversion to $ls (latin1)

    $s =~ s/\s+/ /g; # die CR nerven noch mehr als die LF
    my $pre_s = $s;
    # $s =~ s/&#153;//g;
    # $s =~ s/\302\255//g; # 2.8 pounds in 
    if ( 0 && $HTML::Parser::VERSION == 3.26 ) {
      # Should be handled by HTML::Entities, numeric entities and
      # HTML::Entities and -DUNICODE_ENTITIES => core dump

      my $saidinfo = 0;
      local $| = 1;
      while ( $s =~ s/\&\#(\d+)\;/chr($1)/e ) {
        print "Info:" unless $saidinfo++;
        print " &#$1;";
      }
      print "\n" if $saidinfo;

    }
  }
  if (oreilly_de_catalog::config::DISPOSE_PARSER_EACH_TIME() ) {
    my $p = HTML::Parser->new(
                              api_version => 3,
                              start_h => [\&handle_start, "tagname, attr"],
                              end_h   => [\&handle_end,   "tagname"],
                              text_h  => [\&handle_text,  "dtext"],
                              marked_sections => 1,
                             );
    $p->parse($ls);
    $p->eof;
  } else {
    $globalp->parse($ls);
    $globalp->eof;
  }
  if (0) { # XXX
    # code that tries to postprocess the nonsense resulting from the above
    while (my($k,$v) = each %result) {
      next unless defined($v) && length($v);
      next if Encode::is_utf8($v);
      next unless $v =~ /[^\040-\177]/;
      # Wenn UTF-8 und nicht-UTF-8 gemischt sind, sind wir erledigt
      my $utf8v;
      if (HTML::Entities::UNICODE_SUPPORT()) {
        if (0) {
          # klappt nicht == 2002-04-02
          $utf8v = Encode::decode("ISO-8859-1",$v);
        } elsif (1) {
          $utf8v = $v;
          # fuehrt zu "unexpected downgraded strings" und die haben dann
          # noch ein UTF-8 Teile, die nicht als solche markiert sind
        }
      } else {
        # Want to find out which condition we need to watch
        if ($HTML::Parser::VERSION != 3.26) {
          # klappt nicht == 2002-04-02
          $utf8v = Encode::decode("ISO-8859-1",$v);
        } else {
          # klappt nicht == 2002-04-02
          $utf8v = $v;
          Encode::_utf8_on($utf8v);
          Encode::is_utf8($utf8v, 1) or die "Not UTF8 [$utf8v]";
        }
      }
      $result{$k} = $utf8v;
    }
  }
  if ( oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE() ) {
    while (my($k,$v) = each %result) {
      next unless defined($v) && length($v);
      my $utf8v = Encode::decode("ISO-8859-1",$v);
      $result{$k} = $utf8v;
    }
  } elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER()) {
    while (my($k,$v) = each %result) {
      next unless defined($v) && length($v);
      my $utf8v = Encode::decode_utf8($v);
      $result{$k} = $utf8v;
    }
  } elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) {
    while (my($k,$v) = each %result) {
      next unless defined($v) && length($v);
      utf8::upgrade($v);
      $result{$k} = $v;
    }
  }
}

sub split {
  my ($self, $doc) = @_;
  my %doc = map { $_ => "" } qw(isbn author aboutauthor chapter
                                translator abouttranslator colophon
                                abstract title subtitle title_orig toc inx);

  if ($doc->{author}) {
    my_parse($doc->{author});
    $doc{aboutauthor}  = $result{author_bio};
  }
  if ($doc->{translator}) {
    my_parse($doc->{translator});
    $doc{abouttranslator}  = $result{translator_bio};
  }
  if ($doc->{index}) {
    my_parse($doc->{index});
    $doc{abstract} = $result{short_desc};
    $doc{isbn} = $result{isbn};
    $doc{author} = $result{author_names} || "";
    $doc{translator} = $result{translator_names};
    $doc{title} = $result{title};
    $doc{subtitle} = $result{subtitle};
    $doc{title_orig} = $result{title_orig};
  }
  if ($doc->{chapter}) {
    my $content = $doc->{chapter};
    my $bs;
    $bs++ if $content =~ s/^.*?<!--\s*sample chapter (begins (here )?)?-->//si;
    my $es;
    $es++ if $content =~ s/<!--\s*(End of )?sample chapter (ends here )?-->.*//si;
    unless ($bs){
      $content =~ s/^.*?<h1/<h1/si;
    }
    unless ($es){
      $content =~ s/<HR.*//si;
    }
    $content =~ s/^/<div id="chapter">/;
    $content .= "</div>\n";
    my_parse($content);
    $doc{chapter} = $result{chapter};
  }
  if ($doc->{colophon}) {
    my_parse($doc->{colophon});
    my $s = $doc{colophon} = $result{colophon};
    # use Devel::Peek;
    # Devel::Peek::Dump($s);
  }
  if ($doc->{toc}) {
    my_parse($doc->{toc});
    if (my $s = $result{book_toc}) {
      # $s =~ s/<BR>/ /ig; # very wrong! if we have <BR> here, it was &lt;BR&gt;
      $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
      $s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einleitung 1.1 Zugriff
      $s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi
      $doc{toc} = $s;
    } else {
      warn "toc contains no \$s?";
    }
  }
  if ($doc->{inx}) {
    my_parse($doc->{inx});
    my $s = $result{book_inx} || "";
    # $s =~ s/<BR>/ /ig; # wrong!, see above
    $s =~ s/&#(8211);/-/g;
    $s =~ s/&#(8220);/"/g;
    $s =~ s/&#(8222);/"/g;
    $s =~ s/&#(8217);/'/g;
    $s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
    $s =~ s/\s*,\s+/ /g; # Komma
    1 while $s =~ s/\s\d+-\d+\s/ /g; # Seitenangaben (nicht aber das 234 aus &#234;)
    1 while $s =~ s/\s\d+\s/ /g; # Seitenangaben
    $s =~ s/(\w+)\( \)/$1()/g; # functions in the index
    $doc{inx} = $s;
  }
  if ($doc->{desc}) {
    my_parse($doc->{desc});
    $doc{desc}  = $result{long_desc};
  }

  if (0) {
    # we did really convert the stuff we just read in to UTF8
    # (although WAIT::Document::Ora::conv_getline converts to UTF8
    # itself), because my_parse did the conversion back to latin1.
    # This nonsense must stop. All routines must get and give UTF-8.
    # If they want to process something else internally, they must
    # convert twice
    while (my($k,$v) = each %doc) {
      next unless defined($v) && length($v);
      my $utf8v = Encode::decode("ISO-8859-1",$v);
      $doc{$k} = $utf8v;
    }
  }
  # warn "ALERT: No author" unless $doc{author};

  return \%doc;
}

1;
1	#!/usr/bin/perl
2	# -- Mode: Perl --
3	# $Basename: HTML.pm $
4	# $Revision: 1.11 $
5	# Author : Ulrich Pfeifer with Andreas König
6	# Created On : Sat Nov 1 1997
7	# Last Modified By: Ulrich Pfeifer
8	# Last Modified On: Fri Jan 4 16:06:14 2002
9	# Language : CPerl
10	# Update Count : 14
11	# Status : Unknown, Use with caution!
12	#
13	# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
14	#
15	#
16
17	package WAIT::Parse::Ora;
18	use base qw(WAIT::Parse::Base);
19
20	use HTML::Parser;
21	use Encode;
22	use strict;
23
24
25	=pod
26
27	Text from 2002-03-05 is structured with <div> tags as follows:
28
29	index.html:
30	<div id="biblio"> BIBLIOGRAPHISCHE ANGABEN
31	<div id="short_desc"> KURZE BESCHREIBUNG
32
33	desc.html:
34	<div id="long_desc"> AUSFUEHRLICHE BESCHREIBUNG
35
36	author.html:
37	<div id="author_bio"> BIOGRAPHIE DES AUTOREN
38
39	translator.html:
40	<div id="translator_bio"> BIOGRAPHIE DES UEBERSETZERS
41
42	=cut
43
44	my $debug = 0;
45
46	my $globalp = HTML::Parser->new(
47	api_version => 3,
48	start_h => [\&handle_start, "tagname, attr"],
49	end_h => [\&handle_end, "tagname"],
50	text_h => [\&handle_text, "dtext"],
51	marked_sections => 1,
52	);
53	my %result;
54	my $text;
55	my $open;
56	my $div;
57
58	sub initialize_text {
59	if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) {
60	$text = "\x{100}";
61	} else {
62	$text = "";
63	}
64	}
65
66	sub finished_text {
67	if (oreilly_de_catalog::config::BRUTE_FORCE_UPGRADE() ) {
68	$text =~ s/^\x{100}//;
69	}
70	$text =~ s/^\s+//;
71	$text =~ s/\s+$//;
72	$text =~ s/\s+/ /g;
73	$text;
74	}
75
76	sub handle_start {
77	my $tag = shift;
78	my $attr = shift;
79
80	return unless $tag eq "div";
81	$div = $attr->{id};
82	utf8::upgrade($div) if oreilly_de_catalog::config::UPGRADE_DIV();
83	$open++;
84	print ">" x $open, $tag, "\n" if $debug;
85	}
86
87	sub handle_end {
88	my $tag = shift;
89
90	return unless $tag eq "div";
91	print "<" x $open, $tag, "\n" if $debug;
92	$open--;
93	return unless $div;
94	if (defined $result{$div}){
95	$result{$div} .= " " . finished_text();
96	} else {
97	$result{$div} = finished_text();
98	}
99	initialize_text();
100	}
101
102
103	sub handle_text {
104	my $c = shift;
105	$text .= $c if $open;
106	}
107
108
109	# WAIT::Parse::Ora::my_parse
110	sub my_parse ($) {
111	my($s) = @_;
112	my $ls;
113	if (oreilly_de_catalog::config::CHARACTER_STATS() ) {
114	use charnames ":full";
115	my %seen;
116	while ( $s =~ /([^\000-\177])/g ) {
117	my $c = $1;
118	$seen{$c}++;
119	}
120	for my $c (sort {$seen{$a} <=> $seen{$b} } keys %seen) {
121	my $ord = ord $c;
122	my $charname = charnames::viacode($ord);
123	# printf "CS: %5d %4d %s %s\n", $ord, $seen{$c}, $c, $charname;
124	printf "CS: %5x %4d %s\n", $ord, $seen{$c}, $charname\|\|"[undef]";
125	}
126	}
127	if (oreilly_de_catalog::config::ENTITY_STATS() ) {
128	use charnames ":full";
129	my %seen;
130	while ( $s =~ /(&#(\d+\|x[\da-f]+);?)/ig ) {
131	my($full) = $1;
132	$seen{$full}++;
133	}
134	for my $full (sort {$seen{$a} <=> $seen{$b} } keys %seen) {
135	my($n) = $full =~ /&#(\d+\|x[\da-f]+)/i;
136	# die "full[$full]" unless defined $n;
137	my $ord = $n =~ s/^x//i ? hex $n : $n;
138	my $charname = charnames::viacode($ord);
139	# printf "CS: %5d %4d %s %s\n", $ord, $seen{$c}, $c, $charname;
140	printf "ES: %-12s %5x %4d %s\n", $full, $ord, $seen{$full}, $charname\|\|"[undef]";
141	}
142	}
143
144	if (oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE()) {
145	warn "Warning: this HTML::Parser has Unicode support on"
146	if HTML::Entities::UNICODE_SUPPORT();
147	$ls = Encode::encode("ISO-8859-1", $s, 1);
148	# HTML::Parser 3.25, 3.26 returns mostly LATIN for entities.
149	# We reduce the amount of mixed-encoding content to just a few
150	# punctuation characters when we work with Latin here.
151
152	} elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER() ) {
153	$ls = Encode::encode_utf8($s);
154	} elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) {
155	$ls = $s;
156	$ls =~ s/([^\000-\177])/ "&#" . ord($1) . ";" /ge;
157	utf8::downgrade($ls); # didn't improve the coredumpness
158	die "ls[$ls] not 7bit clean" unless $ls =~ /^[\000-\177]*$/;
159	# warn "ls[$ls]";
160	} else {
161	$ls = $s;
162	}
163	%result = ();
164	initialize_text();
165	$open = 0;
166	$div = undef;
167	if (0) { # XXX probieren ueber probieren wg Entities und UTF-8
168	# code that directly deals with $s because it doesn't want the
169	# conversion to $ls (latin1)
170
171	$s =~ s/\s+/ /g; # die CR nerven noch mehr als die LF
172	my $pre_s = $s;
173	# $s =~ s///g;
174	# $s =~ s/\302\255//g; # 2.8 pounds in
175	if ( 0 && $HTML::Parser::VERSION == 3.26 ) {
176	# Should be handled by HTML::Entities, numeric entities and
177	# HTML::Entities and -DUNICODE_ENTITIES => core dump
178
179	my $saidinfo = 0;
180	local $\| = 1;
181	while ( $s =~ s/\&\#(\d+)\;/chr($1)/e ) {
182	print "Info:" unless $saidinfo++;
183	print " &#$1;";
184	}
185	print "\n" if $saidinfo;
186
187	}
188	}
189	if (oreilly_de_catalog::config::DISPOSE_PARSER_EACH_TIME() ) {
190	my $p = HTML::Parser->new(
191	api_version => 3,
192	start_h => [\&handle_start, "tagname, attr"],
193	end_h => [\&handle_end, "tagname"],
194	text_h => [\&handle_text, "dtext"],
195	marked_sections => 1,
196	);
197	$p->parse($ls);
198	$p->eof;
199	} else {
200	$globalp->parse($ls);
201	$globalp->eof;
202	}
203	if (0) { # XXX
204	# code that tries to postprocess the nonsense resulting from the above
205	while (my($k,$v) = each %result) {
206	next unless defined($v) && length($v);
207	next if Encode::is_utf8($v);
208	next unless $v =~ /[^\040-\177]/;
209	# Wenn UTF-8 und nicht-UTF-8 gemischt sind, sind wir erledigt
210	my $utf8v;
211	if (HTML::Entities::UNICODE_SUPPORT()) {
212	if (0) {
213	# klappt nicht == 2002-04-02
214	$utf8v = Encode::decode("ISO-8859-1",$v);
215	} elsif (1) {
216	$utf8v = $v;
217	# fuehrt zu "unexpected downgraded strings" und die haben dann
218	# noch ein UTF-8 Teile, die nicht als solche markiert sind
219	}
220	} else {
221	# Want to find out which condition we need to watch
222	if ($HTML::Parser::VERSION != 3.26) {
223	# klappt nicht == 2002-04-02
224	$utf8v = Encode::decode("ISO-8859-1",$v);
225	} else {
226	# klappt nicht == 2002-04-02
227	$utf8v = $v;
228	Encode::_utf8_on($utf8v);
229	Encode::is_utf8($utf8v, 1) or die "Not UTF8 [$utf8v]";
230	}
231	}
232	$result{$k} = $utf8v;
233	}
234	}
235	if ( oreilly_de_catalog::config::ALLOW_LATIN_INTERMEDIATE() ) {
236	while (my($k,$v) = each %result) {
237	next unless defined($v) && length($v);
238	my $utf8v = Encode::decode("ISO-8859-1",$v);
239	$result{$k} = $utf8v;
240	}
241	} elsif (oreilly_de_catalog::config::PROTECT_UTF8_FOR_HTML_PARSER()) {
242	while (my($k,$v) = each %result) {
243	next unless defined($v) && length($v);
244	my $utf8v = Encode::decode_utf8($v);
245	$result{$k} = $utf8v;
246	}
247	} elsif (oreilly_de_catalog::config::PROTECT_UTF8_WITH_AMP() ) {
248	while (my($k,$v) = each %result) {
249	next unless defined($v) && length($v);
250	utf8::upgrade($v);
251	$result{$k} = $v;
252	}
253	}
254	}
255
256	sub split {
257	my ($self, $doc) = @_;
258	my %doc = map { $_ => "" } qw(isbn author aboutauthor chapter
259	translator abouttranslator colophon
260	abstract title subtitle title_orig toc inx);
261
262	if ($doc->{author}) {
263	my_parse($doc->{author});
264	$doc{aboutauthor} = $result{author_bio};
265	}
266	if ($doc->{translator}) {
267	my_parse($doc->{translator});
268	$doc{abouttranslator} = $result{translator_bio};
269	}
270	if ($doc->{index}) {
271	my_parse($doc->{index});
272	$doc{abstract} = $result{short_desc};
273	$doc{isbn} = $result{isbn};
274	$doc{author} = $result{author_names} \|\| "";
275	$doc{translator} = $result{translator_names};
276	$doc{title} = $result{title};
277	$doc{subtitle} = $result{subtitle};
278	$doc{title_orig} = $result{title_orig};
279	}
280	if ($doc->{chapter}) {
281	my $content = $doc->{chapter};
282	my $bs;
283	$bs++ if $content =~ s/^.?<!--\ssample chapter (begins (here )?)?-->//si;
284	my $es;
285	$es++ if $content =~ s/<!--\s(End of )?sample chapter (ends here )?-->.//si;
286	unless ($bs){
287	$content =~ s/^.*?<h1/<h1/si;
288	}
289	unless ($es){
290	$content =~ s/<HR.*//si;
291	}
292	$content =~ s/^/<div id="chapter">/;
293	$content .= "</div>\n";
294	my_parse($content);
295	$doc{chapter} = $result{chapter};
296	}
297	if ($doc->{colophon}) {
298	my_parse($doc->{colophon});
299	my $s = $doc{colophon} = $result{colophon};
300	# use Devel::Peek;
301	# Devel::Peek::Dump($s);
302	}
303	if ($doc->{toc}) {
304	my_parse($doc->{toc});
305	if (my $s = $result{book_toc}) {
306	# $s =~ s/<BR>/ /ig; # very wrong! if we have <BR> here, it was <BR>
307	$s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
308	$s =~ s/\b\d+(\.\d+)?\b//g; # 1.0 Einleitung 1.1 Zugriff
309	$s =~ s/\b\d+\.//g; # 7.vi Options 8.Enhanced Tags 9.nvi-New vi
310	$doc{toc} = $s;
311	} else {
312	warn "toc contains no \$s?";
313	}
314	}
315	if ($doc->{inx}) {
316	my_parse($doc->{inx});
317	my $s = $result{book_inx} \|\| "";
318	# $s =~ s/<BR>/ /ig; # wrong!, see above
319	$s =~ s/&#(8211);/-/g;
320	$s =~ s/&#(8220);/"/g;
321	$s =~ s/&#(8222);/"/g;
322	$s =~ s/&#(8217);/'/g;
323	$s =~ s/[\xa0]/ /g; # nbsp; need [] because of a bug in this perl
324	$s =~ s/\s*,\s+/ /g; # Komma
325	1 while $s =~ s/\s\d+-\d+\s/ /g; # Seitenangaben (nicht aber das 234 aus ê)
326	1 while $s =~ s/\s\d+\s/ /g; # Seitenangaben
327	$s =~ s/(\w+)\( \)/$1()/g; # functions in the index
328	$doc{inx} = $s;
329	}
330	if ($doc->{desc}) {
331	my_parse($doc->{desc});
332	$doc{desc} = $result{long_desc};
333	}
334
335	if (0) {
336	# we did really convert the stuff we just read in to UTF8
337	# (although WAIT::Document::Ora::conv_getline converts to UTF8
338	# itself), because my_parse did the conversion back to latin1.
339	# This nonsense must stop. All routines must get and give UTF-8.
340	# If they want to process something else internally, they must
341	# convert twice
342	while (my($k,$v) = each %doc) {
343	next unless defined($v) && length($v);
344	my $utf8v = Encode::decode("ISO-8859-1",$v);
345	$doc{$k} = $utf8v;
346	}
347	}
348	# warn "ALERT: No author" unless $doc{author};
349
350	return \%doc;
351	}
352
353	1;