1 |
#!/usr/bin/perl |
#!/usr/bin/perl |
2 |
# -*- Mode: Perl -*- |
# -*- Mode: Perl -*- |
3 |
# $Basename: HTML.pm $ |
# $Basename: HTML.pm $ |
4 |
# $Revision: 1.4 $ |
# $Revision: 1.7 $ |
5 |
# Author : Ulrich Pfeifer with Andreas König |
# Author : Ulrich Pfeifer with Andreas König |
6 |
# Created On : Sat Nov 1 1997 |
# Created On : Sat Nov 1 1997 |
7 |
# Last Modified By: Ulrich Pfeifer |
# Last Modified By: Ulrich Pfeifer |
16 |
|
|
17 |
package WAIT::Parse::Ora; |
package WAIT::Parse::Ora; |
18 |
use HTML::Parser; |
use HTML::Parser; |
19 |
|
use Encode; |
20 |
use strict; |
use strict; |
21 |
use vars qw(@ISA); |
use vars qw(@ISA); |
22 |
@ISA = qw(WAIT::Parse::Base); |
@ISA = qw(WAIT::Parse::Base); |
23 |
|
|
24 |
|
|
25 |
|
=pod |
26 |
|
|
27 |
|
Text from 2002-03-05 is structured with <div> tags as follows: |
28 |
|
|
29 |
|
index.html: |
30 |
|
<div id="biblio"> BIBLIOGRAPHISCHE ANGABEN |
31 |
|
<div id="short_desc"> KURZE BESCHREIBUNG |
32 |
|
|
33 |
|
desc.html: |
34 |
|
<div id="long_desc"> AUSFUEHRLICHE BESCHREIBUNG |
35 |
|
|
36 |
|
author.html: |
37 |
|
<div id="author_bio"> BIOGRAPHIE DES AUTOREN |
38 |
|
|
39 |
|
translator.html: |
40 |
|
<div id="translator_bio"> BIOGRAPHIE DES UEBERSETZERS |
41 |
|
|
42 |
|
=cut |
43 |
|
|
44 |
my $debug = 0; |
my $debug = 0; |
45 |
my %text = ( |
my %is_text = ( |
46 |
p => 'text', |
p => 'text', |
47 |
|
a => 'text', # uebersetzer |
48 |
# h1 => 'text', |
# h1 => 'text', |
49 |
# h2 => 'text', |
# h2 => 'text', |
50 |
# h3 => 'text', |
# h3 => 'text', |
51 |
title => 'title', |
title => 'title', |
52 |
); |
); |
53 |
|
|
54 |
my $p = HTML::Parser->new( |
my $p = HTML::Parser->new( |
64 |
|
|
65 |
sub handle_start { |
sub handle_start { |
66 |
my $tag = shift; |
my $tag = shift; |
67 |
|
my $attr = shift; |
68 |
|
|
69 |
return unless $text{$tag}; |
return unless |
70 |
|
$is_text{$tag} # well-formed paragraphs |
71 |
|
|| |
72 |
|
$tag eq "h3" # good for desc, author, and colo |
73 |
|
|| |
74 |
|
($tag eq "font" && $attr->{size} && $attr->{size}==5); # good for index.html |
75 |
$open++; |
$open++; |
76 |
print ">" x $open, $tag, "\n" if $debug; |
print ">" x $open, $tag, "\n" if $debug; |
77 |
} |
} |
79 |
sub handle_end { |
sub handle_end { |
80 |
my $tag = shift; |
my $tag = shift; |
81 |
|
|
82 |
return unless $text{$tag}; |
return unless $is_text{$tag}; |
83 |
print "<" x $open, $tag, "\n" if $debug; |
print "<" x $open, $tag, "\n" if $debug; |
84 |
$open--; |
$open--; |
85 |
$text =~ s/^\s+//; |
$text =~ s/^\s+//; |
86 |
$text =~ s/\s+$//; |
$text =~ s/\s+$//; |
87 |
$text =~ s/\s+/ /g; |
$text =~ s/\s+/ /g; |
88 |
$result{$text{$tag}} .= $text . ' '; |
$result{$is_text{$tag}} .= $text . ' '; |
89 |
$text = ''; |
$text = ''; |
90 |
} |
} |
91 |
|
|
92 |
|
|
93 |
sub handle_text { |
sub handle_text { |
94 |
$text .= $_[0] if $open; |
my $c = shift; |
95 |
|
if ($open > 1 && $c =~ /^(Zur.{1,6}ck\s+zu|Erg.{1,6}nzende O'Reilly Titel)/) { |
96 |
|
$open--; |
97 |
|
return; |
98 |
|
} |
99 |
|
$text .= $c if $open; |
100 |
|
} |
101 |
|
|
102 |
|
sub my_parse ($) { |
103 |
|
my($s) = @_; |
104 |
|
my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns |
105 |
|
# LATIN for entities |
106 |
|
# and we would get |
107 |
|
# mixed content in |
108 |
|
# result |
109 |
|
$p->parse($ls); |
110 |
|
$p->eof; |
111 |
} |
} |
112 |
|
|
113 |
sub split { |
sub split { |
114 |
my ($self, $doc) = @_; |
my ($self, $doc) = @_; |
115 |
my %doc = ( isbn => '', author => '', about => '', colophon => '' ); |
my %doc = ( isbn => '', |
116 |
my $desc = $doc->{desc}; |
author => '', |
117 |
my $auth = $doc->{author}; |
aboutauthor => '', |
118 |
my $colophon = $doc->{colophon}; |
colophon => '', |
119 |
|
abstract => '' |
120 |
|
); |
121 |
|
|
122 |
if ($doc->{author}) { |
if ($doc->{author}) { |
123 |
%result = (); |
%result = (); |
124 |
$text = ''; |
$text = ''; |
125 |
$open = 0; |
$open = 0; |
126 |
$p->parse($doc->{author}); |
my_parse($doc->{author}); |
|
$p->eof; |
|
127 |
$doc{author} = $result{title}; |
$doc{author} = $result{title}; |
128 |
$doc{author} =~ s/^By\s+//; |
$doc{aboutauthor} = $result{text}; |
|
$doc{about} = $result{text}; |
|
129 |
} |
} |
130 |
if ($doc->{index}) { |
if ($doc->{index}) { |
131 |
$doc->{index} =~ /ISBN\s*([^<]+)/ and $doc{isbn} = $1; |
$doc->{index} =~ /ISBN\s*([^\<]+)/ and $doc{isbn} = $1; |
132 |
|
%result = (); |
133 |
|
$text = ''; |
134 |
|
$open = 0; |
135 |
|
my_parse($doc->{index}); |
136 |
|
$doc{abstract} = $result{text}; |
137 |
} |
} |
138 |
if ($doc->{colophon}) { |
if ($doc->{colophon}) { |
139 |
%result = (); |
%result = (); |
140 |
$text = ''; |
$text = ''; |
141 |
$open = 0; |
$open = 0; |
142 |
$p->parse($doc->{colophon}); |
my_parse($doc->{colophon}); |
|
$p->eof; |
|
143 |
$doc{colophon} = $result{text}; |
$doc{colophon} = $result{text}; |
144 |
} |
} |
145 |
%result = (); |
%result = (); |
146 |
$text = ''; |
$text = ''; |
147 |
$open = 0; |
$open = 0; |
148 |
|
|
149 |
$p->parse($doc->{desc}); |
my_parse($doc->{desc}); |
|
$p->eof; |
|
150 |
|
|
151 |
$doc{text} = $result{text}; |
$doc{desc} = $result{text}; |
152 |
$doc{title} = $result{title}; |
$doc{title} = $result{title}; |
153 |
|
|
154 |
|
while (my($k,$v) = each %doc) { |
155 |
|
my $utf8v = Encode::decode("ISO-8859-1",$v); |
156 |
|
$doc{$k} = $utf8v; |
157 |
|
} |
158 |
|
|
159 |
|
$doc{desc} =~ s/^\s*Ausf\S+hrliche\s+Beschreibung\s*//; |
160 |
|
$doc{abstract} =~ s/\s*Titel\s+dem\s+Warenkorb\s+hinzu\S+\s*/ /; |
161 |
|
$doc{abstract} =~ s/\s*Warenkorb\s+anzeigen\s*/ /; |
162 |
|
# warn "desc[$doc{desc}]"; |
163 |
|
# warn "abstract[$doc{abstract}]"; # zu viel, zu viel! |
164 |
|
|
165 |
return \%doc; |
return \%doc; |
166 |
} |
} |
167 |
|
|