1 |
#!/usr/bin/perl |
2 |
# -*- Mode: Perl -*- |
3 |
# $Basename: HTML.pm $ |
4 |
# $Revision: 1.5 $ |
5 |
# Author : Ulrich Pfeifer with Andreas König |
6 |
# Created On : Sat Nov 1 1997 |
7 |
# Last Modified By: Ulrich Pfeifer |
8 |
# Last Modified On: Fri Jan 4 16:06:14 2002 |
9 |
# Language : CPerl |
10 |
# Update Count : 14 |
11 |
# Status : Unknown, Use with caution! |
12 |
# |
13 |
# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved. |
14 |
# |
15 |
# |
16 |
|
17 |
package WAIT::Parse::Ora; |
18 |
use HTML::Parser; |
19 |
use Encode; |
20 |
use strict; |
21 |
use vars qw(@ISA); |
22 |
@ISA = qw(WAIT::Parse::Base); |
23 |
|
24 |
my $debug = 0; |
25 |
my %text = ( |
26 |
p => 'text', |
27 |
# h1 => 'text', |
28 |
# h2 => 'text', |
29 |
# h3 => 'text', |
30 |
title => 'title', |
31 |
); |
32 |
|
33 |
my $p = HTML::Parser->new( |
34 |
api_version => 3, |
35 |
start_h => [\&handle_start, "tagname, attr"], |
36 |
end_h => [\&handle_end, "tagname"], |
37 |
text_h => [\&handle_text, "dtext"], |
38 |
marked_sections => 1, |
39 |
); |
40 |
my %result; |
41 |
my $text; |
42 |
my $open; |
43 |
|
44 |
sub handle_start { |
45 |
my $tag = shift; |
46 |
|
47 |
return unless $text{$tag}; |
48 |
$open++; |
49 |
print ">" x $open, $tag, "\n" if $debug; |
50 |
} |
51 |
|
52 |
sub handle_end { |
53 |
my $tag = shift; |
54 |
|
55 |
return unless $text{$tag}; |
56 |
print "<" x $open, $tag, "\n" if $debug; |
57 |
$open--; |
58 |
$text =~ s/^\s+//; |
59 |
$text =~ s/\s+$//; |
60 |
$text =~ s/\s+/ /g; |
61 |
$result{$text{$tag}} .= $text . ' '; |
62 |
$text = ''; |
63 |
} |
64 |
|
65 |
|
66 |
sub handle_text { |
67 |
$text .= $_[0] if $open; |
68 |
} |
69 |
|
70 |
sub my_parse ($) { |
71 |
my($s) = @_; |
72 |
my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns |
73 |
# LATIN for entities |
74 |
# and we would get |
75 |
# mixed content in |
76 |
# result |
77 |
$p->parse($ls); |
78 |
$p->eof; |
79 |
} |
80 |
|
81 |
sub split { |
82 |
my ($self, $doc) = @_; |
83 |
my %doc = ( isbn => '', author => '', about => '', colophon => '' ); |
84 |
my $desc = $doc->{desc}; |
85 |
my $auth = $doc->{author}; |
86 |
my $colophon = $doc->{colophon}; |
87 |
|
88 |
if ($doc->{author}) { |
89 |
%result = (); |
90 |
$text = ''; |
91 |
$open = 0; |
92 |
my_parse($doc->{author}); |
93 |
$doc{author} = $result{title}; |
94 |
$doc{author} =~ s/^By\s+//; |
95 |
$doc{about} = $result{text}; |
96 |
} |
97 |
if ($doc->{index}) { |
98 |
$doc->{index} =~ /ISBN\s*([^<]+)/ and $doc{isbn} = $1; |
99 |
} |
100 |
if ($doc->{colophon}) { |
101 |
%result = (); |
102 |
$text = ''; |
103 |
$open = 0; |
104 |
my_parse($doc->{colophon}); |
105 |
$doc{colophon} = $result{text}; |
106 |
} |
107 |
%result = (); |
108 |
$text = ''; |
109 |
$open = 0; |
110 |
|
111 |
my_parse($doc->{desc}); |
112 |
|
113 |
$doc{text} = $result{text}; |
114 |
$doc{title} = $result{title}; |
115 |
|
116 |
while (my($k,$v) = each %doc) { |
117 |
my $utf8v = Encode::decode("ISO-8859-1",$v); |
118 |
$doc{$k} = $utf8v; |
119 |
} |
120 |
|
121 |
return \%doc; |
122 |
} |
123 |
|
124 |
1; |