1 |
#!/usr/bin/perl |
#!/usr/bin/perl |
2 |
# -*- Mode: Perl -*- |
# -*- Mode: Perl -*- |
3 |
# $Basename: HTML.pm $ |
# $Basename: HTML.pm $ |
4 |
# $Revision: 1.1 $ |
# $Revision: 1.4 $ |
5 |
# Author : Ulrich Pfeifer with Andreas König |
# Author : Ulrich Pfeifer with Andreas König |
6 |
# Created On : Sat Nov 1 1997 |
# Created On : Sat Nov 1 1997 |
7 |
# Last Modified By: Ulrich Pfeifer |
# Last Modified By: Ulrich Pfeifer |
8 |
# Last Modified On: Mon Dec 31 14:51:55 2001 |
# Last Modified On: Fri Jan 4 16:06:14 2002 |
9 |
# Language : CPerl |
# Language : CPerl |
10 |
# Update Count : 7 |
# Update Count : 14 |
11 |
# Status : Unknown, Use with caution! |
# Status : Unknown, Use with caution! |
12 |
# |
# |
13 |
# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved. |
# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved. |
15 |
# |
# |
16 |
|
|
17 |
package WAIT::Parse::Ora; |
package WAIT::Parse::Ora; |
18 |
|
use HTML::Parser; |
19 |
|
use strict; |
20 |
use vars qw(@ISA); |
use vars qw(@ISA); |
|
require HTML::Parse; |
|
|
require HTML::FormatText; |
|
|
use HTML::Entities qw(decode_entities); |
|
21 |
@ISA = qw(WAIT::Parse::Base); |
@ISA = qw(WAIT::Parse::Base); |
22 |
|
|
23 |
|
my $debug = 0; |
24 |
|
my %text = ( |
25 |
|
p => 'text', |
26 |
|
# h1 => 'text', |
27 |
|
# h2 => 'text', |
28 |
|
# h3 => 'text', |
29 |
|
title => 'title', |
30 |
|
); |
31 |
|
|
32 |
|
my $p = HTML::Parser->new( |
33 |
|
api_version => 3, |
34 |
|
start_h => [\&handle_start, "tagname, attr"], |
35 |
|
end_h => [\&handle_end, "tagname"], |
36 |
|
text_h => [\&handle_text, "dtext"], |
37 |
|
marked_sections => 1, |
38 |
|
); |
39 |
|
my %result; |
40 |
|
my $text; |
41 |
|
my $open; |
42 |
|
|
43 |
|
sub handle_start { |
44 |
|
my $tag = shift; |
45 |
|
|
46 |
|
return unless $text{$tag}; |
47 |
|
$open++; |
48 |
|
print ">" x $open, $tag, "\n" if $debug; |
49 |
|
} |
50 |
|
|
51 |
|
sub handle_end { |
52 |
|
my $tag = shift; |
53 |
|
|
54 |
|
return unless $text{$tag}; |
55 |
|
print "<" x $open, $tag, "\n" if $debug; |
56 |
|
$open--; |
57 |
|
$text =~ s/^\s+//; |
58 |
|
$text =~ s/\s+$//; |
59 |
|
$text =~ s/\s+/ /g; |
60 |
|
$result{$text{$tag}} .= $text . ' '; |
61 |
|
$text = ''; |
62 |
|
} |
63 |
|
|
64 |
|
|
65 |
|
sub handle_text { |
66 |
|
$text .= $_[0] if $open; |
67 |
|
} |
68 |
|
|
69 |
sub split { |
sub split { |
70 |
my ($self, $doc) = @_; |
my ($self, $doc) = @_; |
71 |
|
my %doc = ( isbn => '', author => '', about => '', colophon => '' ); |
72 |
my $desc = $doc->{desc}; |
my $desc = $doc->{desc}; |
73 |
my $auth = $doc->{author}; |
my $auth = $doc->{author}; |
74 |
my ($title) = $desc =~ /<title\s*>(.*?)<\/title\s*>/si; |
my $colophon = $doc->{colophon}; |
|
my ($author) = $auth =~ /<title\s*>(.*?)<\/title\s*>/si; |
|
|
my $html = HTML::Parse::parse_html($desc); |
|
|
my $formatter = HTML::FormatText->new; |
|
|
|
|
|
{ |
|
|
'text', $formatter->format($html), |
|
|
'title', $formatter->format(HTML::Parse::parse_html($title)), |
|
|
'author', $formatter->format(HTML::Parse::parse_html($author)), |
|
|
}; |
|
|
} |
|
75 |
|
|
76 |
sub tag { |
if ($doc->{author}) { |
77 |
my ($self, $doc) = @_; |
%result = (); |
78 |
|
$text = ''; |
79 |
|
$open = 0; |
80 |
|
$p->parse($doc->{author}); |
81 |
|
$p->eof; |
82 |
|
$doc{author} = $result{title}; |
83 |
|
$doc{author} =~ s/^By\s+//; |
84 |
|
$doc{about} = $result{text}; |
85 |
|
} |
86 |
|
if ($doc->{index}) { |
87 |
|
$doc->{index} =~ /ISBN\s*([^<]+)/ and $doc{isbn} = $1; |
88 |
|
} |
89 |
|
if ($doc->{colophon}) { |
90 |
|
%result = (); |
91 |
|
$text = ''; |
92 |
|
$open = 0; |
93 |
|
$p->parse($doc->{colophon}); |
94 |
|
$p->eof; |
95 |
|
$doc{colophon} = $result{text}; |
96 |
|
} |
97 |
|
%result = (); |
98 |
|
$text = ''; |
99 |
|
$open = 0; |
100 |
|
|
101 |
my $html_source = $doc->{desc}; |
$p->parse($doc->{desc}); |
102 |
$html_source =~ tr/\r/\n/; |
$p->eof; |
103 |
|
|
104 |
my ($pre,$title,$body) |
$doc{text} = $result{text}; |
105 |
= $html_source =~ /^(.*?<title\s*>)(.*?)(<\/title\s*>.+)/si; |
$doc{title} = $result{title}; |
106 |
|
|
107 |
( |
return \%doc; |
|
{'text' => 1}, decode_entities($pre), |
|
|
{'title' => 1}, decode_entities($title), |
|
|
{'text' => 1}, decode_entities($body), |
|
|
); |
|
108 |
} |
} |
109 |
|
|
110 |
|
1; |