#!/usr/bin/perl
# -*- Mode: Perl -*-
# $Basename: HTML.pm $
# $Revision: 1.3 $
# Author : Ulrich Pfeifer with Andreas König
# Created On : Sat Nov 1 1997
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Fri Jan 4 16:06:14 2002
# Language : CPerl
# Update Count : 14
# Status : Unknown, Use with caution!
#
# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
#
#

17 |
package WAIT::Parse::Ora;

use strict;
use vars qw(@ISA);

use HTML::Parser;

# This class inherits from WAIT::Parse::Base.
# NOTE(review): WAIT::Parse::Base is not loaded anywhere in this file --
# presumably the caller has already loaded it; confirm.
@ISA = ('WAIT::Parse::Base');
22 |
|
23 |
# Trace switch: when true, the start/end tag handlers print one line per
# tracked tag, prefixed with '>' or '<' repeated once per nesting level.
my $debug = 0;

# Tags whose text content is collected, mapped to the name of the result
# field that receives the text.  The heading entries are disabled.
my %text = (
    'p'     => 'text',
    # h1 => 'text',
    # h2 => 'text',
    # h3 => 'text',
    'title' => 'title',
);
31 |
|
32 |
# State shared between split() and the parser callbacks below.
my %result;    # collected text, keyed by result field name (values of %text)
my $text;      # text gathered since the last tracked closing tag
my $open;      # number of tracked tags currently open

# A single parser instance, created once when the module is loaded and used
# for every fragment.  The callbacks are the handle_* subs defined below;
# the second element of each pair names the arguments they receive.
my $p = HTML::Parser->new(
    api_version     => 3,
    start_h         => [ \&handle_start, 'tagname, attr' ],
    end_h           => [ \&handle_end,   'tagname' ],
    text_h          => [ \&handle_text,  'dtext' ],
    marked_sections => 1,
);
42 |
|
43 |
# Start-tag callback.
#
# Receives the tag name as its first argument; any further arguments are
# ignored.  For a tag listed in %text the $open counter is incremented
# (and, with $debug set, a trace line is printed).  Any other tag is
# ignored.
sub handle_start {
    my ($tagname) = @_;

    return if !$text{$tagname};

    ++$open;
    print ">" x $open, $tagname, "\n" if $debug;
}
50 |
|
51 |
# End-tag callback.
#
# Receives the tag name.  For a tag listed in %text it closes one nesting
# level, normalises the whitespace of the text gathered so far and appends
# that text, followed by a single space, to the result field named by
# %text.  The text buffer is then cleared.  Any other tag is ignored.
sub handle_end {
    my $tag = shift;

    return unless $text{$tag};

    # Ignore a closing tag that has no matching opening tag.  Decrementing
    # $open below zero would leave it negative, which is still a true value:
    # handle_text would then collect text outside every tracked element, and
    # the next real opening tag would merely bring the counter back to 0.
    return unless $open && $open > 0;

    print "<" x $open, $tag, "\n" if $debug;
    $open--;

    # Trim both ends, then squeeze every internal run of whitespace
    # (including newlines) into one space.
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\s+/ /g;

    $result{ $text{$tag} } .= $text . ' ';
    $text = '';
}
63 |
|
64 |
|
65 |
# Text callback.
#
# Receives one chunk of text from the parser (requested as "dtext" where the
# parser is set up) and appends it to the $text buffer, but only while the
# $open counter is true; otherwise the chunk is dropped.
sub handle_text {
    my ($chunk) = @_;

    $open and $text .= $chunk;
}
68 |
|
69 |
# Private helper: run the shared parser over one HTML fragment.
#
# Resets the state the callbacks write to, feeds $html to the parser and
# returns a reference to a fresh hash holding the collected fields (keys are
# the values of %text; a field is absent when no such element was closed).
sub _parse_fragment {
    my ($html) = @_;

    %result = ();
    $text   = '';
    $open   = 0;

    $p->parse($html);
    $p->eof;    # end of this fragment; the parser is reused for the next one

    return { %result };
}

# Split one record into its indexable fields.
#
# Arguments:
#   $self - the parser object (not used here)
#   $doc  - hash reference; the keys read are 'author', 'index' and 'desc',
#           where 'author' and 'desc' are parsed as HTML
#
# Returns a reference to a new hash with these keys:
#   author - 'title' text found in $doc->{author}, '' if none
#   about  - 'text' (paragraph) text found in $doc->{author}, '' if none
#   isbn   - ISBN found in $doc->{index}, '' if none
#   text   - 'text' (paragraph) text found in $doc->{desc}, undef if none
#   title  - 'title' text found in $doc->{desc}, undef if none
sub split {
    my ($self, $doc) = @_;
    my %doc = ( isbn => '', author => '', about => '' );

    if ($doc->{author}) {
        my $fields = _parse_fragment($doc->{author});

        # Keep the '' defaults when the fragment has no such element instead
        # of overwriting them with undef.
        $doc{author} = $fields->{title} if defined $fields->{title};
        $doc{about}  = $fields->{text}  if defined $fields->{text};
    }

    if ($doc->{index}) {
        # The last character of a ten-digit ISBN may be the check digit 'X',
        # so allow one after the digits and hyphens; \b keeps a following
        # word from donating its leading 'X'.
        $doc->{index} =~ /ISBN\s*([-\d]+(?:[Xx]\b)?)/ and $doc{isbn} = $1;
    }

    # Nothing to parse when the record carries no description.
    my $fields = defined $doc->{desc} ? _parse_fragment($doc->{desc}) : {};

    $doc{text}  = $fields->{text};
    $doc{title} = $fields->{title};

    return \%doc;
}
99 |
|
100 |
1;    # a module file must end with a true value