1 |
#!/usr/bin/perl |
#!/usr/bin/perl |
2 |
# -*- Mode: Perl -*- |
# -*- Mode: Perl -*- |
3 |
# $Basename: HTML.pm $ |
# $Basename: HTML.pm $ |
4 |
# $Revision: 1.4 $ |
# $Revision: 1.5 $ |
5 |
# Author : Ulrich Pfeifer with Andreas König |
# Author : Ulrich Pfeifer with Andreas König |
6 |
# Created On : Sat Nov 1 1997 |
# Created On : Sat Nov 1 1997 |
7 |
# Last Modified By: Ulrich Pfeifer |
# Last Modified By: Ulrich Pfeifer |
16 |
|
|
17 |
package WAIT::Parse::Ora; |
package WAIT::Parse::Ora; |
18 |
use HTML::Parser; |
use HTML::Parser; |
19 |
|
use Encode; |
20 |
use strict; |
use strict; |
21 |
use vars qw(@ISA); |
use vars qw(@ISA); |
22 |
@ISA = qw(WAIT::Parse::Base); |
@ISA = qw(WAIT::Parse::Base); |
67 |
$text .= $_[0] if $open; |
$text .= $_[0] if $open; |
68 |
} |
} |
69 |
|
|
70 |
|
# Push one HTML fragment through the parser object $p (declared outside
# this sub) and signal end of document.
#
# The fragment is converted to ISO-8859-1 octets before parsing:
# HTML::Parser returns Latin-1 for entities, so feeding it anything else
# would leave mixed content in the collected result.  The CHECK argument
# is Encode::FB_CROAK (numeric 1), so encode() dies on a character that
# has no ISO-8859-1 representation instead of substituting it.
#
# Takes the fragment as its single scalar argument; the value of the
# final $p->eof call is what the sub returns.
sub my_parse ($) {
    my $fragment = shift;
    my $latin1   = Encode::encode('ISO-8859-1', $fragment, Encode::FB_CROAK());

    $p->parse($latin1);
    $p->eof;
}
80 |
|
|
81 |
sub split { |
sub split { |
82 |
my ($self, $doc) = @_; |
my ($self, $doc) = @_; |
83 |
my %doc = ( isbn => '', author => '', about => '', colophon => '' ); |
my %doc = ( isbn => '', author => '', about => '', colophon => '' ); |
89 |
%result = (); |
%result = (); |
90 |
$text = ''; |
$text = ''; |
91 |
$open = 0; |
$open = 0; |
92 |
$p->parse($doc->{author}); |
my_parse($doc->{author}); |
|
$p->eof; |
|
93 |
$doc{author} = $result{title}; |
$doc{author} = $result{title}; |
94 |
$doc{author} =~ s/^By\s+//; |
$doc{author} =~ s/^By\s+//; |
95 |
$doc{about} = $result{text}; |
$doc{about} = $result{text}; |
101 |
%result = (); |
%result = (); |
102 |
$text = ''; |
$text = ''; |
103 |
$open = 0; |
$open = 0; |
104 |
$p->parse($doc->{colophon}); |
my_parse($doc->{colophon}); |
|
$p->eof; |
|
105 |
$doc{colophon} = $result{text}; |
$doc{colophon} = $result{text}; |
106 |
} |
} |
107 |
%result = (); |
%result = (); |
108 |
$text = ''; |
$text = ''; |
109 |
$open = 0; |
$open = 0; |
110 |
|
|
111 |
$p->parse($doc->{desc}); |
my_parse($doc->{desc}); |
|
$p->eof; |
|
112 |
|
|
113 |
$doc{text} = $result{text}; |
$doc{text} = $result{text}; |
114 |
$doc{title} = $result{title}; |
$doc{title} = $result{title}; |
115 |
|
|
116 |
|
while (my($k,$v) = each %doc) { |
117 |
|
my $utf8v = Encode::decode("ISO-8859-1",$v); |
118 |
|
$doc{$k} = $utf8v; |
119 |
|
} |
120 |
|
|
121 |
return \%doc; |
return \%doc; |
122 |
} |
} |
123 |
|
|