1 |
#line 1 |
2 |
package HTML::TokeParser; |
3 |
|
4 |
# $Id: TokeParser.pm,v 2.35 2005/12/02 16:08:17 gisle Exp $ |
5 |
|
6 |
require HTML::PullParser; |
7 |
@ISA=qw(HTML::PullParser); |
8 |
$VERSION = sprintf("%d.%02d", q$Revision: 2.35 $ =~ /(\d+)\.(\d+)/); |
9 |
|
10 |
use strict; |
11 |
use Carp (); |
12 |
use HTML::Entities qw(decode_entities); |
13 |
use HTML::Tagset (); |
14 |
|
15 |
my %ARGS = |
16 |
( |
17 |
start => "'S',tagname,attr,attrseq,text", |
18 |
end => "'E',tagname,text", |
19 |
text => "'T',text,is_cdata", |
20 |
process => "'PI',token0,text", |
21 |
comment => "'C',text", |
22 |
declaration => "'D',text", |
23 |
|
24 |
# options that default on |
25 |
unbroken_text => 1, |
26 |
); |
27 |
|
28 |
|
29 |
sub new |
30 |
{ |
31 |
my $class = shift; |
32 |
my %cnf; |
33 |
if (@_ == 1) { |
34 |
my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file"; |
35 |
%cnf = ($type => $_[0]); |
36 |
} |
37 |
else { |
38 |
%cnf = @_; |
39 |
} |
40 |
|
41 |
my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"}; |
42 |
|
43 |
my $self = $class->SUPER::new(%cnf, %ARGS) || return undef; |
44 |
|
45 |
$self->{textify} = $textify; |
46 |
$self; |
47 |
} |
48 |
|
49 |
|
50 |
sub get_tag |
51 |
{ |
52 |
my $self = shift; |
53 |
my $token; |
54 |
while (1) { |
55 |
$token = $self->get_token || return undef; |
56 |
my $type = shift @$token; |
57 |
next unless $type eq "S" || $type eq "E"; |
58 |
substr($token->[0], 0, 0) = "/" if $type eq "E"; |
59 |
return $token unless @_; |
60 |
for (@_) { |
61 |
return $token if $token->[0] eq $_; |
62 |
} |
63 |
} |
64 |
} |
65 |
|
66 |
|
67 |
sub _textify { |
68 |
my($self, $token) = @_; |
69 |
my $tag = $token->[1]; |
70 |
return undef unless exists $self->{textify}{$tag}; |
71 |
|
72 |
my $alt = $self->{textify}{$tag}; |
73 |
my $text; |
74 |
if (ref($alt)) { |
75 |
$text = &$alt(@$token); |
76 |
} else { |
77 |
$text = $token->[2]{$alt || "alt"}; |
78 |
$text = "[\U$tag]" unless defined $text; |
79 |
} |
80 |
return $text; |
81 |
} |
82 |
|
83 |
|
84 |
sub get_text |
85 |
{ |
86 |
my $self = shift; |
87 |
my @text; |
88 |
while (my $token = $self->get_token) { |
89 |
my $type = $token->[0]; |
90 |
if ($type eq "T") { |
91 |
my $text = $token->[1]; |
92 |
decode_entities($text) unless $token->[2]; |
93 |
push(@text, $text); |
94 |
} elsif ($type =~ /^[SE]$/) { |
95 |
my $tag = $token->[1]; |
96 |
if ($type eq "S") { |
97 |
if (defined(my $text = _textify($self, $token))) { |
98 |
push(@text, $text); |
99 |
next; |
100 |
} |
101 |
} else { |
102 |
$tag = "/$tag"; |
103 |
} |
104 |
if (!@_ || grep $_ eq $tag, @_) { |
105 |
$self->unget_token($token); |
106 |
last; |
107 |
} |
108 |
push(@text, " ") |
109 |
if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]}; |
110 |
} |
111 |
} |
112 |
join("", @text); |
113 |
} |
114 |
|
115 |
|
116 |
sub get_trimmed_text |
117 |
{ |
118 |
my $self = shift; |
119 |
my $text = $self->get_text(@_); |
120 |
$text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; |
121 |
$text; |
122 |
} |
123 |
|
124 |
sub get_phrase { |
125 |
my $self = shift; |
126 |
my @text; |
127 |
while (my $token = $self->get_token) { |
128 |
my $type = $token->[0]; |
129 |
if ($type eq "T") { |
130 |
my $text = $token->[1]; |
131 |
decode_entities($text) unless $token->[2]; |
132 |
push(@text, $text); |
133 |
} elsif ($type =~ /^[SE]$/) { |
134 |
my $tag = $token->[1]; |
135 |
if ($type eq "S") { |
136 |
if (defined(my $text = _textify($self, $token))) { |
137 |
push(@text, $text); |
138 |
next; |
139 |
} |
140 |
} |
141 |
if (!$HTML::Tagset::isPhraseMarkup{$tag}) { |
142 |
$self->unget_token($token); |
143 |
last; |
144 |
} |
145 |
push(@text, " ") if $tag eq "br"; |
146 |
} |
147 |
} |
148 |
my $text = join("", @text); |
149 |
$text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; |
150 |
$text; |
151 |
} |
152 |
|
153 |
1; |
154 |
|
155 |
|
156 |
__END__ |
157 |
|
158 |
#line 367 |