/[XML-Feed]/inc/HTML/TokeParser.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /inc/HTML/TokeParser.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1 - (show annotations)
Sun Mar 16 19:47:49 2008 UTC (16 years, 1 month ago) by dpavlin
File size: 3154 byte(s)
import XML::Feed 0.12 from CPAN

1 #line 1
2 package HTML::TokeParser;
3
4 # $Id: TokeParser.pm,v 2.35 2005/12/02 16:08:17 gisle Exp $
5
6 require HTML::PullParser;
7 @ISA=qw(HTML::PullParser);
8 $VERSION = sprintf("%d.%02d", q$Revision: 2.35 $ =~ /(\d+)\.(\d+)/);
9
10 use strict;
11 use Carp ();
12 use HTML::Entities qw(decode_entities);
13 use HTML::Tagset ();
14
15 my %ARGS =
16 (
17 start => "'S',tagname,attr,attrseq,text",
18 end => "'E',tagname,text",
19 text => "'T',text,is_cdata",
20 process => "'PI',token0,text",
21 comment => "'C',text",
22 declaration => "'D',text",
23
24 # options that default on
25 unbroken_text => 1,
26 );
27
28
29 sub new
30 {
31 my $class = shift;
32 my %cnf;
33 if (@_ == 1) {
34 my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
35 %cnf = ($type => $_[0]);
36 }
37 else {
38 %cnf = @_;
39 }
40
41 my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
42
43 my $self = $class->SUPER::new(%cnf, %ARGS) || return undef;
44
45 $self->{textify} = $textify;
46 $self;
47 }
48
49
50 sub get_tag
51 {
52 my $self = shift;
53 my $token;
54 while (1) {
55 $token = $self->get_token || return undef;
56 my $type = shift @$token;
57 next unless $type eq "S" || $type eq "E";
58 substr($token->[0], 0, 0) = "/" if $type eq "E";
59 return $token unless @_;
60 for (@_) {
61 return $token if $token->[0] eq $_;
62 }
63 }
64 }
65
66
67 sub _textify {
68 my($self, $token) = @_;
69 my $tag = $token->[1];
70 return undef unless exists $self->{textify}{$tag};
71
72 my $alt = $self->{textify}{$tag};
73 my $text;
74 if (ref($alt)) {
75 $text = &$alt(@$token);
76 } else {
77 $text = $token->[2]{$alt || "alt"};
78 $text = "[\U$tag]" unless defined $text;
79 }
80 return $text;
81 }
82
83
84 sub get_text
85 {
86 my $self = shift;
87 my @text;
88 while (my $token = $self->get_token) {
89 my $type = $token->[0];
90 if ($type eq "T") {
91 my $text = $token->[1];
92 decode_entities($text) unless $token->[2];
93 push(@text, $text);
94 } elsif ($type =~ /^[SE]$/) {
95 my $tag = $token->[1];
96 if ($type eq "S") {
97 if (defined(my $text = _textify($self, $token))) {
98 push(@text, $text);
99 next;
100 }
101 } else {
102 $tag = "/$tag";
103 }
104 if (!@_ || grep $_ eq $tag, @_) {
105 $self->unget_token($token);
106 last;
107 }
108 push(@text, " ")
109 if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]};
110 }
111 }
112 join("", @text);
113 }
114
115
116 sub get_trimmed_text
117 {
118 my $self = shift;
119 my $text = $self->get_text(@_);
120 $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
121 $text;
122 }
123
124 sub get_phrase {
125 my $self = shift;
126 my @text;
127 while (my $token = $self->get_token) {
128 my $type = $token->[0];
129 if ($type eq "T") {
130 my $text = $token->[1];
131 decode_entities($text) unless $token->[2];
132 push(@text, $text);
133 } elsif ($type =~ /^[SE]$/) {
134 my $tag = $token->[1];
135 if ($type eq "S") {
136 if (defined(my $text = _textify($self, $token))) {
137 push(@text, $text);
138 next;
139 }
140 }
141 if (!$HTML::Tagset::isPhraseMarkup{$tag}) {
142 $self->unget_token($token);
143 last;
144 }
145 push(@text, " ") if $tag eq "br";
146 }
147 }
148 my $text = join("", @text);
149 $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
150 $text;
151 }
152
153 1;
154
155
156 __END__
157
158 #line 367

  ViewVC Help
Powered by ViewVC 1.1.26