/[XML-Feed]/inc/HTML/TokeParser.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /inc/HTML/TokeParser.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1 - (hide annotations)
Sun Mar 16 19:47:49 2008 UTC (16 years, 2 months ago) by dpavlin
File size: 3154 byte(s)
import XML::Feed 0.12 from CPAN

1 dpavlin 1 #line 1
2     package HTML::TokeParser;
3    
4     # $Id: TokeParser.pm,v 2.35 2005/12/02 16:08:17 gisle Exp $
5    
6     require HTML::PullParser;
7     @ISA=qw(HTML::PullParser);
8     $VERSION = sprintf("%d.%02d", q$Revision: 2.35 $ =~ /(\d+)\.(\d+)/);
9    
10     use strict;
11     use Carp ();
12     use HTML::Entities qw(decode_entities);
13     use HTML::Tagset ();
14    
15     my %ARGS =
16     (
17     start => "'S',tagname,attr,attrseq,text",
18     end => "'E',tagname,text",
19     text => "'T',text,is_cdata",
20     process => "'PI',token0,text",
21     comment => "'C',text",
22     declaration => "'D',text",
23    
24     # options that default on
25     unbroken_text => 1,
26     );
27    
28    
29     sub new
30     {
31     my $class = shift;
32     my %cnf;
33     if (@_ == 1) {
34     my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
35     %cnf = ($type => $_[0]);
36     }
37     else {
38     %cnf = @_;
39     }
40    
41     my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
42    
43     my $self = $class->SUPER::new(%cnf, %ARGS) || return undef;
44    
45     $self->{textify} = $textify;
46     $self;
47     }
48    
49    
50     sub get_tag
51     {
52     my $self = shift;
53     my $token;
54     while (1) {
55     $token = $self->get_token || return undef;
56     my $type = shift @$token;
57     next unless $type eq "S" || $type eq "E";
58     substr($token->[0], 0, 0) = "/" if $type eq "E";
59     return $token unless @_;
60     for (@_) {
61     return $token if $token->[0] eq $_;
62     }
63     }
64     }
65    
66    
67     sub _textify {
68     my($self, $token) = @_;
69     my $tag = $token->[1];
70     return undef unless exists $self->{textify}{$tag};
71    
72     my $alt = $self->{textify}{$tag};
73     my $text;
74     if (ref($alt)) {
75     $text = &$alt(@$token);
76     } else {
77     $text = $token->[2]{$alt || "alt"};
78     $text = "[\U$tag]" unless defined $text;
79     }
80     return $text;
81     }
82    
83    
84     sub get_text
85     {
86     my $self = shift;
87     my @text;
88     while (my $token = $self->get_token) {
89     my $type = $token->[0];
90     if ($type eq "T") {
91     my $text = $token->[1];
92     decode_entities($text) unless $token->[2];
93     push(@text, $text);
94     } elsif ($type =~ /^[SE]$/) {
95     my $tag = $token->[1];
96     if ($type eq "S") {
97     if (defined(my $text = _textify($self, $token))) {
98     push(@text, $text);
99     next;
100     }
101     } else {
102     $tag = "/$tag";
103     }
104     if (!@_ || grep $_ eq $tag, @_) {
105     $self->unget_token($token);
106     last;
107     }
108     push(@text, " ")
109     if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]};
110     }
111     }
112     join("", @text);
113     }
114    
115    
116     sub get_trimmed_text
117     {
118     my $self = shift;
119     my $text = $self->get_text(@_);
120     $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
121     $text;
122     }
123    
124     sub get_phrase {
125     my $self = shift;
126     my @text;
127     while (my $token = $self->get_token) {
128     my $type = $token->[0];
129     if ($type eq "T") {
130     my $text = $token->[1];
131     decode_entities($text) unless $token->[2];
132     push(@text, $text);
133     } elsif ($type =~ /^[SE]$/) {
134     my $tag = $token->[1];
135     if ($type eq "S") {
136     if (defined(my $text = _textify($self, $token))) {
137     push(@text, $text);
138     next;
139     }
140     }
141     if (!$HTML::Tagset::isPhraseMarkup{$tag}) {
142     $self->unget_token($token);
143     last;
144     }
145     push(@text, " ") if $tag eq "br";
146     }
147     }
148     my $text = join("", @text);
149     $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
150     $text;
151     }
152    
153     1;
154    
155    
156     __END__
157    
158     #line 367

  ViewVC Help
Powered by ViewVC 1.1.26