25 |
# split($html_source)
#
# Break an HTML document into the pieces this splitter hands back to its
# caller.
#
# Parameters:
#   $self        - invocant (unused here).
#   $html_source - the HTML document as a single string.
#
# Returns a hash reference with two keys:
#   text  - the document rendered to plain text by HTML::FormatText.
#   title - the raw inner content of the first <title>, <h1>, <h2>, <h3>
#           or <h4> element found in the source (case-insensitive), or
#           the literal string 'no title' when no such element exists or
#           it contains nothing but whitespace.
sub split {
    my ($self, $html_source) = @_;

    # The first capture is only the tag name (needed for the \1
    # back-reference that pairs the closing tag); the second capture is
    # the element content we actually want.
    my (undef, $title)
        = $html_source =~ m{<(title|h1|h2|h3|h4)[^>]*>(.*?)</\1\s*>}si;

    # Fall back only when there is genuinely no title. A plain truth test
    # would also discard a legitimate title of "0".
    $title = 'no title'
        unless defined $title && $title =~ /\S/;

    my $tree      = HTML::Parse::parse_html($html_source);
    my $formatter = HTML::FormatText->new;
    my $text      = $formatter->format($tree);

    # Release the parse tree explicitly: HTML::Parse documents calling
    # ->delete on the returned tree once it is no longer needed.
    $tree->delete;

    # Explicit return so the braces can never be parsed as a bare block.
    return {
        'text'  => $text,
        'title' => $title,
    };
}
38 |
|
|
44 |
my ($pre,$title,$body) |
my ($pre,$title,$body) |
45 |
= $html_source =~ /^(.*?<title\s*>)(.*?)(<\/title\s*>.+)/si; |
= $html_source =~ /^(.*?<title\s*>)(.*?)(<\/title\s*>.+)/si; |
46 |
|
|
47 |
|
$pre .= ''; |
48 |
|
$title .= ''; |
49 |
|
$body .= ''; |
50 |
|
|
51 |
( |
( |
52 |
{'text' => 1}, decode_entities($pre), |
{'text' => 1}, decode_entities($pre), |
53 |
{'title' => 1}, decode_entities($title), |
{'title' => 2}, decode_entities($title), |
54 |
{'text' => 1}, decode_entities($body), |
{'text' => 1}, decode_entities($body), |
55 |
); |
); |
56 |
} |
} |
57 |
|
|
58 |
|
1; |