242 |
$self->log->debug("looking for $message $tag"); |
$self->log->debug("looking for $message $tag"); |
243 |
@results = $tree->look_down( '_tag', $el, sub { |
@results = $tree->look_down( '_tag', $el, sub { |
244 |
return 1 unless ( $attr && $value ); |
return 1 unless ( $attr && $value ); |
245 |
( $_[0]->attr( $attr ) || '' ) eq $value; |
( $_[0]->attr( $attr ) || '' ) =~ m/\b\Q$value\E\b/ |
246 |
}); |
}); |
247 |
last if @results; |
last if @results; |
248 |
} |
} |
392 |
|
|
393 |
my $page_tree = HTML::TreeBuilder->new or die "can't create page tree"; |
my $page_tree = HTML::TreeBuilder->new or die "can't create page tree"; |
394 |
$page_tree->parse( $mech->content ) or die "can't parse page at $page_uri"; |
$page_tree->parse( $mech->content ) or die "can't parse page at $page_uri"; |
395 |
my $div = $self->element_by_triplet( |
my @divs = $self->element_by_triplet( |
396 |
tree => $page_tree, |
tree => $page_tree, |
397 |
message => "result page $nr", |
message => "result page $nr", |
398 |
triplets => $args->{scrape} |
triplets => $args->{scrape} |
399 |
); |
); |
400 |
|
|
401 |
$self->add_record( |
if ( @divs ) { |
402 |
in_feed => $feed, |
|
403 |
title => $mech->title, |
my $html = join("<hr/>\n", map { $_->as_HTML } @divs ); |
404 |
link => $page_uri, |
$self->log->debug("found ", $#divs + 1, " element ", length($html), " bytes"); |
405 |
content => $div->as_HTML, |
|
406 |
# summary => |
$self->add_record( |
407 |
# category => |
in_feed => $feed, |
408 |
# author => |
title => $mech->title, |
409 |
# issued => |
link => $page_uri, |
410 |
# modified => |
content => $html, |
411 |
) if ( $div ); |
# summary => |
412 |
|
# category => |
413 |
|
# author => |
414 |
|
# issued => |
415 |
|
# modified => |
416 |
|
); |
417 |
|
|
418 |
|
} else { |
419 |
|
$self->log->debug("NO CONTENT scraped from page $nr"); |
420 |
|
} |
421 |
|
|
422 |
$mech->back; |
$mech->back; |
423 |
$page_tree->delete; |
$page_tree->delete; |