| Revision 176 (by dpavlin, 2008/03/11 12:29:28) |
- move configuration to ~/.sites.yaml
- added ability to submit form at beginning (for login) |
#!/usr/bin/perl
# Web Site Mapper
# Copyright (C) 2004 Timm Murray
#
# Somewhat modified by Dobrica Palinusic, see
# http://svn.rot13.org/index.cgi/perl/view/trunk/web_site_mapper.pl
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
use strict;
use warnings;
use WWW::Mechanize;
use URI;
use YAML qw/Dump LoadFile/;
use Time::HiRes qw/time/;
use Data::Dump qw/dump/;
my $template_yaml = <<'__EOF__';
---
#'http://www.example.com/':
# form_name: 'loginForm'
# fields:
# 'user': 'foobar@example.com'
# 'password': secret
'http://blog.rot13.org':
__EOF__
my $rc = $ENV{'HOME'} . '/.sites.yaml';
my $credentials;
# maximum re-visit of each page
my $max_seen = 10;
my %seen;
my ( @sites, @site_hosts );
sub DISALLOWED_SCHEMES () { qw( mailto javascript ) }
my $debug = 1;
my $verbose = 0;
if ( -e $rc ) {
$credentials = LoadFile( $rc ) or die "can't open $rc: $!";
@sites = keys %$credentials;
@site_hosts = map { URI->new($_)->host } @sites;
warn "# loaded $rc with credentilas for: ", join(", ", @sites), "\n";
warn dump( $credentials );
} else {
open(my $yaml, '>', $rc ) || die "can't open $rc: $!";
print $yaml $template_yaml;
close($yaml);
warn "# create template $rc edit it to your needs\n";
exit 1;
}
my $mech = WWW::Mechanize->new();
sub get_page {
my ($data, $uri) = @_;
my $page_to_load = $uri->canonical;
# Don't process pages that have already been loaded
if(exists $data->{$page_to_load}) {
# warn "\t$page_to_load already indexed\n" if $debug;
return;
}
# Don't process pages that aren't listed in the sites above
unless ( grep { lc($uri->host) eq lc ($_) } @site_hosts ) {
warn "\t$page_to_load not in allowed sites\n" if $debug;
return;
}
print "$page_to_load";
my $t = time();
my $response = $mech->get( $page_to_load );
$t = time() - $t;
$data->{$page_to_load}{status} = $mech->status;
if($mech->success) {
$data->{$page_to_load}{content_type} = $mech->ct;
$data->{$page_to_load}{title} = $mech->title;
my @links = map { $_->url_abs } $mech->links;
if ($debug) {
warn "\tResponse successful\n";
warn "\tContent-type: ", $data->{$page_to_load}{content_type}, "\n";
warn "\tTitle: ", $data->{$page_to_load}{title}, "\n";
warn "\tLinks: " . join("\n", map "\t\t$_", @links) . "\n";
} else {
my $size = length( $mech->content );
print " ", $mech->status, " ", $mech->ct, sprintf(" %d in %.2fs (%.2f b/s)", $size, $t, $size / $t), "\n";
}
$data->{$page_to_load}{links} = [];
foreach my $link (@links) {
my $uri = URI->new($link);
next if grep { $uri->scheme eq $_ } DISALLOWED_SCHEMES;
my $url = $uri->canonical->as_string;
$url =~ s/#.*$//;
# warn "\tFollowing $url\n" if $debug;
my $url_no_params = $url;
$url_no_params =~ s/\?.*$//;
$seen{$url_no_params}++;
if ($seen{$url_no_params} > $max_seen) {
print "skipped $url_no_params, seen $seen{$url_no_params}\n" if ($verbose);
next;
}
push @{ $data->{$page_to_load}{links} }, $url;
get_page( $data, $uri );
}
}
else {
warn "\tResponse unsuccessful\n" if $debug;
}
}
{
my $data = { };
foreach my $site ( @sites ) {
warn "## indexing $site\n";
my $uri = URI->new($site);
$mech->get( $uri );
if ( my $form = $credentials->{$site} ) {
warn "## login using form ", dump($form);
$mech->submit_form( %$form );
}
get_page( $data, $uri );
}
print Dump($data) if ($debug);
}