#!/usr/bin/perl # Web Site Mapper # Copyright (C) 2004 Timm Murray # # Somewhat modified by Dobrica Palinusic, see # http://svn.rot13.org/index.cgi/perl/view/trunk/web_site_mapper.pl # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # use strict; use warnings; use WWW::Mechanize; use URI; use YAML qw/Dump LoadFile/; use Time::HiRes qw/time/; use Data::Dump qw/dump/; my $template_yaml = <<'__EOF__'; --- #'http://www.example.com/': # form_name: 'loginForm' # fields: # 'user': 'foobar@example.com' # 'password': secret 'http://blog.rot13.org': __EOF__ my $rc = $ENV{'HOME'} . '/.sites.yaml'; my $credentials; # maximum re-visit of each page my $max_seen = 10; my %seen; my ( @sites, @site_hosts ); sub DISALLOWED_SCHEMES () { qw( mailto javascript ) } my $debug = 1; my $verbose = 0; if ( -e $rc ) { $credentials = LoadFile( $rc ) or die "can't open $rc: $!"; @sites = keys %$credentials; @site_hosts = map { URI->new($_)->host } @sites; warn "# loaded $rc with credentilas for: ", join(", ", @sites), "\n"; warn dump( $credentials ); } else { open(my $yaml, '>', $rc ) || die "can't open $rc: $!"; print $yaml $template_yaml; close($yaml); warn "# create template $rc edit it to your needs\n"; exit 1; } my $mech = WWW::Mechanize->new(); sub get_page { my ($data, $uri) = @_; my $page_to_load = $uri->canonical; # Don't process pages that have already been loaded if(exists $data->{$page_to_load}) { # warn "\t$page_to_load already indexed\n" if $debug; return; } # Don't process pages that aren't listed in the sites above unless ( grep { lc($uri->host) eq lc ($_) } @site_hosts ) { warn "\t$page_to_load not in allowed sites\n" if $debug; return; } print "$page_to_load"; my $t = time(); my $response = $mech->get( $page_to_load ); $t = time() - $t; $data->{$page_to_load}{status} = $mech->status; if($mech->success) { $data->{$page_to_load}{content_type} = $mech->ct; $data->{$page_to_load}{title} = $mech->title; my @links = map { $_->url_abs } $mech->links; if ($debug) { warn "\tResponse successful\n"; warn "\tContent-type: ", $data->{$page_to_load}{content_type}, "\n"; warn "\tTitle: ", $data->{$page_to_load}{title}, "\n"; warn "\tLinks: " . join("\n", map "\t\t$_", @links) . "\n"; } else { my $size = length( $mech->content ); print " ", $mech->status, " ", $mech->ct, sprintf(" %d in %.2fs (%.2f b/s)", $size, $t, $size / $t), "\n"; } $data->{$page_to_load}{links} = []; foreach my $link (@links) { my $uri = URI->new($link); next if grep { $uri->scheme eq $_ } DISALLOWED_SCHEMES; my $url = $uri->canonical->as_string; $url =~ s/#.*$//; # warn "\tFollowing $url\n" if $debug; my $url_no_params = $url; $url_no_params =~ s/\?.*$//; $seen{$url_no_params}++; if ($seen{$url_no_params} > $max_seen) { print "skipped $url_no_params, seen $seen{$url_no_params}\n" if ($verbose); next; } push @{ $data->{$page_to_load}{links} }, $url; get_page( $data, $uri ); } } else { warn "\tResponse unsuccessful\n" if $debug; } } { my $data = { }; foreach my $site ( @sites ) { warn "## indexing $site\n"; my $uri = URI->new($site); $mech->get( $uri ); if ( my $form = $credentials->{$site} ) { warn "## login using form ", dump($form); $mech->submit_form( %$form ); } get_page( $data, $uri ); } print Dump($data) if ($debug); }