| 1 |
173 |
dpavlin |
#!/usr/bin/perl |
| 2 |
|
|
# Web Site Mapper |
| 3 |
|
|
# Copyright (C) 2004 Timm Murray |
| 4 |
|
|
# |
| 5 |
176 |
dpavlin |
# Somewhat modified by Dobrica Palinusic, see |
| 6 |
|
|
# http://svn.rot13.org/index.cgi/perl/view/trunk/web_site_mapper.pl |
| 7 |
|
|
# |
| 8 |
173 |
dpavlin |
# This program is free software; you can redistribute it and/or modify |
| 9 |
|
|
# it under the terms of the GNU General Public License as published by |
| 10 |
|
|
# the Free Software Foundation; either version 2 of the License, or |
| 11 |
|
|
# (at your option) any later version. |
| 12 |
|
|
# |
| 13 |
|
|
# This program is distributed in the hope that it will be useful, |
| 14 |
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 15 |
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 16 |
|
|
# GNU General Public License for more details. |
| 17 |
|
|
# |
| 18 |
|
|
# You should have received a copy of the GNU General Public License |
| 19 |
|
|
# along with this program; if not, write to the Free Software |
| 20 |
|
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
| 21 |
|
|
# |
| 22 |
|
|
|
| 23 |
|
|
use strict; |
| 24 |
|
|
use warnings; |
| 25 |
|
|
use WWW::Mechanize; |
| 26 |
|
|
use URI; |
| 27 |
176 |
dpavlin |
use YAML qw/Dump LoadFile/; |
| 28 |
174 |
dpavlin |
use Time::HiRes qw/time/; |
| 29 |
176 |
dpavlin |
use Data::Dump qw/dump/; |
| 30 |
173 |
dpavlin |
|
| 31 |
176 |
dpavlin |
my $template_yaml = <<'__EOF__'; |
| 32 |
|
|
--- |
| 33 |
|
|
#'http://www.example.com/': |
| 34 |
|
|
# form_name: 'loginForm' |
| 35 |
|
|
# fields: |
| 36 |
|
|
# 'user': 'foobar@example.com' |
| 37 |
|
|
# 'password': secret |
| 38 |
|
|
'http://blog.rot13.org': |
| 39 |
|
|
__EOF__ |
| 40 |
|
|
|
| 41 |
|
|
my $rc = $ENV{'HOME'} . '/.sites.yaml'; |
| 42 |
|
|
my $credentials; |
| 43 |
|
|
|
| 44 |
173 |
dpavlin |
# maximum re-visit of each page |
| 45 |
|
|
my $max_seen = 10; |
| 46 |
|
|
my %seen; |
| 47 |
|
|
|
| 48 |
176 |
dpavlin |
my ( @sites, @site_hosts ); |
| 49 |
173 |
dpavlin |
|
| 50 |
176 |
dpavlin |
sub DISALLOWED_SCHEMES () { qw( mailto javascript ) } |
| 51 |
|
|
|
| 52 |
|
|
my $debug = 1; |
| 53 |
174 |
dpavlin |
my $verbose = 0; |
| 54 |
|
|
|
| 55 |
176 |
dpavlin |
if ( -e $rc ) { |
| 56 |
|
|
$credentials = LoadFile( $rc ) or die "can't open $rc: $!"; |
| 57 |
|
|
@sites = keys %$credentials; |
| 58 |
|
|
@site_hosts = map { URI->new($_)->host } @sites; |
| 59 |
|
|
warn "# loaded $rc with credentilas for: ", join(", ", @sites), "\n"; |
| 60 |
|
|
warn dump( $credentials ); |
| 61 |
|
|
} else { |
| 62 |
|
|
open(my $yaml, '>', $rc ) || die "can't open $rc: $!"; |
| 63 |
|
|
print $yaml $template_yaml; |
| 64 |
|
|
close($yaml); |
| 65 |
|
|
warn "# create template $rc edit it to your needs\n"; |
| 66 |
|
|
exit 1; |
| 67 |
|
|
} |
| 68 |
|
|
|
| 69 |
|
|
my $mech = WWW::Mechanize->new(); |
| 70 |
|
|
|
| 71 |
|
|
sub get_page { |
| 72 |
174 |
dpavlin |
my ($data, $uri) = @_; |
| 73 |
173 |
dpavlin |
|
| 74 |
174 |
dpavlin |
my $page_to_load = $uri->canonical; |
| 75 |
173 |
dpavlin |
|
| 76 |
174 |
dpavlin |
# Don't process pages that have already been loaded |
| 77 |
|
|
if(exists $data->{$page_to_load}) { |
| 78 |
|
|
# warn "\t$page_to_load already indexed\n" if $debug; |
| 79 |
|
|
return; |
| 80 |
173 |
dpavlin |
} |
| 81 |
|
|
|
| 82 |
|
|
# Don't process pages that aren't listed in the sites above |
| 83 |
176 |
dpavlin |
unless ( grep { lc($uri->host) eq lc ($_) } @site_hosts ) { |
| 84 |
174 |
dpavlin |
warn "\t$page_to_load not in allowed sites\n" if $debug; |
| 85 |
173 |
dpavlin |
return; |
| 86 |
|
|
} |
| 87 |
|
|
|
| 88 |
174 |
dpavlin |
print "$page_to_load"; |
| 89 |
|
|
|
| 90 |
|
|
my $t = time(); |
| 91 |
|
|
|
| 92 |
173 |
dpavlin |
my $response = $mech->get( $page_to_load ); |
| 93 |
|
|
|
| 94 |
174 |
dpavlin |
$t = time() - $t; |
| 95 |
|
|
|
| 96 |
173 |
dpavlin |
$data->{$page_to_load}{status} = $mech->status; |
| 97 |
|
|
|
| 98 |
174 |
dpavlin |
if($mech->success) { |
| 99 |
173 |
dpavlin |
$data->{$page_to_load}{content_type} = $mech->ct; |
| 100 |
|
|
$data->{$page_to_load}{title} = $mech->title; |
| 101 |
|
|
|
| 102 |
|
|
my @links = map { $_->url_abs } $mech->links; |
| 103 |
174 |
dpavlin |
|
| 104 |
|
|
if ($debug) { |
| 105 |
|
|
warn "\tResponse successful\n"; |
| 106 |
|
|
warn "\tContent-type: ", $data->{$page_to_load}{content_type}, "\n"; |
| 107 |
|
|
warn "\tTitle: ", $data->{$page_to_load}{title}, "\n"; |
| 108 |
|
|
warn "\tLinks: " . join("\n", map "\t\t$_", @links) . "\n"; |
| 109 |
|
|
} else { |
| 110 |
|
|
my $size = length( $mech->content ); |
| 111 |
|
|
print " ", $mech->status, " ", $mech->ct, sprintf(" %d in %.2fs (%.2f b/s)", $size, $t, $size / $t), "\n"; |
| 112 |
|
|
} |
| 113 |
|
|
|
| 114 |
173 |
dpavlin |
$data->{$page_to_load}{links} = []; |
| 115 |
|
|
|
| 116 |
|
|
foreach my $link (@links) { |
| 117 |
|
|
my $uri = URI->new($link); |
| 118 |
|
|
next if grep { $uri->scheme eq $_ } DISALLOWED_SCHEMES; |
| 119 |
|
|
my $url = $uri->canonical->as_string; |
| 120 |
|
|
$url =~ s/#.*$//; |
| 121 |
174 |
dpavlin |
# warn "\tFollowing $url\n" if $debug; |
| 122 |
173 |
dpavlin |
my $url_no_params = $url; |
| 123 |
|
|
$url_no_params =~ s/\?.*$//; |
| 124 |
|
|
$seen{$url_no_params}++; |
| 125 |
|
|
if ($seen{$url_no_params} > $max_seen) { |
| 126 |
174 |
dpavlin |
print "skipped $url_no_params, seen $seen{$url_no_params}\n" if ($verbose); |
| 127 |
173 |
dpavlin |
next; |
| 128 |
|
|
} |
| 129 |
|
|
push @{ $data->{$page_to_load}{links} }, $url; |
| 130 |
|
|
|
| 131 |
|
|
get_page( $data, $uri ); |
| 132 |
|
|
} |
| 133 |
|
|
} |
| 134 |
|
|
else { |
| 135 |
174 |
dpavlin |
warn "\tResponse unsuccessful\n" if $debug; |
| 136 |
173 |
dpavlin |
} |
| 137 |
|
|
} |
| 138 |
|
|
|
| 139 |
|
|
|
| 140 |
|
|
{ |
| 141 |
|
|
my $data = { }; |
| 142 |
176 |
dpavlin |
foreach my $site ( @sites ) { |
| 143 |
|
|
warn "## indexing $site\n"; |
| 144 |
|
|
my $uri = URI->new($site); |
| 145 |
|
|
$mech->get( $uri ); |
| 146 |
|
|
if ( my $form = $credentials->{$site} ) { |
| 147 |
|
|
warn "## login using form ", dump($form); |
| 148 |
|
|
$mech->submit_form( %$form ); |
| 149 |
|
|
} |
| 150 |
|
|
get_page( $data, $uri ); |
| 151 |
|
|
} |
| 152 |
173 |
dpavlin |
|
| 153 |
174 |
dpavlin |
print Dump($data) if ($debug); |
| 154 |
173 |
dpavlin |
} |
| 155 |
|
|
|