Line # Revision Author
1 173 dpavlin #!/usr/bin/perl
2 # Web Site Mapper
3 # Copyright (C) 2004 Timm Murray
4 #
5 176 dpavlin # Somewhat modified by Dobrica Palinusic, see
6 # http://svn.rot13.org/index.cgi/perl/view/trunk/web_site_mapper.pl
7 #
8 173 dpavlin # This program is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 #
22
23 use strict;
24 use warnings;
25 use WWW::Mechanize;
26 use URI;
27 176 dpavlin use YAML qw/Dump LoadFile/;
28 174 dpavlin use Time::HiRes qw/time/;
29 176 dpavlin use Data::Dump qw/dump/;
30 173 dpavlin
31 176 dpavlin my $template_yaml = <<'__EOF__';
32 ---
33 #'http://www.example.com/':
34 # form_name: 'loginForm'
35 # fields:
36 # 'user': 'foobar@example.com'
37 # 'password': secret
38 'http://blog.rot13.org':
39 __EOF__
40
41 my $rc = $ENV{'HOME'} . '/.sites.yaml';
42 my $credentials;
43
44 173 dpavlin # maximum re-visit of each page
45 my $max_seen = 10;
46 my %seen;
47
48 176 dpavlin my ( @sites, @site_hosts );
49 173 dpavlin
50 176 dpavlin sub DISALLOWED_SCHEMES () { qw( mailto javascript ) }
51
52 my $debug = 1;
53 174 dpavlin my $verbose = 0;
54
55 176 dpavlin if ( -e $rc ) {
56 $credentials = LoadFile( $rc ) or die "can't open $rc: $!";
57 @sites = keys %$credentials;
58 @site_hosts = map { URI->new($_)->host } @sites;
59 warn "# loaded $rc with credentilas for: ", join(", ", @sites), "\n";
60 warn dump( $credentials );
61 } else {
62 open(my $yaml, '>', $rc ) || die "can't open $rc: $!";
63 print $yaml $template_yaml;
64 close($yaml);
65 warn "# create template $rc edit it to your needs\n";
66 exit 1;
67 }
68
69 my $mech = WWW::Mechanize->new();
70
71 sub get_page {
72 174 dpavlin my ($data, $uri) = @_;
73 173 dpavlin
74 174 dpavlin my $page_to_load = $uri->canonical;
75 173 dpavlin
76 174 dpavlin # Don't process pages that have already been loaded
77 if(exists $data->{$page_to_load}) {
78 # warn "\t$page_to_load already indexed\n" if $debug;
79 return;
80 173 dpavlin }
81
82 # Don't process pages that aren't listed in the sites above
83 176 dpavlin unless ( grep { lc($uri->host) eq lc ($_) } @site_hosts ) {
84 174 dpavlin warn "\t$page_to_load not in allowed sites\n" if $debug;
85 173 dpavlin return;
86 }
87
88 174 dpavlin print "$page_to_load";
89
90 my $t = time();
91
92 173 dpavlin my $response = $mech->get( $page_to_load );
93
94 174 dpavlin $t = time() - $t;
95
96 173 dpavlin $data->{$page_to_load}{status} = $mech->status;
97
98 174 dpavlin if($mech->success) {
99 173 dpavlin $data->{$page_to_load}{content_type} = $mech->ct;
100 $data->{$page_to_load}{title} = $mech->title;
101
102 my @links = map { $_->url_abs } $mech->links;
103 174 dpavlin
104 if ($debug) {
105 warn "\tResponse successful\n";
106 warn "\tContent-type: ", $data->{$page_to_load}{content_type}, "\n";
107 warn "\tTitle: ", $data->{$page_to_load}{title}, "\n";
108 warn "\tLinks: " . join("\n", map "\t\t$_", @links) . "\n";
109 } else {
110 my $size = length( $mech->content );
111 print " ", $mech->status, " ", $mech->ct, sprintf(" %d in %.2fs (%.2f b/s)", $size, $t, $size / $t), "\n";
112 }
113
114 173 dpavlin $data->{$page_to_load}{links} = [];
115
116 foreach my $link (@links) {
117 my $uri = URI->new($link);
118 next if grep { $uri->scheme eq $_ } DISALLOWED_SCHEMES;
119 my $url = $uri->canonical->as_string;
120 $url =~ s/#.*$//;
121 174 dpavlin # warn "\tFollowing $url\n" if $debug;
122 173 dpavlin my $url_no_params = $url;
123 $url_no_params =~ s/\?.*$//;
124 $seen{$url_no_params}++;
125 if ($seen{$url_no_params} > $max_seen) {
126 174 dpavlin print "skipped $url_no_params, seen $seen{$url_no_params}\n" if ($verbose);
127 173 dpavlin next;
128 }
129 push @{ $data->{$page_to_load}{links} }, $url;
130
131 get_page( $data, $uri );
132 }
133 }
134 else {
135 174 dpavlin warn "\tResponse unsuccessful\n" if $debug;
136 173 dpavlin }
137 }
138
139
140 {
141 my $data = { };
142 176 dpavlin foreach my $site ( @sites ) {
143 warn "## indexing $site\n";
144 my $uri = URI->new($site);
145 $mech->get( $uri );
146 if ( my $form = $credentials->{$site} ) {
147 warn "## login using form ", dump($form);
148 $mech->submit_form( %$form );
149 }
150 get_page( $data, $uri );
151 }
152 173 dpavlin
153 174 dpavlin print Dump($data) if ($debug);
154 173 dpavlin }
155