--- trunk/google-groups2mbox.pl 2010/11/26 14:37:09 140
+++ trunk/google-groups2mbox.pl 2010/11/26 14:54:52 141
@@ -2,7 +2,18 @@
 use warnings;
 use strict;
 
-my $group = 'angular';
+# Crawls Google Groups and creates an mbox archive
+# Dobrica Pavlinusic 2010-11-26
+#
+# usage:
+#
+#   ./google-groups2mbox.pl angular > angular
+#
+# You can also continue the import from a selected offset:
+#
+#   START=30 ./google-groups2mbox.pl angular >> angular
+
+my $group = $ARGV[0] || die "usage: $0 google-groups-name\n";
 
 use WWW::Mechanize;
 use Data::Dump qw(dump);
@@ -11,18 +22,41 @@
 
 $mech->get( "http://groups.google.com/group/$group/topics?gvc=2" );
 
-foreach my $link ( $mech->find_all_links( url_regex => qr/browse_thread/ ) ) {
-	print STDERR "# ",$link->text;
-	$mech->follow_link( url => $link->url );
-	foreach my $m_link ( $mech->find_all_links( url_regex => qr/dmode=source/ ) ) {
-		$mech->get( $m_link->url . '&output=gplain' );
-		print STDERR ".";
-		my $msg = $mech->content;
-		$msg =~ s/\r//gs;
-		$msg =~ s/^\s+//s;
-		print "From $group\@googlegroups.com " . localtime() . "\n$msg\n";
+# Thread offset to resume from. It is interpolated into a regex below, so
+# only plain digits are accepted.
+my $start = $ENV{START} || 0;
+die "START must be a non-negative integer\n" unless $start =~ m/\A\d+\z/;
+
+# Walk the topic listing page by page. $start counts the threads seen so
+# far and is also the offset used to find the link to the next page.
+while (1) {
+	if ( $start ) {
+		# \b keeps start=3 from also matching start=30; when there is no
+		# next-page link we are done, so leave the loop cleanly
+		my $next = $mech->find_link( url_regex => qr/start=$start\b/ ) or last;
+		$mech->get( $next->url );
+	}
+
+	my @threads = $mech->find_all_links( url_regex => qr/browse_thread/ );
+	last unless @threads; # no threads on this page, nothing left to archive
+
+	foreach my $link ( @threads ) {
+		$start++;
+		print STDERR "# $start ",$link->text;
+		$mech->follow_link( url => $link->url );
+		foreach my $m_link ( $mech->find_all_links( url_regex => qr/dmode=source/ ) ) {
+			$mech->get( $m_link->url . '&output=gplain' );
+			print STDERR ".";
+			my $msg = $mech->content;
+			$msg =~ s/\r//gs;
+			$msg =~ s/^\s+//s;
+			# mboxrd quoting: a message line starting with "From " would
+			# otherwise be read as the start of the next message
+			$msg =~ s/^(>*From )/>$1/mg;
+			print "From $group\@googlegroups.com " . localtime() . "\n$msg\n";
+			$mech->back;
+			sleep 1; # pause between message fetches
+		}
+		print STDERR "\n";
 		$mech->back;
 	}
-	print STDERR "\n";
-	$mech->back;
 }