| 1 |
11 |
dpavlin |
#!/usr/bin/perl -w |
| 2 |
|
|
use strict; |
| 3 |
|
|
$|++; |
| 4 |
|
|
|
| 5 |
|
|
# Mirror CPAN latest archive. Based on article "Mirroring your own mini-CPAN" |
| 6 |
|
|
# by Randal L. Schwartz for Linux Magazine Column 42 (Nov 2002) available on |
| 7 |
|
|
# http://www.stonehenge.com/merlyn/LinuxMag/col42.html |
| 8 |
|
|
# http://www.stonehenge.com/merlyn/LinuxMag/col42.listing.txt |
| 9 |
|
|
|
| 10 |
|
|
# TODO: |
| 11 |
|
|
# - support for ZIP archives (and fix .tar.gz cludges) |
| 12 |
|
|
# - add version requirement for Archive::Tar (0.22 has a bug which |
| 13 |
|
|
# prevents it to extract some tars) |
| 14 |
|
|
|
| 15 |
|
|
### CONFIG |
| 16 |
|
|
|
| 17 |
|
|
#my $REMOTE = "http://ftp.linux.hr/CPAN/"; |
| 18 |
46 |
dpavlin |
my $REMOTE = "http://www.cpan.org/"; |
| 19 |
|
|
#my $REMOTE = "http://cpan.pliva.hr/"; |
| 20 |
11 |
dpavlin |
# my $REMOTE = "http://fi.cpan.org/"; |
| 21 |
|
|
# my $REMOTE = "http://au.cpan.org/"; |
| 22 |
|
|
# my $REMOTE = "file://Users/merlyn/MIRROR/CPAN/"; |
| 23 |
|
|
|
| 24 |
|
|
## warning: unknown files below this dir are deleted! |
| 25 |
15 |
dpavlin |
#my $LOCAL = "/mirrors/cpan/CPAN/"; |
| 26 |
|
|
my $LOCAL = "/rest/cpan/CPAN/"; |
| 27 |
11 |
dpavlin |
|
| 28 |
|
|
my $TRACE = 0; |
| 29 |
|
|
|
| 30 |
33 |
dpavlin |
# This may or may not save you some disks space (depending on filesystem |
| 31 |
|
|
# that you use to store CPAN mirror) |
| 32 |
|
|
# |
| 33 |
|
|
# If you want to create gziped readme files, change $readme_ext to |
| 34 |
|
|
# my $readme_ext = '.readme.gz'; |
| 35 |
|
|
# |
| 36 |
|
|
# I found out that gziping readme files doesn't save any |
| 37 |
|
|
# space at one server and saves space on my laptop. YMMV |
| 38 |
|
|
my $readme_ext = '.readme.gz'; |
| 39 |
|
|
|
| 40 |
11 |
dpavlin |
### END CONFIG |
| 41 |
|
|
|
| 42 |
|
|
## core - |
| 43 |
|
|
use File::Path qw(mkpath); |
| 44 |
|
|
use File::Basename qw(dirname); |
| 45 |
|
|
use File::Spec::Functions qw(catfile devnull); |
| 46 |
|
|
use File::Find qw(find); |
| 47 |
28 |
dpavlin |
use Getopt::Long; |
| 48 |
32 |
dpavlin |
use IO::Zlib; |
| 49 |
11 |
dpavlin |
|
| 50 |
|
|
## LWP - |
| 51 |
|
|
use URI (); |
| 52 |
|
|
use LWP::Simple qw(mirror RC_OK RC_NOT_MODIFIED); |
| 53 |
|
|
|
| 54 |
|
|
## Compress::Zlib - |
| 55 |
|
|
use Compress::Zlib qw(gzopen $gzerrno); |
| 56 |
|
|
|
| 57 |
|
|
## Archive::Tar - |
| 58 |
|
|
use Archive::Tar qw(); |
| 59 |
|
|
|
| 60 |
28 |
dpavlin |
## process command-line arguments |
| 61 |
|
|
my $result = GetOptions( |
| 62 |
|
|
"local=s" => \$LOCAL, |
| 63 |
|
|
"remote=s" => \$REMOTE, |
| 64 |
|
|
"verbose!" => \$TRACE, |
| 65 |
|
|
"debug!" => \$TRACE |
| 66 |
|
|
); |
| 67 |
|
|
|
| 68 |
|
|
print "local path: $LOCAL\nremote URI: $REMOTE\n" if ($TRACE); |
| 69 |
|
|
|
| 70 |
11 |
dpavlin |
## first, get index files |
| 71 |
|
|
my_mirror($_) for qw( |
| 72 |
|
|
authors/01mailrc.txt.gz |
| 73 |
|
|
modules/02packages.details.txt.gz |
| 74 |
|
|
modules/03modlist.data.gz |
| 75 |
47 |
dpavlin |
MIRRORED.BY |
| 76 |
11 |
dpavlin |
); |
| 77 |
|
|
|
| 78 |
|
|
## now walk the packages list |
| 79 |
|
|
my $details = catfile($LOCAL, qw(modules 02packages.details.txt.gz)); |
| 80 |
|
|
my $gz = gzopen($details, "rb") or die "Cannot open details: $gzerrno"; |
| 81 |
|
|
my $inheader = 1; |
| 82 |
|
|
while ($gz->gzreadline($_) > 0) { |
| 83 |
|
|
if ($inheader) { |
| 84 |
|
|
$inheader = 0 unless /\S/; |
| 85 |
|
|
next; |
| 86 |
|
|
} |
| 87 |
|
|
|
| 88 |
|
|
my ($module, $version, $path) = split; |
| 89 |
|
|
next if $path =~ m{/perl-5}; # skip Perl distributions |
| 90 |
|
|
my_mirror("authors/id/$path", 1); |
| 91 |
|
|
} |
| 92 |
|
|
|
| 93 |
|
|
## finally, clean the files we didn't stick there |
| 94 |
|
|
clean_unmirrored(); |
| 95 |
|
|
|
| 96 |
47 |
dpavlin |
print "creating 'indices/ls-lR.gz'\n"; |
| 97 |
|
|
system "cd $LOCAL && ls -lR | gzip > indices/ls-lR.gz" || die "$!"; |
| 98 |
|
|
|
| 99 |
11 |
dpavlin |
exit 0; |
| 100 |
|
|
|
| 101 |
|
|
BEGIN { |
| 102 |
|
|
## %mirrored tracks the already done, keyed by filename |
| 103 |
|
|
## 1 = local-checked, 2 = remote-mirrored |
| 104 |
|
|
my %mirrored; |
| 105 |
|
|
|
| 106 |
|
|
sub my_mirror { |
| 107 |
|
|
my $path = shift; # partial URL |
| 108 |
|
|
my $skip_if_present = shift; # true/false |
| 109 |
|
|
|
| 110 |
|
|
my $remote_uri = URI->new_abs($path, $REMOTE)->as_string; # full URL |
| 111 |
|
|
my $local_file = catfile($LOCAL, split "/", $path); # native absolute file |
| 112 |
|
|
my $checksum_might_be_up_to_date = 1; |
| 113 |
|
|
|
| 114 |
|
|
if ($skip_if_present and -f $local_file) { |
| 115 |
|
|
## upgrade to checked if not already |
| 116 |
|
|
$mirrored{$local_file} = 1 unless $mirrored{$local_file}; |
| 117 |
|
|
} elsif (($mirrored{$local_file} || 0) < 2) { |
| 118 |
|
|
## upgrade to full mirror |
| 119 |
|
|
$mirrored{$local_file} = 2; |
| 120 |
|
|
|
| 121 |
|
|
mkpath(dirname($local_file), $TRACE, 0711); |
| 122 |
|
|
print $path if $TRACE; |
| 123 |
|
|
my $status = mirror($remote_uri, $local_file); |
| 124 |
|
|
|
| 125 |
|
|
if ($status == RC_OK) { |
| 126 |
|
|
$checksum_might_be_up_to_date = 0; |
| 127 |
|
|
print " ... updated\n" if $TRACE; |
| 128 |
|
|
} elsif ($status != RC_NOT_MODIFIED) { |
| 129 |
|
|
warn "\n$remote_uri: $status\n"; |
| 130 |
|
|
return; |
| 131 |
|
|
} else { |
| 132 |
|
|
print " ... up to date\n" if $TRACE; |
| 133 |
|
|
} |
| 134 |
|
|
} |
| 135 |
|
|
|
| 136 |
|
|
if ($path =~ m{^authors/id}) { # maybe fetch CHECKSUMS |
| 137 |
|
|
my $checksum_path = |
| 138 |
|
|
URI->new_abs("CHECKSUMS", $remote_uri)->rel($REMOTE); |
| 139 |
|
|
if ($path ne $checksum_path) { |
| 140 |
|
|
my_mirror($checksum_path, $checksum_might_be_up_to_date); |
| 141 |
|
|
} |
| 142 |
|
|
} |
| 143 |
|
|
} |
| 144 |
|
|
|
| 145 |
|
|
sub clean_unmirrored { |
| 146 |
|
|
find sub { |
| 147 |
33 |
dpavlin |
return if /${readme_ext}$/; # don't erase readme files |
| 148 |
46 |
dpavlin |
check_readme($File::Find::name) if ($mirrored{$File::Find::name} && $mirrored{$File::Find::name} == 2); |
| 149 |
11 |
dpavlin |
return unless -f and not $mirrored{$File::Find::name}; |
| 150 |
|
|
print "$File::Find::name ... removed\n" if $TRACE; |
| 151 |
|
|
unlink $_ or warn "Cannot remove $File::Find::name: $!"; |
| 152 |
|
|
my $path = $File::Find::name; |
| 153 |
33 |
dpavlin |
if ($path =~ s/(\.tar\.gz|\.tgz)/${readme_ext}/g && -f $path) { |
| 154 |
11 |
dpavlin |
# only if we erase archive also! |
| 155 |
|
|
unlink $path or warn "Cannot remove $path: $!"; |
| 156 |
|
|
} |
| 157 |
|
|
}, $LOCAL; |
| 158 |
|
|
} |
| 159 |
|
|
|
| 160 |
|
|
sub check_readme { |
| 161 |
|
|
|
| 162 |
|
|
my $path = shift; |
| 163 |
|
|
# fixup some things |
| 164 |
|
|
my $readme_path = $path; |
| 165 |
33 |
dpavlin |
$readme_path =~ s/\.(tar\.gz|\.tgz)/${readme_ext}/g || return; # just .tar.gz is supported! |
| 166 |
11 |
dpavlin |
|
| 167 |
|
|
my $at = Archive::Tar->new($path) or die "Archive::Tar failed on $path\n"; |
| 168 |
|
|
|
| 169 |
|
|
if (! -f $readme_path) { |
| 170 |
|
|
# create readme file |
| 171 |
32 |
dpavlin |
my @readmes = sort grep m{^[^/]+/README\z}i, $at->list_files(); |
| 172 |
11 |
dpavlin |
my $readme; |
| 173 |
|
|
|
| 174 |
|
|
if ($readme = shift @readmes) { |
| 175 |
33 |
dpavlin |
my $fh; |
| 176 |
|
|
if ($readme_ext =~ m/\.gz/) { |
| 177 |
|
|
$fh = IO::Zlib->new($readme_path, "wb"); |
| 178 |
|
|
} else { |
| 179 |
|
|
$fh = IO::File->new($readme_path, "w"); |
| 180 |
|
|
} |
| 181 |
32 |
dpavlin |
if (defined $fh) { |
| 182 |
|
|
print $fh $at->get_content($readme); |
| 183 |
|
|
$fh->close; |
| 184 |
|
|
} else { |
| 185 |
|
|
die "Cannot create $readme_path: $!"; |
| 186 |
|
|
} |
| 187 |
11 |
dpavlin |
|
| 188 |
|
|
print "$readme_path ... created\n" if $TRACE; |
| 189 |
|
|
|
| 190 |
|
|
} else { |
| 191 |
|
|
|
| 192 |
|
|
$readme_path =~ s/^.+\/(.+)$/$1/; |
| 193 |
|
|
print "can't find readme for $readme_path ...\n" if $TRACE; |
| 194 |
|
|
|
| 195 |
|
|
} |
| 196 |
|
|
|
| 197 |
|
|
} |
| 198 |
|
|
} |
| 199 |
|
|
|
| 200 |
|
|
|
| 201 |
|
|
} |