Line # Revision Author
1 11 dpavlin #!/usr/bin/perl -w
2 use strict;
3 $|++;
4
5 # Mirror CPAN latest archive. Based on article "Mirroring your own mini-CPAN"
6 # by Randal L. Schwartz for Linux Magazine Column 42 (Nov 2002) available on
7 # http://www.stonehenge.com/merlyn/LinuxMag/col42.html
8 # http://www.stonehenge.com/merlyn/LinuxMag/col42.listing.txt
9
10 # TODO:
11 # - support for ZIP archives (and fix .tar.gz cludges)
12 # - add version requirement for Archive::Tar (0.22 has a bug which
13 # prevents it to extract some tars)
14
15 ### CONFIG
16
17 #my $REMOTE = "http://ftp.linux.hr/CPAN/";
18 46 dpavlin my $REMOTE = "http://www.cpan.org/";
19 #my $REMOTE = "http://cpan.pliva.hr/";
20 11 dpavlin # my $REMOTE = "http://fi.cpan.org/";
21 # my $REMOTE = "http://au.cpan.org/";
22 # my $REMOTE = "file://Users/merlyn/MIRROR/CPAN/";
23
24 ## warning: unknown files below this dir are deleted!
25 15 dpavlin #my $LOCAL = "/mirrors/cpan/CPAN/";
26 my $LOCAL = "/rest/cpan/CPAN/";
27 11 dpavlin
28 my $TRACE = 0;
29
30 33 dpavlin # This may or may not save you some disks space (depending on filesystem
31 # that you use to store CPAN mirror)
32 #
33 # If you want to create gziped readme files, change $readme_ext to
34 # my $readme_ext = '.readme.gz';
35 #
36 # I found out that gziping readme files doesn't save any
37 # space at one server and saves space on my laptop. YMMV
38 my $readme_ext = '.readme.gz';
39
40 11 dpavlin ### END CONFIG
41
42 ## core -
43 use File::Path qw(mkpath);
44 use File::Basename qw(dirname);
45 use File::Spec::Functions qw(catfile devnull);
46 use File::Find qw(find);
47 28 dpavlin use Getopt::Long;
48 32 dpavlin use IO::Zlib;
49 11 dpavlin
50 ## LWP -
51 use URI ();
52 use LWP::Simple qw(mirror RC_OK RC_NOT_MODIFIED);
53
54 ## Compress::Zlib -
55 use Compress::Zlib qw(gzopen $gzerrno);
56
57 ## Archive::Tar -
58 use Archive::Tar qw();
59
60 28 dpavlin ## process command-line arguments
61 my $result = GetOptions(
62 "local=s" => \$LOCAL,
63 "remote=s" => \$REMOTE,
64 "verbose!" => \$TRACE,
65 "debug!" => \$TRACE
66 );
67
68 print "local path: $LOCAL\nremote URI: $REMOTE\n" if ($TRACE);
69
70 11 dpavlin ## first, get index files
71 my_mirror($_) for qw(
72 authors/01mailrc.txt.gz
73 modules/02packages.details.txt.gz
74 modules/03modlist.data.gz
75 47 dpavlin MIRRORED.BY
76 11 dpavlin );
77
78 ## now walk the packages list
79 my $details = catfile($LOCAL, qw(modules 02packages.details.txt.gz));
80 my $gz = gzopen($details, "rb") or die "Cannot open details: $gzerrno";
81 my $inheader = 1;
82 while ($gz->gzreadline($_) > 0) {
83 if ($inheader) {
84 $inheader = 0 unless /\S/;
85 next;
86 }
87
88 my ($module, $version, $path) = split;
89 next if $path =~ m{/perl-5}; # skip Perl distributions
90 my_mirror("authors/id/$path", 1);
91 }
92
93 ## finally, clean the files we didn't stick there
94 clean_unmirrored();
95
96 47 dpavlin print "creating 'indices/ls-lR.gz'\n";
97 system "cd $LOCAL && ls -lR | gzip > indices/ls-lR.gz" || die "$!";
98
99 11 dpavlin exit 0;
100
101 BEGIN {
102 ## %mirrored tracks the already done, keyed by filename
103 ## 1 = local-checked, 2 = remote-mirrored
104 my %mirrored;
105
106 sub my_mirror {
107 my $path = shift; # partial URL
108 my $skip_if_present = shift; # true/false
109
110 my $remote_uri = URI->new_abs($path, $REMOTE)->as_string; # full URL
111 my $local_file = catfile($LOCAL, split "/", $path); # native absolute file
112 my $checksum_might_be_up_to_date = 1;
113
114 if ($skip_if_present and -f $local_file) {
115 ## upgrade to checked if not already
116 $mirrored{$local_file} = 1 unless $mirrored{$local_file};
117 } elsif (($mirrored{$local_file} || 0) < 2) {
118 ## upgrade to full mirror
119 $mirrored{$local_file} = 2;
120
121 mkpath(dirname($local_file), $TRACE, 0711);
122 print $path if $TRACE;
123 my $status = mirror($remote_uri, $local_file);
124
125 if ($status == RC_OK) {
126 $checksum_might_be_up_to_date = 0;
127 print " ... updated\n" if $TRACE;
128 } elsif ($status != RC_NOT_MODIFIED) {
129 warn "\n$remote_uri: $status\n";
130 return;
131 } else {
132 print " ... up to date\n" if $TRACE;
133 }
134 }
135
136 if ($path =~ m{^authors/id}) { # maybe fetch CHECKSUMS
137 my $checksum_path =
138 URI->new_abs("CHECKSUMS", $remote_uri)->rel($REMOTE);
139 if ($path ne $checksum_path) {
140 my_mirror($checksum_path, $checksum_might_be_up_to_date);
141 }
142 }
143 }
144
145 sub clean_unmirrored {
146 find sub {
147 33 dpavlin return if /${readme_ext}$/; # don't erase readme files
148 46 dpavlin check_readme($File::Find::name) if ($mirrored{$File::Find::name} && $mirrored{$File::Find::name} == 2);
149 11 dpavlin return unless -f and not $mirrored{$File::Find::name};
150 print "$File::Find::name ... removed\n" if $TRACE;
151 unlink $_ or warn "Cannot remove $File::Find::name: $!";
152 my $path = $File::Find::name;
153 33 dpavlin if ($path =~ s/(\.tar\.gz|\.tgz)/${readme_ext}/g && -f $path) {
154 11 dpavlin # only if we erase archive also!
155 unlink $path or warn "Cannot remove $path: $!";
156 }
157 }, $LOCAL;
158 }
159
160 sub check_readme {
161
162 my $path = shift;
163 # fixup some things
164 my $readme_path = $path;
165 33 dpavlin $readme_path =~ s/\.(tar\.gz|\.tgz)/${readme_ext}/g || return; # just .tar.gz is supported!
166 11 dpavlin
167 my $at = Archive::Tar->new($path) or die "Archive::Tar failed on $path\n";
168
169 if (! -f $readme_path) {
170 # create readme file
171 32 dpavlin my @readmes = sort grep m{^[^/]+/README\z}i, $at->list_files();
172 11 dpavlin my $readme;
173
174 if ($readme = shift @readmes) {
175 33 dpavlin my $fh;
176 if ($readme_ext =~ m/\.gz/) {
177 $fh = IO::Zlib->new($readme_path, "wb");
178 } else {
179 $fh = IO::File->new($readme_path, "w");
180 }
181 32 dpavlin if (defined $fh) {
182 print $fh $at->get_content($readme);
183 $fh->close;
184 } else {
185 die "Cannot create $readme_path: $!";
186 }
187 11 dpavlin
188 print "$readme_path ... created\n" if $TRACE;
189
190 } else {
191
192 $readme_path =~ s/^.+\/(.+)$/$1/;
193 print "can't find readme for $readme_path ...\n" if $TRACE;
194
195 }
196
197 }
198 }
199
200
201 }