#============================================================= -*-perl-*-
#
# BackupPC::Xfer::RsyncDigest package
#
# DESCRIPTION
#
# This library defines a BackupPC::Xfer::RsyncDigest class for computing
# and caching rsync checksums.
#
# AUTHOR
# Craig Barratt <cbarratt@users.sourceforge.net>
#
# COPYRIGHT
# Copyright (C) 2001-2003 Craig Barratt
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
#========================================================================
#
# Version 2.1.0, released 20 Jun 2004.
#
# See http://backuppc.sourceforge.net.
#
#========================================================================

package BackupPC::Xfer::RsyncDigest;

use strict;
use BackupPC::FileZIO;

use vars qw( $RsyncLibOK );
use Carp;
require Exporter;
use vars qw( @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS );

#
# Current log handler.  Starts out as logHandler() and can be replaced
# with logHandlerSet().
#
my $Log = \&logHandler;

#
# Magic value for checksum seed.  We only cache block and file digests
# when the checksum seed matches this value.
#
use constant RSYNC_CSUMSEED_CACHE => 32761;

@ISA = qw(Exporter);

@EXPORT = qw( );

@EXPORT_OK = qw(
    RSYNC_CSUMSEED_CACHE
);

%EXPORT_TAGS = (
    'all' => [ @EXPORT_OK ],
);

BEGIN {
    eval "use File::RsyncP;";
    if ( $@ ) {
        #
        # File::RsyncP could not be loaded.  Just record that fact:
        # digestAdd() and digestStart() check $RsyncLibOK and return
        # an error instead of calling into the missing library.
        #
        $RsyncLibOK = 0;
    } else {
        $RsyncLibOK = 1;
    }
};

#
# Return the rsync block size based on the file size.
#
# The block size is 1/10000 of the file size, but never smaller than
# $defaultBlkSize and never larger than 16384.  We also make sure the
# block size plus 4 (ie: the packed checksumSeed) is not a multiple
# of 64 - otherwise the cached checksums will not be the same for
# protocol versions <= 26 and > 26.
#
# A missing or non-positive $defaultBlkSize falls back to 2048 (the
# same fallback digestAdd() uses for a zero block size), so the
# returned block size is never 0.  A missing or negative $fileSize is
# treated as 0.
#
sub blockSize
{
    my($class, $fileSize, $defaultBlkSize) = @_;

    $fileSize = 0
            if ( !defined($fileSize) || $fileSize < 0 );
    $defaultBlkSize = 2048
            if ( !defined($defaultBlkSize) || $defaultBlkSize <= 0 );

    my $blkSize = int($fileSize / 10000);
    $blkSize = $defaultBlkSize if ( $blkSize < $defaultBlkSize );
    $blkSize = 16384           if ( $blkSize > 16384 );
    #
    # Bump by 4 so that (blkSize + 4) is no longer a multiple of 64.
    #
    $blkSize += 4              if ( (($blkSize + 4) % 64) == 0 );
    return $blkSize;
}

#
# Check whether the given file is flagged as having cached rsync
# checksums, based on its first byte.
#
# Returns 1 if the first byte is 0xd6 (the value digestAdd() writes
# once the checksums have been appended), 0 if it is any other value,
# -1 if the file cannot be opened and -2 if the first byte cannot be
# read (eg: the file is empty).
#
sub fileDigestIsCached
{
    my($class, $file) = @_;
    my $data;

    open(my $fh, "<", $file) || return -1;
    binmode($fh);
    my $nRead = sysread($fh, $data, 1);
    close($fh);
    #
    # sysread returns undef on error and 0 at end of file
    #
    return -2 if ( !defined($nRead) || $nRead != 1 );
    return $data eq chr(0xd6) ? 1 : 0;
}

#
# Compute and add rsync block and file digests to the given file.
#
# Empty files don't get cached checksums.
#
# If verify is set then existing cached checksums are checked.
#
# Arguments:
#   $file          path of the file that is updated in place
#   $blockSize     rsync block size; 0 is logged and replaced by 2048
#   $checksumSeed  checksum seed; must equal RSYNC_CSUMSEED_CACHE
#   $verify        if true, compare against the data already appended
#
# The data written after the end of the existing file contents is:
# one byte 0xb3, the block digests (20 bytes per block), the file
# digests returned by digest2, and four 32 bit little-endian words
# (block size, checksum seed, number of blocks, magic number
# 0x5fe3c289).  Finally the first byte of the file is set to 0xd6 to
# flag that the checksums are present.
#
# Returns 0 on success.  Returns 1 on good verify and 2 on bad verify
# (in which case the checksums are rewritten).
# Returns a variety of negative values on error (-100 means the file
# is empty or the checksum seed is not RSYNC_CSUMSEED_CACHE).
#
sub digestAdd
{
    my($class, $file, $blockSize, $checksumSeed, $verify) = @_;
    my $retValue = 0;

    #
    # Don't cache checksums if the checksumSeed is not RSYNC_CSUMSEED_CACHE
    # or if the file is empty.
    #
    return -100 if ( $checksumSeed != RSYNC_CSUMSEED_CACHE || !-s $file );

    if ( $blockSize == 0 ) {
        &$Log("digestAdd: bad blockSize ($file, $blockSize, $checksumSeed)");
        $blockSize = 2048;
    }
    #
    # Number of blocks to read per pass (roughly 1MB worth of data)
    #
    my $nBlks = int(65536 * 16 / $blockSize) + 1;
    my $data;
    my $blockDigest = "";
    my $fileDigest;

    return -101 if ( !$RsyncLibOK );

    my $digest = File::RsyncP::Digest->new;
    $digest->add(pack("V", $checksumSeed)) if ( $checksumSeed );

    return -102 if ( !defined(my $fh = BackupPC::FileZIO->open($file, 0, 1)) );

    while ( 1 ) {
        $fh->read(\$data, $nBlks * $blockSize);
        last if ( !defined($data) || $data eq "" );
        $blockDigest .= $digest->blockDigest($data, $blockSize, 16,
                                             $checksumSeed);
        $digest->add($data);
    }
    $fileDigest = $digest->digest2;
    #
    # Position of the underlying file handle once all the data has
    # been read: this is where the checksum data gets written.
    #
    my $eofPosn = sysseek($fh->{fh}, 0, 1);
    $fh->close;
    #
    # Without a valid position we must not write anything: seeking to
    # an undefined offset would put the checksum data at offset 0.
    #
    return -115 if ( !defined($eofPosn) );

    my $rsyncData = $blockDigest . $fileDigest;
    my $metaData  = pack("VVVV", $blockSize,
                                 $checksumSeed,
                                 length($blockDigest) / 20,
                                 0x5fe3c289,               # magic number
                        );
    my $data2 = chr(0xb3) . $rsyncData . $metaData;
    my $newLen = $eofPosn + length($data2);
#    printf("appending %d+%d bytes to %s at offset %d\n",
#                                            length($rsyncData),
#                                            length($metaData),
#                                            $file,
#                                            $eofPosn);
    open(my $fh2, "+<", $file) || return -103;
    binmode($fh2);
    return -104 if ( sysread($fh2, $data, 1) != 1 );
    if ( $data ne chr(0x78) && $data ne chr(0xd6) ) {
        #
        # $file is passed as an argument (not interpolated into the
        # format) so a '%' in the file name cannot upset sprintf.
        #
        &$Log(sprintf("digestAdd: %s has unexpected first char 0x%x",
                             $file, ord($data)));
        return -105;
    }
    return -106 if ( sysseek($fh2, $eofPosn, 0) != $eofPosn );
    if ( $verify ) {
        my $data3;

        #
        # Verify the cached checksums.  We read one byte more than we
        # expect so that extra trailing data shows up as a mismatch.
        #
        return -107 if ( $data ne chr(0xd6) );
        #
        # sysread returns undef (not a negative value) on error
        #
        return -108
            if ( !defined(sysread($fh2, $data3, length($data2) + 1)) );
        if ( $data2 eq $data3 ) {
            close($fh2);
            return 1;
        }
        #
        # Checksums don't agree - fall through so we rewrite the data
        #
        &$Log("digestAdd: $file verify failed; redoing checksums");
        return -109 if ( sysseek($fh2, $eofPosn, 0) != $eofPosn );
        $retValue = 2;
    }
    return -110 if ( syswrite($fh2, $data2) != length($data2) );
    if ( $verify ) {
        #
        # Make sure there is no extraneous data on the end of
        # the file.  Seek to the end and truncate if it doesn't
        # match our expected length.
        #
        my $curLen = sysseek($fh2, 0, 2);
        return -111 if ( !defined($curLen) );
        if ( $curLen != $newLen ) {
            if ( !truncate($fh2, $newLen) ) {
                &$Log(sprintf("digestAdd: %s truncate from %d to %d failed",
                                $file, $curLen, $newLen));
                return -112;
            } else {
                &$Log(sprintf("digestAdd: %s truncated from %d to %d",
                                $file, $curLen, $newLen));
            }
        }
    }
    #
    # Flag the file as having cached checksums
    #
    return -113 if ( !defined(sysseek($fh2, 0, 0)) );
    return -114 if ( syswrite($fh2, chr(0xd6)) != 1 );
    close($fh2);
    return $retValue;
}

#
# Return rsync checksums for the given file.  We read the cached checksums
# if they exist and the block size and checksum seed match.  Otherwise
# we compute the checksums from the file contents.
#
# The doCache flag can take three ranges:
#
#  - doCache <  0: don't generate/use cached checksums
#  - doCache == 0: don't generate, but do use cached checksums if available
#  - doCache >  0: generate (if necessary) and use cached checksums
#
# Note: caching is only enabled when compression is on and the
# checksum seed is RSYNC_CSUMSEED_CACHE (32761).
#
# On success returns the list (undef, $dg, $blockSize), where $dg is
# the digest object to use with digestGet() and digestEnd(), and
# $blockSize is the block size actually in use.  On error returns a
# single negative value.
#
sub digestStart
{
    my($class, $fileName, $fileSize, $blockSize, $defBlkSize,
       $checksumSeed, $needMD4, $compress, $doCache) = @_;

    return -1 if ( !$RsyncLibOK );

    my $data;

    my $dg = bless {
        name    => $fileName,
        needMD4 => $needMD4,
        digest  => File::RsyncP::Digest->new,
    }, $class;

    if ( $fileSize > 0 && $compress && $doCache >= 0 ) {
        open(my $fh, "<", $fileName) || return -2;
        binmode($fh);
        return -3 if ( read($fh, $data, 1) != 1 );
        #
        # Result of digestAdd; stays 0 when digestAdd isn't called
        #
        my $ret = 0;

        if ( $data eq chr(0x78) && $doCache > 0
                    && $checksumSeed == RSYNC_CSUMSEED_CACHE ) {
            #
            # RSYNC_CSUMSEED_CACHE (32761) is the magic number that
            # rsync uses for checksumSeed with the --fixed-csum option.
            #
            # We now add the cached checksum data to the file.  There
            # is a possible race condition here since two BackupPC_dump
            # processes might call this function at the same time
            # on the same file.  But this should be ok since both
            # processes will write the same data, and the order
            # in which they write it doesn't matter.
            #
            close($fh);
            $ret = $dg->digestAdd($fileName,
                            $blockSize
                                || BackupPC::Xfer::RsyncDigest->blockSize(
                                                    $fileSize, $defBlkSize),
                            $checksumSeed);
            if ( $ret < 0 ) {
                &$Log("digestAdd($fileName) failed ($ret)");
            }
            #
            # now re-open the file and re-read the first byte
            #
            open($fh, "<", $fileName) || return -4;
            binmode($fh);
            return -5 if ( read($fh, $data, 1) != 1 );
        }
        if ( $ret >= 0 && $data eq chr(0xd6) ) {
            #
            # Looks like this file has cached checksums
            # Read the last 48 bytes: that's 2 file MD4s (32 bytes)
            # plus 4 words of meta data
            #
            # Note: seek returns 1 on success and a (defined) false
            # value on failure, so we test for truth, not definedness.
            #
            return -6 if ( !seek($fh, -48, 2) );
            return -7 if ( read($fh, $data, 48) != 48 );
            ($dg->{md4DigestOld},
             $dg->{md4Digest},
             $dg->{blockSize},
             $dg->{checksumSeed},
             $dg->{nBlocks},
             $dg->{magic}) = unpack("a16 a16 V V V V", $data);
            if ( $dg->{magic} == 0x5fe3c289
                    && $dg->{checksumSeed} == $checksumSeed
                    && ($blockSize == 0 || $dg->{blockSize} == $blockSize) ) {
                #
                # position the file at the start of the rsync block checksums
                # (4 (adler) + 16 (md4) bytes each)
                #
                return -8
                    if ( !seek($fh, -$dg->{nBlocks} * 20 - 48, 2) );
                $dg->{fh}     = $fh;
                $dg->{cached} = 1;
            } else {
                #
                # cached checksums are not valid, so we close the
                # file and treat it as uncached.
                #
                $dg->{cachedInvalid} = 1;
                close($fh);
            }
        }
    }
    if ( !$dg->{cached} ) {
        #
        # This file doesn't have cached checksums, or the checksumSeed
        # or blocksize doesn't match.  Open the file and prepare to
        # compute the checksums.
        #
        $blockSize
            = BackupPC::Xfer::RsyncDigest->blockSize($fileSize, $defBlkSize)
                                    if ( $blockSize == 0 );
        $dg->{checksumSeed} = $checksumSeed;
        $dg->{blockSize}    = $blockSize;
        $dg->{fh} = BackupPC::FileZIO->open($fileName, 0, $compress);
        return -9 if ( !defined($dg->{fh}) );
        if ( $needMD4 ) {
            #
            # The file MD4 digest starts with the packed checksum seed
            #
            $dg->{csumDigest} = File::RsyncP::Digest->new;
            $dg->{csumDigest}->add(pack("V", $dg->{checksumSeed}));
        }
    }
    return (undef, $dg, $dg->{blockSize});
}

#
# Return the digests for the next $num blocks of the file, using a
# digest object returned by digestStart().
#
#   $num      number of blocks wanted
#   $csumLen  passed through to blockDigestExtract (cached case) or
#             blockDigest (uncached case)
#   $noPad    if true, a shortfall of data is not padded with zeros
#
# In the cached case the stored block digests (20 bytes per block) are
# read from the file; if fewer than 20 * $num bytes are available the
# remainder is padded with zeros unless $noPad is set.  In the uncached
# case $num blocks of file data are read and the digests are computed;
# the data is also added to the file MD4 digest if needMD4 is set.
#
sub digestGet
{
    my($dg, $num, $csumLen, $noPad) = @_;
    my $fileData  = "";
    my $blockSize = $dg->{blockSize};

    if ( $dg->{cached} ) {
        my $thisNum = $num;
        $thisNum = $dg->{nBlocks} if ( $thisNum > $dg->{nBlocks} );
        read($dg->{fh}, $fileData, 20 * $thisNum);
        $fileData = "" if ( !defined($fileData) );
        $dg->{nBlocks} -= $thisNum;
        #
        # unexpected shortfall of data (either fewer blocks are left
        # than requested, or the read came up short); pad with zero
        # digest
        #
        my $shortfall = 20 * $num - length($fileData);
        if ( $shortfall > 0 && !$noPad ) {
            $fileData .= "\0" x $shortfall;
        }
        return $dg->{digest}->blockDigestExtract($fileData, $csumLen);
    } else {
        my $nRead = $dg->{fh}->read(\$fileData, $blockSize * $num);
        if ( !defined($nRead) || $nRead <= 0 ) {
            #
            # unexpected shortfall of data; pad with zeros
            #
            $fileData = "\0" x ($blockSize * $num) if ( !$noPad );
        }
        #
        # never hand an undefined buffer to the digest code
        #
        $fileData = "" if ( !defined($fileData) );
        $dg->{csumDigest}->add($fileData) if ( $dg->{needMD4} );
        return $dg->{digest}->blockDigest($fileData, $blockSize,
                                         $csumLen, $dg->{checksumSeed});
    }
}

#
# Finish with a digest object returned by digestStart(): close the
# file and, if needMD4 is set, return the file MD4 digest.
#
# In the cached case the stored digest (md4DigestOld) is returned.
# In the uncached case the rest of the file is read into the file
# digest first, unless $skipMD4 is set, and the computed digest is
# returned.
#
# Returns nothing if needMD4 is not set.
#
sub digestEnd
{
    my($dg, $skipMD4) = @_;

    if ( $dg->{cached} ) {
        close($dg->{fh});
        return $dg->{md4DigestOld} if ( $dg->{needMD4} );
        return;
    }

    #
    # make sure we read the entire file for the file MD4 digest
    #
    if ( $dg->{needMD4} && !$skipMD4 ) {
        my $fileData;
        while ( $dg->{fh}->read(\$fileData, 65536) > 0 ) {
            $dg->{csumDigest}->add($fileData);
        }
    }
    $dg->{fh}->close();
    return $dg->{csumDigest}->digest if ( $dg->{needMD4} );
    return;
}

#
# Report whether this digest object is using cached checksums.
#
# In scalar context returns the cached flag.  In list context returns
# (cached, cachedInvalid), where cachedInvalid is set by digestStart()
# when the file has cached checksums that could not be used.
#
sub isCached
{
    my($dg) = @_;

    return wantarray ? ($dg->{cached}, $dg->{cachedInvalid}) : $dg->{cached};
}

#
# Return the block size this digest object is using.
#
sub blockSizeCurr
{
    my($dg) = @_;

    return $dg->{blockSize};
}

#
# Default log handler: print the message, followed by a newline,
# to STDERR.
#
sub logHandler
{
    my($str) = @_;

    print(STDERR $str, "\n");
}

#
# Set log handler to a new subroutine.
#
# Can be called as a plain function, logHandlerSet($sub), or as a
# method, BackupPC::Xfer::RsyncDigest->logHandlerSet($sub): in both
# cases the handler is the last argument.  An argument that is not a
# code reference is rejected with a warning and the current handler
# is kept, so that $Log always remains callable.
#
# Returns the handler now in effect.
#
sub logHandlerSet
{
    my $sub = $_[-1];

    require Scalar::Util;
    if ( (Scalar::Util::reftype($sub) || "") eq "CODE" ) {
        $Log = $sub;
    } else {
        carp("logHandlerSet: handler is not a code reference; ignored");
    }
    return $Log;
}

1;