/[BackupPC]/upstream/2.1.0/lib/BackupPC/PoolWrite.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /upstream/2.1.0/lib/BackupPC/PoolWrite.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1 - (show annotations)
Wed Jun 22 19:12:04 2005 UTC (18 years, 10 months ago) by dpavlin
File size: 16937 byte(s)
import of version 2.1.0

1 #============================================================= -*-perl-*-
2 #
3 # BackupPC::PoolWrite package
4 #
5 # DESCRIPTION
6 #
7 # This library defines a BackupPC::PoolWrite class for writing
8 # files to disk that are candidates for pooling. One instance
9 # of this class is used to write each file. The following steps
10 # are executed:
11 #
12 # - As the incoming data arrives, the first 1MB is buffered
13 # in memory so the MD5 digest can be computed.
14 #
15 # - A running comparison against all the candidate pool files
16 # (ie: those with the same MD5 digest, usually at most a single
17 # file) is done as new incoming data arrives. Up to $MaxFiles
18 # simultaneous files can be compared in parallel. This
19 # involves reading and uncompressing one or more pool files.
20 #
21 # - When a pool file no longer matches it is discarded from
22 # the search. If there are more than $MaxFiles candidates, one of
23 # the new candidates is added to the search, first checking
24 # that it matches up to the current point (this requires
25 # re-reading one of the other pool files).
26 #
27 # - When or if no pool files match then the new file is written
28 # to disk. This could occur many MB into the file. We don't
29 # need to buffer all this data in memory since we can copy it
30 # from the last matching pool file, up to the point where it
31 # fully matched.
32 #
33 # - When all the new data is complete, if a pool file exactly
34 # matches then the file is simply created as a hardlink to
35 # the pool file.
36 #
37 # AUTHOR
38 # Craig Barratt <cbarratt@users.sourceforge.net>
39 #
40 # COPYRIGHT
41 # Copyright (C) 2001-2003 Craig Barratt
42 #
43 # This program is free software; you can redistribute it and/or modify
44 # it under the terms of the GNU General Public License as published by
45 # the Free Software Foundation; either version 2 of the License, or
46 # (at your option) any later version.
47 #
48 # This program is distributed in the hope that it will be useful,
49 # but WITHOUT ANY WARRANTY; without even the implied warranty of
50 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
51 # GNU General Public License for more details.
52 #
53 # You should have received a copy of the GNU General Public License
54 # along with this program; if not, write to the Free Software
55 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
56 #
57 #========================================================================
58 #
59 # Version 2.1.0, released 20 Jun 2004.
60 #
61 # See http://backuppc.sourceforge.net.
62 #
63 #========================================================================
64
65 package BackupPC::PoolWrite;
66
67 use strict;
68
69 use File::Path;
70 use Digest::MD5;
71 use BackupPC::FileZIO;
72
73 sub new
74 {
75 my($class, $bpc, $fileName, $fileSize, $compress) = @_;
76
77 my $self = bless {
78 fileName => $fileName,
79 fileSize => $fileSize,
80 bpc => $bpc,
81 compress => $compress,
82 nWrite => 0,
83 digest => undef,
84 files => [],
85 fileCnt => -1,
86 fhOut => undef,
87 errors => [],
88 data => "",
89 eof => undef,
90 }, $class;
91
92 $self->{hardLinkMax} = $bpc->ConfValue("HardLinkMax");
93
94 #
95 # Always unlink any current file in case it is already linked
96 #
97 unlink($fileName) if ( -f $fileName );
98 return $self;
99 }
100
101 my $BufSize = 1048576; # 1MB or 2^20
102 my $MaxFiles = 20; # max number of compare files open at one time
103
104 sub write
105 {
106 my($a, $dataRef) = @_;
107
108 return if ( $a->{eof} );
109 $a->{data} .= $$dataRef if ( defined($dataRef) );
110 return if ( length($a->{data}) < $BufSize && defined($dataRef) );
111
112 #
113 # Correct the fileSize if it is wrong (rsync might transfer
114 # a file whose length is different to the length sent with the
115 # file list if the file changes between the file list sending
116 # and the file sending). Here we only catch the case where
117 # we haven't computed the digest (ie: we have written no more
118 # than $BufSize). We catch the big file case below.
119 #
120 if ( !defined($dataRef) && !defined($a->{digest})
121 && $a->{fileSize} != length($a->{data}) ) {
122 #my $newSize = length($a->{data});
123 #print("Fixing file size from $a->{fileSize} to $newSize\n");
124 $a->{fileSize} = length($a->{data});
125 }
126
127 if ( !defined($a->{digest}) && length($a->{data}) > 0 ) {
128 #
129 # build a list of all the candidate matching files
130 #
131 my $md5 = Digest::MD5->new;
132 $a->{fileSize} = length($a->{data})
133 if ( $a->{fileSize} < length($a->{data}) );
134 $a->{digest} = $a->{bpc}->Buffer2MD5($md5, $a->{fileSize}, \$a->{data});
135 if ( !defined($a->{base} = $a->{bpc}->MD52Path($a->{digest},
136 $a->{compress})) ) {
137 push(@{$a->{errors}}, "Unable to get path from '$a->{digest}'"
138 . " for $a->{fileName}\n");
139 } else {
140 while ( @{$a->{files}} < $MaxFiles ) {
141 my $fh;
142 my $fileName = $a->{fileCnt} < 0 ? $a->{base}
143 : "$a->{base}_$a->{fileCnt}";
144 last if ( !-f $fileName );
145 if ( (stat(_))[3] >= $a->{hardLinkMax}
146 || !defined($fh = BackupPC::FileZIO->open($fileName, 0,
147 $a->{compress})) ) {
148 $a->{fileCnt}++;
149 next;
150 }
151 push(@{$a->{files}}, {
152 name => $fileName,
153 fh => $fh,
154 });
155 $a->{fileCnt}++;
156 }
157 }
158 #
159 # if there are no candidate files then we must write
160 # the new file to disk
161 #
162 if ( !@{$a->{files}} ) {
163 $a->{fhOut} = BackupPC::FileZIO->open($a->{fileName},
164 1, $a->{compress});
165 if ( !defined($a->{fhOut}) ) {
166 push(@{$a->{errors}}, "Unable to open $a->{fileName}"
167 . " for writing\n");
168 }
169 }
170 }
171 my $dataLen = length($a->{data});
172 if ( !defined($a->{fhOut}) && length($a->{data}) > 0 ) {
173 #
174 # See if the new chunk of data continues to match the
175 # candidate files.
176 #
177 for ( my $i = 0 ; $i < @{$a->{files}} ; $i++ ) {
178 my($d, $match);
179 my $fileName = $a->{fileCnt} < 0 ? $a->{base}
180 : "$a->{base}_$a->{fileCnt}";
181 if ( $dataLen > 0 ) {
182 # verify next $dataLen bytes from candidate file
183 my $n = $a->{files}[$i]->{fh}->read(\$d, $dataLen);
184 next if ( $n == $dataLen && $d eq $a->{data} );
185 } else {
186 # verify candidate file is at EOF
187 my $n = $a->{files}[$i]->{fh}->read(\$d, 100);
188 next if ( $n == 0 );
189 }
190 #print(" File $a->{files}[$i]->{name} doesn't match\n");
191 #
192 # this candidate file didn't match. Replace it
193 # with a new candidate file. We have to qualify
194 # any new candidate file by making sure that its
195 # first $a->{nWrite} bytes match, plus the next $dataLen
196 # bytes match $a->{data}.
197 #
198 while ( -f $fileName ) {
199 my $fh;
200 if ( (stat(_))[3] >= $a->{hardLinkMax}
201 || !defined($fh = BackupPC::FileZIO->open($fileName, 0,
202 $a->{compress})) ) {
203 $a->{fileCnt}++;
204 #print(" Discarding $fileName (open failed)\n");
205 $fileName = "$a->{base}_$a->{fileCnt}";
206 next;
207 }
208 if ( !$a->{files}[$i]->{fh}->rewind() ) {
209 push(@{$a->{errors}},
210 "Unable to rewind $a->{files}[$i]->{name}"
211 . " for compare\n");
212 }
213 $match = $a->filePartialCompare($a->{files}[$i]->{fh}, $fh,
214 $a->{nWrite}, $dataLen, \$a->{data});
215 if ( $match ) {
216 $a->{files}[$i]->{fh}->close();
217 $a->{files}[$i]->{fh} = $fh,
218 $a->{files}[$i]->{name} = $fileName;
219 #print(" Found new candidate $fileName\n");
220 $a->{fileCnt}++;
221 last;
222 } else {
223 #print(" Discarding $fileName (no match)\n");
224 }
225 $fh->close();
226 $a->{fileCnt}++;
227 $fileName = "$a->{base}_$a->{fileCnt}";
228 }
229 if ( !$match ) {
230 #
231 # We couldn't find another candidate file
232 #
233 if ( @{$a->{files}} == 1 ) {
234 #print(" Exhausted matches, now writing\n");
235 $a->{fhOut} = BackupPC::FileZIO->open($a->{fileName},
236 1, $a->{compress});
237 if ( !defined($a->{fhOut}) ) {
238 push(@{$a->{errors}},
239 "Unable to open $a->{fileName}"
240 . " for writing\n");
241 } else {
242 if ( !$a->{files}[$i]->{fh}->rewind() ) {
243 push(@{$a->{errors}},
244 "Unable to rewind"
245 . " $a->{files}[$i]->{name} for copy\n");
246 }
247 $a->filePartialCopy($a->{files}[$i]->{fh}, $a->{fhOut},
248 $a->{nWrite});
249 }
250 }
251 $a->{files}[$i]->{fh}->close();
252 splice(@{$a->{files}}, $i, 1);
253 $i--;
254 }
255 }
256 }
257 if ( defined($a->{fhOut}) && $dataLen > 0 ) {
258 #
259 # if we are in writing mode then just write the data
260 #
261 my $n = $a->{fhOut}->write(\$a->{data});
262 if ( $n != $dataLen ) {
263 push(@{$a->{errors}}, "Unable to write $dataLen bytes to"
264 . " $a->{fileName} (got $n)\n");
265 }
266 }
267 $a->{nWrite} += $dataLen;
268 $a->{data} = "";
269 return if ( defined($dataRef) );
270
271 #
272 # We are at EOF, so finish up
273 #
274 $a->{eof} = 1;
275
276 #
277 # Make sure the fileSize was correct. See above for comments about
278 # rsync.
279 #
280 if ( $a->{nWrite} != $a->{fileSize} ) {
281 #
282 # Oops, fileSize was wrong, so our MD5 digest was wrong and our
283 # effort to match files likely failed. This is ugly, but our
284 # only choice at this point is to re-write the entire file with
285 # the correct length. We need to rename the file, open it for
286 # reading, and then re-write the file with the correct length.
287 #
288
289 #print("Doing big file fixup ($a->{fileSize} != $a->{nWrite})\n");
290
291 my($fh, $fileName);
292 $a->{fileSize} = $a->{nWrite};
293
294 if ( defined($a->{fhOut}) ) {
295 if ( $a->{fileName} =~ /(.*)\// ) {
296 $fileName = $1;
297 } else {
298 $fileName = ".";
299 }
300 #
301 # Find a unique target temporary file name
302 #
303 my $i = 0;
304 while ( -f "$fileName/t$$.$i" ) {
305 $i++;
306 }
307 $fileName = "$fileName/t$$.$i";
308 $a->{fhOut}->close();
309 if ( !rename($a->{fileName}, $fileName)
310 || !defined($fh = BackupPC::FileZIO->open($fileName, 0,
311 $a->{compress})) ) {
312 push(@{$a->{errors}}, "Can't rename $a->{fileName} -> $fileName"
313 . " or open during size fixup\n");
314 }
315 #print("Using temporary name $fileName\n");
316 } elsif ( defined($a->{files}) && defined($a->{files}[0]) ) {
317 #
318 # We haven't written anything yet, so just use the
319 # compare file to copy from.
320 #
321 $fh = $a->{files}[0]->{fh};
322 $fh->rewind;
323 #print("Using compare file $a->{files}[0]->{name}\n");
324 }
325 if ( defined($fh) ) {
326 my $poolWrite = BackupPC::PoolWrite->new($a->{bpc}, $a->{fileName},
327 $a->{fileSize}, $a->{compress});
328 my $nRead = 0;
329
330 while ( $nRead < $a->{fileSize} ) {
331 my $thisRead = $a->{fileSize} - $nRead < $BufSize
332 ? $a->{fileSize} - $nRead : $BufSize;
333 my $data;
334 my $n = $fh->read(\$data, $thisRead);
335 if ( $n != $thisRead ) {
336 push(@{$a->{errors}},
337 "Unable to read $thisRead bytes during resize"
338 . " from temp $fileName (got $n)\n");
339 last;
340 }
341 $poolWrite->write(\$data);
342 $nRead += $thisRead;
343 }
344 $fh->close;
345 unlink($fileName) if ( defined($fileName) );
346 if ( @{$a->{errors}} ) {
347 $poolWrite->close;
348 return (0, $a->{digest}, -s $a->{fileName}, $a->{errors});
349 } else {
350 return $poolWrite->close;
351 }
352 }
353 }
354
355 #
356 # Close the compare files
357 #
358 foreach my $f ( @{$a->{files}} ) {
359 $f->{fh}->close();
360 }
361
362 if ( $a->{fileSize} == 0 ) {
363 #
364 # Simply create an empty file
365 #
366 local(*OUT);
367 if ( !open(OUT, ">", $a->{fileName}) ) {
368 push(@{$a->{errors}}, "Can't open $a->{fileName} for empty"
369 . " output\n");
370 } else {
371 close(OUT);
372 }
373 return (1, $a->{digest}, -s $a->{fileName}, $a->{errors});
374 } elsif ( defined($a->{fhOut}) ) {
375 $a->{fhOut}->close();
376 return (0, $a->{digest}, -s $a->{fileName}, $a->{errors});
377 } else {
378 if ( @{$a->{files}} == 0 ) {
379 push(@{$a->{errors}}, "Botch, no matches on $a->{fileName}"
380 . " ($a->{digest})\n");
381 } elsif ( @{$a->{files}} > 1 ) {
382 #
383 # This is no longer a real error because $Conf{HardLinkMax}
384 # could be hit, thereby creating identical pool files
385 #
386 #my $str = "Unexpected multiple matches on"
387 # . " $a->{fileName} ($a->{digest})\n";
388 #for ( my $i = 0 ; $i < @{$a->{files}} ; $i++ ) {
389 # $str .= " -> $a->{files}[$i]->{name}\n";
390 #}
391 #push(@{$a->{errors}}, $str);
392 }
393 #print(" Linking $a->{fileName} to $a->{files}[0]->{name}\n");
394 if ( @{$a->{files}} && !link($a->{files}[0]->{name}, $a->{fileName}) ) {
395 push(@{$a->{errors}}, "Can't link $a->{fileName} to"
396 . " $a->{files}[0]->{name}\n");
397 }
398 return (1, $a->{digest}, -s $a->{fileName}, $a->{errors});
399 }
400 }
401
402 #
403 # Finish writing: pass undef dataRef to write so it can do all
404 # the work. Returns a 4 element array:
405 #
406 # (existingFlag, digestString, outputFileLength, errorList)
407 #
408 sub close
409 {
410 my($a) = @_;
411
412 return $a->write(undef);
413 }
414
415 #
416 # Abort a pool write
417 #
418 sub abort
419 {
420 my($a) = @_;
421
422 if ( defined($a->{fhOut}) ) {
423 $a->{fhOut}->close();
424 unlink($a->{fileName});
425 }
426 foreach my $f ( @{$a->{files}} ) {
427 $f->{fh}->close();
428 }
429 $a->{files} = [];
430 }
431
432 #
433 # Copy $nBytes from files $fhIn to $fhOut.
434 #
435 sub filePartialCopy
436 {
437 my($a, $fhIn, $fhOut, $nBytes) = @_;
438 my($nRead);
439
440 while ( $nRead < $nBytes ) {
441 my $thisRead = $nBytes - $nRead < $BufSize
442 ? $nBytes - $nRead : $BufSize;
443 my $data;
444 my $n = $fhIn->read(\$data, $thisRead);
445 if ( $n != $thisRead ) {
446 push(@{$a->{errors}},
447 "Unable to read $thisRead bytes from "
448 . $fhIn->name . " (got $n)\n");
449 return;
450 }
451 $n = $fhOut->write(\$data, $thisRead);
452 if ( $n != $thisRead ) {
453 push(@{$a->{errors}},
454 "Unable to write $thisRead bytes to "
455 . $fhOut->name . " (got $n)\n");
456 return;
457 }
458 $nRead += $thisRead;
459 }
460 }
461
462 #
463 # Compare $nBytes from files $fh0 and $fh1, and also compare additional
464 # $extra bytes from $fh1 to $$extraData.
465 #
466 sub filePartialCompare
467 {
468 my($a, $fh0, $fh1, $nBytes, $extra, $extraData) = @_;
469 my($nRead, $n);
470 my($data0, $data1);
471
472 while ( $nRead < $nBytes ) {
473 my $thisRead = $nBytes - $nRead < $BufSize
474 ? $nBytes - $nRead : $BufSize;
475 $n = $fh0->read(\$data0, $thisRead);
476 if ( $n != $thisRead ) {
477 push(@{$a->{errors}}, "Unable to read $thisRead bytes from "
478 . $fh0->name . " (got $n)\n");
479 return;
480 }
481 $n = $fh1->read(\$data1, $thisRead);
482 return 0 if ( $n < $thisRead || $data0 ne $data1 );
483 $nRead += $thisRead;
484 }
485 if ( $extra > 0 ) {
486 # verify additional bytes
487 $n = $fh1->read(\$data1, $extra);
488 return 0 if ( $n != $extra || $data1 ne $$extraData );
489 } else {
490 # verify EOF
491 $n = $fh1->read(\$data1, 100);
492 return 0 if ( $n != 0 );
493 }
494 return 1;
495 }
496
497 1;

  ViewVC Help
Powered by ViewVC 1.1.26