/[BackupPC]/trunk/lib/BackupPC/PoolWrite.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/BackupPC/PoolWrite.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 316 - (hide annotations)
Mon Jan 30 13:37:17 2006 UTC (18 years, 4 months ago) by dpavlin
File size: 16936 byte(s)
 r9152@llin:  dpavlin | 2006-01-30 14:11:45 +0100
 update to upstream 2.1.2

1 dpavlin 1 #============================================================= -*-perl-*-
2     #
3     # BackupPC::PoolWrite package
4     #
5     # DESCRIPTION
6     #
7     # This library defines a BackupPC::PoolWrite class for writing
8     # files to disk that are candidates for pooling. One instance
9     # of this class is used to write each file. The following steps
10     # are executed:
11     #
12     # - As the incoming data arrives, the first 1MB is buffered
13     # in memory so the MD5 digest can be computed.
14     #
15     # - A running comparison against all the candidate pool files
16     # (ie: those with the same MD5 digest, usually at most a single
17     # file) is done as new incoming data arrives. Up to $MaxFiles
18     # simultaneous files can be compared in parallel. This
19     # involves reading and uncompressing one or more pool files.
20     #
21     # - When a pool file no longer matches it is discarded from
22     # the search. If there are more than $MaxFiles candidates, one of
23     # the new candidates is added to the search, first checking
24     # that it matches up to the current point (this requires
25     # re-reading one of the other pool files).
26     #
27     # - When or if no pool files match then the new file is written
28     # to disk. This could occur many MB into the file. We don't
29     # need to buffer all this data in memory since we can copy it
30     # from the last matching pool file, up to the point where it
31     # fully matched.
32     #
33     # - When all the new data is complete, if a pool file exactly
34     # matches then the file is simply created as a hardlink to
35     # the pool file.
36     #
37     # AUTHOR
38     # Craig Barratt <cbarratt@users.sourceforge.net>
39     #
40     # COPYRIGHT
41     # Copyright (C) 2001-2003 Craig Barratt
42     #
43     # This program is free software; you can redistribute it and/or modify
44     # it under the terms of the GNU General Public License as published by
45     # the Free Software Foundation; either version 2 of the License, or
46     # (at your option) any later version.
47     #
48     # This program is distributed in the hope that it will be useful,
49     # but WITHOUT ANY WARRANTY; without even the implied warranty of
50     # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
51     # GNU General Public License for more details.
52     #
53     # You should have received a copy of the GNU General Public License
54     # along with this program; if not, write to the Free Software
55     # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
56     #
57     #========================================================================
58     #
59 dpavlin 316 # Version 2.1.2, released 5 Sep 2005.
60 dpavlin 1 #
61     # See http://backuppc.sourceforge.net.
62     #
63     #========================================================================
64    
65     package BackupPC::PoolWrite;
66    
67     use strict;
68    
69     use File::Path;
70     use Digest::MD5;
71     use BackupPC::FileZIO;
72    
73     sub new
74     {
75     my($class, $bpc, $fileName, $fileSize, $compress) = @_;
76    
77     my $self = bless {
78     fileName => $fileName,
79     fileSize => $fileSize,
80     bpc => $bpc,
81     compress => $compress,
82     nWrite => 0,
83     digest => undef,
84     files => [],
85     fileCnt => -1,
86     fhOut => undef,
87     errors => [],
88     data => "",
89     eof => undef,
90     }, $class;
91    
92     $self->{hardLinkMax} = $bpc->ConfValue("HardLinkMax");
93    
94     #
95     # Always unlink any current file in case it is already linked
96     #
97     unlink($fileName) if ( -f $fileName );
98     return $self;
99     }
100    
101     my $BufSize = 1048576; # 1MB or 2^20
102     my $MaxFiles = 20; # max number of compare files open at one time
103    
104     sub write
105     {
106     my($a, $dataRef) = @_;
107    
108     return if ( $a->{eof} );
109     $a->{data} .= $$dataRef if ( defined($dataRef) );
110     return if ( length($a->{data}) < $BufSize && defined($dataRef) );
111    
112     #
113     # Correct the fileSize if it is wrong (rsync might transfer
114     # a file whose length is different to the length sent with the
115     # file list if the file changes between the file list sending
116     # and the file sending). Here we only catch the case where
117     # we haven't computed the digest (ie: we have written no more
118     # than $BufSize). We catch the big file case below.
119     #
120     if ( !defined($dataRef) && !defined($a->{digest})
121     && $a->{fileSize} != length($a->{data}) ) {
122     #my $newSize = length($a->{data});
123     #print("Fixing file size from $a->{fileSize} to $newSize\n");
124     $a->{fileSize} = length($a->{data});
125     }
126    
127     if ( !defined($a->{digest}) && length($a->{data}) > 0 ) {
128     #
129     # build a list of all the candidate matching files
130     #
131     my $md5 = Digest::MD5->new;
132     $a->{fileSize} = length($a->{data})
133     if ( $a->{fileSize} < length($a->{data}) );
134     $a->{digest} = $a->{bpc}->Buffer2MD5($md5, $a->{fileSize}, \$a->{data});
135     if ( !defined($a->{base} = $a->{bpc}->MD52Path($a->{digest},
136     $a->{compress})) ) {
137     push(@{$a->{errors}}, "Unable to get path from '$a->{digest}'"
138     . " for $a->{fileName}\n");
139     } else {
140     while ( @{$a->{files}} < $MaxFiles ) {
141     my $fh;
142     my $fileName = $a->{fileCnt} < 0 ? $a->{base}
143     : "$a->{base}_$a->{fileCnt}";
144     last if ( !-f $fileName );
145     if ( (stat(_))[3] >= $a->{hardLinkMax}
146     || !defined($fh = BackupPC::FileZIO->open($fileName, 0,
147     $a->{compress})) ) {
148     $a->{fileCnt}++;
149     next;
150     }
151     push(@{$a->{files}}, {
152     name => $fileName,
153     fh => $fh,
154     });
155     $a->{fileCnt}++;
156     }
157     }
158     #
159     # if there are no candidate files then we must write
160     # the new file to disk
161     #
162     if ( !@{$a->{files}} ) {
163     $a->{fhOut} = BackupPC::FileZIO->open($a->{fileName},
164     1, $a->{compress});
165     if ( !defined($a->{fhOut}) ) {
166     push(@{$a->{errors}}, "Unable to open $a->{fileName}"
167     . " for writing\n");
168     }
169     }
170     }
171     my $dataLen = length($a->{data});
172     if ( !defined($a->{fhOut}) && length($a->{data}) > 0 ) {
173     #
174     # See if the new chunk of data continues to match the
175     # candidate files.
176     #
177     for ( my $i = 0 ; $i < @{$a->{files}} ; $i++ ) {
178     my($d, $match);
179     my $fileName = $a->{fileCnt} < 0 ? $a->{base}
180     : "$a->{base}_$a->{fileCnt}";
181     if ( $dataLen > 0 ) {
182     # verify next $dataLen bytes from candidate file
183     my $n = $a->{files}[$i]->{fh}->read(\$d, $dataLen);
184     next if ( $n == $dataLen && $d eq $a->{data} );
185     } else {
186     # verify candidate file is at EOF
187     my $n = $a->{files}[$i]->{fh}->read(\$d, 100);
188     next if ( $n == 0 );
189     }
190     #print(" File $a->{files}[$i]->{name} doesn't match\n");
191     #
192     # this candidate file didn't match. Replace it
193     # with a new candidate file. We have to qualify
194     # any new candidate file by making sure that its
195     # first $a->{nWrite} bytes match, plus the next $dataLen
196     # bytes match $a->{data}.
197     #
198     while ( -f $fileName ) {
199     my $fh;
200     if ( (stat(_))[3] >= $a->{hardLinkMax}
201     || !defined($fh = BackupPC::FileZIO->open($fileName, 0,
202     $a->{compress})) ) {
203     $a->{fileCnt}++;
204     #print(" Discarding $fileName (open failed)\n");
205     $fileName = "$a->{base}_$a->{fileCnt}";
206     next;
207     }
208     if ( !$a->{files}[$i]->{fh}->rewind() ) {
209     push(@{$a->{errors}},
210     "Unable to rewind $a->{files}[$i]->{name}"
211     . " for compare\n");
212     }
213     $match = $a->filePartialCompare($a->{files}[$i]->{fh}, $fh,
214     $a->{nWrite}, $dataLen, \$a->{data});
215     if ( $match ) {
216     $a->{files}[$i]->{fh}->close();
217     $a->{files}[$i]->{fh} = $fh,
218     $a->{files}[$i]->{name} = $fileName;
219     #print(" Found new candidate $fileName\n");
220     $a->{fileCnt}++;
221     last;
222     } else {
223     #print(" Discarding $fileName (no match)\n");
224     }
225     $fh->close();
226     $a->{fileCnt}++;
227     $fileName = "$a->{base}_$a->{fileCnt}";
228     }
229     if ( !$match ) {
230     #
231     # We couldn't find another candidate file
232     #
233     if ( @{$a->{files}} == 1 ) {
234     #print(" Exhausted matches, now writing\n");
235     $a->{fhOut} = BackupPC::FileZIO->open($a->{fileName},
236     1, $a->{compress});
237     if ( !defined($a->{fhOut}) ) {
238     push(@{$a->{errors}},
239     "Unable to open $a->{fileName}"
240     . " for writing\n");
241     } else {
242     if ( !$a->{files}[$i]->{fh}->rewind() ) {
243     push(@{$a->{errors}},
244     "Unable to rewind"
245     . " $a->{files}[$i]->{name} for copy\n");
246     }
247     $a->filePartialCopy($a->{files}[$i]->{fh}, $a->{fhOut},
248     $a->{nWrite});
249     }
250     }
251     $a->{files}[$i]->{fh}->close();
252     splice(@{$a->{files}}, $i, 1);
253     $i--;
254     }
255     }
256     }
257     if ( defined($a->{fhOut}) && $dataLen > 0 ) {
258     #
259     # if we are in writing mode then just write the data
260     #
261     my $n = $a->{fhOut}->write(\$a->{data});
262     if ( $n != $dataLen ) {
263     push(@{$a->{errors}}, "Unable to write $dataLen bytes to"
264     . " $a->{fileName} (got $n)\n");
265     }
266     }
267     $a->{nWrite} += $dataLen;
268     $a->{data} = "";
269     return if ( defined($dataRef) );
270    
271     #
272     # We are at EOF, so finish up
273     #
274     $a->{eof} = 1;
275    
276     #
277     # Make sure the fileSize was correct. See above for comments about
278     # rsync.
279     #
280     if ( $a->{nWrite} != $a->{fileSize} ) {
281     #
282     # Oops, fileSize was wrong, so our MD5 digest was wrong and our
283     # effort to match files likely failed. This is ugly, but our
284     # only choice at this point is to re-write the entire file with
285     # the correct length. We need to rename the file, open it for
286     # reading, and then re-write the file with the correct length.
287     #
288    
289     #print("Doing big file fixup ($a->{fileSize} != $a->{nWrite})\n");
290    
291     my($fh, $fileName);
292     $a->{fileSize} = $a->{nWrite};
293    
294     if ( defined($a->{fhOut}) ) {
295     if ( $a->{fileName} =~ /(.*)\// ) {
296     $fileName = $1;
297     } else {
298     $fileName = ".";
299     }
300     #
301     # Find a unique target temporary file name
302     #
303     my $i = 0;
304     while ( -f "$fileName/t$$.$i" ) {
305     $i++;
306     }
307     $fileName = "$fileName/t$$.$i";
308     $a->{fhOut}->close();
309     if ( !rename($a->{fileName}, $fileName)
310     || !defined($fh = BackupPC::FileZIO->open($fileName, 0,
311     $a->{compress})) ) {
312     push(@{$a->{errors}}, "Can't rename $a->{fileName} -> $fileName"
313     . " or open during size fixup\n");
314     }
315     #print("Using temporary name $fileName\n");
316     } elsif ( defined($a->{files}) && defined($a->{files}[0]) ) {
317     #
318     # We haven't written anything yet, so just use the
319     # compare file to copy from.
320     #
321     $fh = $a->{files}[0]->{fh};
322     $fh->rewind;
323     #print("Using compare file $a->{files}[0]->{name}\n");
324     }
325     if ( defined($fh) ) {
326     my $poolWrite = BackupPC::PoolWrite->new($a->{bpc}, $a->{fileName},
327     $a->{fileSize}, $a->{compress});
328     my $nRead = 0;
329    
330     while ( $nRead < $a->{fileSize} ) {
331     my $thisRead = $a->{fileSize} - $nRead < $BufSize
332     ? $a->{fileSize} - $nRead : $BufSize;
333     my $data;
334     my $n = $fh->read(\$data, $thisRead);
335     if ( $n != $thisRead ) {
336     push(@{$a->{errors}},
337     "Unable to read $thisRead bytes during resize"
338     . " from temp $fileName (got $n)\n");
339     last;
340     }
341     $poolWrite->write(\$data);
342     $nRead += $thisRead;
343     }
344     $fh->close;
345     unlink($fileName) if ( defined($fileName) );
346     if ( @{$a->{errors}} ) {
347     $poolWrite->close;
348     return (0, $a->{digest}, -s $a->{fileName}, $a->{errors});
349     } else {
350     return $poolWrite->close;
351     }
352     }
353     }
354    
355     #
356     # Close the compare files
357     #
358     foreach my $f ( @{$a->{files}} ) {
359     $f->{fh}->close();
360     }
361    
362     if ( $a->{fileSize} == 0 ) {
363     #
364     # Simply create an empty file
365     #
366     local(*OUT);
367     if ( !open(OUT, ">", $a->{fileName}) ) {
368     push(@{$a->{errors}}, "Can't open $a->{fileName} for empty"
369     . " output\n");
370     } else {
371     close(OUT);
372     }
373     return (1, $a->{digest}, -s $a->{fileName}, $a->{errors});
374     } elsif ( defined($a->{fhOut}) ) {
375     $a->{fhOut}->close();
376     return (0, $a->{digest}, -s $a->{fileName}, $a->{errors});
377     } else {
378     if ( @{$a->{files}} == 0 ) {
379     push(@{$a->{errors}}, "Botch, no matches on $a->{fileName}"
380     . " ($a->{digest})\n");
381     } elsif ( @{$a->{files}} > 1 ) {
382     #
383     # This is no longer a real error because $Conf{HardLinkMax}
384     # could be hit, thereby creating identical pool files
385     #
386     #my $str = "Unexpected multiple matches on"
387     # . " $a->{fileName} ($a->{digest})\n";
388     #for ( my $i = 0 ; $i < @{$a->{files}} ; $i++ ) {
389     # $str .= " -> $a->{files}[$i]->{name}\n";
390     #}
391     #push(@{$a->{errors}}, $str);
392     }
393     #print(" Linking $a->{fileName} to $a->{files}[0]->{name}\n");
394     if ( @{$a->{files}} && !link($a->{files}[0]->{name}, $a->{fileName}) ) {
395     push(@{$a->{errors}}, "Can't link $a->{fileName} to"
396     . " $a->{files}[0]->{name}\n");
397     }
398     return (1, $a->{digest}, -s $a->{fileName}, $a->{errors});
399     }
400     }
401    
402     #
403     # Finish writing: pass undef dataRef to write so it can do all
404     # the work. Returns a 4 element array:
405     #
406     # (existingFlag, digestString, outputFileLength, errorList)
407     #
408     sub close
409     {
410     my($a) = @_;
411    
412     return $a->write(undef);
413     }
414    
415     #
416     # Abort a pool write
417     #
418     sub abort
419     {
420     my($a) = @_;
421    
422     if ( defined($a->{fhOut}) ) {
423     $a->{fhOut}->close();
424     unlink($a->{fileName});
425     }
426     foreach my $f ( @{$a->{files}} ) {
427     $f->{fh}->close();
428     }
429     $a->{files} = [];
430     }
431    
432     #
433     # Copy $nBytes from files $fhIn to $fhOut.
434     #
435     sub filePartialCopy
436     {
437     my($a, $fhIn, $fhOut, $nBytes) = @_;
438     my($nRead);
439    
440     while ( $nRead < $nBytes ) {
441     my $thisRead = $nBytes - $nRead < $BufSize
442     ? $nBytes - $nRead : $BufSize;
443     my $data;
444     my $n = $fhIn->read(\$data, $thisRead);
445     if ( $n != $thisRead ) {
446     push(@{$a->{errors}},
447     "Unable to read $thisRead bytes from "
448     . $fhIn->name . " (got $n)\n");
449     return;
450     }
451     $n = $fhOut->write(\$data, $thisRead);
452     if ( $n != $thisRead ) {
453     push(@{$a->{errors}},
454     "Unable to write $thisRead bytes to "
455     . $fhOut->name . " (got $n)\n");
456     return;
457     }
458     $nRead += $thisRead;
459     }
460     }
461    
462     #
463     # Compare $nBytes from files $fh0 and $fh1, and also compare additional
464     # $extra bytes from $fh1 to $$extraData.
465     #
466     sub filePartialCompare
467     {
468     my($a, $fh0, $fh1, $nBytes, $extra, $extraData) = @_;
469     my($nRead, $n);
470     my($data0, $data1);
471    
472     while ( $nRead < $nBytes ) {
473     my $thisRead = $nBytes - $nRead < $BufSize
474     ? $nBytes - $nRead : $BufSize;
475     $n = $fh0->read(\$data0, $thisRead);
476     if ( $n != $thisRead ) {
477     push(@{$a->{errors}}, "Unable to read $thisRead bytes from "
478     . $fh0->name . " (got $n)\n");
479     return;
480     }
481     $n = $fh1->read(\$data1, $thisRead);
482     return 0 if ( $n < $thisRead || $data0 ne $data1 );
483     $nRead += $thisRead;
484     }
485     if ( $extra > 0 ) {
486     # verify additional bytes
487     $n = $fh1->read(\$data1, $extra);
488     return 0 if ( $n != $extra || $data1 ne $$extraData );
489     } else {
490     # verify EOF
491     $n = $fh1->read(\$data1, 100);
492     return 0 if ( $n != 0 );
493     }
494     return 1;
495     }
496    
497     1;

  ViewVC Help
Powered by ViewVC 1.1.26