lib/BackupPC/PoolWrite.pm

#============================================================= -*-perl-*-
#
# BackupPC::PoolWrite package
#
# DESCRIPTION
#
#   This library defines a BackupPC::PoolWrite class for writing
#   files to disk that are candidates for pooling.  One instance
#   of this class is used to write each file.  The following steps
#   are executed:
#
#     - As the incoming data arrives, the first 1MB is buffered
#       in memory so the MD5 digest can be computed.
#
#     - A running comparison against all the candidate pool files
#       (ie: those with the same MD5 digest, usually at most a single
#       file) is done as new incoming data arrives.  Up to $MaxFiles
#       simultaneous files can be compared in parallel.  This
#       involves reading and uncompressing one or more pool files.
#
#     - When a pool file no longer matches it is discarded from
#       the search.  If there are more than $MaxFiles candidates, one of
#       the new candidates is added to the search, first checking
#       that it matches up to the current point (this requires
#       re-reading one of the other pool files).
#
#     - When or if no pool files match then the new file is written
#       to disk.  This could occur many MB into the file.  We don't
#       need to buffer all this data in memory since we can copy it
#       from the last matching pool file, up to the point where it
#       fully matched.
#
#     - When all the new data is complete, if a pool file exactly
#       matches then the file is simply created as a hardlink to
#       the pool file.
#
# AUTHOR
#   Craig Barratt  <cbarratt@users.sourceforge.net>
#
# COPYRIGHT
#   Copyright (C) 2001-2003  Craig Barratt
#
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#========================================================================
#
# Version 2.1.2, released 5 Sep 2005.
#
# See http://backuppc.sourceforge.net.
#
#========================================================================

package BackupPC::PoolWrite;

use strict;

use File::Path;
use Digest::MD5;
use BackupPC::FileZIO;

#
# Create a PoolWrite object for one output file.
#
#   $bpc       object used for ConfValue(), Buffer2MD5() and MD52Path()
#   $fileName  path of the file to create
#   $fileSize  expected length of the incoming data
#   $compress  compression setting passed to BackupPC::FileZIO->open()
#
# Any plain file already at $fileName is removed, since it could be a
# hardlink to another file.
#
sub new
{
    my($class, $bpc, $fileName, $fileSize, $compress) = @_;

    my %state = (
        bpc      => $bpc,
        fileName => $fileName,
        fileSize => $fileSize,
        compress => $compress,
        data     => "",       # incoming data not yet processed
        nWrite   => 0,        # number of bytes processed so far
        digest   => undef,    # set once the first buffer has been seen
        files    => [],       # open candidate pool files
        fileCnt  => -1,       # suffix of next pool file to try (-1: none)
        fhOut    => undef,    # output handle once we are writing
        eof      => undef,    # set after EOF has been processed
        errors   => [],       # accumulated error strings
    );
    my $self = bless \%state, $class;

    $self->{hardLinkMax} = $bpc->ConfValue("HardLinkMax");

    #
    # Get rid of any existing file first, in case it is already linked
    #
    if ( -f $fileName ) {
        unlink($fileName);
    }
    return $self;
}

#
# File-level tunables shared by every PoolWrite object.
#
my $BufSize  = 1 << 20;  # 2^20 = 1MB: size of the buffering/compare chunks
my $MaxFiles = 20;       # limit on simultaneously open compare files

#
# Accept the next chunk of incoming data, or finish the file.
#
#   $dataRef  reference to a string of new data, or undef to signal
#             EOF (this is what close() passes).
#
# Incoming data is appended to $a->{data}; nothing else happens until
# at least $BufSize bytes are buffered or EOF is signalled.  The
# buffered data is then handled in one of two modes:
#
#   - compare mode ($a->{fhOut} undefined): the data is checked against
#     every open candidate pool file in $a->{files}.  A candidate that
#     stops matching is replaced by the next pool file in the chain
#     ($a->{base}, $a->{base}_0, $a->{base}_1, ...) that matches
#     everything seen so far.  When the last candidate is lost, the
#     output file is opened and the first $a->{nWrite} bytes are copied
#     into it from that candidate.
#
#   - write mode ($a->{fhOut} defined): the data is written to the
#     output file.
#
# Returns nothing while more data is expected, or if EOF has already
# been processed.  At EOF it returns the 4 element list
#
#   (existingFlag, digestString, outputFileLength, errorList)
#
# where existingFlag is 0 when a new file was written (or the size
# fixup failed) and 1 otherwise (empty file, or hardlink to a pool
# file), and errorList is a reference to the accumulated error strings.
#
sub write
{
    my($a, $dataRef) = @_;

    return if ( $a->{eof} );
    $a->{data} .= $$dataRef if ( defined($dataRef) );
    return if ( length($a->{data}) < $BufSize && defined($dataRef) );

    #
    # Correct the fileSize if it is wrong (rsync might transfer
    # a file whose length is different to the length sent with the
    # file list if the file changes between the file list sending
    # and the file sending).  Here we only catch the case where
    # we haven't computed the digest (ie: we have written no more
    # than $BufSize).  We catch the big file case below.
    #
    if ( !defined($dataRef) && !defined($a->{digest})
                && $a->{fileSize} != length($a->{data}) ) {
        #my $newSize = length($a->{data});
        #print("Fixing file size from $a->{fileSize} to $newSize\n");
        $a->{fileSize} = length($a->{data});
    }

    if ( !defined($a->{digest}) && length($a->{data}) > 0 ) {
        #
        # First buffer of data: compute the digest and build a list
        # of all the candidate matching files
        #
        my $md5 = Digest::MD5->new;
        $a->{fileSize} = length($a->{data})
                            if ( $a->{fileSize} < length($a->{data}) );
        $a->{digest} = $a->{bpc}->Buffer2MD5($md5, $a->{fileSize}, \$a->{data});
        if ( !defined($a->{base} = $a->{bpc}->MD52Path($a->{digest},
                                                       $a->{compress})) ) {
            push(@{$a->{errors}}, "Unable to get path from '$a->{digest}'"
                                . " for $a->{fileName}\n");
        } else {
            #
            # Walk the chain $base, ${base}_0, ${base}_1, ... until a
            # name doesn't exist or we have $MaxFiles files open.
            #
            while ( @{$a->{files}} < $MaxFiles ) {
                my $fh;
                my $fileName = $a->{fileCnt} < 0 ? $a->{base}
                                        : "$a->{base}_$a->{fileCnt}";
                last if ( !-f $fileName );
                #
                # stat(_) reuses the result of the -f test above; field 3
                # is the link count.  Skip files that are already at the
                # hardlink limit or that cannot be opened.
                #
                if ( (stat(_))[3] >= $a->{hardLinkMax}
                    || !defined($fh = BackupPC::FileZIO->open($fileName, 0,
                                                     $a->{compress})) ) {
                    $a->{fileCnt}++;
                    next;
                }
                push(@{$a->{files}}, {
                        name => $fileName,
                        fh   => $fh,
                     });
                $a->{fileCnt}++;
            }
        }
        #
        # if there are no candidate files then we must write
        # the new file to disk
        #
        if ( !@{$a->{files}} ) {
            $a->{fhOut} = BackupPC::FileZIO->open($a->{fileName},
                                              1, $a->{compress});
            if ( !defined($a->{fhOut}) ) {
                push(@{$a->{errors}}, "Unable to open $a->{fileName}"
                                    . " for writing\n");
            }
        }
    }
    my $dataLen = length($a->{data});
    if ( !defined($a->{fhOut}) && length($a->{data}) > 0 ) {
        #
        # See if the new chunk of data continues to match the
        # candidate files.
        #
        for ( my $i = 0 ; $i < @{$a->{files}} ; $i++ ) {
            my($d, $match);
            my $fileName = $a->{fileCnt} < 0 ? $a->{base}
                                             : "$a->{base}_$a->{fileCnt}";
            if ( $dataLen > 0 ) {
                # verify next $dataLen bytes from candidate file
                my $n = $a->{files}[$i]->{fh}->read(\$d, $dataLen);
                next if ( $n == $dataLen && $d eq $a->{data} );
            } else {
                #
                # verify candidate file is at EOF
                #
                # NOTE: the enclosing test requires length($a->{data}) > 0,
                # so this branch is not reached at present.
                #
                my $n = $a->{files}[$i]->{fh}->read(\$d, 100);
                next if ( $n == 0 );
            }
            #print("   File $a->{files}[$i]->{name} doesn't match\n");
            #
            # this candidate file didn't match.  Replace it
            # with a new candidate file.  We have to qualify
            # any new candidate file by making sure that its
            # first $a->{nWrite} bytes match, plus the next $dataLen
            # bytes match $a->{data}.
            #
            while ( -f $fileName ) {
                my $fh;
                if ( (stat(_))[3] >= $a->{hardLinkMax}
                    || !defined($fh = BackupPC::FileZIO->open($fileName, 0,
                                                     $a->{compress})) ) {
                    $a->{fileCnt}++;
                    #print("   Discarding $fileName (open failed)\n");
                    $fileName = "$a->{base}_$a->{fileCnt}";
                    next;
                }
                #
                # The failed candidate still agrees with the incoming
                # data for its first $a->{nWrite} bytes, so it is used
                # as the reference for qualifying the new file.
                #
                if ( !$a->{files}[$i]->{fh}->rewind() ) {
                    push(@{$a->{errors}},
                            "Unable to rewind $a->{files}[$i]->{name}"
                          . " for compare\n");
                }
                $match = $a->filePartialCompare($a->{files}[$i]->{fh}, $fh,
                                          $a->{nWrite}, $dataLen, \$a->{data});
                if ( $match ) {
                    $a->{files}[$i]->{fh}->close();
                    $a->{files}[$i]->{fh} = $fh;
                    $a->{files}[$i]->{name} = $fileName;
                    #print("   Found new candidate $fileName\n");
                    $a->{fileCnt}++;
                    last;
                } else {
                    #print("   Discarding $fileName (no match)\n");
                }
                $fh->close();
                $a->{fileCnt}++;
                $fileName = "$a->{base}_$a->{fileCnt}";
            }
            if ( !$match ) {
                #
                # We couldn't find another candidate file
                #
                if ( @{$a->{files}} == 1 ) {
                    #
                    # This was the last candidate: switch to write mode
                    # and recover the data already seen by copying it
                    # from this candidate.
                    #
                    #print("   Exhausted matches, now writing\n");
                    $a->{fhOut} = BackupPC::FileZIO->open($a->{fileName},
                                                    1, $a->{compress});
                    if ( !defined($a->{fhOut}) ) {
                        push(@{$a->{errors}},
                                "Unable to open $a->{fileName}"
                              . " for writing\n");
                    } else {
                        if ( !$a->{files}[$i]->{fh}->rewind() ) {
                            push(@{$a->{errors}},
                                     "Unable to rewind"
                                   . " $a->{files}[$i]->{name} for copy\n");
                        }
                        $a->filePartialCopy($a->{files}[$i]->{fh}, $a->{fhOut},
                                        $a->{nWrite});
                    }
                }
                #
                # Drop this candidate; step $i back so the entry that
                # moves into this slot is checked too.
                #
                $a->{files}[$i]->{fh}->close();
                splice(@{$a->{files}}, $i, 1);
                $i--;
            }
        }
    }
    if ( defined($a->{fhOut}) && $dataLen > 0 ) {
        #
        # if we are in writing mode then just write the data
        #
        my $n = $a->{fhOut}->write(\$a->{data});
        if ( $n != $dataLen ) {
            push(@{$a->{errors}}, "Unable to write $dataLen bytes to"
                                . " $a->{fileName} (got $n)\n");
        }
    }
    $a->{nWrite} += $dataLen;
    $a->{data} = "";
    return if ( defined($dataRef) );

    #
    # We are at EOF, so finish up
    #
    $a->{eof} = 1;

    #
    # Make sure the fileSize was correct.  See above for comments about
    # rsync.
    #
    if ( $a->{nWrite} != $a->{fileSize} ) {
        #
        # Oops, fileSize was wrong, so our MD5 digest was wrong and our
        # effort to match files likely failed.  This is ugly, but our
        # only choice at this point is to re-write the entire file with
        # the correct length.  We need to rename the file, open it for
        # reading, and then re-write the file with the correct length.
        #

        #print("Doing big file fixup ($a->{fileSize} != $a->{nWrite})\n");

        #
        # $fh is the handle we re-read the data from, $fileName is the
        # temporary file name (only set when we had been writing) and
        # $srcName describes the source in error messages.
        #
        my($fh, $fileName, $srcName);
        $a->{fileSize} = $a->{nWrite};

        if ( defined($a->{fhOut}) ) {
            if ( $a->{fileName} =~ /(.*)\// ) {
                $fileName = $1;
            } else {
                $fileName = ".";
            }
            #
            # Find a unique target temporary file name
            #
            my $i = 0;
            while ( -f "$fileName/t$$.$i" ) {
                $i++;
            }
            $fileName = "$fileName/t$$.$i";
            $a->{fhOut}->close();
            if ( !rename($a->{fileName}, $fileName)
              || !defined($fh = BackupPC::FileZIO->open($fileName, 0,
                                                 $a->{compress})) ) {
                push(@{$a->{errors}}, "Can't rename $a->{fileName} -> $fileName"
                                    . " or open during size fixup\n");
                #
                # fhOut has already been closed above, so report the
                # failure now rather than falling through to the normal
                # finish-up code, which would close it a second time.
                #
                return (0, $a->{digest}, -s $a->{fileName}, $a->{errors});
            }
            $srcName = "temp $fileName";
            #print("Using temporary name $fileName\n");
        } elsif ( defined($a->{files}) && defined($a->{files}[0]) ) {
            #
            # We haven't written anything yet, so just use the
            # compare file to copy from.
            #
            $fh      = $a->{files}[0]->{fh};
            $srcName = $a->{files}[0]->{name};
            if ( !$fh->rewind() ) {
                push(@{$a->{errors}}, "Unable to rewind $srcName for copy\n");
            }
            #print("Using compare file $a->{files}[0]->{name}\n");
        }
        if ( defined($fh) ) {
            #
            # Feed the data through a second PoolWrite object that is
            # given the correct size from the start.
            #
            my $poolWrite = BackupPC::PoolWrite->new($a->{bpc}, $a->{fileName},
                                        $a->{fileSize}, $a->{compress});
            my $nRead = 0;

            while ( $nRead < $a->{fileSize} ) {
                my $thisRead = $a->{fileSize} - $nRead < $BufSize
                             ? $a->{fileSize} - $nRead : $BufSize;
                my $data;
                my $n = $fh->read(\$data, $thisRead);
                if ( $n != $thisRead ) {
                    push(@{$a->{errors}},
                                "Unable to read $thisRead bytes during resize"
                               . " from $srcName (got $n)\n");
                    last;
                }
                $poolWrite->write(\$data);
                $nRead += $thisRead;
            }
            $fh->close;
            if ( defined($fileName) ) {
                unlink($fileName);
            } else {
                #
                # $fh was the first compare file; close the remaining
                # ones too since we return below.
                #
                for ( my $i = 1 ; $i < @{$a->{files}} ; $i++ ) {
                    $a->{files}[$i]->{fh}->close();
                }
            }
            $a->{files} = [];
            if ( @{$a->{errors}} ) {
                $poolWrite->close;
                return (0, $a->{digest}, -s $a->{fileName}, $a->{errors});
            } else {
                return $poolWrite->close;
            }
        }
    }

    #
    # Close the compare files
    #
    foreach my $f ( @{$a->{files}} ) {
        $f->{fh}->close();
    }

    if ( $a->{fileSize} == 0 ) {
        #
        # Simply create an empty file
        #
        local(*OUT);
        if ( !open(OUT, ">", $a->{fileName}) ) {
            push(@{$a->{errors}}, "Can't open $a->{fileName} for empty"
                                . " output\n");
        } else {
            close(OUT);
        }
        return (1, $a->{digest}, -s $a->{fileName}, $a->{errors});
    } elsif ( defined($a->{fhOut}) ) {
        #
        # We wrote a new file
        #
        $a->{fhOut}->close();
        return (0, $a->{digest}, -s $a->{fileName}, $a->{errors});
    } else {
        #
        # Still in compare mode at EOF: link to the first remaining
        # candidate.
        #
        if ( @{$a->{files}} == 0 ) {
            push(@{$a->{errors}}, "Botch, no matches on $a->{fileName}"
                                . " ($a->{digest})\n");
        } elsif ( @{$a->{files}} > 1 ) {
            #
            # This is no longer a real error because $Conf{HardLinkMax}
            # could be hit, thereby creating identical pool files
            #
            #my $str = "Unexpected multiple matches on"
            #       . " $a->{fileName} ($a->{digest})\n";
            #for ( my $i = 0 ; $i < @{$a->{files}} ; $i++ ) {
            #    $str .= "     -> $a->{files}[$i]->{name}\n";
            #}
            #push(@{$a->{errors}}, $str);
        }
        #print("   Linking $a->{fileName} to $a->{files}[0]->{name}\n");
        if ( @{$a->{files}} && !link($a->{files}[0]->{name}, $a->{fileName}) ) {
            push(@{$a->{errors}}, "Can't link $a->{fileName} to"
                                . " $a->{files}[0]->{name}\n");
        }
        return (1, $a->{digest}, -s $a->{fileName}, $a->{errors});
    }
}

#
# Finish the file.  write() treats an undefined data reference as the
# EOF signal and does all the remaining work, so this simply hands
# back whatever write() returns.  At EOF that is the 4 element array:
#
#   (existingFlag, digestString, outputFileLength, errorList)
#
sub close
{
    my $self = shift;

    return $self->write(undef);
}

#
# Give up on this pool write: if an output file has been opened it is
# closed and removed, and every compare file still open is closed.
#
sub abort
{
    my $self = shift;

    my $out = $self->{fhOut};
    if ( defined($out) ) {
        $out->close();
        unlink($self->{fileName});
    }
    $_->{fh}->close() for @{$self->{files}};
    $self->{files} = [];
}

#
# Copy the next $nBytes bytes from $fhIn to $fhOut, at most $BufSize
# bytes at a time.
#
#   $fhIn    handle to read from; read(\$buf, $len) and name are called
#   $fhOut   handle to write to; write(\$buf, $len) and name are called
#   $nBytes  number of bytes to copy
#
# A short read or write is recorded in $a->{errors} and ends the copy.
# Returns 1 if all $nBytes bytes were copied, and nothing on error.
#
sub filePartialCopy
{
    my($a, $fhIn, $fhOut, $nBytes) = @_;
    my $nRead = 0;      # bytes copied so far

    while ( $nRead < $nBytes ) {
        # copy whatever remains, limited to one buffer
        my $thisRead = $nBytes - $nRead < $BufSize
                            ? $nBytes - $nRead : $BufSize;
        my $data;
        my $n = $fhIn->read(\$data, $thisRead);
        if ( $n != $thisRead ) {
            push(@{$a->{errors}},
                        "Unable to read $thisRead bytes from "
                       . $fhIn->name . " (got $n)\n");
            return;
        }
        $n = $fhOut->write(\$data, $thisRead);
        if ( $n != $thisRead ) {
            push(@{$a->{errors}},
                        "Unable to write $thisRead bytes to "
                       . $fhOut->name . " (got $n)\n");
            return;
        }
        $nRead += $thisRead;
    }
    return 1;
}

#
# Compare the next $nBytes bytes of $fh0 and $fh1, at most $BufSize
# bytes at a time, and then compare what follows in $fh1 with
# $$extraData.
#
#   $fh0        reference file; failing to read it is an error
#   $fh1        file being qualified
#   $nBytes     number of bytes that must agree between $fh0 and $fh1
#   $extra      number of further bytes of $fh1 to compare with
#               $$extraData; if it is not positive, $fh1 must instead
#               be at EOF
#   $extraData  reference to the string the extra bytes must equal
#
# Returns 1 if everything matches and 0 if $fh1 differs or is too
# short.  If $fh0 cannot supply the expected data, an error is added
# to $a->{errors} and nothing (false) is returned.
#
sub filePartialCompare
{
    my($a, $fh0, $fh1, $nBytes, $extra, $extraData) = @_;
    my $nRead = 0;      # bytes compared so far
    my $n;
    my($data0, $data1);

    while ( $nRead < $nBytes ) {
        # compare whatever remains, limited to one buffer
        my $thisRead = $nBytes - $nRead < $BufSize
                            ? $nBytes - $nRead : $BufSize;
        $n = $fh0->read(\$data0, $thisRead);
        if ( $n != $thisRead ) {
            push(@{$a->{errors}}, "Unable to read $thisRead bytes from "
                                 . $fh0->name . " (got $n)\n");
            return;
        }
        $n = $fh1->read(\$data1, $thisRead);
        return 0 if ( $n < $thisRead || $data0 ne $data1 );
        $nRead += $thisRead;
    }
    if ( $extra > 0 ) {
        # verify additional bytes
        $n = $fh1->read(\$data1, $extra);
        return 0 if ( $n != $extra || $data1 ne $$extraData );
    } else {
        # verify EOF
        $n = $fh1->read(\$data1, 100);
        return 0 if ( $n != 0 );
    }
    return 1;
}

1;
1	dpavlin	1	#============================================================= --perl--
2			#
3			# BackupPC::PoolWrite package
4			#
5			# DESCRIPTION
6			#
7			# This library defines a BackupPC::PoolWrite class for writing
8			# files to disk that are candidates for pooling. One instance
9			# of this class is used to write each file. The following steps
10			# are executed:
11			#
12			# - As the incoming data arrives, the first 1MB is buffered
13			# in memory so the MD5 digest can be computed.
14			#
15			# - A running comparison against all the candidate pool files
16			# (ie: those with the same MD5 digest, usually at most a single
17			# file) is done as new incoming data arrives. Up to $MaxFiles
18			# simultaneous files can be compared in parallel. This
19			# involves reading and uncompressing one or more pool files.
20			#
21			# - When a pool file no longer matches it is discarded from
22			# the search. If there are more than $MaxFiles candidates, one of
23			# the new candidates is added to the search, first checking
24			# that it matches up to the current point (this requires
25			# re-reading one of the other pool files).
26			#
27			# - When or if no pool files match then the new file is written
28			# to disk. This could occur many MB into the file. We don't
29			# need to buffer all this data in memory since we can copy it
30			# from the last matching pool file, up to the point where it
31			# fully matched.
32			#
33			# - When all the new data is complete, if a pool file exactly
34			# matches then the file is simply created as a hardlink to
35			# the pool file.
36			#
37			# AUTHOR
38			# Craig Barratt <cbarratt@users.sourceforge.net>
39			#
40			# COPYRIGHT
41			# Copyright (C) 2001-2003 Craig Barratt
42			#
43			# This program is free software; you can redistribute it and/or modify
44			# it under the terms of the GNU General Public License as published by
45			# the Free Software Foundation; either version 2 of the License, or
46			# (at your option) any later version.
47			#
48			# This program is distributed in the hope that it will be useful,
49			# but WITHOUT ANY WARRANTY; without even the implied warranty of
50			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
51			# GNU General Public License for more details.
52			#
53			# You should have received a copy of the GNU General Public License
54			# along with this program; if not, write to the Free Software
55			# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
56			#
57			#========================================================================
58			#
59	dpavlin	316	# Version 2.1.2, released 5 Sep 2005.
60	dpavlin	1	#
61			# See http://backuppc.sourceforge.net.
62			#
63			#========================================================================
64
65			package BackupPC::PoolWrite;
66
67			use strict;
68
69			use File::Path;
70			use Digest::MD5;
71			use BackupPC::FileZIO;
72
73			sub new
74			{
75			my($class, $bpc, $fileName, $fileSize, $compress) = @_;
76
77			my $self = bless {
78			fileName => $fileName,
79			fileSize => $fileSize,
80			bpc => $bpc,
81			compress => $compress,
82			nWrite => 0,
83			digest => undef,
84			files => [],
85			fileCnt => -1,
86			fhOut => undef,
87			errors => [],
88			data => "",
89			eof => undef,
90			}, $class;
91
92			$self->{hardLinkMax} = $bpc->ConfValue("HardLinkMax");
93
94			#
95			# Always unlink any current file in case it is already linked
96			#
97			unlink($fileName) if ( -f $fileName );
98			return $self;
99			}
100
101			my $BufSize = 1048576; # 1MB or 2^20
102			my $MaxFiles = 20; # max number of compare files open at one time
103
104			sub write
105			{
106			my($a, $dataRef) = @_;
107
108			return if ( $a->{eof} );
109			$a->{data} .= $$dataRef if ( defined($dataRef) );
110			return if ( length($a->{data}) < $BufSize && defined($dataRef) );
111
112			#
113			# Correct the fileSize if it is wrong (rsync might transfer
114			# a file whose length is different to the length sent with the
115			# file list if the file changes between the file list sending
116			# and the file sending). Here we only catch the case where
117			# we haven't computed the digest (ie: we have written no more
118			# than $BufSize). We catch the big file case below.
119			#
120			if ( !defined($dataRef) && !defined($a->{digest})
121			&& $a->{fileSize} != length($a->{data}) ) {
122			#my $newSize = length($a->{data});
123			#print("Fixing file size from $a->{fileSize} to $newSize\n");
124			$a->{fileSize} = length($a->{data});
125			}
126
127			if ( !defined($a->{digest}) && length($a->{data}) > 0 ) {
128			#
129			# build a list of all the candidate matching files
130			#
131			my $md5 = Digest::MD5->new;
132			$a->{fileSize} = length($a->{data})
133			if ( $a->{fileSize} < length($a->{data}) );
134			$a->{digest} = $a->{bpc}->Buffer2MD5($md5, $a->{fileSize}, \$a->{data});
135			if ( !defined($a->{base} = $a->{bpc}->MD52Path($a->{digest},
136			$a->{compress})) ) {
137			push(@{$a->{errors}}, "Unable to get path from '$a->{digest}'"
138			. " for $a->{fileName}\n");
139			} else {
140			while ( @{$a->{files}} < $MaxFiles ) {
141			my $fh;
142			my $fileName = $a->{fileCnt} < 0 ? $a->{base}
143			: "$a->{base}_$a->{fileCnt}";
144			last if ( !-f $fileName );
145			if ( (stat(_))[3] >= $a->{hardLinkMax}
146			\|\| !defined($fh = BackupPC::FileZIO->open($fileName, 0,
147			$a->{compress})) ) {
148			$a->{fileCnt}++;
149			next;
150			}
151			push(@{$a->{files}}, {
152			name => $fileName,
153			fh => $fh,
154			});
155			$a->{fileCnt}++;
156			}
157			}
158			#
159			# if there are no candidate files then we must write
160			# the new file to disk
161			#
162			if ( !@{$a->{files}} ) {
163			$a->{fhOut} = BackupPC::FileZIO->open($a->{fileName},
164			1, $a->{compress});
165			if ( !defined($a->{fhOut}) ) {
166			push(@{$a->{errors}}, "Unable to open $a->{fileName}"
167			. " for writing\n");
168			}
169			}
170			}
171			my $dataLen = length($a->{data});
172			if ( !defined($a->{fhOut}) && length($a->{data}) > 0 ) {
173			#
174			# See if the new chunk of data continues to match the
175			# candidate files.
176			#
177			for ( my $i = 0 ; $i < @{$a->{files}} ; $i++ ) {
178			my($d, $match);
179			my $fileName = $a->{fileCnt} < 0 ? $a->{base}
180			: "$a->{base}_$a->{fileCnt}";
181			if ( $dataLen > 0 ) {
182			# verify next $dataLen bytes from candidate file
183			my $n = $a->{files}[$i]->{fh}->read(\$d, $dataLen);
184			next if ( $n == $dataLen && $d eq $a->{data} );
185			} else {
186			# verify candidate file is at EOF
187			my $n = $a->{files}[$i]->{fh}->read(\$d, 100);
188			next if ( $n == 0 );
189			}
190			#print(" File $a->{files}[$i]->{name} doesn't match\n");
191			#
192			# this candidate file didn't match. Replace it
193			# with a new candidate file. We have to qualify
194			# any new candidate file by making sure that its
195			# first $a->{nWrite} bytes match, plus the next $dataLen
196			# bytes match $a->{data}.
197			#
198			while ( -f $fileName ) {
199			my $fh;
200			if ( (stat(_))[3] >= $a->{hardLinkMax}
201			\|\| !defined($fh = BackupPC::FileZIO->open($fileName, 0,
202			$a->{compress})) ) {
203			$a->{fileCnt}++;
204			#print(" Discarding $fileName (open failed)\n");
205			$fileName = "$a->{base}_$a->{fileCnt}";
206			next;
207			}
208			if ( !$a->{files}[$i]->{fh}->rewind() ) {
209			push(@{$a->{errors}},
210			"Unable to rewind $a->{files}[$i]->{name}"
211			. " for compare\n");
212			}
213			$match = $a->filePartialCompare($a->{files}[$i]->{fh}, $fh,
214			$a->{nWrite}, $dataLen, \$a->{data});
215			if ( $match ) {
216			$a->{files}[$i]->{fh}->close();
217			$a->{files}[$i]->{fh} = $fh,
218			$a->{files}[$i]->{name} = $fileName;
219			#print(" Found new candidate $fileName\n");
220			$a->{fileCnt}++;
221			last;
222			} else {
223			#print(" Discarding $fileName (no match)\n");
224			}
225			$fh->close();
226			$a->{fileCnt}++;
227			$fileName = "$a->{base}_$a->{fileCnt}";
228			}
229			if ( !$match ) {
230			#
231			# We couldn't find another candidate file
232			#
233			if ( @{$a->{files}} == 1 ) {
234			#print(" Exhausted matches, now writing\n");
235			$a->{fhOut} = BackupPC::FileZIO->open($a->{fileName},
236			1, $a->{compress});
237			if ( !defined($a->{fhOut}) ) {
238			push(@{$a->{errors}},
239			"Unable to open $a->{fileName}"
240			. " for writing\n");
241			} else {
242			if ( !$a->{files}[$i]->{fh}->rewind() ) {
243			push(@{$a->{errors}},
244			"Unable to rewind"
245			. " $a->{files}[$i]->{name} for copy\n");
246			}
247			$a->filePartialCopy($a->{files}[$i]->{fh}, $a->{fhOut},
248			$a->{nWrite});
249			}
250			}
251			$a->{files}[$i]->{fh}->close();
252			splice(@{$a->{files}}, $i, 1);
253			$i--;
254			}
255			}
256			}
257			if ( defined($a->{fhOut}) && $dataLen > 0 ) {
258			#
259			# if we are in writing mode then just write the data
260			#
261			my $n = $a->{fhOut}->write(\$a->{data});
262			if ( $n != $dataLen ) {
263			push(@{$a->{errors}}, "Unable to write $dataLen bytes to"
264			. " $a->{fileName} (got $n)\n");
265			}
266			}
267			$a->{nWrite} += $dataLen;
268			$a->{data} = "";
269			return if ( defined($dataRef) );
270
271			#
272			# We are at EOF, so finish up
273			#
274			$a->{eof} = 1;
275
276			#
277			# Make sure the fileSize was correct. See above for comments about
278			# rsync.
279			#
280			if ( $a->{nWrite} != $a->{fileSize} ) {
281			#
282			# Oops, fileSize was wrong, so our MD5 digest was wrong and our
283			# effort to match files likely failed. This is ugly, but our
284			# only choice at this point is to re-write the entire file with
285			# the correct length. We need to rename the file, open it for
286			# reading, and then re-write the file with the correct length.
287			#
288
289			#print("Doing big file fixup ($a->{fileSize} != $a->{nWrite})\n");
290
291			my($fh, $fileName);
292			$a->{fileSize} = $a->{nWrite};
293
294			if ( defined($a->{fhOut}) ) {
295			if ( $a->{fileName} =~ /(.*)\// ) {
296			$fileName = $1;
297			} else {
298			$fileName = ".";
299			}
300			#
301			# Find a unique target temporary file name
302			#
303			my $i = 0;
304			while ( -f "$fileName/t$$.$i" ) {
305			$i++;
306			}
307			$fileName = "$fileName/t$$.$i";
308			$a->{fhOut}->close();
309			if ( !rename($a->{fileName}, $fileName)
310			\|\| !defined($fh = BackupPC::FileZIO->open($fileName, 0,
311			$a->{compress})) ) {
312			push(@{$a->{errors}}, "Can't rename $a->{fileName} -> $fileName"
313			. " or open during size fixup\n");
314			}
315			#print("Using temporary name $fileName\n");
316			} elsif ( defined($a->{files}) && defined($a->{files}[0]) ) {
317			#
318			# We haven't written anything yet, so just use the
319			# compare file to copy from.
320			#
321			$fh = $a->{files}[0]->{fh};
322			$fh->rewind;
323			#print("Using compare file $a->{files}[0]->{name}\n");
324			}
325			if ( defined($fh) ) {
326			my $poolWrite = BackupPC::PoolWrite->new($a->{bpc}, $a->{fileName},
327			$a->{fileSize}, $a->{compress});
328			my $nRead = 0;
329
330			while ( $nRead < $a->{fileSize} ) {
331			my $thisRead = $a->{fileSize} - $nRead < $BufSize
332			? $a->{fileSize} - $nRead : $BufSize;
333			my $data;
334			my $n = $fh->read(\$data, $thisRead);
335			if ( $n != $thisRead ) {
336			push(@{$a->{errors}},
337			"Unable to read $thisRead bytes during resize"
338			. " from temp $fileName (got $n)\n");
339			last;
340			}
341			$poolWrite->write(\$data);
342			$nRead += $thisRead;
343			}
344			$fh->close;
345			unlink($fileName) if ( defined($fileName) );
346			if ( @{$a->{errors}} ) {
347			$poolWrite->close;
348			return (0, $a->{digest}, -s $a->{fileName}, $a->{errors});
349			} else {
350			return $poolWrite->close;
351			}
352			}
353			}
354
355			#
356			# Close the compare files
357			#
358			foreach my $f ( @{$a->{files}} ) {
359			$f->{fh}->close();
360			}
361
362			if ( $a->{fileSize} == 0 ) {
363			#
364			# Simply create an empty file
365			#
366			local(*OUT);
367			if ( !open(OUT, ">", $a->{fileName}) ) {
368			push(@{$a->{errors}}, "Can't open $a->{fileName} for empty"
369			. " output\n");
370			} else {
371			close(OUT);
372			}
373			return (1, $a->{digest}, -s $a->{fileName}, $a->{errors});
374			} elsif ( defined($a->{fhOut}) ) {
375			$a->{fhOut}->close();
376			return (0, $a->{digest}, -s $a->{fileName}, $a->{errors});
377			} else {
378			if ( @{$a->{files}} == 0 ) {
379			push(@{$a->{errors}}, "Botch, no matches on $a->{fileName}"
380			. " ($a->{digest})\n");
381			} elsif ( @{$a->{files}} > 1 ) {
382			#
383			# This is no longer a real error because $Conf{HardLinkMax}
384			# could be hit, thereby creating identical pool files
385			#
386			#my $str = "Unexpected multiple matches on"
387			# . " $a->{fileName} ($a->{digest})\n";
388			#for ( my $i = 0 ; $i < @{$a->{files}} ; $i++ ) {
389			# $str .= " -> $a->{files}[$i]->{name}\n";
390			#}
391			#push(@{$a->{errors}}, $str);
392			}
393			#print(" Linking $a->{fileName} to $a->{files}[0]->{name}\n");
394			if ( @{$a->{files}} && !link($a->{files}[0]->{name}, $a->{fileName}) ) {
395			push(@{$a->{errors}}, "Can't link $a->{fileName} to"
396			. " $a->{files}[0]->{name}\n");
397			}
398			return (1, $a->{digest}, -s $a->{fileName}, $a->{errors});
399			}
400			}
401
402			#
403			# Finish writing: pass undef dataRef to write so it can do all
404			# the work. Returns a 4 element array:
405			#
406			# (existingFlag, digestString, outputFileLength, errorList)
407			#
408			sub close
409			{
410			my($a) = @_;
411
412			return $a->write(undef);
413			}
414
415			#
416			# Abort a pool write
417			#
418			sub abort
419			{
420			my($a) = @_;
421
422			if ( defined($a->{fhOut}) ) {
423			$a->{fhOut}->close();
424			unlink($a->{fileName});
425			}
426			foreach my $f ( @{$a->{files}} ) {
427			$f->{fh}->close();
428			}
429			$a->{files} = [];
430			}
431
432			#
433			# Copy $nBytes from files $fhIn to $fhOut.
434			#
435			sub filePartialCopy
436			{
437			my($a, $fhIn, $fhOut, $nBytes) = @_;
438			my($nRead);
439
440			while ( $nRead < $nBytes ) {
441			my $thisRead = $nBytes - $nRead < $BufSize
442			? $nBytes - $nRead : $BufSize;
443			my $data;
444			my $n = $fhIn->read(\$data, $thisRead);
445			if ( $n != $thisRead ) {
446			push(@{$a->{errors}},
447			"Unable to read $thisRead bytes from "
448			. $fhIn->name . " (got $n)\n");
449			return;
450			}
451			$n = $fhOut->write(\$data, $thisRead);
452			if ( $n != $thisRead ) {
453			push(@{$a->{errors}},
454			"Unable to write $thisRead bytes to "
455			. $fhOut->name . " (got $n)\n");
456			return;
457			}
458			$nRead += $thisRead;
459			}
460			}
461
462			#
463			# Compare $nBytes from files $fh0 and $fh1, and also compare additional
464			# $extra bytes from $fh1 to $$extraData.
465			#
466			sub filePartialCompare
467			{
468			my($a, $fh0, $fh1, $nBytes, $extra, $extraData) = @_;
469			my($nRead, $n);
470			my($data0, $data1);
471
472			while ( $nRead < $nBytes ) {
473			my $thisRead = $nBytes - $nRead < $BufSize
474			? $nBytes - $nRead : $BufSize;
475			$n = $fh0->read(\$data0, $thisRead);
476			if ( $n != $thisRead ) {
477			push(@{$a->{errors}}, "Unable to read $thisRead bytes from "
478			. $fh0->name . " (got $n)\n");
479			return;
480			}
481			$n = $fh1->read(\$data1, $thisRead);
482			return 0 if ( $n < $thisRead \|\| $data0 ne $data1 );
483			$nRead += $thisRead;
484			}
485			if ( $extra > 0 ) {
486			# verify additional bytes
487			$n = $fh1->read(\$data1, $extra);
488			return 0 if ( $n != $extra \|\| $data1 ne $$extraData );
489			} else {
490			# verify EOF
491			$n = $fh1->read(\$data1, 100);
492			return 0 if ( $n != 0 );
493			}
494			return 1;
495			}
496
497			1;