HADOOP-16932. distcp copy calls getFileStatus() needlessly and can fail against S3 (#1936)

Contributed by Steve Loughran.

This strips out all the -p preservation options which have already been
processed when uploading a file before deciding whether or not to query
the far end for the status of the (existing/uploaded) file to see if any
other attributes need changing.

This will avoid 404 caching-related issues in S3, wherein a newly created
file can have a 404 entry in the S3 load balancer's cache from the
probes for the file's existence prior to the upload.

It partially addresses a regression caused by HADOOP-8143,
"Change distcp to have -pb on by default" that causes a resurfacing
of HADOOP-13145, "In DistCp, prevent unnecessary getFileStatus call when
not preserving metadata"
This commit is contained in:
Steve Loughran 2020-04-07 17:55:55 +01:00 committed by GitHub
parent bffb43b00e
commit 20eec95867
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 97 additions and 37 deletions

View File

@ -199,6 +199,9 @@ public class DistCpUtils {
EnumSet<FileAttribute> attributes,
boolean preserveRawXattrs) throws IOException {
// strip out those attributes we don't need any more
attributes.remove(FileAttribute.BLOCKSIZE);
attributes.remove(FileAttribute.CHECKSUMTYPE);
// If not preserving anything from FileStatus, don't bother fetching it.
FileStatus targetFileStatus = attributes.isEmpty() ? null :
targetFS.getFileStatus(path);

View File

@ -45,6 +45,7 @@ import org.junit.Test;
import com.google.common.collect.Lists;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.util.EnumSet;
@ -66,6 +67,7 @@ import static org.apache.hadoop.hdfs.server.namenode.AclTestHelpers.aclEntry;
import static org.apache.hadoop.test.LambdaTestUtils.intercept;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertTrue;
import static org.assertj.core.api.Assertions.assertThat;
@ -191,15 +193,95 @@ public class TestDistCpUtils {
DistCpUtils.preserve(fs, dst, srcStatus, attributes, false);
CopyListingFileStatus dstStatus = new CopyListingFileStatus(fs.getFileStatus(dst));
assertStatusEqual(fs, dst, srcStatus);
}
private void assertStatusEqual(final FileSystem fs,
final Path dst,
final CopyListingFileStatus srcStatus) throws IOException {
FileStatus destStatus = fs.getFileStatus(dst);
CopyListingFileStatus dstStatus = new CopyListingFileStatus(
destStatus);
String text = String.format("Source %s; dest %s: wrong ", srcStatus,
destStatus);
// FileStatus.equals only compares path field, must explicitly compare all fields
Assert.assertTrue(srcStatus.getPermission().equals(dstStatus.getPermission()));
Assert.assertTrue(srcStatus.getOwner().equals(dstStatus.getOwner()));
Assert.assertTrue(srcStatus.getGroup().equals(dstStatus.getGroup()));
Assert.assertTrue(srcStatus.getAccessTime() == dstStatus.getAccessTime());
Assert.assertTrue(srcStatus.getModificationTime() == dstStatus.getModificationTime());
Assert.assertTrue(srcStatus.getReplication() == dstStatus.getReplication());
assertEquals(text + "permission",
srcStatus.getPermission(), dstStatus.getPermission());
assertEquals(text + "owner",
srcStatus.getOwner(), dstStatus.getOwner());
assertEquals(text + "group",
srcStatus.getGroup(), dstStatus.getGroup());
assertEquals(text + "accessTime",
srcStatus.getAccessTime(), dstStatus.getAccessTime());
assertEquals(text + "modificationTime",
srcStatus.getModificationTime(), dstStatus.getModificationTime());
assertEquals(text + "replication",
srcStatus.getReplication(), dstStatus.getReplication());
}
private void assertStatusNotEqual(final FileSystem fs,
final Path dst,
final CopyListingFileStatus srcStatus) throws IOException {
FileStatus destStatus = fs.getFileStatus(dst);
CopyListingFileStatus dstStatus = new CopyListingFileStatus(
destStatus);
String text = String.format("Source %s; dest %s: wrong ",
srcStatus, destStatus);
// FileStatus.equals only compares path field,
// must explicitly compare all fields
assertNotEquals(text + "permission",
srcStatus.getPermission(), dstStatus.getPermission());
assertNotEquals(text + "owner",
srcStatus.getOwner(), dstStatus.getOwner());
assertNotEquals(text + "group",
srcStatus.getGroup(), dstStatus.getGroup());
assertNotEquals(text + "accessTime",
srcStatus.getAccessTime(), dstStatus.getAccessTime());
assertNotEquals(text + "modificationTime",
srcStatus.getModificationTime(), dstStatus.getModificationTime());
assertNotEquals(text + "replication",
srcStatus.getReplication(), dstStatus.getReplication());
}
@Test
public void testSkipsNeedlessAttributes() throws Exception {
FileSystem fs = FileSystem.get(config);
// preserve replication, block size, user, group, permission,
// checksum type and timestamps
Path src = new Path("/tmp/testSkipsNeedlessAttributes/source");
Path dst = new Path("/tmp/testSkipsNeedlessAttributes/dest");
// there is no need to actually create a source file, just a file
// status of one
CopyListingFileStatus srcStatus = new CopyListingFileStatus(
new FileStatus(0, false, 1, 32, 0, src));
// if an attribute is needed, preserve will fail to find the file
EnumSet<FileAttribute> attrs = EnumSet.of(FileAttribute.ACL,
FileAttribute.GROUP,
FileAttribute.PERMISSION,
FileAttribute.TIMES,
FileAttribute.XATTR);
for (FileAttribute attr : attrs) {
intercept(FileNotFoundException.class, () ->
DistCpUtils.preserve(fs, dst, srcStatus,
EnumSet.of(attr),
false));
}
// but with the preservation flags only used
// in file creation, this does not happen
DistCpUtils.preserve(fs, dst, srcStatus,
EnumSet.of(
FileAttribute.BLOCKSIZE,
FileAttribute.CHECKSUMTYPE),
false);
}
@Test
@ -258,16 +340,8 @@ public class TestDistCpUtils {
// FileStatus.equals only compares path field, must explicitly compare all
// fields
Assert.assertEquals("getPermission", srcStatus.getPermission(),
dstStatus.getPermission());
Assert.assertEquals("Owner", srcStatus.getOwner(), dstStatus.getOwner());
Assert.assertEquals("Group", srcStatus.getGroup(), dstStatus.getGroup());
Assert.assertEquals("AccessTime", srcStatus.getAccessTime(),
dstStatus.getAccessTime());
Assert.assertEquals("ModificationTime", srcStatus.getModificationTime(),
dstStatus.getModificationTime());
Assert.assertEquals("Replication", srcStatus.getReplication(),
dstStatus.getReplication());
assertStatusEqual(fs, dest, srcStatus);
Assert.assertArrayEquals(en1.toArray(), dd2.toArray());
}
@ -486,12 +560,7 @@ public class TestDistCpUtils {
CopyListingFileStatus dstStatus = new CopyListingFileStatus(fs.getFileStatus(dst));
// FileStatus.equals only compares path field, must explicitly compare all fields
Assert.assertFalse(srcStatus.getPermission().equals(dstStatus.getPermission()));
Assert.assertFalse(srcStatus.getOwner().equals(dstStatus.getOwner()));
Assert.assertFalse(srcStatus.getGroup().equals(dstStatus.getGroup()));
Assert.assertFalse(srcStatus.getAccessTime() == dstStatus.getAccessTime());
Assert.assertFalse(srcStatus.getModificationTime() == dstStatus.getModificationTime());
Assert.assertFalse(srcStatus.getReplication() == dstStatus.getReplication());
assertStatusNotEqual(fs, dst, srcStatus);
}
@Test
@ -842,13 +911,7 @@ public class TestDistCpUtils {
// FileStatus.equals only compares path field, must explicitly compare all fields
// attributes of src -> f2 ? should be yes
CopyListingFileStatus f2Status = new CopyListingFileStatus(fs.getFileStatus(f2));
Assert.assertTrue(srcStatus.getPermission().equals(f2Status.getPermission()));
Assert.assertTrue(srcStatus.getOwner().equals(f2Status.getOwner()));
Assert.assertTrue(srcStatus.getGroup().equals(f2Status.getGroup()));
Assert.assertTrue(srcStatus.getAccessTime() == f2Status.getAccessTime());
Assert.assertTrue(srcStatus.getModificationTime() == f2Status.getModificationTime());
Assert.assertTrue(srcStatus.getReplication() == f2Status.getReplication());
assertStatusEqual(fs, f2, srcStatus);
// attributes of src -> f1 ? should be no
CopyListingFileStatus f1Status = new CopyListingFileStatus(fs.getFileStatus(f1));
@ -1047,13 +1110,7 @@ public class TestDistCpUtils {
// FileStatus.equals only compares path field, must explicitly compare all fields
// attributes of src -> f0 ? should be yes
CopyListingFileStatus f0Status = new CopyListingFileStatus(fs.getFileStatus(f0));
Assert.assertTrue(srcStatus.getPermission().equals(f0Status.getPermission()));
Assert.assertTrue(srcStatus.getOwner().equals(f0Status.getOwner()));
Assert.assertTrue(srcStatus.getGroup().equals(f0Status.getGroup()));
Assert.assertTrue(srcStatus.getAccessTime() == f0Status.getAccessTime());
Assert.assertTrue(srcStatus.getModificationTime() == f0Status.getModificationTime());
Assert.assertTrue(srcStatus.getReplication() == f0Status.getReplication());
assertStatusEqual(fs, f0, srcStatus);
// attributes of src -> f1 ? should be no
CopyListingFileStatus f1Status = new CopyListingFileStatus(fs.getFileStatus(f1));