HADOOP-16932. distcp copy calls getFileStatus() needlessly and can fail against S3 (#1936)

Contributed by Steve Loughran.

This strips out all the -p preservation options which have already been
processed when uploading a file before deciding whether or not to query
the far end for the status of the (existing/uploaded) file to see if any
other attributes need changing.

This will avoid 404 caching-related issues in S3, wherein a newly created
file can have a 404 entry in the S3 load balancer's cache from the
probes for the file's existence prior to the upload.

It partially addresses a regression caused by HADOOP-8143,
"Change distcp to have -pb on by default" that causes a resurfacing
of HADOOP-13145, "In DistCp, prevent unnecessary getFileStatus call when
not preserving metadata"

Change-Id: Ibc25d19e92548e6165eb8397157ebf89446333f7
This commit is contained in:
Steve Loughran 2020-04-07 17:55:55 +01:00
parent 749a5b81da
commit e4331a73c9
No known key found for this signature in database
GPG Key ID: D22CF846DBB162A0
2 changed files with 97 additions and 37 deletions

View File

@ -199,6 +199,9 @@ public class DistCpUtils {
EnumSet<FileAttribute> attributes, EnumSet<FileAttribute> attributes,
boolean preserveRawXattrs) throws IOException { boolean preserveRawXattrs) throws IOException {
// strip out those attributes we don't need any more
attributes.remove(FileAttribute.BLOCKSIZE);
attributes.remove(FileAttribute.CHECKSUMTYPE);
// If not preserving anything from FileStatus, don't bother fetching it. // If not preserving anything from FileStatus, don't bother fetching it.
FileStatus targetFileStatus = attributes.isEmpty() ? null : FileStatus targetFileStatus = attributes.isEmpty() ? null :
targetFS.getFileStatus(path); targetFS.getFileStatus(path);

View File

@ -45,6 +45,7 @@ import org.junit.Test;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.io.OutputStream; import java.io.OutputStream;
import java.util.EnumSet; import java.util.EnumSet;
@ -66,6 +67,7 @@ import static org.apache.hadoop.hdfs.server.namenode.AclTestHelpers.aclEntry;
import static org.apache.hadoop.test.LambdaTestUtils.intercept; import static org.apache.hadoop.test.LambdaTestUtils.intercept;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;
import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThat;
@ -191,15 +193,95 @@ public class TestDistCpUtils {
DistCpUtils.preserve(fs, dst, srcStatus, attributes, false); DistCpUtils.preserve(fs, dst, srcStatus, attributes, false);
CopyListingFileStatus dstStatus = new CopyListingFileStatus(fs.getFileStatus(dst)); assertStatusEqual(fs, dst, srcStatus);
}
private void assertStatusEqual(final FileSystem fs,
final Path dst,
final CopyListingFileStatus srcStatus) throws IOException {
FileStatus destStatus = fs.getFileStatus(dst);
CopyListingFileStatus dstStatus = new CopyListingFileStatus(
destStatus);
String text = String.format("Source %s; dest %s: wrong ", srcStatus,
destStatus);
// FileStatus.equals only compares path field, must explicitly compare all fields // FileStatus.equals only compares path field, must explicitly compare all fields
Assert.assertTrue(srcStatus.getPermission().equals(dstStatus.getPermission())); assertEquals(text + "permission",
Assert.assertTrue(srcStatus.getOwner().equals(dstStatus.getOwner())); srcStatus.getPermission(), dstStatus.getPermission());
Assert.assertTrue(srcStatus.getGroup().equals(dstStatus.getGroup())); assertEquals(text + "owner",
Assert.assertTrue(srcStatus.getAccessTime() == dstStatus.getAccessTime()); srcStatus.getOwner(), dstStatus.getOwner());
Assert.assertTrue(srcStatus.getModificationTime() == dstStatus.getModificationTime()); assertEquals(text + "group",
Assert.assertTrue(srcStatus.getReplication() == dstStatus.getReplication()); srcStatus.getGroup(), dstStatus.getGroup());
assertEquals(text + "accessTime",
srcStatus.getAccessTime(), dstStatus.getAccessTime());
assertEquals(text + "modificationTime",
srcStatus.getModificationTime(), dstStatus.getModificationTime());
assertEquals(text + "replication",
srcStatus.getReplication(), dstStatus.getReplication());
}
private void assertStatusNotEqual(final FileSystem fs,
final Path dst,
final CopyListingFileStatus srcStatus) throws IOException {
FileStatus destStatus = fs.getFileStatus(dst);
CopyListingFileStatus dstStatus = new CopyListingFileStatus(
destStatus);
String text = String.format("Source %s; dest %s: wrong ",
srcStatus, destStatus);
// FileStatus.equals only compares path field,
// must explicitly compare all fields
assertNotEquals(text + "permission",
srcStatus.getPermission(), dstStatus.getPermission());
assertNotEquals(text + "owner",
srcStatus.getOwner(), dstStatus.getOwner());
assertNotEquals(text + "group",
srcStatus.getGroup(), dstStatus.getGroup());
assertNotEquals(text + "accessTime",
srcStatus.getAccessTime(), dstStatus.getAccessTime());
assertNotEquals(text + "modificationTime",
srcStatus.getModificationTime(), dstStatus.getModificationTime());
assertNotEquals(text + "replication",
srcStatus.getReplication(), dstStatus.getReplication());
}
@Test
public void testSkipsNeedlessAttributes() throws Exception {
FileSystem fs = FileSystem.get(config);
// preserve replication, block size, user, group, permission,
// checksum type and timestamps
Path src = new Path("/tmp/testSkipsNeedlessAttributes/source");
Path dst = new Path("/tmp/testSkipsNeedlessAttributes/dest");
// there is no need to actually create a source file, just a file
// status of one
CopyListingFileStatus srcStatus = new CopyListingFileStatus(
new FileStatus(0, false, 1, 32, 0, src));
// if an attribute is needed, preserve will fail to find the file
EnumSet<FileAttribute> attrs = EnumSet.of(FileAttribute.ACL,
FileAttribute.GROUP,
FileAttribute.PERMISSION,
FileAttribute.TIMES,
FileAttribute.XATTR);
for (FileAttribute attr : attrs) {
intercept(FileNotFoundException.class, () ->
DistCpUtils.preserve(fs, dst, srcStatus,
EnumSet.of(attr),
false));
}
// but with the preservation flags only used
// in file creation, this does not happen
DistCpUtils.preserve(fs, dst, srcStatus,
EnumSet.of(
FileAttribute.BLOCKSIZE,
FileAttribute.CHECKSUMTYPE),
false);
} }
@Test @Test
@ -258,16 +340,8 @@ public class TestDistCpUtils {
// FileStatus.equals only compares path field, must explicitly compare all // FileStatus.equals only compares path field, must explicitly compare all
// fields // fields
Assert.assertEquals("getPermission", srcStatus.getPermission(), assertStatusEqual(fs, dest, srcStatus);
dstStatus.getPermission());
Assert.assertEquals("Owner", srcStatus.getOwner(), dstStatus.getOwner());
Assert.assertEquals("Group", srcStatus.getGroup(), dstStatus.getGroup());
Assert.assertEquals("AccessTime", srcStatus.getAccessTime(),
dstStatus.getAccessTime());
Assert.assertEquals("ModificationTime", srcStatus.getModificationTime(),
dstStatus.getModificationTime());
Assert.assertEquals("Replication", srcStatus.getReplication(),
dstStatus.getReplication());
Assert.assertArrayEquals(en1.toArray(), dd2.toArray()); Assert.assertArrayEquals(en1.toArray(), dd2.toArray());
} }
@ -486,12 +560,7 @@ public class TestDistCpUtils {
CopyListingFileStatus dstStatus = new CopyListingFileStatus(fs.getFileStatus(dst)); CopyListingFileStatus dstStatus = new CopyListingFileStatus(fs.getFileStatus(dst));
// FileStatus.equals only compares path field, must explicitly compare all fields // FileStatus.equals only compares path field, must explicitly compare all fields
Assert.assertFalse(srcStatus.getPermission().equals(dstStatus.getPermission())); assertStatusNotEqual(fs, dst, srcStatus);
Assert.assertFalse(srcStatus.getOwner().equals(dstStatus.getOwner()));
Assert.assertFalse(srcStatus.getGroup().equals(dstStatus.getGroup()));
Assert.assertFalse(srcStatus.getAccessTime() == dstStatus.getAccessTime());
Assert.assertFalse(srcStatus.getModificationTime() == dstStatus.getModificationTime());
Assert.assertFalse(srcStatus.getReplication() == dstStatus.getReplication());
} }
@Test @Test
@ -842,13 +911,7 @@ public class TestDistCpUtils {
// FileStatus.equals only compares path field, must explicitly compare all fields // FileStatus.equals only compares path field, must explicitly compare all fields
// attributes of src -> f2 ? should be yes // attributes of src -> f2 ? should be yes
CopyListingFileStatus f2Status = new CopyListingFileStatus(fs.getFileStatus(f2)); assertStatusEqual(fs, f2, srcStatus);
Assert.assertTrue(srcStatus.getPermission().equals(f2Status.getPermission()));
Assert.assertTrue(srcStatus.getOwner().equals(f2Status.getOwner()));
Assert.assertTrue(srcStatus.getGroup().equals(f2Status.getGroup()));
Assert.assertTrue(srcStatus.getAccessTime() == f2Status.getAccessTime());
Assert.assertTrue(srcStatus.getModificationTime() == f2Status.getModificationTime());
Assert.assertTrue(srcStatus.getReplication() == f2Status.getReplication());
// attributes of src -> f1 ? should be no // attributes of src -> f1 ? should be no
CopyListingFileStatus f1Status = new CopyListingFileStatus(fs.getFileStatus(f1)); CopyListingFileStatus f1Status = new CopyListingFileStatus(fs.getFileStatus(f1));
@ -1047,13 +1110,7 @@ public class TestDistCpUtils {
// FileStatus.equals only compares path field, must explicitly compare all fields // FileStatus.equals only compares path field, must explicitly compare all fields
// attributes of src -> f0 ? should be yes // attributes of src -> f0 ? should be yes
CopyListingFileStatus f0Status = new CopyListingFileStatus(fs.getFileStatus(f0)); assertStatusEqual(fs, f0, srcStatus);
Assert.assertTrue(srcStatus.getPermission().equals(f0Status.getPermission()));
Assert.assertTrue(srcStatus.getOwner().equals(f0Status.getOwner()));
Assert.assertTrue(srcStatus.getGroup().equals(f0Status.getGroup()));
Assert.assertTrue(srcStatus.getAccessTime() == f0Status.getAccessTime());
Assert.assertTrue(srcStatus.getModificationTime() == f0Status.getModificationTime());
Assert.assertTrue(srcStatus.getReplication() == f0Status.getReplication());
// attributes of src -> f1 ? should be no // attributes of src -> f1 ? should be no
CopyListingFileStatus f1Status = new CopyListingFileStatus(fs.getFileStatus(f1)); CopyListingFileStatus f1Status = new CopyListingFileStatus(fs.getFileStatus(f1));