From 46b14111892bef38e83b235bdcdeac044c264fd7 Mon Sep 17 00:00:00 2001
From: Ayush Saxena
Date: Wed, 5 Jan 2022 23:53:07 +0530
Subject: [PATCH] HADOOP-18056. DistCp: Filter duplicates in the source paths. (#3825). Contributed by Ayush Saxena.

Reviewed-by: tomscut
Reviewed-by: Steve Loughran
---
 .../apache/hadoop/tools/DistCpOptions.java    | 15 +++++++++++-
 .../apache/hadoop/tools/TestCopyListing.java  | 23 +++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java
index 07420bdcc91..094c21d8d78 100644
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java
@@ -30,8 +30,10 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.tools.util.DistCpUtils;
 
+import java.util.ArrayList;
 import java.util.Collections;
 import java.util.EnumSet;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.NoSuchElementException;
 import java.util.Set;
@@ -233,7 +235,18 @@ public final class DistCpOptions {
 
   public List<Path> getSourcePaths() {
     return sourcePaths == null ?
-        null : Collections.unmodifiableList(sourcePaths);
+        null :
+        Collections.unmodifiableList(getUniquePaths(sourcePaths));
+  }
+
+  private List<Path> getUniquePaths(List<Path> srcPaths) {
+    Set<Path> uniquePaths = new LinkedHashSet<>();
+    for (Path path : srcPaths) {
+      if (!uniquePaths.add(path)) {
+        LOG.info("Path: {} added multiple times, ignoring the redundant entry.", path);
+      }
+    }
+    return new ArrayList<>(uniquePaths);
   }
 
   public Path getTargetPath() {
diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyListing.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyListing.java
index ce7d00d2bd7..69e1421f084 100644
--- a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyListing.java
+++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyListing.java
@@ -167,6 +167,29 @@ public class TestCopyListing extends SimpleCopyListing {
     }
   }
 
+  @Test
+  public void testDuplicateSourcePaths() throws Exception {
+    FileSystem fs = FileSystem.get(getConf());
+    List<Path> srcPaths = new ArrayList<Path>();
+    try {
+      srcPaths.add(new Path("/tmp/in"));
+      srcPaths.add(new Path("/tmp/in"));
+      TestDistCpUtils.createFile(fs, "/tmp/in/src1/1.txt");
+      TestDistCpUtils.createFile(fs, "/tmp/in/src2/1.txt");
+      Path target = new Path("/tmp/out");
+      Path listingFile = new Path("/tmp/list");
+      final DistCpOptions options =
+          new DistCpOptions.Builder(srcPaths, target).build();
+      final DistCpContext context = new DistCpContext(options);
+      CopyListing listing =
+          CopyListing.getCopyListing(getConf(), CREDENTIALS, context);
+      listing.buildListing(listingFile, context);
+      Assert.assertTrue(fs.exists(listingFile));
+    } finally {
+      TestDistCpUtils.delete(fs, "/tmp");
+    }
+  }
+
   @Test(timeout=10000)
   public void testBuildListing() {
     FileSystem fs = null;