HADOOP-18056. DistCp: Filter duplicates in the source paths. (#3825). Contributed by Ayush Saxena.

Reviewed-by: tomscut <litao@bigo.sg>
Reviewed-by: Steve Loughran <stevel@apache.org>
This commit is contained in:
Ayush Saxena 2022-01-05 23:53:07 +05:30
parent 6b83fe4a00
commit 5edb33b5ed
No known key found for this signature in database
GPG Key ID: D09AE71061AB564D
2 changed files with 37 additions and 1 deletion

View File

@ -30,8 +30,10 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.util.DistCpUtils;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;
@ -233,7 +235,18 @@ public final class DistCpOptions {
public List<Path> getSourcePaths() {
return sourcePaths == null ?
null : Collections.unmodifiableList(sourcePaths);
null :
Collections.unmodifiableList(getUniquePaths(sourcePaths));
}
/**
 * Builds a copy of the given paths in which every path appears only once.
 *
 * <p>Paths keep the position of their first occurrence; each repeated entry
 * is reported at INFO level and dropped.
 *
 * @param srcPaths the source paths to filter; must not be {@code null}
 * @return a new list holding the distinct paths in encounter order
 */
private List<Path> getUniquePaths(List<Path> srcPaths) {
  final Set<Path> seen = new LinkedHashSet<>();
  for (Path candidate : srcPaths) {
    if (seen.contains(candidate)) {
      // Already recorded: report the repeat and keep only the first entry.
      LOG.info("Path: {} added multiple times, ignoring the redundant entry.", candidate);
      continue;
    }
    seen.add(candidate);
  }
  return new ArrayList<>(seen);
}
public Path getTargetPath() {

View File

@ -167,6 +167,29 @@ public class TestCopyListing extends SimpleCopyListing {
}
}
/**
 * Builds a copy listing from a source list that names the same directory
 * twice and checks that the listing file is produced.
 */
@Test
public void testDuplicateSourcePaths() throws Exception {
  FileSystem fileSystem = FileSystem.get(getConf());
  List<Path> sources = new ArrayList<>();
  try {
    // Populate the source tree that both (identical) entries point at.
    TestDistCpUtils.createFile(fileSystem, "/tmp/in/src1/1.txt");
    TestDistCpUtils.createFile(fileSystem, "/tmp/in/src2/1.txt");

    // The same directory is deliberately listed twice.
    sources.add(new Path("/tmp/in"));
    sources.add(new Path("/tmp/in"));

    Path destination = new Path("/tmp/out");
    Path listingPath = new Path("/tmp/list");
    DistCpContext distCpContext = new DistCpContext(
        new DistCpOptions.Builder(sources, destination).build());

    CopyListing copyListing =
        CopyListing.getCopyListing(getConf(), CREDENTIALS, distCpContext);
    copyListing.buildListing(listingPath, distCpContext);

    Assert.assertTrue(fileSystem.exists(listingPath));
  } finally {
    TestDistCpUtils.delete(fileSystem, "/tmp");
  }
}
@Test(timeout=10000)
public void testBuildListing() {
FileSystem fs = null;