HADOOP-18056. DistCp: Filter duplicates in the source paths. (#3825). Contributed by Ayush Saxena.
Reviewed-by: tomscut <litao@bigo.sg>
Reviewed-by: Steve Loughran <stevel@apache.org>
This commit is contained in:
parent
d7cacea07b
commit
46b1411189
|
@ -30,8 +30,10 @@ import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.hadoop.tools.util.DistCpUtils;
|
import org.apache.hadoop.tools.util.DistCpUtils;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.EnumSet;
|
import java.util.EnumSet;
|
||||||
|
import java.util.LinkedHashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.NoSuchElementException;
|
import java.util.NoSuchElementException;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
@ -233,7 +235,18 @@ public final class DistCpOptions {
|
||||||
|
|
||||||
public List<Path> getSourcePaths() {
|
/**
 * Returns the configured source paths.
 *
 * In the new (right-hand) version shown here the list is first passed through
 * getUniquePaths(), so repeated entries are dropped before the result is
 * wrapped in an unmodifiable view; the old (left-hand) version wrapped
 * sourcePaths directly.
 *
 * NOTE(review): the de-duplication (and any logging it triggers) is redone on
 * every call because the filtered list is not cached here.
 *
 * @return null when sourcePaths is null, otherwise an unmodifiable list of the
 *         distinct source paths.
 */
public List<Path> getSourcePaths() {
|
||||||
return sourcePaths == null ?
|
return sourcePaths == null ?
|
||||||
null : Collections.unmodifiableList(sourcePaths);
|
null :
|
||||||
|
Collections.unmodifiableList(getUniquePaths(sourcePaths));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Builds a copy of the given paths with repeated entries removed.
 *
 * A LinkedHashSet is used, so each path keeps the position of its first
 * occurrence. Every entry that the set rejects as already present is reported
 * once at INFO level and otherwise ignored.
 *
 * @param srcPaths paths to filter; iterated directly, so it must not be null
 *                 (the visible caller checks for null before calling).
 * @return a new ArrayList holding the distinct paths in first-seen order.
 */
private List<Path> getUniquePaths(List<Path> srcPaths) {
|
||||||
|
Set<Path> uniquePaths = new LinkedHashSet<>();
|
||||||
|
for (Path path : srcPaths) {
|
||||||
|
if (!uniquePaths.add(path)) {
|
||||||
|
LOG.info("Path: {} added multiple times, ignoring the redundant entry.", path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new ArrayList<>(uniquePaths);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Path getTargetPath() {
|
public Path getTargetPath() {
|
||||||
|
|
|
@ -167,6 +167,29 @@ public class TestCopyListing extends SimpleCopyListing {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds a copy listing when the same source directory ("/tmp/in") is supplied
 * twice. Two files are created under that directory, the options/context are
 * built from the duplicated source list, and the listing is written to
 * "/tmp/list". The "/tmp" tree is deleted in the finally block.
 *
 * NOTE(review): the only assertion is that the listing file exists; the test
 * does not inspect the listing contents or the filtered source list, so it
 * presumably relies on buildListing() failing when duplicates are not
 * filtered - confirm. Unlike the neighbouring @Test(timeout=10000) it also
 * declares no timeout.
 */
@Test
|
||||||
|
public void testDuplicateSourcePaths() throws Exception {
|
||||||
|
FileSystem fs = FileSystem.get(getConf());
|
||||||
|
List<Path> srcPaths = new ArrayList<Path>();
|
||||||
|
try {
|
||||||
|
srcPaths.add(new Path("/tmp/in"));
|
||||||
|
srcPaths.add(new Path("/tmp/in"));
|
||||||
|
TestDistCpUtils.createFile(fs, "/tmp/in/src1/1.txt");
|
||||||
|
TestDistCpUtils.createFile(fs, "/tmp/in/src2/1.txt");
|
||||||
|
Path target = new Path("/tmp/out");
|
||||||
|
Path listingFile = new Path("/tmp/list");
|
||||||
|
final DistCpOptions options =
|
||||||
|
new DistCpOptions.Builder(srcPaths, target).build();
|
||||||
|
final DistCpContext context = new DistCpContext(options);
|
||||||
|
CopyListing listing =
|
||||||
|
CopyListing.getCopyListing(getConf(), CREDENTIALS, context);
|
||||||
|
listing.buildListing(listingFile, context);
|
||||||
|
Assert.assertTrue(fs.exists(listingFile));
|
||||||
|
} finally {
|
||||||
|
TestDistCpUtils.delete(fs, "/tmp");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Test(timeout=10000)
|
@Test(timeout=10000)
|
||||||
public void testBuildListing() {
|
public void testBuildListing() {
|
||||||
FileSystem fs = null;
|
FileSystem fs = null;
|
||||||
|
|
Loading…
Reference in New Issue