HDFS-9630. DistCp minor refactoring and clean up. Contributed by Kai Zheng.

Change-Id: I363c4ffcac32116ddcdc0a22fac3db92f14a0db0
2016-01-11 09:46:56 -08:00 · 2016-01-11 09:46:56 -08:00 · 95f32015ad
parent 8c1adeaa26
commit 95f32015ad
12 changed files with 65 additions and 51 deletions
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@ -211,6 +211,8 @@ Trunk (Unreleased)
    HDFS-9582. TestLeaseRecoveryStriped file missing Apache License header 
    and not well formatted. (umamahesh)

+    HDFS-9630. DistCp minor refactoring and clean up. (Kai Zheng via zhz)
+
  OPTIMIZATIONS

  BUG FIXES
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/RegexCopyFilter.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/RegexCopyFilter.java
@ -20,10 +20,16 @@

 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.fs.*;
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IOUtils;

-import java.io.*;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.List;
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/SimpleCopyListing.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/SimpleCopyListing.java
@ -39,7 +39,8 @@

 import com.google.common.annotations.VisibleForTesting;

-import java.io.*;
+import java.io.FileNotFoundException;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashSet;

@ -165,9 +166,9 @@ protected void validatePaths(DistCpOptions options)
    }
  }

-  /** {@inheritDoc} */
  @Override
-  public void doBuildListing(Path pathToListingFile, DistCpOptions options) throws IOException {
+  protected void doBuildListing(Path pathToListingFile,
+                                DistCpOptions options) throws IOException {
    if(options.shouldUseDiff()) {
      doBuildListingWithSnapshotDiff(getWriter(pathToListingFile), options);
    }else {
@ -227,8 +228,9 @@ private void addToFileListing(SequenceFile.Writer fileListWriter,
   * @throws IOException
   */
  @VisibleForTesting
-  public void doBuildListingWithSnapshotDiff(SequenceFile.Writer fileListWriter,
-      DistCpOptions options) throws IOException {
+  protected void doBuildListingWithSnapshotDiff(
+      SequenceFile.Writer fileListWriter, DistCpOptions options)
+      throws IOException {
    ArrayList<DiffInfo> diffList = distCpSync.prepareDiffList();
    Path sourceRoot = options.getSourcePaths().get(0);
    FileSystem sourceFS = sourceRoot.getFileSystem(getConf());
@ -287,7 +289,7 @@ public void doBuildListingWithSnapshotDiff(SequenceFile.Writer fileListWriter,
   * @throws IOException
   */
  @VisibleForTesting
-  public void doBuildListing(SequenceFile.Writer fileListWriter,
+  protected void doBuildListing(SequenceFile.Writer fileListWriter,
      DistCpOptions options) throws IOException {
    if (options.getNumListstatusThreads() > 0) {
      numListstatusThreads = options.getNumListstatusThreads();
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyCommitter.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyCommitter.java
@ -27,10 +27,16 @@
 import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.JobStatus;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
-import org.apache.hadoop.tools.*;
+import org.apache.hadoop.tools.CopyListing;
+import org.apache.hadoop.tools.CopyListingFileStatus;
+import org.apache.hadoop.tools.DistCpConstants;
+import org.apache.hadoop.tools.DistCpOptions;
 import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
+import org.apache.hadoop.tools.GlobbedCopyListing;
 import org.apache.hadoop.tools.util.DistCpUtils;

 import java.io.IOException;
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyOutputFormat.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyOutputFormat.java
@ -20,7 +20,10 @@

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.OutputCommitter;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
 import org.apache.hadoop.mapreduce.security.TokenCache;
 import org.apache.hadoop.tools.DistCpConstants;
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableFileCopyCommand.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableFileCopyCommand.java
@ -201,11 +201,13 @@ private void compareCheckSums(FileSystem sourceFS, Path source,
        targetFS, target)) {
      StringBuilder errorMessage = new StringBuilder("Check-sum mismatch between ")
          .append(source).append(" and ").append(target).append(".");
-      if (sourceFS.getFileStatus(source).getBlockSize() != targetFS.getFileStatus(target).getBlockSize()) {
+      if (sourceFS.getFileStatus(source).getBlockSize() !=
+          targetFS.getFileStatus(target).getBlockSize()) {
        errorMessage.append(" Source and target differ in block-size.")
            .append(" Use -pb to preserve block-sizes during copy.")
            .append(" Alternatively, skip checksum-checks altogether, using -skipCrc.")
-						.append(" (NOTE: By skipping checksums, one runs the risk of masking data-corruption during file-transfer.)");
+            .append(" (NOTE: By skipping checksums, one runs the risk of " +
+                "masking data-corruption during file-transfer.)");
      }
      throw new IOException(errorMessage.toString());
    }
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/UniformSizeInputFormat.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/UniformSizeInputFormat.java
@ -23,12 +23,16 @@
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.hadoop.tools.CopyListingFileStatus;
 import org.apache.hadoop.tools.DistCpConstants;
 import org.apache.hadoop.tools.util.DistCpUtils;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.mapreduce.*;
 import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;
 import org.apache.hadoop.mapreduce.lib.input.FileSplit;
 import org.apache.hadoop.conf.Configuration;
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java
@ -18,17 +18,7 @@

 package org.apache.hadoop.tools.util;

-import java.io.IOException;
-import java.net.InetAddress;
-import java.net.URI;
-import java.net.UnknownHostException;
-import java.text.DecimalFormat;
-import java.util.EnumSet;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Map.Entry;
-
+import com.google.common.collect.Maps;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@ -49,10 +39,15 @@
 import org.apache.hadoop.tools.DistCpOptions;
 import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
 import org.apache.hadoop.tools.mapred.UniformSizeInputFormat;
-
-import com.google.common.collect.Maps;
 import org.apache.hadoop.util.StringUtils;

+import java.io.IOException;
+import java.text.DecimalFormat;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
 /**
 * Utility functions used in DistCp.
 */
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/ProducerConsumer.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/ProducerConsumer.java
@ -20,15 +20,11 @@

 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.tools.util.WorkReport;
-import org.apache.hadoop.tools.util.WorkRequest;
-import org.apache.hadoop.tools.util.WorkRequestProcessor;

-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.ArrayList;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;

 /**
 * ProducerConsumer class encapsulates input and output queues and a
@ -51,8 +47,8 @@ public class ProducerConsumer<T, R> {
   *  @param numThreads   Size of thread-pool to execute Workers.
   */
  public ProducerConsumer(int numThreads) {
-    this.inputQueue = new LinkedBlockingQueue<WorkRequest<T>>();
-    this.outputQueue = new LinkedBlockingQueue<WorkReport<R>>();
+    this.inputQueue = new LinkedBlockingQueue<>();
+    this.outputQueue = new LinkedBlockingQueue<>();
    executor = Executors.newFixedThreadPool(numThreads);
    workCnt = new AtomicInteger(0);
  }
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/ThrottledInputStream.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/ThrottledInputStream.java
@ -18,13 +18,10 @@

 package org.apache.hadoop.tools.util;

-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.PositionedReadable;

-import com.google.common.base.Preconditions;
+import java.io.IOException;
+import java.io.InputStream;

 /**
 * The ThrottleInputStream provides bandwidth throttling on a specified
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/WorkRequestProcessor.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/WorkRequestProcessor.java
@ -18,9 +18,6 @@

 package org.apache.hadoop.tools.util;

-import org.apache.hadoop.tools.util.WorkReport;
-import org.apache.hadoop.tools.util.WorkRequest;
-
 /**
 *  Interface for ProducerConsumer worker loop.
 *
--- a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java
+++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java
@ -378,8 +378,9 @@ public void testMakeDirFailure() {
              workPath);
      copyMapper.setup(context);

-      copyMapper.map(new Text(DistCpUtils.getRelativePath(new Path(SOURCE_PATH), pathList.get(0))),
-              new CopyListingFileStatus(fs.getFileStatus(pathList.get(0))), context);
+      copyMapper.map(new Text(DistCpUtils.getRelativePath(new Path(SOURCE_PATH),
+          pathList.get(0))),
+          new CopyListingFileStatus(fs.getFileStatus(pathList.get(0))), context);

      Assert.assertTrue("There should have been an exception.", false);
    }
@ -525,7 +526,8 @@ public Mapper<Text, CopyListingFileStatus, Text, Text>.Context run() {
      mkdirs(TARGET_PATH);
      cluster.getFileSystem().setPermission(new Path(SOURCE_PATH + "/src/file"),
          new FsPermission(FsAction.READ, FsAction.READ, FsAction.READ));
-      cluster.getFileSystem().setPermission(new Path(TARGET_PATH), new FsPermission((short)511));
+      cluster.getFileSystem().setPermission(new Path(TARGET_PATH),
+          new FsPermission((short)511));

      final FileSystem tmpFS = tmpUser.doAs(new PrivilegedAction<FileSystem>() {
        @Override
@ -785,7 +787,8 @@ private void doTestIgnoreFailures(boolean ignoreFailures) {
      }
      if (ignoreFailures) {
        for (Text value : stubContext.getWriter().values()) {
-          Assert.assertTrue(value.toString() + " is not skipped", value.toString().startsWith("FAIL:"));
+          Assert.assertTrue(value.toString() + " is not skipped",
+              value.toString().startsWith("FAIL:"));
        }
      }
      Assert.assertTrue("There should have been an exception.", ignoreFailures);
@ -813,7 +816,6 @@ public void testPreserveBlockSizeAndReplication() {
  @Test(timeout=40000)
  public void testCopyFailOnBlockSizeDifference() {
    try {
-
      deleteState();
      createSourceDataWithDifferentBlockSize();

@ -833,16 +835,18 @@ public void testCopyFailOnBlockSizeDifference() {

      for (Path path : pathList) {
        final FileStatus fileStatus = fs.getFileStatus(path);
-        copyMapper.map(new Text(DistCpUtils.getRelativePath(new Path(SOURCE_PATH), path)),
-            new CopyListingFileStatus(fileStatus), context);
+        copyMapper.map(new Text(DistCpUtils.getRelativePath(new Path(SOURCE_PATH),
+            path)), new CopyListingFileStatus(fileStatus), context);
      }

      Assert.fail("Copy should have failed because of block-size difference.");
    }
    catch (Exception exception) {
      // Check that the exception suggests the use of -pb/-skipCrc.
-      Assert.assertTrue("Failure exception should have suggested the use of -pb.", exception.getCause().getCause().getMessage().contains("pb"));
-      Assert.assertTrue("Failure exception should have suggested the use of -skipCrc.", exception.getCause().getCause().getMessage().contains("skipCrc"));
+      Assert.assertTrue("Failure exception should have suggested the use of -pb.",
+          exception.getCause().getCause().getMessage().contains("pb"));
+      Assert.assertTrue("Failure exception should have suggested the use of -skipCrc.",
+          exception.getCause().getCause().getMessage().contains("skipCrc"));
    }
  }