HDFS-12255. Block Storage: Cblock should generated unique trace ID for the ops. Contributed by Mukul Kumar Singh.

2017-08-15 10:57:11 -07:00 · 2017-08-15 10:57:11 -07:00 · 6a16d7c7ab
parent e61530f5d9
commit 6a16d7c7ab
4 changed files with 46 additions and 47 deletions
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/cblock/jscsiHelper/BlockWriterTask.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/cblock/jscsiHelper/BlockWriterTask.java
@ -93,9 +93,9 @@ public class BlockWriterTask implements Runnable {
      long endTime = Time.monotonicNow();
      Preconditions.checkState(data.length > 0, "Block data is zero length");
      startTime = Time.monotonicNow();
-      // BUG: fix the trace ID.
      ContainerProtocolCalls.writeSmallFile(client, containerName,
-          Long.toString(block.getBlockID()), data, "");
+          Long.toString(block.getBlockID()), data,
+          flusher.getTraceID(new File(dbPath), block.getBlockID()));
      endTime = Time.monotonicNow();
      flusher.getTargetMetrics().updateContainerWriteLatency(
          endTime - startTime);
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/cblock/jscsiHelper/ContainerCacheFlusher.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/cblock/jscsiHelper/ContainerCacheFlusher.java
@ -28,6 +28,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.ozone.OzoneConsts;
 import org.apache.hadoop.scm.XceiverClientManager;
 import org.apache.hadoop.scm.container.common.helpers.Pipeline;
+import org.apache.hadoop.util.Time;
 import org.apache.hadoop.utils.LevelDBStore;
 import org.iq80.leveldb.Options;
 import org.slf4j.Logger;
@ -37,6 +38,8 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
 import java.nio.ByteBuffer;
 import java.nio.channels.ReadableByteChannel;
 import java.nio.file.Files;
@ -45,6 +48,7 @@ import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.UUID;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
@ -103,6 +107,7 @@ public class ContainerCacheFlusher implements Runnable {
  private AtomicBoolean shutdown;
  private final long levelDBCacheSize;
  private final int maxRetryCount;
+  private final String tracePrefix;

  private final ConcurrentMap<String, FinishCounter> finishCountMap;

@ -158,6 +163,7 @@ public class ContainerCacheFlusher implements Runnable {
    this.maxRetryCount =
        config.getInt(CBlockConfigKeys.DFS_CBLOCK_CACHE_MAX_RETRY_KEY,
            CBlockConfigKeys.DFS_CBLOCK_CACHE_MAX_RETRY_DEFAULT);
+    this.tracePrefix = getTracePrefix();
  }

  private void checkExistingLog(String prefixFileName, File dbPath) {
@ -436,6 +442,40 @@ public class ContainerCacheFlusher implements Runnable {
    LOG.info("Exiting flusher");
  }

+  /**
+   * Tries to get the local host IP Address as trace prefix
+   * for creating trace IDs, otherwise uses a random UUID for it.
+   */
+  private static String getTracePrefix() {
+    String tmp;
+    try {
+      tmp = InetAddress.getLocalHost().getHostAddress();
+    } catch (UnknownHostException ex) {
+      tmp = UUID.randomUUID().toString();
+      LOG.error("Unable to read the host address. Using a GUID for " +
+          "hostname:{} ", tmp, ex);
+    }
+    return tmp;
+  }
+
+  /**
+   * We create a trace ID to make it easy to debug issues.
+   * A trace ID is in IPAddress:UserName:VolumeName:blockID:second format.
+   *
+   * This will get written down on the data node if we get any failures, so
+   * with this trace ID we can correlate cBlock failures across machines.
+   *
+   * @param blockID - Block ID
+   * @return trace ID
+   */
+  public String getTraceID(File dbPath, long blockID) {
+    String volumeName = dbPath.getName();
+    String userName = dbPath.getParentFile().getName();
+    // mapping to seconds to make the string smaller.
+    return tracePrefix + ":" + userName + ":" + volumeName
+        + ":" + blockID + ":" + Time.monotonicNow() / 1000;
+  }
+
  /**
   * Keeps a Reference counted DB that we close only when the total Reference
   * has gone to zero.
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/cblock/jscsiHelper/cache/impl/AsyncBlockWriter.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/cblock/jscsiHelper/cache/impl/AsyncBlockWriter.java
@ -174,9 +174,9 @@ public class AsyncBlockWriter {
        long startTime = Time.monotonicNow();
        client = parentCache.getClientManager()
            .acquireClient(parentCache.getPipeline(block.getBlockID()));
-        // BUG: fix the trace ID.
        ContainerProtocolCalls.writeSmallFile(client, containerName,
-            Long.toString(block.getBlockID()), block.getData().array(), "");
+            Long.toString(block.getBlockID()), block.getData().array(),
+            parentCache.getTraceID(block.getBlockID()));
        long endTime = Time.monotonicNow();
        if (parentCache.isTraceEnabled()) {
          String datahash = DigestUtils.sha256Hex(block.getData().array());
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/cblock/jscsiHelper/cache/impl/CBlockLocalCache.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/cblock/jscsiHelper/cache/impl/CBlockLocalCache.java
@ -27,23 +27,19 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.scm.XceiverClientManager;
 import org.apache.hadoop.scm.container.common.helpers.Pipeline;
 import org.apache.hadoop.cblock.jscsiHelper.CBlockTargetMetrics;
-import org.apache.hadoop.util.Time;
 import org.apache.hadoop.utils.LevelDBStore;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.File;
 import java.io.IOException;
-import java.net.InetAddress;
 import java.net.URI;
 import java.net.URISyntaxException;
-import java.net.UnknownHostException;
 import java.nio.file.FileStore;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.List;
-import java.util.UUID;

 import static org.apache.hadoop.cblock.CBlockConfigKeys
    .DFS_CBLOCK_DISK_CACHE_PATH_DEFAULT;
@ -74,7 +70,7 @@ public class CBlockLocalCache implements CacheModule {
  private final LevelDBStore cacheDB;

  /**
-   * Asyncblock writer updates the cacheDB and writes the blocks async to
+   * AsyncBlock writer updates the cacheDB and writes the blocks async to
   * remote containers.
   */
  private final AsyncBlockWriter blockWriter;
@ -85,17 +81,8 @@ public class CBlockLocalCache implements CacheModule {
   * update the cacheDB.
   */
  private final SyncBlockReader blockReader;
-  /**
-   * We create a trace ID to make it easy to debug issues.
-   * A trace ID is in the following format. IPAddress:VolumeName:blockID:second
-   * <p>
-   * This will get written down on the data node if we get any failures, so
-   * with this trace ID we can correlate cBlock failures across machines.
-   */
  private final String userName;
  private final String volumeName;
-  private final String ipAddressString;
-  private final String tracePrefix;

  /**
   * From a block ID we are able to get the pipeline by indexing this array.
@ -160,8 +147,6 @@ public class CBlockLocalCache implements CacheModule {
    cacheDB = flusher.getCacheDB(dbPath.toString());
    this.containerList = containerPipelines.toArray(new
        Pipeline[containerPipelines.size()]);
-    this.ipAddressString = getHostIP();
-    this.tracePrefix = ipAddressString + ":" + this.volumeName;
    this.volumeSize = volumeSize;

    blockWriter = new AsyncBlockWriter(conf, this);
@ -325,22 +310,6 @@ public class CBlockLocalCache implements CacheModule {
    return false;
  }

-
-  /**
-   * Tries to get the local host IP Address for creating trace IDs.
-   */
-  private String getHostIP() {
-    String tmp;
-    try {
-      tmp = InetAddress.getLocalHost().toString();
-    } catch (UnknownHostException ex) {
-      tmp = UUID.randomUUID().toString();
-      LOG.error("Unable to read the host address. Using a GUID for " +
-          "hostname:{} ", tmp, ex);
-    }
-    return tmp;
-  }
-
  /**
   * Returns the local cache DB.
   *
@ -397,18 +366,8 @@ public class CBlockLocalCache implements CacheModule {
    return containerList[containerIdx];
  }

-  /**
-   * Returns a traceID based in Block ID.
-   * The format is HostIP:VolumeName:BlockID:timeStamp, in case of error this
-   * will be logged on the container side.
-   *
-   * @param blockID - Block ID
-   * @return trace ID
-   */
  String getTraceID(long blockID) {
-    // mapping to seconds to make the string smaller.
-    return this.tracePrefix + ":" + blockID + ":"
-        + Time.monotonicNow() / 1000;
+    return flusher.getTraceID(dbPath, blockID);
  }

  /**