From b1ec7afc12c4718703a858ba15391516c613803d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20L=C3=A9aut=C3=A9?=
Date: Fri, 6 Feb 2015 14:00:40 -0800
Subject: [PATCH] Sort HadoopIndexer rows by time+dim bucket to help reduce spilling

---
 .../indexer/HadoopDruidIndexerMapper.java   |  9 ++++++---
 .../io/druid/indexer/IndexGeneratorJob.java | 22 +++++++++++++++++++++-
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/indexing-hadoop/src/main/java/io/druid/indexer/HadoopDruidIndexerMapper.java b/indexing-hadoop/src/main/java/io/druid/indexer/HadoopDruidIndexerMapper.java
index 9186d4b304e..556432a64ee 100644
--- a/indexing-hadoop/src/main/java/io/druid/indexer/HadoopDruidIndexerMapper.java
+++ b/indexing-hadoop/src/main/java/io/druid/indexer/HadoopDruidIndexerMapper.java
@@ -32,6 +32,7 @@ public abstract class HadoopDruidIndexerMapper extends Mapper<
 {
   private HadoopDruidIndexerConfig config;
   private StringInputRowParser parser;
+  protected GranularitySpec granularitySpec;
 
   @Override
   protected void setup(Context context)
@@ -39,6 +40,7 @@ public abstract class HadoopDruidIndexerMapper extends Mapper<
   {
     config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
     parser = config.getParser();
+    granularitySpec = config.getGranularitySpec();
   }
 
   public HadoopDruidIndexerConfig getConfig()
@@ -69,9 +71,10 @@ public abstract class HadoopDruidIndexerMapper extends Mapper<
         throw e;
       }
     }
-    GranularitySpec spec = config.getGranularitySpec();
-    if (!spec.bucketIntervals().isPresent() || spec.bucketInterval(new DateTime(inputRow.getTimestampFromEpoch()))
-                                                   .isPresent()) {
+
+    if (!granularitySpec.bucketIntervals().isPresent()
+        || granularitySpec.bucketInterval(new DateTime(inputRow.getTimestampFromEpoch()))
+                          .isPresent()) {
       innerMap(inputRow, value, context);
     }
   }
diff --git a/indexing-hadoop/src/main/java/io/druid/indexer/IndexGeneratorJob.java b/indexing-hadoop/src/main/java/io/druid/indexer/IndexGeneratorJob.java
index bb50ceb0749..6ba2c35b9f1 100644
--- a/indexing-hadoop/src/main/java/io/druid/indexer/IndexGeneratorJob.java
+++ b/indexing-hadoop/src/main/java/io/druid/indexer/IndexGeneratorJob.java
@@ -26,6 +26,8 @@ import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Sets;
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
 import com.google.common.io.Closeables;
 import com.google.common.primitives.Longs;
 import com.metamx.common.IAE;
@@ -34,7 +36,9 @@ import com.metamx.common.guava.CloseQuietly;
 import com.metamx.common.logger.Logger;
 import io.druid.collections.StupidPool;
 import io.druid.data.input.InputRow;
+import io.druid.data.input.Rows;
 import io.druid.data.input.impl.StringInputRowParser;
+import io.druid.granularity.QueryGranularity;
 import io.druid.offheap.OffheapBufferPool;
 import io.druid.query.aggregation.AggregatorFactory;
 import io.druid.segment.IndexIO;
@@ -214,6 +218,8 @@ public class IndexGeneratorJob implements Jobby
 
   public static class IndexGeneratorMapper extends HadoopDruidIndexerMapper
   {
+    private static final HashFunction hashFunction = Hashing.murmur3_128();
+
     @Override
     protected void innerMap(
         InputRow inputRow,
@@ -228,10 +234,24 @@ public class IndexGeneratorJob implements Jobby
         throw new ISE("WTF?! No bucket found for row: %s", inputRow);
       }
 
+      final long truncatedTimestamp = granularitySpec.getQueryGranularity().truncate(inputRow.getTimestampFromEpoch());
+      final byte[] hashedDimensions = hashFunction.hashBytes(
+          HadoopDruidIndexerConfig.jsonMapper.writeValueAsBytes(
+              Rows.toGroupKey(
+                  truncatedTimestamp,
+                  inputRow
+              )
+          )
+      ).asBytes();
+
       context.write(
           new SortableBytes(
               bucket.get().toGroupKey(),
-              Longs.toByteArray(inputRow.getTimestampFromEpoch())
+              // sort rows by truncated timestamp and hashed dimensions to help reduce spilling on the reducer side
+              ByteBuffer.allocate(Longs.BYTES + hashedDimensions.length)
+                        .putLong(truncatedTimestamp)
+                        .put(hashedDimensions)
+                        .array()
           ).toBytesWritable(),
           text
       );
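
Note, not part of the patch: the sketch below illustrates the sort-key layout the mapper now emits, using only Guava and the JDK. The class name SortKeySketch, the helper buildSortKey, and the literal JSON string standing in for the jsonMapper-serialized Rows.toGroupKey(...) output are illustrative assumptions, not code from the Druid tree.

// Standalone sketch: builds the [truncated timestamp][murmur3 hash of group key]
// sort key described by the patch. buildSortKey and the sample group-key bytes
// are illustrative assumptions, not Druid APIs.
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.common.primitives.Longs;

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

public class SortKeySketch
{
  private static final HashFunction HASH_FUNCTION = Hashing.murmur3_128();

  // Lay out an 8-byte truncated timestamp followed by the 16-byte murmur3_128 hash
  // of the serialized group key, mirroring the ByteBuffer logic added in innerMap.
  static byte[] buildSortKey(long truncatedTimestamp, byte[] serializedGroupKey)
  {
    final byte[] hashedDimensions = HASH_FUNCTION.hashBytes(serializedGroupKey).asBytes();
    return ByteBuffer.allocate(Longs.BYTES + hashedDimensions.length)
                     .putLong(truncatedTimestamp)
                     .put(hashedDimensions)
                     .array();
  }

  public static void main(String[] args)
  {
    // Rows that share a truncated timestamp and group key produce identical sort
    // keys, so they sort next to each other on the reducer side.
    final long truncatedTimestamp = 1423180800000L; // 2015-02-06T00:00:00Z, i.e. DAY granularity
    final byte[] groupKey = "{\"host\":\"a.example.com\",\"page\":\"/index\"}"
        .getBytes(StandardCharsets.UTF_8);
    final byte[] sortKey = buildSortKey(truncatedTimestamp, groupKey);
    System.out.println("sort key length = " + sortKey.length); // 8 + 16 = 24 bytes
  }
}

Because the reducer key now starts with the bucketed (query-granularity-truncated) timestamp followed by a 128-bit dimension hash, rows that will aggregate together compare as equal byte sequences and arrive adjacent during the shuffle sort, which is what helps reduce spilling on the reducer side.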