diff --git a/processing/src/main/java/io/druid/segment/IndexMaker.java b/processing/src/main/java/io/druid/segment/IndexMaker.java index f88a25bf573..33be9558174 100644 --- a/processing/src/main/java/io/druid/segment/IndexMaker.java +++ b/processing/src/main/java/io/druid/segment/IndexMaker.java @@ -23,7 +23,6 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Function; import com.google.common.base.Objects; import com.google.common.base.Predicate; -import com.google.common.base.Predicates; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.collect.Iterators; @@ -91,7 +90,6 @@ import java.nio.LongBuffer; import java.util.AbstractList; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -463,6 +461,7 @@ public class IndexMaker ); final Map dimIndexes = Maps.newHashMap(); + final Map dimensionCardinalities = Maps.newHashMap(); final Map> dimensionValuesLookup = Maps.newHashMap(); final ArrayList> dimConversions = Lists.newArrayListWithCapacity(adapters.size()); final Set skippedDimensions = Sets.newHashSet(); @@ -476,6 +475,7 @@ public class IndexMaker dimConversions, dimIndexes, skippedDimensions, + dimensionCardinalities, dimensionValuesLookup ); @@ -503,6 +503,7 @@ public class IndexMaker skippedDimensions, theRows, columnCapabilities, + dimensionCardinalities, dimensionValuesLookup, rowNumConversions ); @@ -527,6 +528,7 @@ public class IndexMaker final List> dimConversions, final Map dimIndexes, final Set skippedDimensions, + final Map dimensionCardinalities, final Map> dimensionValuesLookup ) { @@ -557,7 +559,28 @@ public class IndexMaker // sort all dimension values and treat all null values as empty strings final Iterable dimensionValues = CombiningIterable.createSplatted( - dimValueLookups, + Iterables.transform( + dimValueLookups, + new Function, Iterable>() + { + @Override + public Iterable apply(Indexed indexed) + { + return Iterables.transform( + indexed, + new Function() + { + @Override + public String apply(@Nullable String input) + { + return (input == null) ? "" : input; + } + } + ); + } + } + ) + , Ordering.natural().nullsFirst() ); @@ -572,6 +595,9 @@ public class IndexMaker ++cardinality; } + + dimensionCardinalities.put(dimension, cardinality); + if (cardinality == 0) { log.info("Skipping [%s], it is empty!", dimension); skippedDimensions.add(dimension); @@ -749,6 +775,7 @@ public class IndexMaker final Set skippedDimensions, final Iterable theRows, final Map columnCapabilities, + final Map dimensionCardinalities, final Map> dimensionValuesLookup, final List rowNumConversions ) throws IOException @@ -771,6 +798,7 @@ public class IndexMaker dimIndex, dimension, columnCapabilities, + dimensionCardinalities, dimensionValuesLookup, rowNumConversions ); @@ -788,6 +816,7 @@ public class IndexMaker final int dimIndex, final String dimension, final Map columnCapabilities, + final Map dimensionCardinalities, final Map> dimensionValuesLookup, final List rowNumConversions ) throws IOException @@ -818,6 +847,7 @@ public class IndexMaker ConciseSet nullSet = null; int rowCount = 0; + for (Rowboat theRow : theRows) { if (dimIndex > theRow.getDims().length) { if (nullSet == null) { @@ -838,44 +868,149 @@ public class IndexMaker rowCount++; } - GenericIndexed dictionary = null; final Iterable dimensionValues = dimensionValuesLookup.get(dimension); + GenericIndexed dictionary = GenericIndexed.fromIterable( + dimensionValues, + GenericIndexed.stringStrategy + ); boolean bumpDictionary = false; if (hasMultipleValues) { - List> vals = ((MultiValColumnDictionaryEntryStore) adder).get(); - multiValCol = VSizeIndexed.fromIterable( - FunctionalIterable - .create(vals) - //.filter(Predicates.>notNull()) - .transform( - new Function, VSizeIndexedInts>() - { - @Override - public VSizeIndexedInts apply(List input) - { - if (input == null) { - return VSizeIndexedInts.empty(); + final List> vals = ((MultiValColumnDictionaryEntryStore) adder).get(); + if (nullSet != null) { + log.info("Dimension[%s] has null rows.", dimension); + + if (Iterables.getFirst(dimensionValues, "") != null) { + bumpDictionary = true; + log.info("Dimension[%s] has no null value in the dictionary, expanding...", dimension); + + final List nullList = Lists.newArrayList(); + nullList.add(null); + + dictionary = GenericIndexed.fromIterable( + Iterables.concat(nullList, dimensionValues), + GenericIndexed.stringStrategy + ); + + final int dictionarySize = dictionary.size(); + multiValCol = VSizeIndexed.fromIterable( + FunctionalIterable + .create(vals) + .transform( + new Function, VSizeIndexedInts>() + { + @Override + public VSizeIndexedInts apply(final List input) + { + if (input == null) { + return VSizeIndexedInts.fromList( + new AbstractList() + { + @Override + public Integer get(int index) + { + return 0; + } + + @Override + public int size() + { + return 1; + } + }, dictionarySize + ); + } + return VSizeIndexedInts.fromList( + new AbstractList() + { + @Override + public Integer get(int index) + { + Integer val = input.get(index); + if (val == null) { + return 0; + } + return val + 1; + } + + @Override + public int size() + { + return input.size(); + } + }, + dictionarySize + ); + } + } + ) + ); + } else { + final int dictionarySize = dictionary.size(); + multiValCol = VSizeIndexed.fromIterable( + FunctionalIterable + .create(vals) + .transform( + new Function, VSizeIndexedInts>() + { + @Override + public VSizeIndexedInts apply(List input) + { + if (input == null) { + //return null; + return VSizeIndexedInts.fromList( + new AbstractList() + { + @Override + public Integer get(int index) + { + return 0; + } + + @Override + public int size() + { + return 1; + } + }, dictionarySize + ); + } + return VSizeIndexedInts.fromList( + input, + dictionarySize + ); + } + } + ) + ); + } + } else { + final int dictionarySize = dictionary.size(); + multiValCol = VSizeIndexed.fromIterable( + FunctionalIterable + .create(vals) + .transform( + new Function, VSizeIndexedInts>() + { + @Override + public VSizeIndexedInts apply(List input) + { + return VSizeIndexedInts.fromList( + input, + dictionarySize + ); } - return VSizeIndexedInts.fromList( - input, - Collections.max(input) - ); } - } - ) - ); - dictionary = GenericIndexed.fromIterable( - dimensionValues, - GenericIndexed.stringStrategy - ); + ) + ); + } } else { final List vals = ((SingleValColumnDictionaryEntryStore) adder).get(); if (nullSet != null) { log.info("Dimension[%s] has null rows.", dimension); - if (Iterables.getFirst(dimensionValues, "") != null) { + if (Iterables.getFirst(dimensionValues, null) != null) { bumpDictionary = true; log.info("Dimension[%s] has no null value in the dictionary, expanding...", dimension); @@ -899,6 +1034,27 @@ public class IndexMaker return val + 1; } + @Override + public int size() + { + return vals.size(); + } + }, dictionary.size() + ); + } else { + singleValCol = VSizeIndexedInts.fromList( + new AbstractList() + { + @Override + public Integer get(int index) + { + Integer val = vals.get(index); + if (val == null) { + return 0; + } + return val; + } + @Override public int size() { @@ -908,10 +1064,6 @@ public class IndexMaker ); } } else { - dictionary = GenericIndexed.fromIterable( - dimensionValues, - GenericIndexed.stringStrategy - ); singleValCol = VSizeIndexedInts.fromList(vals, dictionary.size()); } } @@ -942,65 +1094,49 @@ public class IndexMaker } GenericIndexed bitmaps; - if (!hasMultipleValues) { - if (nullSet != null) { - final ImmutableConciseSet theNullSet = ImmutableConciseSet.newImmutableFromMutable(nullSet); - if (bumpDictionary) { - bitmaps = GenericIndexed.fromIterable( - Iterables.concat( - Arrays.asList(theNullSet), - Iterables.transform( - conciseSets, - new Function() - { - @Override - public ImmutableConciseSet apply(ConciseSet input) - { - return ImmutableConciseSet.newImmutableFromMutable(input); - } - } - ) - ), - ConciseCompressedIndexedInts.objectStrategy - ); - } else { - Iterable immutableConciseSets = Iterables.transform( - conciseSets, - new Function() - { - @Override - public ImmutableConciseSet apply(ConciseSet input) - { - return ImmutableConciseSet.newImmutableFromMutable(input); - } - } - ); - bitmaps = GenericIndexed.fromIterable( - Iterables.concat( - Arrays.asList( - ImmutableConciseSet.union( - theNullSet, - Iterables.getFirst(immutableConciseSets, null) - ) - ), - Iterables.skip(immutableConciseSets, 1) - ), - ConciseCompressedIndexedInts.objectStrategy - ); - } - } else { + if (nullSet != null) { + final ImmutableConciseSet theNullSet = ImmutableConciseSet.newImmutableFromMutable(nullSet); + if (bumpDictionary) { bitmaps = GenericIndexed.fromIterable( - Iterables.transform( - conciseSets, - new Function() - { - @Override - public ImmutableConciseSet apply(ConciseSet input) - { - return ImmutableConciseSet.newImmutableFromMutable(input); - } - } + Iterables.concat( + Arrays.asList(theNullSet), + Iterables.transform( + conciseSets, + new Function() + { + @Override + public ImmutableConciseSet apply(ConciseSet input) + { + return ImmutableConciseSet.newImmutableFromMutable(input); + } + } + ) + ), + ConciseCompressedIndexedInts.objectStrategy + ); + } else { + Iterable immutableConciseSets = Iterables.transform( + conciseSets, + new Function() + { + @Override + public ImmutableConciseSet apply(ConciseSet input) + { + return ImmutableConciseSet.newImmutableFromMutable(input); + } + } + ); + + bitmaps = GenericIndexed.fromIterable( + Iterables.concat( + Arrays.asList( + ImmutableConciseSet.union( + theNullSet, + Iterables.getFirst(immutableConciseSets, null) + ) + ), + Iterables.skip(immutableConciseSets, 1) ), ConciseCompressedIndexedInts.objectStrategy ); @@ -1033,12 +1169,15 @@ public class IndexMaker int dimValIndex = 0; for (String dimVal : dimensionValuesLookup.get(dimension)) { if (hasSpatialIndexes) { - List stringCoords = Lists.newArrayList(SPLITTER.split(dimVal)); - float[] coords = new float[stringCoords.size()]; - for (int j = 0; j < coords.length; j++) { - coords[j] = Float.valueOf(stringCoords.get(j)); + if (dimVal != null && !dimVal.isEmpty()) { + List stringCoords = Lists.newArrayList(SPLITTER.split(dimVal)); + float[] coords = new float[stringCoords.size()]; + for (int j = 0; j < coords.length; j++) { + coords[j] = Float.valueOf(stringCoords.get(j)); + } + tree.insert(coords, conciseSets.get(dimValIndex)); } - tree.insert(coords, conciseSets.get(dimValIndex++)); + dimValIndex++; } } if (hasSpatialIndexes) { @@ -1517,7 +1656,7 @@ public class IndexMaker final Rowboat retVal = new Rowboat( lhs.getTimestamp(), lhs.getDims(), - lhs.getMetrics(), + metrics, lhs.getRowNum() ); diff --git a/processing/src/test/java/io/druid/segment/SchemalessTestFull.java b/processing/src/test/java/io/druid/segment/SchemalessTestFull.java index 58a391c7515..0efa20822ca 100644 --- a/processing/src/test/java/io/druid/segment/SchemalessTestFull.java +++ b/processing/src/test/java/io/druid/segment/SchemalessTestFull.java @@ -1174,6 +1174,15 @@ public class SchemalessTestFull new DateTime("2011-01-12T00:00:00.000Z"), new TopNResultValue( Arrays.>asList( + ImmutableMap.builder() + .put("provider", "") + .put("rows", 6L) + .put("index", 400.0D) + .put("addRowsIndexConstant", 407.0D) + .put("uniques", 0.0) + .put("maxIndex", 100.0) + .put("minIndex", 0.0) + .build(), ImmutableMap.builder() .put("provider", "spot") .put("rows", 4L) @@ -1183,15 +1192,6 @@ public class SchemalessTestFull .put("maxIndex", 100.0) .put("minIndex", 100.0) .build(), - ImmutableMap.builder() - .put("provider", "") - .put("rows", 3L) - .put("index", 200.0D) - .put("addRowsIndexConstant", 204.0D) - .put("uniques", 0.0) - .put("maxIndex", 100.0) - .put("minIndex", 0.0) - .build(), ImmutableMap.builder() .put("provider", "total_market") .put("rows", 2L) @@ -1386,7 +1386,7 @@ public class SchemalessTestFull .build(); failMsg += " timeseries "; - HashMap context = new HashMap(); + HashMap context = new HashMap(); Iterable> actualResults = Sequences.toList( runner.run(query, context), Lists.>newArrayList() @@ -1420,7 +1420,7 @@ public class SchemalessTestFull .build(); failMsg += " filtered timeseries "; - HashMap context = new HashMap(); + HashMap context = new HashMap(); Iterable> actualResults = Sequences.toList( runner.run(query, context), Lists.>newArrayList() @@ -1453,7 +1453,7 @@ public class SchemalessTestFull .build(); failMsg += " topN "; - HashMap context = new HashMap(); + HashMap context = new HashMap(); Iterable> actualResults = Sequences.toList( runner.run(query, context), Lists.>newArrayList() @@ -1487,7 +1487,7 @@ public class SchemalessTestFull .build(); failMsg += " filtered topN "; - HashMap context = new HashMap(); + HashMap context = new HashMap(); Iterable> actualResults = Sequences.toList( runner.run(query, context), Lists.>newArrayList() @@ -1505,7 +1505,7 @@ public class SchemalessTestFull .build(); failMsg += " search "; - HashMap context = new HashMap(); + HashMap context = new HashMap(); Iterable> actualResults = Sequences.toList( runner.run(query, context), Lists.>newArrayList() @@ -1524,7 +1524,7 @@ public class SchemalessTestFull .build(); failMsg += " filtered search "; - HashMap context = new HashMap(); + HashMap context = new HashMap(); Iterable> actualResults = Sequences.toList( runner.run(query, context), Lists.>newArrayList() @@ -1543,7 +1543,7 @@ public class SchemalessTestFull .build(); failMsg += " timeBoundary "; - HashMap context = new HashMap(); + HashMap context = new HashMap(); Iterable> actualResults = Sequences.toList( runner.run(query, context), Lists.>newArrayList()