fix all manner of brokenness from nulls and empty strings

This commit is contained in:
fjy 2014-08-14 11:11:57 -07:00
parent 1adec23126
commit 18d3acd3a8
2 changed files with 251 additions and 112 deletions

View File

@ -23,7 +23,6 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Function; import com.google.common.base.Function;
import com.google.common.base.Objects; import com.google.common.base.Objects;
import com.google.common.base.Predicate; import com.google.common.base.Predicate;
import com.google.common.base.Predicates;
import com.google.common.base.Splitter; import com.google.common.base.Splitter;
import com.google.common.collect.Iterables; import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators; import com.google.common.collect.Iterators;
@ -91,7 +90,6 @@ import java.nio.LongBuffer;
import java.util.AbstractList; import java.util.AbstractList;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -463,6 +461,7 @@ public class IndexMaker
); );
final Map<String, Integer> dimIndexes = Maps.newHashMap(); final Map<String, Integer> dimIndexes = Maps.newHashMap();
final Map<String, Integer> dimensionCardinalities = Maps.newHashMap();
final Map<String, Iterable<String>> dimensionValuesLookup = Maps.newHashMap(); final Map<String, Iterable<String>> dimensionValuesLookup = Maps.newHashMap();
final ArrayList<Map<String, IntBuffer>> dimConversions = Lists.newArrayListWithCapacity(adapters.size()); final ArrayList<Map<String, IntBuffer>> dimConversions = Lists.newArrayListWithCapacity(adapters.size());
final Set<String> skippedDimensions = Sets.newHashSet(); final Set<String> skippedDimensions = Sets.newHashSet();
@ -476,6 +475,7 @@ public class IndexMaker
dimConversions, dimConversions,
dimIndexes, dimIndexes,
skippedDimensions, skippedDimensions,
dimensionCardinalities,
dimensionValuesLookup dimensionValuesLookup
); );
@ -503,6 +503,7 @@ public class IndexMaker
skippedDimensions, skippedDimensions,
theRows, theRows,
columnCapabilities, columnCapabilities,
dimensionCardinalities,
dimensionValuesLookup, dimensionValuesLookup,
rowNumConversions rowNumConversions
); );
@ -527,6 +528,7 @@ public class IndexMaker
final List<Map<String, IntBuffer>> dimConversions, final List<Map<String, IntBuffer>> dimConversions,
final Map<String, Integer> dimIndexes, final Map<String, Integer> dimIndexes,
final Set<String> skippedDimensions, final Set<String> skippedDimensions,
final Map<String, Integer> dimensionCardinalities,
final Map<String, Iterable<String>> dimensionValuesLookup final Map<String, Iterable<String>> dimensionValuesLookup
) )
{ {
@ -557,7 +559,28 @@ public class IndexMaker
// sort all dimension values and treat all null values as empty strings // sort all dimension values and treat all null values as empty strings
final Iterable<String> dimensionValues = CombiningIterable.createSplatted( final Iterable<String> dimensionValues = CombiningIterable.createSplatted(
Iterables.transform(
dimValueLookups, dimValueLookups,
new Function<Indexed<String>, Iterable<String>>()
{
@Override
public Iterable<String> apply(Indexed<String> indexed)
{
return Iterables.transform(
indexed,
new Function<String, String>()
{
@Override
public String apply(@Nullable String input)
{
return (input == null) ? "" : input;
}
}
);
}
}
)
,
Ordering.<String>natural().nullsFirst() Ordering.<String>natural().nullsFirst()
); );
@ -572,6 +595,9 @@ public class IndexMaker
++cardinality; ++cardinality;
} }
dimensionCardinalities.put(dimension, cardinality);
if (cardinality == 0) { if (cardinality == 0) {
log.info("Skipping [%s], it is empty!", dimension); log.info("Skipping [%s], it is empty!", dimension);
skippedDimensions.add(dimension); skippedDimensions.add(dimension);
@ -749,6 +775,7 @@ public class IndexMaker
final Set<String> skippedDimensions, final Set<String> skippedDimensions,
final Iterable<Rowboat> theRows, final Iterable<Rowboat> theRows,
final Map<String, ColumnCapabilitiesImpl> columnCapabilities, final Map<String, ColumnCapabilitiesImpl> columnCapabilities,
final Map<String, Integer> dimensionCardinalities,
final Map<String, Iterable<String>> dimensionValuesLookup, final Map<String, Iterable<String>> dimensionValuesLookup,
final List<IntBuffer> rowNumConversions final List<IntBuffer> rowNumConversions
) throws IOException ) throws IOException
@ -771,6 +798,7 @@ public class IndexMaker
dimIndex, dimIndex,
dimension, dimension,
columnCapabilities, columnCapabilities,
dimensionCardinalities,
dimensionValuesLookup, dimensionValuesLookup,
rowNumConversions rowNumConversions
); );
@ -788,6 +816,7 @@ public class IndexMaker
final int dimIndex, final int dimIndex,
final String dimension, final String dimension,
final Map<String, ColumnCapabilitiesImpl> columnCapabilities, final Map<String, ColumnCapabilitiesImpl> columnCapabilities,
final Map<String, Integer> dimensionCardinalities,
final Map<String, Iterable<String>> dimensionValuesLookup, final Map<String, Iterable<String>> dimensionValuesLookup,
final List<IntBuffer> rowNumConversions final List<IntBuffer> rowNumConversions
) throws IOException ) throws IOException
@ -818,6 +847,7 @@ public class IndexMaker
ConciseSet nullSet = null; ConciseSet nullSet = null;
int rowCount = 0; int rowCount = 0;
for (Rowboat theRow : theRows) { for (Rowboat theRow : theRows) {
if (dimIndex > theRow.getDims().length) { if (dimIndex > theRow.getDims().length) {
if (nullSet == null) { if (nullSet == null) {
@ -838,16 +868,88 @@ public class IndexMaker
rowCount++; rowCount++;
} }
GenericIndexed<String> dictionary = null;
final Iterable<String> dimensionValues = dimensionValuesLookup.get(dimension); final Iterable<String> dimensionValues = dimensionValuesLookup.get(dimension);
GenericIndexed<String> dictionary = GenericIndexed.fromIterable(
dimensionValues,
GenericIndexed.stringStrategy
);
boolean bumpDictionary = false; boolean bumpDictionary = false;
if (hasMultipleValues) { if (hasMultipleValues) {
List<List<Integer>> vals = ((MultiValColumnDictionaryEntryStore) adder).get(); final List<List<Integer>> vals = ((MultiValColumnDictionaryEntryStore) adder).get();
if (nullSet != null) {
log.info("Dimension[%s] has null rows.", dimension);
if (Iterables.getFirst(dimensionValues, "") != null) {
bumpDictionary = true;
log.info("Dimension[%s] has no null value in the dictionary, expanding...", dimension);
final List<String> nullList = Lists.newArrayList();
nullList.add(null);
dictionary = GenericIndexed.fromIterable(
Iterables.concat(nullList, dimensionValues),
GenericIndexed.stringStrategy
);
final int dictionarySize = dictionary.size();
multiValCol = VSizeIndexed.fromIterable(
FunctionalIterable
.create(vals)
.transform(
new Function<List<Integer>, VSizeIndexedInts>()
{
@Override
public VSizeIndexedInts apply(final List<Integer> input)
{
if (input == null) {
return VSizeIndexedInts.fromList(
new AbstractList<Integer>()
{
@Override
public Integer get(int index)
{
return 0;
}
@Override
public int size()
{
return 1;
}
}, dictionarySize
);
}
return VSizeIndexedInts.fromList(
new AbstractList<Integer>()
{
@Override
public Integer get(int index)
{
Integer val = input.get(index);
if (val == null) {
return 0;
}
return val + 1;
}
@Override
public int size()
{
return input.size();
}
},
dictionarySize
);
}
}
)
);
} else {
final int dictionarySize = dictionary.size();
multiValCol = VSizeIndexed.fromIterable( multiValCol = VSizeIndexed.fromIterable(
FunctionalIterable FunctionalIterable
.create(vals) .create(vals)
//.filter(Predicates.<List<Integer>>notNull())
.transform( .transform(
new Function<List<Integer>, VSizeIndexedInts>() new Function<List<Integer>, VSizeIndexedInts>()
{ {
@ -855,27 +957,60 @@ public class IndexMaker
public VSizeIndexedInts apply(List<Integer> input) public VSizeIndexedInts apply(List<Integer> input)
{ {
if (input == null) { if (input == null) {
return VSizeIndexedInts.empty(); //return null;
return VSizeIndexedInts.fromList(
new AbstractList<Integer>()
{
@Override
public Integer get(int index)
{
return 0;
}
@Override
public int size()
{
return 1;
}
}, dictionarySize
);
} }
return VSizeIndexedInts.fromList( return VSizeIndexedInts.fromList(
input, input,
Collections.max(input) dictionarySize
); );
} }
} }
) )
); );
dictionary = GenericIndexed.fromIterable( }
dimensionValues, } else {
GenericIndexed.stringStrategy final int dictionarySize = dictionary.size();
multiValCol = VSizeIndexed.fromIterable(
FunctionalIterable
.create(vals)
.transform(
new Function<List<Integer>, VSizeIndexedInts>()
{
@Override
public VSizeIndexedInts apply(List<Integer> input)
{
return VSizeIndexedInts.fromList(
input,
dictionarySize
); );
}
}
)
);
}
} else { } else {
final List<Integer> vals = ((SingleValColumnDictionaryEntryStore) adder).get(); final List<Integer> vals = ((SingleValColumnDictionaryEntryStore) adder).get();
if (nullSet != null) { if (nullSet != null) {
log.info("Dimension[%s] has null rows.", dimension); log.info("Dimension[%s] has null rows.", dimension);
if (Iterables.getFirst(dimensionValues, "") != null) { if (Iterables.getFirst(dimensionValues, null) != null) {
bumpDictionary = true; bumpDictionary = true;
log.info("Dimension[%s] has no null value in the dictionary, expanding...", dimension); log.info("Dimension[%s] has no null value in the dictionary, expanding...", dimension);
@ -899,6 +1034,27 @@ public class IndexMaker
return val + 1; return val + 1;
} }
@Override
public int size()
{
return vals.size();
}
}, dictionary.size()
);
} else {
singleValCol = VSizeIndexedInts.fromList(
new AbstractList<Integer>()
{
@Override
public Integer get(int index)
{
Integer val = vals.get(index);
if (val == null) {
return 0;
}
return val;
}
@Override @Override
public int size() public int size()
{ {
@ -908,10 +1064,6 @@ public class IndexMaker
); );
} }
} else { } else {
dictionary = GenericIndexed.fromIterable(
dimensionValues,
GenericIndexed.stringStrategy
);
singleValCol = VSizeIndexedInts.fromList(vals, dictionary.size()); singleValCol = VSizeIndexedInts.fromList(vals, dictionary.size());
} }
} }
@ -942,7 +1094,7 @@ public class IndexMaker
} }
GenericIndexed<ImmutableConciseSet> bitmaps; GenericIndexed<ImmutableConciseSet> bitmaps;
if (!hasMultipleValues) {
if (nullSet != null) { if (nullSet != null) {
final ImmutableConciseSet theNullSet = ImmutableConciseSet.newImmutableFromMutable(nullSet); final ImmutableConciseSet theNullSet = ImmutableConciseSet.newImmutableFromMutable(nullSet);
if (bumpDictionary) { if (bumpDictionary) {
@ -1005,22 +1157,6 @@ public class IndexMaker
ConciseCompressedIndexedInts.objectStrategy ConciseCompressedIndexedInts.objectStrategy
); );
} }
} else {
bitmaps = GenericIndexed.fromIterable(
Iterables.transform(
conciseSets,
new Function<ConciseSet, ImmutableConciseSet>()
{
@Override
public ImmutableConciseSet apply(ConciseSet input)
{
return ImmutableConciseSet.newImmutableFromMutable(input);
}
}
),
ConciseCompressedIndexedInts.objectStrategy
);
}
// Make spatial indexes // Make spatial indexes
ImmutableRTree spatialIndex = null; ImmutableRTree spatialIndex = null;
@ -1033,12 +1169,15 @@ public class IndexMaker
int dimValIndex = 0; int dimValIndex = 0;
for (String dimVal : dimensionValuesLookup.get(dimension)) { for (String dimVal : dimensionValuesLookup.get(dimension)) {
if (hasSpatialIndexes) { if (hasSpatialIndexes) {
if (dimVal != null && !dimVal.isEmpty()) {
List<String> stringCoords = Lists.newArrayList(SPLITTER.split(dimVal)); List<String> stringCoords = Lists.newArrayList(SPLITTER.split(dimVal));
float[] coords = new float[stringCoords.size()]; float[] coords = new float[stringCoords.size()];
for (int j = 0; j < coords.length; j++) { for (int j = 0; j < coords.length; j++) {
coords[j] = Float.valueOf(stringCoords.get(j)); coords[j] = Float.valueOf(stringCoords.get(j));
} }
tree.insert(coords, conciseSets.get(dimValIndex++)); tree.insert(coords, conciseSets.get(dimValIndex));
}
dimValIndex++;
} }
} }
if (hasSpatialIndexes) { if (hasSpatialIndexes) {
@ -1517,7 +1656,7 @@ public class IndexMaker
final Rowboat retVal = new Rowboat( final Rowboat retVal = new Rowboat(
lhs.getTimestamp(), lhs.getTimestamp(),
lhs.getDims(), lhs.getDims(),
lhs.getMetrics(), metrics,
lhs.getRowNum() lhs.getRowNum()
); );

View File

@ -1174,6 +1174,15 @@ public class SchemalessTestFull
new DateTime("2011-01-12T00:00:00.000Z"), new DateTime("2011-01-12T00:00:00.000Z"),
new TopNResultValue( new TopNResultValue(
Arrays.<Map<String, Object>>asList( Arrays.<Map<String, Object>>asList(
ImmutableMap.<String, Object>builder()
.put("provider", "")
.put("rows", 6L)
.put("index", 400.0D)
.put("addRowsIndexConstant", 407.0D)
.put("uniques", 0.0)
.put("maxIndex", 100.0)
.put("minIndex", 0.0)
.build(),
ImmutableMap.<String, Object>builder() ImmutableMap.<String, Object>builder()
.put("provider", "spot") .put("provider", "spot")
.put("rows", 4L) .put("rows", 4L)
@ -1183,15 +1192,6 @@ public class SchemalessTestFull
.put("maxIndex", 100.0) .put("maxIndex", 100.0)
.put("minIndex", 100.0) .put("minIndex", 100.0)
.build(), .build(),
ImmutableMap.<String, Object>builder()
.put("provider", "")
.put("rows", 3L)
.put("index", 200.0D)
.put("addRowsIndexConstant", 204.0D)
.put("uniques", 0.0)
.put("maxIndex", 100.0)
.put("minIndex", 0.0)
.build(),
ImmutableMap.<String, Object>builder() ImmutableMap.<String, Object>builder()
.put("provider", "total_market") .put("provider", "total_market")
.put("rows", 2L) .put("rows", 2L)