mirror of https://github.com/apache/druid.git
improve nested column storage format for broader compatibility (#13568)
Bump the nested column format version. Changes:

* Nested field files are now named by their position in the field paths list, rather than directly by the path itself. This fixes issues where valid JSON properties containing commas or newlines broke the CSV-formatted meta.smoosh file.
* StructuredDataProcessor now deals in NestedPathPart, consistent with other abstract path handling, rather than building jq-syntax strings directly.
* Add a v3 format segment and a test that reads it.
This commit is contained in:

parent 7f3c117e3a
commit 9ae7a36ccd
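Why the rename matters: meta.smoosh is a CSV file mapping each internal file name to its offsets, so an internal file name containing a comma or newline corrupts it. The sketch below is illustrative only and not part of the commit (the class and method names are invented, and String.format stands in for Druid's StringUtils.format); it contrasts the v3 naming scheme this commit removes with the v4 scheme it adds:

// A minimal sketch of the v3 vs. v4 field file naming schemes.
public class FieldFileNameSketch
{
  static final String NESTED_FIELD_PREFIX = "__field_";

  // v3: the field file is named directly by the field's path string,
  // so any character legal in a JSON property name can leak into the name.
  static String v3Name(String fileNameBase, String fieldPath)
  {
    return String.format("%s_%s", fileNameBase, fieldPath);
  }

  // v4: the field file is named by the field's position in the fields list,
  // which is always CSV-safe regardless of the property names in the data.
  static String v4Name(String fileNameBase, int fieldIndex)
  {
    return String.format("%s.%s", fileNameBase, NESTED_FIELD_PREFIX + fieldIndex);
  }

  public static void main(String[] args)
  {
    String path = ".\"a,b\"";                  // a valid JSON property name containing a comma
    System.out.println(v3Name("nest", path));  // nest_."a,b"    -> comma breaks the CSV meta.smoosh
    System.out.println(v4Name("nest", 0));     // nest.__field_0 -> safe for any property name
  }
}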
NestedDataExpressions.java:

@@ -38,6 +38,7 @@ import org.apache.druid.segment.nested.StructuredDataProcessor;
 import javax.annotation.Nullable;
 import javax.inject.Inject;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
@@ -479,7 +480,7 @@ public class NestedDataExpressions
     final StructuredDataProcessor processor = new StructuredDataProcessor()
     {
       @Override
-      public int processLiteralField(String fieldName, Object fieldValue)
+      public int processLiteralField(ArrayList<NestedPathPart> fieldPath, Object fieldValue)
       {
         // do nothing, we only want the list of fields returned by this processor
         return 0;
@@ -500,9 +501,9 @@ public class NestedDataExpressions
-      // maybe in the future ProcessResults should deal in PathFinder.PathPart instead of strings for fields
       StructuredDataProcessor.ProcessResults info = processor.processFields(unwrap(input));
       List<String> transformed = info.getLiteralFields()
-                                     .stream()
-                                     .map(p -> NestedPathFinder.toNormalizedJsonPath(NestedPathFinder.parseJqPath(p)))
-                                     .collect(Collectors.toList());
+                                     .stream()
+                                     .map(NestedPathFinder::toNormalizedJsonPath)
+                                     .collect(Collectors.toList());
       return ExprEval.ofType(
           ExpressionType.STRING_ARRAY,
           transformed
NestedDataColumnIndexer.java:

@@ -35,10 +35,13 @@ import org.apache.druid.segment.nested.GlobalDictionarySortedCollector;
 import org.apache.druid.segment.nested.GlobalDimensionDictionary;
 import org.apache.druid.segment.nested.NestedDataComplexTypeSerde;
 import org.apache.druid.segment.nested.NestedLiteralTypeInfo;
+import org.apache.druid.segment.nested.NestedPathFinder;
+import org.apache.druid.segment.nested.NestedPathPart;
 import org.apache.druid.segment.nested.StructuredData;
 import org.apache.druid.segment.nested.StructuredDataProcessor;
 
 import javax.annotation.Nullable;
+import java.util.ArrayList;
 import java.util.Map;
 import java.util.Objects;
 import java.util.SortedMap;
@@ -56,8 +59,9 @@ public class NestedDataColumnIndexer implements DimensionIndexer<StructuredData,
   protected final StructuredDataProcessor indexerProcessor = new StructuredDataProcessor()
   {
     @Override
-    public int processLiteralField(String fieldName, Object fieldValue)
+    public int processLiteralField(ArrayList<NestedPathPart> fieldPath, Object fieldValue)
     {
+      final String fieldName = NestedPathFinder.toNormalizedJsonPath(fieldPath);
       LiteralFieldIndexer fieldIndexer = fieldIndexers.get(fieldName);
       if (fieldIndexer == null) {
         estimatedFieldKeySize += StructuredDataProcessor.estimateStringSize(fieldName);
CompressedNestedDataComplexColumn.java:

@@ -77,7 +77,7 @@ import java.util.concurrent.ConcurrentHashMap;
  * Implementation of {@link NestedDataComplexColumn} which uses a {@link CompressedVariableSizedBlobColumn} for the
  * 'raw' {@link StructuredData} values and provides selectors for nested 'literal' field columns.
  */
-public final class CompressedNestedDataComplexColumn<TStringDictionary extends Indexed<ByteBuffer>>
+public abstract class CompressedNestedDataComplexColumn<TStringDictionary extends Indexed<ByteBuffer>>
     extends NestedDataComplexColumn
 {
   private final NestedDataColumnMetadata metadata;
@@ -123,6 +123,12 @@ public final class CompressedNestedDataComplexColumn<TStringDictionary extends I
     this.compressedRawColumnSupplier = compressedRawColumnSupplier;
   }
 
+  public abstract List<NestedPathPart> parsePath(String path);
+
+  public abstract String getField(List<NestedPathPart> path);
+
+  public abstract String getFieldFileName(String fileNameBase, String field, int fieldIndex);
+
   public GenericIndexed<String> getFields()
   {
     return fields;
@@ -133,7 +139,7 @@ public final class CompressedNestedDataComplexColumn<TStringDictionary extends I
   {
     List<List<NestedPathPart>> fieldParts = new ArrayList<>(fields.size());
     for (int i = 0; i < fields.size(); i++) {
-      fieldParts.add(NestedPathFinder.parseJqPath(fields.get(i)));
+      fieldParts.add(parsePath(fields.get(i)));
     }
     return fieldParts;
   }
@@ -405,11 +411,6 @@ public final class CompressedNestedDataComplexColumn<TStringDictionary extends I
     return getColumnHolder(field).getCapabilities().isNumeric();
   }
 
-  private String getField(List<NestedPathPart> path)
-  {
-    return NestedPathFinder.toNormalizedJqPath(path);
-  }
-
   private ColumnHolder getColumnHolder(String field)
   {
     return columns.computeIfAbsent(field, this::readNestedFieldColumn);
@@ -421,12 +422,17 @@ public final class CompressedNestedDataComplexColumn<TStringDictionary extends I
     if (fields.indexOf(field) < 0) {
       return null;
     }
-    final NestedLiteralTypeInfo.TypeSet types = fieldInfo.getTypes(fields.indexOf(field));
-    final ByteBuffer dataBuffer = fileMapper.mapFile(
-        NestedDataColumnSerializer.getFieldFileName(metadata.getFileNameBase(), field)
-    );
+    final int fieldIndex = fields.indexOf(field);
+    final NestedLiteralTypeInfo.TypeSet types = fieldInfo.getTypes(fieldIndex);
+    final String fieldFileName = getFieldFileName(metadata.getFileNameBase(), field, fieldIndex);
+    final ByteBuffer dataBuffer = fileMapper.mapFile(fieldFileName);
     if (dataBuffer == null) {
-      throw new ISE("Can't find field [%s] in [%s] file.", field, metadata.getFileNameBase());
+      throw new ISE(
+          "Can't find field [%s] with name [%s] in [%s] file.",
+          field,
+          fieldFileName,
+          metadata.getFileNameBase()
+      );
     }
 
     ColumnBuilder columnBuilder = new ColumnBuilder().setFileMapper(fileMapper);
GlobalDictionaryEncodedFieldColumnWriter.java:

@@ -239,7 +239,7 @@ public abstract class GlobalDictionaryEncodedFieldColumnWriter<T>
         bitmapIndexWriter.writeTo(channel, smoosher);
       }
     };
-    final String fieldFileName = NestedDataColumnSerializer.getFieldFileName(columnName, fieldName);
+    final String fieldFileName = NestedDataColumnSerializer.getInternalFileName(columnName, fieldName);
     final long size = fieldSerializer.getSerializedSize();
     log.debug("Column [%s] serializing [%s] field of size [%d].", columnName, fieldName, size);
     try (SmooshedWriter smooshChannel = smoosher.addWithSmooshedWriter(fieldFileName, size)) {
NestedDataColumnSerializer.java:

@@ -56,6 +56,7 @@ import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.nio.channels.WritableByteChannel;
+import java.util.ArrayList;
 import java.util.Map;
 import java.util.SortedMap;
 
@@ -69,6 +70,8 @@ public class NestedDataColumnSerializer implements GenericColumnSerializer<Struc
   public static final String RAW_FILE_NAME = "__raw";
   public static final String NULL_BITMAP_FILE_NAME = "__nullIndex";
 
+  public static final String NESTED_FIELD_PREFIX = "__field_";
+
   private final String name;
   private final SegmentWriteOutMedium segmentWriteOutMedium;
   private final IndexSpec indexSpec;
@@ -78,9 +81,11 @@ public class NestedDataColumnSerializer implements GenericColumnSerializer<Struc
   private final StructuredDataProcessor fieldProcessor = new StructuredDataProcessor()
   {
     @Override
-    public int processLiteralField(String fieldName, Object fieldValue)
+    public int processLiteralField(ArrayList<NestedPathPart> fieldPath, Object fieldValue)
     {
-      final GlobalDictionaryEncodedFieldColumnWriter<?> writer = fieldWriters.get(fieldName);
+      final GlobalDictionaryEncodedFieldColumnWriter<?> writer = fieldWriters.get(
+          NestedPathFinder.toNormalizedJsonPath(fieldPath)
+      );
       if (writer != null) {
         try {
           ExprEval<?> eval = ExprEval.bestEffortOf(fieldValue);
@@ -180,8 +185,10 @@ public class NestedDataColumnSerializer implements GenericColumnSerializer<Struc
   {
     this.fields = fields;
     this.fieldWriters = Maps.newHashMapWithExpectedSize(fields.size());
+    int ctr = 0;
     for (Map.Entry<String, NestedLiteralTypeInfo.MutableTypeSet> field : fields.entrySet()) {
       final String fieldName = field.getKey();
+      final String fieldFileName = NESTED_FIELD_PREFIX + ctr++;
       fieldsWriter.write(fieldName);
       fieldsInfoWriter.write(field.getValue());
       final GlobalDictionaryEncodedFieldColumnWriter<?> writer;
@@ -190,7 +197,7 @@ public class NestedDataColumnSerializer implements GenericColumnSerializer<Struc
       if (Types.is(type, ValueType.STRING)) {
         writer = new StringFieldColumnWriter(
             name,
-            fieldName,
+            fieldFileName,
             segmentWriteOutMedium,
             indexSpec,
             globalDictionaryIdLookup
@@ -198,7 +205,7 @@ public class NestedDataColumnSerializer implements GenericColumnSerializer<Struc
       } else if (Types.is(type, ValueType.LONG)) {
         writer = new LongFieldColumnWriter(
             name,
-            fieldName,
+            fieldFileName,
            segmentWriteOutMedium,
            indexSpec,
            globalDictionaryIdLookup
@@ -206,7 +213,7 @@ public class NestedDataColumnSerializer implements GenericColumnSerializer<Struc
       } else {
         writer = new DoubleFieldColumnWriter(
             name,
-            fieldName,
+            fieldFileName,
             segmentWriteOutMedium,
             indexSpec,
             globalDictionaryIdLookup
@@ -215,7 +222,7 @@ public class NestedDataColumnSerializer implements GenericColumnSerializer<Struc
     } else {
       writer = new VariantLiteralFieldColumnWriter(
           name,
-          fieldName,
+          fieldFileName,
           segmentWriteOutMedium,
           indexSpec,
           globalDictionaryIdLookup
@@ -317,8 +324,8 @@ public class NestedDataColumnSerializer implements GenericColumnSerializer<Struc
   {
     Preconditions.checkState(closedForWrite, "Not closed yet!");
     Preconditions.checkArgument(dictionaryWriter.isSorted(), "Dictionary not sorted?!?");
-    // version 3
-    channel.write(ByteBuffer.wrap(new byte[]{0x03}));
+    // version 4
+    channel.write(ByteBuffer.wrap(new byte[]{0x04}));
     channel.write(ByteBuffer.wrap(metadataBytes));
     fieldsWriter.writeTo(channel, smoosher);
     fieldsInfoWriter.writeTo(channel, smoosher);
@@ -341,6 +348,7 @@ public class NestedDataColumnSerializer implements GenericColumnSerializer<Struc
     if (channel instanceof SmooshedWriter) {
       channel.close();
     }
+
     for (Map.Entry<String, NestedLiteralTypeInfo.MutableTypeSet> field : fields.entrySet()) {
       // remove writer so that it can be collected when we are done with it
       GlobalDictionaryEncodedFieldColumnWriter<?> writer = fieldWriters.remove(field.getKey());
@@ -357,11 +365,6 @@ public class NestedDataColumnSerializer implements GenericColumnSerializer<Struc
     }
   }
 
-  public static String getFieldFileName(String fileNameBase, String field)
-  {
-    return StringUtils.format("%s_%s", fileNameBase, field);
-  }
-
   public static String getInternalFileName(String fileNameBase, String field)
   {
     return StringUtils.format("%s.%s", fileNameBase, field);
NestedDataColumnSupplier.java:

@@ -45,6 +45,7 @@ import java.nio.ByteBuffer;
 
 public class NestedDataColumnSupplier implements Supplier<ComplexColumn>
 {
+  private final byte version;
   private final NestedDataColumnMetadata metadata;
   private final CompressedVariableSizedBlobColumnSupplier compressedRawColumnSupplier;
   private final ImmutableBitmap nullValues;
@@ -78,9 +79,9 @@ public class NestedDataColumnSupplier implements Supplier<ComplexColumn>
       TypeStrategy<Double> doubleTypeStrategy
   )
   {
-    byte version = bb.get();
+    this.version = bb.get();
 
-    if (version == 0x03) {
+    if (version == 0x03 || version == 0x04) {
       try {
         final SmooshedFileMapper mapper = columnBuilder.getFileMapper();
         metadata = jsonMapper.readValue(
@@ -157,10 +158,10 @@ public class NestedDataColumnSupplier implements Supplier<ComplexColumn>
         }
       }
       catch (IOException ex) {
-        throw new RE(ex, "Failed to deserialize V3 column.");
+        throw new RE(ex, "Failed to deserialize V%s column.", version);
       }
     } else {
-      throw new RE("Unknown version" + version);
+      throw new RE("Unknown version " + version);
     }
 
     fileMapper = Preconditions.checkNotNull(columnBuilder.getFileMapper(), "Null fileMapper");
@@ -170,9 +171,17 @@ public class NestedDataColumnSupplier implements Supplier<ComplexColumn>
 
   @Override
   public ComplexColumn get()
   {
+    if (version == 0x03) {
+      return makeV3();
+    }
+    return makeV4();
+  }
+
+  private NestedDataColumnV3 makeV3()
+  {
     if (frontCodedDictionarySupplier != null) {
-      return new CompressedNestedDataComplexColumn<>(
+      return new NestedDataColumnV3<>(
           metadata,
           columnConfig,
           compressedRawColumnSupplier,
@@ -185,7 +194,37 @@ public class NestedDataColumnSupplier implements Supplier<ComplexColumn>
           fileMapper
       );
     }
-    return new CompressedNestedDataComplexColumn<>(
+    return new NestedDataColumnV3<>(
         metadata,
         columnConfig,
         compressedRawColumnSupplier,
         nullValues,
         fields,
         fieldInfo,
         dictionary::singleThreaded,
         longDictionarySupplier,
         doubleDictionarySupplier,
         fileMapper
     );
   }
+
+  private NestedDataColumnV4 makeV4()
+  {
+    if (frontCodedDictionarySupplier != null) {
+      return new NestedDataColumnV4<>(
+          metadata,
+          columnConfig,
+          compressedRawColumnSupplier,
+          nullValues,
+          fields,
+          fieldInfo,
+          frontCodedDictionarySupplier,
+          longDictionarySupplier,
+          doubleDictionarySupplier,
+          fileMapper
+      );
+    }
+    return new NestedDataColumnV4<>(
+        metadata,
+        columnConfig,
+        compressedRawColumnSupplier,
@@ -202,9 +241,7 @@ public class NestedDataColumnSupplier implements Supplier<ComplexColumn>
   private ByteBuffer loadInternalFile(SmooshedFileMapper fileMapper, String internalFileName) throws IOException
   {
     return fileMapper.mapFile(
-        NestedDataColumnSerializer.getInternalFileName(
-            metadata.getFileNameBase(), internalFileName
-        )
+        NestedDataColumnSerializer.getInternalFileName(metadata.getFileNameBase(), internalFileName)
     );
   }
 }
NestedDataColumnV3.java (new file):

@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.segment.nested;
+
+import com.google.common.base.Supplier;
+import org.apache.druid.collections.bitmap.ImmutableBitmap;
+import org.apache.druid.java.util.common.StringUtils;
+import org.apache.druid.java.util.common.io.smoosh.SmooshedFileMapper;
+import org.apache.druid.segment.column.ColumnConfig;
+import org.apache.druid.segment.data.CompressedVariableSizedBlobColumnSupplier;
+import org.apache.druid.segment.data.FixedIndexed;
+import org.apache.druid.segment.data.GenericIndexed;
+import org.apache.druid.segment.data.Indexed;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+public final class NestedDataColumnV3<TStringDictionary extends Indexed<ByteBuffer>>
+    extends CompressedNestedDataComplexColumn<TStringDictionary>
+{
+  public NestedDataColumnV3(
+      NestedDataColumnMetadata metadata,
+      ColumnConfig columnConfig,
+      CompressedVariableSizedBlobColumnSupplier compressedRawColumnSupplier,
+      ImmutableBitmap nullValues,
+      GenericIndexed<String> fields,
+      NestedLiteralTypeInfo fieldInfo,
+      Supplier<TStringDictionary> stringDictionary,
+      Supplier<FixedIndexed<Long>> longDictionarySupplier,
+      Supplier<FixedIndexed<Double>> doubleDictionarySupplier,
+      SmooshedFileMapper fileMapper
+  )
+  {
+    super(
+        metadata,
+        columnConfig,
+        compressedRawColumnSupplier,
+        nullValues,
+        fields,
+        fieldInfo,
+        stringDictionary,
+        longDictionarySupplier,
+        doubleDictionarySupplier,
+        fileMapper
+    );
+  }
+
+  @Override
+  public List<NestedPathPart> parsePath(String path)
+  {
+    return NestedPathFinder.parseJqPath(path);
+  }
+
+  @Override
+  public String getFieldFileName(String fileNameBase, String field, int fieldIndex)
+  {
+    return StringUtils.format("%s_%s", fileNameBase, field);
+  }
+
+  @Override
+  public String getField(List<NestedPathPart> path)
+  {
+    return NestedPathFinder.toNormalizedJqPath(path);
+  }
+}
NestedDataColumnV4.java (new file):

@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.segment.nested;
+
+import com.google.common.base.Supplier;
+import org.apache.druid.collections.bitmap.ImmutableBitmap;
+import org.apache.druid.java.util.common.io.smoosh.SmooshedFileMapper;
+import org.apache.druid.segment.column.ColumnConfig;
+import org.apache.druid.segment.data.CompressedVariableSizedBlobColumnSupplier;
+import org.apache.druid.segment.data.FixedIndexed;
+import org.apache.druid.segment.data.GenericIndexed;
+import org.apache.druid.segment.data.Indexed;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+public final class NestedDataColumnV4<TStringDictionary extends Indexed<ByteBuffer>>
+    extends CompressedNestedDataComplexColumn<TStringDictionary>
+{
+  public NestedDataColumnV4(
+      NestedDataColumnMetadata metadata,
+      ColumnConfig columnConfig,
+      CompressedVariableSizedBlobColumnSupplier compressedRawColumnSupplier,
+      ImmutableBitmap nullValues,
+      GenericIndexed<String> fields,
+      NestedLiteralTypeInfo fieldInfo,
+      Supplier<TStringDictionary> stringDictionary,
+      Supplier<FixedIndexed<Long>> longDictionarySupplier,
+      Supplier<FixedIndexed<Double>> doubleDictionarySupplier,
+      SmooshedFileMapper fileMapper
+  )
+  {
+    super(
+        metadata,
+        columnConfig,
+        compressedRawColumnSupplier,
+        nullValues,
+        fields,
+        fieldInfo,
+        stringDictionary,
+        longDictionarySupplier,
+        doubleDictionarySupplier,
+        fileMapper
+    );
+  }
+
+  @Override
+  public List<NestedPathPart> parsePath(String path)
+  {
+    return NestedPathFinder.parseJsonPath(path);
+  }
+
+  @Override
+  public String getFieldFileName(String fileNameBase, String field, int fieldIndex)
+  {
+    return NestedDataColumnSerializer.getInternalFileName(
+        fileNameBase,
+        NestedDataColumnSerializer.NESTED_FIELD_PREFIX + fieldIndex
+    );
+  }
+
+  @Override
+  public String getField(List<NestedPathPart> path)
+  {
+    return NestedPathFinder.toNormalizedJsonPath(path);
+  }
+}
NestedPathFinder.java:

@@ -162,7 +162,7 @@ public class NestedPathFinder
   public static String toNormalizedJqPath(List<NestedPathPart> paths)
   {
     if (paths.isEmpty()) {
-      return StructuredDataProcessor.ROOT_LITERAL;
+      return ".";
     }
     StringBuilder bob = new StringBuilder();
     boolean first = true;
StructuredDataProcessor.java:

@@ -19,10 +19,9 @@
 
 package org.apache.druid.segment.nested;
 
-import com.google.common.collect.ImmutableSet;
-
 import javax.annotation.Nullable;
 import java.util.ArrayDeque;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
@@ -31,29 +30,23 @@ import java.util.Set;
 
 public abstract class StructuredDataProcessor
 {
-  public static final String ROOT_LITERAL = ".";
-  private static final Set<String> ROOT_LITERAL_FIELDS = ImmutableSet.of(ROOT_LITERAL);
-
-  public abstract int processLiteralField(String fieldName, Object fieldValue);
+  public abstract int processLiteralField(ArrayList<NestedPathPart> fieldPath, Object fieldValue);
 
   /**
-   * Process fields, returning a list of all "normalized" 'jq' paths to literal fields, consistent with the output of
-   * {@link NestedPathFinder#toNormalizedJqPath(List)}.
-   *
-   * Note: in the future, {@link ProcessResults#getLiteralFields()} should instead probably be modified to deal in
-   * lists of {@link NestedPathPart} instead so that callers can decide how to represent the path instead of assuing
-   * 'jq' syntax.
+   * Process fields, returning a list of all paths to literal fields, represented as an ordered sequence of
+   * {@link NestedPathPart}.
    */
   public ProcessResults processFields(Object raw)
   {
     Queue<Field> toProcess = new ArrayDeque<>();
     raw = StructuredData.unwrap(raw);
+    ArrayList<NestedPathPart> newPath = new ArrayList<>();
     if (raw instanceof Map) {
-      toProcess.add(new MapField("", (Map<String, ?>) raw));
+      toProcess.add(new MapField(newPath, (Map<String, ?>) raw));
     } else if (raw instanceof List) {
-      toProcess.add(new ListField(ROOT_LITERAL, (List<?>) raw));
+      toProcess.add(new ListField(newPath, (List<?>) raw));
     } else {
-      return new ProcessResults().withFields(ROOT_LITERAL_FIELDS).withSize(processLiteralField(ROOT_LITERAL, raw));
+      return new ProcessResults().addLiteralField(newPath, processLiteralField(newPath, raw));
     }
 
     ProcessResults accumulator = new ProcessResults();
@@ -76,17 +69,18 @@ public abstract class StructuredDataProcessor
       for (Map.Entry<String, ?> entry : map.getMap().entrySet()) {
         // add estimated size of string key
         processResults.addSize(estimateStringSize(entry.getKey()));
-        final String fieldName = map.getName() + ".\"" + entry.getKey() + "\"";
         Object value = StructuredData.unwrap(entry.getValue());
         // lists and maps go back in the queue
+        final ArrayList<NestedPathPart> newPath = new ArrayList<>(map.getPath());
+        newPath.add(new NestedPathField(entry.getKey()));
         if (value instanceof List) {
           List<?> theList = (List<?>) value;
-          toProcess.add(new ListField(fieldName, theList));
+          toProcess.add(new ListField(newPath, theList));
         } else if (value instanceof Map) {
-          toProcess.add(new MapField(fieldName, (Map<String, ?>) value));
+          toProcess.add(new MapField(newPath, (Map<String, ?>) value));
         } else {
           // literals get processed
-          processResults.addLiteralField(fieldName, processLiteralField(fieldName, value));
+          processResults.addLiteralField(newPath, processLiteralField(newPath, value));
         }
       }
       return processResults;
@@ -98,16 +92,17 @@ public abstract class StructuredDataProcessor
       ProcessResults results = new ProcessResults().withSize(8);
       final List<?> theList = list.getList();
       for (int i = 0; i < theList.size(); i++) {
-        final String listFieldName = list.getName() + "[" + i + "]";
+        final ArrayList<NestedPathPart> newPath = new ArrayList<>(list.getPath());
+        newPath.add(new NestedPathArrayElement(i));
         final Object element = StructuredData.unwrap(theList.get(i));
         // maps and lists go back into the queue
         if (element instanceof Map) {
-          toProcess.add(new MapField(listFieldName, (Map<String, ?>) element));
+          toProcess.add(new MapField(newPath, (Map<String, ?>) element));
         } else if (element instanceof List) {
-          toProcess.add(new ListField(listFieldName, (List<?>) element));
+          toProcess.add(new ListField(newPath, (List<?>) element));
        } else {
          // literals get processed
-          results.addLiteralField(listFieldName, processLiteralField(listFieldName, element));
+          results.addLiteralField(newPath, processLiteralField(newPath, element));
        }
      }
      return results;
@@ -115,16 +110,16 @@ public abstract class StructuredDataProcessor
 
   abstract static class Field
   {
-    private final String name;
+    private final ArrayList<NestedPathPart> path;
 
-    protected Field(String name)
+    protected Field(ArrayList<NestedPathPart> path)
     {
-      this.name = name;
+      this.path = path;
     }
 
-    public String getName()
+    public ArrayList<NestedPathPart> getPath()
     {
-      return name;
+      return path;
     }
   }
 
@@ -132,9 +127,9 @@ public abstract class StructuredDataProcessor
   {
     private final List<?> list;
 
-    ListField(String name, List<?> list)
+    ListField(ArrayList<NestedPathPart> path, List<?> list)
     {
-      super(name);
+      super(path);
       this.list = list;
     }
 
@@ -148,9 +143,9 @@ public abstract class StructuredDataProcessor
   {
     private final Map<String, ?> map;
 
-    MapField(String name, Map<String, ?> map)
+    MapField(ArrayList<NestedPathPart> path, Map<String, ?> map)
     {
-      super(name);
+      super(path);
      this.map = map;
    }
 
@@ -165,7 +160,7 @@ public abstract class StructuredDataProcessor
    */
   public static class ProcessResults
   {
-    private Set<String> literalFields;
+    private Set<ArrayList<NestedPathPart>> literalFields;
     private int estimatedSize;
 
     public ProcessResults()
@@ -174,7 +169,7 @@ public abstract class StructuredDataProcessor
       estimatedSize = 0;
     }
 
-    public Set<String> getLiteralFields()
+    public Set<ArrayList<NestedPathPart>> getLiteralFields()
     {
       return literalFields;
     }
@@ -190,19 +185,13 @@ public abstract class StructuredDataProcessor
       return this;
     }
 
-    public ProcessResults addLiteralField(String fieldName, int sizeOfValue)
+    public ProcessResults addLiteralField(ArrayList<NestedPathPart> fieldPath, int sizeOfValue)
     {
-      literalFields.add(fieldName);
+      literalFields.add(fieldPath);
       this.estimatedSize += sizeOfValue;
       return this;
     }
 
-    public ProcessResults withFields(Set<String> fields)
-    {
-      this.literalFields = fields;
-      return this;
-    }
-
     public ProcessResults withSize(int size)
     {
       this.estimatedSize = size;
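To make the new path representation concrete, here is a small self-contained sketch. It is illustrative only: it assumes a recent JDK (records and pattern matching), the PathPart types are simplified stand-ins for Druid's NestedPathPart, NestedPathField, and NestedPathArrayElement, it recurses where the processor above uses a work queue, and it skips the quoting rules of the real toNormalizedJsonPath.

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public class PathWalkSketch
{
  // Simplified stand-ins for Druid's NestedPathPart hierarchy.
  interface PathPart { String jsonPath(); }

  record Field(String name) implements PathPart
  {
    public String jsonPath() { return "." + name; }
  }

  record ArrayElement(int index) implements PathPart
  {
    public String jsonPath() { return "[" + index + "]"; }
  }

  // Collects the normalized json path ("$.x[0]", "$.y.a", ...) of every literal field.
  static List<String> literalPaths(Object raw)
  {
    List<String> out = new ArrayList<>();
    walk(new ArrayList<>(), raw, out);
    return out;
  }

  static void walk(ArrayList<PathPart> path, Object value, List<String> out)
  {
    if (value instanceof Map<?, ?> map) {
      for (Map.Entry<?, ?> e : map.entrySet()) {
        ArrayList<PathPart> newPath = new ArrayList<>(path); // copy, as in the diff above
        newPath.add(new Field(String.valueOf(e.getKey())));
        walk(newPath, e.getValue(), out);
      }
    } else if (value instanceof List<?> list) {
      for (int i = 0; i < list.size(); i++) {
        ArrayList<PathPart> newPath = new ArrayList<>(path);
        newPath.add(new ArrayElement(i));
        walk(newPath, list.get(i), out);
      }
    } else {
      // a literal: render the accumulated path parts as a '$'-rooted json path
      StringBuilder bob = new StringBuilder("$");
      path.forEach(part -> bob.append(part.jsonPath()));
      out.add(bob.toString());
    }
  }

  public static void main(String[] args)
  {
    // prints paths like [$.x[0], $.x[1], $.y.a] (Map.of iteration order is unspecified)
    System.out.println(literalPaths(Map.of("x", List.of(1, 2), "y", Map.of("a", "hello"))));
  }
}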
NestedDataExpressionsTest.java:

@@ -129,12 +129,12 @@ public class NestedDataExpressionsTest extends InitializedNullHandlingTest
     Expr expr = Parser.parse("json_paths(nest)", MACRO_TABLE);
     ExprEval eval = expr.eval(inputBindings);
     Assert.assertEquals(ExpressionType.STRING_ARRAY, eval.type());
-    Assert.assertArrayEquals(new Object[]{"$.y", "$.z", "$.x"}, (Object[]) eval.value());
+    Assert.assertArrayEquals(new Object[]{"$.x", "$.y", "$.z"}, (Object[]) eval.value());
 
     expr = Parser.parse("json_paths(nester)", MACRO_TABLE);
     eval = expr.eval(inputBindings);
     Assert.assertEquals(ExpressionType.STRING_ARRAY, eval.type());
-    Assert.assertArrayEquals(new Object[]{"$.x[0]", "$.x[1]", "$.x[2]", "$.y.b", "$.y.a"}, (Object[]) eval.value());
+    Assert.assertArrayEquals(new Object[]{"$.x[0]", "$.y.a", "$.x[1]", "$.y.b", "$.x[2]"}, (Object[]) eval.value());
   }
 
   @Test
NestedDataColumnIndexerTest.java:

@@ -38,7 +38,7 @@ public class NestedDataColumnIndexerTest extends InitializedNullHandlingTest
     EncodedKeyComponent<StructuredData> key;
     // new raw value, new field, new dictionary entry
     key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableMap.of("x", "foo"), false);
-    Assert.assertEquals(230, key.getEffectiveSizeBytes());
+    Assert.assertEquals(228, key.getEffectiveSizeBytes());
     Assert.assertEquals(1, indexer.getCardinality());
     // adding same value only adds estimated size of value itself
     key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableMap.of("x", "foo"), false);
@@ -67,7 +67,7 @@ public class NestedDataColumnIndexerTest extends InitializedNullHandlingTest
     Assert.assertEquals(5, indexer.getCardinality());
     // new raw value, new fields
     key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableMap.of("x", ImmutableList.of(1L, 2L, 10L)), false);
-    Assert.assertEquals(292, key.getEffectiveSizeBytes());
+    Assert.assertEquals(286, key.getEffectiveSizeBytes());
     Assert.assertEquals(5, indexer.getCardinality());
     // new raw value
     key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableMap.of("x", ImmutableList.of(1L, 2L, 10L)), false);
NestedDataColumnSupplierTest.java:

@@ -28,6 +28,7 @@ import com.google.common.util.concurrent.ListenableFuture;
 import com.google.common.util.concurrent.ListeningExecutorService;
 import com.google.common.util.concurrent.MoreExecutors;
 import org.apache.druid.collections.bitmap.RoaringBitmapFactory;
+import org.apache.druid.guice.NestedDataModule;
 import org.apache.druid.java.util.common.io.Closer;
 import org.apache.druid.java.util.common.io.smoosh.FileSmoosher;
 import org.apache.druid.java.util.common.io.smoosh.SmooshedFileMapper;
@@ -41,9 +42,12 @@ import org.apache.druid.segment.DimensionSelector;
 import org.apache.druid.segment.IndexSpec;
 import org.apache.druid.segment.NestedDataColumnIndexer;
 import org.apache.druid.segment.ObjectColumnSelector;
+import org.apache.druid.segment.QueryableIndex;
 import org.apache.druid.segment.SimpleAscendingOffset;
+import org.apache.druid.segment.TestHelper;
 import org.apache.druid.segment.column.BitmapColumnIndex;
 import org.apache.druid.segment.column.ColumnBuilder;
+import org.apache.druid.segment.column.ColumnHolder;
 import org.apache.druid.segment.column.ColumnIndexSupplier;
 import org.apache.druid.segment.column.ColumnType;
 import org.apache.druid.segment.column.DruidPredicateIndex;
@@ -52,9 +56,11 @@ import org.apache.druid.segment.column.StringValueSetIndex;
 import org.apache.druid.segment.column.TypeStrategy;
 import org.apache.druid.segment.writeout.TmpFileSegmentWriteOutMediumFactory;
 import org.apache.druid.testing.InitializedNullHandlingTest;
+import org.apache.druid.utils.CompressionUtils;
 import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
+import org.junit.BeforeClass;
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.TemporaryFolder;
@@ -102,6 +108,12 @@ public class NestedDataColumnSupplierTest extends InitializedNullHandlingTest
 
   ByteBuffer baseBuffer;
 
+  @BeforeClass
+  public static void staticSetup()
+  {
+    NestedDataModule.registerHandlersAndSerde();
+  }
+
   @Before
   public void setup() throws IOException
   {
@@ -214,6 +226,49 @@ public class NestedDataColumnSupplierTest extends InitializedNullHandlingTest
     Assert.assertEquals(expectedReason, failureReason.get());
   }
 
+  @Test
+  public void testLegacyV3ReaderFormat() throws IOException
+  {
+    String columnName = "shipTo";
+    String firstValue = "Cole";
+    File tmpLocation = tempFolder.newFolder();
+    File v3Segment = new File(
+        NestedDataColumnSupplierTest.class.getClassLoader().getResource("nested_segment_v3/index.zip").getFile()
+    );
+    CompressionUtils.unzip(v3Segment, tmpLocation);
+    try (Closer closer = Closer.create()) {
+      QueryableIndex theIndex = closer.register(TestHelper.getTestIndexIO().loadIndex(tmpLocation));
+      ColumnHolder holder = theIndex.getColumnHolder(columnName);
+      Assert.assertNotNull(holder);
+      Assert.assertEquals(NestedDataComplexTypeSerde.TYPE, holder.getCapabilities().toColumnType());
+
+      NestedDataColumnV3<?> v3 = closer.register((NestedDataColumnV3<?>) holder.getColumn());
+      Assert.assertNotNull(v3);
+
+      List<NestedPathPart> path = ImmutableList.of(new NestedPathField("lastName"));
+      ColumnHolder nestedColumnHolder = v3.getColumnHolder(path);
+      Assert.assertNotNull(nestedColumnHolder);
+      Assert.assertEquals(ColumnType.STRING, nestedColumnHolder.getCapabilities().toColumnType());
+      NestedFieldLiteralDictionaryEncodedColumn<?> nestedColumn =
+          (NestedFieldLiteralDictionaryEncodedColumn<?>) nestedColumnHolder.getColumn();
+
+      Assert.assertNotNull(nestedColumn);
+
+      ColumnValueSelector<?> selector = nestedColumn.makeColumnValueSelector(
+          new SimpleAscendingOffset(theIndex.getNumRows())
+      );
+
+      ColumnIndexSupplier indexSupplier = v3.getColumnIndexSupplier(path);
+      Assert.assertNotNull(indexSupplier);
+      StringValueSetIndex valueSetIndex = indexSupplier.as(StringValueSetIndex.class);
+      Assert.assertNotNull(valueSetIndex);
+
+      BitmapColumnIndex indexForValue = valueSetIndex.forValue(firstValue);
+      Assert.assertEquals(firstValue, selector.getObject());
+      Assert.assertTrue(indexForValue.computeBitmapResult(resultFactory).get(0));
+    }
+  }
+
   private void smokeTest(NestedDataComplexColumn column) throws IOException
   {
     SimpleAscendingOffset offset = new SimpleAscendingOffset(data.size());

Binary file not shown.
CalciteNestedDataQueryTest.java:

@@ -2233,7 +2233,7 @@ public class CalciteNestedDataQueryTest extends BaseCalciteQueryTest
         ),
         ImmutableList.of(
             new Object[]{"[\"$\"]", 5L},
-            new Object[]{"[\"$.n.x\",\"$.array[0]\",\"$.array[1]\"]", 2L}
+            new Object[]{"[\"$.array[1]\",\"$.array[0]\",\"$.n.x\"]", 2L}
         ),
         RowSignature.builder()
                     .add("EXPR$0", ColumnType.STRING_ARRAY)