mirror of https://github.com/apache/druid.git
refactor InputFormat and InputEntityReader implementations (#8875)
* refactor InputFormat and InputReader to supply InputEntity and temp dir to constructors instead of read/sample

* fix style
This commit is contained in:
parent
1611792855
commit
7fa3182fe5
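In caller terms, the refactor moves the entity and temporary directory from every read()/sample() call into reader construction. A minimal before/after sketch, where format, inputRowSchema, entity, and temporaryDirectory are stand-in names for an existing call site:

```java
// Before: the entity and temp dir rode along on each call.
// InputEntityReader reader = format.createReader(inputRowSchema);
// CloseableIterator<InputRow> rows = reader.read(entity, temporaryDirectory);

// After: the reader is bound to its entity up front, so read()/sample() take no arguments.
InputEntityReader reader = format.createReader(inputRowSchema, entity, temporaryDirectory);
CloseableIterator<InputRow> rows = reader.read();
```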
--- a/…/InputEntityReader.java
+++ b/…/InputEntityReader.java
@@ -24,7 +24,6 @@ import com.fasterxml.jackson.databind.ObjectWriter;
 import org.apache.druid.guice.annotations.UnstableApi;
 import org.apache.druid.java.util.common.parsers.CloseableIterator;
 
-import java.io.File;
 import java.io.IOException;
 
 /**
@@ -44,7 +43,7 @@ public interface InputEntityReader
    */
  ObjectWriter DEFAULT_JSON_WRITER = new ObjectMapper().writerWithDefaultPrettyPrinter();
 
- CloseableIterator<InputRow> read(InputEntity source, File temporaryDirectory) throws IOException;
+ CloseableIterator<InputRow> read() throws IOException;
 
- CloseableIterator<InputRowListPlusJson> sample(InputEntity source, File temporaryDirectory) throws IOException;
+ CloseableIterator<InputRowListPlusJson> sample() throws IOException;
 }
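A minimal consumption sketch under the new interface, mirroring the test changes further below. The ByteEntity source, StringUtils.toUtf8 helper, inputRowSchema variable, and CSV payload are illustrative stand-ins; null is an acceptable temporary directory for in-memory entities, as the tests show:

```java
final CsvInputFormat format = new CsvInputFormat(ImmutableList.of("ts", "name"), null, false, 0);
final ByteEntity source = new ByteEntity(StringUtils.toUtf8("2019-01-01T00:00:10Z,foo\n"));
final InputEntityReader reader = format.createReader(inputRowSchema, source, null);

// The reader owns its entity, so iteration needs no further arguments.
try (CloseableIterator<InputRow> iterator = reader.read()) {
  while (iterator.hasNext()) {
    final InputRow row = iterator.next();
    // process row
  }
}
```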
--- a/…/InputFormat.java
+++ b/…/InputFormat.java
@@ -29,6 +29,8 @@ import org.apache.druid.data.input.impl.NestedInputFormat;
 import org.apache.druid.data.input.impl.SplittableInputSource;
 import org.apache.druid.guice.annotations.UnstableApi;
 
+import java.io.File;
+
 /**
  * InputFormat abstracts the file format of input data.
  * It creates a {@link InputEntityReader} to read data and parse it into {@link InputRow}.
@@ -53,5 +55,5 @@ public interface InputFormat
   @JsonIgnore
   boolean isSplittable();
 
-  InputEntityReader createReader(InputRowSchema inputRowSchema);
+  InputEntityReader createReader(InputRowSchema inputRowSchema, InputEntity source, File temporaryDirectory);
 }
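For implementors, createReader is now the single point where per-entity state is injected. A hedged sketch of the shape a custom format takes under the new contract (MyFormat and MyReader are invented names, not part of this commit):

```java
public class MyFormat implements InputFormat
{
  @Override
  public boolean isSplittable()
  {
    return false;
  }

  @Override
  public InputEntityReader createReader(InputRowSchema inputRowSchema, InputEntity source, File temporaryDirectory)
  {
    // Bind all per-entity state here; callers then invoke the no-arg read()/sample().
    return new MyReader(inputRowSchema, source, temporaryDirectory);
  }
}
```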
--- a/…/IntermediateRowParsingReader.java
+++ b/…/IntermediateRowParsingReader.java
@@ -23,7 +23,6 @@ import org.apache.druid.java.util.common.CloseableIterators;
 import org.apache.druid.java.util.common.parsers.CloseableIterator;
 import org.apache.druid.java.util.common.parsers.ParseException;
 
-import java.io.File;
 import java.io.IOException;
 import java.util.List;
 
@@ -37,9 +36,9 @@ import java.util.List;
 public abstract class IntermediateRowParsingReader<T> implements InputEntityReader
 {
   @Override
-  public CloseableIterator<InputRow> read(InputEntity source, File temporaryDirectory) throws IOException
+  public CloseableIterator<InputRow> read() throws IOException
   {
-    return intermediateRowIterator(source, temporaryDirectory).flatMap(row -> {
+    return intermediateRowIterator().flatMap(row -> {
       try {
         // since parseInputRows() returns a list, the below line always iterates over the list,
         // which means it calls Iterator.hasNext() and Iterator.next() at least once per row.
@@ -56,10 +55,10 @@ public abstract class IntermediateRowParsingReader<T> implements InputEntityReader
   }
 
   @Override
-  public CloseableIterator<InputRowListPlusJson> sample(InputEntity source, File temporaryDirectory)
+  public CloseableIterator<InputRowListPlusJson> sample()
       throws IOException
   {
-    return intermediateRowIterator(source, temporaryDirectory).map(row -> {
+    return intermediateRowIterator().map(row -> {
       final String json;
       try {
         json = toJson(row);
@@ -83,7 +82,7 @@ public abstract class IntermediateRowParsingReader<T> implements InputEntityyReader
    * Creates an iterator of intermediate rows. The returned rows will be consumed by {@link #parseInputRows} and
    * {@link #toJson}.
    */
-  protected abstract CloseableIterator<T> intermediateRowIterator(InputEntity source, File temporaryDirectory)
+  protected abstract CloseableIterator<T> intermediateRowIterator()
       throws IOException;
 
   /**
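Since intermediateRowIterator() no longer receives arguments, subclasses read from instance fields captured at construction. A hedged sketch of a line-based subclass, assuming commons-io's LineIterator, Druid's CloseableIterators helper, and InputEntity#open() (all referenced by the surrounding files); it is declared abstract so parseInputRows(String) and toJson(String) can stay omitted, and the real TextReader below also arranges for the delegate to be closed:

```java
import org.apache.commons.io.LineIterator;
import org.apache.druid.data.input.InputEntity;
import org.apache.druid.java.util.common.CloseableIterators;
import org.apache.druid.java.util.common.parsers.CloseableIterator;

import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

public abstract class MyLineReader extends IntermediateRowParsingReader<String>
{
  private final InputEntity source; // captured at construction, consumed by the iterator

  MyLineReader(InputEntity source)
  {
    this.source = source;
  }

  @Override
  protected CloseableIterator<String> intermediateRowIterator() throws IOException
  {
    // The entity is instance state now, so the method needs no parameters.
    return CloseableIterators.withEmptyBaggage(
        new LineIterator(new InputStreamReader(source.open(), StandardCharsets.UTF_8))
    );
  }
}
```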
--- a/…/TextReader.java
+++ b/…/TextReader.java
@@ -38,10 +38,14 @@ import java.util.List;
 public abstract class TextReader extends IntermediateRowParsingReader<String>
 {
   private final InputRowSchema inputRowSchema;
+  final InputEntity source;
+  final File temporaryDirectory;
 
-  public TextReader(InputRowSchema inputRowSchema)
+  public TextReader(InputRowSchema inputRowSchema, InputEntity source, File temporaryDirectory)
   {
     this.inputRowSchema = inputRowSchema;
+    this.source = source;
+    this.temporaryDirectory = temporaryDirectory;
   }
 
   public InputRowSchema getInputRowSchema()
@@ -50,7 +54,7 @@ public abstract class TextReader extends IntermediateRowParsingReader<String>
   }
 
   @Override
-  public CloseableIterator<String> intermediateRowIterator(InputEntity source, File temporaryDirectory)
+  public CloseableIterator<String> intermediateRowIterator()
       throws IOException
   {
     final LineIterator delegate = new LineIterator(
--- a/…/CsvInputFormat.java
+++ b/…/CsvInputFormat.java
@@ -24,6 +24,7 @@ import com.fasterxml.jackson.annotation.JsonProperty;
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
 import com.google.common.collect.ImmutableList;
+import org.apache.druid.data.input.InputEntity;
 import org.apache.druid.data.input.InputEntityReader;
 import org.apache.druid.data.input.InputFormat;
 import org.apache.druid.data.input.InputRowSchema;
@@ -31,6 +32,7 @@ import org.apache.druid.indexer.Checks;
 import org.apache.druid.indexer.Property;
 
 import javax.annotation.Nullable;
+import java.io.File;
 import java.util.Collections;
 import java.util.List;
 import java.util.Objects;
@@ -117,10 +119,12 @@ public class CsvInputFormat implements InputFormat
   }
 
   @Override
-  public InputEntityReader createReader(InputRowSchema inputRowSchema)
+  public InputEntityReader createReader(InputRowSchema inputRowSchema, InputEntity source, File temporaryDirectory)
   {
     return new CsvReader(
         inputRowSchema,
+        source,
+        temporaryDirectory,
         listDelimiter,
         columns,
         findColumnsFromHeader,
--- a/…/CsvReader.java
+++ b/…/CsvReader.java
@@ -24,6 +24,7 @@ import com.google.common.base.Preconditions;
 import com.google.common.base.Splitter;
 import com.google.common.collect.Iterables;
 import com.opencsv.RFC4180Parser;
+import org.apache.druid.data.input.InputEntity;
 import org.apache.druid.data.input.InputRow;
 import org.apache.druid.data.input.InputRowSchema;
 import org.apache.druid.data.input.TextReader;
@@ -34,6 +35,7 @@ import org.apache.druid.java.util.common.parsers.ParserUtils;
 import org.apache.druid.java.util.common.parsers.Parsers;
 
 import javax.annotation.Nullable;
+import java.io.File;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collections;
@@ -51,13 +53,15 @@ public class CsvReader extends TextReader
 
   CsvReader(
       InputRowSchema inputRowSchema,
+      InputEntity source,
+      File temporaryDirectory,
       @Nullable String listDelimiter,
       @Nullable List<String> columns,
       boolean findColumnsFromHeader,
       int skipHeaderRows
   )
   {
-    super(inputRowSchema);
+    super(inputRowSchema, source, temporaryDirectory);
     this.findColumnsFromHeader = findColumnsFromHeader;
     this.skipHeaderRows = skipHeaderRows;
     final String finalListDelimeter = listDelimiter == null ? Parsers.DEFAULT_LIST_DELIMITER : listDelimiter;
--- a/…/InputEntityIteratingReader.java
+++ b/…/InputEntityIteratingReader.java
@@ -64,9 +64,9 @@ public class InputEntityIteratingReader implements InputSourceReader
   {
     return createIterator(entity -> {
       // InputEntityReader is stateful and so a new one should be created per entity.
-      final InputEntityReader reader = inputFormat.createReader(inputRowSchema);
+      final InputEntityReader reader = inputFormat.createReader(inputRowSchema, entity, temporaryDirectory);
       try {
-        return reader.read(entity, temporaryDirectory);
+        return reader.read();
       }
       catch (IOException e) {
         throw new RuntimeException(e);
@@ -79,9 +79,9 @@ public class InputEntityIteratingReader implements InputSourceReader
   {
     return createIterator(entity -> {
       // InputEntityReader is stateful and so a new one should be created per entity.
-      final InputEntityReader reader = inputFormat.createReader(inputRowSchema);
+      final InputEntityReader reader = inputFormat.createReader(inputRowSchema, entity, temporaryDirectory);
       try {
-        return reader.sample(entity, temporaryDirectory);
+        return reader.sample();
       }
       catch (IOException e) {
         throw new RuntimeException(e);
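The repeated comment is the crux of this file's change: a reader is now bound to a single entity, so InputEntityIteratingReader builds a fresh one per entity instead of reusing a shared instance. The per-entity pattern, sketched outside the lambda for clarity (entities is a stand-in iterable):

```java
for (InputEntity entity : entities) {
  // Never reuse a reader across entities; each one is constructed around exactly one entity.
  final InputEntityReader reader = inputFormat.createReader(inputRowSchema, entity, temporaryDirectory);
  try (CloseableIterator<InputRow> rows = reader.read()) {
    rows.forEachRemaining(row -> { /* hand rows downstream */ });
  }
}
```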
--- a/…/JsonInputFormat.java
+++ b/…/JsonInputFormat.java
@@ -23,11 +23,13 @@ import com.fasterxml.jackson.annotation.JsonCreator;
 import com.fasterxml.jackson.annotation.JsonProperty;
 import com.fasterxml.jackson.core.JsonParser.Feature;
 import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.druid.data.input.InputEntity;
 import org.apache.druid.data.input.InputEntityReader;
 import org.apache.druid.data.input.InputRowSchema;
 import org.apache.druid.java.util.common.parsers.JSONPathSpec;
 
 import javax.annotation.Nullable;
+import java.io.File;
 import java.util.Collections;
 import java.util.Map;
 import java.util.Map.Entry;
@@ -66,9 +68,9 @@ public class JsonInputFormat extends NestedInputFormat
   }
 
   @Override
-  public InputEntityReader createReader(InputRowSchema inputRowSchema)
+  public InputEntityReader createReader(InputRowSchema inputRowSchema, InputEntity source, File temporaryDirectory)
   {
-    return new JsonReader(inputRowSchema, getFlattenSpec(), objectMapper);
+    return new JsonReader(inputRowSchema, source, temporaryDirectory, getFlattenSpec(), objectMapper);
   }
 
   @Override
--- a/…/JsonReader.java
+++ b/…/JsonReader.java
@@ -21,6 +21,7 @@ package org.apache.druid.data.input.impl;
 
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.druid.data.input.InputEntity;
 import org.apache.druid.data.input.InputRow;
 import org.apache.druid.data.input.InputRowSchema;
 import org.apache.druid.data.input.TextReader;
@@ -30,6 +31,7 @@ import org.apache.druid.java.util.common.parsers.ObjectFlattener;
 import org.apache.druid.java.util.common.parsers.ObjectFlatteners;
 import org.apache.druid.java.util.common.parsers.ParseException;
 
+import java.io.File;
 import java.io.IOException;
 import java.util.Collections;
 import java.util.List;
@@ -40,9 +42,15 @@ public class JsonReader extends TextReader
   private final ObjectFlattener<JsonNode> flattener;
   private final ObjectMapper mapper;
 
-  JsonReader(InputRowSchema inputRowSchema, JSONPathSpec flattenSpec, ObjectMapper mapper)
+  JsonReader(
+      InputRowSchema inputRowSchema,
+      InputEntity source,
+      File temporaryDirectory,
+      JSONPathSpec flattenSpec,
+      ObjectMapper mapper
+  )
   {
-    super(inputRowSchema);
+    super(inputRowSchema, source, temporaryDirectory);
     this.flattener = ObjectFlatteners.create(flattenSpec, new JSONFlattenerMaker());
     this.mapper = mapper;
   }
--- a/…/CsvReaderTest.java
+++ b/…/CsvReaderTest.java
@@ -120,9 +120,9 @@ public class CsvReaderTest
         )
     );
     final CsvInputFormat format = new CsvInputFormat(ImmutableList.of(), "|", true, 0);
-    final InputEntityReader reader = format.createReader(INPUT_ROW_SCHEMA);
+    final InputEntityReader reader = format.createReader(INPUT_ROW_SCHEMA, source, null);
     int numResults = 0;
-    try (CloseableIterator<InputRow> iterator = reader.read(source, null)) {
+    try (CloseableIterator<InputRow> iterator = reader.read()) {
       while (iterator.hasNext()) {
         final InputRow row = iterator.next();
         Assert.assertEquals(
@@ -216,10 +216,12 @@ public class CsvReaderTest
             new TimestampSpec("Timestamp", "auto", null),
             new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("Timestamp"))),
             Collections.emptyList()
-        )
+        ),
+        source,
+        null
     );
 
-    try (CloseableIterator<InputRow> iterator = reader.read(source, null)) {
+    try (CloseableIterator<InputRow> iterator = reader.read()) {
       final Iterator<InputRow> expectedRowIterator = expectedResults.iterator();
       while (iterator.hasNext()) {
         Assert.assertTrue(expectedRowIterator.hasNext());
@@ -237,8 +239,8 @@ public class CsvReaderTest
         )
     );
     final CsvInputFormat format = new CsvInputFormat(ImmutableList.of("ts", "name", "Comment"), null, false, 0);
-    final InputEntityReader reader = format.createReader(INPUT_ROW_SCHEMA);
-    try (CloseableIterator<InputRow> iterator = reader.read(source, null)) {
+    final InputEntityReader reader = format.createReader(INPUT_ROW_SCHEMA, source, null);
+    try (CloseableIterator<InputRow> iterator = reader.read()) {
       Assert.assertTrue(iterator.hasNext());
       final InputRow row = iterator.next();
       Assert.assertEquals(DateTimes.of("2019-01-01T00:00:10Z"), row.getTimestamp());
@@ -267,9 +269,9 @@ public class CsvReaderTest
 
   private void assertResult(ByteEntity source, CsvInputFormat format) throws IOException
   {
-    final InputEntityReader reader = format.createReader(INPUT_ROW_SCHEMA);
+    final InputEntityReader reader = format.createReader(INPUT_ROW_SCHEMA, source, null);
     int numResults = 0;
-    try (CloseableIterator<InputRow> iterator = reader.read(source, null)) {
+    try (CloseableIterator<InputRow> iterator = reader.read()) {
       while (iterator.hasNext()) {
         final InputRow row = iterator.next();
         Assert.assertEquals(
--- a/…/JsonReaderTest.java
+++ b/…/JsonReaderTest.java
@@ -65,10 +65,12 @@ public class JsonReaderTest
             new TimestampSpec("timestamp", "iso", null),
             new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("bar", "foo"))),
             Collections.emptyList()
-        )
+        ),
+        source,
+        null
     );
     final int numExpectedIterations = 1;
-    try (CloseableIterator<InputRow> iterator = reader.read(source, null)) {
+    try (CloseableIterator<InputRow> iterator = reader.read()) {
      int numActualIterations = 0;
       while (iterator.hasNext()) {
         final InputRow row = iterator.next();
@@ -112,11 +114,13 @@ public class JsonReaderTest
             new TimestampSpec("timestamp", "iso", null),
             new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("foo"))),
             Collections.emptyList()
-        )
+        ),
+        source,
+        null
     );
 
     final int numExpectedIterations = 1;
-    try (CloseableIterator<InputRow> iterator = reader.read(source, null)) {
+    try (CloseableIterator<InputRow> iterator = reader.read()) {
       int numActualIterations = 0;
       while (iterator.hasNext()) {
         final InputRow row = iterator.next();
--- a/…/NoopInputFormat.java
+++ b/…/NoopInputFormat.java
@@ -19,10 +19,13 @@
 
 package org.apache.druid.data.input.impl;
 
+import org.apache.druid.data.input.InputEntity;
 import org.apache.druid.data.input.InputEntityReader;
 import org.apache.druid.data.input.InputFormat;
 import org.apache.druid.data.input.InputRowSchema;
 
+import java.io.File;
+
 public class NoopInputFormat implements InputFormat
 {
   @Override
@@ -32,7 +35,7 @@ public class NoopInputFormat implements InputFormat
   }
 
   @Override
-  public InputEntityReader createReader(InputRowSchema inputRowSchema)
+  public InputEntityReader createReader(InputRowSchema inputRowSchema, InputEntity source, File temporaryDirectory)
   {
     throw new UnsupportedOperationException();
   }