This change fixes a problem that occurred when a space or tab was used as the separator in the CSV processor: the parser now checks whether the current character is the separator before it checks whether it is whitespace. It also improves the tests so that they always check all combinations of separators and quotes. Closes #67013
This commit is contained in:
parent
6175e18e92
commit
fdd2a9c235
|
@ -127,8 +127,6 @@ final class CsvParser {
|
|||
char c = currentChar();
|
||||
if (c == LF || c == CR || c == quote) {
|
||||
throw new IllegalArgumentException("Illegal character inside unquoted field at " + currentIndex);
|
||||
} else if (trim && isWhitespace(c)) {
|
||||
spaceCount++;
|
||||
} else if (c == separator) {
|
||||
state = State.START;
|
||||
if (setField(currentIndex - spaceCount)) {
|
||||
|
@ -136,6 +134,8 @@ final class CsvParser {
|
|||
}
|
||||
startIndex = currentIndex + 1;
|
||||
return false;
|
||||
} else if (trim && isWhitespace(c)) {
|
||||
spaceCount++;
|
||||
} else {
|
||||
spaceCount = 0;
|
||||
}
|
||||
|
@ -163,20 +163,20 @@ final class CsvParser {
|
|||
boolean shouldSetField = true;
|
||||
for (; currentIndex < length; currentIndex++) {
|
||||
c = currentChar();
|
||||
if (isWhitespace(c)) {
|
||||
if (shouldSetField) {
|
||||
if (setField(currentIndex - 1)) {
|
||||
return true;
|
||||
}
|
||||
shouldSetField = false;
|
||||
}
|
||||
} else if (c == separator) {
|
||||
if (c == separator) {
|
||||
if (shouldSetField && setField(currentIndex - 1)) {
|
||||
return true;
|
||||
}
|
||||
startIndex = currentIndex + 1;
|
||||
state = State.START;
|
||||
return false;
|
||||
} else if (isWhitespace(c)) {
|
||||
if (shouldSetField) {
|
||||
if (setField(currentIndex - 1)) {
|
||||
return true;
|
||||
}
|
||||
shouldSetField = false;
|
||||
}
|
||||
} else {
|
||||
throw new IllegalArgumentException("character '" + c + "' after quoted field at " + currentIndex);
|
||||
}
|
||||
|
|
|
@ -24,33 +24,36 @@ import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
|
|||
import org.elasticsearch.ingest.IngestDocument;
|
||||
import org.elasticsearch.ingest.RandomDocumentPicks;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.junit.Before;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class CsvProcessorTests extends ESTestCase {
|
||||
|
||||
private static final Character[] SEPARATORS = new Character[]{',', ';', '|', '.'};
|
||||
private static final Character[] SEPARATORS = new Character[]{',', ';', '|', '.', '\t'};
|
||||
private static final String[] QUOTES = new String[]{"'", "\"", ""};
|
||||
private final String quote;
|
||||
private char separator;
|
||||
private final char separator;
|
||||
|
||||
|
||||
public CsvProcessorTests(@Name("quote") String quote) {
|
||||
public CsvProcessorTests(@Name("quote") String quote, @Name("separator") char separator) {
|
||||
this.quote = quote;
|
||||
this.separator = separator;
|
||||
}
|
||||
|
||||
@ParametersFactory
|
||||
public static Iterable<Object[]> parameters() {
|
||||
return Arrays.asList(new Object[]{"'"}, new Object[]{"\""}, new Object[]{""});
|
||||
LinkedList<Object[]> list = new LinkedList<>();
|
||||
for (Character separator : SEPARATORS) {
|
||||
for (String quote : QUOTES) {
|
||||
list.add(new Object[]{quote, separator});
|
||||
}
|
||||
|
||||
@Before
|
||||
public void setup() {
|
||||
separator = randomFrom(SEPARATORS);
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
public void testExactNumberOfFields() {
|
||||
|
|
Loading…
Reference in New Issue