Fix whitespace as a separator in CSV processor (#67045) (#67050)

This change fixes problem when using space or tab as a separator in CSV processor - we check if current character is separator before we check if it is whitespace.

This also improves tests to always check all combinations of separators and quotes.

Closes #67013
This commit is contained in:
Przemko Robakowski 2021-01-05 23:05:10 +01:00 committed by GitHub
parent 6175e18e92
commit fdd2a9c235
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 23 additions and 20 deletions

View File

@ -127,8 +127,6 @@ final class CsvParser {
char c = currentChar(); char c = currentChar();
if (c == LF || c == CR || c == quote) { if (c == LF || c == CR || c == quote) {
throw new IllegalArgumentException("Illegal character inside unquoted field at " + currentIndex); throw new IllegalArgumentException("Illegal character inside unquoted field at " + currentIndex);
} else if (trim && isWhitespace(c)) {
spaceCount++;
} else if (c == separator) { } else if (c == separator) {
state = State.START; state = State.START;
if (setField(currentIndex - spaceCount)) { if (setField(currentIndex - spaceCount)) {
@ -136,6 +134,8 @@ final class CsvParser {
} }
startIndex = currentIndex + 1; startIndex = currentIndex + 1;
return false; return false;
} else if (trim && isWhitespace(c)) {
spaceCount++;
} else { } else {
spaceCount = 0; spaceCount = 0;
} }
@ -163,20 +163,20 @@ final class CsvParser {
boolean shouldSetField = true; boolean shouldSetField = true;
for (; currentIndex < length; currentIndex++) { for (; currentIndex < length; currentIndex++) {
c = currentChar(); c = currentChar();
if (isWhitespace(c)) { if (c == separator) {
if (shouldSetField) {
if (setField(currentIndex - 1)) {
return true;
}
shouldSetField = false;
}
} else if (c == separator) {
if (shouldSetField && setField(currentIndex - 1)) { if (shouldSetField && setField(currentIndex - 1)) {
return true; return true;
} }
startIndex = currentIndex + 1; startIndex = currentIndex + 1;
state = State.START; state = State.START;
return false; return false;
} else if (isWhitespace(c)) {
if (shouldSetField) {
if (setField(currentIndex - 1)) {
return true;
}
shouldSetField = false;
}
} else { } else {
throw new IllegalArgumentException("character '" + c + "' after quoted field at " + currentIndex); throw new IllegalArgumentException("character '" + c + "' after quoted field at " + currentIndex);
} }

View File

@ -24,33 +24,36 @@ import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import org.elasticsearch.ingest.IngestDocument; import org.elasticsearch.ingest.IngestDocument;
import org.elasticsearch.ingest.RandomDocumentPicks; import org.elasticsearch.ingest.RandomDocumentPicks;
import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTestCase;
import org.junit.Before;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
public class CsvProcessorTests extends ESTestCase { public class CsvProcessorTests extends ESTestCase {
private static final Character[] SEPARATORS = new Character[]{',', ';', '|', '.'}; private static final Character[] SEPARATORS = new Character[]{',', ';', '|', '.', '\t'};
private static final String[] QUOTES = new String[]{"'", "\"", ""};
private final String quote; private final String quote;
private char separator; private final char separator;
public CsvProcessorTests(@Name("quote") String quote) { public CsvProcessorTests(@Name("quote") String quote, @Name("separator") char separator) {
this.quote = quote; this.quote = quote;
this.separator = separator;
} }
@ParametersFactory @ParametersFactory
public static Iterable<Object[]> parameters() { public static Iterable<Object[]> parameters() {
return Arrays.asList(new Object[]{"'"}, new Object[]{"\""}, new Object[]{""}); LinkedList<Object[]> list = new LinkedList<>();
for (Character separator : SEPARATORS) {
for (String quote : QUOTES) {
list.add(new Object[]{quote, separator});
} }
}
@Before return list;
public void setup() {
separator = randomFrom(SEPARATORS);
} }
public void testExactNumberOfFields() { public void testExactNumberOfFields() {