Fix whitespace as a separator in CSV processor (#67045) (#67050)

This change fixes a problem when using a space or tab as the separator in the CSV processor: we now check whether the current character is the separator before we check whether it is whitespace. This also improves the tests to always check all combinations of separators and quotes. Closes #67013
2021-01-05 23:05:10 +01:00 · 2021-01-05 23:05:10 +01:00 · fdd2a9c235
parent 6175e18e92
commit fdd2a9c235
2 changed files with 23 additions and 20 deletions
--- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/CsvParser.java
+++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/CsvParser.java
@ -127,8 +127,6 @@ final class CsvParser {
            char c = currentChar();
            if (c == LF || c == CR || c == quote) {
                throw new IllegalArgumentException("Illegal character inside unquoted field at " + currentIndex);
-            } else if (trim && isWhitespace(c)) {
-                spaceCount++;
            } else if (c == separator) {
                state = State.START;
                if (setField(currentIndex - spaceCount)) {
@ -136,6 +134,8 @@ final class CsvParser {
                }
                startIndex = currentIndex + 1;
                return false;
+            } else if (trim && isWhitespace(c)) {
+                spaceCount++;
            } else {
                spaceCount = 0;
            }
@ -163,20 +163,20 @@ final class CsvParser {
        boolean shouldSetField = true;
        for (; currentIndex < length; currentIndex++) {
            c = currentChar();
-            if (isWhitespace(c)) {
-                if (shouldSetField) {
-                    if (setField(currentIndex - 1)) {
-                        return true;
-                    }
-                    shouldSetField = false;
-                }
-            } else if (c == separator) {
+            if (c == separator) {
                if (shouldSetField && setField(currentIndex - 1)) {
                    return true;
                }
                startIndex = currentIndex + 1;
                state = State.START;
                return false;
+            } else if (isWhitespace(c)) {
+                if (shouldSetField) {
+                    if (setField(currentIndex - 1)) {
+                        return true;
+                    }
+                    shouldSetField = false;
+                }
            } else {
                throw new IllegalArgumentException("character '" + c + "' after quoted field at " + currentIndex);
            }
--- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/CsvProcessorTests.java
+++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/CsvProcessorTests.java
@ -24,33 +24,36 @@ import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
 import org.elasticsearch.ingest.IngestDocument;
 import org.elasticsearch.ingest.RandomDocumentPicks;
 import org.elasticsearch.test.ESTestCase;
-import org.junit.Before;

 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.LinkedHashMap;
+import java.util.LinkedList;
 import java.util.Map;
 import java.util.stream.Collectors;

 public class CsvProcessorTests extends ESTestCase {

-    private static final Character[] SEPARATORS = new Character[]{',', ';', '|', '.'};
+    private static final Character[] SEPARATORS = new Character[]{',', ';', '|', '.', '\t'};
+    private static final String[] QUOTES = new String[]{"'", "\"", ""};
    private final String quote;
-    private char separator;
+    private final char separator;


-    public CsvProcessorTests(@Name("quote") String quote) {
+    public CsvProcessorTests(@Name("quote") String quote, @Name("separator") char separator) {
        this.quote = quote;
+        this.separator = separator;
    }

    @ParametersFactory
    public static Iterable<Object[]> parameters() {
-        return Arrays.asList(new Object[]{"'"}, new Object[]{"\""}, new Object[]{""});
+        LinkedList<Object[]> list = new LinkedList<>();
+        for (Character separator : SEPARATORS) {
+            for (String quote : QUOTES) {
+                list.add(new Object[]{quote, separator});
            }
-
-    @Before
-    public void setup() {
-        separator = randomFrom(SEPARATORS);
+        }
+        return list;
    }

    public void testExactNumberOfFields() {