mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-03-28 10:58:30 +00:00
[ML] Improve CSV header row detection in find_file_structure (#45099)
When doing a fieldwise Levenshtein distance comparison between CSV rows, this change ignores all fields that have long values, not just the longest field. This approach works better for CSV formats that have multiple freeform text fields rather than just a single "message" field. Fixes #45047
This commit is contained in:
parent
9450505d5b
commit
f617585dbd
x-pack/plugin/ml/src
main/java/org/elasticsearch/xpack/ml/filestructurefinder
test/java/org/elasticsearch/xpack/ml/filestructurefinder
@ -17,6 +17,7 @@ import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.BitSet;
|
||||
import java.util.Collections;
|
||||
import java.util.DoubleSummaryStatistics;
|
||||
import java.util.HashSet;
|
||||
@ -27,12 +28,12 @@ import java.util.Map;
|
||||
import java.util.Random;
|
||||
import java.util.SortedMap;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
public class DelimitedFileStructureFinder implements FileStructureFinder {
|
||||
|
||||
private static final String REGEX_NEEDS_ESCAPE_PATTERN = "([\\\\|()\\[\\]{}^$.+*?])";
|
||||
private static final int MAX_LEVENSHTEIN_COMPARISONS = 100;
|
||||
private static final int LONG_FIELD_THRESHOLD = 100;
|
||||
|
||||
private final List<String> sampleMessages;
|
||||
private final FileStructure structure;
|
||||
@ -322,10 +323,15 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
|
||||
explanation.add("First row is not unusual based on length test: [" + firstRowLength + "] and [" +
|
||||
toNiceString(otherRowStats) + "]");
|
||||
|
||||
// Check edit distances
|
||||
// Check edit distances between short fields
|
||||
|
||||
BitSet shortFieldMask = makeShortFieldMask(rows, LONG_FIELD_THRESHOLD);
|
||||
|
||||
// The reason that only short fields are included is that sometimes
|
||||
// there are "message" fields that are much longer than the other
|
||||
// fields, vary enormously between rows, and skew the comparison.
|
||||
DoubleSummaryStatistics firstRowStats = otherRows.stream().limit(MAX_LEVENSHTEIN_COMPARISONS)
|
||||
.mapToDouble(otherRow -> (double) levenshteinFieldwiseCompareRows(firstRow, otherRow))
|
||||
.mapToDouble(otherRow -> (double) levenshteinFieldwiseCompareRows(firstRow, otherRow, shortFieldMask))
|
||||
.collect(DoubleSummaryStatistics::new, DoubleSummaryStatistics::accept, DoubleSummaryStatistics::combine);
|
||||
|
||||
otherRowStats = new DoubleSummaryStatistics();
|
||||
@ -336,7 +342,7 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
|
||||
for (int i = 0; numComparisons < MAX_LEVENSHTEIN_COMPARISONS && i < otherRowStrs.size(); ++i) {
|
||||
for (int j = i + 1 + random.nextInt(innerIncrement); numComparisons < MAX_LEVENSHTEIN_COMPARISONS && j < otherRowStrs.size();
|
||||
j += innerIncrement) {
|
||||
otherRowStats.accept((double) levenshteinFieldwiseCompareRows(otherRows.get(i), otherRows.get(j)));
|
||||
otherRowStats.accept((double) levenshteinFieldwiseCompareRows(otherRows.get(i), otherRows.get(j), shortFieldMask));
|
||||
++numComparisons;
|
||||
}
|
||||
}
|
||||
@ -358,30 +364,58 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
|
||||
stats.getMax());
|
||||
}
|
||||
|
||||
/**
|
||||
* Make a mask whose bits are set when the corresponding field in every supplied
|
||||
* row is short, and unset if the corresponding field in any supplied row is long.
|
||||
*/
|
||||
static BitSet makeShortFieldMask(List<List<String>> rows, int longFieldThreshold) {
|
||||
|
||||
assert rows.isEmpty() == false;
|
||||
|
||||
BitSet shortFieldMask = new BitSet();
|
||||
|
||||
int maxLength = rows.stream().map(List::size).max(Integer::compareTo).get();
|
||||
for (int index = 0; index < maxLength; ++index) {
|
||||
final int i = index;
|
||||
shortFieldMask.set(i,
|
||||
rows.stream().allMatch(row -> i >= row.size() || row.get(i) == null || row.get(i).length() < longFieldThreshold));
|
||||
}
|
||||
|
||||
return shortFieldMask;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sum of the Levenshtein distances between corresponding elements
|
||||
* in the two supplied lists _excluding_ the biggest difference.
|
||||
* The reason the biggest difference is excluded is that sometimes
|
||||
* there's a "message" field that is much longer than any of the other
|
||||
* fields, varies enormously between rows, and skews the comparison.
|
||||
* in the two supplied lists.
|
||||
*/
|
||||
static int levenshteinFieldwiseCompareRows(List<String> firstRow, List<String> secondRow) {
|
||||
|
||||
int largestSize = Math.max(firstRow.size(), secondRow.size());
|
||||
if (largestSize <= 1) {
|
||||
if (largestSize < 1) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int[] distances = new int[largestSize];
|
||||
BitSet allFields = new BitSet();
|
||||
allFields.set(0, largestSize);
|
||||
|
||||
for (int index = 0; index < largestSize; ++index) {
|
||||
distances[index] = levenshteinDistance((index < firstRow.size()) ? firstRow.get(index) : "",
|
||||
return levenshteinFieldwiseCompareRows(firstRow, secondRow, allFields);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sum of the Levenshtein distances between corresponding elements
|
||||
* in the two supplied lists where the corresponding bit in the
|
||||
* supplied bit mask is set.
|
||||
*/
|
||||
static int levenshteinFieldwiseCompareRows(List<String> firstRow, List<String> secondRow, BitSet fieldMask) {
|
||||
|
||||
int result = 0;
|
||||
|
||||
for (int index = fieldMask.nextSetBit(0); index >= 0; index = fieldMask.nextSetBit(index + 1)) {
|
||||
result += levenshteinDistance((index < firstRow.size()) ? firstRow.get(index) : "",
|
||||
(index < secondRow.size()) ? secondRow.get(index) : "");
|
||||
}
|
||||
|
||||
Arrays.sort(distances);
|
||||
|
||||
return IntStream.of(distances).limit(distances.length - 1).sum();
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -10,12 +10,16 @@ import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
|
||||
import org.supercsv.prefs.CsvPreference;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.BitSet;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import static org.elasticsearch.xpack.ml.filestructurefinder.DelimitedFileStructureFinder.levenshteinFieldwiseCompareRows;
|
||||
import static org.elasticsearch.xpack.ml.filestructurefinder.DelimitedFileStructureFinder.levenshteinDistance;
|
||||
import static org.hamcrest.Matchers.arrayContaining;
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
|
||||
public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
||||
|
||||
@ -449,15 +453,51 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
||||
assertEquals(0, levenshteinDistance("", ""));
|
||||
}
|
||||
|
||||
public void testMakeShortFieldMask() {
|
||||
|
||||
List<List<String>> rows = new ArrayList<>();
|
||||
rows.add(Arrays.asList(randomAlphaOfLength(5), randomAlphaOfLength(20), randomAlphaOfLength(5)));
|
||||
rows.add(Arrays.asList(randomAlphaOfLength(50), randomAlphaOfLength(5), randomAlphaOfLength(5)));
|
||||
rows.add(Arrays.asList(randomAlphaOfLength(5), randomAlphaOfLength(5), randomAlphaOfLength(5)));
|
||||
rows.add(Arrays.asList(randomAlphaOfLength(5), randomAlphaOfLength(5), randomAlphaOfLength(80)));
|
||||
|
||||
BitSet shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 110);
|
||||
assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet("111")));
|
||||
shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 80);
|
||||
assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet("11 ")));
|
||||
shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 50);
|
||||
assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet(" 1 ")));
|
||||
shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 20);
|
||||
assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet(" ")));
|
||||
}
|
||||
|
||||
public void testLevenshteinCompareRows() {
|
||||
|
||||
assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "dog")));
|
||||
assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "cat")));
|
||||
assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("dog", "cat")));
|
||||
assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("mouse", "cat")));
|
||||
assertEquals(5, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "dog", "cat")));
|
||||
assertEquals(4, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "mouse", "mouse")));
|
||||
assertEquals(7, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "cat", "dog")));
|
||||
assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "cat")));
|
||||
assertEquals(6, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("dog", "cat")));
|
||||
assertEquals(8, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("mouse", "cat")));
|
||||
assertEquals(10, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "dog", "cat")));
|
||||
assertEquals(9, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "mouse", "mouse")));
|
||||
assertEquals(12, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "cat", "dog")));
|
||||
}
|
||||
|
||||
public void testLevenshteinCompareRowsWithMask() {
|
||||
|
||||
assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "dog"),
|
||||
TimestampFormatFinder.stringToNumberPosBitSet(randomFrom(" ", "1 ", " 1", "11"))));
|
||||
assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "cat"),
|
||||
TimestampFormatFinder.stringToNumberPosBitSet(randomFrom(" ", "1 "))));
|
||||
assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("dog", "cat"),
|
||||
TimestampFormatFinder.stringToNumberPosBitSet(randomFrom(" 1", "1 "))));
|
||||
assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("mouse", "cat"),
|
||||
TimestampFormatFinder.stringToNumberPosBitSet(" 1")));
|
||||
assertEquals(5, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "dog", "cat"),
|
||||
TimestampFormatFinder.stringToNumberPosBitSet(" 11")));
|
||||
assertEquals(4, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "mouse", "mouse"),
|
||||
TimestampFormatFinder.stringToNumberPosBitSet(" 11")));
|
||||
assertEquals(7, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "cat", "dog"),
|
||||
TimestampFormatFinder.stringToNumberPosBitSet(" 11")));
|
||||
}
|
||||
|
||||
public void testLineHasUnescapedQuote() {
|
||||
|
Loading…
x
Reference in New Issue
Block a user