Aggregations bug: Significant_text fails on arrays of text. (#25030)
* Aggregations bug: Significant_text fails on arrays of text.

The set of previously-seen tokens in a doc was allocated per JSON-field string value rather than once per JSON document, meaning the number of docs containing a term could be over-counted, leading to exceptions from the checks in significance heuristics. Added a unit test for this scenario.

Closes #25029
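The arithmetic behind the failure is easiest to see in isolation. Below is a minimal standalone sketch (plain Java, none of the aggregator's classes; class and variable names are made up for illustration) contrasting deduplication of seen terms per field string value, as the old code effectively did, with deduplication once per document, as this change does. The 10-document, two-array-field setup mirrors the unit test added in this commit.

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Toy illustration only: compares a seen-terms set allocated per field string
// value (old behaviour) with one allocated per document (this fix). Document
// shape matches the new unit test: 10 docs, "text" and "title" both ["foo", "foo"].
public class PerDocDedupSketch {

    public static void main(String[] args) {
        // Each document yields four field string values, every one tokenizing to just "foo".
        List<String> fieldValuesPerDoc = Arrays.asList("foo", "foo", "foo", "foo");
        int numDocs = 10;

        long docCountPerValueDedup = 0; // old: fresh set for every field value
        long docCountPerDocDedup = 0;   // fixed: one set shared across the document

        for (int doc = 0; doc < numDocs; doc++) {
            Set<String> seenInDoc = new HashSet<>();
            for (String value : fieldValuesPerDoc) {
                Set<String> seenInValue = new HashSet<>();
                for (String token : value.split("\\s+")) {
                    if (seenInValue.add(token)) {
                        docCountPerValueDedup++; // "foo" counted once per value => 4x per doc
                    }
                    if (seenInDoc.add(token)) {
                        docCountPerDocDedup++;   // "foo" counted once per doc
                    }
                }
            }
        }

        // Prints 40 vs 10: the per-value version reports "foo" as occurring in 40 docs
        // of a 10-doc subset, the kind of inconsistency the significance heuristics'
        // sanity checks reject with an exception (issue #25029).
        System.out.println("docs containing 'foo', per-value dedup:    " + docCountPerValueDedup);
        System.out.println("docs containing 'foo', per-document dedup: " + docCountPerDocDedup);
    }
}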
This commit is contained in:
parent 7ab3d5d04a
commit 518cda6637
@@ -113,45 +113,40 @@ public class SignificantTextAggregator extends BucketsAggregator {
                 }
             }
 
-            private void processTokenStream(int doc, long bucket, TokenStream ts, String fieldText) throws IOException{
+            private void processTokenStream(int doc, long bucket, TokenStream ts, BytesRefHash inDocTerms, String fieldText)
+                    throws IOException{
                 if (dupSequenceSpotter != null) {
                     ts = new DeDuplicatingTokenFilter(ts, dupSequenceSpotter);
                 }
                 CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
                 ts.reset();
                 try {
-                    //Assume tokens will average 5 bytes in length to size number of tokens
-                    BytesRefHash inDocTerms = new BytesRefHash(1+(fieldText.length()/5), context.bigArrays());
-                    try{
-                        while (ts.incrementToken()) {
-                            if (dupSequenceSpotter != null) {
-                                long newTrieSize = dupSequenceSpotter.getEstimatedSizeInBytes();
-                                long growth = newTrieSize - lastTrieSize;
-                                // Only update the circuitbreaker after
-                                if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) {
-                                    addRequestCircuitBreakerBytes(growth);
-                                    lastTrieSize = newTrieSize;
-                                }
-                            }
-                            previous.clear();
-                            previous.copyChars(termAtt);
-                            BytesRef bytes = previous.get();
-                            if (inDocTerms.add(bytes) >= 0) {
-                                if (includeExclude == null || includeExclude.accept(bytes)) {
-                                    long bucketOrdinal = bucketOrds.add(bytes);
-                                    if (bucketOrdinal < 0) { // already seen
-                                        bucketOrdinal = -1 - bucketOrdinal;
-                                        collectExistingBucket(sub, doc, bucketOrdinal);
-                                    } else {
-                                        collectBucket(sub, doc, bucketOrdinal);
-                                    }
-                                }
-                            }
-                        }
-                    } finally{
-                        Releasables.close(inDocTerms);
-                    }
+                    while (ts.incrementToken()) {
+                        if (dupSequenceSpotter != null) {
+                            long newTrieSize = dupSequenceSpotter.getEstimatedSizeInBytes();
+                            long growth = newTrieSize - lastTrieSize;
+                            // Only update the circuitbreaker after
+                            if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) {
+                                addRequestCircuitBreakerBytes(growth);
+                                lastTrieSize = newTrieSize;
+                            }
+                        }
+                        previous.clear();
+                        previous.copyChars(termAtt);
+                        BytesRef bytes = previous.get();
+                        if (inDocTerms.add(bytes) >= 0) {
+                            if (includeExclude == null || includeExclude.accept(bytes)) {
+                                long bucketOrdinal = bucketOrds.add(bytes);
+                                if (bucketOrdinal < 0) { // already seen
+                                    bucketOrdinal = -1 - bucketOrdinal;
+                                    collectExistingBucket(sub, doc, bucketOrdinal);
+                                } else {
+                                    collectBucket(sub, doc, bucketOrdinal);
+                                }
+                            }
+                        }
+                    }
 
                 } finally{
                     ts.close();
                 }
@@ -166,23 +161,28 @@ public class SignificantTextAggregator extends BucketsAggregator {
                 SourceLookup sourceLookup = context.lookup().source();
                 sourceLookup.setSegmentAndDocument(ctx, doc);
+                BytesRefHash inDocTerms = new BytesRefHash(256, context.bigArrays());
 
-                for (String sourceField : sourceFieldNames) {
-                    List<Object> textsToHighlight = sourceLookup.extractRawValues(sourceField);
-                    textsToHighlight = textsToHighlight.stream().map(obj -> {
-                        if (obj instanceof BytesRef) {
-                            return fieldType.valueForDisplay(obj).toString();
-                        } else {
-                            return obj;
-                        }
-                    }).collect(Collectors.toList());
-
-                    Analyzer analyzer = fieldType.indexAnalyzer();
-                    for (Object fieldValue : textsToHighlight) {
-                        String fieldText = fieldValue.toString();
-                        TokenStream ts = analyzer.tokenStream(indexedFieldName, fieldText);
-                        processTokenStream(doc, bucket, ts, fieldText);
-                    }
-                }
+                try {
+                    for (String sourceField : sourceFieldNames) {
+                        List<Object> textsToHighlight = sourceLookup.extractRawValues(sourceField);
+                        textsToHighlight = textsToHighlight.stream().map(obj -> {
+                            if (obj instanceof BytesRef) {
+                                return fieldType.valueForDisplay(obj).toString();
+                            } else {
+                                return obj;
+                            }
+                        }).collect(Collectors.toList());
+
+                        Analyzer analyzer = fieldType.indexAnalyzer();
+                        for (Object fieldValue : textsToHighlight) {
+                            String fieldText = fieldValue.toString();
+                            TokenStream ts = analyzer.tokenStream(indexedFieldName, fieldText);
+                            processTokenStream(doc, bucket, ts, inDocTerms, fieldText);
+                        }
+                    }
+                } finally{
+                    Releasables.close(inDocTerms);
+                }
             }
         };
 
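Taken together, the two hunks above move ownership of the per-document term set out of processTokenStream: the collector now allocates a single BytesRefHash before walking the configured source fields, passes it into processTokenStream for every extracted string value (so every entry of an array-valued field shares the same set), and releases it in a finally block via Releasables.close. The sizing also changes from a per-value estimate based on fieldText.length() to a fixed initial capacity of 256, since the hash now outlives any single field value.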
@@ -123,4 +123,36 @@ public class SignificantTextAggregatorTests extends AggregatorTestCase {
             }
         }
     }
+
+    /**
+     * Test documents with arrays of text
+     */
+    public void testSignificanceOnTextArrays() throws IOException {
+        TextFieldType textFieldType = new TextFieldType();
+        textFieldType.setName("text");
+        textFieldType.setIndexAnalyzer(new NamedAnalyzer("my_analyzer", AnalyzerScope.GLOBAL, new StandardAnalyzer()));
+
+        IndexWriterConfig indexWriterConfig = newIndexWriterConfig();
+        try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {
+            for (int i = 0; i < 10; i++) {
+                Document doc = new Document();
+                doc.add(new Field("text", "foo", textFieldType));
+                String json ="{ \"text\" : [\"foo\",\"foo\"], \"title\" : [\"foo\", \"foo\"]}";
+                doc.add(new StoredField("_source", new BytesRef(json)));
+                w.addDocument(doc);
+            }
+
+            SignificantTextAggregationBuilder sigAgg = new SignificantTextAggregationBuilder("sig_text", "text");
+            sigAgg.sourceFieldNames(Arrays.asList(new String [] {"title", "text"}));
+            try (IndexReader reader = DirectoryReader.open(w)) {
+                assertEquals("test expects a single segment", 1, reader.leaves().size());
+                IndexSearcher searcher = new IndexSearcher(reader);
+                searchAndReduce(searcher, new TermQuery(new Term("text", "foo")), sigAgg, textFieldType);
+                // No significant results to be found in this test - only checking we don't end up
+                // with the internal exception discovered in issue https://github.com/elastic/elasticsearch/issues/25029
+            }
+        }
+    }
+
+
 }