Aggregations bug: Significant_text fails on arrays of text. (#25030)

* Aggregations bug: Significant_text fails on arrays of text.
The set of previously-seen tokens in a doc was allocated per-JSON-field string value rather than once per JSON document meaning the number of docs containing a term could be over-counted leading to exceptions from the checks in significance heuristics. Added unit test for this scenario

Closes #25029
This commit is contained in:
markharwood 2017-06-12 14:02:54 +01:00 committed by GitHub
parent 7ab3d5d04a
commit 518cda6637
2 changed files with 76 additions and 44 deletions

View File

@ -113,45 +113,40 @@ public class SignificantTextAggregator extends BucketsAggregator {
} }
} }
private void processTokenStream(int doc, long bucket, TokenStream ts, String fieldText) throws IOException{ private void processTokenStream(int doc, long bucket, TokenStream ts, BytesRefHash inDocTerms, String fieldText)
throws IOException{
if (dupSequenceSpotter != null) { if (dupSequenceSpotter != null) {
ts = new DeDuplicatingTokenFilter(ts, dupSequenceSpotter); ts = new DeDuplicatingTokenFilter(ts, dupSequenceSpotter);
} }
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset(); ts.reset();
try { try {
//Assume tokens will average 5 bytes in length to size number of tokens while (ts.incrementToken()) {
BytesRefHash inDocTerms = new BytesRefHash(1+(fieldText.length()/5), context.bigArrays()); if (dupSequenceSpotter != null) {
long newTrieSize = dupSequenceSpotter.getEstimatedSizeInBytes();
try{ long growth = newTrieSize - lastTrieSize;
while (ts.incrementToken()) { // Only update the circuitbreaker after
if (dupSequenceSpotter != null) { if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) {
long newTrieSize = dupSequenceSpotter.getEstimatedSizeInBytes(); addRequestCircuitBreakerBytes(growth);
long growth = newTrieSize - lastTrieSize; lastTrieSize = newTrieSize;
// Only update the circuitbreaker after
if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) {
addRequestCircuitBreakerBytes(growth);
lastTrieSize = newTrieSize;
}
} }
previous.clear(); }
previous.copyChars(termAtt); previous.clear();
BytesRef bytes = previous.get(); previous.copyChars(termAtt);
if (inDocTerms.add(bytes) >= 0) { BytesRef bytes = previous.get();
if (includeExclude == null || includeExclude.accept(bytes)) { if (inDocTerms.add(bytes) >= 0) {
long bucketOrdinal = bucketOrds.add(bytes); if (includeExclude == null || includeExclude.accept(bytes)) {
if (bucketOrdinal < 0) { // already seen long bucketOrdinal = bucketOrds.add(bytes);
bucketOrdinal = -1 - bucketOrdinal; if (bucketOrdinal < 0) { // already seen
collectExistingBucket(sub, doc, bucketOrdinal); bucketOrdinal = -1 - bucketOrdinal;
} else { collectExistingBucket(sub, doc, bucketOrdinal);
collectBucket(sub, doc, bucketOrdinal); } else {
} collectBucket(sub, doc, bucketOrdinal);
} }
} }
} }
} finally{
Releasables.close(inDocTerms);
} }
} finally{ } finally{
ts.close(); ts.close();
} }
@ -166,23 +161,28 @@ public class SignificantTextAggregator extends BucketsAggregator {
SourceLookup sourceLookup = context.lookup().source(); SourceLookup sourceLookup = context.lookup().source();
sourceLookup.setSegmentAndDocument(ctx, doc); sourceLookup.setSegmentAndDocument(ctx, doc);
BytesRefHash inDocTerms = new BytesRefHash(256, context.bigArrays());
for (String sourceField : sourceFieldNames) { try {
List<Object> textsToHighlight = sourceLookup.extractRawValues(sourceField); for (String sourceField : sourceFieldNames) {
textsToHighlight = textsToHighlight.stream().map(obj -> { List<Object> textsToHighlight = sourceLookup.extractRawValues(sourceField);
if (obj instanceof BytesRef) { textsToHighlight = textsToHighlight.stream().map(obj -> {
return fieldType.valueForDisplay(obj).toString(); if (obj instanceof BytesRef) {
} else { return fieldType.valueForDisplay(obj).toString();
return obj; } else {
return obj;
}
}).collect(Collectors.toList());
Analyzer analyzer = fieldType.indexAnalyzer();
for (Object fieldValue : textsToHighlight) {
String fieldText = fieldValue.toString();
TokenStream ts = analyzer.tokenStream(indexedFieldName, fieldText);
processTokenStream(doc, bucket, ts, inDocTerms, fieldText);
} }
}).collect(Collectors.toList());
Analyzer analyzer = fieldType.indexAnalyzer();
for (Object fieldValue : textsToHighlight) {
String fieldText = fieldValue.toString();
TokenStream ts = analyzer.tokenStream(indexedFieldName, fieldText);
processTokenStream(doc, bucket, ts, fieldText);
} }
} finally{
Releasables.close(inDocTerms);
} }
} }
}; };

View File

@ -123,4 +123,36 @@ public class SignificantTextAggregatorTests extends AggregatorTestCase {
} }
} }
} }
/**
* Test documents with arrays of text
*/
public void testSignificanceOnTextArrays() throws IOException {
TextFieldType textFieldType = new TextFieldType();
textFieldType.setName("text");
textFieldType.setIndexAnalyzer(new NamedAnalyzer("my_analyzer", AnalyzerScope.GLOBAL, new StandardAnalyzer()));
IndexWriterConfig indexWriterConfig = newIndexWriterConfig();
try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {
for (int i = 0; i < 10; i++) {
Document doc = new Document();
doc.add(new Field("text", "foo", textFieldType));
String json ="{ \"text\" : [\"foo\",\"foo\"], \"title\" : [\"foo\", \"foo\"]}";
doc.add(new StoredField("_source", new BytesRef(json)));
w.addDocument(doc);
}
SignificantTextAggregationBuilder sigAgg = new SignificantTextAggregationBuilder("sig_text", "text");
sigAgg.sourceFieldNames(Arrays.asList(new String [] {"title", "text"}));
try (IndexReader reader = DirectoryReader.open(w)) {
assertEquals("test expects a single segment", 1, reader.leaves().size());
IndexSearcher searcher = new IndexSearcher(reader);
searchAndReduce(searcher, new TermQuery(new Term("text", "foo")), sigAgg, textFieldType);
// No significant results to be found in this test - only checking we don't end up
// with the internal exception discovered in issue https://github.com/elastic/elasticsearch/issues/25029
}
}
}
} }