From 518cda6637a4480fde5d7dde044dab5b2a0e2936 Mon Sep 17 00:00:00 2001
From: markharwood
Date: Mon, 12 Jun 2017 14:02:54 +0100
Subject: [PATCH] Aggregations bug: Significant_text fails on arrays of text. (#25030)

* Aggregations bug: Significant_text fails on arrays of text.

The set of previously-seen tokens in a doc was allocated per-JSON-field string
value rather than once per JSON document, meaning the number of docs containing
a term could be over-counted, leading to exceptions from the checks in
significance heuristics.

Added unit test for this scenario.

Closes #25029
---
 .../SignificantTextAggregator.java      | 88 +++++++++----------
 .../SignificantTextAggregatorTests.java | 32 +++++++
 2 files changed, 76 insertions(+), 44 deletions(-)

diff --git a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregator.java b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregator.java
index c7539a4ca02..8b58678b676 100644
--- a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregator.java
+++ b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregator.java
@@ -113,45 +113,40 @@ public class SignificantTextAggregator extends BucketsAggregator {
                 }
             }
 
-            private void processTokenStream(int doc, long bucket, TokenStream ts, String fieldText) throws IOException{
+            private void processTokenStream(int doc, long bucket, TokenStream ts, BytesRefHash inDocTerms, String fieldText)
+                    throws IOException{
                 if (dupSequenceSpotter != null) {
                     ts = new DeDuplicatingTokenFilter(ts, dupSequenceSpotter);
                 }
                 CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
                 ts.reset();
                 try {
-                    //Assume tokens will average 5 bytes in length to size number of tokens
-                    BytesRefHash inDocTerms = new BytesRefHash(1+(fieldText.length()/5), context.bigArrays());
-
-                    try{
-                        while (ts.incrementToken()) {
-                            if (dupSequenceSpotter != null) {
-                                long newTrieSize = dupSequenceSpotter.getEstimatedSizeInBytes();
-                                long growth = newTrieSize - lastTrieSize;
-                                // Only update the circuitbreaker after
-                                if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) {
-                                    addRequestCircuitBreakerBytes(growth);
-                                    lastTrieSize = newTrieSize;
-                                }
+                    while (ts.incrementToken()) {
+                        if (dupSequenceSpotter != null) {
+                            long newTrieSize = dupSequenceSpotter.getEstimatedSizeInBytes();
+                            long growth = newTrieSize - lastTrieSize;
+                            // Only update the circuitbreaker after
+                            if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) {
+                                addRequestCircuitBreakerBytes(growth);
+                                lastTrieSize = newTrieSize;
                             }
-                            previous.clear();
-                            previous.copyChars(termAtt);
-                            BytesRef bytes = previous.get();
-                            if (inDocTerms.add(bytes) >= 0) {
-                                if (includeExclude == null || includeExclude.accept(bytes)) {
-                                    long bucketOrdinal = bucketOrds.add(bytes);
-                                    if (bucketOrdinal < 0) { // already seen
-                                        bucketOrdinal = -1 - bucketOrdinal;
-                                        collectExistingBucket(sub, doc, bucketOrdinal);
-                                    } else {
-                                        collectBucket(sub, doc, bucketOrdinal);
-                                    }
+                        }
+                        previous.clear();
+                        previous.copyChars(termAtt);
+                        BytesRef bytes = previous.get();
+                        if (inDocTerms.add(bytes) >= 0) {
+                            if (includeExclude == null || includeExclude.accept(bytes)) {
+                                long bucketOrdinal = bucketOrds.add(bytes);
+                                if (bucketOrdinal < 0) { // already seen
+                                    bucketOrdinal = -1 - bucketOrdinal;
+                                    collectExistingBucket(sub, doc, bucketOrdinal);
+                                } else {
+                                    collectBucket(sub, doc, bucketOrdinal);
                                 }
                             }
                         }
-                    } finally{
-                        Releasables.close(inDocTerms);
                     }
+                } finally{
                     ts.close();
                 }
@@ -166,23 +161,28 @@ public class SignificantTextAggregator extends BucketsAggregator {
 
                 SourceLookup sourceLookup = context.lookup().source();
                 sourceLookup.setSegmentAndDocument(ctx, doc);
+                BytesRefHash inDocTerms = new BytesRefHash(256, context.bigArrays());
 
-                for (String sourceField : sourceFieldNames) {
-                    List<Object> textsToHighlight = sourceLookup.extractRawValues(sourceField);
-                    textsToHighlight = textsToHighlight.stream().map(obj -> {
-                        if (obj instanceof BytesRef) {
-                            return fieldType.valueForDisplay(obj).toString();
-                        } else {
-                            return obj;
-                        }
-                    }).collect(Collectors.toList());
-
-                    Analyzer analyzer = fieldType.indexAnalyzer();
-                    for (Object fieldValue : textsToHighlight) {
-                        String fieldText = fieldValue.toString();
-                        TokenStream ts = analyzer.tokenStream(indexedFieldName, fieldText);
-                        processTokenStream(doc, bucket, ts, fieldText);
-                    }
+                try {
+                    for (String sourceField : sourceFieldNames) {
+                        List<Object> textsToHighlight = sourceLookup.extractRawValues(sourceField);
+                        textsToHighlight = textsToHighlight.stream().map(obj -> {
+                            if (obj instanceof BytesRef) {
+                                return fieldType.valueForDisplay(obj).toString();
+                            } else {
+                                return obj;
+                            }
+                        }).collect(Collectors.toList());
+
+                        Analyzer analyzer = fieldType.indexAnalyzer();
+                        for (Object fieldValue : textsToHighlight) {
+                            String fieldText = fieldValue.toString();
+                            TokenStream ts = analyzer.tokenStream(indexedFieldName, fieldText);
+                            processTokenStream(doc, bucket, ts, inDocTerms, fieldText);
+                        }
+                    }
+                } finally{
+                    Releasables.close(inDocTerms);
                 }
             }
         };
diff --git a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorTests.java b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorTests.java
index c8d8b6d5979..8376d8c57a1 100644
--- a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorTests.java
+++ b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorTests.java
@@ -123,4 +123,36 @@ public class SignificantTextAggregatorTests extends AggregatorTestCase {
             }
         }
     }
+
+    /**
+     * Test documents with arrays of text
+     */
+    public void testSignificanceOnTextArrays() throws IOException {
+        TextFieldType textFieldType = new TextFieldType();
+        textFieldType.setName("text");
+        textFieldType.setIndexAnalyzer(new NamedAnalyzer("my_analyzer", AnalyzerScope.GLOBAL, new StandardAnalyzer()));
+
+        IndexWriterConfig indexWriterConfig = newIndexWriterConfig();
+        try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {
+            for (int i = 0; i < 10; i++) {
+                Document doc = new Document();
+                doc.add(new Field("text", "foo", textFieldType));
+                String json ="{ \"text\" : [\"foo\",\"foo\"], \"title\" : [\"foo\", \"foo\"]}";
+                doc.add(new StoredField("_source", new BytesRef(json)));
+                w.addDocument(doc);
+            }
+
+            SignificantTextAggregationBuilder sigAgg = new SignificantTextAggregationBuilder("sig_text", "text");
+            sigAgg.sourceFieldNames(Arrays.asList(new String [] {"title", "text"}));
+            try (IndexReader reader = DirectoryReader.open(w)) {
+                assertEquals("test expects a single segment", 1, reader.leaves().size());
+                IndexSearcher searcher = new IndexSearcher(reader);
+                searchAndReduce(searcher, new TermQuery(new Term("text", "foo")), sigAgg, textFieldType);
+                // No significant results to be found in this test - only checking we don't end up
+                // with the internal exception discovered in issue https://github.com/elastic/elasticsearch/issues/25029
+            }
+        }
+    }
+
 }
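
For readers unfamiliar with the failure mode described in the commit message, below is a minimal, self-contained sketch of the over-counting it causes. This is not Elasticsearch code: plain JDK collections stand in for Lucene's BytesRefHash, and the class and method names (PerDocumentDedupSketch, docFreqPerValueDedup, docFreqPerDocumentDedup) are invented for illustration. It contrasts allocating the "seen terms" set once per field-value string (the old behaviour) with allocating it once per document (the fix), using a document shaped like the JSON in the new unit test.

// Illustrative only -- not Elasticsearch code. Plain JDK collections stand in
// for Lucene's BytesRefHash, and all names below are invented for this sketch.
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class PerDocumentDedupSketch {

    // Old behaviour: a fresh "seen" set is allocated for every field-value string,
    // so a term appearing in several values of the same document is counted once
    // per value instead of once per document.
    static Map<String, Integer> docFreqPerValueDedup(List<List<String>> docs) {
        Map<String, Integer> docFreq = new HashMap<>();
        for (List<String> fieldValues : docs) {
            for (String value : fieldValues) {
                Set<String> seenInValue = new HashSet<>();
                for (String term : value.split("\\s+")) {
                    if (seenInValue.add(term)) {
                        docFreq.merge(term, 1, Integer::sum);
                    }
                }
            }
        }
        return docFreq;
    }

    // Fixed behaviour: one "seen" set per document, shared across all of its
    // field values, so each term is counted at most once per document.
    static Map<String, Integer> docFreqPerDocumentDedup(List<List<String>> docs) {
        Map<String, Integer> docFreq = new HashMap<>();
        for (List<String> fieldValues : docs) {
            Set<String> seenInDoc = new HashSet<>();
            for (String value : fieldValues) {
                for (String term : value.split("\\s+")) {
                    if (seenInDoc.add(term)) {
                        docFreq.merge(term, 1, Integer::sum);
                    }
                }
            }
        }
        return docFreq;
    }

    public static void main(String[] args) {
        // One document whose "text" and "title" arrays each hold "foo" twice,
        // mirroring the JSON used in testSignificanceOnTextArrays above.
        List<List<String>> docs = Arrays.asList(Arrays.asList("foo", "foo", "foo", "foo"));

        System.out.println(docFreqPerValueDedup(docs));    // {foo=4}  (over-counted: only one doc exists)
        System.out.println(docFreqPerDocumentDedup(docs)); // {foo=1}
    }
}

With per-value deduplication the single document above yields a document count of 4 for "foo", more than the one document in the index, which is the kind of inconsistency that trips the checks in the significance heuristics; per-document deduplication caps the count at 1.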