Aggregations bug: Significant_text fails on arrays of text. (#25030)

* Aggregations bug: Significant_text fails on arrays of text. The set of previously-seen tokens in a doc was allocated once per JSON field string value rather than once per JSON document, meaning the number of docs containing a term could be over-counted, leading to exceptions from the checks in the significance heuristics. Added a unit test for this scenario. Closes #25029
2017-06-12 14:02:54 +01:00 · 2017-06-12 14:02:54 +01:00 · 518cda6637
parent 7ab3d5d04a
commit 518cda6637
2 changed files with 76 additions and 44 deletions
--- a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregator.java
+++ b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregator.java
@ -113,16 +113,13 @@ public class SignificantTextAggregator extends BucketsAggregator {
                }
            }
-            private void processTokenStream(int doc, long bucket, TokenStream ts, String fieldText) throws IOException{
+            private void processTokenStream(int doc, long bucket, TokenStream ts, BytesRefHash inDocTerms, String fieldText) 
                    throws IOException{
                if (dupSequenceSpotter != null) {
                    ts = new DeDuplicatingTokenFilter(ts, dupSequenceSpotter);
                }
                CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
                ts.reset();
                try {
                    //Assume tokens will average 5 bytes in length to size number of tokens
                    BytesRefHash inDocTerms = new BytesRefHash(1+(fieldText.length()/5), context.bigArrays());
                try {
                    while (ts.incrementToken()) {
                        if (dupSequenceSpotter != null) {
@ -149,9 +146,7 @@ public class SignificantTextAggregator extends BucketsAggregator {
                            }
                        }
                    }
-                    } finally{
+
                        Releasables.close(inDocTerms);
                    }
                } finally{
                    ts.close();
                }
@ -166,7 +161,9 @@ public class SignificantTextAggregator extends BucketsAggregator {
                SourceLookup sourceLookup = context.lookup().source();
                sourceLookup.setSegmentAndDocument(ctx, doc);
                BytesRefHash inDocTerms = new BytesRefHash(256, context.bigArrays());
                try {                
                    for (String sourceField : sourceFieldNames) {
                        List<Object> textsToHighlight = sourceLookup.extractRawValues(sourceField);    
                        textsToHighlight = textsToHighlight.stream().map(obj -> {
@ -181,9 +178,12 @@ public class SignificantTextAggregator extends BucketsAggregator {
                        for (Object fieldValue : textsToHighlight) {
                            String fieldText = fieldValue.toString();
                            TokenStream ts = analyzer.tokenStream(indexedFieldName, fieldText);
-                        processTokenStream(doc, bucket, ts, fieldText);                     
+                            processTokenStream(doc, bucket, ts, inDocTerms, fieldText);                     
                        }                    
                    }
                } finally{
                    Releasables.close(inDocTerms);
                }
            }
        };
    }
--- a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorTests.java
+++ b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorTests.java
@ -123,4 +123,36 @@ public class SignificantTextAggregatorTests extends AggregatorTestCase {
            }
        }
    }
    /**
     * Runs a significant_text aggregation over documents whose {@code _source}
     * holds arrays of text, using two source fields that both contain the same
     * repeated token.
     * <p>
     * The test makes no assertion about the buckets returned; it only verifies
     * that the search-and-reduce cycle completes without throwing.
     */
    public void testSignificanceOnTextArrays() throws IOException {
        TextFieldType fieldType = new TextFieldType();
        fieldType.setName("text");
        NamedAnalyzer indexAnalyzer = new NamedAnalyzer("my_analyzer", AnalyzerScope.GLOBAL, new StandardAnalyzer());
        fieldType.setIndexAnalyzer(indexAnalyzer);

        // Every document carries the same source: two-element arrays of "foo" in both fields
        final String source = "{ \"text\" : [\"foo\",\"foo\"], \"title\" : [\"foo\", \"foo\"]}";
        final int numDocs = 10;

        IndexWriterConfig writerConfig = newIndexWriterConfig();
        try (Directory directory = newDirectory(); IndexWriter writer = new IndexWriter(directory, writerConfig)) {
            for (int docNum = 0; docNum < numDocs; docNum++) {
                Document document = new Document();
                document.add(new Field("text", "foo", fieldType));
                document.add(new StoredField("_source", new BytesRef(source)));
                writer.addDocument(document);
            }

            // Analyze both JSON fields so the same token is seen several times per document
            SignificantTextAggregationBuilder aggBuilder = new SignificantTextAggregationBuilder("sig_text", "text");
            aggBuilder.sourceFieldNames(Arrays.asList("title", "text"));

            try (IndexReader indexReader = DirectoryReader.open(writer)) {
                assertEquals("test expects a single segment", 1, indexReader.leaves().size());
                IndexSearcher indexSearcher = new IndexSearcher(indexReader);
                TermQuery matchFoo = new TermQuery(new Term("text", "foo"));
                // No significant results are expected here - the point is only that we do not hit
                // the internal exception reported in https://github.com/elastic/elasticsearch/issues/25029
                searchAndReduce(indexSearcher, matchFoo, aggBuilder, fieldType);
            }
        }
    }
 }