Aggregations bug: Significant_text fails on arrays of text. (#25030)

* Aggregations bug: Significant_text fails on arrays of text. The set of previously-seen tokens in a doc was allocated once per JSON field string value rather than once per JSON document, meaning the number of docs containing a term could be over-counted, leading to exceptions from the checks in the significance heuristics. Added a unit test for this scenario. Closes #25029
2017-06-12 14:02:54 +01:00 · 2017-06-12 14:02:54 +01:00 · 518cda6637
parent 7ab3d5d04a
commit 518cda6637
2 changed files with 76 additions and 44 deletions
--- a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregator.java
+++ b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregator.java
@ -113,17 +113,14 @@ public class SignificantTextAggregator extends BucketsAggregator {
                }
            }
            
-            private void processTokenStream(int doc, long bucket, TokenStream ts, String fieldText) throws IOException{
+            private void processTokenStream(int doc, long bucket, TokenStream ts, BytesRefHash inDocTerms, String fieldText) 
+                    throws IOException{
                if (dupSequenceSpotter != null) {
                    ts = new DeDuplicatingTokenFilter(ts, dupSequenceSpotter);
                }
                CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
                ts.reset();
                try {
-                    //Assume tokens will average 5 bytes in length to size number of tokens
-                    BytesRefHash inDocTerms = new BytesRefHash(1+(fieldText.length()/5), context.bigArrays());
-                    
-                    try{
                    while (ts.incrementToken()) {
                        if (dupSequenceSpotter != null) {
                            long newTrieSize = dupSequenceSpotter.getEstimatedSizeInBytes();
@ -149,9 +146,7 @@ public class SignificantTextAggregator extends BucketsAggregator {
                            }
                        }
                    }
-                    } finally{
-                        Releasables.close(inDocTerms);
-                    }
+
                } finally{
                    ts.close();
                }
@ -166,7 +161,9 @@ public class SignificantTextAggregator extends BucketsAggregator {

                SourceLookup sourceLookup = context.lookup().source();
                sourceLookup.setSegmentAndDocument(ctx, doc);
+                BytesRefHash inDocTerms = new BytesRefHash(256, context.bigArrays());
                
+                try {                
                    for (String sourceField : sourceFieldNames) {
                        List<Object> textsToHighlight = sourceLookup.extractRawValues(sourceField);    
                        textsToHighlight = textsToHighlight.stream().map(obj -> {
@ -181,9 +178,12 @@ public class SignificantTextAggregator extends BucketsAggregator {
                        for (Object fieldValue : textsToHighlight) {
                            String fieldText = fieldValue.toString();
                            TokenStream ts = analyzer.tokenStream(indexedFieldName, fieldText);
-                        processTokenStream(doc, bucket, ts, fieldText);                     
+                            processTokenStream(doc, bucket, ts, inDocTerms, fieldText);                     
                        }                    
                    }
+                } finally{
+                    Releasables.close(inDocTerms);
+                }
            }
        };
    }
--- a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorTests.java
+++ b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorTests.java
@ -123,4 +123,36 @@ public class SignificantTextAggregatorTests extends AggregatorTestCase {
            }
        }
    }
+    
+    /**
+     * Tests that documents whose _source fields hold arrays of text can be aggregated without an exception being thrown
+     */
+    public void testSignificanceOnTextArrays() throws IOException {
+        TextFieldType textFieldType = new TextFieldType();
+        textFieldType.setName("text");
+        textFieldType.setIndexAnalyzer(new NamedAnalyzer("my_analyzer", AnalyzerScope.GLOBAL, new StandardAnalyzer()));
+
+        IndexWriterConfig indexWriterConfig = newIndexWriterConfig();
+        try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {
+            for (int i = 0; i < 10; i++) {
+                Document doc = new Document();
+                doc.add(new Field("text", "foo", textFieldType));
+                String json ="{ \"text\" : [\"foo\",\"foo\"], \"title\" : [\"foo\", \"foo\"]}";
+                doc.add(new StoredField("_source", new BytesRef(json)));
+                w.addDocument(doc);
+            }
+
+            SignificantTextAggregationBuilder sigAgg = new SignificantTextAggregationBuilder("sig_text", "text");
+            sigAgg.sourceFieldNames(Arrays.asList(new String [] {"title", "text"}));
+            try (IndexReader reader = DirectoryReader.open(w)) {
+                assertEquals("test expects a single segment", 1, reader.leaves().size());
+                IndexSearcher searcher = new IndexSearcher(reader);                                
+                searchAndReduce(searcher, new TermQuery(new Term("text", "foo")), sigAgg, textFieldType);
+                // No significant results to be found in this test - only checking we don't end up
+                // with the internal exception discovered in issue https://github.com/elastic/elasticsearch/issues/25029
+            }
+        }
+    }
+    
+    
 }