Limit the number of extracted token instances per query token.

FVH deploys some recursive logic to extract terms from documents
that need to be highlighted. For documents that have terms with super
large term frequency, like a document that repeats a term very
very often, this can produce some very large stacks when extracting
the terms. Taken to an extreme this causes stack overflow errors
when the term frequency grows beyond >= 6000.

The ultimate solution is an iterative implementation of the extract
logic but until then we should protect users from these massive
term extractions, which might not be very useful in the first place.

Closes #3486
This commit is contained in:
Simon Willnauer 2013-08-12 16:06:40 +02:00
parent ab6163898f
commit 8a876ea80e
2 changed files with 41 additions and 1 deletions

View File

@@ -114,7 +114,10 @@ public class XFieldTermStack {
// For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
final float weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( new Term(fieldName, text) ) + 1 ) ) + 1.0 );
final int freq = dpEnum.freq();
// ES EDIT: added a safety check to limit this to 512 terms everything above might be meaningless anyways
// This limit protects the FVH from running into StackOverflowErrors if super large TF docs are highlighted.
final int freq = Math.min(512, dpEnum.freq());
for(int i = 0;i < freq;i++) {
int pos = dpEnum.nextPosition();

View File

@@ -62,6 +62,43 @@ public class HighlighterSearchTests extends AbstractSharedClusterTest {
// Number of cluster nodes used by this shared-cluster test class.
protected int numberOfNodes() {
return 4; // TODO(review): document why exactly 4 nodes are required here
}
@Test
// see #3486
// Regression test: highlighting a document in which a single term repeats 6000
// times used to trigger a StackOverflowError in the FastVectorHighlighter's
// recursive term extraction. The fix caps extracted term instances per query
// token (at 512), so this query must now succeed and still produce highlights.
public void testHighTermFrequencyDoc() throws ElasticSearchException, IOException {
wipeIndex("test");
// "name" is mapped with term vectors (positions + offsets) so the FVH code
// path under test is actually exercised; stored vs. source-loaded is randomized.
client().admin().indices().prepareCreate("test")
.addMapping("test", jsonBuilder()
.startObject()
.startObject("test")
.startObject("properties")
.startObject("name")
.field("type", "string")
.field("term_vector", "with_positions_offsets")
.field("store", randomBoolean() ? "yes" : "no")
.endObject()
.endObject()
.endObject()
.endObject())
.setSettings(ImmutableSettings.settingsBuilder()
.put("index.number_of_shards", between(1, 5)))
.execute().actionGet();
ensureYellow();
// Build a field value with term frequency 6000 for "abc" — past the depth
// that previously overflowed the stack (>= 6000, per issue #3486).
StringBuilder builder = new StringBuilder();
for (int i = 0; i < 6000; i++) {
builder.append("abc").append(" ");
}
client().prepareIndex("test", "test", "1")
.setSource(XContentFactory.jsonBuilder()
.startObject()
.field("name", builder.toString())
.endObject())
.execute().actionGet();
refresh();
// Must not throw; highlighting should still emit the leading matches even
// though extraction is truncated at the 512-instance safety limit.
SearchResponse search = client().prepareSearch().setQuery(constantScoreQuery(matchQuery("name", "abc"))).addHighlightedField("name").execute().actionGet();
assertHighlight(search, 0, "name", 0, startsWith("<em>abc</em> <em>abc</em> <em>abc</em> <em>abc</em>"));
}
@Test
public void testNgramHighlightingWithBrokenPositions() throws ElasticSearchException, IOException {