Aggs fix - the background count for docs should include deleted docs; otherwise a term's docFreq (which includes deleted docs) can exceed the number of docs reported for the index and cause an exception.
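
To make the failure mode concrete, here is a minimal standalone Lucene sketch - not part of this commit, and assuming Lucene 4.x on the classpath (the line ES 1.x builds against). It shows that docFreq still counts deleted docs until segments merge, while numDocs() does not, so docFreq can exceed numDocs() on an index with deletes:

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class DocFreqVsNumDocs {
    public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_0,
                new WhitespaceAnalyzer(Version.LUCENE_4_10_0));
        try (IndexWriter writer = new IndexWriter(dir, config)) {
            // Index three docs that all contain the term text:constant
            for (int i = 0; i < 3; i++) {
                Document doc = new Document();
                doc.add(new StringField("id", Integer.toString(i), Field.Store.NO));
                doc.add(new StringField("text", "constant", Field.Store.NO));
                writer.addDocument(doc);
            }
            writer.commit();
            writer.deleteDocuments(new Term("id", "0")); // marks the doc deleted; no merge runs
            writer.commit();
        }
        try (IndexReader reader = DirectoryReader.open(dir)) {
            // docFreq is still 3, but numDocs() has dropped to 2, so any
            // stats that mix the two can see docFreq > "index size".
            System.out.println("docFreq(text:constant) = " + reader.docFreq(new Term("text", "constant"))); // 3
            System.out.println("numDocs = " + reader.numDocs()); // 2 - excludes deleted docs
            System.out.println("maxDoc  = " + reader.maxDoc());  // 3 - what the fix uses as the background size
        }
    }
}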

The randomisation that deletes documents is also removed from the tests, because with this doc-accounting change the specific scores the tests expect would be subject to random variability and the tests would fail.
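
In the diffs below this shows up as every indexRandom call gaining a second argument. A hedged note on what that flag does, as assumed from the ES 1.x test framework (ElasticsearchIntegrationTest) rather than quoted from it:

// Assumed signature:
// indexRandom(boolean forceRefresh, boolean dummyDocuments, List<IndexRequestBuilder> builders)
// With dummyDocuments == true the framework may index filler documents and delete
// them again afterwards, leaving deleted docs that would randomly shift the
// background counts; pinning it to false keeps the expected scores deterministic.
indexRandom(true, false, indexRequestBuilderList);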

Closes #7951
markharwood 2014-10-02 13:49:28 +01:00
parent f0052a58d6
commit f878f40ae5
3 changed files with 50 additions and 5 deletions

@@ -71,7 +71,9 @@ public class FilterableTermsEnum extends TermsEnum {
         }
         this.docsEnumFlag = docsEnumFlag;
         if (filter == null) {
-            numDocs = reader.numDocs();
+            // Important - need to use the doc count that includes deleted docs
+            // or we have this issue: https://github.com/elasticsearch/elasticsearch/issues/7951
+            numDocs = reader.maxDoc();
         }
         ApplyAcceptedDocsFilter acceptedDocsFilter = filter == null ? null : new ApplyAcceptedDocsFilter(filter);
         List<AtomicReaderContext> leaves = reader.leaves();
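
For context on the exception this avoids: the significance heuristics validate that a term's frequency never exceeds the size of the set it was counted against. A hedged sketch of that kind of guard (method name and messages assumed, not copied from the Elasticsearch source):

// Sketch of the frequency sanity check that issue #7951 tripped. With
// numDocs() as supersetSize, a term whose docFreq still counts deleted docs
// can violate the second condition; maxDoc() keeps both counts consistent.
static void checkFrequencies(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
    if (subsetFreq > subsetSize) {
        throw new IllegalArgumentException("subsetFreq > subsetSize");
    }
    if (supersetFreq > supersetSize) {
        throw new IllegalArgumentException("supersetFreq > supersetSize");
    }
}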

@@ -262,6 +262,49 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegrationTest {
         assertThat(responseBuilder.string(), equalTo(result));
     }
 
+    @Test
+    public void testDeletesIssue7951() throws Exception {
+        String settings = "{\"index.number_of_shards\": 1, \"index.number_of_replicas\": 0}";
+        String mappings = "{\"doc\": {\"properties\":{\"text\": {\"type\":\"string\",\"index\":\"not_analyzed\"}}}}";
+        assertAcked(prepareCreate(INDEX_NAME).setSettings(settings).addMapping("doc", mappings));
+        String[] cat1v1 = {"constant", "one"};
+        String[] cat1v2 = {"constant", "uno"};
+        String[] cat2v1 = {"constant", "two"};
+        String[] cat2v2 = {"constant", "duo"};
+        List<IndexRequestBuilder> indexRequestBuilderList = new ArrayList<>();
+        indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "1")
+                .setSource(TEXT_FIELD, cat1v1, CLASS_FIELD, "1"));
+        indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "2")
+                .setSource(TEXT_FIELD, cat1v2, CLASS_FIELD, "1"));
+        indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "3")
+                .setSource(TEXT_FIELD, cat2v1, CLASS_FIELD, "2"));
+        indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "4")
+                .setSource(TEXT_FIELD, cat2v2, CLASS_FIELD, "2"));
+        indexRandom(true, false, indexRequestBuilderList);
+
+        // Now create some holes in the index with selective deletes caused by updates.
+        // This is the scenario that caused this issue: https://github.com/elasticsearch/elasticsearch/issues/7951
+        // Scoring algorithms throw exceptions if term docFreqs exceed the reported size of the index
+        // from which they are taken, so we need to make sure this doesn't happen.
+        String[] text = cat1v1;
+        indexRequestBuilderList.clear();
+        for (int i = 0; i < 50; i++) {
+            text = text == cat1v2 ? cat1v1 : cat1v2;
+            indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "1").setSource(TEXT_FIELD, text, CLASS_FIELD, "1"));
+        }
+        indexRandom(true, false, indexRequestBuilderList);
+
+        SearchResponse response1 = client().prepareSearch(INDEX_NAME).setTypes(DOC_TYPE)
+                .addAggregation(new TermsBuilder("class")
+                        .field(CLASS_FIELD)
+                        .subAggregation(
+                                new SignificantTermsBuilder("sig_terms")
+                                        .field(TEXT_FIELD)
+                                        .minDocCount(1)))
+                .execute()
+                .actionGet();
+    }
+
     @Test
     public void testBackgroundVsSeparateSet() throws Exception {
@@ -347,7 +390,7 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegrationTest {
                 .setSource(TEXT_FIELD, gb, CLASS_FIELD, "0"));
         indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "7")
                 .setSource(TEXT_FIELD, "0", CLASS_FIELD, "0"));
-        indexRandom(true, indexRequestBuilderList);
+        indexRandom(true, false, indexRequestBuilderList);
     }
 
     @Test
@@ -413,6 +456,6 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegrationTest {
             indexRequestBuilders.add(client().prepareIndex("test", "doc", "" + i)
                     .setSource("class", parts[0], "text", parts[1]));
         }
-        indexRandom(true, indexRequestBuilders);
+        indexRandom(true, false, indexRequestBuilders);
     }
 }
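
The new test above relies on updates leaving deleted docs behind. One way to confirm an index actually contains such holes is the docs section of the indices stats API; a hedged sketch using the ES 1.x Java client, reusing client() and INDEX_NAME from the test-class context above:

// Hedged sketch - assumes the ElasticsearchIntegrationTest context of the class above.
private void assertIndexHasDeletes() {
    IndicesStatsResponse stats = client().admin().indices()
            .prepareStats(INDEX_NAME).clear().setDocs(true).get();
    long liveDocs = stats.getTotal().getDocs().getCount();      // excludes deleted docs
    long deletedDocs = stats.getTotal().getDocs().getDeleted(); // tombstones left by the updates
    // With deletedDocs > 0, a term's docFreq can exceed liveDocs - exactly the
    // situation the maxDoc()-based background count now tolerates.
    assertTrue("expected update-created holes in the index", deletedDocs > 0);
}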

@@ -69,7 +69,7 @@ public class TermsShardMinDocCountTests extends ElasticsearchIntegrationTest {
         addTermsDocs("5", 3, 1, indexBuilders);//low score but high doc freq
         addTermsDocs("6", 3, 1, indexBuilders);
         addTermsDocs("7", 0, 3, indexBuilders);// make sure the terms all get score > 0 except for this one
-        indexRandom(true, indexBuilders);
+        indexRandom(true, false, indexBuilders);
 
         // first, check that indeed when not setting the shardMinDocCount parameter 0 terms are returned
         SearchResponse response = client().prepareSearch(index)
@@ -126,7 +126,7 @@ public class TermsShardMinDocCountTests extends ElasticsearchIntegrationTest {
         addTermsDocs("4", 1, indexBuilders);
         addTermsDocs("5", 3, indexBuilders);//low score but high doc freq
         addTermsDocs("6", 3, indexBuilders);
-        indexRandom(true, indexBuilders);
+        indexRandom(true, false, indexBuilders);
 
         // first, check that indeed when not setting the shardMinDocCount parameter 0 terms are returned
         SearchResponse response = client().prepareSearch(index)