Aggs fix: the background doc count should include deleted docs; otherwise a term's docFreq (which still counts deleted docs) can exceed the number of docs reported for the index and cause an exception.
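For background, Lucene applies deletes lazily: a delete only marks the document in liveDocs, while postings and per-term statistics are untouched until the segment is merged away. Below is a minimal standalone sketch (not part of this commit; Lucene 4.x API of this era, class name mine) showing how a term's docFreq can exceed reader.numDocs() but never reader.maxDoc():

import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class DocFreqVsNumDocs {
    public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir,
                new IndexWriterConfig(Version.LUCENE_4_10_0, new KeywordAnalyzer()));
        for (int i = 0; i < 10; i++) {
            Document doc = new Document();
            doc.add(new StringField("id", Integer.toString(i), Field.Store.NO));
            doc.add(new StringField("text", "constant", Field.Store.NO));
            writer.addDocument(doc);
        }
        // The delete only marks the doc in liveDocs; postings (and docFreq)
        // are not rewritten until the segment is merged away.
        writer.deleteDocuments(new Term("id", "0"));
        writer.close();

        DirectoryReader reader = DirectoryReader.open(dir);
        System.out.println(reader.docFreq(new Term("text", "constant"))); // 10 - still counts the deleted doc
        System.out.println(reader.numDocs());                             // 9  - live docs only
        System.out.println(reader.maxDoc());                              // 10 - live + deleted, what the fix uses
        reader.close();
    }
}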

The test randomisation that deletes documents is also disabled, because with this doc-accounting change the specific scores expected by the tests would otherwise be subject to random variability and fail.
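For reference, the mechanical change in the tests below is the switch to the three-argument indexRandom overload. My reading of the ElasticsearchIntegrationTest API of this era is that the second boolean is the dummyDocuments flag, which when true may randomly index filler documents and delete them once indexing completes, deliberately leaving deleted docs behind; treat that naming as an assumption. A fragment in the test-class context:

// Old call - dummy documents may be interleaved and deleted afterwards,
// randomly leaving deleted docs in the index:
indexRandom(true, indexRequestBuilderList);
// New call - forceRefresh = true, dummyDocuments = false: no random deletes,
// so the background counts (and the exact scores asserted) stay deterministic.
indexRandom(true, false, indexRequestBuilderList);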

Closes #7951
markharwood 2014-10-02 13:49:28 +01:00
parent f0052a58d6
commit f878f40ae5
3 changed files with 50 additions and 5 deletions

@@ -71,7 +71,9 @@ public class FilterableTermsEnum extends TermsEnum {
         }
         this.docsEnumFlag = docsEnumFlag;
         if (filter == null) {
-            numDocs = reader.numDocs();
+            // Important - need to use the doc count that includes deleted docs
+            // or we have this issue: https://github.com/elasticsearch/elasticsearch/issues/7951
+            numDocs = reader.maxDoc();
         }
         ApplyAcceptedDocsFilter acceptedDocsFilter = filter == null ? null : new ApplyAcceptedDocsFilter(filter);
         List<AtomicReaderContext> leaves = reader.leaves();
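The numDocs computed above is used as the superset size by the significance heuristics, while the matching frequencies come straight from postings and therefore still count deleted docs. Here is a self-contained sketch (hypothetical names, not the actual heuristic code) of the sanity check that was tripping:

// Hypothetical distillation of the failing check: significance heuristics
// validate that a term's frequency cannot exceed the set size before
// computing a score such as frequency/size.
public class SupersetInvariant {
    static double backgroundFrequency(long supersetFreq, long supersetSize) {
        if (supersetFreq > supersetSize) {
            // This fired when supersetSize was reader.numDocs() (live docs
            // only) while supersetFreq was a docFreq counting deleted docs.
            throw new IllegalArgumentException("supersetFreq > supersetSize");
        }
        return (double) supersetFreq / supersetSize;
    }

    public static void main(String[] args) {
        long docFreq = 54; // docFreq of "constant" after 4 docs + 50 updates
        System.out.println(backgroundFrequency(docFreq, 54)); // maxDoc: fine
        System.out.println(backgroundFrequency(docFreq, 4));  // numDocs: throws
    }
}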

@@ -262,6 +262,49 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
         assertThat(responseBuilder.string(), equalTo(result));
     }
 
+    @Test
+    public void testDeletesIssue7951() throws Exception {
+        String settings = "{\"index.number_of_shards\": 1, \"index.number_of_replicas\": 0}";
+        String mappings = "{\"doc\": {\"properties\":{\"text\": {\"type\":\"string\",\"index\":\"not_analyzed\"}}}}";
+        assertAcked(prepareCreate(INDEX_NAME).setSettings(settings).addMapping("doc", mappings));
+        String[] cat1v1 = {"constant", "one"};
+        String[] cat1v2 = {"constant", "uno"};
+        String[] cat2v1 = {"constant", "two"};
+        String[] cat2v2 = {"constant", "duo"};
+        List<IndexRequestBuilder> indexRequestBuilderList = new ArrayList<>();
+        indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "1")
+                .setSource(TEXT_FIELD, cat1v1, CLASS_FIELD, "1"));
+        indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "2")
+                .setSource(TEXT_FIELD, cat1v2, CLASS_FIELD, "1"));
+        indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "3")
+                .setSource(TEXT_FIELD, cat2v1, CLASS_FIELD, "2"));
+        indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "4")
+                .setSource(TEXT_FIELD, cat2v2, CLASS_FIELD, "2"));
+        indexRandom(true, false, indexRequestBuilderList);
+
+        // Now create some holes in the index with selective deletes caused by updates.
+        // This is the scenario that caused this issue https://github.com/elasticsearch/elasticsearch/issues/7951
+        // Scoring algorithms throw exceptions if term docFreqs exceed the reported size of the index
+        // from which they are taken so need to make sure this doesn't happen.
+        String[] text = cat1v1;
+        indexRequestBuilderList.clear();
+        for (int i = 0; i < 50; i++) {
+            text = text == cat1v2 ? cat1v1 : cat1v2;
+            indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "1").setSource(TEXT_FIELD, text, CLASS_FIELD, "1"));
+        }
+        indexRandom(true, false, indexRequestBuilderList);
+
+        SearchResponse response1 = client().prepareSearch(INDEX_NAME).setTypes(DOC_TYPE)
+                .addAggregation(new TermsBuilder("class")
+                        .field(CLASS_FIELD)
+                        .subAggregation(
+                                new SignificantTermsBuilder("sig_terms")
+                                        .field(TEXT_FIELD)
+                                        .minDocCount(1)))
+                .execute()
+                .actionGet();
+    }
+
     @Test
     public void testBackgroundVsSeparateSet() throws Exception {
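Worth noting: the new test asserts nothing about the response. Before the fix, merely executing the significant_terms request against the update-riddled index threw an exception, so completing without error is the pass condition.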
@@ -347,7 +390,7 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
                 .setSource(TEXT_FIELD, gb, CLASS_FIELD, "0"));
         indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "7")
                 .setSource(TEXT_FIELD, "0", CLASS_FIELD, "0"));
-        indexRandom(true, indexRequestBuilderList);
+        indexRandom(true, false, indexRequestBuilderList);
     }
 
     @Test
@@ -413,6 +456,6 @@ public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegra
             indexRequestBuilders.add(client().prepareIndex("test", "doc", "" + i)
                     .setSource("class", parts[0], "text", parts[1]));
         }
-        indexRandom(true, indexRequestBuilders);
+        indexRandom(true, false, indexRequestBuilders);
     }
 }

@@ -69,7 +69,7 @@ public class TermsShardMinDocCountTests extends ElasticsearchIntegrationTest {
         addTermsDocs("5", 3, 1, indexBuilders);//low score but high doc freq
         addTermsDocs("6", 3, 1, indexBuilders);
         addTermsDocs("7", 0, 3, indexBuilders);// make sure the terms all get score > 0 except for this one
-        indexRandom(true, indexBuilders);
+        indexRandom(true, false, indexBuilders);
 
         // first, check that indeed when not setting the shardMinDocCount parameter 0 terms are returned
         SearchResponse response = client().prepareSearch(index)
@@ -126,7 +126,7 @@ public class TermsShardMinDocCountTests extends ElasticsearchIntegrationTest {
         addTermsDocs("4", 1, indexBuilders);
         addTermsDocs("5", 3, indexBuilders);//low score but high doc freq
         addTermsDocs("6", 3, indexBuilders);
-        indexRandom(true, indexBuilders);
+        indexRandom(true, false, indexBuilders);
 
         // first, check that indeed when not setting the shardMinDocCount parameter 0 terms are returned
         SearchResponse response = client().prepareSearch(index)