Fix testIndexHasDuplicateData tests (#49786)
testIndexHasDuplicateData tests were failing occasionally due to the approximate calculation in BKDReader.estimatePointCount: when a node is a leaf, its point count is estimated as (maxPointsInLeafNode + 1) / 2. Since DEFAULT_MAX_POINTS_IN_LEAF_NODE = 1024, the estimate could be far off for the small indexes used in tests. This rewrites the tests to build the BKD tree with a small max points per leaf node, so the tests control the estimation.

Closes #49703
commit 2c77c0df65
parent 3b2b564c91
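To make the commit message concrete: with the default leaf size, a single partially matched leaf is estimated at (1024 + 1) / 2 = 512 points, about 7.3% of the old 7000-doc test index, while with maxPointsInLeafNode = 40 the same slack is 20 points, only 0.4% of the new 5000-doc index. A minimal editorial sketch of that arithmetic (illustrative names only, not Lucene's actual BKDReader code):

    // Editorial sketch of the leaf-node estimate described in the commit message.
    // A leaf that only partially overlaps the queried range is assumed to
    // contribute half its capacity, whatever its true point count is.
    class EstimateSketch {
        static long estimateCrossingLeaf(int maxPointsInLeafNode) {
            return (maxPointsInLeafNode + 1) / 2;
        }

        public static void main(String[] args) {
            // Old test: Lucene's default leaf size against a 7000-doc index.
            System.out.printf("old: %d points per leaf = %.1f%% of the index%n",
                estimateCrossingLeaf(1024), 100.0 * estimateCrossingLeaf(1024) / 7000);
            // New test: leaf size 40 against a 5000-doc index.
            System.out.printf("new: %d points per leaf = %.1f%% of the index%n",
                estimateCrossingLeaf(40), 100.0 * estimateCrossingLeaf(40) / 5000);
        }
    }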
@@ -575,7 +575,7 @@ public class QueryPhase implements SearchPhase {
      * Returns true if more than 50% of data in the index have the same value
      * The evaluation is approximation based on finding the median value and estimating its count
      */
-    static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
+    private static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
         long docsNoDupl = 0; // number of docs in segments with NO duplicate data that would benefit optimization
         long docsDupl = 0; // number of docs in segments with duplicate data that would NOT benefit optimization
         for (LeafReaderContext lrc : reader.leaves()) {
@@ -586,24 +586,8 @@ public class QueryPhase implements SearchPhase {
                 continue;
             }
             assert(pointValues.size() == docCount); // TODO: modify the code to handle multiple values
-
             int duplDocCount = docCount/2; // expected doc count of duplicate data
-            long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
-            long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
-            boolean hasDuplicateData = true;
-            while ((minValue < maxValue) && hasDuplicateData) {
-                long midValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2); // to avoid overflow first divide each value by 2
-                long countLeft = estimatePointCount(pointValues, minValue, midValue);
-                long countRight = estimatePointCount(pointValues, midValue + 1, maxValue);
-                if ((countLeft >= countRight) && (countLeft > duplDocCount) ) {
-                    maxValue = midValue;
-                } else if ((countRight > countLeft) && (countRight > duplDocCount)) {
-                    minValue = midValue + 1;
-                } else {
-                    hasDuplicateData = false;
-                }
-            }
-            if (hasDuplicateData) {
+            if (pointsHaveDuplicateData(pointValues, duplDocCount)) {
                 docsDupl += docCount;
             } else {
                 docsNoDupl += docCount;
@@ -612,6 +596,25 @@ public class QueryPhase implements SearchPhase {
         return (docsDupl > docsNoDupl);
     }
 
+    static boolean pointsHaveDuplicateData(PointValues pointValues, int duplDocCount) throws IOException {
+        long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
+        long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
+        boolean hasDuplicateData = true;
+        while ((minValue < maxValue) && hasDuplicateData) {
+            long midValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2); // to avoid overflow first divide each value by 2
+            long countLeft = estimatePointCount(pointValues, minValue, midValue);
+            long countRight = estimatePointCount(pointValues, midValue + 1, maxValue);
+            if ((countLeft >= countRight) && (countLeft > duplDocCount) ) {
+                maxValue = midValue;
+            } else if ((countRight > countLeft) && (countRight > duplDocCount)) {
+                minValue = midValue + 1;
+            } else {
+                hasDuplicateData = false;
+            }
+        }
+        return hasDuplicateData;
+    }
+
     private static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
         final byte[] minValueAsBytes = new byte[Long.BYTES];
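One detail worth calling out in pointsHaveDuplicateData above: the midpoint is computed as Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2) because the naive (minValue + maxValue) / 2 can overflow long arithmetic. A standalone check of that claim (editorial, not part of the commit):

    long min = Long.MAX_VALUE - 10, max = Long.MAX_VALUE;
    long naive = (min + max) / 2;                               // overflows: prints a negative value
    long safe = Math.floorDiv(min, 2) + Math.floorDiv(max, 2);  // close to Long.MAX_VALUE, as intended
    System.out.println(naive + " vs " + safe);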
@@ -69,8 +69,13 @@ import org.apache.lucene.search.join.ScoreMode;
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.bkd.BKDReader;
+import org.apache.lucene.util.bkd.BKDWriter;
 import org.elasticsearch.action.search.SearchShardTask;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.mapper.DateFieldMapper;
@@ -95,7 +100,7 @@ import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 
-import static org.elasticsearch.search.query.QueryPhase.indexFieldHasDuplicateData;
+import static org.elasticsearch.search.query.QueryPhase.pointsHaveDuplicateData;
 import static org.elasticsearch.search.query.TopDocsCollectorContext.hasInfMaxScore;
 import static org.hamcrest.Matchers.anyOf;
 import static org.hamcrest.Matchers.equalTo;
@@ -710,31 +715,56 @@ public class QueryPhaseTests extends IndexShardTestCase {
         dir.close();
     }
 
-    @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/49703")
     public void testIndexHasDuplicateData() throws IOException {
-        int docsCount = 7000;
-        int duplIndex = docsCount * 7 / 10;
-        int duplIndex2 = docsCount * 3 / 10;
+        int docsCount = 5000;
+        int maxPointsInLeafNode = 40;
+        float duplicateRatio = 0.7f;
         long duplicateValue = randomLongBetween(-10000000L, 10000000L);
-        Directory dir = newDirectory();
-        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
-        for (int docId = 0; docId < docsCount; docId++) {
-            Document doc = new Document();
-            long rndValue = randomLongBetween(-10000000L, 10000000L);
-            long value = (docId < duplIndex) ? duplicateValue : rndValue;
-            long value2 = (docId < duplIndex2) ? duplicateValue : rndValue;
-            doc.add(new LongPoint("duplicateField", value));
-            doc.add(new LongPoint("notDuplicateField", value2));
-            writer.addDocument(doc);
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(docsCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, docsCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < docsCount; docId++) {
+                long value = randomFloat() < duplicateRatio ? duplicateValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                assertTrue(pointsHaveDuplicateData(r, r.getDocCount()/2));
+            }
+        }
+    }
+
+    public void testIndexHasNoDuplicateData() throws IOException {
+        int docsCount = 5000;
+        int maxPointsInLeafNode = 40;
+        float duplicateRatio = 0.3f;
+        long duplicateValue = randomLongBetween(-10000000L, 10000000L);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(docsCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, docsCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < docsCount; docId++) {
+                long value = randomFloat() < duplicateRatio ? duplicateValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                assertFalse(pointsHaveDuplicateData(r, r.getDocCount()/2));
+            }
         }
-        writer.close();
-        final IndexReader reader = DirectoryReader.open(dir);
-        boolean hasDuplicateData = indexFieldHasDuplicateData(reader, "duplicateField");
-        boolean hasDuplicateData2 = indexFieldHasDuplicateData(reader, "notDuplicateField");
-        reader.close();
-        dir.close();
-        assertTrue(hasDuplicateData);
-        assertFalse(hasDuplicateData2);
     }
 
     public void testMaxScoreQueryVisitor() {
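For readers skimming the new tests: the positional BKDWriter constructor arguments are what let the test pin the leaf size, which the old IndexWriter-based test could not easily do. The mapping below is an editorial annotation assuming the Lucene 8.x signature (verify against the Lucene version actually in use):

    // new BKDWriter(
    //     docsCount,            // maxDoc
    //     dir,                  // tempDir, used if points spill to disk during sorting
    //     "tmp",                // temp file name prefix
    //     1,                    // numDataDims: one-dimensional long points
    //     1,                    // numIndexDims
    //     8,                    // bytesPerDim, i.e. Long.BYTES
    //     maxPointsInLeafNode,  // the knob this fix tightens from the 1024 default to 40
    //     1,                    // maxMBSortInHeap
    //     docsCount);           // totalPointCount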