diff --git a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
index c85499411be..a0d534406c8 100644
--- a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
+++ b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
@@ -575,7 +575,7 @@ public class QueryPhase implements SearchPhase {
      * Returns true if more than 50% of data in the index have the same value
      * The evaluation is approximation based on finding the median value and estimating its count
      */
-    static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
+    private static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
         long docsNoDupl = 0; // number of docs in segments with NO duplicate data that would benefit optimization
         long docsDupl = 0; // number of docs in segments with duplicate data that would NOT benefit optimization
         for (LeafReaderContext lrc : reader.leaves()) {
@@ -586,24 +586,8 @@ public class QueryPhase implements SearchPhase {
                 continue;
             }
             assert(pointValues.size() == docCount); // TODO: modify the code to handle multiple values
             int duplDocCount = docCount/2; // expected doc count of duplicate data
-            long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
-            long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
-            boolean hasDuplicateData = true;
-            while ((minValue < maxValue) && hasDuplicateData) {
-                long midValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2); // to avoid overflow first divide each value by 2
-                long countLeft = estimatePointCount(pointValues, minValue, midValue);
-                long countRight = estimatePointCount(pointValues, midValue + 1, maxValue);
-                if ((countLeft >= countRight) && (countLeft > duplDocCount) ) {
-                    maxValue = midValue;
-                } else if ((countRight > countLeft) && (countRight > duplDocCount)) {
-                    minValue = midValue + 1;
-                } else {
-                    hasDuplicateData = false;
-                }
-            }
-            if (hasDuplicateData) {
+            if (pointsHaveDuplicateData(pointValues, duplDocCount)) {
                 docsDupl += docCount;
             } else {
                 docsNoDupl += docCount;
@@ -612,6 +596,25 @@ public class QueryPhase implements SearchPhase {
         return (docsDupl > docsNoDupl);
     }
 
+    static boolean pointsHaveDuplicateData(PointValues pointValues, int duplDocCount) throws IOException {
+        long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
+        long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
+        boolean hasDuplicateData = true;
+        while ((minValue < maxValue) && hasDuplicateData) {
+            long midValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2); // to avoid overflow first divide each value by 2
+            long countLeft = estimatePointCount(pointValues, minValue, midValue);
+            long countRight = estimatePointCount(pointValues, midValue + 1, maxValue);
+            if ((countLeft >= countRight) && (countLeft > duplDocCount) ) {
+                maxValue = midValue;
+            } else if ((countRight > countLeft) && (countRight > duplDocCount)) {
+                minValue = midValue + 1;
+            } else {
+                hasDuplicateData = false;
+            }
+        }
+        return hasDuplicateData;
+    }
+
     private static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
         final byte[] minValueAsBytes = new byte[Long.BYTES];
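For reviewers following the refactor above: `pointsHaveDuplicateData` repeatedly bisects the `[minValue, maxValue]` value range and descends into whichever half still holds more than `duplDocCount` points; if the range collapses to a single value while a majority persists, the segment is treated as duplicate-heavy. Below is a minimal, self-contained sketch of the same idea (class and method names are invented for the illustration, and it uses exact counts over a sorted `long[]` where the production code uses BKD-based estimates from `estimatePointCount`):

```java
import java.util.Arrays;

class DuplicateDataSketch {
    // Returns true if some single value accounts for more than half of the array,
    // found by halving the [min, max] value range and descending into the half
    // that still holds a majority of the points (same shape as pointsHaveDuplicateData).
    static boolean hasMajorityValue(long[] sortedValues) {
        long min = sortedValues[0];
        long max = sortedValues[sortedValues.length - 1];
        long threshold = sortedValues.length / 2;
        while (min < max) {
            long mid = Math.floorDiv(min, 2) + Math.floorDiv(max, 2); // halve first to avoid overflow
            long countLeft = countInRange(sortedValues, min, mid);
            long countRight = countInRange(sortedValues, mid + 1, max);
            if (countLeft >= countRight && countLeft > threshold) {
                max = mid;        // the dominant value, if any, lies in the lower half
            } else if (countRight > countLeft && countRight > threshold) {
                min = mid + 1;    // the dominant value, if any, lies in the upper half
            } else {
                return false;     // neither half holds a majority -> no single dominant value
            }
        }
        return true; // range collapsed to one value that still holds a majority
    }

    // Exact count of values in [from, to] (ignores Long.MAX_VALUE edge cases for brevity);
    // QueryPhase only *estimates* this via PointValues#estimatePointCount on the BKD tree.
    private static long countInRange(long[] sortedValues, long from, long to) {
        return lowerBound(sortedValues, to + 1) - lowerBound(sortedValues, from);
    }

    // Index of the first element >= key in a sorted array.
    private static int lowerBound(long[] a, long key) {
        int idx = Arrays.binarySearch(a, key);
        if (idx < 0) {
            return -idx - 1;
        }
        while (idx > 0 && a[idx - 1] == key) {
            idx--;
        }
        return idx;
    }

    public static void main(String[] args) {
        long[] mostlyDuplicates = {3, 3, 3, 3, 3, 3, 3, 8, 42, 100}; // 70% are the value 3
        long[] spreadOut = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
        System.out.println(hasMajorityValue(mostlyDuplicates)); // true
        System.out.println(hasMajorityValue(spreadOut));        // false
    }
}
```

Because the production code works from estimates rather than exact counts, the granularity of the BKD leaves matters; presumably that is why the rewritten tests below build the tree directly with a small `maxPointsInLeafNode` (40) instead of going through `IndexWriter`, which keeps the 0.7 vs 0.3 duplicate ratios cleanly separable and avoids the segment-layout flakiness tracked in issue 49703.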
diff --git a/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java b/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
index 95330ea8fd0..0b9ee869736 100644
--- a/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
+++ b/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
@@ -69,8 +69,13 @@ import org.apache.lucene.search.join.ScoreMode;
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.bkd.BKDReader;
+import org.apache.lucene.util.bkd.BKDWriter;
 import org.elasticsearch.action.search.SearchShardTask;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.mapper.DateFieldMapper;
@@ -95,7 +100,7 @@ import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 
-import static org.elasticsearch.search.query.QueryPhase.indexFieldHasDuplicateData;
+import static org.elasticsearch.search.query.QueryPhase.pointsHaveDuplicateData;
 import static org.elasticsearch.search.query.TopDocsCollectorContext.hasInfMaxScore;
 import static org.hamcrest.Matchers.anyOf;
 import static org.hamcrest.Matchers.equalTo;
@@ -710,31 +715,56 @@ public class QueryPhaseTests extends IndexShardTestCase {
         dir.close();
     }
 
-    @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/49703")
     public void testIndexHasDuplicateData() throws IOException {
-        int docsCount = 7000;
-        int duplIndex = docsCount * 7 / 10;
-        int duplIndex2 = docsCount * 3 / 10;
+        int docsCount = 5000;
+        int maxPointsInLeafNode = 40;
+        float duplicateRatio = 0.7f;
         long duplicateValue = randomLongBetween(-10000000L, 10000000L);
-        Directory dir = newDirectory();
-        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
-        for (int docId = 0; docId < docsCount; docId++) {
-            Document doc = new Document();
-            long rndValue = randomLongBetween(-10000000L, 10000000L);
-            long value = (docId < duplIndex) ? duplicateValue : rndValue;
-            long value2 = (docId < duplIndex2) ? duplicateValue : rndValue;
-            doc.add(new LongPoint("duplicateField", value));
-            doc.add(new LongPoint("notDuplicateField", value2));
-            writer.addDocument(doc);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(docsCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, docsCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < docsCount; docId++) {
+                long value = randomFloat() < duplicateRatio ? duplicateValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                assertTrue(pointsHaveDuplicateData(r, r.getDocCount()/2));
+            }
+        }
+    }
+
+    public void testIndexHasNoDuplicateData() throws IOException {
+        int docsCount = 5000;
+        int maxPointsInLeafNode = 40;
+        float duplicateRatio = 0.3f;
+        long duplicateValue = randomLongBetween(-10000000L, 10000000L);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(docsCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, docsCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < docsCount; docId++) {
+                long value = randomFloat() < duplicateRatio ? duplicateValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                assertFalse(pointsHaveDuplicateData(r, r.getDocCount()/2));
+            }
         }
-        writer.close();
-        final IndexReader reader = DirectoryReader.open(dir);
-        boolean hasDuplicateData = indexFieldHasDuplicateData(reader, "duplicateField");
-        boolean hasDuplicateData2 = indexFieldHasDuplicateData(reader, "notDuplicateField");
-        reader.close();
-        dir.close();
-        assertTrue(hasDuplicateData);
-        assertFalse(hasDuplicateData2);
     }
 
     public void testMaxScoreQueryVisitor() {
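Both new tests hand the `BKDReader` directly to `pointsHaveDuplicateData`, which works because `BKDReader` is a `PointValues` implementation, so the bisection's calls to `estimatePointCount` run against the freshly written tree. The PR's own `estimatePointCount(PointValues, long, long)` helper is cut off at the end of the first hunk; purely as an illustration of the `PointValues` API it builds on (not the helper's actual body), a range-count estimate for a single 8-byte `long` dimension can be obtained roughly like this (assumes Java 9+ for `Arrays.compareUnsigned`):

```java
import java.util.Arrays;

import org.apache.lucene.document.LongPoint;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;

class PointCountEstimateSketch {
    // Rough count of points whose value falls into [minValue, maxValue],
    // obtained without visiting individual documents.
    static long estimateCount(PointValues points, long minValue, long maxValue) {
        final byte[] minBytes = new byte[Long.BYTES];
        final byte[] maxBytes = new byte[Long.BYTES];
        LongPoint.encodeDimension(minValue, minBytes, 0);
        LongPoint.encodeDimension(maxValue, maxBytes, 0);

        return points.estimatePointCount(new IntersectVisitor() {
            @Override
            public void visit(int docID) {
                // not called by estimatePointCount; only compare() is used
            }

            @Override
            public void visit(int docID, byte[] packedValue) {
                // not called by estimatePointCount
            }

            @Override
            public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
                // cell entirely outside [minValue, maxValue]
                if (Arrays.compareUnsigned(minPackedValue, 0, Long.BYTES, maxBytes, 0, Long.BYTES) > 0
                        || Arrays.compareUnsigned(maxPackedValue, 0, Long.BYTES, minBytes, 0, Long.BYTES) < 0) {
                    return Relation.CELL_OUTSIDE_QUERY;
                }
                // cell only partially overlaps the range
                if (Arrays.compareUnsigned(minPackedValue, 0, Long.BYTES, minBytes, 0, Long.BYTES) < 0
                        || Arrays.compareUnsigned(maxPackedValue, 0, Long.BYTES, maxBytes, 0, Long.BYTES) > 0) {
                    return Relation.CELL_CROSSES_QUERY;
                }
                // cell fully contained in the range
                return Relation.CELL_INSIDE_QUERY;
            }
        });
    }
}
```

`PointValues.estimatePointCount` only walks the BKD index nodes and calls `compare`, never the per-document `visit` methods, so the estimate is cheap but coarse: its resolution is bounded by the leaf size, which is exactly the `maxPointsInLeafNode` knob the tests pin to 40.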