From 8c9b9546ccf63ba8433a096ed0712fc33b5d529d Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Thu, 1 Apr 2021 07:04:04 +0200 Subject: [PATCH] LUCENE-9705: Create Lucene90PointsFormat (#52) --- .../lucene70/package-info.java | 4 +- .../lucene86/Lucene86Codec.java | 4 +- .../lucene86/Lucene86PointsFormat.java | 6 +- .../lucene86/Lucene86PointsReader.java | 4 +- .../lucene87/Lucene87Codec.java | 5 +- .../lucene60/TestLucene60PointsFormat.java | 81 ----- .../lucene86/Lucene86PointsWriter.java | 2 +- .../lucene86/Lucene86RWCodec.java | 6 + .../lucene86/Lucene86RWPointsFormat.java} | 21 +- .../lucene86/TestLucene86PointsFormat.java | 86 +---- .../lucene87/Lucene87RWCodec.java | 7 + .../lucene/codecs/lucene90/Lucene90Codec.java | 3 +- .../codecs/lucene90/Lucene90PointsFormat.java | 76 ++++ .../codecs/lucene90/Lucene90PointsReader.java | 158 ++++++++ .../codecs/lucene90/Lucene90PointsWriter.java | 298 +++++++++++++++ .../lucene/codecs/lucene90/package-info.java | 4 +- .../lucene90/TestLucene90PointsFormat.java | 344 ++++++++++++++++++ .../lucene/search/TestPointQueries.java | 8 +- .../search/LatLonPointPrototypeQueries.java | 4 +- .../lucene/spatial3d/TestGeo3DPoint.java | 8 +- .../lucene/geo/BaseGeoPointTestCase.java | 8 +- .../lucene/geo/BaseXYPointTestCase.java | 8 +- .../index/BasePointsFormatTestCase.java | 81 +++++ .../org/apache/lucene/index/RandomCodec.java | 8 +- 24 files changed, 1025 insertions(+), 209 deletions(-) rename lucene/{core/src/java/org/apache/lucene/codecs => backward-codecs/src/java/org/apache/lucene/backward_codecs}/lucene86/Lucene86PointsFormat.java (93%) rename lucene/{core/src/java/org/apache/lucene/codecs => backward-codecs/src/java/org/apache/lucene/backward_codecs}/lucene86/Lucene86PointsReader.java (97%) rename lucene/{core/src/java/org/apache/lucene/codecs => backward-codecs/src/test/org/apache/lucene/backward_codecs}/lucene86/Lucene86PointsWriter.java (99%) rename lucene/{core/src/java/org/apache/lucene/codecs/lucene86/package-info.java 
=> backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86RWPointsFormat.java} (60%) rename lucene/{core/src/test/org/apache/lucene/codecs => backward-codecs/src/test/org/apache/lucene/backward_codecs}/lucene86/TestLucene86PointsFormat.java (82%) create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsFormat.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90PointsFormat.java diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene70/package-info.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene70/package-info.java index 7a2e8f049aa..43b9a0554cf 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene70/package-info.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene70/package-info.java @@ -16,7 +16,7 @@ */ /** - * Components from the Lucene 7.0 index format. See {@link org.apache.lucene.codecs.lucene86} for an - * overview of the current index format. + * Components from the Lucene 7.0 index format. See {@link + * org.apache.lucene.backward_codecs.lucene86} for an overview of the current index format. 
*/ package org.apache.lucene.backward_codecs.lucene70; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86Codec.java index 4299c2bc3b1..4834e09f043 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86Codec.java @@ -38,7 +38,6 @@ import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.VectorFormat; -import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; @@ -49,7 +48,6 @@ import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; *

If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}. * * @lucene.experimental - * @see org.apache.lucene.codecs.lucene86 package documentation for file format details. */ public class Lucene86Codec extends Codec { private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat(); @@ -130,7 +128,7 @@ public class Lucene86Codec extends Codec { } @Override - public final PointsFormat pointsFormat() { + public PointsFormat pointsFormat() { return pointsFormat; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86PointsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86PointsFormat.java similarity index 93% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86PointsFormat.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86PointsFormat.java index 0714840a16e..b8c4d8cd118 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86PointsFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86PointsFormat.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene86; +package org.apache.lucene.backward_codecs.lucene86; import java.io.IOException; import org.apache.lucene.codecs.PointsFormat; @@ -43,7 +43,7 @@ import org.apache.lucene.index.SegmentWriteState; * * @lucene.experimental */ -public final class Lucene86PointsFormat extends PointsFormat { +public class Lucene86PointsFormat extends PointsFormat { static final String DATA_CODEC_NAME = "Lucene86PointsFormatData"; static final String INDEX_CODEC_NAME = "Lucene86PointsFormatIndex"; @@ -66,7 +66,7 @@ public final class Lucene86PointsFormat extends PointsFormat { @Override public PointsWriter fieldsWriter(SegmentWriteState state) throws IOException { - return new Lucene86PointsWriter(state); + throw new UnsupportedOperationException("Old codecs may only be used for reading"); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86PointsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86PointsReader.java similarity index 97% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86PointsReader.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86PointsReader.java index 3b21a403e45..81d8603604c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86PointsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86PointsReader.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene86; +package org.apache.lucene.backward_codecs.lucene86; import java.io.IOException; import java.util.HashMap; @@ -31,7 +31,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.bkd.BKDReader; -/** Reads point values previously written with {@link Lucene86PointsWriter} */ +/** Reads point values previously written with Lucene86PointsWriter */ public class Lucene86PointsReader extends PointsReader { final IndexInput indexIn, dataIn; final SegmentReadState readState; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/Lucene87Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/Lucene87Codec.java index f8f477edf27..b6e3a8fddf1 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/Lucene87Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/Lucene87Codec.java @@ -25,6 +25,7 @@ import org.apache.lucene.backward_codecs.lucene60.Lucene60FieldInfosFormat; import org.apache.lucene.backward_codecs.lucene80.Lucene80DocValuesFormat; import org.apache.lucene.backward_codecs.lucene80.Lucene80NormsFormat; import org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat; +import org.apache.lucene.backward_codecs.lucene86.Lucene86PointsFormat; import org.apache.lucene.backward_codecs.lucene86.Lucene86SegmentInfoFormat; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompoundFormat; @@ -39,7 +40,6 @@ import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.VectorFormat; -import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; @@ -49,7 +49,6 @@ 
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; * *

If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}. * - * @see org.apache.lucene.codecs.lucene86 package documentation for file format details. * @lucene.experimental */ public class Lucene87Codec extends Codec { @@ -153,7 +152,7 @@ public class Lucene87Codec extends Codec { } @Override - public final PointsFormat pointsFormat() { + public PointsFormat pointsFormat() { return pointsFormat; } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/TestLucene60PointsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/TestLucene60PointsFormat.java index f3692af449e..0d1d5faa693 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/TestLucene60PointsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/TestLucene60PointsFormat.java @@ -294,85 +294,4 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase { r.close(); dir.close(); } - - public void testDocCountEdgeCases() { - PointValues values = getPointValues(Long.MAX_VALUE, 1, Long.MAX_VALUE); - long docs = values.estimateDocCount(null); - assertEquals(1, docs); - values = getPointValues(Long.MAX_VALUE, 1, 1); - docs = values.estimateDocCount(null); - assertEquals(1, docs); - values = getPointValues(Long.MAX_VALUE, Integer.MAX_VALUE, Long.MAX_VALUE); - docs = values.estimateDocCount(null); - assertEquals(Integer.MAX_VALUE, docs); - values = getPointValues(Long.MAX_VALUE, Integer.MAX_VALUE, Long.MAX_VALUE / 2); - docs = values.estimateDocCount(null); - assertEquals(Integer.MAX_VALUE, docs); - values = getPointValues(Long.MAX_VALUE, Integer.MAX_VALUE, 1); - docs = values.estimateDocCount(null); - assertEquals(1, docs); - } - - public void testRandomDocCount() { - for (int i = 0; i < 100; i++) { - long size = TestUtil.nextLong(random(), 1, Long.MAX_VALUE); - int maxDoc = (size > Integer.MAX_VALUE) ? 
Integer.MAX_VALUE : Math.toIntExact(size); - int docCount = TestUtil.nextInt(random(), 1, maxDoc); - long estimatedPointCount = TestUtil.nextLong(random(), 0, size); - PointValues values = getPointValues(size, docCount, estimatedPointCount); - long docs = values.estimateDocCount(null); - assertTrue(docs <= estimatedPointCount); - assertTrue(docs <= maxDoc); - assertTrue(docs >= estimatedPointCount / (size / docCount)); - } - } - - private PointValues getPointValues(long size, int docCount, long estimatedPointCount) { - return new PointValues() { - @Override - public void intersect(IntersectVisitor visitor) { - throw new UnsupportedOperationException(); - } - - @Override - public long estimatePointCount(IntersectVisitor visitor) { - return estimatedPointCount; - } - - @Override - public byte[] getMinPackedValue() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public byte[] getMaxPackedValue() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int getNumDimensions() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int getNumIndexDimensions() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int getBytesPerDimension() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public long size() { - return size; - } - - @Override - public int getDocCount() { - return docCount; - } - }; - } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86PointsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86PointsWriter.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86PointsWriter.java rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86PointsWriter.java index e671d7a2b46..f98bbcb9961 100644 --- 
a/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86PointsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86PointsWriter.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene86; +package org.apache.lucene.backward_codecs.lucene86; import java.io.IOException; import java.util.ArrayList; diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86RWCodec.java index 986684dffbc..c917d2c8e75 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86RWCodec.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86RWCodec.java @@ -24,6 +24,7 @@ import org.apache.lucene.backward_codecs.lucene80.Lucene80RWNormsFormat; import org.apache.lucene.backward_codecs.lucene84.Lucene84RWPostingsFormat; import org.apache.lucene.codecs.CompoundFormat; import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.PointsFormat; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; @@ -82,4 +83,9 @@ public class Lucene86RWCodec extends Lucene86Codec { public TermVectorsFormat termVectorsFormat() { return new Lucene50RWTermVectorsFormat(); } + + @Override + public final PointsFormat pointsFormat() { + return new Lucene86RWPointsFormat(); + } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene86/package-info.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86RWPointsFormat.java similarity index 60% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene86/package-info.java rename to 
lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86RWPointsFormat.java index 11a6b5c2a84..9fd239d090d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene86/package-info.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86RWPointsFormat.java @@ -14,9 +14,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +package org.apache.lucene.backward_codecs.lucene86; -/** - * Components from the Lucene 8.6 index format. See {@link org.apache.lucene.codecs.lucene90} for an - * overview of the current index format. - */ -package org.apache.lucene.codecs.lucene86; +import java.io.IOException; +import org.apache.lucene.codecs.PointsWriter; +import org.apache.lucene.index.SegmentWriteState; + +/** Writable version of Lucene86PointsFormat for testing */ +public final class Lucene86RWPointsFormat extends Lucene86PointsFormat { + + /** Sole constructor */ + public Lucene86RWPointsFormat() {} + + @Override + public PointsWriter fieldsWriter(SegmentWriteState state) throws IOException { + return new Lucene86PointsWriter(state); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene86/TestLucene86PointsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/TestLucene86PointsFormat.java similarity index 82% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene86/TestLucene86PointsFormat.java rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/TestLucene86PointsFormat.java index c03401ec14b..54c5235d690 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene86/TestLucene86PointsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/TestLucene86PointsFormat.java @@ -14,10 +14,11 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene86; +package org.apache.lucene.backward_codecs.lucene86; import java.io.IOException; import java.util.Arrays; +import org.apache.lucene.backward_codecs.lucene87.Lucene87RWCodec; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.PointsFormat; @@ -48,7 +49,7 @@ public class TestLucene86PointsFormat extends BasePointsFormatTestCase { public TestLucene86PointsFormat() { // standard issue - Codec defaultCodec = TestUtil.getDefaultCodec(); + Codec defaultCodec = new Lucene87RWCodec(); if (random().nextBoolean()) { // randomize parameters maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500); @@ -341,85 +342,4 @@ public class TestLucene86PointsFormat extends BasePointsFormatTestCase { r.close(); dir.close(); } - - public void testDocCountEdgeCases() { - PointValues values = getPointValues(Long.MAX_VALUE, 1, Long.MAX_VALUE); - long docs = values.estimateDocCount(null); - assertEquals(1, docs); - values = getPointValues(Long.MAX_VALUE, 1, 1); - docs = values.estimateDocCount(null); - assertEquals(1, docs); - values = getPointValues(Long.MAX_VALUE, Integer.MAX_VALUE, Long.MAX_VALUE); - docs = values.estimateDocCount(null); - assertEquals(Integer.MAX_VALUE, docs); - values = getPointValues(Long.MAX_VALUE, Integer.MAX_VALUE, Long.MAX_VALUE / 2); - docs = values.estimateDocCount(null); - assertEquals(Integer.MAX_VALUE, docs); - values = getPointValues(Long.MAX_VALUE, Integer.MAX_VALUE, 1); - docs = values.estimateDocCount(null); - assertEquals(1, docs); - } - - public void testRandomDocCount() { - for (int i = 0; i < 100; i++) { - long size = TestUtil.nextLong(random(), 1, Long.MAX_VALUE); - int maxDoc = (size > Integer.MAX_VALUE) ? 
Integer.MAX_VALUE : Math.toIntExact(size); - int docCount = TestUtil.nextInt(random(), 1, maxDoc); - long estimatedPointCount = TestUtil.nextLong(random(), 0, size); - PointValues values = getPointValues(size, docCount, estimatedPointCount); - long docs = values.estimateDocCount(null); - assertTrue(docs <= estimatedPointCount); - assertTrue(docs <= maxDoc); - assertTrue(docs >= estimatedPointCount / (size / docCount)); - } - } - - private PointValues getPointValues(long size, int docCount, long estimatedPointCount) { - return new PointValues() { - @Override - public void intersect(IntersectVisitor visitor) { - throw new UnsupportedOperationException(); - } - - @Override - public long estimatePointCount(IntersectVisitor visitor) { - return estimatedPointCount; - } - - @Override - public byte[] getMinPackedValue() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public byte[] getMaxPackedValue() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int getNumDimensions() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int getNumIndexDimensions() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int getBytesPerDimension() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public long size() { - return size; - } - - @Override - public int getDocCount() { - return docCount; - } - }; - } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/Lucene87RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/Lucene87RWCodec.java index 748a5a0ee1a..4bd93061744 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/Lucene87RWCodec.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/Lucene87RWCodec.java @@ -20,9 +20,11 @@ import 
org.apache.lucene.backward_codecs.lucene50.Lucene50RWCompoundFormat; import org.apache.lucene.backward_codecs.lucene50.Lucene50RWTermVectorsFormat; import org.apache.lucene.backward_codecs.lucene80.Lucene80RWNormsFormat; import org.apache.lucene.backward_codecs.lucene84.Lucene84RWPostingsFormat; +import org.apache.lucene.backward_codecs.lucene86.Lucene86RWPointsFormat; import org.apache.lucene.backward_codecs.lucene86.Lucene86RWSegmentInfoFormat; import org.apache.lucene.codecs.CompoundFormat; import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.PointsFormat; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; @@ -80,4 +82,9 @@ public class Lucene87RWCodec extends Lucene87Codec { public StoredFieldsFormat storedFieldsFormat() { return new Lucene87RWStoredFieldsFormat(mode.storedMode); } + + @Override + public final PointsFormat pointsFormat() { + return new Lucene86RWPointsFormat(); + } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java index 9569103cbbf..25a83fd0be9 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java @@ -30,7 +30,6 @@ import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.VectorFormat; -import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; @@ -144,7 +143,7 @@ public class Lucene90Codec extends Codec { @Override public final PointsFormat pointsFormat() { - return new Lucene86PointsFormat(); + return new Lucene90PointsFormat(); } 
@Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsFormat.java new file mode 100644 index 00000000000..64e61991eb1 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsFormat.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene90; + +import java.io.IOException; +import org.apache.lucene.codecs.PointsFormat; +import org.apache.lucene.codecs.PointsReader; +import org.apache.lucene.codecs.PointsWriter; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; + +/** + * Lucene 9.0 point format, which encodes dimensional values in a block KD-tree structure for fast + * 1D range and N dimensional shape intersection filtering. See this paper for + * details. + * + *

Data is stored across three files + * + *

+ * + * See this + * wiki for detailed data structures of the three files. + * + * @lucene.experimental + */ +public final class Lucene90PointsFormat extends PointsFormat { + + static final String DATA_CODEC_NAME = "Lucene90PointsFormatData"; + static final String INDEX_CODEC_NAME = "Lucene90PointsFormatIndex"; + static final String META_CODEC_NAME = "Lucene90PointsFormatMeta"; + + /** Filename extension for the leaf blocks */ + public static final String DATA_EXTENSION = "kdd"; + + /** Filename extension for the index per field */ + public static final String INDEX_EXTENSION = "kdi"; + + /** Filename extension for the meta per field */ + public static final String META_EXTENSION = "kdm"; + + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + /** Sole constructor */ + public Lucene90PointsFormat() {} + + @Override + public PointsWriter fieldsWriter(SegmentWriteState state) throws IOException { + return new Lucene90PointsWriter(state); + } + + @Override + public PointsReader fieldsReader(SegmentReadState state) throws IOException { + return new Lucene90PointsReader(state); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java new file mode 100644 index 00000000000..822f77a48e7 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene90; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.PointsReader; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.PointValues; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.bkd.BKDReader; + +/** Reads point values previously written with {@link Lucene90PointsWriter} */ +public class Lucene90PointsReader extends PointsReader { + final IndexInput indexIn, dataIn; + final SegmentReadState readState; + final Map readers = new HashMap<>(); + + /** Sole constructor */ + public Lucene90PointsReader(SegmentReadState readState) throws IOException { + this.readState = readState; + + String metaFileName = + IndexFileNames.segmentFileName( + readState.segmentInfo.name, + readState.segmentSuffix, + Lucene90PointsFormat.META_EXTENSION); + String indexFileName = + IndexFileNames.segmentFileName( + readState.segmentInfo.name, + readState.segmentSuffix, + Lucene90PointsFormat.INDEX_EXTENSION); + String dataFileName = + IndexFileNames.segmentFileName( + readState.segmentInfo.name, + readState.segmentSuffix, + Lucene90PointsFormat.DATA_EXTENSION); + + boolean success = false; + try { + indexIn = 
readState.directory.openInput(indexFileName, readState.context); + CodecUtil.checkIndexHeader( + indexIn, + Lucene90PointsFormat.INDEX_CODEC_NAME, + Lucene90PointsFormat.VERSION_START, + Lucene90PointsFormat.VERSION_CURRENT, + readState.segmentInfo.getId(), + readState.segmentSuffix); + + dataIn = readState.directory.openInput(dataFileName, readState.context); + CodecUtil.checkIndexHeader( + dataIn, + Lucene90PointsFormat.DATA_CODEC_NAME, + Lucene90PointsFormat.VERSION_START, + Lucene90PointsFormat.VERSION_CURRENT, + readState.segmentInfo.getId(), + readState.segmentSuffix); + + long indexLength = -1, dataLength = -1; + try (ChecksumIndexInput metaIn = + readState.directory.openChecksumInput(metaFileName, readState.context)) { + Throwable priorE = null; + try { + CodecUtil.checkIndexHeader( + metaIn, + Lucene90PointsFormat.META_CODEC_NAME, + Lucene90PointsFormat.VERSION_START, + Lucene90PointsFormat.VERSION_CURRENT, + readState.segmentInfo.getId(), + readState.segmentSuffix); + + while (true) { + int fieldNumber = metaIn.readInt(); + if (fieldNumber == -1) { + break; + } else if (fieldNumber < 0) { + throw new CorruptIndexException("Illegal field number: " + fieldNumber, metaIn); + } + BKDReader reader = new BKDReader(metaIn, indexIn, dataIn); + readers.put(fieldNumber, reader); + } + indexLength = metaIn.readLong(); + dataLength = metaIn.readLong(); + } catch (Throwable t) { + priorE = t; + } finally { + CodecUtil.checkFooter(metaIn, priorE); + } + } + // At this point, checksums of the meta file have been validated so we + // know that indexLength and dataLength are very likely correct. + CodecUtil.retrieveChecksum(indexIn, indexLength); + CodecUtil.retrieveChecksum(dataIn, dataLength); + success = true; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(this); + } + } + } + + /** + * Returns the underlying {@link BKDReader}. 
+ * + * @lucene.internal + */ + @Override + public PointValues getValues(String fieldName) { + FieldInfo fieldInfo = readState.fieldInfos.fieldInfo(fieldName); + if (fieldInfo == null) { + throw new IllegalArgumentException("field=\"" + fieldName + "\" is unrecognized"); + } + if (fieldInfo.getPointDimensionCount() == 0) { + throw new IllegalArgumentException("field=\"" + fieldName + "\" did not index point values"); + } + + return readers.get(fieldInfo.number); + } + + @Override + public long ramBytesUsed() { + return 0L; + } + + @Override + public void checkIntegrity() throws IOException { + CodecUtil.checksumEntireFile(indexIn); + CodecUtil.checksumEntireFile(dataIn); + } + + @Override + public void close() throws IOException { + IOUtils.close(indexIn, dataIn); + // Free up heap: + readers.clear(); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java new file mode 100644 index 00000000000..3dc7dfac296 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java @@ -0,0 +1,298 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene90; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.MutablePointValues; +import org.apache.lucene.codecs.PointsReader; +import org.apache.lucene.codecs.PointsWriter; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.PointValues; +import org.apache.lucene.index.PointValues.IntersectVisitor; +import org.apache.lucene.index.PointValues.Relation; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.bkd.BKDConfig; +import org.apache.lucene.util.bkd.BKDReader; +import org.apache.lucene.util.bkd.BKDWriter; + +/** Writes dimensional values */ +public class Lucene90PointsWriter extends PointsWriter { + + /** Outputs used to write the BKD tree data files. 
*/ + protected final IndexOutput metaOut, indexOut, dataOut; + + final SegmentWriteState writeState; + final int maxPointsInLeafNode; + final double maxMBSortInHeap; + private boolean finished; + + /** Full constructor */ + public Lucene90PointsWriter( + SegmentWriteState writeState, int maxPointsInLeafNode, double maxMBSortInHeap) + throws IOException { + assert writeState.fieldInfos.hasPointValues(); + this.writeState = writeState; + this.maxPointsInLeafNode = maxPointsInLeafNode; + this.maxMBSortInHeap = maxMBSortInHeap; + String dataFileName = + IndexFileNames.segmentFileName( + writeState.segmentInfo.name, + writeState.segmentSuffix, + Lucene90PointsFormat.DATA_EXTENSION); + dataOut = writeState.directory.createOutput(dataFileName, writeState.context); + boolean success = false; + try { + CodecUtil.writeIndexHeader( + dataOut, + Lucene90PointsFormat.DATA_CODEC_NAME, + Lucene90PointsFormat.VERSION_CURRENT, + writeState.segmentInfo.getId(), + writeState.segmentSuffix); + + String metaFileName = + IndexFileNames.segmentFileName( + writeState.segmentInfo.name, + writeState.segmentSuffix, + Lucene90PointsFormat.META_EXTENSION); + metaOut = writeState.directory.createOutput(metaFileName, writeState.context); + CodecUtil.writeIndexHeader( + metaOut, + Lucene90PointsFormat.META_CODEC_NAME, + Lucene90PointsFormat.VERSION_CURRENT, + writeState.segmentInfo.getId(), + writeState.segmentSuffix); + + String indexFileName = + IndexFileNames.segmentFileName( + writeState.segmentInfo.name, + writeState.segmentSuffix, + Lucene90PointsFormat.INDEX_EXTENSION); + indexOut = writeState.directory.createOutput(indexFileName, writeState.context); + CodecUtil.writeIndexHeader( + indexOut, + Lucene90PointsFormat.INDEX_CODEC_NAME, + Lucene90PointsFormat.VERSION_CURRENT, + writeState.segmentInfo.getId(), + writeState.segmentSuffix); + + success = true; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(this); + } + } + } + + /** + * Uses the default values for 
{@code maxPointsInLeafNode} (1024) and {@code maxMBSortInHeap} + * (16.0) + */ + public Lucene90PointsWriter(SegmentWriteState writeState) throws IOException { + this( + writeState, + BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE, + BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP); + } + + @Override + public void writeField(FieldInfo fieldInfo, PointsReader reader) throws IOException { + + PointValues values = reader.getValues(fieldInfo.name); + + BKDConfig config = + new BKDConfig( + fieldInfo.getPointDimensionCount(), + fieldInfo.getPointIndexDimensionCount(), + fieldInfo.getPointNumBytes(), + maxPointsInLeafNode); + + try (BKDWriter writer = + new BKDWriter( + writeState.segmentInfo.maxDoc(), + writeState.directory, + writeState.segmentInfo.name, + config, + maxMBSortInHeap, + values.size())) { + + if (values instanceof MutablePointValues) { + Runnable finalizer = + writer.writeField( + metaOut, indexOut, dataOut, fieldInfo.name, (MutablePointValues) values); + if (finalizer != null) { + metaOut.writeInt(fieldInfo.number); + finalizer.run(); + } + return; + } + + values.intersect( + new IntersectVisitor() { + @Override + public void visit(int docID) { + throw new IllegalStateException(); + } + + @Override + public void visit(int docID, byte[] packedValue) throws IOException { + writer.add(packedValue, docID); + } + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + return Relation.CELL_CROSSES_QUERY; + } + }); + + // We could have 0 points on merge since all docs with dimensional fields may be deleted: + Runnable finalizer = writer.finish(metaOut, indexOut, dataOut); + if (finalizer != null) { + metaOut.writeInt(fieldInfo.number); + finalizer.run(); + } + } + } + + @Override + public void merge(MergeState mergeState) throws IOException { + /** + * If indexSort is activated and some of the leaves are not sorted the next test will catch that + * and the non-optimized merge will run. 
If the readers are all sorted then it's safe to perform + * a bulk merge of the points. + */ + for (PointsReader reader : mergeState.pointsReaders) { + if (reader instanceof Lucene90PointsReader == false) { + // We can only bulk merge when all to-be-merged segments use our format: + super.merge(mergeState); + return; + } + } + for (PointsReader reader : mergeState.pointsReaders) { + if (reader != null) { + reader.checkIntegrity(); + } + } + + for (FieldInfo fieldInfo : mergeState.mergeFieldInfos) { + if (fieldInfo.getPointDimensionCount() != 0) { + if (fieldInfo.getPointDimensionCount() == 1) { + + // Worst case total maximum size (if none of the points are deleted): + long totMaxSize = 0; + for (int i = 0; i < mergeState.pointsReaders.length; i++) { + PointsReader reader = mergeState.pointsReaders[i]; + if (reader != null) { + FieldInfos readerFieldInfos = mergeState.fieldInfos[i]; + FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name); + if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) { + PointValues values = reader.getValues(fieldInfo.name); + if (values != null) { + totMaxSize += values.size(); + } + } + } + } + + BKDConfig config = + new BKDConfig( + fieldInfo.getPointDimensionCount(), + fieldInfo.getPointIndexDimensionCount(), + fieldInfo.getPointNumBytes(), + maxPointsInLeafNode); + + // System.out.println("MERGE: field=" + fieldInfo.name); + // Optimize the 1D case to use BKDWriter.merge, which does a single merge sort of the + // already sorted incoming segments, instead of trying to sort all points again as if + // we were simply reindexing them: + try (BKDWriter writer = + new BKDWriter( + writeState.segmentInfo.maxDoc(), + writeState.directory, + writeState.segmentInfo.name, + config, + maxMBSortInHeap, + totMaxSize)) { + List bkdReaders = new ArrayList<>(); + List docMaps = new ArrayList<>(); + for (int i = 0; i < mergeState.pointsReaders.length; i++) { + PointsReader reader = mergeState.pointsReaders[i]; 
+ + if (reader != null) { + + // we confirmed this up above + assert reader instanceof Lucene90PointsReader; + Lucene90PointsReader reader90 = (Lucene90PointsReader) reader; + + // NOTE: we cannot just use the merged fieldInfo.number (instead of resolving to + // this + // reader's FieldInfo as we do below) because field numbers can easily be different + // when addIndexes(Directory...) copies over segments from another index: + + FieldInfos readerFieldInfos = mergeState.fieldInfos[i]; + FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name); + if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) { + BKDReader bkdReader = reader90.readers.get(readerFieldInfo.number); + if (bkdReader != null) { + bkdReaders.add(bkdReader); + docMaps.add(mergeState.docMaps[i]); + } + } + } + } + + Runnable finalizer = writer.merge(metaOut, indexOut, dataOut, docMaps, bkdReaders); + if (finalizer != null) { + metaOut.writeInt(fieldInfo.number); + finalizer.run(); + } + } + } else { + mergeOneField(mergeState, fieldInfo); + } + } + } + + finish(); + } + + @Override + public void finish() throws IOException { + if (finished) { + throw new IllegalStateException("already finished"); + } + finished = true; + metaOut.writeInt(-1); + CodecUtil.writeFooter(indexOut); + CodecUtil.writeFooter(dataOut); + metaOut.writeLong(indexOut.getFilePointer()); + metaOut.writeLong(dataOut.getFilePointer()); + CodecUtil.writeFooter(metaOut); + } + + @Override + public void close() throws IOException { + IOUtils.close(metaOut, indexOut, dataOut); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java index 449c7023af4..b72c8603797 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java @@ -176,7 +176,7 @@ * factors. *
  • {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An * optional file indicating which documents are live. - *
  • {@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values}. Optional pair + *
  • {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair * of files, recording dimensionally indexed fields, to enable fast numeric range filtering * and large numeric values like BigInteger and BigDecimal (1D) and geographic shape * intersection (2D, 3D). @@ -305,7 +305,7 @@ * Info about what documents are live * * - * {@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values} + * {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values} * .dii, .dim * Holds indexed points * diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90PointsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90PointsFormat.java new file mode 100644 index 00000000000..fca1b83e95b --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90PointsFormat.java @@ -0,0 +1,344 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene90; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.PointsFormat; +import org.apache.lucene.codecs.PointsReader; +import org.apache.lucene.codecs.PointsWriter; +import org.apache.lucene.document.BinaryPoint; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.BasePointsFormatTestCase; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.MockRandomMergePolicy; +import org.apache.lucene.index.PointValues; +import org.apache.lucene.index.PointValues.IntersectVisitor; +import org.apache.lucene.index.PointValues.Relation; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.TestUtil; +import org.apache.lucene.util.bkd.BKDConfig; + +public class TestLucene90PointsFormat extends BasePointsFormatTestCase { + + private final Codec codec; + private final int maxPointsInLeafNode; + + public TestLucene90PointsFormat() { + // standard issue + Codec defaultCodec = TestUtil.getDefaultCodec(); + if (random().nextBoolean()) { + // randomize parameters + maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500); + double maxMBSortInHeap = 3.0 + (3 * random().nextDouble()); + if (VERBOSE) { + System.out.println( + "TEST: using Lucene90PointsFormat with maxPointsInLeafNode=" + + maxPointsInLeafNode + + " and maxMBSortInHeap=" + + maxMBSortInHeap); + } + + // sneaky impersonation! 
+ codec = + new FilterCodec(defaultCodec.getName(), defaultCodec) { + @Override + public PointsFormat pointsFormat() { + return new PointsFormat() { + @Override + public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException { + return new Lucene90PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap); + } + + @Override + public PointsReader fieldsReader(SegmentReadState readState) throws IOException { + return new Lucene90PointsReader(readState); + } + }; + } + }; + } else { + // standard issue + codec = defaultCodec; + maxPointsInLeafNode = BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE; + } + } + + @Override + protected Codec getCodec() { + return codec; + } + + @Override + public void testMergeStability() throws Exception { + assumeFalse( + "TODO: mess with the parameters and test gets angry!", codec instanceof FilterCodec); + super.testMergeStability(); + } + + public void testEstimatePointCount() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(); + // Avoid mockRandomMP since it may cause non-optimal merges that make the + // number of points per leaf hard to predict + while (iwc.getMergePolicy() instanceof MockRandomMergePolicy) { + iwc.setMergePolicy(newMergePolicy()); + } + IndexWriter w = new IndexWriter(dir, iwc); + byte[] pointValue = new byte[3]; + byte[] uniquePointValue = new byte[3]; + random().nextBytes(uniquePointValue); + final int numDocs = + TEST_NIGHTLY ? atLeast(10000) : atLeast(500); // at night, make sure we have several leaves + final boolean multiValues = random().nextBoolean(); + for (int i = 0; i < numDocs; ++i) { + Document doc = new Document(); + if (i == numDocs / 2) { + doc.add(new BinaryPoint("f", uniquePointValue)); + } else { + final int numValues = (multiValues) ? 
TestUtil.nextInt(random(), 2, 100) : 1; + for (int j = 0; j < numValues; j++) { + do { + random().nextBytes(pointValue); + } while (Arrays.equals(pointValue, uniquePointValue)); + doc.add(new BinaryPoint("f", pointValue)); + } + } + w.addDocument(doc); + } + w.forceMerge(1); + final IndexReader r = DirectoryReader.open(w); + w.close(); + final LeafReader lr = getOnlyLeafReader(r); + PointValues points = lr.getPointValues("f"); + + // If all points match, then the point count is numLeaves * maxPointsInLeafNode + final int numLeaves = (int) Math.ceil((double) points.size() / maxPointsInLeafNode); + + IntersectVisitor allPointsVisitor = + new IntersectVisitor() { + @Override + public void visit(int docID, byte[] packedValue) throws IOException {} + + @Override + public void visit(int docID) throws IOException {} + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + return Relation.CELL_INSIDE_QUERY; + } + }; + + assertEquals(numLeaves * maxPointsInLeafNode, points.estimatePointCount(allPointsVisitor)); + assertEquals(numDocs, points.estimateDocCount(allPointsVisitor)); + + IntersectVisitor noPointsVisitor = + new IntersectVisitor() { + @Override + public void visit(int docID, byte[] packedValue) throws IOException {} + + @Override + public void visit(int docID) throws IOException {} + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + return Relation.CELL_OUTSIDE_QUERY; + } + }; + + // Return 0 if no points match + assertEquals(0, points.estimatePointCount(noPointsVisitor)); + assertEquals(0, points.estimateDocCount(noPointsVisitor)); + + IntersectVisitor onePointMatchVisitor = + new IntersectVisitor() { + @Override + public void visit(int docID, byte[] packedValue) throws IOException {} + + @Override + public void visit(int docID) throws IOException {} + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + if (Arrays.compareUnsigned(uniquePointValue, 0, 3, 
maxPackedValue, 0, 3) > 0 + || Arrays.compareUnsigned(uniquePointValue, 0, 3, minPackedValue, 0, 3) < 0) { + return Relation.CELL_OUTSIDE_QUERY; + } + return Relation.CELL_CROSSES_QUERY; + } + }; + + // If only one point matches, then the point count is (maxPointsInLeafNode + 1) / 2 + // in general, or maybe 2x that if the point is a split value + final long pointCount = points.estimatePointCount(onePointMatchVisitor); + assertTrue( + "" + pointCount, + pointCount == (maxPointsInLeafNode + 1) / 2 + || // common case + pointCount == 2 * ((maxPointsInLeafNode + 1) / 2)); // if the point is a split value + + final long docCount = points.estimateDocCount(onePointMatchVisitor); + + if (multiValues) { + assertEquals( + docCount, + (long) + (docCount + * (1d + - Math.pow( + (numDocs - pointCount) / points.size(), points.size() / docCount)))); + } else { + assertEquals(Math.min(pointCount, numDocs), docCount); + } + r.close(); + dir.close(); + } + + // The tree is always balanced in the N dims case, and leaves are + // not all full so things are a bit different + public void testEstimatePointCount2Dims() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + byte[][] pointValue = new byte[2][]; + pointValue[0] = new byte[3]; + pointValue[1] = new byte[3]; + byte[][] uniquePointValue = new byte[2][]; + uniquePointValue[0] = new byte[3]; + uniquePointValue[1] = new byte[3]; + random().nextBytes(uniquePointValue[0]); + random().nextBytes(uniquePointValue[1]); + final int numDocs = + TEST_NIGHTLY + ? atLeast(10000) + : atLeast(1000); // in nightly, make sure we have several leaves + final boolean multiValues = random().nextBoolean(); + for (int i = 0; i < numDocs; ++i) { + Document doc = new Document(); + if (i == numDocs / 2) { + doc.add(new BinaryPoint("f", uniquePointValue)); + } else { + final int numValues = (multiValues) ? 
TestUtil.nextInt(random(), 2, 100) : 1; + for (int j = 0; j < numValues; j++) { + do { + random().nextBytes(pointValue[0]); + random().nextBytes(pointValue[1]); + } while (Arrays.equals(pointValue[0], uniquePointValue[0]) + || Arrays.equals(pointValue[1], uniquePointValue[1])); + doc.add(new BinaryPoint("f", pointValue)); + } + } + w.addDocument(doc); + } + w.forceMerge(1); + final IndexReader r = DirectoryReader.open(w); + w.close(); + final LeafReader lr = getOnlyLeafReader(r); + PointValues points = lr.getPointValues("f"); + + IntersectVisitor allPointsVisitor = + new IntersectVisitor() { + @Override + public void visit(int docID, byte[] packedValue) throws IOException {} + + @Override + public void visit(int docID) throws IOException {} + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + return Relation.CELL_INSIDE_QUERY; + } + }; + + // If all points match, then the point count is numLeaves * maxPointsInLeafNode + final int numLeaves = (int) Math.ceil((double) points.size() / maxPointsInLeafNode); + + assertEquals(numLeaves * maxPointsInLeafNode, points.estimatePointCount(allPointsVisitor)); + assertEquals(numDocs, points.estimateDocCount(allPointsVisitor)); + + IntersectVisitor noPointsVisitor = + new IntersectVisitor() { + @Override + public void visit(int docID, byte[] packedValue) throws IOException {} + + @Override + public void visit(int docID) throws IOException {} + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + return Relation.CELL_OUTSIDE_QUERY; + } + }; + + // Return 0 if no points match + assertEquals(0, points.estimatePointCount(noPointsVisitor)); + assertEquals(0, points.estimateDocCount(noPointsVisitor)); + + IntersectVisitor onePointMatchVisitor = + new IntersectVisitor() { + @Override + public void visit(int docID, byte[] packedValue) throws IOException {} + + @Override + public void visit(int docID) throws IOException {} + + @Override + public Relation 
compare(byte[] minPackedValue, byte[] maxPackedValue) { + for (int dim = 0; dim < 2; ++dim) { + if (Arrays.compareUnsigned( + uniquePointValue[dim], 0, 3, maxPackedValue, dim * 3, dim * 3 + 3) + > 0 + || Arrays.compareUnsigned( + uniquePointValue[dim], 0, 3, minPackedValue, dim * 3, dim * 3 + 3) + < 0) { + return Relation.CELL_OUTSIDE_QUERY; + } + } + return Relation.CELL_CROSSES_QUERY; + } + }; + + final long pointCount = points.estimatePointCount(onePointMatchVisitor); + // The number of matches needs to be multiple of count per leaf + final long countPerLeaf = (maxPointsInLeafNode + 1) / 2; + assertTrue("" + pointCount, pointCount % countPerLeaf == 0); + // in extreme cases, a point can be shared by 4 leaves + assertTrue("" + pointCount, pointCount / countPerLeaf <= 4 && pointCount / countPerLeaf >= 1); + + final long docCount = points.estimateDocCount(onePointMatchVisitor); + if (multiValues) { + assertEquals( + docCount, + (long) + (docCount + * (1d + - Math.pow( + (numDocs - pointCount) / points.size(), points.size() / docCount)))); + } else { + assertEquals(Math.min(pointCount, numDocs), docCount); + } + r.close(); + dir.close(); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java index 894635c7485..54134948ab0 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java @@ -36,8 +36,8 @@ import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.PointsFormat; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.PointsWriter; -import org.apache.lucene.codecs.lucene86.Lucene86PointsReader; -import org.apache.lucene.codecs.lucene86.Lucene86PointsWriter; +import org.apache.lucene.codecs.lucene90.Lucene90PointsReader; +import org.apache.lucene.codecs.lucene90.Lucene90PointsWriter; import 
org.apache.lucene.document.BinaryPoint; import org.apache.lucene.document.Document; import org.apache.lucene.document.DoublePoint; @@ -1280,12 +1280,12 @@ public class TestPointQueries extends LuceneTestCase { return new PointsFormat() { @Override public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException { - return new Lucene86PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap); + return new Lucene90PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap); } @Override public PointsReader fieldsReader(SegmentReadState readState) throws IOException { - return new Lucene86PointsReader(readState); + return new Lucene90PointsReader(readState); } }; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/LatLonPointPrototypeQueries.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/LatLonPointPrototypeQueries.java index 1a28d47dbec..e7f1eb6351d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/LatLonPointPrototypeQueries.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/LatLonPointPrototypeQueries.java @@ -19,7 +19,7 @@ package org.apache.lucene.sandbox.search; import java.io.IOException; import java.util.ArrayList; import java.util.List; -import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat; import org.apache.lucene.document.LatLonDocValuesField; import org.apache.lucene.document.LatLonPoint; import org.apache.lucene.geo.GeoUtils; @@ -54,7 +54,7 @@ public class LatLonPointPrototypeQueries { * *

    This is functionally equivalent to running {@link MatchAllDocsQuery} with a {@link * LatLonDocValuesField#newDistanceSort}, but is far more efficient since it takes advantage of - * properties the indexed BKD tree. Currently this only works with {@link Lucene86PointsFormat} + * properties the indexed BKD tree. Currently this only works with {@link Lucene90PointsFormat} * (used by the default codec). Multi-valued fields are currently not de-duplicated, so if a * document had multiple instances of the specified field that make it into the top n, that * document will appear more than once. diff --git a/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java b/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java index 6589e667b2c..6dd83e8b9b4 100644 --- a/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java +++ b/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java @@ -32,8 +32,8 @@ import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.PointsFormat; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.PointsWriter; -import org.apache.lucene.codecs.lucene86.Lucene86PointsReader; -import org.apache.lucene.codecs.lucene86.Lucene86PointsWriter; +import org.apache.lucene.codecs.lucene90.Lucene90PointsReader; +import org.apache.lucene.codecs.lucene90.Lucene90PointsWriter; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.NumericDocValuesField; @@ -108,12 +108,12 @@ public class TestGeo3DPoint extends LuceneTestCase { return new PointsFormat() { @Override public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException { - return new Lucene86PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap); + return new Lucene90PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap); } @Override public PointsReader fieldsReader(SegmentReadState readState) 
throws IOException { - return new Lucene86PointsReader(readState); + return new Lucene90PointsReader(readState); } }; } diff --git a/lucene/test-framework/src/java/org/apache/lucene/geo/BaseGeoPointTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/geo/BaseGeoPointTestCase.java index bd96ce875d2..5b18e0c74d9 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/geo/BaseGeoPointTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/geo/BaseGeoPointTestCase.java @@ -31,8 +31,8 @@ import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.PointsFormat; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.PointsWriter; -import org.apache.lucene.codecs.lucene86.Lucene86PointsReader; -import org.apache.lucene.codecs.lucene86.Lucene86PointsWriter; +import org.apache.lucene.codecs.lucene90.Lucene90PointsReader; +import org.apache.lucene.codecs.lucene90.Lucene90PointsWriter; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.NumericDocValuesField; @@ -1467,13 +1467,13 @@ public abstract class BaseGeoPointTestCase extends LuceneTestCase { return new PointsFormat() { @Override public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException { - return new Lucene86PointsWriter( + return new Lucene90PointsWriter( writeState, pointsInLeaf, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP); } @Override public PointsReader fieldsReader(SegmentReadState readState) throws IOException { - return new Lucene86PointsReader(readState); + return new Lucene90PointsReader(readState); } }; } diff --git a/lucene/test-framework/src/java/org/apache/lucene/geo/BaseXYPointTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/geo/BaseXYPointTestCase.java index c587281b115..bcdecf794f1 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/geo/BaseXYPointTestCase.java +++ 
b/lucene/test-framework/src/java/org/apache/lucene/geo/BaseXYPointTestCase.java @@ -31,8 +31,8 @@ import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.PointsFormat; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.PointsWriter; -import org.apache.lucene.codecs.lucene86.Lucene86PointsReader; -import org.apache.lucene.codecs.lucene86.Lucene86PointsWriter; +import org.apache.lucene.codecs.lucene90.Lucene90PointsReader; +import org.apache.lucene.codecs.lucene90.Lucene90PointsWriter; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.NumericDocValuesField; @@ -1312,13 +1312,13 @@ public abstract class BaseXYPointTestCase extends LuceneTestCase { return new PointsFormat() { @Override public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException { - return new Lucene86PointsWriter( + return new Lucene90PointsWriter( writeState, pointsInLeaf, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP); } @Override public PointsReader fieldsReader(SegmentReadState readState) throws IOException { - return new Lucene86PointsReader(readState); + return new Lucene90PointsReader(readState); } }; } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BasePointsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BasePointsFormatTestCase.java index 1248cf6963e..d489e4b6f3e 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BasePointsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BasePointsFormatTestCase.java @@ -1186,4 +1186,85 @@ public abstract class BasePointsFormatTestCase extends BaseIndexFileFormatTestCa w.forceMerge(1); IOUtils.close(w, dir); } + + public void testDocCountEdgeCases() { + PointValues values = getPointValues(Long.MAX_VALUE, 1, Long.MAX_VALUE); + long docs = values.estimateDocCount(null); + assertEquals(1, docs); + values = getPointValues(Long.MAX_VALUE, 
1, 1); + docs = values.estimateDocCount(null); + assertEquals(1, docs); + values = getPointValues(Long.MAX_VALUE, Integer.MAX_VALUE, Long.MAX_VALUE); + docs = values.estimateDocCount(null); + assertEquals(Integer.MAX_VALUE, docs); + values = getPointValues(Long.MAX_VALUE, Integer.MAX_VALUE, Long.MAX_VALUE / 2); + docs = values.estimateDocCount(null); + assertEquals(Integer.MAX_VALUE, docs); + values = getPointValues(Long.MAX_VALUE, Integer.MAX_VALUE, 1); + docs = values.estimateDocCount(null); + assertEquals(1, docs); + } + + public void testRandomDocCount() { + for (int i = 0; i < 100; i++) { + long size = TestUtil.nextLong(random(), 1, Long.MAX_VALUE); + int maxDoc = (size > Integer.MAX_VALUE) ? Integer.MAX_VALUE : Math.toIntExact(size); + int docCount = TestUtil.nextInt(random(), 1, maxDoc); + long estimatedPointCount = TestUtil.nextLong(random(), 0, size); + PointValues values = getPointValues(size, docCount, estimatedPointCount); + long docs = values.estimateDocCount(null); + assertTrue(docs <= estimatedPointCount); + assertTrue(docs <= maxDoc); + assertTrue(docs >= estimatedPointCount / (size / docCount)); + } + } + + private PointValues getPointValues(long size, int docCount, long estimatedPointCount) { + return new PointValues() { + @Override + public void intersect(IntersectVisitor visitor) { + throw new UnsupportedOperationException(); + } + + @Override + public long estimatePointCount(IntersectVisitor visitor) { + return estimatedPointCount; + } + + @Override + public byte[] getMinPackedValue() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public byte[] getMaxPackedValue() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int getNumDimensions() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int getNumIndexDimensions() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int 
getBytesPerDimension() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long size() { + return size; + } + + @Override + public int getDocCount() { + return docCount; + } + }; + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java index 5fe55caf085..cdd85da6b36 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java @@ -39,9 +39,9 @@ import org.apache.lucene.codecs.blockterms.LuceneVarGapDocFreqInterval; import org.apache.lucene.codecs.blockterms.LuceneVarGapFixedInterval; import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat; import org.apache.lucene.codecs.bloom.TestBloomFilteredLucenePostings; -import org.apache.lucene.codecs.lucene86.Lucene86PointsReader; -import org.apache.lucene.codecs.lucene86.Lucene86PointsWriter; import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; +import org.apache.lucene.codecs.lucene90.Lucene90PointsReader; +import org.apache.lucene.codecs.lucene90.Lucene90PointsWriter; import org.apache.lucene.codecs.memory.DirectPostingsFormat; import org.apache.lucene.codecs.memory.FSTPostingsFormat; import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat; @@ -102,7 +102,7 @@ public class RandomCodec extends AssertingCodec { // Randomize how BKDWriter chooses its splits: - return new Lucene86PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap) { + return new Lucene90PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap) { @Override public void writeField(FieldInfo fieldInfo, PointsReader reader) throws IOException { @@ -157,7 +157,7 @@ public class RandomCodec extends AssertingCodec { @Override public PointsReader fieldsReader(SegmentReadState readState) throws IOException { - return new Lucene86PointsReader(readState); + return 
new Lucene90PointsReader(readState); } }); }