From 9128bdbaf547429667740cdc95370c7c606f83fc Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Wed, 5 Oct 2016 14:07:15 +0200 Subject: [PATCH] LUCENE-7475: Make norms sparse. --- lucene/CHANGES.txt | 3 + .../codecs/lucene53/Lucene53NormsFormat.java | 10 +- .../lucene53/Lucene53NormsProducer.java | 0 .../lucene/codecs/lucene53/package-info.java | 2 +- .../lucene54/Lucene54DocValuesConsumer.java | 0 .../lucene54/Lucene54DocValuesFormat.java | 0 .../lucene54/Lucene54DocValuesProducer.java | 0 .../lucene/codecs/lucene54/package-info.java | 0 .../lucene/codecs/lucene60/Lucene60Codec.java | 2 +- .../lucene/codecs/lucene62/Lucene62Codec.java | 4 +- .../lucene/codecs/lucene62/package.html | 25 ++ .../services/org.apache.lucene.codecs.Codec | 1 + .../org.apache.lucene.codecs.DocValuesFormat | 1 + .../lucene53/Lucene53NormsConsumer.java | 2 +- .../lucene53/Lucene53RWNormsFormat.java | 31 ++ .../lucene53/TestLucene53NormsFormat.java | 14 +- .../lucene54/TestLucene54DocValuesFormat.java | 0 .../codecs/lucene62/Lucene62RWCodec.java | 32 ++ .../simpletext/SimpleTextDocValuesReader.java | 2 +- .../simpletext/SimpleTextNormsFormat.java | 5 +- .../codecs/LegacyDocValuesIterables.java | 7 +- .../lucene/codecs/lucene50/package-info.java | 2 +- .../lucene/codecs/lucene70/Lucene70Codec.java | 3 +- .../lucene70/Lucene70NormsConsumer.java | 155 +++++++++ .../codecs/lucene70/Lucene70NormsFormat.java | 97 ++++++ .../lucene70/Lucene70NormsProducer.java | 271 +++++++++++++++ .../lucene/codecs/lucene70/SparseDISI.java | 115 +++++++ .../lucene/codecs/lucene70/package-info.java | 6 +- .../lucene/index/DefaultIndexingChain.java | 13 +- .../apache/lucene/index/NormValuesWriter.java | 59 ++-- .../services/org.apache.lucene.codecs.Codec | 1 - .../org.apache.lucene.codecs.DocValuesFormat | 1 - .../lucene70/TestLucene70NormsFormat.java | 34 ++ .../codecs/lucene70/TestSparseDISI.java | 94 ++++++ .../org/apache/lucene/index/TestNorms.java | 21 ++ .../lucene/index/BaseNormsFormatTestCase.java | 313 +++++++++++++++--- 36 files changed, 1219 insertions(+), 107 deletions(-) rename lucene/{core => backward-codecs}/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsFormat.java (90%) rename lucene/{core => backward-codecs}/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsProducer.java (100%) rename lucene/{core => backward-codecs}/src/java/org/apache/lucene/codecs/lucene53/package-info.java (93%) rename lucene/{core => backward-codecs}/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java (100%) rename lucene/{core => backward-codecs}/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesFormat.java (100%) rename lucene/{core => backward-codecs}/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java (100%) rename lucene/{core => backward-codecs}/src/java/org/apache/lucene/codecs/lucene54/package-info.java (100%) rename lucene/{core => backward-codecs}/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java (99%) create mode 100644 lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene62/package.html rename lucene/{core/src/java => backward-codecs/src/test}/org/apache/lucene/codecs/lucene53/Lucene53NormsConsumer.java (99%) create mode 100644 lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/Lucene53RWNormsFormat.java rename lucene/{core => backward-codecs}/src/test/org/apache/lucene/codecs/lucene53/TestLucene53NormsFormat.java (85%) rename lucene/{core => backward-codecs}/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java 
(100%) create mode 100644 lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene62/Lucene62RWCodec.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene70/SparseDISI.java create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70NormsFormat.java create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestSparseDISI.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 4437792b8c1..0a65d204306 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -22,6 +22,9 @@ API Changes to iterators, enabling future codec compression improvements. (Mike McCandless) +* LUCENE-7475: Norms now support sparsity, allowing to pay for what is + actually used. (Adrien Grand) + Bug Fixes Improvements diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsFormat.java similarity index 90% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsFormat.java rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsFormat.java index 15cdeccca2b..1f7928f2b29 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsFormat.java @@ -74,7 +74,7 @@ public class Lucene53NormsFormat extends NormsFormat { @Override public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException { - return new Lucene53NormsConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION); + throw new UnsupportedOperationException("This format can only be used for reading"); } @Override @@ -82,10 +82,10 @@ public class Lucene53NormsFormat extends NormsFormat { return new Lucene53NormsProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION); } - private static final String DATA_CODEC = "Lucene53NormsData"; - private static final String DATA_EXTENSION = "nvd"; - private static final String METADATA_CODEC = "Lucene53NormsMetadata"; - private static final String METADATA_EXTENSION = "nvm"; + static final String DATA_CODEC = "Lucene53NormsData"; + static final String DATA_EXTENSION = "nvd"; + static final String METADATA_CODEC = "Lucene53NormsMetadata"; + static final String METADATA_EXTENSION = "nvm"; static final int VERSION_START = 0; static final int VERSION_CURRENT = VERSION_START; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsProducer.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsProducer.java similarity index 100% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsProducer.java rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsProducer.java diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene53/package-info.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene53/package-info.java similarity index 93% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene53/package-info.java rename to 
lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene53/package-info.java index 6a035323cd3..93fefb8448a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene53/package-info.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene53/package-info.java @@ -17,7 +17,7 @@ /** * Components from the Lucene 5.3 index format - * See {@link org.apache.lucene.codecs.lucene54} for an overview + * See {@link org.apache.lucene.codecs.lucene53} for an overview * of the index format. */ package org.apache.lucene.codecs.lucene53; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java similarity index 100% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesFormat.java similarity index 100% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesFormat.java rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesFormat.java diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java similarity index 100% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/package-info.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/package-info.java similarity index 100% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene54/package-info.java rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/package-info.java diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java index 32c17527deb..ed74aa8dfd4 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java @@ -171,7 +171,7 @@ public class Lucene60Codec extends Codec { private final NormsFormat normsFormat = new Lucene53NormsFormat(); @Override - public final NormsFormat normsFormat() { + public NormsFormat normsFormat() { return normsFormat; } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java index 50710752694..58b07ebe73e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java @@ -32,9 +32,9 @@ import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; 
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat; import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat; -import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode; import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat; import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat; +import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode; import org.apache.lucene.codecs.lucene53.Lucene53NormsFormat; import org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat; import org.apache.lucene.codecs.lucene60.Lucene60PointsFormat; @@ -170,7 +170,7 @@ public class Lucene62Codec extends Codec { private final NormsFormat normsFormat = new Lucene53NormsFormat(); @Override - public final NormsFormat normsFormat() { + public NormsFormat normsFormat() { return normsFormat; } } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene62/package.html b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene62/package.html new file mode 100644 index 00000000000..74e66d93bc6 --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene62/package.html @@ -0,0 +1,25 @@ + + + + + + + +Lucene 6.2 file format. + + diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 875aba527e2..6954d7a641c 100644 --- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -14,3 +14,4 @@ # limitations under the License. org.apache.lucene.codecs.lucene60.Lucene60Codec +org.apache.lucene.codecs.lucene62.Lucene62Codec diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat index 4a812de77e5..26984efd409 100644 --- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat +++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat @@ -13,3 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
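+# Lucene54DocValuesFormat now lives in backward-codecs; it stays registered here so that older segments written with it remain readable.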
+org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsConsumer.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/Lucene53NormsConsumer.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsConsumer.java rename to lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/Lucene53NormsConsumer.java index 833500c1930..ddb968c8b3e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsConsumer.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/Lucene53NormsConsumer.java @@ -58,7 +58,7 @@ class Lucene53NormsConsumer extends NormsConsumer { @Override public void addNormsField(FieldInfo field, NormsProducer normsProducer) throws IOException { - addNormsField(field, LegacyDocValuesIterables.normsIterable(field, normsProducer, maxDoc)); + addNormsField(field, LegacyDocValuesIterables.normsIterable(field, normsProducer, maxDoc, true)); } private void addNormsField(FieldInfo field, Iterable values) throws IOException { diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/Lucene53RWNormsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/Lucene53RWNormsFormat.java new file mode 100644 index 00000000000..86a2b6a509b --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/Lucene53RWNormsFormat.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene53; + +import java.io.IOException; + +import org.apache.lucene.codecs.NormsConsumer; +import org.apache.lucene.index.SegmentWriteState; + +public class Lucene53RWNormsFormat extends Lucene53NormsFormat { + + @Override + public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException { + return new Lucene53NormsConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene53/TestLucene53NormsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/TestLucene53NormsFormat.java similarity index 85% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene53/TestLucene53NormsFormat.java rename to lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/TestLucene53NormsFormat.java index c87c51ff6c0..80a8eee6269 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene53/TestLucene53NormsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/TestLucene53NormsFormat.java @@ -16,19 +16,23 @@ */ package org.apache.lucene.codecs.lucene53; - import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.lucene70.Lucene70Codec; +import org.apache.lucene.codecs.lucene62.Lucene62RWCodec; import org.apache.lucene.index.BaseNormsFormatTestCase; /** * Tests Lucene53NormsFormat */ public class TestLucene53NormsFormat extends BaseNormsFormatTestCase { - private final Codec codec = new Lucene70Codec(); - + private final Codec codec = new Lucene62RWCodec(); + @Override protected Codec getCodec() { return codec; } -} + + @Override + protected boolean codecSupportsSparsity() { + return false; + } +} \ No newline at end of file diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java similarity index 100% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java rename to lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene62/Lucene62RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene62/Lucene62RWCodec.java new file mode 100644 index 00000000000..fcb414def03 --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene62/Lucene62RWCodec.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene62; + +import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.lucene53.Lucene53RWNormsFormat; +import org.apache.lucene.codecs.lucene62.Lucene62Codec; + +public class Lucene62RWCodec extends Lucene62Codec { + + private final NormsFormat normsFormat = new Lucene53RWNormsFormat(); + + @Override + public NormsFormat normsFormat() { + return normsFormat; + } + +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java index c0b88cc3de0..b01924a85dc 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java @@ -187,7 +187,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer { }; } - private Bits getNumericDocsWithField(FieldInfo fieldInfo) throws IOException { + public Bits getNumericDocsWithField(FieldInfo fieldInfo) throws IOException { final OneField field = fields.get(fieldInfo.name); final IndexInput in = data.clone(); final BytesRefBuilder scratch = new BytesRefBuilder(); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextNormsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextNormsFormat.java index faa50b765c7..26b00ec6238 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextNormsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextNormsFormat.java @@ -30,7 +30,6 @@ import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.Accountable; -import org.apache.lucene.util.Bits; /** * plain-text norms format. @@ -70,7 +69,7 @@ public class SimpleTextNormsFormat extends NormsFormat { @Override public NumericDocValues getNorms(FieldInfo field) throws IOException { - return new LegacyNumericDocValuesWrapper(new Bits.MatchAllBits(impl.maxDoc), impl.getNumericNonIterator(field)); + return new LegacyNumericDocValuesWrapper(impl.getNumericDocsWithField(field), impl.getNumericNonIterator(field)); } @Override @@ -117,7 +116,7 @@ public class SimpleTextNormsFormat extends NormsFormat { @Override public void addNormsField(FieldInfo field, NormsProducer normsProducer) throws IOException { - impl.addNumericField(field, LegacyDocValuesIterables.normsIterable(field, normsProducer, impl.numDocs)); + impl.addNumericField(field, LegacyDocValuesIterables.normsIterable(field, normsProducer, impl.numDocs, false)); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/LegacyDocValuesIterables.java b/lucene/core/src/java/org/apache/lucene/codecs/LegacyDocValuesIterables.java index 63f93dbef84..74c2d801ec3 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/LegacyDocValuesIterables.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/LegacyDocValuesIterables.java @@ -372,7 +372,8 @@ public class LegacyDocValuesIterables { * * @deprecated Consume {@link NumericDocValues} instead. 
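+   * @param missingAsZero whether documents that have no norm value should be reported as 0
+   *        (the historical norms behaviour) or reported as null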
*/ @Deprecated - public static Iterable normsIterable(final FieldInfo field, final NormsProducer normsProducer, final int maxDoc) { + public static Iterable normsIterable(final FieldInfo field, + final NormsProducer normsProducer, final int maxDoc, boolean missingAsZero) { return new Iterable() { @@ -411,9 +412,11 @@ public class LegacyDocValuesIterables { } catch (IOException ioe) { throw new RuntimeException(ioe); } - } else { + } else if (missingAsZero) { // Unlike NumericDocValues, norms should return for missing values: result = 0; + } else { + result = null; } return result; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java index f76ac06392e..9170c69bb3d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java @@ -17,7 +17,7 @@ /** * Components from the Lucene 5.0 index format - * See {@link org.apache.lucene.codecs.lucene53} for an overview + * See {@link org.apache.lucene.codecs.lucene50} for an overview * of the index format. */ package org.apache.lucene.codecs.lucene50; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70Codec.java index 8d86649e30c..7f9aed0ed96 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70Codec.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70Codec.java @@ -35,7 +35,6 @@ import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat; import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat; import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode; import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat; -import org.apache.lucene.codecs.lucene53.Lucene53NormsFormat; import org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat; import org.apache.lucene.codecs.lucene60.Lucene60PointsFormat; import org.apache.lucene.codecs.lucene62.Lucene62SegmentInfoFormat; @@ -168,7 +167,7 @@ public class Lucene70Codec extends Codec { private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene50"); private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene70"); - private final NormsFormat normsFormat = new Lucene53NormsFormat(); + private final NormsFormat normsFormat = new Lucene70NormsFormat(); @Override public final NormsFormat normsFormat() { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java new file mode 100644 index 00000000000..00cd5ecde33 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene70; + +import static org.apache.lucene.codecs.lucene70.Lucene70NormsFormat.VERSION_CURRENT; + +import java.io.IOException; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.NormsConsumer; +import org.apache.lucene.codecs.NormsProducer; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.IOUtils; + +/** + * Writer for {@link Lucene70NormsFormat} + */ +final class Lucene70NormsConsumer extends NormsConsumer { + IndexOutput data, meta; + final int maxDoc; + + Lucene70NormsConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { + boolean success = false; + try { + String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); + data = state.directory.createOutput(dataName, state.context); + CodecUtil.writeIndexHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + meta = state.directory.createOutput(metaName, state.context); + CodecUtil.writeIndexHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + maxDoc = state.segmentInfo.maxDoc(); + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this); + } + } + } + + @Override + public void close() throws IOException { + boolean success = false; + try { + if (meta != null) { + meta.writeInt(-1); // write EOF marker + CodecUtil.writeFooter(meta); // write checksum + } + if (data != null) { + CodecUtil.writeFooter(data); // write checksum + } + success = true; + } finally { + if (success) { + IOUtils.close(data, meta); + } else { + IOUtils.closeWhileHandlingException(data, meta); + } + meta = data = null; + } + } + + @Override + public void addNormsField(FieldInfo field, NormsProducer normsProducer) throws IOException { + NumericDocValues values = normsProducer.getNorms(field); + int numDocsWithValue = 0; + long min = Long.MAX_VALUE; + long max = Long.MIN_VALUE; + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + numDocsWithValue++; + long v = values.longValue(); + min = Math.min(min, v); + max = Math.max(max, v); + } + assert numDocsWithValue <= maxDoc; + + meta.writeInt(field.number); + + if (numDocsWithValue == 0) { + meta.writeLong(-2); + } else if (numDocsWithValue == maxDoc) { + meta.writeLong(-1); + } else { + meta.writeLong(data.getFilePointer()); + values = normsProducer.getNorms(field); + SparseDISI.writeBitSet(values, maxDoc, data); + } + + meta.writeInt(numDocsWithValue); + int numBytesPerValue = numBytesPerValue(min, max); + + meta.writeByte((byte) numBytesPerValue); + if (numBytesPerValue == 0) { + 
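+      // min == max: every document that has a norm shares this single value, so store the value
+      // itself in the metadata (as NormsAddress) and write nothing to the .nvd file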
meta.writeLong(min); + } else { + meta.writeLong(data.getFilePointer()); + values = normsProducer.getNorms(field); + writeValues(values, numBytesPerValue, data); + } + } + + private int numBytesPerValue(long min, long max) { + if (min >= max) { + return 0; + } else if (min >= Byte.MIN_VALUE && max <= Byte.MAX_VALUE) { + return 1; + } else if (min >= Short.MIN_VALUE && max <= Short.MAX_VALUE) { + return 2; + } else if (min >= Integer.MIN_VALUE && max <= Integer.MAX_VALUE) { + return 4; + } else { + return 8; + } + } + + private void writeValues(NumericDocValues values, int numBytesPerValue, IndexOutput out) throws IOException, AssertionError { + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + long value = values.longValue(); + switch (numBytesPerValue) { + case 1: + out.writeByte((byte) value); + break; + case 2: + out.writeShort((short) value); + break; + case 4: + out.writeInt((int) value); + break; + case 8: + out.writeLong(value); + break; + default: + throw new AssertionError(); + } + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java new file mode 100644 index 00000000000..7e70b246967 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene70; + +import java.io.IOException; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.NormsConsumer; +import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.NormsProducer; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.DataOutput; + +/** + * Lucene 7.0 Score normalization format. + *

+ * Encodes normalization values by encoding each value with the minimum
+ * number of bytes needed to represent the range (which can be zero).
+ * <p>
+ * Files:
+ * <ol>
+ *   <li>.nvd: Norms data</li>
+ *   <li>.nvm: Norms metadata</li>
+ * </ol>
+ * <ol>
+ *   <li>
+ *     <p>The Norms metadata or .nvm file.</p>
+ *     <p>For each norms field, this stores metadata, such as the offset into the
+ *        Norms data (.nvd)</p>
+ *     <p>Norms metadata (.nvm) --&gt; Header,&lt;Entry&gt;<sup>NumFields</sup>,Footer</p>
+ *     <ul>
+ *       <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
+ *       <li>Entry --&gt; FieldNumber, DocsWithFieldAddress, NumDocsWithField, BytesPerNorm, NormsAddress</li>
+ *       <li>FieldNumber --&gt; {@link DataOutput#writeInt Int32}</li>
+ *       <li>DocsWithFieldAddress --&gt; {@link DataOutput#writeLong Int64}</li>
+ *       <li>NumDocsWithField --&gt; {@link DataOutput#writeInt Int32}</li>
+ *       <li>BytesPerNorm --&gt; {@link DataOutput#writeByte byte}</li>
+ *       <li>NormsAddress --&gt; {@link DataOutput#writeLong Int64}</li>
+ *       <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
+ *     </ul>
+ *     <p>FieldNumber of -1 indicates the end of metadata.</p>
+ *     <p>NormsAddress is the pointer to the start of the data in the norms data (.nvd), or the singleton value
+ *        when BytesPerValue = 0. If BytesPerValue is different from 0 then there are NumDocsWithField values
+ *        to read at that offset.</p>
+ *     <p>DocsWithFieldAddress is the pointer to the start of the bit set containing documents that have a norm
+ *        in the norms data (.nvd), or -2 if no documents have a norm value, or -1 if all documents have a norm
+ *        value.</p>
+ *   </li>
+ *   <li>
+ *     <p>The Norms data or .nvd file.</p>
+ *     <p>For each Norms field, this stores the actual per-document data (the heavy-lifting)</p>
+ *     <p>Norms data (.nvd) --&gt; Header,&lt; Data &gt;<sup>NumFields</sup>,Footer</p>
+ *     <ul>
+ *       <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
+ *       <li>DocsWithFieldData --&gt; Bit set of MaxDoc bits</li>
+ *       <li>NormsData --&gt; {@link DataOutput#writeByte(byte) byte}<sup>NumDocsWithField * BytesPerValue</sup></li>
+ *       <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
+ *     </ul>
+ *   </li>
+ * </ol>
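+ * <p>
+ * For example, a field where only a few documents have a norm and every value fits in one byte is
+ * written with DocsWithFieldAddress pointing at a MaxDoc-bit set in the .nvd file, BytesPerNorm = 1,
+ * and NormsAddress pointing at NumDocsWithField single-byte values; a field where every document
+ * shares the same norm is written with DocsWithFieldAddress = -1, BytesPerNorm = 0, and the shared
+ * value stored directly in NormsAddress.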
+ * @lucene.experimental + */ +public class Lucene70NormsFormat extends NormsFormat { + + /** Sole Constructor */ + public Lucene70NormsFormat() {} + + @Override + public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException { + return new Lucene70NormsConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION); + } + + @Override + public NormsProducer normsProducer(SegmentReadState state) throws IOException { + return new Lucene70NormsProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION); + } + + private static final String DATA_CODEC = "Lucene70NormsData"; + private static final String DATA_EXTENSION = "nvd"; + private static final String METADATA_CODEC = "Lucene70NormsMetadata"; + private static final String METADATA_EXTENSION = "nvm"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java new file mode 100644 index 00000000000..ee96c1583b5 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene70; + +import static org.apache.lucene.codecs.lucene70.Lucene70NormsFormat.VERSION_CURRENT; +import static org.apache.lucene.codecs.lucene70.Lucene70NormsFormat.VERSION_START; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.NormsProducer; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.IOUtils; + +/** + * Reader for {@link Lucene70NormsFormat} + */ +final class Lucene70NormsProducer extends NormsProducer { + // metadata maps (just file pointers and minimal stuff) + private final Map norms = new HashMap<>(); + private final IndexInput data; + private final int maxDoc; + + Lucene70NormsProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { + maxDoc = state.segmentInfo.maxDoc(); + String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + int version = -1; + + // read in the entries from the metadata file. + try (ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context)) { + Throwable priorE = null; + try { + version = CodecUtil.checkIndexHeader(in, metaCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + readFields(in, state.fieldInfos); + } catch (Throwable exception) { + priorE = exception; + } finally { + CodecUtil.checkFooter(in, priorE); + } + } + + String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); + data = state.directory.openInput(dataName, state.context); + boolean success = false; + try { + final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + if (version != version2) { + throw new CorruptIndexException("Format versions mismatch: meta=" + version + ",data=" + version2, data); + } + + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. 
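+      // (full verification of the data file happens in checkIntegrity(), which checksums the entire file)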
+ CodecUtil.retrieveChecksum(data); + + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this.data); + } + } + } + + static class NormsEntry { + byte bytesPerNorm; + long docsWithFieldOffset; + int numDocsWithField; + long normsOffset; + } + + static abstract class LongValues { + abstract long get(int index) throws IOException; + } + + private void readFields(IndexInput meta, FieldInfos infos) throws IOException { + for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) { + FieldInfo info = infos.fieldInfo(fieldNumber); + if (info == null) { + throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta); + } else if (!info.hasNorms()) { + throw new CorruptIndexException("Invalid field: " + info.name, meta); + } + NormsEntry entry = new NormsEntry(); + entry.docsWithFieldOffset = meta.readLong(); + entry.numDocsWithField = meta.readInt(); + entry.bytesPerNorm = meta.readByte(); + switch (entry.bytesPerNorm) { + case 0: case 1: case 2: case 4: case 8: + break; + default: + throw new CorruptIndexException("Invalid bytesPerValue: " + entry.bytesPerNorm + ", field: " + info.name, meta); + } + entry.normsOffset = meta.readLong(); + norms.put(info.number, entry); + } + } + + @Override + public NumericDocValues getNorms(FieldInfo field) throws IOException { + final NormsEntry entry = norms.get(field.number); + if (entry.docsWithFieldOffset == -2) { + // empty + return DocValues.emptyNumeric(); + } else if (entry.docsWithFieldOffset == -1) { + // dense + final LongValues normValues = getNormValues(entry); + return new NumericDocValues() { + + int doc = -1; + + @Override + public long longValue() throws IOException { + return normValues.get(doc); + } + + @Override + public int docID() { + return doc; + } + + @Override + public int nextDoc() throws IOException { + return advance(doc + 1); + } + + @Override + public int advance(int target) throws IOException { + if (target >= maxDoc) { + return doc = NO_MORE_DOCS; + } + return doc = target; + } + + @Override + public long cost() { + return maxDoc; + } + + }; + } else { + // sparse + final LongValues normValues = getNormValues(entry); + final SparseDISI disi; + synchronized (data) { + disi = new SparseDISI(maxDoc, data, entry.docsWithFieldOffset, entry.numDocsWithField); + } + return new NumericDocValues() { + + @Override + public int advance(int target) throws IOException { + return disi.advance(target); + } + + @Override + public int nextDoc() throws IOException { + return disi.nextDoc(); + } + + @Override + public int docID() { + return disi.docID(); + } + + @Override + public long cost() { + return entry.numDocsWithField; + } + + @Override + public long longValue() throws IOException { + return normValues.get(disi.index()); + } + }; + } + } + + private LongValues getNormValues(NormsEntry entry) throws IOException { + if (entry.bytesPerNorm == 0) { + return new LongValues() { + @Override + long get(int index) { + return entry.normsOffset; + } + }; + } else { + RandomAccessInput slice; + synchronized (data) { + slice = data.randomAccessSlice(entry.normsOffset, entry.numDocsWithField * (long) entry.bytesPerNorm); + } + switch (entry.bytesPerNorm) { + case 1: + return new LongValues() { + @Override + long get(int index) throws IOException { + return slice.readByte(index); + } + }; + case 2: + return new LongValues() { + @Override + long get(int index) throws IOException { + return slice.readShort(((long) index) << 1); + } + }; + case 4: + return new LongValues() { + 
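+            // norms are stored on 4 bytes each: read a 32-bit value at byte offset index * 4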
@Override + long get(int index) throws IOException { + return slice.readInt(((long) index) << 2); + } + }; + case 8: + return new LongValues() { + @Override + long get(int index) throws IOException { + return slice.readLong(((long) index) << 3); + } + }; + default: + // should not happen, we already validate bytesPerNorm in readFields + throw new AssertionError(); + } + } + } + + @Override + public void close() throws IOException { + data.close(); + } + + @Override + public long ramBytesUsed() { + return 64L * norms.size(); // good enough + } + + @Override + public void checkIntegrity() throws IOException { + CodecUtil.checksumEntireFile(data); + } + + @Override + public String toString() { + return getClass().getSimpleName() + "(fields=" + norms.size() + ")"; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/SparseDISI.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/SparseDISI.java new file mode 100644 index 00000000000..af71b9ed99c --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/SparseDISI.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene70; + +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; + +final class SparseDISI extends DocIdSetIterator { + + static void writeBitSet(DocIdSetIterator it, int maxDoc, IndexOutput out) throws IOException { + int currentIndex = 0; + long currentBits = 0; + for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { + final int index = doc >>> 6; + if (index > currentIndex) { + out.writeLong(currentBits); + for (int i = currentIndex + 1; i < index; ++i) { + out.writeLong(0L); + } + currentIndex = index; + currentBits = 0L; + } + currentBits |= 1L << doc; + } + + out.writeLong(currentBits); + final int maxIndex = (maxDoc - 1) >>> 6; + for (int i = currentIndex + 1; i <= maxIndex; ++i) { + out.writeLong(0L); + } + } + + final int maxDoc; + final int numWords; + final long cost; + final RandomAccessInput slice; + int doc = -1; + int wordIndex = -1; + long word; + int index = -1; + + SparseDISI(int maxDoc, IndexInput in, long offset, long cost) throws IOException { + this.maxDoc = maxDoc; + this.numWords = (int) ((maxDoc + 63L) >>> 6); + this.slice = in.randomAccessSlice(offset, numWords * 8L); + this.cost = cost; + } + + @Override + public int advance(int target) throws IOException { + if (target >= maxDoc) { + return doc = NO_MORE_DOCS; + } + + final int targetWordIndex = target >>> 6; + for (int i = wordIndex + 1; i <= targetWordIndex; ++i) { + word = slice.readLong(i << 3); + index += Long.bitCount(word); + } + wordIndex = targetWordIndex; + + long leftBits = word >>> target; + if (leftBits != 0L) { + return doc = target + Long.numberOfTrailingZeros(leftBits); + } + + while (++wordIndex < numWords) { + word = slice.readLong(wordIndex << 3); + if (word != 0) { + index += Long.bitCount(word); + return doc = (wordIndex << 6) + Long.numberOfTrailingZeros(word); + } + } + + return doc = NO_MORE_DOCS; + } + + @Override + public int nextDoc() throws IOException { + return advance(doc + 1); + } + + @Override + public int docID() { + return doc; + } + + @Override + public long cost() { + return cost; + } + + public int index() { + return index - Long.bitCount(word >>> doc) + 1; + } + +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java index 77492ad9659..9b432f7c4f4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java @@ -163,7 +163,7 @@ * all documents omit position data. * *
  • - * {@link org.apache.lucene.codecs.lucene53.Lucene53NormsFormat Normalization factors}. + * {@link org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Normalization factors}. * For each field in each document, a value is stored * that is multiplied into the score for hits on that field. *
  • @@ -278,12 +278,12 @@ * Stores additional per-position metadata information such as character offsets and user payloads * * - * {@link org.apache.lucene.codecs.lucene53.Lucene53NormsFormat Norms} + * {@link org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Norms} * .nvd, .nvm * Encodes length and boost factors for docs and fields * * - * {@link org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat Per-Document Values} + * {@link org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat Per-Document Values} * .dvd, .dvm * Encodes additional scoring factors or other per-document information. * diff --git a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java index e941911ef2b..e2ece543506 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java @@ -665,8 +665,17 @@ final class DefaultIndexingChain extends DocConsumer { } public void finish() throws IOException { - if (fieldInfo.omitsNorms() == false && invertState.length != 0) { - norms.addValue(docState.docID, similarity.computeNorm(invertState)); + if (fieldInfo.omitsNorms() == false) { + long normValue; + if (invertState.length == 0) { + // the field exists in this document, but it did not have + // any indexed tokens, so we assign a default value of zero + // to the norm + normValue = 0; + } else { + normValue = similarity.computeNorm(invertState); + } + norms.addValue(docState.docID, normValue); } termsHashPerField.finish(); diff --git a/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java index c444661708b..46b8c1ceb15 100644 --- a/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java @@ -21,7 +21,10 @@ import java.io.IOException; import org.apache.lucene.codecs.NormsConsumer; import org.apache.lucene.codecs.NormsProducer; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.BitSetIterator; import org.apache.lucene.util.Counter; +import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PackedLongValues; @@ -29,29 +32,34 @@ import org.apache.lucene.util.packed.PackedLongValues; * segment flushes. 
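+ * A FixedBitSet records which documents actually received a value, so that sparse fields only
+ * buffer the norms that exist.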
*/ class NormValuesWriter { - private final static long MISSING = 0L; - + private FixedBitSet docsWithField; private PackedLongValues.Builder pending; private final Counter iwBytesUsed; private long bytesUsed; private final FieldInfo fieldInfo; + private int lastDocID = -1; public NormValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) { + docsWithField = new FixedBitSet(64); pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT); - bytesUsed = pending.ramBytesUsed(); + bytesUsed = pending.ramBytesUsed() + docsWithField.ramBytesUsed(); this.fieldInfo = fieldInfo; this.iwBytesUsed = iwBytesUsed; iwBytesUsed.addAndGet(bytesUsed); } public void addValue(int docID, long value) { - // Fill in any holes: - for (int i = (int)pending.size(); i < docID; ++i) { - pending.add(MISSING); + if (docID <= lastDocID) { + throw new IllegalArgumentException("Norm for \"" + fieldInfo.name + "\" appears more than once in this document (only one value is allowed per field)"); } pending.add(value); + docsWithField = FixedBitSet.ensureCapacity(docsWithField, docID); + docsWithField.set(docID); + updateBytesUsed(); + + lastDocID = docID; } private void updateBytesUsed() { @@ -65,7 +73,6 @@ class NormValuesWriter { public void flush(SegmentWriteState state, NormsConsumer normsConsumer) throws IOException { - final int maxDoc = state.segmentInfo.maxDoc(); final PackedLongValues values = pending.build(); normsConsumer.addNormsField(fieldInfo, @@ -75,7 +82,7 @@ class NormValuesWriter { if (fieldInfo != NormValuesWriter.this.fieldInfo) { throw new IllegalArgumentException("wrong fieldInfo"); } - return new BufferedNorms(maxDoc, values); + return new BufferedNorms(values, docsWithField); } @Override @@ -98,36 +105,28 @@ class NormValuesWriter { // iterates over the values we have in ram private static class BufferedNorms extends NumericDocValues { final PackedLongValues.Iterator iter; - final int size; - final int maxDoc; - private int docID = -1; + final DocIdSetIterator docsWithField; private long value; - - BufferedNorms(int maxDoc, PackedLongValues values) { - this.maxDoc = maxDoc; + + BufferedNorms(PackedLongValues values, FixedBitSet docsWithFields) { this.iter = values.iterator(); - this.size = (int) values.size(); - } - - @Override - public int docID() { - return docID; + this.docsWithField = new BitSetIterator(docsWithFields, values.size()); } @Override - public int nextDoc() { - docID++; - if (docID == maxDoc) { - docID = NO_MORE_DOCS; - } - if (docID < size) { + public int docID() { + return docsWithField.docID(); + } + + @Override + public int nextDoc() throws IOException { + int docID = docsWithField.nextDoc(); + if (docID != NO_MORE_DOCS) { value = iter.next(); - } else { - value = MISSING; } return docID; } - + @Override public int advance(int target) { throw new UnsupportedOperationException(); @@ -135,7 +134,7 @@ class NormValuesWriter { @Override public long cost() { - return maxDoc; + return docsWithField.cost(); } @Override diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 85aa3a7cd37..773c168280b 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -13,5 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
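+# Lucene62Codec is now provided by the backward-codecs module.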
-org.apache.lucene.codecs.lucene62.Lucene62Codec org.apache.lucene.codecs.lucene70.Lucene70Codec diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat index 1161f039864..20463c5dc45 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat @@ -13,5 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70NormsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70NormsFormat.java new file mode 100644 index 00000000000..cc07ceec5a5 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70NormsFormat.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene70; + + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.lucene70.Lucene70Codec; +import org.apache.lucene.index.BaseNormsFormatTestCase; + +/** + * Tests Lucene70NormsFormat + */ +public class TestLucene70NormsFormat extends BaseNormsFormatTestCase { + private final Codec codec = new Lucene70Codec(); + + @Override + protected Codec getCodec() { + return codec; + } +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestSparseDISI.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestSparseDISI.java new file mode 100644 index 00000000000..1911bd023e2 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestSparseDISI.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene70; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; + +public class TestSparseDISI extends LuceneTestCase { + + public void testRandom() throws IOException { + try (Directory dir = newDirectory()) { + for (int i = 0; i < 1000; ++i) { + doTestRandom(dir); + } + } + } + + private void doTestRandom(Directory dir) throws IOException { + List docs = new ArrayList<>(); + final int maxStep = TestUtil.nextInt(random(), 1, 1 << TestUtil.nextInt(random(), 2, 10)); + final int numDocs = TestUtil.nextInt(random(), 1, 1000); + for (int doc = -1, i = 0; i < numDocs; ++i) { + doc += TestUtil.nextInt(random(), 1, maxStep); + docs.add(doc); + } + final int maxDoc = docs.get(docs.size() - 1) + TestUtil.nextInt(random(), 1, 100); + + FixedBitSet set = new FixedBitSet(maxDoc); + for (int doc : docs) { + set.set(doc); + } + + try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) { + SparseDISI.writeBitSet(new BitSetIterator(set, docs.size()), maxDoc, out); + } + + try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) { + SparseDISI disi = new SparseDISI(maxDoc, in, 0L, docs.size()); + BitSetIterator disi2 = new BitSetIterator(set, docs.size()); + int i = 0; + for (int doc = disi2.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi2.nextDoc()) { + assertEquals(doc, disi.nextDoc()); + assertEquals(i++, disi.index()); + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, disi.nextDoc()); + } + + for (int step : new int[] {1, 20, maxStep, maxStep * 10}) { + try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) { + SparseDISI disi = new SparseDISI(maxDoc, in, 0L, docs.size()); + BitSetIterator disi2 = new BitSetIterator(set, docs.size()); + while (true) { + int target = disi2.docID() + step; + int doc = disi2.advance(target); + assertEquals(doc, disi.advance(target)); + if (doc == DocIdSetIterator.NO_MORE_DOCS) { + break; + } + int index = Collections.binarySearch(docs, doc); + assertEquals(index, disi.index()); + } + } + } + + dir.deleteFile("foo"); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/index/TestNorms.java b/lucene/core/src/test/org/apache/lucene/index/TestNorms.java index a51f1d9f57c..52038bc0874 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestNorms.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestNorms.java @@ -23,6 +23,7 @@ import java.util.Random; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.TextField; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.TermStatistics; @@ -186,4 +187,24 @@ public class TestNorms extends LuceneTestCase { throw new UnsupportedOperationException(); } } + + public void testEmptyValueVsNoValue() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig cfg = newIndexWriterConfig().setMergePolicy(newLogMergePolicy()); + IndexWriter w = new IndexWriter(dir, cfg); + Document doc 
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestNorms.java b/lucene/core/src/test/org/apache/lucene/index/TestNorms.java
index a51f1d9f57c..52038bc0874 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestNorms.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestNorms.java
@@ -23,6 +23,7 @@ import java.util.Random;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.TermStatistics;
@@ -186,4 +187,24 @@ public class TestNorms extends LuceneTestCase {
       throw new UnsupportedOperationException();
     }
   }
+
+  public void testEmptyValueVsNoValue() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriterConfig cfg = newIndexWriterConfig().setMergePolicy(newLogMergePolicy());
+    IndexWriter w = new IndexWriter(dir, cfg);
+    Document doc = new Document();
+    w.addDocument(doc);
+    doc.add(newTextField("foo", "", Store.NO));
+    w.addDocument(doc);
+    w.forceMerge(1);
+    IndexReader reader = DirectoryReader.open(w);
+    w.close();
+    LeafReader leafReader = getOnlyLeafReader(reader);
+    NumericDocValues normValues = leafReader.getNormValues("foo");
+    assertNotNull(normValues);
+    assertEquals(1, normValues.nextDoc()); // doc 0 does not have norms
+    assertEquals(0, normValues.longValue());
+    reader.close();
+    dir.close();
+  }
 }
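testEmptyValueVsNoValue above captures the behavioral change behind this patch: a document that never indexed the field gets no norm at all, so norms must be consumed through the iterator-style API rather than by assuming one value per docID. A minimal reader-side sketch of that pattern, using only API calls that already appear in this patch; the class and method names are illustrative, not part of the change:

    // Illustrative sketch, not part of the patch: iterates norms sparsely,
    // skipping documents that do not have the field.
    import java.io.IOException;

    import org.apache.lucene.index.LeafReader;
    import org.apache.lucene.index.NumericDocValues;
    import org.apache.lucene.search.DocIdSetIterator;

    final class SparseNormsUsage {
      static void printNorms(LeafReader reader, String field) throws IOException {
        NumericDocValues norms = reader.getNormValues(field);
        if (norms == null) {
          return; // no document in this segment has norms for the field
        }
        for (int doc = norms.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = norms.nextDoc()) {
          System.out.println("doc=" + doc + " norm=" + norms.longValue());
        }
      }
    }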
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java
index dc0636ddfd5..9ca13ddd78b 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java
@@ -30,9 +30,11 @@ import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.search.CollectionStatistics;
+import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.TestUtil;
 
 import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
@@ -46,12 +48,17 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
  * if there is some bug in a given NormsFormat that this
  * test fails to catch then this test needs to be improved! */
 public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCase {
-  
+
+  /** Whether the codec supports sparse values. */
+  protected boolean codecSupportsSparsity() {
+    return true;
+  }
+
   public void testByteRange() throws Exception {
     int iterations = atLeast(1);
     final Random r = random();
     for (int i = 0; i < iterations; i++) {
-      doTestNormsVersusDocValues(new LongProducer() {
+      doTestNormsVersusDocValues(1, new LongProducer() {
         @Override
         long next() {
           return TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
@@ -59,12 +66,26 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
       });
     }
   }
-  
+
+  public void testSparseByteRange() throws Exception {
+    assumeTrue("Requires sparse norms support", codecSupportsSparsity());
+    int iterations = atLeast(1);
+    final Random r = random();
+    for (int i = 0; i < iterations; i++) {
+      doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
+        @Override
+        long next() {
+          return TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+        }
+      });
+    }
+  }
+
   public void testShortRange() throws Exception {
     int iterations = atLeast(1);
     final Random r = random();
     for (int i = 0; i < iterations; i++) {
-      doTestNormsVersusDocValues(new LongProducer() {
+      doTestNormsVersusDocValues(1, new LongProducer() {
         @Override
         long next() {
           return TestUtil.nextLong(r, Short.MIN_VALUE, Short.MAX_VALUE);
@@ -72,12 +93,26 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
       });
     }
   }
-  
+
+  public void testSparseShortRange() throws Exception {
+    assumeTrue("Requires sparse norms support", codecSupportsSparsity());
+    int iterations = atLeast(1);
+    final Random r = random();
+    for (int i = 0; i < iterations; i++) {
+      doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
+        @Override
+        long next() {
+          return TestUtil.nextLong(r, Short.MIN_VALUE, Short.MAX_VALUE);
+        }
+      });
+    }
+  }
+
   public void testLongRange() throws Exception {
     int iterations = atLeast(1);
     final Random r = random();
     for (int i = 0; i < iterations; i++) {
-      doTestNormsVersusDocValues(new LongProducer() {
+      doTestNormsVersusDocValues(1, new LongProducer() {
         @Override
         long next() {
           return TestUtil.nextLong(r, Long.MIN_VALUE, Long.MAX_VALUE);
@@ -85,12 +120,26 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
       });
     }
   }
-  
+
+  public void testSparseLongRange() throws Exception {
+    assumeTrue("Requires sparse norms support", codecSupportsSparsity());
+    int iterations = atLeast(1);
+    final Random r = random();
+    for (int i = 0; i < iterations; i++) {
+      doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
+        @Override
+        long next() {
+          return TestUtil.nextLong(r, Long.MIN_VALUE, Long.MAX_VALUE);
+        }
+      });
+    }
+  }
+
   public void testFullLongRange() throws Exception {
     int iterations = atLeast(1);
     final Random r = random();
     for (int i = 0; i < iterations; i++) {
-      doTestNormsVersusDocValues(new LongProducer() {
+      doTestNormsVersusDocValues(1, new LongProducer() {
         @Override
         long next() {
           int thingToDo = r.nextInt(3);
@@ -103,12 +152,31 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
       });
     }
   }
-  
+
+  public void testSparseFullLongRange() throws Exception {
+    assumeTrue("Requires sparse norms support", codecSupportsSparsity());
+    int iterations = atLeast(1);
+    final Random r = random();
+    for (int i = 0; i < iterations; i++) {
+      doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
+        @Override
+        long next() {
+          int thingToDo = r.nextInt(3);
+          switch (thingToDo) {
+            case 0: return Long.MIN_VALUE;
+            case 1: return Long.MAX_VALUE;
+            default: return TestUtil.nextLong(r, Long.MIN_VALUE, Long.MAX_VALUE);
+          }
+        }
+      });
+    }
+  }
+
   public void testFewValues() throws Exception {
     int iterations = atLeast(1);
     final Random r = random();
     for (int i = 0; i < iterations; i++) {
-      doTestNormsVersusDocValues(new LongProducer() {
+      doTestNormsVersusDocValues(1, new LongProducer() {
         @Override
         long next() {
           return r.nextBoolean() ? 20 : 3;
@@ -116,12 +184,26 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
       });
     }
   }
-  
+
+  public void testFewSparseValues() throws Exception {
+    assumeTrue("Requires sparse norms support", codecSupportsSparsity());
+    int iterations = atLeast(1);
+    final Random r = random();
+    for (int i = 0; i < iterations; i++) {
+      doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
+        @Override
+        long next() {
+          return r.nextBoolean() ? 20 : 3;
+        }
+      });
+    }
+  }
+
   public void testFewLargeValues() throws Exception {
     int iterations = atLeast(1);
     final Random r = random();
     for (int i = 0; i < iterations; i++) {
-      doTestNormsVersusDocValues(new LongProducer() {
+      doTestNormsVersusDocValues(1, new LongProducer() {
         @Override
         long next() {
           return r.nextBoolean() ? 1000000L : -5000;
@@ -129,11 +211,25 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
       });
     }
   }
-  
+
+  public void testFewSparseLargeValues() throws Exception {
+    assumeTrue("Requires sparse norms support", codecSupportsSparsity());
+    int iterations = atLeast(1);
+    final Random r = random();
+    for (int i = 0; i < iterations; i++) {
+      doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
+        @Override
+        long next() {
+          return r.nextBoolean() ? 1000000L : -5000;
+        }
+      });
+    }
+  }
+
   public void testAllZeros() throws Exception {
     int iterations = atLeast(1);
     for (int i = 0; i < iterations; i++) {
-      doTestNormsVersusDocValues(new LongProducer() {
+      doTestNormsVersusDocValues(1, new LongProducer() {
         @Override
         long next() {
           return 0;
@@ -141,12 +237,25 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
       });
     }
   }
-  
-  public void testSparse() throws Exception {
+
+  public void testSparseAllZeros() throws Exception {
+    assumeTrue("Requires sparse norms support", codecSupportsSparsity());
+    int iterations = atLeast(1);
+    for (int i = 0; i < iterations; i++) {
+      doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
+        @Override
+        long next() {
+          return 0;
+        }
+      });
+    }
+  }
+
+  public void testMostZeros() throws Exception {
     int iterations = atLeast(1);
     final Random r = random();
     for (int i = 0; i < iterations; i++) {
-      doTestNormsVersusDocValues(new LongProducer() {
+      doTestNormsVersusDocValues(1, new LongProducer() {
         @Override
         long next() {
           return r.nextInt(100) == 0 ? TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE) : 0;
@@ -160,7 +269,7 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
     final Random r = random();
     for (int i = 0; i < iterations; i++) {
       final long commonValue = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
-      doTestNormsVersusDocValues(new LongProducer() {
+      doTestNormsVersusDocValues(1, new LongProducer() {
         @Override
         long next() {
           return r.nextInt(100) == 0 ? TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE) : commonValue;
@@ -168,14 +277,29 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
       });
     }
   }
-  
+
+  public void testSparseOutliers() throws Exception {
+    assumeTrue("Requires sparse norms support", codecSupportsSparsity());
+    int iterations = atLeast(1);
+    final Random r = random();
+    for (int i = 0; i < iterations; i++) {
+      final long commonValue = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+      doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
+        @Override
+        long next() {
+          return r.nextInt(100) == 0 ? TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE) : commonValue;
+        }
+      });
+    }
+  }
+
   public void testOutliers2() throws Exception {
     int iterations = atLeast(1);
     final Random r = random();
     for (int i = 0; i < iterations; i++) {
       final long commonValue = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
       final long uncommonValue = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
-      doTestNormsVersusDocValues(new LongProducer() {
+      doTestNormsVersusDocValues(1, new LongProducer() {
         @Override
         long next() {
           return r.nextInt(100) == 0 ? uncommonValue : commonValue;
@@ -183,7 +307,23 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
       });
     }
   }
-  
+
+  public void testSparseOutliers2() throws Exception {
+    assumeTrue("Requires sparse norms support", codecSupportsSparsity());
+    int iterations = atLeast(1);
+    final Random r = random();
+    for (int i = 0; i < iterations; i++) {
+      final long commonValue = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+      final long uncommonValue = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+      doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
+        @Override
+        long next() {
+          return r.nextInt(100) == 0 ? uncommonValue : commonValue;
+        }
+      });
+    }
+  }
+
   public void testNCommon() throws Exception {
     final Random r = random();
     final int N = TestUtil.nextInt(r, 2, 15);
@@ -196,14 +336,35 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
     for (int j = 0; j < numOtherValues; ++j) {
       otherValues[j] = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
     }
-    doTestNormsVersusDocValues(new LongProducer() {
+    doTestNormsVersusDocValues(1, new LongProducer() {
       @Override
       long next() {
         return r.nextInt(100) == 0 ? otherValues[r.nextInt(numOtherValues - 1)] : commonValues[r.nextInt(N - 1)];
       }
     });
   }
-  
+
+  public void testSparseNCommon() throws Exception {
+    assumeTrue("Requires sparse norms support", codecSupportsSparsity());
+    final Random r = random();
+    final int N = TestUtil.nextInt(r, 2, 15);
+    final long[] commonValues = new long[N];
+    for (int j = 0; j < N; ++j) {
+      commonValues[j] = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+    }
+    final int numOtherValues = TestUtil.nextInt(r, 2, 256 - N);
+    final long[] otherValues = new long[numOtherValues];
+    for (int j = 0; j < numOtherValues; ++j) {
+      otherValues[j] = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+    }
+    doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
+      @Override
+      long next() {
+        return r.nextInt(100) == 0 ? otherValues[r.nextInt(numOtherValues - 1)] : commonValues[r.nextInt(N - 1)];
+      }
+    });
+  }
+
   /**
    * a more thorough n-common that tests all low bpv
    */
@@ -224,7 +385,7 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
         for (int j = 0; j < numOtherValues; ++j) {
           otherValues[j] = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
         }
-        doTestNormsVersusDocValues(new LongProducer() {
+        doTestNormsVersusDocValues(1, new LongProducer() {
           @Override
           long next() {
             return r.nextInt(100) == 0 ? otherValues[r.nextInt(numOtherValues - 1)] : commonValues[r.nextInt(N - 1)];
@@ -233,17 +394,62 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
       }
     }
   }
-  
-  private void doTestNormsVersusDocValues(LongProducer longs) throws Exception {
+
+  /**
+   * a more thorough n-common that tests all low bpv and sparse docs
+   */
+  @Nightly
+  public void testSparseNCommonBig() throws Exception {
+    assumeTrue("Requires sparse norms support", codecSupportsSparsity());
+    final int iterations = atLeast(1);
+    final Random r = random();
+    for (int i = 0; i < iterations; ++i) {
+      // 16 is 4 bpv, the max before we jump to 8bpv
+      for (int n = 2; n < 16; ++n) {
+        final int N = n;
+        final long[] commonValues = new long[N];
+        for (int j = 0; j < N; ++j) {
+          commonValues[j] = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+        }
+        final int numOtherValues = TestUtil.nextInt(r, 2, 256 - N);
+        final long[] otherValues = new long[numOtherValues];
+        for (int j = 0; j < numOtherValues; ++j) {
+          otherValues[j] = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+        }
+        doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
+          @Override
+          long next() {
+            return r.nextInt(100) == 0 ? otherValues[r.nextInt(numOtherValues - 1)] : commonValues[r.nextInt(N - 1)];
+          }
+        });
+      }
+    }
+  }
+
+  private void doTestNormsVersusDocValues(double density, LongProducer longs) throws Exception {
     int numDocs = atLeast(500);
-    long norms[] = new long[numDocs];
-    for (int i = 0; i < numDocs; i++) {
+    final FixedBitSet docsWithField = new FixedBitSet(numDocs);
+    final int numDocsWithField = Math.max(1, (int) (density * numDocs));
+    if (numDocsWithField == numDocs) {
+      docsWithField.set(0, numDocs);
+    } else {
+      int i = 0;
+      while (i < numDocsWithField) {
+        int doc = random().nextInt(numDocs);
+        if (docsWithField.get(doc) == false) {
+          docsWithField.set(doc);
+          ++i;
+        }
+      }
+    }
+    long norms[] = new long[numDocsWithField];
+    for (int i = 0; i < numDocsWithField; i++) {
      norms[i] = longs.next();
     }
 
     Directory dir = newDirectory();
     Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
-    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
+    IndexWriterConfig conf = newIndexWriterConfig(analyzer);conf.setMergePolicy(NoMergePolicy.INSTANCE);
     conf.setSimilarity(new CannedNormSimilarity(norms));
     RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
     Document doc = new Document();
@@ -254,12 +460,18 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
     doc.add(indexedField);
     doc.add(dvField);
 
-    for (int i = 0; i < numDocs; i++) {
+    for (int i = 0, j = 0; i < numDocs; i++) {
       idField.setStringValue(Integer.toString(i));
-      long value = norms[i];
-      dvField.setLongValue(value);
-      indexedField.setStringValue(Long.toString(value));
-      writer.addDocument(doc);
+      if (docsWithField.get(i) == false) {
+        Document doc2 = new Document();
+        doc2.add(idField);
+        writer.addDocument(doc2);
+      } else {
+        long value = norms[j++];
+        dvField.setLongValue(value);
+        indexedField.setStringValue(Long.toString(value));
+        writer.addDocument(doc);
+      }
       if (random().nextInt(31) == 0) {
         writer.commit();
       }
@@ -280,12 +492,14 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
       LeafReader r = context.reader();
       NumericDocValues expected = r.getNumericDocValues("dv");
       NumericDocValues actual = r.getNormValues("indexed");
-      for (int i = 0; i < r.maxDoc(); i++) {
-        assertEquals(i, expected.nextDoc());
-        assertEquals(i, actual.nextDoc());
-        assertEquals("doc " + i, expected.longValue(), actual.longValue());
+      assertEquals(expected == null, actual == null);
+      if (expected != null) {
+        for (int d = expected.nextDoc(); d != DocIdSetIterator.NO_MORE_DOCS; d = expected.nextDoc()) {
+          assertEquals(d, actual.nextDoc());
+          assertEquals("doc " + d, expected.longValue(), actual.longValue());
+        }
+        assertEquals(NO_MORE_DOCS, actual.nextDoc());
       }
-      assertEquals(NO_MORE_DOCS, expected.nextDoc());
     }
 
     ir.close();
@@ -297,10 +511,13 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
       LeafReader r = context.reader();
       NumericDocValues expected = r.getNumericDocValues("dv");
       NumericDocValues actual = r.getNormValues("indexed");
-      for (int i = 0; i < r.maxDoc(); i++) {
-        assertEquals(i, expected.nextDoc());
-        assertEquals(i, actual.nextDoc());
-        assertEquals("doc " + i, expected.longValue(), actual.longValue());
+      assertEquals(expected == null, actual == null);
+      if (expected != null) {
+        for (int d = expected.nextDoc(); d != DocIdSetIterator.NO_MORE_DOCS; d = expected.nextDoc()) {
+          assertEquals(d, actual.nextDoc());
+          assertEquals("doc " + d, expected.longValue(), actual.longValue());
+        }
+        assertEquals(NO_MORE_DOCS, actual.nextDoc());
       }
     }
 
@@ -403,9 +620,13 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
     // be undead:
     NumericDocValues norms = MultiDocValues.getNormValues(r, "content");
    assertNotNull(norms);
-    for(int i=0;i