diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4437792b8c1..0a65d204306 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -22,6 +22,9 @@ API Changes
to iterators, enabling future codec compression improvements. (Mike
McCandless)
+* LUCENE-7475: Norms now support sparsity, allowing codecs to only pay for
+  what is actually used. (Adrien Grand)
+
Bug Fixes
Improvements
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsFormat.java
similarity index 90%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsFormat.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsFormat.java
index 15cdeccca2b..1f7928f2b29 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsFormat.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsFormat.java
@@ -74,7 +74,7 @@ public class Lucene53NormsFormat extends NormsFormat {
@Override
public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException {
- return new Lucene53NormsConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
+ throw new UnsupportedOperationException("This format can only be used for reading");
}
@Override
@@ -82,10 +82,10 @@ public class Lucene53NormsFormat extends NormsFormat {
return new Lucene53NormsProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
}
- private static final String DATA_CODEC = "Lucene53NormsData";
- private static final String DATA_EXTENSION = "nvd";
- private static final String METADATA_CODEC = "Lucene53NormsMetadata";
- private static final String METADATA_EXTENSION = "nvm";
+ static final String DATA_CODEC = "Lucene53NormsData";
+ static final String DATA_EXTENSION = "nvd";
+ static final String METADATA_CODEC = "Lucene53NormsMetadata";
+ static final String METADATA_EXTENSION = "nvm";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsProducer.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsProducer.java
similarity index 100%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsProducer.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsProducer.java
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene53/package-info.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene53/package-info.java
similarity index 93%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene53/package-info.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene53/package-info.java
index 6a035323cd3..93fefb8448a 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene53/package-info.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene53/package-info.java
@@ -17,7 +17,7 @@
/**
* Components from the Lucene 5.3 index format
- * See {@link org.apache.lucene.codecs.lucene54} for an overview
+ * See {@link org.apache.lucene.codecs.lucene53} for an overview
* of the index format.
*/
package org.apache.lucene.codecs.lucene53;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java
similarity index 100%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesFormat.java
similarity index 100%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesFormat.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesFormat.java
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java
similarity index 100%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/package-info.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/package-info.java
similarity index 100%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene54/package-info.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene54/package-info.java
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java
index 32c17527deb..ed74aa8dfd4 100644
--- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java
@@ -171,7 +171,7 @@ public class Lucene60Codec extends Codec {
private final NormsFormat normsFormat = new Lucene53NormsFormat();
@Override
- public final NormsFormat normsFormat() {
+ public NormsFormat normsFormat() {
return normsFormat;
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java
similarity index 99%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java
index 50710752694..58b07ebe73e 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java
@@ -32,9 +32,9 @@ import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
-import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.lucene53.Lucene53NormsFormat;
import org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat;
import org.apache.lucene.codecs.lucene60.Lucene60PointsFormat;
@@ -170,7 +170,7 @@ public class Lucene62Codec extends Codec {
private final NormsFormat normsFormat = new Lucene53NormsFormat();
@Override
- public final NormsFormat normsFormat() {
+ public NormsFormat normsFormat() {
return normsFormat;
}
}
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene62/package.html b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene62/package.html
new file mode 100644
index 00000000000..74e66d93bc6
--- /dev/null
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene62/package.html
@@ -0,0 +1,25 @@
+
+
+
+
+
+
+
+Lucene 6.2 file format.
+
+
diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
index 875aba527e2..6954d7a641c 100644
--- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
+++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
@@ -14,3 +14,4 @@
# limitations under the License.
org.apache.lucene.codecs.lucene60.Lucene60Codec
+org.apache.lucene.codecs.lucene62.Lucene62Codec
diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
index 4a812de77e5..26984efd409 100644
--- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
+++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
@@ -13,3 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsConsumer.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/Lucene53NormsConsumer.java
similarity index 99%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsConsumer.java
rename to lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/Lucene53NormsConsumer.java
index 833500c1930..ddb968c8b3e 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene53/Lucene53NormsConsumer.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/Lucene53NormsConsumer.java
@@ -58,7 +58,7 @@ class Lucene53NormsConsumer extends NormsConsumer {
@Override
public void addNormsField(FieldInfo field, NormsProducer normsProducer) throws IOException {
- addNormsField(field, LegacyDocValuesIterables.normsIterable(field, normsProducer, maxDoc));
+ addNormsField(field, LegacyDocValuesIterables.normsIterable(field, normsProducer, maxDoc, true));
}
private void addNormsField(FieldInfo field, Iterable values) throws IOException {
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/Lucene53RWNormsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/Lucene53RWNormsFormat.java
new file mode 100644
index 00000000000..86a2b6a509b
--- /dev/null
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/Lucene53RWNormsFormat.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene53;
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.NormsConsumer;
+import org.apache.lucene.index.SegmentWriteState;
+
+public class Lucene53RWNormsFormat extends Lucene53NormsFormat {
+
+ @Override
+ public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException {
+ return new Lucene53NormsConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
+ }
+
+}
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene53/TestLucene53NormsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/TestLucene53NormsFormat.java
similarity index 85%
rename from lucene/core/src/test/org/apache/lucene/codecs/lucene53/TestLucene53NormsFormat.java
rename to lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/TestLucene53NormsFormat.java
index c87c51ff6c0..80a8eee6269 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene53/TestLucene53NormsFormat.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene53/TestLucene53NormsFormat.java
@@ -16,19 +16,23 @@
*/
package org.apache.lucene.codecs.lucene53;
-
import org.apache.lucene.codecs.Codec;
-import org.apache.lucene.codecs.lucene70.Lucene70Codec;
+import org.apache.lucene.codecs.lucene62.Lucene62RWCodec;
import org.apache.lucene.index.BaseNormsFormatTestCase;
/**
* Tests Lucene53NormsFormat
*/
public class TestLucene53NormsFormat extends BaseNormsFormatTestCase {
- private final Codec codec = new Lucene70Codec();
-
+ private final Codec codec = new Lucene62RWCodec();
+
@Override
protected Codec getCodec() {
return codec;
}
-}
+
+ @Override
+ protected boolean codecSupportsSparsity() {
+ return false;
+ }
+}
\ No newline at end of file
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java
similarity index 100%
rename from lucene/core/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java
rename to lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene62/Lucene62RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene62/Lucene62RWCodec.java
new file mode 100644
index 00000000000..fcb414def03
--- /dev/null
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene62/Lucene62RWCodec.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene62;
+
+import org.apache.lucene.codecs.NormsFormat;
+import org.apache.lucene.codecs.lucene53.Lucene53RWNormsFormat;
+import org.apache.lucene.codecs.lucene62.Lucene62Codec;
+
+public class Lucene62RWCodec extends Lucene62Codec {
+
+ private final NormsFormat normsFormat = new Lucene53RWNormsFormat();
+
+ @Override
+ public NormsFormat normsFormat() {
+ return normsFormat;
+ }
+
+}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
index c0b88cc3de0..b01924a85dc 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
@@ -187,7 +187,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
};
}
- private Bits getNumericDocsWithField(FieldInfo fieldInfo) throws IOException {
+ public Bits getNumericDocsWithField(FieldInfo fieldInfo) throws IOException {
final OneField field = fields.get(fieldInfo.name);
final IndexInput in = data.clone();
final BytesRefBuilder scratch = new BytesRefBuilder();
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextNormsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextNormsFormat.java
index faa50b765c7..26b00ec6238 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextNormsFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextNormsFormat.java
@@ -30,7 +30,6 @@ import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.Accountable;
-import org.apache.lucene.util.Bits;
/**
* plain-text norms format.
@@ -70,7 +69,7 @@ public class SimpleTextNormsFormat extends NormsFormat {
@Override
public NumericDocValues getNorms(FieldInfo field) throws IOException {
- return new LegacyNumericDocValuesWrapper(new Bits.MatchAllBits(impl.maxDoc), impl.getNumericNonIterator(field));
+ return new LegacyNumericDocValuesWrapper(impl.getNumericDocsWithField(field), impl.getNumericNonIterator(field));
}
@Override
@@ -117,7 +116,7 @@ public class SimpleTextNormsFormat extends NormsFormat {
@Override
public void addNormsField(FieldInfo field, NormsProducer normsProducer) throws IOException {
- impl.addNumericField(field, LegacyDocValuesIterables.normsIterable(field, normsProducer, impl.numDocs));
+ impl.addNumericField(field, LegacyDocValuesIterables.normsIterable(field, normsProducer, impl.numDocs, false));
}
@Override
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/LegacyDocValuesIterables.java b/lucene/core/src/java/org/apache/lucene/codecs/LegacyDocValuesIterables.java
index 63f93dbef84..74c2d801ec3 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/LegacyDocValuesIterables.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/LegacyDocValuesIterables.java
@@ -372,7 +372,8 @@ public class LegacyDocValuesIterables {
*
* @deprecated Consume {@link NumericDocValues} instead. */
@Deprecated
- public static Iterable normsIterable(final FieldInfo field, final NormsProducer normsProducer, final int maxDoc) {
+ public static Iterable normsIterable(final FieldInfo field,
+ final NormsProducer normsProducer, final int maxDoc, boolean missingAsZero) {
return new Iterable() {
@@ -411,9 +412,11 @@ public class LegacyDocValuesIterables {
} catch (IOException ioe) {
throw new RuntimeException(ioe);
}
- } else {
+ } else if (missingAsZero) {
// Unlike NumericDocValues, norms should return for missing values:
result = 0;
+ } else {
+ result = null;
}
return result;
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java
index f76ac06392e..9170c69bb3d 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java
@@ -17,7 +17,7 @@
/**
* Components from the Lucene 5.0 index format
- * See {@link org.apache.lucene.codecs.lucene53} for an overview
+ * See {@link org.apache.lucene.codecs.lucene50} for an overview
* of the index format.
*/
package org.apache.lucene.codecs.lucene50;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70Codec.java
index 8d86649e30c..7f9aed0ed96 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70Codec.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70Codec.java
@@ -35,7 +35,6 @@ import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
-import org.apache.lucene.codecs.lucene53.Lucene53NormsFormat;
import org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat;
import org.apache.lucene.codecs.lucene60.Lucene60PointsFormat;
import org.apache.lucene.codecs.lucene62.Lucene62SegmentInfoFormat;
@@ -168,7 +167,7 @@ public class Lucene70Codec extends Codec {
private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene50");
private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene70");
- private final NormsFormat normsFormat = new Lucene53NormsFormat();
+ private final NormsFormat normsFormat = new Lucene70NormsFormat();
@Override
public final NormsFormat normsFormat() {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java
new file mode 100644
index 00000000000..00cd5ecde33
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene70;
+
+import static org.apache.lucene.codecs.lucene70.Lucene70NormsFormat.VERSION_CURRENT;
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.NormsConsumer;
+import org.apache.lucene.codecs.NormsProducer;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * Writer for {@link Lucene70NormsFormat}
+ */
+final class Lucene70NormsConsumer extends NormsConsumer {
+ IndexOutput data, meta;
+ final int maxDoc;
+
+ Lucene70NormsConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
+ boolean success = false;
+ try {
+ String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
+ data = state.directory.createOutput(dataName, state.context);
+ CodecUtil.writeIndexHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+ String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
+ meta = state.directory.createOutput(metaName, state.context);
+ CodecUtil.writeIndexHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+ maxDoc = state.segmentInfo.maxDoc();
+ success = true;
+ } finally {
+ if (!success) {
+ IOUtils.closeWhileHandlingException(this);
+ }
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ boolean success = false;
+ try {
+ if (meta != null) {
+ meta.writeInt(-1); // write EOF marker
+ CodecUtil.writeFooter(meta); // write checksum
+ }
+ if (data != null) {
+ CodecUtil.writeFooter(data); // write checksum
+ }
+ success = true;
+ } finally {
+ if (success) {
+ IOUtils.close(data, meta);
+ } else {
+ IOUtils.closeWhileHandlingException(data, meta);
+ }
+ meta = data = null;
+ }
+ }
+
+ @Override
+ public void addNormsField(FieldInfo field, NormsProducer normsProducer) throws IOException {
+ NumericDocValues values = normsProducer.getNorms(field);
+ int numDocsWithValue = 0;
+ long min = Long.MAX_VALUE;
+ long max = Long.MIN_VALUE;
+ for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
+ numDocsWithValue++;
+ long v = values.longValue();
+ min = Math.min(min, v);
+ max = Math.max(max, v);
+ }
+ assert numDocsWithValue <= maxDoc;
+
+ meta.writeInt(field.number);
+
+ if (numDocsWithValue == 0) {
+ meta.writeLong(-2);
+ } else if (numDocsWithValue == maxDoc) {
+ meta.writeLong(-1);
+ } else {
+ meta.writeLong(data.getFilePointer());
+ values = normsProducer.getNorms(field);
+ SparseDISI.writeBitSet(values, maxDoc, data);
+ }
+
+ meta.writeInt(numDocsWithValue);
+ int numBytesPerValue = numBytesPerValue(min, max);
+
+ meta.writeByte((byte) numBytesPerValue);
+ if (numBytesPerValue == 0) {
+ meta.writeLong(min);
+ } else {
+ meta.writeLong(data.getFilePointer());
+ values = normsProducer.getNorms(field);
+ writeValues(values, numBytesPerValue, data);
+ }
+ }
+
+ private int numBytesPerValue(long min, long max) {
+ if (min >= max) {
+ return 0;
+ } else if (min >= Byte.MIN_VALUE && max <= Byte.MAX_VALUE) {
+ return 1;
+ } else if (min >= Short.MIN_VALUE && max <= Short.MAX_VALUE) {
+ return 2;
+ } else if (min >= Integer.MIN_VALUE && max <= Integer.MAX_VALUE) {
+ return 4;
+ } else {
+ return 8;
+ }
+ }
+
+ private void writeValues(NumericDocValues values, int numBytesPerValue, IndexOutput out) throws IOException, AssertionError {
+ for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
+ long value = values.longValue();
+ switch (numBytesPerValue) {
+ case 1:
+ out.writeByte((byte) value);
+ break;
+ case 2:
+ out.writeShort((short) value);
+ break;
+ case 4:
+ out.writeInt((int) value);
+ break;
+ case 8:
+ out.writeLong(value);
+ break;
+ default:
+ throw new AssertionError();
+ }
+ }
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java
new file mode 100644
index 00000000000..7e70b246967
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene70;
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.NormsConsumer;
+import org.apache.lucene.codecs.NormsFormat;
+import org.apache.lucene.codecs.NormsProducer;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.DataOutput;
+
+/**
+ * Lucene 7.0 Score normalization format.
+ *
+ * Encodes normalization values by encoding each value with the minimum
+ * number of bytes needed to represent the range (which can be zero).
+ *
+ * Files:
+ *
+ * - .nvd: Norms data
+ * - .nvm: Norms metadata
+ *
+ *
+ * -
+ *
The Norms metadata or .nvm file.
+ * For each norms field, this stores metadata, such as the offset into the
+ * Norms data (.nvd)
+ * Norms metadata (.nvm) --> Header,<Entry>NumFields,Footer
+ *
+ * - Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
+ * - Entry --> FieldNumber, DocsWithFieldAddress, NumDocsWithField, BytesPerNorm, NormsAddress
+ * - FieldNumber --> {@link DataOutput#writeInt Int32}
+ * - DocsWithFieldAddress --> {@link DataOutput#writeLong Int64}
+ * - NumDocsWithField --> {@link DataOutput#writeInt Int32}
+ * - BytesPerNorm --> {@link DataOutput#writeByte byte}
+ * - NormsAddress --> {@link DataOutput#writeLong Int64}
+ * - Footer --> {@link CodecUtil#writeFooter CodecFooter}
+ *
+ * FieldNumber of -1 indicates the end of metadata.
+ * NormsAddress is the pointer to the start of the data in the norms data (.nvd), or the singleton value
+ * when BytesPerValue = 0. If BytesPerValue is different from 0 then there are NumDocsWithField values
+ * to read at that offset.
+ * DocsWithFieldAddress is the pointer to the start of the bit set containing documents that have a norm
+ * in the norms data (.nvd), or -2 if no documents have a norm value, or -1 if all documents have a norm
+ * value.
+ * -
+ *
The Norms data or .nvd file.
+ * For each Norms field, this stores the actual per-document data (the heavy-lifting)
+ * Norms data (.nvd) --> Header,< Data >NumFields,Footer
+ *
+ * - Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
+ * - DocsWithFieldData --> Bit set of MaxDoc bits
+ * - NormsData --> {@link DataOutput#writeByte(byte) byte}NumDocsWithField * BytesPerValue
+ * - Footer --> {@link CodecUtil#writeFooter CodecFooter}
+ *
+ *
+ * @lucene.experimental
+ */
+public class Lucene70NormsFormat extends NormsFormat {
+
+ /** Sole Constructor */
+ public Lucene70NormsFormat() {}
+
+ @Override
+ public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException {
+ return new Lucene70NormsConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
+ }
+
+ @Override
+ public NormsProducer normsProducer(SegmentReadState state) throws IOException {
+ return new Lucene70NormsProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
+ }
+
+ private static final String DATA_CODEC = "Lucene70NormsData";
+ private static final String DATA_EXTENSION = "nvd";
+ private static final String METADATA_CODEC = "Lucene70NormsMetadata";
+ private static final String METADATA_EXTENSION = "nvm";
+ static final int VERSION_START = 0;
+ static final int VERSION_CURRENT = VERSION_START;
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java
new file mode 100644
index 00000000000..ee96c1583b5
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java
@@ -0,0 +1,271 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene70;
+
+import static org.apache.lucene.codecs.lucene70.Lucene70NormsFormat.VERSION_CURRENT;
+import static org.apache.lucene.codecs.lucene70.Lucene70NormsFormat.VERSION_START;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.NormsProducer;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * Reader for {@link Lucene70NormsFormat}
+ */
+final class Lucene70NormsProducer extends NormsProducer {
+ // metadata maps (just file pointers and minimal stuff)
+ private final Map norms = new HashMap<>();
+ private final IndexInput data;
+ private final int maxDoc;
+
+ Lucene70NormsProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
+ maxDoc = state.segmentInfo.maxDoc();
+ String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
+ int version = -1;
+
+ // read in the entries from the metadata file.
+ try (ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context)) {
+ Throwable priorE = null;
+ try {
+ version = CodecUtil.checkIndexHeader(in, metaCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+ readFields(in, state.fieldInfos);
+ } catch (Throwable exception) {
+ priorE = exception;
+ } finally {
+ CodecUtil.checkFooter(in, priorE);
+ }
+ }
+
+ String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
+ data = state.directory.openInput(dataName, state.context);
+ boolean success = false;
+ try {
+ final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+ if (version != version2) {
+ throw new CorruptIndexException("Format versions mismatch: meta=" + version + ",data=" + version2, data);
+ }
+
+ // NOTE: data file is too costly to verify checksum against all the bytes on open,
+ // but for now we at least verify proper structure of the checksum footer: which looks
+ // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
+ // such as file truncation.
+ CodecUtil.retrieveChecksum(data);
+
+ success = true;
+ } finally {
+ if (!success) {
+ IOUtils.closeWhileHandlingException(this.data);
+ }
+ }
+ }
+
+ static class NormsEntry {
+ byte bytesPerNorm;
+ long docsWithFieldOffset;
+ int numDocsWithField;
+ long normsOffset;
+ }
+
+ static abstract class LongValues {
+ abstract long get(int index) throws IOException;
+ }
+
+ private void readFields(IndexInput meta, FieldInfos infos) throws IOException {
+ for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) {
+ FieldInfo info = infos.fieldInfo(fieldNumber);
+ if (info == null) {
+ throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
+ } else if (!info.hasNorms()) {
+ throw new CorruptIndexException("Invalid field: " + info.name, meta);
+ }
+ NormsEntry entry = new NormsEntry();
+ entry.docsWithFieldOffset = meta.readLong();
+ entry.numDocsWithField = meta.readInt();
+ entry.bytesPerNorm = meta.readByte();
+ switch (entry.bytesPerNorm) {
+ case 0: case 1: case 2: case 4: case 8:
+ break;
+ default:
+ throw new CorruptIndexException("Invalid bytesPerValue: " + entry.bytesPerNorm + ", field: " + info.name, meta);
+ }
+ entry.normsOffset = meta.readLong();
+ norms.put(info.number, entry);
+ }
+ }
+
+ @Override
+ public NumericDocValues getNorms(FieldInfo field) throws IOException {
+ final NormsEntry entry = norms.get(field.number);
+ if (entry.docsWithFieldOffset == -2) {
+ // empty
+ return DocValues.emptyNumeric();
+ } else if (entry.docsWithFieldOffset == -1) {
+ // dense
+ final LongValues normValues = getNormValues(entry);
+ return new NumericDocValues() {
+
+ int doc = -1;
+
+ @Override
+ public long longValue() throws IOException {
+ return normValues.get(doc);
+ }
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ if (target >= maxDoc) {
+ return doc = NO_MORE_DOCS;
+ }
+ return doc = target;
+ }
+
+ @Override
+ public long cost() {
+ return maxDoc;
+ }
+
+ };
+ } else {
+ // sparse
+ final LongValues normValues = getNormValues(entry);
+ final SparseDISI disi;
+ synchronized (data) {
+ disi = new SparseDISI(maxDoc, data, entry.docsWithFieldOffset, entry.numDocsWithField);
+ }
+ return new NumericDocValues() {
+
+ @Override
+ public int advance(int target) throws IOException {
+ return disi.advance(target);
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return disi.nextDoc();
+ }
+
+ @Override
+ public int docID() {
+ return disi.docID();
+ }
+
+ @Override
+ public long cost() {
+ return entry.numDocsWithField;
+ }
+
+ @Override
+ public long longValue() throws IOException {
+ return normValues.get(disi.index());
+ }
+ };
+ }
+ }
+
+ private LongValues getNormValues(NormsEntry entry) throws IOException {
+ if (entry.bytesPerNorm == 0) {
+ return new LongValues() {
+ @Override
+ long get(int index) {
+ return entry.normsOffset;
+ }
+ };
+ } else {
+ RandomAccessInput slice;
+ synchronized (data) {
+ slice = data.randomAccessSlice(entry.normsOffset, entry.numDocsWithField * (long) entry.bytesPerNorm);
+ }
+ switch (entry.bytesPerNorm) {
+ case 1:
+ return new LongValues() {
+ @Override
+ long get(int index) throws IOException {
+ return slice.readByte(index);
+ }
+ };
+ case 2:
+ return new LongValues() {
+ @Override
+ long get(int index) throws IOException {
+ return slice.readShort(((long) index) << 1);
+ }
+ };
+ case 4:
+ return new LongValues() {
+ @Override
+ long get(int index) throws IOException {
+ return slice.readInt(((long) index) << 2);
+ }
+ };
+ case 8:
+ return new LongValues() {
+ @Override
+ long get(int index) throws IOException {
+ return slice.readLong(((long) index) << 3);
+ }
+ };
+ default:
+ // should not happen, we already validate bytesPerNorm in readFields
+ throw new AssertionError();
+ }
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ data.close();
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return 64L * norms.size(); // good enough
+ }
+
+ @Override
+ public void checkIntegrity() throws IOException {
+ CodecUtil.checksumEntireFile(data);
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName() + "(fields=" + norms.size() + ")";
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/SparseDISI.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/SparseDISI.java
new file mode 100644
index 00000000000..af71b9ed99c
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/SparseDISI.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene70;
+
+import java.io.IOException;
+
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RandomAccessInput;
+
/**
 * Disk-based {@link DocIdSetIterator} over a fixed-width bit set of maxDoc bits,
 * which additionally tracks the dense ordinal ({@link #index()}) of the current doc.
 */
final class SparseDISI extends DocIdSetIterator {

  /**
   * Writes {@code it} to {@code out} as a plain bit set of {@code maxDoc} bits,
   * i.e. exactly {@code (maxDoc + 63) / 64} longs.
   */
  static void writeBitSet(DocIdSetIterator it, int maxDoc, IndexOutput out) throws IOException {
    int currentIndex = 0;
    long currentBits = 0;
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
      final int index = doc >>> 6; // which 64-bit word holds this doc's bit
      if (index > currentIndex) {
        // flush the completed word, then zero-fill any entirely-empty words in between
        out.writeLong(currentBits);
        for (int i = currentIndex + 1; i < index; ++i) {
          out.writeLong(0L);
        }
        currentIndex = index;
        currentBits = 0L;
      }
      currentBits |= 1L << doc; // Java long shifts only use the low 6 bits of 'doc'
    }

    // flush the last word that received bits, then zero-fill the tail up to maxDoc bits
    out.writeLong(currentBits);
    final int maxIndex = (maxDoc - 1) >>> 6;
    for (int i = currentIndex + 1; i <= maxIndex; ++i) {
      out.writeLong(0L);
    }
  }

  final int maxDoc;
  final int numWords;            // number of 64-bit words in the on-disk bit set
  final long cost;               // advertised cost: number of set bits
  final RandomAccessInput slice; // random access over the serialized bit set
  int doc = -1;                  // current doc ID, -1 before iteration, NO_MORE_DOCS when exhausted
  int wordIndex = -1;            // index of the word currently cached in 'word'
  long word;                     // cached word of the bit set
  int index = -1;                // (number of set bits in words 0..wordIndex) - 1

  SparseDISI(int maxDoc, IndexInput in, long offset, long cost) throws IOException {
    this.maxDoc = maxDoc;
    this.numWords = (int) ((maxDoc + 63L) >>> 6);
    this.slice = in.randomAccessSlice(offset, numWords * 8L);
    this.cost = cost;
  }

  @Override
  public int advance(int target) throws IOException {
    if (target >= maxDoc) {
      return doc = NO_MORE_DOCS;
    }

    // Read forward up to the word containing 'target', accumulating popcounts so
    // that 'index' stays in sync with the dense ordinal bookkeeping.
    final int targetWordIndex = target >>> 6;
    for (int i = wordIndex + 1; i <= targetWordIndex; ++i) {
      word = slice.readLong(i << 3);
      index += Long.bitCount(word);
    }
    wordIndex = targetWordIndex;

    // Any set bit at or after 'target' within the current word?
    long leftBits = word >>> target;
    if (leftBits != 0L) {
      return doc = target + Long.numberOfTrailingZeros(leftBits);
    }

    // Otherwise scan subsequent words for the next non-empty one.
    while (++wordIndex < numWords) {
      word = slice.readLong(wordIndex << 3);
      if (word != 0) {
        index += Long.bitCount(word);
        return doc = (wordIndex << 6) + Long.numberOfTrailingZeros(word);
      }
    }

    return doc = NO_MORE_DOCS;
  }

  @Override
  public int nextDoc() throws IOException {
    return advance(doc + 1);
  }

  @Override
  public int docID() {
    return doc;
  }

  @Override
  public long cost() {
    return cost;
  }

  /**
   * Returns the dense ordinal of the current doc, i.e. the number of set bits
   * strictly before it. 'index' counts all bits through the end of the current
   * word, so we subtract the bits at or after 'doc' (and add the doc itself back).
   */
  public int index() {
    return index - Long.bitCount(word >>> doc) + 1;
  }

}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java
index 77492ad9659..9b432f7c4f4 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java
@@ -163,7 +163,7 @@
* all documents omit position data.
*
*
- * {@link org.apache.lucene.codecs.lucene53.Lucene53NormsFormat Normalization factors}.
+ * {@link org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Normalization factors}.
* For each field in each document, a value is stored
* that is multiplied into the score for hits on that field.
*
@@ -278,12 +278,12 @@
* Stores additional per-position metadata information such as character offsets and user payloads |
*
*
- * {@link org.apache.lucene.codecs.lucene53.Lucene53NormsFormat Norms} |
+ * {@link org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Norms} |
* .nvd, .nvm |
* Encodes length and boost factors for docs and fields |
*
*
- * {@link org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat Per-Document Values} |
+ * {@link org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat Per-Document Values} |
* .dvd, .dvm |
* Encodes additional scoring factors or other per-document information. |
*
diff --git a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
index e941911ef2b..e2ece543506 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
@@ -665,8 +665,17 @@ final class DefaultIndexingChain extends DocConsumer {
}
public void finish() throws IOException {
- if (fieldInfo.omitsNorms() == false && invertState.length != 0) {
- norms.addValue(docState.docID, similarity.computeNorm(invertState));
+ if (fieldInfo.omitsNorms() == false) {
+ long normValue;
+ if (invertState.length == 0) {
+ // the field exists in this document, but it did not have
+ // any indexed tokens, so we assign a default value of zero
+ // to the norm
+ normValue = 0;
+ } else {
+ normValue = similarity.computeNorm(invertState);
+ }
+ norms.addValue(docState.docID, normValue);
}
termsHashPerField.finish();
diff --git a/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java
index c444661708b..46b8c1ceb15 100644
--- a/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java
@@ -21,7 +21,10 @@ import java.io.IOException;
import org.apache.lucene.codecs.NormsConsumer;
import org.apache.lucene.codecs.NormsProducer;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.Counter;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;
@@ -29,29 +32,34 @@ import org.apache.lucene.util.packed.PackedLongValues;
* segment flushes. */
class NormValuesWriter {
- private final static long MISSING = 0L;
-
+ private FixedBitSet docsWithField;
private PackedLongValues.Builder pending;
private final Counter iwBytesUsed;
private long bytesUsed;
private final FieldInfo fieldInfo;
+ private int lastDocID = -1;
public NormValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
+ docsWithField = new FixedBitSet(64);
pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
- bytesUsed = pending.ramBytesUsed();
+ bytesUsed = pending.ramBytesUsed() + docsWithField.ramBytesUsed();
this.fieldInfo = fieldInfo;
this.iwBytesUsed = iwBytesUsed;
iwBytesUsed.addAndGet(bytesUsed);
}
public void addValue(int docID, long value) {
- // Fill in any holes:
- for (int i = (int)pending.size(); i < docID; ++i) {
- pending.add(MISSING);
+ if (docID <= lastDocID) {
+ throw new IllegalArgumentException("Norm for \"" + fieldInfo.name + "\" appears more than once in this document (only one value is allowed per field)");
}
pending.add(value);
+ docsWithField = FixedBitSet.ensureCapacity(docsWithField, docID);
+ docsWithField.set(docID);
+
updateBytesUsed();
+
+ lastDocID = docID;
}
private void updateBytesUsed() {
@@ -65,7 +73,6 @@ class NormValuesWriter {
public void flush(SegmentWriteState state, NormsConsumer normsConsumer) throws IOException {
- final int maxDoc = state.segmentInfo.maxDoc();
final PackedLongValues values = pending.build();
normsConsumer.addNormsField(fieldInfo,
@@ -75,7 +82,7 @@ class NormValuesWriter {
if (fieldInfo != NormValuesWriter.this.fieldInfo) {
throw new IllegalArgumentException("wrong fieldInfo");
}
- return new BufferedNorms(maxDoc, values);
+ return new BufferedNorms(values, docsWithField);
}
@Override
@@ -98,36 +105,28 @@ class NormValuesWriter {
// iterates over the values we have in ram
private static class BufferedNorms extends NumericDocValues {
final PackedLongValues.Iterator iter;
- final int size;
- final int maxDoc;
- private int docID = -1;
+ final DocIdSetIterator docsWithField;
private long value;
-
- BufferedNorms(int maxDoc, PackedLongValues values) {
- this.maxDoc = maxDoc;
+
+ BufferedNorms(PackedLongValues values, FixedBitSet docsWithFields) {
this.iter = values.iterator();
- this.size = (int) values.size();
- }
-
- @Override
- public int docID() {
- return docID;
+ this.docsWithField = new BitSetIterator(docsWithFields, values.size());
}
@Override
- public int nextDoc() {
- docID++;
- if (docID == maxDoc) {
- docID = NO_MORE_DOCS;
- }
- if (docID < size) {
+ public int docID() {
+ return docsWithField.docID();
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ int docID = docsWithField.nextDoc();
+ if (docID != NO_MORE_DOCS) {
value = iter.next();
- } else {
- value = MISSING;
}
return docID;
}
-
+
@Override
public int advance(int target) {
throw new UnsupportedOperationException();
@@ -135,7 +134,7 @@ class NormValuesWriter {
@Override
public long cost() {
- return maxDoc;
+ return docsWithField.cost();
}
@Override
diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
index 85aa3a7cd37..773c168280b 100644
--- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
+++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
@@ -13,5 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-org.apache.lucene.codecs.lucene62.Lucene62Codec
org.apache.lucene.codecs.lucene70.Lucene70Codec
diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
index 1161f039864..20463c5dc45 100644
--- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
+++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
@@ -13,5 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat
org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70NormsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70NormsFormat.java
new file mode 100644
index 00000000000..cc07ceec5a5
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70NormsFormat.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene70;
+
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.lucene70.Lucene70Codec;
+import org.apache.lucene.index.BaseNormsFormatTestCase;
+
+/**
+ * Tests Lucene70NormsFormat
+ */
public class TestLucene70NormsFormat extends BaseNormsFormatTestCase {
  // Codec under test; reused across test methods of this instance.
  private final Codec codec = new Lucene70Codec();

  @Override
  protected Codec getCodec() {
    return codec;
  }
}
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestSparseDISI.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestSparseDISI.java
new file mode 100644
index 00000000000..1911bd023e2
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestSparseDISI.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene70;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BitSetIterator;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+public class TestSparseDISI extends LuceneTestCase {
+
+ public void testRandom() throws IOException {
+ try (Directory dir = newDirectory()) {
+ for (int i = 0; i < 1000; ++i) {
+ doTestRandom(dir);
+ }
+ }
+ }
+
+ private void doTestRandom(Directory dir) throws IOException {
+ List docs = new ArrayList<>();
+ final int maxStep = TestUtil.nextInt(random(), 1, 1 << TestUtil.nextInt(random(), 2, 10));
+ final int numDocs = TestUtil.nextInt(random(), 1, 1000);
+ for (int doc = -1, i = 0; i < numDocs; ++i) {
+ doc += TestUtil.nextInt(random(), 1, maxStep);
+ docs.add(doc);
+ }
+ final int maxDoc = docs.get(docs.size() - 1) + TestUtil.nextInt(random(), 1, 100);
+
+ FixedBitSet set = new FixedBitSet(maxDoc);
+ for (int doc : docs) {
+ set.set(doc);
+ }
+
+ try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
+ SparseDISI.writeBitSet(new BitSetIterator(set, docs.size()), maxDoc, out);
+ }
+
+ try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
+ SparseDISI disi = new SparseDISI(maxDoc, in, 0L, docs.size());
+ BitSetIterator disi2 = new BitSetIterator(set, docs.size());
+ int i = 0;
+ for (int doc = disi2.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi2.nextDoc()) {
+ assertEquals(doc, disi.nextDoc());
+ assertEquals(i++, disi.index());
+ }
+ assertEquals(DocIdSetIterator.NO_MORE_DOCS, disi.nextDoc());
+ }
+
+ for (int step : new int[] {1, 20, maxStep, maxStep * 10}) {
+ try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
+ SparseDISI disi = new SparseDISI(maxDoc, in, 0L, docs.size());
+ BitSetIterator disi2 = new BitSetIterator(set, docs.size());
+ while (true) {
+ int target = disi2.docID() + step;
+ int doc = disi2.advance(target);
+ assertEquals(doc, disi.advance(target));
+ if (doc == DocIdSetIterator.NO_MORE_DOCS) {
+ break;
+ }
+ int index = Collections.binarySearch(docs, doc);
+ assertEquals(index, disi.index());
+ }
+ }
+ }
+
+ dir.deleteFile("foo");
+ }
+
+}
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestNorms.java b/lucene/core/src/test/org/apache/lucene/index/TestNorms.java
index a51f1d9f57c..52038bc0874 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestNorms.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestNorms.java
@@ -23,6 +23,7 @@ import java.util.Random;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
@@ -186,4 +187,24 @@ public class TestNorms extends LuceneTestCase {
throw new UnsupportedOperationException();
}
}
+
  // Verifies that a document with an empty indexed field gets a norm (of 0),
  // while a document without the field gets no norm at all.
  public void testEmptyValueVsNoValue() throws IOException {
    Directory dir = newDirectory();
    IndexWriterConfig cfg = newIndexWriterConfig().setMergePolicy(newLogMergePolicy());
    IndexWriter w = new IndexWriter(dir, cfg);
    Document doc = new Document();
    w.addDocument(doc); // doc 0: field "foo" absent entirely
    doc.add(newTextField("foo", "", Store.NO));
    w.addDocument(doc); // doc 1: field present but produces no tokens
    w.forceMerge(1);
    IndexReader reader = DirectoryReader.open(w);
    w.close();
    LeafReader leafReader = getOnlyLeafReader(reader);
    NumericDocValues normValues = leafReader.getNormValues("foo");
    assertNotNull(normValues);
    assertEquals(1, normValues.nextDoc()); // doc 0 does not have norms
    assertEquals(0, normValues.longValue());
    reader.close();
    dir.close();
  }
}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java
index dc0636ddfd5..9ca13ddd78b 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java
@@ -30,9 +30,11 @@ import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.CollectionStatistics;
+import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.TestUtil;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
@@ -46,12 +48,17 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
* if there is some bug in a given NormsFormat that this
* test fails to catch then this test needs to be improved! */
public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCase {
-
+
  /** Whether the codec under test supports sparse values; subclasses for codecs
   *  without sparse norms support override this to return {@code false}, which
   *  skips the {@code testSparse*} variants via {@code assumeTrue}. */
  protected boolean codecSupportsSparsity() {
    return true;
  }
+
public void testByteRange() throws Exception {
int iterations = atLeast(1);
final Random r = random();
for (int i = 0; i < iterations; i++) {
- doTestNormsVersusDocValues(new LongProducer() {
+ doTestNormsVersusDocValues(1, new LongProducer() {
@Override
long next() {
return TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
@@ -59,12 +66,26 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
});
}
}
-
+
  // Like testByteRange, but with a random density in [0, 1): only that fraction
  // of documents receives a norm value.
  public void testSparseByteRange() throws Exception {
    assumeTrue("Requires sparse norms support", codecSupportsSparsity());
    int iterations = atLeast(1);
    final Random r = random();
    for (int i = 0; i < iterations; i++) {
      doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
        @Override
        long next() {
          return TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
        }
      });
    }
  }
+
public void testShortRange() throws Exception {
int iterations = atLeast(1);
final Random r = random();
for (int i = 0; i < iterations; i++) {
- doTestNormsVersusDocValues(new LongProducer() {
+ doTestNormsVersusDocValues(1, new LongProducer() {
@Override
long next() {
return TestUtil.nextLong(r, Short.MIN_VALUE, Short.MAX_VALUE);
@@ -72,12 +93,26 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
});
}
}
-
+
  // Like testShortRange, but with a random density in [0, 1) of documents
  // carrying a norm value.
  public void testSparseShortRange() throws Exception {
    assumeTrue("Requires sparse norms support", codecSupportsSparsity());
    int iterations = atLeast(1);
    final Random r = random();
    for (int i = 0; i < iterations; i++) {
      doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
        @Override
        long next() {
          return TestUtil.nextLong(r, Short.MIN_VALUE, Short.MAX_VALUE);
        }
      });
    }
  }
+
public void testLongRange() throws Exception {
int iterations = atLeast(1);
final Random r = random();
for (int i = 0; i < iterations; i++) {
- doTestNormsVersusDocValues(new LongProducer() {
+ doTestNormsVersusDocValues(1, new LongProducer() {
@Override
long next() {
return TestUtil.nextLong(r, Long.MIN_VALUE, Long.MAX_VALUE);
@@ -85,12 +120,26 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
});
}
}
-
+
  // Like testLongRange, but with a random density in [0, 1) of documents
  // carrying a norm value.
  public void testSparseLongRange() throws Exception {
    assumeTrue("Requires sparse norms support", codecSupportsSparsity());
    int iterations = atLeast(1);
    final Random r = random();
    for (int i = 0; i < iterations; i++) {
      doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
        @Override
        long next() {
          return TestUtil.nextLong(r, Long.MIN_VALUE, Long.MAX_VALUE);
        }
      });
    }
  }
+
public void testFullLongRange() throws Exception {
int iterations = atLeast(1);
final Random r = random();
for (int i = 0; i < iterations; i++) {
- doTestNormsVersusDocValues(new LongProducer() {
+ doTestNormsVersusDocValues(1, new LongProducer() {
@Override
long next() {
int thingToDo = r.nextInt(3);
@@ -103,12 +152,31 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
});
}
}
-
+
  // Like testFullLongRange (values biased toward Long.MIN_VALUE/MAX_VALUE
  // extremes), but with a random density in [0, 1) of documents with a norm.
  public void testSparseFullLongRange() throws Exception {
    assumeTrue("Requires sparse norms support", codecSupportsSparsity());
    int iterations = atLeast(1);
    final Random r = random();
    for (int i = 0; i < iterations; i++) {
      doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
        @Override
        long next() {
          int thingToDo = r.nextInt(3);
          switch (thingToDo) {
            case 0: return Long.MIN_VALUE;
            case 1: return Long.MAX_VALUE;
            default: return TestUtil.nextLong(r, Long.MIN_VALUE, Long.MAX_VALUE);
          }
        }
      });
    }
  }
+
public void testFewValues() throws Exception {
int iterations = atLeast(1);
final Random r = random();
for (int i = 0; i < iterations; i++) {
- doTestNormsVersusDocValues(new LongProducer() {
+ doTestNormsVersusDocValues(1, new LongProducer() {
@Override
long next() {
return r.nextBoolean() ? 20 : 3;
@@ -116,12 +184,26 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
});
}
}
-
+
  // Like testFewValues (only two distinct norm values), but with a random
  // density in [0, 1) of documents carrying a norm.
  public void testFewSparseValues() throws Exception {
    assumeTrue("Requires sparse norms support", codecSupportsSparsity());
    int iterations = atLeast(1);
    final Random r = random();
    for (int i = 0; i < iterations; i++) {
      doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
        @Override
        long next() {
          return r.nextBoolean() ? 20 : 3;
        }
      });
    }
  }
+
public void testFewLargeValues() throws Exception {
int iterations = atLeast(1);
final Random r = random();
for (int i = 0; i < iterations; i++) {
- doTestNormsVersusDocValues(new LongProducer() {
+ doTestNormsVersusDocValues(1, new LongProducer() {
@Override
long next() {
return r.nextBoolean() ? 1000000L : -5000;
@@ -129,11 +211,25 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
});
}
}
-
+
  // Like testFewLargeValues (two distinct wide-range values), but with a random
  // density in [0, 1) of documents carrying a norm.
  public void testFewSparseLargeValues() throws Exception {
    assumeTrue("Requires sparse norms support", codecSupportsSparsity());
    int iterations = atLeast(1);
    final Random r = random();
    for (int i = 0; i < iterations; i++) {
      doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
        @Override
        long next() {
          return r.nextBoolean() ? 1000000L : -5000;
        }
      });
    }
  }
+
public void testAllZeros() throws Exception {
int iterations = atLeast(1);
for (int i = 0; i < iterations; i++) {
- doTestNormsVersusDocValues(new LongProducer() {
+ doTestNormsVersusDocValues(1, new LongProducer() {
@Override
long next() {
return 0;
@@ -141,12 +237,25 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
});
}
}
-
- public void testSparse() throws Exception {
+
+ public void testSparseAllZeros() throws Exception {
+ assumeTrue("Requires sparse norms support", codecSupportsSparsity());
+ int iterations = atLeast(1);
+ for (int i = 0; i < iterations; i++) {
+ doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
+ @Override
+ long next() {
+ return 0;
+ }
+ });
+ }
+ }
+
+ public void testMostZeros() throws Exception {
int iterations = atLeast(1);
final Random r = random();
for (int i = 0; i < iterations; i++) {
- doTestNormsVersusDocValues(new LongProducer() {
+ doTestNormsVersusDocValues(1, new LongProducer() {
@Override
long next() {
return r.nextInt(100) == 0 ? TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE) : 0;
@@ -160,7 +269,7 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
final Random r = random();
for (int i = 0; i < iterations; i++) {
final long commonValue = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
- doTestNormsVersusDocValues(new LongProducer() {
+ doTestNormsVersusDocValues(1, new LongProducer() {
@Override
long next() {
return r.nextInt(100) == 0 ? TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE) : commonValue;
@@ -168,14 +277,29 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
});
}
}
-
+
+ public void testSparseOutliers() throws Exception {
+ assumeTrue("Requires sparse norms support", codecSupportsSparsity());
+ int iterations = atLeast(1);
+ final Random r = random();
+ for (int i = 0; i < iterations; i++) {
+ final long commonValue = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+ doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
+ @Override
+ long next() {
+ return r.nextInt(100) == 0 ? TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE) : commonValue;
+ }
+ });
+ }
+ }
+
public void testOutliers2() throws Exception {
int iterations = atLeast(1);
final Random r = random();
for (int i = 0; i < iterations; i++) {
final long commonValue = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
final long uncommonValue = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
- doTestNormsVersusDocValues(new LongProducer() {
+ doTestNormsVersusDocValues(1, new LongProducer() {
@Override
long next() {
return r.nextInt(100) == 0 ? uncommonValue : commonValue;
@@ -183,7 +307,23 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
});
}
}
-
+
+ public void testSparseOutliers2() throws Exception {
+ assumeTrue("Requires sparse norms support", codecSupportsSparsity());
+ int iterations = atLeast(1);
+ final Random r = random();
+ for (int i = 0; i < iterations; i++) {
+ final long commonValue = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+ final long uncommonValue = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+ doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
+ @Override
+ long next() {
+ return r.nextInt(100) == 0 ? uncommonValue : commonValue;
+ }
+ });
+ }
+ }
+
public void testNCommon() throws Exception {
final Random r = random();
final int N = TestUtil.nextInt(r, 2, 15);
@@ -196,14 +336,35 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
for (int j = 0; j < numOtherValues; ++j) {
otherValues[j] = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
}
- doTestNormsVersusDocValues(new LongProducer() {
+ doTestNormsVersusDocValues(1, new LongProducer() {
@Override
long next() {
return r.nextInt(100) == 0 ? otherValues[r.nextInt(numOtherValues - 1)] : commonValues[r.nextInt(N - 1)];
}
});
}
-
+
+ public void testSparseNCommon() throws Exception {
+ assumeTrue("Requires sparse norms support", codecSupportsSparsity());
+ final Random r = random();
+ final int N = TestUtil.nextInt(r, 2, 15);
+ final long[] commonValues = new long[N];
+ for (int j = 0; j < N; ++j) {
+ commonValues[j] = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+ }
+ final int numOtherValues = TestUtil.nextInt(r, 2, 256 - N);
+ final long[] otherValues = new long[numOtherValues];
+ for (int j = 0; j < numOtherValues; ++j) {
+ otherValues[j] = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+ }
+ doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
+ @Override
+ long next() {
+ return r.nextInt(100) == 0 ? otherValues[r.nextInt(numOtherValues - 1)] : commonValues[r.nextInt(N - 1)];
+ }
+ });
+ }
+
/**
* a more thorough n-common that tests all low bpv
*/
@@ -224,7 +385,7 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
for (int j = 0; j < numOtherValues; ++j) {
otherValues[j] = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
}
- doTestNormsVersusDocValues(new LongProducer() {
+ doTestNormsVersusDocValues(1, new LongProducer() {
@Override
long next() {
return r.nextInt(100) == 0 ? otherValues[r.nextInt(numOtherValues - 1)] : commonValues[r.nextInt(N - 1)];
@@ -233,17 +394,62 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
}
}
}
-
- private void doTestNormsVersusDocValues(LongProducer longs) throws Exception {
+
+ /**
+ * a more thorough n-common that tests all low bpv and sparse docs
+ */
+ @Nightly
+ public void testSparseNCommonBig() throws Exception {
+ assumeTrue("Requires sparse norms support", codecSupportsSparsity());
+ final int iterations = atLeast(1);
+ final Random r = random();
+ for (int i = 0; i < iterations; ++i) {
+ // 16 is 4 bpv, the max before we jump to 8bpv
+ for (int n = 2; n < 16; ++n) {
+ final int N = n;
+ final long[] commonValues = new long[N];
+ for (int j = 0; j < N; ++j) {
+ commonValues[j] = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+ }
+ final int numOtherValues = TestUtil.nextInt(r, 2, 256 - N);
+ final long[] otherValues = new long[numOtherValues];
+ for (int j = 0; j < numOtherValues; ++j) {
+ otherValues[j] = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+ }
+ doTestNormsVersusDocValues(random().nextDouble(), new LongProducer() {
+ @Override
+ long next() {
+ return r.nextInt(100) == 0 ? otherValues[r.nextInt(numOtherValues - 1)] : commonValues[r.nextInt(N - 1)];
+ }
+ });
+ }
+ }
+ }
+
+ private void doTestNormsVersusDocValues(double density, LongProducer longs) throws Exception {
int numDocs = atLeast(500);
- long norms[] = new long[numDocs];
- for (int i = 0; i < numDocs; i++) {
+ final FixedBitSet docsWithField = new FixedBitSet(numDocs);
+ final int numDocsWithField = Math.max(1, (int) (density * numDocs));
+ if (numDocsWithField == numDocs) {
+ docsWithField.set(0, numDocs);
+ } else {
+ int i = 0;
+ while (i < numDocsWithField) {
+ int doc = random().nextInt(numDocs);
+ if (docsWithField.get(doc) == false) {
+ docsWithField.set(doc);
+ ++i;
+ }
+ }
+ }
+ long norms[] = new long[numDocsWithField];
+ for (int i = 0; i < numDocsWithField; i++) {
norms[i] = longs.next();
}
Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
- IndexWriterConfig conf = newIndexWriterConfig(analyzer);
+ IndexWriterConfig conf = newIndexWriterConfig(analyzer);conf.setMergePolicy(NoMergePolicy.INSTANCE);
conf.setSimilarity(new CannedNormSimilarity(norms));
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
Document doc = new Document();
@@ -254,12 +460,18 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
doc.add(indexedField);
doc.add(dvField);
- for (int i = 0; i < numDocs; i++) {
+ for (int i = 0, j = 0; i < numDocs; i++) {
idField.setStringValue(Integer.toString(i));
- long value = norms[i];
- dvField.setLongValue(value);
- indexedField.setStringValue(Long.toString(value));
- writer.addDocument(doc);
+ if (docsWithField.get(i) == false) {
+ Document doc2 = new Document();
+ doc2.add(idField);
+ writer.addDocument(doc2);
+ } else {
+ long value = norms[j++];
+ dvField.setLongValue(value);
+ indexedField.setStringValue(Long.toString(value));
+ writer.addDocument(doc);
+ }
if (random().nextInt(31) == 0) {
writer.commit();
}
@@ -280,12 +492,14 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
LeafReader r = context.reader();
NumericDocValues expected = r.getNumericDocValues("dv");
NumericDocValues actual = r.getNormValues("indexed");
- for (int i = 0; i < r.maxDoc(); i++) {
- assertEquals(i, expected.nextDoc());
- assertEquals(i, actual.nextDoc());
- assertEquals("doc " + i, expected.longValue(), actual.longValue());
+ assertEquals(expected == null, actual == null);
+ if (expected != null) {
+ for (int d = expected.nextDoc(); d != DocIdSetIterator.NO_MORE_DOCS; d = expected.nextDoc()) {
+ assertEquals(d, actual.nextDoc());
+ assertEquals("doc " + d, expected.longValue(), actual.longValue());
+ }
+ assertEquals(NO_MORE_DOCS, actual.nextDoc());
}
- assertEquals(NO_MORE_DOCS, expected.nextDoc());
}
ir.close();
@@ -297,10 +511,13 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
LeafReader r = context.reader();
NumericDocValues expected = r.getNumericDocValues("dv");
NumericDocValues actual = r.getNormValues("indexed");
- for (int i = 0; i < r.maxDoc(); i++) {
- assertEquals(i, expected.nextDoc());
- assertEquals(i, actual.nextDoc());
- assertEquals("doc " + i, expected.longValue(), actual.longValue());
+ assertEquals(expected == null, actual == null);
+ if (expected != null) {
+ for (int d = expected.nextDoc(); d != DocIdSetIterator.NO_MORE_DOCS; d = expected.nextDoc()) {
+ assertEquals(d, actual.nextDoc());
+ assertEquals("doc " + d, expected.longValue(), actual.longValue());
+ }
+ assertEquals(NO_MORE_DOCS, actual.nextDoc());
}
}
@@ -403,9 +620,13 @@ public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCas
// be undead:
NumericDocValues norms = MultiDocValues.getNormValues(r, "content");
assertNotNull(norms);
- for(int i=0;i