Merge branch 'main' into optimize_prefix_query

zhouhui 2024-11-18 11:20:15 +08:00
commit cdf2d5a2aa
196 changed files with 8789 additions and 2925 deletions

View File

@ -23,7 +23,7 @@ configure(project(":lucene:core")) {
description "Regenerate gen_ForUtil.py"
group "generation"
def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
def genDir = file("src/java/org/apache/lucene/codecs/lucene101")
def genScript = file("${genDir}/gen_ForUtil.py")
def genOutput = file("${genDir}/ForUtil.java")
@ -48,7 +48,7 @@ configure(project(":lucene:core")) {
description "Regenerate gen_ForDeltaUtil.py"
group "generation"
def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
def genDir = file("src/java/org/apache/lucene/codecs/lucene101")
def genScript = file("${genDir}/gen_ForDeltaUtil.py")
def genOutput = file("${genDir}/ForDeltaUtil.java")
@ -68,6 +68,7 @@ configure(project(":lucene:core")) {
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])
}
configure(project(":lucene:backward-codecs")) {
@ -146,5 +147,55 @@ configure(project(":lucene:backward-codecs")) {
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])
task generateForUtil912Internal() {
description "Regenerate gen_ForUtil.py"
group "generation"
def genDir = file("src/java/org/apache/lucene/backward_codecs/lucene912")
def genScript = file("${genDir}/gen_ForUtil.py")
def genOutput = file("${genDir}/ForUtil.java")
inputs.file genScript
outputs.file genOutput
doLast {
quietExec {
workingDir genDir
executable project.externalTool("python3")
args = [ '-B', genScript ]
}
}
}
regenerate.dependsOn wrapWithPersistentChecksums(generateForUtil912Internal, [
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])
task generateForDeltaUtil912Internal() {
description "Regenerate gen_ForDeltaUtil.py"
group "generation"
def genDir = file("src/java/org/apache/lucene/backward_codecs/lucene912")
def genScript = file("${genDir}/gen_ForDeltaUtil.py")
def genOutput = file("${genDir}/ForDeltaUtil.java")
inputs.file genScript
outputs.file genOutput
doLast {
quietExec {
workingDir genDir
executable project.externalTool("python3")
args = [ '-B', genScript ]
}
}
}
regenerate.dependsOn wrapWithPersistentChecksums(generateForDeltaUtil912Internal, [
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])
}

View File

@ -128,8 +128,14 @@ allprojects {
jvmArgs '--add-modules', 'jdk.management'
// Enable the vector incubator module on supported Java versions:
if (rootProject.vectorIncubatorJavaVersions.contains(rootProject.runtimeJavaVersion)) {
def prop = propertyOrDefault("org.apache.lucene.vectorization.upperJavaFeatureVersion", "1") as String
def v = JavaVersion.toVersion(Integer.parseInt(prop)).majorVersion
if (rootProject.vectorIncubatorJavaVersions.contains(rootProject.runtimeJavaVersion) ||
rootProject.runtimeJavaVersion.majorVersion <= v) {
jvmArgs '--add-modules', 'jdk.incubator.vector'
if (rootProject.runtimeJavaVersion.majorVersion <= v) {
systemProperty 'org.apache.lucene.vectorization.upperJavaFeatureVersion', v
}
}
jvmArgs '--enable-native-access=' + (project.path in [

View File

@ -7,7 +7,7 @@ http://s.apache.org/luceneversions
API Changes
---------------------
(No changes)
* GITHUB#11023: Removing deprecated parameters from CheckIndex. (Jakub Slowinski)
New Features
---------------------
@ -36,6 +36,10 @@ API Changes
* GITHUB#13859: Allow open-ended ranges in Intervals range queries. (Mayya Sharipova)
* GITHUB#13950: Make BooleanQuery#getClauses public and add #add(Collection<BooleanClause>) to BQ builder. (Shubham Chaudhary)
* GITHUB#13957: Removed the LeafSimScorer class to save its overhead. Scorers now
compute scores directly from a SimScorer, postings and norms. (Adrien Grand)
New Features
---------------------
@ -43,7 +47,11 @@ New Features
Improvements
---------------------
(No changes)
* GITHUB#13986: Allow easier configuration of the Panama Vectorization provider with
newer Java versions. Set the `org.apache.lucene.vectorization.upperJavaFeatureVersion`
system property to increase the set of Java versions that Panama Vectorization will
provide optimized implementations for. (Chris Hegarty)
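For illustration only (the version number and jar name below are placeholders, not part of this change), the
property is meant to be passed as a JVM system property to the process running Lucene, alongside the incubating
vector module:

    java -Dorg.apache.lucene.vectorization.upperJavaFeatureVersion=24 --add-modules jdk.incubator.vector -jar my-app.jar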
Optimizations
---------------------
@ -53,12 +61,48 @@ Optimizations
* GITHUB#13800: MaxScoreBulkScorer now recomputes scorer partitions when the
minimum competitive score allows for a more favorable partitioning. (Adrien Grand)
* GITHUB#13930: Use growNoCopy when copying bytes in BytesRefBuilder. (Ignacio Vera)
* GITHUB#13931: Refactored `BooleanScorer` to evaluate matches of sub clauses
using the `Scorer` abstraction rather than the `BulkScorer` abstraction. This
speeds up exhaustive evaluation of disjunctions of term queries.
(Adrien Grand)
* GITHUB#13941: Optimized computation of top-hits on disjunctive queries with
many clauses. (Adrien Grand)
* GITHUB#13954: Disabled exchanging scores across slices for exhaustive
top-hits evaluation. (Adrien Grand)
* GITHUB#13899: Check ahead if we can get the count. (Lu Xugang)
* GITHUB#13943: Removed shared `HitsThresholdChecker`, which reduces overhead
but may slightly delay the point at which dynamic pruning kicks in. (Adrien Grand)
* GITHUB#13961: Replace Map<String,Object> with IntObjectHashMap for DV producer. (Pan Guixin)
* GITHUB#13963: Speed up nextDoc() implementations in Lucene912PostingsReader.
(Adrien Grand)
* GITHUB#13958: Speed up advancing within a block. (Adrien Grand)
* GITHUB#13763: Replace Map<String,Object> with IntObjectHashMap for KnnVectorsReader (Pan Guixin)
* GITHUB#13968: Switch postings from storing doc IDs in a long[] to an int[].
Lucene 8.4 had moved to a long[] to help speed up block decoding by using
longs that would pack two integers. We are now moving back to integers to be
able to take advantage of 2x more lanes with the vector API. (Adrien Grand)
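To make the "2x more lanes" point concrete, here is a small illustrative sketch (not part of this change, and it
requires running with --add-modules jdk.incubator.vector) that prints the preferred lane counts of the incubating
vector API; on hardware with 512-bit vectors it reports 16 int lanes versus 8 long lanes:

    import jdk.incubator.vector.IntVector;
    import jdk.incubator.vector.LongVector;

    public class LaneCount {
      public static void main(String[] args) {
        // The preferred species picks the widest vector shape the hardware supports.
        System.out.println("int lanes:  " + IntVector.SPECIES_PREFERRED.length());
        System.out.println("long lanes: " + LongVector.SPECIES_PREFERRED.length());
      }
    }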
Bug Fixes
---------------------
* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended
when they were not sorted by startOffset. (Seunghan Jung)
* GITHUB#13884: Remove broken .toArray from Long/CharObjectHashMap entirely. (Pan Guixin)
* GITHUB#12686: Added support for highlighting IndexOrDocValuesQuery. (Prudhvi Godithi)
* GITHUB#13927: Fix StoredFieldsConsumer finish. (linfn)
* GITHUB#13944: Ensure deterministic order of clauses for `DisjunctionMaxQuery#toString`. (Laurent Jakubina)
* GITHUB#13841: Improve Tessellator logic when two holes share the same vertex with the polygon, which was failing
on valid polygons. (Ignacio Vera)
Build
---------------------
@ -67,7 +111,7 @@ Build
Other
---------------------
(No changes)
* GITHUB#13982: Remove duplicate test code. (Lu Xugang)
======================== Lucene 10.0.1 =======================
@ -295,6 +339,8 @@ Bug Fixes
* GITHUB#12878: Fix the declared Exceptions of Expression#evaluate() to match those
of DoubleValues#doubleValue(). (Uwe Schindler)
* GITHUB#13498: Avoid a performance regression by lazily constructing the PointTree in NumericComparator. (Ignacio Vera)
Changes in Runtime Behavior
---------------------

View File

@ -19,6 +19,13 @@
## Migration from Lucene 9.x to Lucene 10.0
### DataInput#readVLong() may now read negative vlongs
LUCENE-10376 started allowing `DataInput#readVLong()` to read negative vlongs.
In particular, this feature is used by the `DataInput#readZLong()` method. A
practical implication is that `DataInput#readVLong()` may now read up to 10
bytes, while it would never read more than 9 bytes in Lucene 9.x.
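A minimal sketch of where the extra byte comes from (illustrative only: it re-implements the vlong wire format of
7 payload bits per byte rather than calling Lucene):

    public class VLongSize {
      // Number of bytes the vlong encoding uses for a value, treating it as unsigned.
      static int vLongByteCount(long v) {
        int n = 1;
        while ((v & ~0x7FL) != 0) { // more than 7 significant bits remain
          v >>>= 7;
          n++;
        }
        return n;
      }

      public static void main(String[] args) {
        System.out.println(vLongByteCount(Long.MAX_VALUE)); // 9 bytes: 63 significant bits
        System.out.println(vLongByteCount(-1L)); // 10 bytes: the top bit is set, as in the
        // zig-zag form that DataOutput#writeZLong produces for very negative values
      }
    }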
### Changes to DataInput.readGroupVInt and readGroupVInts methods
As part of GITHUB#13820, GITHUB#13825, GITHUB#13830, this issue corrects DataInput.readGroupVInts

View File

@ -0,0 +1,4 @@
{
"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/ForDeltaUtil.java": "b81961f0b277b1458ca259e0d23ccc4eeeb47fe7",
"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/gen_ForDeltaUtil.py": "3191d7591309b7876c5c709fb9375af5b87c2ef8"
}

View File

@ -0,0 +1,4 @@
{
"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/ForUtil.java": "e6db3c665dfebca8b93eb6b4651d2eb3af637b02",
"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/gen_ForUtil.py": "993ecc9cf7ea821963384070669695257b16e040"
}

View File

@ -37,6 +37,7 @@ module org.apache.lucene.backward_codecs {
exports org.apache.lucene.backward_codecs.lucene95;
exports org.apache.lucene.backward_codecs.lucene99;
exports org.apache.lucene.backward_codecs.lucene912;
exports org.apache.lucene.backward_codecs.lucene100;
exports org.apache.lucene.backward_codecs.packed;
exports org.apache.lucene.backward_codecs.store;
@ -46,7 +47,8 @@ module org.apache.lucene.backward_codecs {
org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat,
org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat,
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat,
org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat;
org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat,
org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat;
provides org.apache.lucene.codecs.KnnVectorsFormat with
org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat,
org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsFormat,
@ -64,5 +66,6 @@ module org.apache.lucene.backward_codecs {
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec,
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec,
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec,
org.apache.lucene.backward_codecs.lucene912.Lucene912Codec;
org.apache.lucene.backward_codecs.lucene912.Lucene912Codec,
org.apache.lucene.backward_codecs.lucene100.Lucene100Codec;
}

View File

@ -14,9 +14,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene100;
package org.apache.lucene.backward_codecs.lucene100;
import java.util.Objects;
import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
@ -37,7 +38,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
@ -50,7 +50,7 @@ import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
*
* <p>If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}.
*
* @see org.apache.lucene.codecs.lucene100 package documentation for file format details.
* @see org.apache.lucene.backward_codecs.lucene100 package documentation for file format details.
* @lucene.experimental
*/
public class Lucene100Codec extends Codec {

View File

@ -15,5 +15,5 @@
* limitations under the License.
*/
/** Lucene 9.12 file format. */
package org.apache.lucene.codecs.lucene912;
/** Lucene 10.0 file format. */
package org.apache.lucene.backward_codecs.lucene100;

View File

@ -17,8 +17,6 @@
package org.apache.lucene.backward_codecs.lucene80;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.backward_codecs.packed.LegacyDirectMonotonicReader;
import org.apache.lucene.backward_codecs.packed.LegacyDirectReader;
import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil;
@ -41,6 +39,7 @@ import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.internal.hppc.IntObjectHashMap;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
@ -53,11 +52,11 @@ import org.apache.lucene.util.compress.LZ4;
/** reader for {@link Lucene80DocValuesFormat} */
final class Lucene80DocValuesProducer extends DocValuesProducer {
private final Map<String, NumericEntry> numerics = new HashMap<>();
private final Map<String, BinaryEntry> binaries = new HashMap<>();
private final Map<String, SortedEntry> sorted = new HashMap<>();
private final Map<String, SortedSetEntry> sortedSets = new HashMap<>();
private final Map<String, SortedNumericEntry> sortedNumerics = new HashMap<>();
private final IntObjectHashMap<NumericEntry> numerics = new IntObjectHashMap<>();
private final IntObjectHashMap<BinaryEntry> binaries = new IntObjectHashMap<>();
private final IntObjectHashMap<SortedEntry> sorted = new IntObjectHashMap<>();
private final IntObjectHashMap<SortedSetEntry> sortedSets = new IntObjectHashMap<>();
private final IntObjectHashMap<SortedNumericEntry> sortedNumerics = new IntObjectHashMap<>();
private final IndexInput data;
private final int maxDoc;
private int version = -1;
@ -139,7 +138,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
}
byte type = meta.readByte();
if (type == Lucene80DocValuesFormat.NUMERIC) {
numerics.put(info.name, readNumeric(meta));
numerics.put(info.number, readNumeric(meta));
} else if (type == Lucene80DocValuesFormat.BINARY) {
final boolean compressed;
if (version >= Lucene80DocValuesFormat.VERSION_CONFIGURABLE_COMPRESSION) {
@ -158,13 +157,13 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
} else {
compressed = version >= Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED;
}
binaries.put(info.name, readBinary(meta, compressed));
binaries.put(info.number, readBinary(meta, compressed));
} else if (type == Lucene80DocValuesFormat.SORTED) {
sorted.put(info.name, readSorted(meta));
sorted.put(info.number, readSorted(meta));
} else if (type == Lucene80DocValuesFormat.SORTED_SET) {
sortedSets.put(info.name, readSortedSet(meta));
sortedSets.put(info.number, readSortedSet(meta));
} else if (type == Lucene80DocValuesFormat.SORTED_NUMERIC) {
sortedNumerics.put(info.name, readSortedNumeric(meta));
sortedNumerics.put(info.number, readSortedNumeric(meta));
} else {
throw new CorruptIndexException("invalid type: " + type, meta);
}
@ -426,7 +425,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
@Override
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
NumericEntry entry = numerics.get(field.name);
NumericEntry entry = numerics.get(field.number);
return getNumeric(entry);
}
@ -915,7 +914,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
BinaryEntry entry = binaries.get(field.name);
BinaryEntry entry = binaries.get(field.number);
if (entry.compressed) {
return getCompressedBinary(entry);
} else {
@ -973,7 +972,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
SortedEntry entry = sorted.get(field.name);
SortedEntry entry = sorted.get(field.number);
return getSorted(entry);
}
@ -1407,7 +1406,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
SortedNumericEntry entry = sortedNumerics.get(field.name);
SortedNumericEntry entry = sortedNumerics.get(field.number);
if (entry.numValues == entry.numDocsWithField) {
return DocValues.singleton(getNumeric(entry));
}
@ -1543,7 +1542,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
SortedSetEntry entry = sortedSets.get(field.name);
SortedSetEntry entry = sortedSets.get(field.number);
if (entry.singleValueEntry != null) {
return DocValues.singleton(getSorted(entry.singleValueEntry));
}

View File

@ -20,8 +20,6 @@ package org.apache.lucene.backward_codecs.lucene90;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.SplittableRandom;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.KnnVectorsReader;
@ -33,6 +31,7 @@ import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.internal.hppc.IntObjectHashMap;
import org.apache.lucene.search.KnnCollector;
import org.apache.lucene.search.VectorScorer;
import org.apache.lucene.store.ChecksumIndexInput;
@ -50,14 +49,16 @@ import org.apache.lucene.util.hnsw.NeighborQueue;
*/
public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
private final Map<String, FieldEntry> fields = new HashMap<>();
private final IntObjectHashMap<FieldEntry> fields = new IntObjectHashMap<>();
private final IndexInput vectorData;
private final IndexInput vectorIndex;
private final long checksumSeed;
private final FieldInfos fieldInfos;
Lucene90HnswVectorsReader(SegmentReadState state) throws IOException {
int versionMeta = readMetadata(state);
long[] checksumRef = new long[1];
this.fieldInfos = state.fieldInfos;
boolean success = false;
try {
vectorData =
@ -158,7 +159,7 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
FieldEntry fieldEntry = readField(meta, info);
validateFieldEntry(info, fieldEntry);
fields.put(info.name, fieldEntry);
fields.put(info.number, fieldEntry);
}
}
@ -218,13 +219,18 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
CodecUtil.checksumEntireFile(vectorIndex);
}
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null) {
private FieldEntry getFieldEntry(String field) {
final FieldInfo info = fieldInfos.fieldInfo(field);
final FieldEntry fieldEntry;
if (info == null || (fieldEntry = fields.get(info.number)) == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
return getOffHeapVectorValues(fieldEntry);
return fieldEntry;
}
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
return getOffHeapVectorValues(getFieldEntry(field));
}
@Override
@ -235,8 +241,7 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
@Override
public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs)
throws IOException {
FieldEntry fieldEntry = fields.get(field);
final FieldEntry fieldEntry = getFieldEntry(field);
if (fieldEntry.size() == 0) {
return;
}

View File

@ -21,8 +21,6 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.function.IntUnaryOperator;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.KnnVectorsReader;
@ -35,6 +33,7 @@ import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.internal.hppc.IntObjectHashMap;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.KnnCollector;
import org.apache.lucene.search.VectorScorer;
@ -55,13 +54,15 @@ import org.apache.lucene.util.hnsw.RandomVectorScorer;
*/
public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
private final Map<String, FieldEntry> fields = new HashMap<>();
private final IntObjectHashMap<FieldEntry> fields = new IntObjectHashMap<>();
private final IndexInput vectorData;
private final IndexInput vectorIndex;
private final DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer();
private final FieldInfos fieldInfos;
Lucene91HnswVectorsReader(SegmentReadState state) throws IOException {
int versionMeta = readMetadata(state);
this.fieldInfos = state.fieldInfos;
boolean success = false;
try {
vectorData =
@ -154,7 +155,7 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
}
FieldEntry fieldEntry = readField(meta, info);
validateFieldEntry(info, fieldEntry);
fields.put(info.name, fieldEntry);
fields.put(info.number, fieldEntry);
}
}
@ -214,13 +215,18 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
CodecUtil.checksumEntireFile(vectorIndex);
}
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null) {
private FieldEntry getFieldEntry(String field) {
final FieldInfo info = fieldInfos.fieldInfo(field);
final FieldEntry fieldEntry;
if (info == null || (fieldEntry = fields.get(info.number)) == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
return getOffHeapVectorValues(fieldEntry);
return fieldEntry;
}
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
return getOffHeapVectorValues(getFieldEntry(field));
}
@Override
@ -231,8 +237,7 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
@Override
public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs)
throws IOException {
FieldEntry fieldEntry = fields.get(field);
final FieldEntry fieldEntry = getFieldEntry(field);
if (fieldEntry.size() == 0) {
return;
}

View File

@ -16,13 +16,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;
import static org.apache.lucene.codecs.lucene912.ForUtil.*;
import static org.apache.lucene.backward_codecs.lucene912.ForUtil.*;
import java.io.IOException;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.packed.PackedInts;
/**
@ -30,7 +30,7 @@ import org.apache.lucene.util.packed.PackedInts;
* SIMD-like speedups. If bitsPerValue &lt;= 4 then we pack 8 ints per long else if bitsPerValue
* &lt;= 11 we pack 4 ints per long else we pack 2 ints per long
*/
public final class ForDeltaUtil {
final class ForDeltaUtil {
private static final int ONE_BLOCK_SIZE_FOURTH = BLOCK_SIZE / 4;
private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2;
@ -272,125 +272,124 @@ public final class ForDeltaUtil {
}
/** Decode deltas, compute the prefix sum and add {@code base} to all decoded longs. */
void decodeAndPrefixSum(PostingDecodingUtil pdu, long base, long[] longs) throws IOException {
final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte());
void decodeAndPrefixSum(IndexInput in, long base, long[] longs) throws IOException {
final int bitsPerValue = Byte.toUnsignedInt(in.readByte());
if (bitsPerValue == 0) {
prefixSumOfOnes(longs, base);
} else {
decodeAndPrefixSum(bitsPerValue, pdu, base, longs);
decodeAndPrefixSum(bitsPerValue, in, base, longs);
}
}
/** Delta-decode 128 integers into {@code longs}. */
void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, long base, long[] longs)
void decodeAndPrefixSum(int bitsPerValue, IndexInput in, long base, long[] longs)
throws IOException {
switch (bitsPerValue) {
case 1:
decode1(pdu, tmp, longs);
decode1(in, longs);
prefixSum8(longs, base);
break;
case 2:
decode2(pdu, tmp, longs);
decode2(in, longs);
prefixSum8(longs, base);
break;
case 3:
decode3(pdu, tmp, longs);
decode3(in, tmp, longs);
prefixSum8(longs, base);
break;
case 4:
decode4(pdu, tmp, longs);
decode4(in, longs);
prefixSum8(longs, base);
break;
case 5:
decode5To16(pdu, tmp, longs);
decode5To16(in, tmp, longs);
prefixSum16(longs, base);
break;
case 6:
decode6To16(pdu, tmp, longs);
decode6To16(in, tmp, longs);
prefixSum16(longs, base);
break;
case 7:
decode7To16(pdu, tmp, longs);
decode7To16(in, tmp, longs);
prefixSum16(longs, base);
break;
case 8:
decode8To16(pdu, tmp, longs);
decode8To16(in, longs);
prefixSum16(longs, base);
break;
case 9:
decode9(pdu, tmp, longs);
decode9(in, tmp, longs);
prefixSum16(longs, base);
break;
case 10:
decode10(pdu, tmp, longs);
decode10(in, tmp, longs);
prefixSum16(longs, base);
break;
case 11:
decode11(pdu, tmp, longs);
decode11(in, tmp, longs);
prefixSum16(longs, base);
break;
case 12:
decode12To32(pdu, tmp, longs);
decode12To32(in, tmp, longs);
prefixSum32(longs, base);
break;
case 13:
decode13To32(pdu, tmp, longs);
decode13To32(in, tmp, longs);
prefixSum32(longs, base);
break;
case 14:
decode14To32(pdu, tmp, longs);
decode14To32(in, tmp, longs);
prefixSum32(longs, base);
break;
case 15:
decode15To32(pdu, tmp, longs);
decode15To32(in, tmp, longs);
prefixSum32(longs, base);
break;
case 16:
decode16To32(pdu, tmp, longs);
decode16To32(in, longs);
prefixSum32(longs, base);
break;
case 17:
decode17(pdu, tmp, longs);
decode17(in, tmp, longs);
prefixSum32(longs, base);
break;
case 18:
decode18(pdu, tmp, longs);
decode18(in, tmp, longs);
prefixSum32(longs, base);
break;
case 19:
decode19(pdu, tmp, longs);
decode19(in, tmp, longs);
prefixSum32(longs, base);
break;
case 20:
decode20(pdu, tmp, longs);
decode20(in, tmp, longs);
prefixSum32(longs, base);
break;
case 21:
decode21(pdu, tmp, longs);
decode21(in, tmp, longs);
prefixSum32(longs, base);
break;
case 22:
decode22(pdu, tmp, longs);
decode22(in, tmp, longs);
prefixSum32(longs, base);
break;
case 23:
decode23(pdu, tmp, longs);
decode23(in, tmp, longs);
prefixSum32(longs, base);
break;
case 24:
decode24(pdu, tmp, longs);
decode24(in, tmp, longs);
prefixSum32(longs, base);
break;
default:
decodeSlow(bitsPerValue, pdu, tmp, longs);
decodeSlow(bitsPerValue, in, tmp, longs);
prefixSum32(longs, base);
break;
}
}
private static void decode5To16(PostingDecodingUtil pdu, long[] tmp, long[] longs)
throws IOException {
pdu.splitLongs(10, longs, 11, 5, MASK16_5, tmp, 0, MASK16_1);
private static void decode5To16(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 10, longs, 11, 5, MASK16_5, tmp, 0, MASK16_1);
for (int iter = 0, tmpIdx = 0, longsIdx = 30; iter < 2; ++iter, tmpIdx += 5, longsIdx += 1) {
long l0 = tmp[tmpIdx + 0] << 4;
l0 |= tmp[tmpIdx + 1] << 3;
@ -401,9 +400,8 @@ public final class ForDeltaUtil {
}
}
private static void decode6To16(PostingDecodingUtil pdu, long[] tmp, long[] longs)
throws IOException {
pdu.splitLongs(12, longs, 10, 6, MASK16_6, tmp, 0, MASK16_4);
private static void decode6To16(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 12, longs, 10, 6, MASK16_6, tmp, 0, MASK16_4);
for (int iter = 0, tmpIdx = 0, longsIdx = 24; iter < 4; ++iter, tmpIdx += 3, longsIdx += 2) {
long l0 = tmp[tmpIdx + 0] << 2;
l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_2;
@ -414,9 +412,8 @@ public final class ForDeltaUtil {
}
}
private static void decode7To16(PostingDecodingUtil pdu, long[] tmp, long[] longs)
throws IOException {
pdu.splitLongs(14, longs, 9, 7, MASK16_7, tmp, 0, MASK16_2);
private static void decode7To16(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 14, longs, 9, 7, MASK16_7, tmp, 0, MASK16_2);
for (int iter = 0, tmpIdx = 0, longsIdx = 28; iter < 2; ++iter, tmpIdx += 7, longsIdx += 2) {
long l0 = tmp[tmpIdx + 0] << 5;
l0 |= tmp[tmpIdx + 1] << 3;
@ -431,14 +428,12 @@ public final class ForDeltaUtil {
}
}
private static void decode8To16(PostingDecodingUtil pdu, long[] tmp, long[] longs)
throws IOException {
pdu.splitLongs(16, longs, 8, 8, MASK16_8, longs, 16, MASK16_8);
private static void decode8To16(IndexInput in, long[] longs) throws IOException {
splitLongs(in, 16, longs, 8, 8, MASK16_8, longs, 16, MASK16_8);
}
private static void decode12To32(PostingDecodingUtil pdu, long[] tmp, long[] longs)
throws IOException {
pdu.splitLongs(24, longs, 20, 12, MASK32_12, tmp, 0, MASK32_8);
private static void decode12To32(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 24, longs, 20, 12, MASK32_12, tmp, 0, MASK32_8);
for (int iter = 0, tmpIdx = 0, longsIdx = 48; iter < 8; ++iter, tmpIdx += 3, longsIdx += 2) {
long l0 = tmp[tmpIdx + 0] << 4;
l0 |= (tmp[tmpIdx + 1] >>> 4) & MASK32_4;
@ -449,9 +444,8 @@ public final class ForDeltaUtil {
}
}
private static void decode13To32(PostingDecodingUtil pdu, long[] tmp, long[] longs)
throws IOException {
pdu.splitLongs(26, longs, 19, 13, MASK32_13, tmp, 0, MASK32_6);
private static void decode13To32(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 26, longs, 19, 13, MASK32_13, tmp, 0, MASK32_6);
for (int iter = 0, tmpIdx = 0, longsIdx = 52; iter < 2; ++iter, tmpIdx += 13, longsIdx += 6) {
long l0 = tmp[tmpIdx + 0] << 7;
l0 |= tmp[tmpIdx + 1] << 1;
@ -480,9 +474,8 @@ public final class ForDeltaUtil {
}
}
private static void decode14To32(PostingDecodingUtil pdu, long[] tmp, long[] longs)
throws IOException {
pdu.splitLongs(28, longs, 18, 14, MASK32_14, tmp, 0, MASK32_4);
private static void decode14To32(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 28, longs, 18, 14, MASK32_14, tmp, 0, MASK32_4);
for (int iter = 0, tmpIdx = 0, longsIdx = 56; iter < 4; ++iter, tmpIdx += 7, longsIdx += 2) {
long l0 = tmp[tmpIdx + 0] << 10;
l0 |= tmp[tmpIdx + 1] << 6;
@ -497,9 +490,8 @@ public final class ForDeltaUtil {
}
}
private static void decode15To32(PostingDecodingUtil pdu, long[] tmp, long[] longs)
throws IOException {
pdu.splitLongs(30, longs, 17, 15, MASK32_15, tmp, 0, MASK32_2);
private static void decode15To32(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 30, longs, 17, 15, MASK32_15, tmp, 0, MASK32_2);
for (int iter = 0, tmpIdx = 0, longsIdx = 60; iter < 2; ++iter, tmpIdx += 15, longsIdx += 2) {
long l0 = tmp[tmpIdx + 0] << 13;
l0 |= tmp[tmpIdx + 1] << 11;
@ -522,8 +514,7 @@ public final class ForDeltaUtil {
}
}
private static void decode16To32(PostingDecodingUtil pdu, long[] tmp, long[] longs)
throws IOException {
pdu.splitLongs(32, longs, 16, 16, MASK32_16, longs, 32, MASK32_16);
private static void decode16To32(IndexInput in, long[] longs) throws IOException {
splitLongs(in, 32, longs, 16, 16, MASK32_16, longs, 32, MASK32_16);
}
}

View File

@ -16,18 +16,18 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;
import java.io.IOException;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
/**
* Inspired from https://fulmicoton.com/posts/bitpacking/ Encodes multiple integers in a long to get
* SIMD-like speedups. If bitsPerValue &lt;= 8 then we pack 8 ints per long else if bitsPerValue
* &lt;= 16 we pack 4 ints per long else we pack 2 ints per long
*/
public final class ForUtil {
final class ForUtil {
public static final int BLOCK_SIZE = 128;
static final int BLOCK_SIZE_LOG2 = 7;
@ -196,11 +196,11 @@ public final class ForUtil {
return bitsPerValue << (BLOCK_SIZE_LOG2 - 3);
}
static void decodeSlow(int bitsPerValue, PostingDecodingUtil pdu, long[] tmp, long[] longs)
static void decodeSlow(int bitsPerValue, IndexInput in, long[] tmp, long[] longs)
throws IOException {
final int numLongs = bitsPerValue << 1;
final long mask = MASKS32[bitsPerValue];
pdu.splitLongs(numLongs, longs, 32 - bitsPerValue, 32, mask, tmp, 0, -1L);
splitLongs(in, numLongs, longs, 32 - bitsPerValue, 32, mask, tmp, 0, -1L);
final int remainingBitsPerLong = 32 - bitsPerValue;
final long mask32RemainingBitsPerLong = MASKS32[remainingBitsPerLong];
int tmpIdx = 0;
@ -222,6 +222,28 @@ public final class ForUtil {
}
}
static void splitLongs(
IndexInput in,
int count,
long[] b,
int bShift,
int dec,
long bMask,
long[] c,
int cIndex,
long cMask)
throws IOException {
// takes advantage of the C2 compiler's loop unrolling and auto-vectorization.
in.readLongs(c, cIndex, count);
int maxIter = (bShift - 1) / dec;
for (int i = 0; i < count; ++i) {
for (int j = 0; j <= maxIter; ++j) {
b[count * j + i] = (c[cIndex + i] >>> (bShift - j * dec)) & bMask;
}
c[cIndex + i] &= cMask;
}
}
static final long[] MASKS8 = new long[8];
static final long[] MASKS16 = new long[16];
static final long[] MASKS32 = new long[32];
@ -288,121 +310,121 @@ public final class ForUtil {
static final long MASK32_24 = MASKS32[24];
/** Decode 128 integers into {@code longs}. */
void decode(int bitsPerValue, PostingDecodingUtil pdu, long[] longs) throws IOException {
void decode(int bitsPerValue, IndexInput in, long[] longs) throws IOException {
switch (bitsPerValue) {
case 1:
decode1(pdu, tmp, longs);
decode1(in, longs);
expand8(longs);
break;
case 2:
decode2(pdu, tmp, longs);
decode2(in, longs);
expand8(longs);
break;
case 3:
decode3(pdu, tmp, longs);
decode3(in, tmp, longs);
expand8(longs);
break;
case 4:
decode4(pdu, tmp, longs);
decode4(in, longs);
expand8(longs);
break;
case 5:
decode5(pdu, tmp, longs);
decode5(in, tmp, longs);
expand8(longs);
break;
case 6:
decode6(pdu, tmp, longs);
decode6(in, tmp, longs);
expand8(longs);
break;
case 7:
decode7(pdu, tmp, longs);
decode7(in, tmp, longs);
expand8(longs);
break;
case 8:
decode8(pdu, tmp, longs);
decode8(in, longs);
expand8(longs);
break;
case 9:
decode9(pdu, tmp, longs);
decode9(in, tmp, longs);
expand16(longs);
break;
case 10:
decode10(pdu, tmp, longs);
decode10(in, tmp, longs);
expand16(longs);
break;
case 11:
decode11(pdu, tmp, longs);
decode11(in, tmp, longs);
expand16(longs);
break;
case 12:
decode12(pdu, tmp, longs);
decode12(in, tmp, longs);
expand16(longs);
break;
case 13:
decode13(pdu, tmp, longs);
decode13(in, tmp, longs);
expand16(longs);
break;
case 14:
decode14(pdu, tmp, longs);
decode14(in, tmp, longs);
expand16(longs);
break;
case 15:
decode15(pdu, tmp, longs);
decode15(in, tmp, longs);
expand16(longs);
break;
case 16:
decode16(pdu, tmp, longs);
decode16(in, longs);
expand16(longs);
break;
case 17:
decode17(pdu, tmp, longs);
decode17(in, tmp, longs);
expand32(longs);
break;
case 18:
decode18(pdu, tmp, longs);
decode18(in, tmp, longs);
expand32(longs);
break;
case 19:
decode19(pdu, tmp, longs);
decode19(in, tmp, longs);
expand32(longs);
break;
case 20:
decode20(pdu, tmp, longs);
decode20(in, tmp, longs);
expand32(longs);
break;
case 21:
decode21(pdu, tmp, longs);
decode21(in, tmp, longs);
expand32(longs);
break;
case 22:
decode22(pdu, tmp, longs);
decode22(in, tmp, longs);
expand32(longs);
break;
case 23:
decode23(pdu, tmp, longs);
decode23(in, tmp, longs);
expand32(longs);
break;
case 24:
decode24(pdu, tmp, longs);
decode24(in, tmp, longs);
expand32(longs);
break;
default:
decodeSlow(bitsPerValue, pdu, tmp, longs);
decodeSlow(bitsPerValue, in, tmp, longs);
expand32(longs);
break;
}
}
static void decode1(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(2, longs, 7, 1, MASK8_1, longs, 14, MASK8_1);
static void decode1(IndexInput in, long[] longs) throws IOException {
splitLongs(in, 2, longs, 7, 1, MASK8_1, longs, 14, MASK8_1);
}
static void decode2(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(4, longs, 6, 2, MASK8_2, longs, 12, MASK8_2);
static void decode2(IndexInput in, long[] longs) throws IOException {
splitLongs(in, 4, longs, 6, 2, MASK8_2, longs, 12, MASK8_2);
}
static void decode3(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(6, longs, 5, 3, MASK8_3, tmp, 0, MASK8_2);
static void decode3(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 6, longs, 5, 3, MASK8_3, tmp, 0, MASK8_2);
for (int iter = 0, tmpIdx = 0, longsIdx = 12; iter < 2; ++iter, tmpIdx += 3, longsIdx += 2) {
long l0 = tmp[tmpIdx + 0] << 1;
l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_1;
@ -413,12 +435,12 @@ public final class ForUtil {
}
}
static void decode4(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(8, longs, 4, 4, MASK8_4, longs, 8, MASK8_4);
static void decode4(IndexInput in, long[] longs) throws IOException {
splitLongs(in, 8, longs, 4, 4, MASK8_4, longs, 8, MASK8_4);
}
static void decode5(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(10, longs, 3, 5, MASK8_5, tmp, 0, MASK8_3);
static void decode5(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 10, longs, 3, 5, MASK8_5, tmp, 0, MASK8_3);
for (int iter = 0, tmpIdx = 0, longsIdx = 10; iter < 2; ++iter, tmpIdx += 5, longsIdx += 3) {
long l0 = tmp[tmpIdx + 0] << 2;
l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_2;
@ -433,8 +455,8 @@ public final class ForUtil {
}
}
static void decode6(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(12, longs, 2, 6, MASK8_6, tmp, 0, MASK8_2);
static void decode6(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 12, longs, 2, 6, MASK8_6, tmp, 0, MASK8_2);
for (int iter = 0, tmpIdx = 0, longsIdx = 12; iter < 4; ++iter, tmpIdx += 3, longsIdx += 1) {
long l0 = tmp[tmpIdx + 0] << 4;
l0 |= tmp[tmpIdx + 1] << 2;
@ -443,8 +465,8 @@ public final class ForUtil {
}
}
static void decode7(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(14, longs, 1, 7, MASK8_7, tmp, 0, MASK8_1);
static void decode7(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 14, longs, 1, 7, MASK8_7, tmp, 0, MASK8_1);
for (int iter = 0, tmpIdx = 0, longsIdx = 14; iter < 2; ++iter, tmpIdx += 7, longsIdx += 1) {
long l0 = tmp[tmpIdx + 0] << 6;
l0 |= tmp[tmpIdx + 1] << 5;
@ -457,12 +479,12 @@ public final class ForUtil {
}
}
static void decode8(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.in.readLongs(longs, 0, 16);
static void decode8(IndexInput in, long[] longs) throws IOException {
in.readLongs(longs, 0, 16);
}
static void decode9(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(18, longs, 7, 9, MASK16_9, tmp, 0, MASK16_7);
static void decode9(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 18, longs, 7, 9, MASK16_9, tmp, 0, MASK16_7);
for (int iter = 0, tmpIdx = 0, longsIdx = 18; iter < 2; ++iter, tmpIdx += 9, longsIdx += 7) {
long l0 = tmp[tmpIdx + 0] << 2;
l0 |= (tmp[tmpIdx + 1] >>> 5) & MASK16_2;
@ -489,8 +511,8 @@ public final class ForUtil {
}
}
static void decode10(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(20, longs, 6, 10, MASK16_10, tmp, 0, MASK16_6);
static void decode10(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 20, longs, 6, 10, MASK16_10, tmp, 0, MASK16_6);
for (int iter = 0, tmpIdx = 0, longsIdx = 20; iter < 4; ++iter, tmpIdx += 5, longsIdx += 3) {
long l0 = tmp[tmpIdx + 0] << 4;
l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_4;
@ -505,8 +527,8 @@ public final class ForUtil {
}
}
static void decode11(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(22, longs, 5, 11, MASK16_11, tmp, 0, MASK16_5);
static void decode11(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 22, longs, 5, 11, MASK16_11, tmp, 0, MASK16_5);
for (int iter = 0, tmpIdx = 0, longsIdx = 22; iter < 2; ++iter, tmpIdx += 11, longsIdx += 5) {
long l0 = tmp[tmpIdx + 0] << 6;
l0 |= tmp[tmpIdx + 1] << 1;
@ -531,8 +553,8 @@ public final class ForUtil {
}
}
static void decode12(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(24, longs, 4, 12, MASK16_12, tmp, 0, MASK16_4);
static void decode12(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 24, longs, 4, 12, MASK16_12, tmp, 0, MASK16_4);
for (int iter = 0, tmpIdx = 0, longsIdx = 24; iter < 8; ++iter, tmpIdx += 3, longsIdx += 1) {
long l0 = tmp[tmpIdx + 0] << 8;
l0 |= tmp[tmpIdx + 1] << 4;
@ -541,8 +563,8 @@ public final class ForUtil {
}
}
static void decode13(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(26, longs, 3, 13, MASK16_13, tmp, 0, MASK16_3);
static void decode13(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 26, longs, 3, 13, MASK16_13, tmp, 0, MASK16_3);
for (int iter = 0, tmpIdx = 0, longsIdx = 26; iter < 2; ++iter, tmpIdx += 13, longsIdx += 3) {
long l0 = tmp[tmpIdx + 0] << 10;
l0 |= tmp[tmpIdx + 1] << 7;
@ -565,8 +587,8 @@ public final class ForUtil {
}
}
static void decode14(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(28, longs, 2, 14, MASK16_14, tmp, 0, MASK16_2);
static void decode14(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 28, longs, 2, 14, MASK16_14, tmp, 0, MASK16_2);
for (int iter = 0, tmpIdx = 0, longsIdx = 28; iter < 4; ++iter, tmpIdx += 7, longsIdx += 1) {
long l0 = tmp[tmpIdx + 0] << 12;
l0 |= tmp[tmpIdx + 1] << 10;
@ -579,8 +601,8 @@ public final class ForUtil {
}
}
static void decode15(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(30, longs, 1, 15, MASK16_15, tmp, 0, MASK16_1);
static void decode15(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 30, longs, 1, 15, MASK16_15, tmp, 0, MASK16_1);
for (int iter = 0, tmpIdx = 0, longsIdx = 30; iter < 2; ++iter, tmpIdx += 15, longsIdx += 1) {
long l0 = tmp[tmpIdx + 0] << 14;
l0 |= tmp[tmpIdx + 1] << 13;
@ -601,12 +623,12 @@ public final class ForUtil {
}
}
static void decode16(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.in.readLongs(longs, 0, 32);
static void decode16(IndexInput in, long[] longs) throws IOException {
in.readLongs(longs, 0, 32);
}
static void decode17(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(34, longs, 15, 17, MASK32_17, tmp, 0, MASK32_15);
static void decode17(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 34, longs, 15, 17, MASK32_17, tmp, 0, MASK32_15);
for (int iter = 0, tmpIdx = 0, longsIdx = 34; iter < 2; ++iter, tmpIdx += 17, longsIdx += 15) {
long l0 = tmp[tmpIdx + 0] << 2;
l0 |= (tmp[tmpIdx + 1] >>> 13) & MASK32_2;
@ -657,8 +679,8 @@ public final class ForUtil {
}
}
static void decode18(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(36, longs, 14, 18, MASK32_18, tmp, 0, MASK32_14);
static void decode18(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 36, longs, 14, 18, MASK32_18, tmp, 0, MASK32_14);
for (int iter = 0, tmpIdx = 0, longsIdx = 36; iter < 4; ++iter, tmpIdx += 9, longsIdx += 7) {
long l0 = tmp[tmpIdx + 0] << 4;
l0 |= (tmp[tmpIdx + 1] >>> 10) & MASK32_4;
@ -685,8 +707,8 @@ public final class ForUtil {
}
}
static void decode19(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(38, longs, 13, 19, MASK32_19, tmp, 0, MASK32_13);
static void decode19(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 38, longs, 13, 19, MASK32_19, tmp, 0, MASK32_13);
for (int iter = 0, tmpIdx = 0, longsIdx = 38; iter < 2; ++iter, tmpIdx += 19, longsIdx += 13) {
long l0 = tmp[tmpIdx + 0] << 6;
l0 |= (tmp[tmpIdx + 1] >>> 7) & MASK32_6;
@ -735,8 +757,8 @@ public final class ForUtil {
}
}
static void decode20(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(40, longs, 12, 20, MASK32_20, tmp, 0, MASK32_12);
static void decode20(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 40, longs, 12, 20, MASK32_20, tmp, 0, MASK32_12);
for (int iter = 0, tmpIdx = 0, longsIdx = 40; iter < 8; ++iter, tmpIdx += 5, longsIdx += 3) {
long l0 = tmp[tmpIdx + 0] << 8;
l0 |= (tmp[tmpIdx + 1] >>> 4) & MASK32_8;
@ -751,8 +773,8 @@ public final class ForUtil {
}
}
static void decode21(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(42, longs, 11, 21, MASK32_21, tmp, 0, MASK32_11);
static void decode21(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 42, longs, 11, 21, MASK32_21, tmp, 0, MASK32_11);
for (int iter = 0, tmpIdx = 0, longsIdx = 42; iter < 2; ++iter, tmpIdx += 21, longsIdx += 11) {
long l0 = tmp[tmpIdx + 0] << 10;
l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK32_10;
@ -799,8 +821,8 @@ public final class ForUtil {
}
}
static void decode22(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(44, longs, 10, 22, MASK32_22, tmp, 0, MASK32_10);
static void decode22(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 44, longs, 10, 22, MASK32_22, tmp, 0, MASK32_10);
for (int iter = 0, tmpIdx = 0, longsIdx = 44; iter < 4; ++iter, tmpIdx += 11, longsIdx += 5) {
long l0 = tmp[tmpIdx + 0] << 12;
l0 |= tmp[tmpIdx + 1] << 2;
@ -825,8 +847,8 @@ public final class ForUtil {
}
}
static void decode23(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(46, longs, 9, 23, MASK32_23, tmp, 0, MASK32_9);
static void decode23(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 46, longs, 9, 23, MASK32_23, tmp, 0, MASK32_9);
for (int iter = 0, tmpIdx = 0, longsIdx = 46; iter < 2; ++iter, tmpIdx += 23, longsIdx += 9) {
long l0 = tmp[tmpIdx + 0] << 14;
l0 |= tmp[tmpIdx + 1] << 5;
@ -871,8 +893,8 @@ public final class ForUtil {
}
}
static void decode24(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
pdu.splitLongs(48, longs, 8, 24, MASK32_24, tmp, 0, MASK32_8);
static void decode24(IndexInput in, long[] tmp, long[] longs) throws IOException {
splitLongs(in, 48, longs, 8, 24, MASK32_24, tmp, 0, MASK32_8);
for (int iter = 0, tmpIdx = 0, longsIdx = 48; iter < 16; ++iter, tmpIdx += 3, longsIdx += 1) {
long l0 = tmp[tmpIdx + 0] << 16;
l0 |= tmp[tmpIdx + 1] << 8;

View File

@ -37,7 +37,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;
import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
@ -23,7 +23,6 @@ import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.index.IndexOptions;
@ -318,7 +317,7 @@ import org.apache.lucene.util.packed.PackedInts;
*
* @lucene.experimental
*/
public final class Lucene912PostingsFormat extends PostingsFormat {
public class Lucene912PostingsFormat extends PostingsFormat {
/** Filename extension for some small metadata about how postings are encoded. */
public static final String META_EXTENSION = "psm";
@ -341,7 +340,7 @@ public final class Lucene912PostingsFormat extends PostingsFormat {
/** Size of blocks. */
public static final int BLOCK_SIZE = ForUtil.BLOCK_SIZE;
public static final int BLOCK_MASK = BLOCK_SIZE - 1;
static final int BLOCK_MASK = BLOCK_SIZE - 1;
/** We insert skip data on every block and every SKIP_FACTOR=32 blocks. */
public static final int LEVEL1_FACTOR = 32;
@ -349,7 +348,7 @@ public final class Lucene912PostingsFormat extends PostingsFormat {
/** Total number of docs covered by level 1 skip data: 32 * 128 = 4,096 */
public static final int LEVEL1_NUM_DOCS = LEVEL1_FACTOR * BLOCK_SIZE;
public static final int LEVEL1_MASK = LEVEL1_NUM_DOCS - 1;
static final int LEVEL1_MASK = LEVEL1_NUM_DOCS - 1;
static final String TERMS_CODEC = "Lucene90PostingsWriterTerms";
static final String META_CODEC = "Lucene912PostingsWriterMeta";
@ -360,45 +359,15 @@ public final class Lucene912PostingsFormat extends PostingsFormat {
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
private final int minTermBlockSize;
private final int maxTermBlockSize;
/** Creates {@code Lucene912PostingsFormat} with default settings. */
public Lucene912PostingsFormat() {
this(
Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
/**
* Creates {@code Lucene912PostingsFormat} with custom values for {@code minBlockSize} and {@code
* maxBlockSize} passed to block terms dictionary.
*
* @see
* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
*/
public Lucene912PostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super("Lucene912");
Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
this.minTermBlockSize = minTermBlockSize;
this.maxTermBlockSize = maxTermBlockSize;
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
boolean success = false;
try {
FieldsConsumer ret =
new Lucene90BlockTreeTermsWriter(
state, postingsWriter, minTermBlockSize, maxTermBlockSize);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
throw new UnsupportedOperationException(
"This postings format may not be used for writing, use the current postings format");
}
@Override

View File

@ -14,17 +14,17 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;
import static org.apache.lucene.codecs.lucene912.ForUtil.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.DOC_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.LEVEL1_NUM_DOCS;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.META_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.PAY_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.POS_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.VERSION_CURRENT;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.VERSION_START;
import static org.apache.lucene.backward_codecs.lucene912.ForUtil.BLOCK_SIZE;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.DOC_CODEC;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.LEVEL1_NUM_DOCS;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.META_CODEC;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.PAY_CODEC;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.POS_CODEC;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.VERSION_CURRENT;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.VERSION_START;
import java.io.IOException;
import java.util.AbstractList;
@ -32,10 +32,10 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.RandomAccess;
import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.Impacts;
@ -45,8 +45,6 @@ import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.internal.vectorization.VectorizationProvider;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
@ -64,7 +62,6 @@ import org.apache.lucene.util.IOUtils;
*/
public final class Lucene912PostingsReader extends PostingsReaderBase {
static final VectorizationProvider VECTORIZATION_PROVIDER = VectorizationProvider.getInstance();
// Dummy impacts, composed of the maximum possible term frequency and the lowest possible
// (unsigned) norm value. This is typically used on tail blocks, which don't actually record
// impacts as the storage overhead would not be worth any query evaluation speedup, since there's
@ -215,15 +212,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
}
}
static int findFirstGreater(long[] buffer, int target, int from) {
for (int i = from; i < BLOCK_SIZE; ++i) {
if (buffer[i] >= target) {
return i;
}
}
return BLOCK_SIZE;
}
@Override
public BlockTermState newTermState() {
return new IntBlockTermState();
@ -357,10 +345,10 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
protected int docCountUpto; // number of docs in or before the current block
protected long prevDocID; // last doc ID of the previous block
protected int docBufferSize;
protected int docBufferUpto;
protected IndexInput docIn;
protected PostingDecodingUtil docInUtil;
protected AbstractPostingsEnum(FieldInfo fieldInfo) {
indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
@ -381,7 +369,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
if (docIn == null) {
// lazy init
docIn = Lucene912PostingsReader.this.docIn.clone();
docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn);
}
prefetchPostings(docIn, termState);
}
@ -402,6 +389,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
level1DocEndFP = termState.docStartFP;
}
level1DocCountUpto = 0;
docBufferSize = BLOCK_SIZE;
docBufferUpto = BLOCK_SIZE;
return this;
}
@ -427,7 +415,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException {
resetIndexInput(termState);
if (pforUtil == null && docFreq >= BLOCK_SIZE) {
pforUtil = new PForUtil(new ForUtil());
pforUtil = new PForUtil();
forDeltaUtil = new ForDeltaUtil();
}
totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq;
@ -446,7 +434,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
public int freq() throws IOException {
if (freqFP != -1) {
docIn.seek(freqFP);
pforUtil.decode(docInUtil, freqBuffer);
pforUtil.decode(docIn, freqBuffer);
freqFP = -1;
}
@ -476,7 +464,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
private void refillFullBlock() throws IOException {
assert docFreq - docCountUpto >= BLOCK_SIZE;
forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
forDeltaUtil.decodeAndPrefixSum(docIn, prevDocID, docBuffer);
if (indexHasFreq) {
if (needsFreq) {
@ -487,7 +475,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
docCountUpto += BLOCK_SIZE;
prevDocID = docBuffer[BLOCK_SIZE - 1];
docBufferUpto = 0;
assert docBuffer[BLOCK_SIZE] == NO_MORE_DOCS;
assert docBuffer[docBufferSize] == NO_MORE_DOCS;
}
private void refillRemainder() throws IOException {
@ -508,6 +496,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
docCountUpto += left;
}
docBufferUpto = 0;
docBufferSize = left;
freqFP = -1;
}
@ -580,7 +569,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
@Override
public int nextDoc() throws IOException {
if (doc == level0LastDocID) { // advance skip data on level 0
if (docBufferUpto == BLOCK_SIZE) { // advance skip data on level 0
moveToNextLevel0Block();
}
@ -604,7 +593,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
}
}
int next = findFirstGreater(docBuffer, target, docBufferUpto);
int next = findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize);
this.doc = (int) docBuffer[next];
docBufferUpto = next + 1;
return doc;
@ -636,9 +625,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
private int posBufferUpto;
final IndexInput posIn;
final PostingDecodingUtil posInUtil;
final IndexInput payIn;
final PostingDecodingUtil payInUtil;
final BytesRef payload;
final boolean indexHasOffsets;
@ -681,13 +668,10 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads;
this.posIn = Lucene912PostingsReader.this.posIn.clone();
posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn);
if (indexHasOffsetsOrPayloads) {
this.payIn = Lucene912PostingsReader.this.payIn.clone();
payInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(payIn);
} else {
this.payIn = null;
payInUtil = null;
}
if (indexHasOffsets) {
offsetStartDeltaBuffer = new long[BLOCK_SIZE];
@ -727,7 +711,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
}
totalTermFreq = termState.totalTermFreq;
if (pforUtil == null && totalTermFreq >= BLOCK_SIZE) {
pforUtil = new PForUtil(new ForUtil());
pforUtil = new PForUtil();
}
// Where this term's postings start in the .pos file:
final long posTermStartFP = termState.posStartFP;
@ -774,24 +758,26 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
assert left >= 0;
if (left >= BLOCK_SIZE) {
forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
pforUtil.decode(docInUtil, freqBuffer);
forDeltaUtil.decodeAndPrefixSum(docIn, prevDocID, docBuffer);
pforUtil.decode(docIn, freqBuffer);
docCountUpto += BLOCK_SIZE;
} else if (docFreq == 1) {
docBuffer[0] = singletonDocID;
freqBuffer[0] = totalTermFreq;
docBuffer[1] = NO_MORE_DOCS;
docCountUpto++;
docBufferSize = 1;
} else {
// Read vInts:
PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true);
prefixSum(docBuffer, left, prevDocID);
docBuffer[left] = NO_MORE_DOCS;
docCountUpto += left;
docBufferSize = left;
}
prevDocID = docBuffer[BLOCK_SIZE - 1];
docBufferUpto = 0;
assert docBuffer[BLOCK_SIZE] == NO_MORE_DOCS;
assert docBuffer[docBufferSize] == NO_MORE_DOCS;
}
private void skipLevel1To(int target) throws IOException {
@ -875,7 +861,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
@Override
public int nextDoc() throws IOException {
if (doc == level0LastDocID) { // advance level 0 skip data
if (docBufferUpto == BLOCK_SIZE) { // advance level 0 skip data
moveToNextLevel0Block();
}
@ -951,7 +937,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
refillDocs();
}
int next = findFirstGreater(docBuffer, target, docBufferUpto);
int next = findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize);
posPendingCount += sumOverRange(freqBuffer, docBufferUpto, next + 1);
this.freq = (int) freqBuffer[next];
this.docBufferUpto = next + 1;
@ -1045,11 +1031,11 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
}
payloadByteUpto = 0;
} else {
pforUtil.decode(posInUtil, posDeltaBuffer);
pforUtil.decode(posIn, posDeltaBuffer);
if (indexHasPayloads) {
if (needsPayloads) {
pforUtil.decode(payInUtil, payloadLengthBuffer);
pforUtil.decode(payIn, payloadLengthBuffer);
int numBytes = payIn.readVInt();
if (numBytes > payloadBytes.length) {
@ -1068,8 +1054,8 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
if (indexHasOffsets) {
if (needsOffsets) {
pforUtil.decode(payInUtil, offsetStartDeltaBuffer);
pforUtil.decode(payInUtil, offsetLengthBuffer);
pforUtil.decode(payIn, offsetStartDeltaBuffer);
pforUtil.decode(payIn, offsetLengthBuffer);
} else {
// this works, because when writing a vint block we always force the first length to be
// written
@ -1142,7 +1128,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
private abstract class BlockImpactsEnum extends ImpactsEnum {
protected final ForDeltaUtil forDeltaUtil = new ForDeltaUtil();
protected final PForUtil pforUtil = new PForUtil(new ForUtil());
protected final PForUtil pforUtil = new PForUtil();
protected final long[] docBuffer = new long[BLOCK_SIZE + 1];
protected final long[] freqBuffer = new long[BLOCK_SIZE];
@ -1150,11 +1136,11 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
protected final int docFreq; // number of docs in this posting list
protected final IndexInput docIn;
protected final PostingDecodingUtil docInUtil;
protected int docCountUpto; // number of docs in or before the current block
protected int doc = -1; // doc we last read
protected long prevDocID = -1; // last doc ID of the previous block
protected int docBufferSize = BLOCK_SIZE;
protected int docBufferUpto = BLOCK_SIZE;
// true if we shallow-advanced to a new block that we have not decoded yet
@ -1175,7 +1161,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
private BlockImpactsEnum(IntBlockTermState termState) throws IOException {
this.docFreq = termState.docFreq;
this.docIn = Lucene912PostingsReader.this.docIn.clone();
this.docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn);
prefetchPostings(docIn, termState);
level0SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel0);
level1SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel1);
@ -1279,7 +1264,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
public int freq() throws IOException {
if (freqFP != -1) {
docIn.seek(freqFP);
pforUtil.decode(docInUtil, freqBuffer);
pforUtil.decode(docIn, freqBuffer);
freqFP = -1;
}
return (int) freqBuffer[docBufferUpto - 1];
@ -1295,7 +1280,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
assert left >= 0;
if (left >= BLOCK_SIZE) {
forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
forDeltaUtil.decodeAndPrefixSum(docIn, prevDocID, docBuffer);
freqFP = docIn.getFilePointer();
PForUtil.skip(docIn);
docCountUpto += BLOCK_SIZE;
@ -1306,10 +1291,11 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
docBuffer[left] = NO_MORE_DOCS;
freqFP = -1;
docCountUpto += left;
docBufferSize = left;
}
prevDocID = docBuffer[BLOCK_SIZE - 1];
docBufferUpto = 0;
assert docBuffer[BLOCK_SIZE] == NO_MORE_DOCS;
assert docBuffer[docBufferSize] == NO_MORE_DOCS;
}
private void skipLevel1To(int target) throws IOException {
@ -1417,11 +1403,13 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
@Override
public int nextDoc() throws IOException {
if (doc == level0LastDocID) {
moveToNextLevel0Block();
} else if (needsRefilling) {
refillDocs();
needsRefilling = false;
if (docBufferUpto == BLOCK_SIZE) {
if (needsRefilling) {
refillDocs();
needsRefilling = false;
} else {
moveToNextLevel0Block();
}
}
return this.doc = (int) docBuffer[docBufferUpto++];
@ -1435,7 +1423,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
needsRefilling = false;
}
int next = findFirstGreater(docBuffer, target, docBufferUpto);
int next = findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize);
this.doc = (int) docBuffer[next];
docBufferUpto = next + 1;
return doc;
@ -1447,7 +1435,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
private int posBufferUpto;
final IndexInput posIn;
final PostingDecodingUtil posInUtil;
final boolean indexHasFreq;
final boolean indexHasOffsets;
@ -1488,7 +1475,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads;
this.posIn = Lucene912PostingsReader.this.posIn.clone();
posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn);
// Where this term's postings start in the .pos file:
final long posTermStartFP = termState.posStartFP;
@ -1519,8 +1505,8 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
assert left >= 0;
if (left >= BLOCK_SIZE) {
forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
pforUtil.decode(docInUtil, freqBuffer);
forDeltaUtil.decodeAndPrefixSum(docIn, prevDocID, docBuffer);
pforUtil.decode(docIn, freqBuffer);
docCountUpto += BLOCK_SIZE;
} else if (docFreq == 1) {
docBuffer[0] = singletonDocID;
@ -1533,10 +1519,11 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
prefixSum(docBuffer, left, prevDocID);
docBuffer[left] = NO_MORE_DOCS;
docCountUpto += left;
docBufferSize = left;
}
prevDocID = docBuffer[BLOCK_SIZE - 1];
docBufferUpto = 0;
assert docBuffer[BLOCK_SIZE] == NO_MORE_DOCS;
assert docBuffer[docBufferSize] == NO_MORE_DOCS;
}
private void skipLevel1To(int target) throws IOException {
@ -1644,8 +1631,9 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
@Override
public int nextDoc() throws IOException {
advanceShallow(doc + 1);
if (needsRefilling) {
if (docBufferUpto == BLOCK_SIZE) {
advanceShallow(doc + 1);
assert needsRefilling;
refillDocs();
needsRefilling = false;
}
@ -1666,7 +1654,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
needsRefilling = false;
}
int next = findFirstGreater(docBuffer, target, docBufferUpto);
int next = findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize);
posPendingCount += sumOverRange(freqBuffer, docBufferUpto, next + 1);
freq = (int) freqBuffer[next];
docBufferUpto = next + 1;
@ -1724,7 +1712,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
}
}
} else {
pforUtil.decode(posInUtil, posDeltaBuffer);
pforUtil.decode(posIn, posDeltaBuffer);
}
}
@ -1749,9 +1737,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
}
}
/**
* @see Lucene912PostingsWriter#writeVInt15(org.apache.lucene.store.DataOutput, int)
*/
static int readVInt15(DataInput in) throws IOException {
short s = in.readShort();
if (s >= 0) {
@ -1761,9 +1746,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
}
}
/**
* @see Lucene912PostingsWriter#writeVLong15(org.apache.lucene.store.DataOutput, long)
*/
static long readVLong15(DataInput in) throws IOException {
short s = in.readShort();
if (s >= 0) {
@ -1773,6 +1755,15 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
}
}
private static int findNextGEQ(long[] buffer, long target, int from, int to) {
for (int i = from; i < to; ++i) {
if (buffer[i] >= target) {
return i;
}
}
return to;
}
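(Editorial aside: findNextGEQ above replaces the removed findFirstGreater and takes an explicit "to" bound because the tail block may buffer fewer than BLOCK_SIZE docs, which docBufferSize now tracks. The stand-alone sketch below, with made-up data and a hypothetical class name, only restates the contract: return the first index in [from, to) holding a value >= target, or "to" if none does.)

// Illustrative only; mirrors the findNextGEQ contract with hypothetical inputs.
public final class FindNextGEQDemo {
  static int findNextGEQ(long[] buffer, long target, int from, int to) {
    for (int i = from; i < to; ++i) {
      if (buffer[i] >= target) {
        return i;
      }
    }
    return to;
  }

  public static void main(String[] args) {
    long[] docBuffer = {3, 7, 12, 20, 31};                 // hypothetical buffered doc IDs
    System.out.println(findNextGEQ(docBuffer, 12, 0, 5));  // 2
    System.out.println(findNextGEQ(docBuffer, 40, 0, 5));  // 5, i.e. the "to" bound
  }
}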
private static void prefetchPostings(IndexInput docIn, IntBlockTermState state)
throws IOException {
assert state.docFreq > 1; // Singletons are inlined in the terms dict, nothing to prefetch

View File

@ -14,13 +14,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.LongHeap;
import org.apache.lucene.util.packed.PackedInts;
@ -38,11 +38,10 @@ final class PForUtil {
return true;
}
private final ForUtil forUtil;
private final ForUtil forUtil = new ForUtil();
PForUtil(ForUtil forUtil) {
static {
assert ForUtil.BLOCK_SIZE <= 256 : "blocksize must fit in one byte. got " + ForUtil.BLOCK_SIZE;
this.forUtil = forUtil;
}
/** Encode 128 integers from {@code longs} into {@code out}. */
@ -105,18 +104,18 @@ final class PForUtil {
}
/** Decode 128 integers into {@code ints}. */
void decode(PostingDecodingUtil pdu, long[] longs) throws IOException {
final int token = Byte.toUnsignedInt(pdu.in.readByte());
void decode(IndexInput in, long[] longs) throws IOException {
final int token = Byte.toUnsignedInt(in.readByte());
final int bitsPerValue = token & 0x1f;
final int numExceptions = token >>> 5;
if (bitsPerValue == 0) {
Arrays.fill(longs, 0, ForUtil.BLOCK_SIZE, pdu.in.readVLong());
Arrays.fill(longs, 0, ForUtil.BLOCK_SIZE, in.readVLong());
} else {
forUtil.decode(bitsPerValue, pdu, longs);
forUtil.decode(bitsPerValue, in, longs);
}
final int numExceptions = token >>> 5;
for (int i = 0; i < numExceptions; ++i) {
longs[Byte.toUnsignedInt(pdu.in.readByte())] |=
Byte.toUnsignedLong(pdu.in.readByte()) << bitsPerValue;
longs[Byte.toUnsignedInt(in.readByte())] |=
Byte.toUnsignedLong(in.readByte()) << bitsPerValue;
}
}
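(Editorial aside on the decode method above: the header byte packs the bit width of the packed values into its low 5 bits and the number of patched exceptions into its high 3 bits, and each exception's position is read back as a single byte, which is why the earlier assert requires BLOCK_SIZE to fit in one byte. The sketch below, with a made-up header byte and a hypothetical class name, only shows how those two fields come apart; it is not Lucene code.)

// Illustrative sketch: unpacking a PFOR header byte the same way decode() does.
public final class PForTokenDemo {
  public static void main(String[] args) {
    int token = Byte.toUnsignedInt((byte) 0xA7); // hypothetical header byte 1010_0111
    int bitsPerValue = token & 0x1f;             // low 5 bits  -> 7
    int numExceptions = token >>> 5;             // high 3 bits -> 5
    System.out.println("bitsPerValue=" + bitsPerValue + ", exceptions=" + numExceptions);
    // Each exception then ORs extra high bits into one slot: longs[pos] |= extra << bitsPerValue.
  }
}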

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;
import java.io.IOException;
import org.apache.lucene.store.DataOutput;

View File

@ -40,15 +40,14 @@ HEADER = """// This file has been automatically generated, DO NOT EDIT
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;
import java.io.IOException;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.packed.PackedInts;
import static org.apache.lucene.codecs.lucene912.ForUtil.*;
import static org.apache.lucene.backward_codecs.lucene912.ForUtil.*;
/**
* Inspired from https://fulmicoton.com/posts/bitpacking/
@ -57,7 +56,7 @@ import static org.apache.lucene.codecs.lucene912.ForUtil.*;
* else if bitsPerValue &lt;= 11 we pack 4 ints per long
* else we pack 2 ints per long
*/
public final class ForDeltaUtil {
final class ForDeltaUtil {
private static final int ONE_BLOCK_SIZE_FOURTH = BLOCK_SIZE / 4;
private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2;
@ -299,12 +298,12 @@ public final class ForDeltaUtil {
}
/** Decode deltas, compute the prefix sum and add {@code base} to all decoded longs. */
void decodeAndPrefixSum(PostingDecodingUtil pdu, long base, long[] longs) throws IOException {
final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte());
void decodeAndPrefixSum(IndexInput in, long base, long[] longs) throws IOException {
final int bitsPerValue = Byte.toUnsignedInt(in.readByte());
if (bitsPerValue == 0) {
prefixSumOfOnes(longs, base);
} else {
decodeAndPrefixSum(bitsPerValue, pdu, base, longs);
decodeAndPrefixSum(bitsPerValue, in, base, longs);
}
}
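(Editorial aside: doc-ID blocks are stored as deltas, so once the bits are unpacked, decodeAndPrefixSum rebuilds absolute doc IDs with a running sum seeded by base, the last doc ID of the previous block. A minimal sketch with hypothetical numbers, not Lucene code, illustrates just that prefix-sum step.)

// Minimal illustration of the delta-decode + prefix-sum idea; values are made up.
public final class PrefixSumDemo {
  public static void main(String[] args) {
    long base = 41;                  // assumed last doc ID of the previous block
    long[] deltas = {1, 3, 2, 5};    // assumed decoded deltas
    long[] docIds = new long[deltas.length];
    long prev = base;
    for (int i = 0; i < deltas.length; i++) {
      prev += deltas[i];
      docIds[i] = prev;              // 42, 45, 47, 52
    }
    System.out.println(java.util.Arrays.toString(docIds));
  }
}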
@ -361,18 +360,21 @@ def writeRemainder(bpv, next_primitive, remaining_bits_per_long, o, num_values,
def writeDecode(bpv, f):
next_primitive = primitive_size_for_bpv(bpv)
f.write(' private static void decode%dTo%d(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {\n' %(bpv, next_primitive))
if next_primitive % bpv == 0:
f.write(' private static void decode%dTo%d(IndexInput in, long[] longs) throws IOException {\n' %(bpv, next_primitive))
else:
f.write(' private static void decode%dTo%d(IndexInput in, long[] tmp, long[] longs) throws IOException {\n' %(bpv, next_primitive))
if bpv == next_primitive:
f.write(' pdu.in.readLongs(longs, 0, %d);\n' %(bpv*2))
f.write(' in.readLongs(longs, 0, %d);\n' %(bpv*2))
else:
num_values_per_long = 64 / next_primitive
remaining_bits = next_primitive % bpv
num_iters = (next_primitive - 1) // bpv
o = 2 * bpv * num_iters
if remaining_bits == 0:
f.write(' pdu.splitLongs(%d, longs, %d, %d, MASK%d_%d, longs, %d, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
f.write(' splitLongs(in, %d, longs, %d, %d, MASK%d_%d, longs, %d, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
else:
f.write(' pdu.splitLongs(%d, longs, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
f.write(' splitLongs(in, %d, longs, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_long - o, f)
f.write(' }\n')
@ -383,20 +385,26 @@ if __name__ == '__main__':
/**
* Delta-decode 128 integers into {@code longs}.
*/
void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, long base, long[] longs) throws IOException {
void decodeAndPrefixSum(int bitsPerValue, IndexInput in, long base, long[] longs) throws IOException {
switch (bitsPerValue) {
""")
for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
primitive_size = primitive_size_for_bpv(bpv)
f.write(' case %d:\n' %bpv)
if next_primitive(bpv) == primitive_size:
f.write(' decode%d(pdu, tmp, longs);\n' %bpv)
if primitive_size % bpv == 0:
f.write(' decode%d(in, longs);\n' %bpv)
else:
f.write(' decode%d(in, tmp, longs);\n' %bpv)
else:
f.write(' decode%dTo%d(pdu, tmp, longs);\n' %(bpv, primitive_size))
if primitive_size % bpv == 0:
f.write(' decode%dTo%d(in, longs);\n' %(bpv, primitive_size))
else:
f.write(' decode%dTo%d(in, tmp, longs);\n' %(bpv, primitive_size))
f.write(' prefixSum%d(longs, base);\n' %primitive_size)
f.write(' break;\n')
f.write(' default:\n')
f.write(' decodeSlow(bitsPerValue, pdu, tmp, longs);\n')
f.write(' decodeSlow(bitsPerValue, in, tmp, longs);\n')
f.write(' prefixSum32(longs, base);\n')
f.write(' break;\n')
f.write(' }\n')

View File

@ -40,10 +40,9 @@ HEADER = """// This file has been automatically generated, DO NOT EDIT
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;
import java.io.IOException;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
@ -54,7 +53,7 @@ import org.apache.lucene.store.IndexInput;
* else if bitsPerValue &lt;= 16 we pack 4 ints per long
* else we pack 2 ints per long
*/
public final class ForUtil {
final class ForUtil {
public static final int BLOCK_SIZE = 128;
static final int BLOCK_SIZE_LOG2 = 7;
@ -222,11 +221,11 @@ public final class ForUtil {
return bitsPerValue << (BLOCK_SIZE_LOG2 - 3);
}
static void decodeSlow(int bitsPerValue, PostingDecodingUtil pdu, long[] tmp, long[] longs)
static void decodeSlow(int bitsPerValue, IndexInput in, long[] tmp, long[] longs)
throws IOException {
final int numLongs = bitsPerValue << 1;
final long mask = MASKS32[bitsPerValue];
pdu.splitLongs(numLongs, longs, 32 - bitsPerValue, 32, mask, tmp, 0, -1L);
splitLongs(in, numLongs, longs, 32 - bitsPerValue, 32, mask, tmp, 0, -1L);
final int remainingBitsPerLong = 32 - bitsPerValue;
final long mask32RemainingBitsPerLong = MASKS32[remainingBitsPerLong];
int tmpIdx = 0;
@ -248,6 +247,20 @@ public final class ForUtil {
}
}
static void splitLongs(
IndexInput in, int count, long[] b, int bShift, int dec, long bMask, long[] c, int cIndex, long cMask)
throws IOException {
// takes advantage of the C2 compiler's loop unrolling and auto-vectorization.
in.readLongs(c, cIndex, count);
int maxIter = (bShift - 1) / dec;
for (int i = 0; i < count; ++i) {
for (int j = 0; j <= maxIter; ++j) {
b[count * j + i] = (c[cIndex + i] >>> (bShift - j * dec)) & bMask;
}
c[cIndex + i] &= cMask;
}
}
"""
def writeRemainder(bpv, next_primitive, remaining_bits_per_long, o, num_values, f):
@ -287,18 +300,20 @@ def writeDecode(bpv, f):
next_primitive = 8
elif bpv <= 16:
next_primitive = 16
f.write(' static void decode%d(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {\n' %bpv)
if bpv == next_primitive:
f.write(' pdu.in.readLongs(longs, 0, %d);\n' %(bpv*2))
f.write(' static void decode%d(IndexInput in, long[] longs) throws IOException {\n' %bpv)
f.write(' in.readLongs(longs, 0, %d);\n' %(bpv*2))
else:
num_values_per_long = 64 / next_primitive
remaining_bits = next_primitive % bpv
num_iters = (next_primitive - 1) // bpv
o = 2 * bpv * num_iters
if remaining_bits == 0:
f.write(' pdu.splitLongs(%d, longs, %d, %d, MASK%d_%d, longs, %d, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
f.write(' static void decode%d(IndexInput in, long[] longs) throws IOException {\n' %bpv)
f.write(' splitLongs(in, %d, longs, %d, %d, MASK%d_%d, longs, %d, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
else:
f.write(' pdu.splitLongs(%d, longs, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
f.write(' static void decode%d(IndexInput in, long[] tmp, long[] longs) throws IOException {\n' %bpv)
f.write(' splitLongs(in, %d, longs, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_long - o, f)
f.write(' }\n')
@ -324,7 +339,7 @@ if __name__ == '__main__':
f.write("""
/** Decode 128 integers into {@code longs}. */
void decode(int bitsPerValue, PostingDecodingUtil pdu, long[] longs) throws IOException {
void decode(int bitsPerValue, IndexInput in, long[] longs) throws IOException {
switch (bitsPerValue) {
""")
for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
@ -334,11 +349,14 @@ if __name__ == '__main__':
elif bpv <= 16:
next_primitive = 16
f.write(' case %d:\n' %bpv)
f.write(' decode%d(pdu, tmp, longs);\n' %bpv)
if next_primitive % bpv == 0:
f.write(' decode%d(in, longs);\n' %bpv)
else:
f.write(' decode%d(in, tmp, longs);\n' %bpv)
f.write(' expand%d(longs);\n' %next_primitive)
f.write(' break;\n')
f.write(' default:\n')
f.write(' decodeSlow(bitsPerValue, pdu, tmp, longs);\n')
f.write(' decodeSlow(bitsPerValue, in, tmp, longs);\n')
f.write(' expand32(longs);\n')
f.write(' break;\n')
f.write(' }\n')

View File

@ -15,419 +15,5 @@
* limitations under the License.
*/
/**
* Lucene 9.12 file format.
*
* <h2>Apache Lucene - Index File Formats</h2>
*
* <div>
*
* <ul>
* <li><a href="#Introduction">Introduction</a>
* <li><a href="#Definitions">Definitions</a>
* <ul>
* <li><a href="#Inverted_Indexing">Inverted Indexing</a>
* <li><a href="#Types_of_Fields">Types of Fields</a>
* <li><a href="#Segments">Segments</a>
* <li><a href="#Document_Numbers">Document Numbers</a>
* </ul>
* <li><a href="#Overview">Index Structure Overview</a>
* <li><a href="#File_Naming">File Naming</a>
* <li><a href="#file-names">Summary of File Extensions</a>
* <ul>
* <li><a href="#Lock_File">Lock File</a>
* <li><a href="#History">History</a>
* <li><a href="#Limitations">Limitations</a>
* </ul>
* </ul>
*
* </div> <a id="Introduction"></a>
*
* <h3>Introduction</h3>
*
* <div>
*
* <p>This document defines the index file formats used in this version of Lucene. If you are using
* a different version of Lucene, please consult the copy of <code>docs/</code> that was distributed
* with the version you are using.
*
* <p>This document attempts to provide a high-level definition of the Apache Lucene file formats.
* </div> <a id="Definitions"></a>
*
* <h3>Definitions</h3>
*
* <div>
*
* <p>The fundamental concepts in Lucene are index, document, field and term.
*
* <p>An index contains a sequence of documents.
*
* <ul>
* <li>A document is a sequence of fields.
* <li>A field is a named sequence of terms.
* <li>A term is a sequence of bytes.
* </ul>
*
* <p>The same sequence of bytes in two different fields is considered a different term. Thus terms
* are represented as a pair: the string naming the field, and the bytes within the field. <a
* id="Inverted_Indexing"></a>
*
* <h4>Inverted Indexing</h4>
*
* <p>Lucene's index stores terms and statistics about those terms in order to make term-based
* search more efficient. Lucene's terms index falls into the family of indexes known as an
* <i>inverted index.</i> This is because it can list, for a term, the documents that contain it.
* This is the inverse of the natural relationship, in which documents list terms. <a
* id="Types_of_Fields"></a>
*
* <h4>Types of Fields</h4>
*
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored in the index
* literally, in a non-inverted manner. Fields that are inverted are called <i>indexed</i>. A field
* may be both stored and indexed.
*
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the text of a field
* may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
* useful for certain identifier fields to be indexed literally.
*
* <p>See the {@link org.apache.lucene.document.Field Field} java docs for more information on
* Fields. <a id="Segments"></a>
*
* <h4>Segments</h4>
*
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>. Each segment is a
* fully independent index, which could be searched separately. Indexes evolve by:
*
* <ol>
* <li>Creating new segments for newly added documents.
* <li>Merging existing segments.
* </ol>
*
* <p>Searches may involve multiple segments and/or multiple indexes, each index potentially
* composed of a set of segments. <a id="Document_Numbers"></a>
*
* <h4>Document Numbers</h4>
*
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>. The first
* document added to an index is numbered zero, and each subsequent document added gets a number one
* greater than the previous.
*
* <p>Note that a document's number may change, so caution should be taken when storing these
* numbers outside of Lucene. In particular, numbers may change in the following situations:
*
* <ul>
* <li>
* <p>The numbers stored in each segment are unique only within the segment, and must be
* converted before they can be used in a larger context. The standard technique is to
* allocate each segment a range of values, based on the range of numbers used in that
* segment. To convert a document number from a segment to an external value, the segment's
* <i>base</i> document number is added. To convert an external value back to a
* segment-specific value, the segment is identified by the range that the external value is
* in, and the segment's base value is subtracted. For example two five document segments
* might be combined, so that the first segment has a base value of zero, and the second of
* five. Document three from the second segment would have an external value of eight.
* <li>
* <p>When documents are deleted, gaps are created in the numbering. These are eventually
* removed as the index evolves through merging. Deleted documents are dropped when segments
* are merged. A freshly-merged segment thus has no gaps in its numbering.
* </ul>
*
* </div> <a id="Overview"></a>
*
* <h3>Index Structure Overview</h3>
*
* <div>
*
* <p>Each segment index maintains the following:
*
* <ul>
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This
* contains metadata about a segment, such as the number of documents, what files it uses, and
* information about how the segment is sorted
* <li>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
* contains metadata about the set of named fields used in the index.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
* This contains, for each document, a list of attribute-value pairs, where the attributes are
* field names. These are used to store auxiliary information about the document, such as its
* title, url, or an identifier to access a database. The set of stored fields are what is
* returned for each hit when searching. This is keyed by document number.
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term dictionary}. A
* dictionary containing all of the terms used in all of the indexed fields of all of the
* documents. The dictionary also contains the number of documents which contain the term, and
* pointers to the term's frequency and proximity data.
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Frequency data}. For
* each term in the dictionary, the numbers of all the documents that contain that term, and
* the frequency of the term in that document, unless frequencies are omitted ({@link
* org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Proximity data}. For
* each term in the dictionary, the positions that the term occurs in each document. Note that
* this will not exist if all fields in all documents omit position data.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
* each field in each document, a value is stored that is multiplied into the score for hits
* on that field.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
* field in each document, the term vector (sometimes called document vector) may be stored. A
* term vector consists of term text and term frequency. To add Term Vectors to your index see
* the {@link org.apache.lucene.document.Field Field} constructors
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
* stored values, these are also keyed by document number, but are generally intended to be
* loaded into main memory for fast access. Whereas stored values are generally intended for
* summary results from searches, per-document values are useful for things like scoring
* factors.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
* optional file indicating which documents are live.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
* of files, recording dimensionally indexed fields, to enable fast numeric range filtering
* and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
* intersection (2D, 3D).
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The
* vector format stores numeric vectors in a format optimized for random access and
* computation, supporting high-dimensional nearest-neighbor search.
* </ul>
*
* <p>Details on each of these are provided in their linked pages. </div> <a id="File_Naming"></a>
*
* <h3>File Naming</h3>
*
* <div>
*
* <p>All files belonging to a segment have the same name with varying extensions. The extensions
* correspond to the different file formats described below. When using the Compound File format
* (default for small segments) these files (except for the Segment info file, the Lock file, and
* Deleted documents file) are collapsed into a single .cfs file (see below for details)
*
* <p>Typically, all segments in an index are stored in a single directory, although this is not
* required.
*
* <p>File names are never re-used. That is, when any file is saved to the Directory it is given a
* never before used filename. This is achieved using a simple generations approach. For example,
* the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
* integer represented in alpha-numeric (base 36) form. </div> <a id="file-names"></a>
*
* <h3>Summary of File Extensions</h3>
*
* <div>
*
* <p>The following table summarizes the names and extensions of the files in Lucene:
*
* <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
* <caption>lucene filenames by extension</caption>
* <tr>
* <th>Name</th>
* <th>Extension</th>
* <th>Brief Description</th>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
* <td>segments_N</td>
* <td>Stores information about a commit point</td>
* </tr>
* <tr>
* <td><a href="#Lock_File">Lock File</a></td>
* <td>write.lock</td>
* <td>The Write lock prevents multiple IndexWriters from writing to the same
* file.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info}</td>
* <td>.si</td>
* <td>Stores metadata about a segment</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}</td>
* <td>.cfs, .cfe</td>
* <td>An optional "virtual" file consisting of all the other index files for
* systems that frequently run out of file handles.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}</td>
* <td>.fnm</td>
* <td>Stores information about the fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}</td>
* <td>.fdx</td>
* <td>Contains pointers to field data</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}</td>
* <td>.fdt</td>
* <td>The stored fields for documents</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Dictionary}</td>
* <td>.tim</td>
* <td>The term dictionary, stores term info</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Index}</td>
* <td>.tip</td>
* <td>The index into the Term Dictionary</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Frequencies}</td>
* <td>.doc</td>
* <td>Contains the list of docs which contain each term along with frequency</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Positions}</td>
* <td>.pos</td>
* <td>Stores position information about where a term occurs in the index</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Payloads}</td>
* <td>.pay</td>
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}</td>
* <td>.nvd, .nvm</td>
* <td>Encodes length and boost factors for docs and fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}</td>
* <td>.dvd, .dvm</td>
* <td>Encodes additional scoring factors or other per-document information.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}</td>
* <td>.tvx</td>
* <td>Stores offset into the document data file</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}</td>
* <td>.tvd</td>
* <td>Contains term vector data.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}</td>
* <td>.liv</td>
* <td>Info about what documents are live</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}</td>
* <td>.kdd, .kdi, .kdm</td>
* <td>Holds indexed points</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}</td>
* <td>.vec, .vem, .veq, vex</td>
 *   <td>.vec, .vem, .veq, .vex</td>
* <code>.vem</code> the vector metadata, <code>.veq</code> the quantized vector data, and <code>.vex</code> the
* hnsw graph data.</td>
* </tr>
* </table>
*
* </div> <a id="Lock_File"></a>
*
* <h3>Lock File</h3>
*
* The write lock, which is stored in the index directory by default, is named "write.lock". If the
* lock directory is different from the index directory then the write lock will be named
* "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
* directory. When this file is present, a writer is currently modifying the index (adding or
* removing documents). This lock file ensures that only one writer is modifying the index at a
* time. <a id="History"></a>
*
* <h3>History</h3>
*
* <p>Compatibility notes are provided in this document, describing how file formats have changed
* from prior versions:
*
* <ul>
* <li>In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit
* lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
* or adding/deleting of docs. When the new segments file is saved (committed), it will be
* written in the new file format (meaning no specific "upgrade" process is needed). But note
* that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
* <li>In version 2.3, the file format was changed to allow segments to share a single set of doc
* store (vectors &amp; stored fields) files. This allows for faster indexing in certain
* cases. The change is fully backwards compatible (in the same way as the lock-less commits
* change in 2.1).
* <li>In version 2.4, Strings are now written as true UTF-8 byte sequence, not Java's modified
* UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">LUCENE-510</a> for
* details.
* <li>In version 2.9, an optional opaque Map&lt;String,String&gt; CommitUserData may be passed to
* IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
* file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">LUCENE-1382</a> for
* details. Also, diagnostics were added to each segment written recording details about why
* it was written (due to flush, merge; which OS/JRE was used; etc.). See issue <a
* href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.
* <li>In version 3.0, compressed fields are no longer written to the index (they can still be
* read, but on merge the new segment will write them, uncompressed). See issue <a
* href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a> for details.
* <li>In version 3.1, segments records the code version that created them. See <a
* href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
* Additionally segments track explicitly whether or not they have term vectors. See <a
* href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a> for details.
 *   <li>In version 3.2, numeric fields are written natively to the stored fields file; previously
 *       they were stored in text format only.
* <li>In version 3.4, fields can omit position data while still indexing term frequencies.
* <li>In version 4.0, the format of the inverted index became extensible via the {@link
* org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
* was introduced. Normalization factors need no longer be a single byte, they can be any
* {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
* unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
* the postings lists. Payloads can be stored in the term vectors.
* <li>In version 4.1, the format of the postings list changed to use either of FOR compression or
* variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
* were changed to inline directly into the term dictionary. Stored fields are compressed by
* default.
* <li>In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
* type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
* allow updating NumericDocValues fields.
* <li>In version 4.8, checksum footers were added to the end of each index file for improved data
* integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
* checksum of the file.
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
* suitable for faceting/sorting/analytics.
* <li>In version 5.4, DocValues have been improved to store more information on disk: addresses
* for binary fields and ord indexes for multi-valued fields.
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
* <li>In version 6.2, new Segment info format that reads/writes the index sort, to support index
* sorting.
* <li>In version 7.0, DocValues have been improved to better support sparse doc values thanks to
* an iterator API.
* <li>In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
* freq, normalization factor) pairs that may trigger the maximum score of the block. This
* information is recorded alongside skip data in order to be able to skip blocks of doc ids
* if they may not produce high enough scores. Additionally doc values and norms has been
* extended with jump-tables to make access O(1) instead of O(n), where n is the number of
* elements to skip when advancing in the data.
 *   <li>In version 8.4, postings, positions, offsets and payload lengths have moved to a more
* performant encoding that is vectorized.
* <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
* user-defined sorts to be used
* <li>In version 8.6, points fields split the index tree and leaf data into separate files, to
* allow for different access patterns to the different data structures
* <li>In version 8.7, stored fields compression became adaptive to better handle documents with
* smaller stored fields.
* <li>In version 9.0, vector-valued fields were added.
* <li>In version 9.1, vector-valued fields were modified to add a graph hierarchy.
* <li>In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by
 *       IndexDISI. ordToDoc mappings were added to .vem.
* <li>In version 9.5, HNSW graph connections were changed to be delta-encoded with vints.
* Additionally, metadata file size improvements were made by delta-encoding nodes by graph
* layer and not writing the node ids for the zeroth layer.
 *   <li>In version 9.9, vector scalar quantization support was added, allowing the HNSW vector
 *       format to utilize int8 quantized vectors for float32 vector search.
* <li>In version 9.12, skip data was refactored to have only two levels: every 128 docs and every
 *       4,096 docs, and to be inlined in postings lists. This resulted in a speedup for queries that
* need skipping, especially conjunctions.
* </ul>
*
* <a id="Limitations"></a>
*
* <h3>Limitations</h3>
*
* <div>
*
* <p>Lucene uses a Java <code>int</code> to refer to document numbers, and the index file format
* uses an <code>Int32</code> on-disk to store document numbers. This is a limitation of both the
* index file format and the current implementation. Eventually these should be replaced with either
* <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
* VInt} values which have no limit. </div>
*/
/** Lucene 9.12 file format. */
package org.apache.lucene.backward_codecs.lucene912;
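(Editorial aside on the javadoc removed above: two of its mechanics are easy to restate concretely. Segment-local document numbers become index-wide numbers by adding the segment's base, as in the removed example where doc 3 of a second five-document segment is external doc 8, and the segments_N generation is a sequential long rendered in base 36. The sketch below uses hypothetical values and only the standard Long.toString radix behaviour; it is an illustration, not Lucene code.)

// Hypothetical illustration of two points from the removed file-format javadoc.
public final class FileFormatNotesDemo {
  public static void main(String[] args) {
    // Segment-local to index-wide doc numbers: add the segment's base.
    int[] segmentBases = {0, 5};        // two five-document segments, as in the example
    int segment = 1, localDoc = 3;
    System.out.println("external doc = " + (segmentBases[segment] + localDoc)); // 8

    // segments_N generations are sequential longs rendered in base 36.
    long generation = 71;
    System.out.println("segments_" + Long.toString(generation, 36)); // segments_1z
  }
}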

View File

@ -21,8 +21,6 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
@ -34,6 +32,7 @@ import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.internal.hppc.IntObjectHashMap;
import org.apache.lucene.search.KnnCollector;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
@ -53,13 +52,15 @@ import org.apache.lucene.util.packed.DirectMonotonicReader;
*/
public final class Lucene92HnswVectorsReader extends KnnVectorsReader {
private final Map<String, FieldEntry> fields = new HashMap<>();
private final IntObjectHashMap<FieldEntry> fields = new IntObjectHashMap<>();
private final IndexInput vectorData;
private final IndexInput vectorIndex;
private final DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer();
private final FieldInfos fieldInfos;
Lucene92HnswVectorsReader(SegmentReadState state) throws IOException {
int versionMeta = readMetadata(state);
this.fieldInfos = state.fieldInfos;
boolean success = false;
try {
vectorData =
@ -152,7 +153,7 @@ public final class Lucene92HnswVectorsReader extends KnnVectorsReader {
}
FieldEntry fieldEntry = readField(meta, info);
validateFieldEntry(info, fieldEntry);
fields.put(info.name, fieldEntry);
fields.put(info.number, fieldEntry);
}
}
@ -212,13 +213,18 @@ public final class Lucene92HnswVectorsReader extends KnnVectorsReader {
CodecUtil.checksumEntireFile(vectorIndex);
}
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null) {
private FieldEntry getFieldEntry(String field) {
final FieldInfo info = fieldInfos.fieldInfo(field);
final FieldEntry fieldEntry;
if (info == null || (fieldEntry = fields.get(info.number)) == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
return OffHeapFloatVectorValues.load(fieldEntry, vectorData);
return fieldEntry;
}
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
return OffHeapFloatVectorValues.load(getFieldEntry(field), vectorData);
}
@Override
@ -229,8 +235,7 @@ public final class Lucene92HnswVectorsReader extends KnnVectorsReader {
@Override
public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs)
throws IOException {
FieldEntry fieldEntry = fields.get(field);
final FieldEntry fieldEntry = getFieldEntry(field);
if (fieldEntry.size() == 0) {
return;
}

View File

@ -21,8 +21,6 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
@ -35,6 +33,7 @@ import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorEncoding;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.internal.hppc.IntObjectHashMap;
import org.apache.lucene.search.KnnCollector;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
@ -54,13 +53,15 @@ import org.apache.lucene.util.packed.DirectMonotonicReader;
*/
public final class Lucene94HnswVectorsReader extends KnnVectorsReader {
private final Map<String, FieldEntry> fields = new HashMap<>();
private final IntObjectHashMap<FieldEntry> fields = new IntObjectHashMap<>();
private final IndexInput vectorData;
private final IndexInput vectorIndex;
private final DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer();
private final FieldInfos fieldInfos;
Lucene94HnswVectorsReader(SegmentReadState state) throws IOException {
int versionMeta = readMetadata(state);
this.fieldInfos = state.fieldInfos;
boolean success = false;
try {
vectorData =
@ -153,7 +154,7 @@ public final class Lucene94HnswVectorsReader extends KnnVectorsReader {
}
FieldEntry fieldEntry = readField(meta, info);
validateFieldEntry(info, fieldEntry);
fields.put(info.name, fieldEntry);
fields.put(info.number, fieldEntry);
}
}
@ -230,48 +231,41 @@ public final class Lucene94HnswVectorsReader extends KnnVectorsReader {
CodecUtil.checksumEntireFile(vectorIndex);
}
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null) {
private FieldEntry getFieldEntry(String field, VectorEncoding expectedEncoding) {
final FieldInfo info = fieldInfos.fieldInfo(field);
final FieldEntry fieldEntry;
if (info == null || (fieldEntry = fields.get(info.number)) == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
if (fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
if (fieldEntry.vectorEncoding != expectedEncoding) {
throw new IllegalArgumentException(
"field=\""
+ field
+ "\" is encoded as: "
+ fieldEntry.vectorEncoding
+ " expected: "
+ VectorEncoding.FLOAT32);
+ expectedEncoding);
}
return fieldEntry;
}
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32);
return OffHeapFloatVectorValues.load(fieldEntry, vectorData);
}
@Override
public ByteVectorValues getByteVectorValues(String field) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
if (fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
throw new IllegalArgumentException(
"field=\""
+ field
+ "\" is encoded as: "
+ fieldEntry.vectorEncoding
+ " expected: "
+ VectorEncoding.BYTE);
}
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE);
return OffHeapByteVectorValues.load(fieldEntry, vectorData);
}
@Override
public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs)
throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry.size() == 0 || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32);
if (fieldEntry.size() == 0 || knnCollector.k() == 0) {
return;
}
@ -289,9 +283,8 @@ public final class Lucene94HnswVectorsReader extends KnnVectorsReader {
@Override
public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs)
throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry.size() == 0 || fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE);
if (fieldEntry.size() == 0 || knnCollector.k() == 0) {
return;
}

View File

@ -21,8 +21,6 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
@ -39,6 +37,7 @@ import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorEncoding;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.internal.hppc.IntObjectHashMap;
import org.apache.lucene.search.KnnCollector;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
@ -61,7 +60,7 @@ import org.apache.lucene.util.packed.DirectMonotonicReader;
public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements HnswGraphProvider {
private final FieldInfos fieldInfos;
private final Map<String, FieldEntry> fields = new HashMap<>();
private final IntObjectHashMap<FieldEntry> fields = new IntObjectHashMap<>();
private final IndexInput vectorData;
private final IndexInput vectorIndex;
private final DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer();
@ -161,7 +160,7 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
}
FieldEntry fieldEntry = readField(meta, info);
validateFieldEntry(info, fieldEntry);
fields.put(info.name, fieldEntry);
fields.put(info.number, fieldEntry);
}
}
@ -238,21 +237,27 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
CodecUtil.checksumEntireFile(vectorIndex);
}
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null) {
private FieldEntry getFieldEntry(String field, VectorEncoding expectedEncoding) {
final FieldInfo info = fieldInfos.fieldInfo(field);
final FieldEntry fieldEntry;
if (info == null || (fieldEntry = fields.get(info.number)) == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
if (fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
if (fieldEntry.vectorEncoding != expectedEncoding) {
throw new IllegalArgumentException(
"field=\""
+ field
+ "\" is encoded as: "
+ fieldEntry.vectorEncoding
+ " expected: "
+ VectorEncoding.FLOAT32);
+ expectedEncoding);
}
return fieldEntry;
}
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32);
return OffHeapFloatVectorValues.load(
fieldEntry.similarityFunction,
defaultFlatVectorScorer,
@ -266,19 +271,7 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
@Override
public ByteVectorValues getByteVectorValues(String field) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
if (fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
throw new IllegalArgumentException(
"field=\""
+ field
+ "\" is encoded as: "
+ fieldEntry.vectorEncoding
+ " expected: "
+ VectorEncoding.BYTE);
}
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE);
return OffHeapByteVectorValues.load(
fieldEntry.similarityFunction,
defaultFlatVectorScorer,
@ -293,11 +286,8 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
@Override
public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs)
throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry.size() == 0
|| knnCollector.k() == 0
|| fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32);
if (fieldEntry.size() == 0 || knnCollector.k() == 0) {
return;
}
@ -324,11 +314,8 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
@Override
public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs)
throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry.size() == 0
|| knnCollector.k() == 0
|| fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE);
if (fieldEntry.size() == 0 || knnCollector.k() == 0) {
return;
}
@ -355,12 +342,12 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
/** Get knn graph values; used for testing */
@Override
public HnswGraph getGraph(String field) throws IOException {
FieldInfo info = fieldInfos.fieldInfo(field);
if (info == null) {
throw new IllegalArgumentException("No such field '" + field + "'");
final FieldInfo info = fieldInfos.fieldInfo(field);
final FieldEntry entry;
if (info == null || (entry = fields.get(info.number)) == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
FieldEntry entry = fields.get(field);
if (entry != null && entry.vectorIndexLength > 0) {
if (entry.vectorIndexLength > 0) {
return getGraph(entry);
} else {
return HnswGraph.EMPTY;

View File

@ -17,6 +17,7 @@
package org.apache.lucene.backward_codecs.lucene99;
import java.util.Objects;
import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
@ -37,7 +38,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;

View File

@ -24,3 +24,4 @@ org.apache.lucene.backward_codecs.lucene94.Lucene94Codec
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec
org.apache.lucene.backward_codecs.lucene912.Lucene912Codec
org.apache.lucene.backward_codecs.lucene100.Lucene100Codec

View File

@ -17,3 +17,4 @@ org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat
org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat
org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat
org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat

View File

@ -14,25 +14,25 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.DOC_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.LEVEL1_MASK;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.META_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.PAY_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.POS_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.VERSION_CURRENT;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.DOC_CODEC;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.LEVEL1_MASK;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.META_CODEC;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.PAY_CODEC;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.POS_CODEC;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.VERSION_CURRENT;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.PushPostingsWriterBase;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Impact;
@ -142,9 +142,8 @@ public class Lucene912PostingsWriter extends PushPostingsWriterBase {
metaOut, META_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.writeIndexHeader(
docOut, DOC_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
final ForUtil forUtil = new ForUtil();
forDeltaUtil = new ForDeltaUtil();
pforUtil = new PForUtil(forUtil);
pforUtil = new PForUtil();
if (state.fieldInfos.hasProx()) {
posDeltaBuffer = new long[BLOCK_SIZE];
String posFileName =

View File

@ -0,0 +1,69 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene912;
import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
/** Read-write impersonation of {@link Lucene912PostingsFormat}. */
public final class Lucene912RWPostingsFormat extends Lucene912PostingsFormat {
private final int minTermBlockSize;
private final int maxTermBlockSize;
/** Creates {@code Lucene912PostingsFormat} with default settings. */
public Lucene912RWPostingsFormat() {
this(
Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
/**
* Creates {@code Lucene912PostingsFormat} with custom values for {@code minBlockSize} and {@code
* maxBlockSize} passed to block terms dictionary.
*
* @see
* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
*/
public Lucene912RWPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super();
Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
this.minTermBlockSize = minTermBlockSize;
this.maxTermBlockSize = maxTermBlockSize;
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
boolean success = false;
try {
FieldsConsumer ret =
new Lucene90BlockTreeTermsWriter(
state, postingsWriter, minTermBlockSize, maxTermBlockSize);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
}
}

View File

@ -14,12 +14,11 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@ -65,13 +64,11 @@ public class TestForDeltaUtil extends LuceneTestCase {
{
// decode
IndexInput in = d.openInput("test.bin", IOContext.READONCE);
PostingDecodingUtil pdu =
Lucene912PostingsReader.VECTORIZATION_PROVIDER.newPostingDecodingUtil(in);
ForDeltaUtil forDeltaUtil = new ForDeltaUtil();
for (int i = 0; i < iterations; ++i) {
long base = 0;
final long[] restored = new long[ForUtil.BLOCK_SIZE];
forDeltaUtil.decodeAndPrefixSum(pdu, base, restored);
forDeltaUtil.decodeAndPrefixSum(in, base, restored);
final long[] expected = new long[ForUtil.BLOCK_SIZE];
for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
expected[j] = values[i * ForUtil.BLOCK_SIZE + j];

View File

@ -0,0 +1,94 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene912;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.packed.PackedInts;
public class TestForUtil extends LuceneTestCase {
public void testEncodeDecode() throws IOException {
final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000);
final int[] values = new int[iterations * ForUtil.BLOCK_SIZE];
for (int i = 0; i < iterations; ++i) {
final int bpv = TestUtil.nextInt(random(), 1, 31);
for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
values[i * ForUtil.BLOCK_SIZE + j] =
RandomNumbers.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv));
}
}
final Directory d = new ByteBuffersDirectory();
final long endPointer;
{
// encode
IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT);
final ForUtil forUtil = new ForUtil();
for (int i = 0; i < iterations; ++i) {
long[] source = new long[ForUtil.BLOCK_SIZE];
long or = 0;
for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
source[j] = values[i * ForUtil.BLOCK_SIZE + j];
or |= source[j];
}
final int bpv = PackedInts.bitsRequired(or);
out.writeByte((byte) bpv);
forUtil.encode(source, bpv, out);
}
endPointer = out.getFilePointer();
out.close();
}
{
// decode
IndexInput in = d.openInput("test.bin", IOContext.READONCE);
ForUtil forUtil = new ForUtil();
for (int i = 0; i < iterations; ++i) {
final int bitsPerValue = in.readByte();
final long currentFilePointer = in.getFilePointer();
final long[] restored = new long[ForUtil.BLOCK_SIZE];
forUtil.decode(bitsPerValue, in, restored);
int[] ints = new int[ForUtil.BLOCK_SIZE];
for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
ints[j] = Math.toIntExact(restored[j]);
}
assertArrayEquals(
Arrays.toString(ints),
ArrayUtil.copyOfSubArray(values, i * ForUtil.BLOCK_SIZE, (i + 1) * ForUtil.BLOCK_SIZE),
ints);
assertEquals(ForUtil.numBytes(bitsPerValue), in.getFilePointer() - currentFilePointer);
}
assertEquals(endPointer, in.getFilePointer());
in.close();
}
d.close();
}
}

View File

@ -14,17 +14,17 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsReader.MutableImpactList;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
import org.apache.lucene.codecs.lucene90.blocktree.Stats;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader.MutableImpactList;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
@ -45,7 +45,7 @@ public class TestLucene912PostingsFormat extends BasePostingsFormatTestCase {
@Override
protected Codec getCodec() {
return TestUtil.alwaysPostingsFormat(new Lucene912PostingsFormat());
return TestUtil.alwaysPostingsFormat(new Lucene912RWPostingsFormat());
}
public void testVInt15() throws IOException {

View File

@ -0,0 +1,104 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene912;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.packed.PackedInts;
public class TestPForUtil extends LuceneTestCase {
public void testEncodeDecode() throws IOException {
final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000);
final int[] values = createTestData(iterations, 31);
final Directory d = new ByteBuffersDirectory();
final long endPointer = encodeTestData(iterations, values, d);
IndexInput in = d.openInput("test.bin", IOContext.READONCE);
final PForUtil pforUtil = new PForUtil();
for (int i = 0; i < iterations; ++i) {
if (random().nextInt(5) == 0) {
PForUtil.skip(in);
continue;
}
final long[] restored = new long[ForUtil.BLOCK_SIZE];
pforUtil.decode(in, restored);
int[] ints = new int[ForUtil.BLOCK_SIZE];
for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
ints[j] = Math.toIntExact(restored[j]);
}
assertArrayEquals(
Arrays.toString(ints),
ArrayUtil.copyOfSubArray(values, i * ForUtil.BLOCK_SIZE, (i + 1) * ForUtil.BLOCK_SIZE),
ints);
}
assertEquals(endPointer, in.getFilePointer());
in.close();
d.close();
}
private int[] createTestData(int iterations, int maxBpv) {
final int[] values = new int[iterations * ForUtil.BLOCK_SIZE];
for (int i = 0; i < iterations; ++i) {
final int bpv = TestUtil.nextInt(random(), 0, maxBpv);
for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
values[i * ForUtil.BLOCK_SIZE + j] =
RandomNumbers.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv));
if (random().nextInt(100) == 0) {
final int exceptionBpv;
if (random().nextInt(10) == 0) {
exceptionBpv = Math.min(bpv + TestUtil.nextInt(random(), 9, 16), maxBpv);
} else {
exceptionBpv = Math.min(bpv + TestUtil.nextInt(random(), 1, 8), maxBpv);
}
values[i * ForUtil.BLOCK_SIZE + j] |= random().nextInt(1 << (exceptionBpv - bpv)) << bpv;
}
}
}
return values;
}
private long encodeTestData(int iterations, int[] values, Directory d) throws IOException {
IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT);
final PForUtil pforUtil = new PForUtil();
for (int i = 0; i < iterations; ++i) {
long[] source = new long[ForUtil.BLOCK_SIZE];
for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
source[j] = values[i * ForUtil.BLOCK_SIZE + j];
}
pforUtil.encode(source, out);
}
final long endPointer = out.getFilePointer();
out.close();
return endPointer;
}
}
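The exceptions injected by createTestData above are what patched frame-of-reference exists for: most values in a 128-value block fit a small bit width, and the handful of outliers are recorded separately. A standalone sketch of that idea, not of PForUtil's actual on-disk layout:

import java.util.ArrayList;
import java.util.List;

class PatchedForSketch {
  /** Split a block into low bits stored at a uniform width plus (index, value) patches. */
  static int countPatches(int[] block, int regularBits) {
    final int mask = (1 << regularBits) - 1;
    final int[] lowBits = new int[block.length];   // stored at regularBits bits each
    final List<int[]> patches = new ArrayList<>(); // outliers stored verbatim
    for (int i = 0; i < block.length; i++) {
      lowBits[i] = block[i] & mask;
      if (block[i] > mask) {
        patches.add(new int[] {i, block[i]});
      }
    }
    return patches.size();
  }

  public static void main(String[] args) {
    int[] block = {3, 7, 1, 500, 2, 6, 4, 5}; // 500 is the lone outlier
    System.out.println(countPatches(block, 3)); // 3 bits cover 0..7, so this prints 1
  }
}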

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;
import java.io.IOException;
import org.apache.lucene.store.Directory;

View File

@ -18,17 +18,12 @@
package org.apache.lucene.backward_codecs.lucene99;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase;
import org.apache.lucene.tests.util.TestUtil;
public class TestLucene99HnswScalarQuantizedVectorsFormat extends BaseKnnVectorsFormatTestCase {
@Override
protected Codec getCodec() {
return new Lucene99Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return new Lucene99RWHnswScalarQuantizationVectorsFormat();
}
};
return TestUtil.alwaysKnnVectorsFormat(new Lucene99RWHnswScalarQuantizationVectorsFormat());
}
}

View File

@ -20,9 +20,7 @@ import static org.apache.lucene.backward_index.TestBasicBackwardsCompatibility.a
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import java.io.IOException;
import org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
@ -69,14 +67,10 @@ public class TestInt7HnswBackwardsCompatibility extends BackwardsCompatibilityTe
}
protected Codec getCodec() {
return new Lucene99Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return new Lucene99HnswScalarQuantizedVectorsFormat(
return TestUtil.alwaysKnnVectorsFormat(
new Lucene99HnswScalarQuantizedVectorsFormat(
Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN,
Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH);
}
};
Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH));
}
@Override

View File

@ -16,6 +16,9 @@
*/
/** Lucene JMH benchmarks. */
// jmh.core is not modularized and causes a warning. Suppressing it until it is modularized.
@SuppressWarnings("requires-automatic")
module org.apache.lucene.benchmark.jmh {
requires jmh.core;
requires jdk.unsupported;

View File

@ -0,0 +1,180 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;
import java.util.Arrays;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.VectorUtil;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.CompilerControl;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 5, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(
value = 3,
jvmArgsAppend = {
"-Xmx1g",
"-Xms1g",
"-XX:+AlwaysPreTouch",
"--add-modules",
"jdk.incubator.vector"
})
public class AdvanceBenchmark {
private final int[] values = new int[129];
private final int[] startIndexes = new int[1_000];
private final int[] targets = new int[startIndexes.length];
@Setup(Level.Trial)
public void setup() throws Exception {
for (int i = 0; i < 128; ++i) {
values[i] = i;
}
values[128] = DocIdSetIterator.NO_MORE_DOCS;
Random r = new Random(0);
for (int i = 0; i < startIndexes.length; ++i) {
startIndexes[i] = r.nextInt(64);
targets[i] = startIndexes[i] + 1 + r.nextInt(1 << r.nextInt(7));
}
}
@Benchmark
public void binarySearch() {
for (int i = 0; i < startIndexes.length; ++i) {
binarySearch(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int binarySearch(int[] values, int target, int startIndex) {
// Standard binary search
int i = Arrays.binarySearch(values, startIndex, values.length, target);
if (i < 0) {
i = -1 - i;
}
return i;
}
@Benchmark
public void inlinedBranchlessBinarySearch() {
for (int i = 0; i < targets.length; ++i) {
inlinedBranchlessBinarySearch(values, targets[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int inlinedBranchlessBinarySearch(int[] values, int target) {
// This compiles to cmov instructions.
int start = 0;
if (values[63] < target) {
start += 64;
}
if (values[start + 31] < target) {
start += 32;
}
if (values[start + 15] < target) {
start += 16;
}
if (values[start + 7] < target) {
start += 8;
}
if (values[start + 3] < target) {
start += 4;
}
if (values[start + 1] < target) {
start += 2;
}
if (values[start] < target) {
start += 1;
}
return start;
}
@Benchmark
public void linearSearch() {
for (int i = 0; i < startIndexes.length; ++i) {
linearSearch(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int linearSearch(int[] values, long target, int startIndex) {
// Naive linear search.
for (int i = startIndex; i < values.length; ++i) {
if (values[i] >= target) {
return i;
}
}
return values.length;
}
@Benchmark
public void vectorUtilSearch() {
for (int i = 0; i < startIndexes.length; ++i) {
VectorUtil.findNextGEQ(values, targets[i], startIndexes[i], 128);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int vectorUtilSearch(int[] values, int target, int startIndex) {
return VectorUtil.findNextGEQ(values, target, startIndex, 128);
}
private static void assertEquals(int expected, int actual) {
if (expected != actual) {
throw new AssertionError("Expected: " + expected + ", got " + actual);
}
}
public static void main(String[] args) {
// For testing purposes
int[] values = new int[129];
for (int i = 0; i < 128; ++i) {
values[i] = i;
}
values[128] = DocIdSetIterator.NO_MORE_DOCS;
for (int start = 0; start < 128; ++start) {
for (int targetIndex = start; targetIndex < 128; ++targetIndex) {
int actualIndex = binarySearch(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = inlinedBranchlessBinarySearch(values, values[targetIndex]);
assertEquals(targetIndex, actualIndex);
actualIndex = linearSearch(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = vectorUtilSearch(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
}
}
}
}
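If you want to run this class outside the build's JMH task, the stock JMH runner API is enough; the include pattern below is just a regex on the benchmark class name, not something defined by this commit:

import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

public class RunAdvanceBenchmark {
  public static void main(String[] args) throws RunnerException {
    // Select AdvanceBenchmark and run it with the fork/warmup settings annotated on the class.
    Options opts = new OptionsBuilder().include("AdvanceBenchmark").build();
    new Runner(opts).run();
  }
}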

View File

@ -21,9 +21,9 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.codecs.lucene912.ForDeltaUtil;
import org.apache.lucene.codecs.lucene912.ForUtil;
import org.apache.lucene.codecs.lucene912.PostingIndexInput;
import org.apache.lucene.codecs.lucene101.ForDeltaUtil;
import org.apache.lucene.codecs.lucene101.ForUtil;
import org.apache.lucene.codecs.lucene101.PostingIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
@ -61,7 +61,7 @@ public class PostingIndexInputBenchmark {
private PostingIndexInput postingIn;
private final ForUtil forUtil = new ForUtil();
private final ForDeltaUtil forDeltaUtil = new ForDeltaUtil();
private final long[] values = new long[128];
private final int[] values = new int[ForUtil.BLOCK_SIZE];
@Param({"2", "3", "4", "5", "6", "7", "8", "9", "10"})
public int bpv;

View File

@ -112,8 +112,7 @@ public abstract class ReadTask extends PerfTask {
// pulling the Weight ourselves:
int totalHitsThreshold = withTotalHits() ? Integer.MAX_VALUE : 1;
TopFieldCollectorManager collectorManager =
new TopFieldCollectorManager(
sort, numHits, null, totalHitsThreshold, searcher.getSlices().length > 1);
new TopFieldCollectorManager(sort, numHits, null, totalHitsThreshold);
hits = searcher.search(q, collectorManager);
} else {
hits = searcher.search(q, numHits);

View File

@ -22,14 +22,14 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene912PostingsWriter}. */
/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene101PostingsWriter}. */
public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
private final int minTermBlockSize;
@ -67,7 +67,7 @@ public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state);
boolean success = false;
try {
@ -84,7 +84,7 @@ public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
PostingsReaderBase postingsReader = new Lucene101PostingsReader(state);
boolean success = false;
try {
FieldsProducer ret = new OrdsBlockTreeTermsReader(postingsReader, state);

View File

@ -24,7 +24,7 @@ import java.util.TreeMap;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;
@ -54,7 +54,7 @@ import org.apache.lucene.util.automaton.TransitionAccessor;
// - or: longer dense skip lists than just next byte?
/**
* Wraps {@link Lucene912PostingsFormat} format for on-disk storage, but then at read time loads and
* Wraps {@link Lucene101PostingsFormat} format for on-disk storage, but then at read time loads and
* stores all terms and postings directly in RAM as byte[], int[].
*
* <p><b>WARNING</b>: This is exceptionally RAM intensive: it makes no effort to compress the
@ -97,12 +97,12 @@ public final class DirectPostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return PostingsFormat.forName("Lucene912").fieldsConsumer(state);
return PostingsFormat.forName("Lucene101").fieldsConsumer(state);
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
FieldsProducer postings = PostingsFormat.forName("Lucene912").fieldsProducer(state);
FieldsProducer postings = PostingsFormat.forName("Lucene101").fieldsProducer(state);
if (state.context.context() != IOContext.Context.MERGE) {
FieldsProducer loadedPostings;
try {

View File

@ -22,8 +22,8 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
@ -41,7 +41,7 @@ public final class FSTPostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state);
boolean success = false;
try {
@ -57,7 +57,7 @@ public final class FSTPostingsFormat extends PostingsFormat {
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
PostingsReaderBase postingsReader = new Lucene101PostingsReader(state);
boolean success = false;
try {
FieldsProducer ret = new FSTTermsReader(state, postingsReader);

View File

@ -26,8 +26,6 @@ import static org.apache.lucene.codecs.simpletext.SimpleTextKnnVectorsWriter.VEC
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.index.ByteVectorValues;
import org.apache.lucene.index.CorruptIndexException;
@ -36,6 +34,7 @@ import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.internal.hppc.IntObjectHashMap;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.KnnCollector;
import org.apache.lucene.search.VectorScorer;
@ -63,7 +62,7 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader {
private final SegmentReadState readState;
private final IndexInput dataIn;
private final BytesRefBuilder scratch = new BytesRefBuilder();
private final Map<String, FieldEntry> fieldEntries = new HashMap<>();
private final IntObjectHashMap<FieldEntry> fieldEntries = new IntObjectHashMap<>();
SimpleTextKnnVectorsReader(SegmentReadState readState) throws IOException {
this.readState = readState;
@ -91,9 +90,9 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader {
for (int i = 0; i < size; i++) {
docIds[i] = readInt(in, EMPTY);
}
assert fieldEntries.containsKey(fieldName) == false;
assert fieldEntries.containsKey(fieldNumber) == false;
fieldEntries.put(
fieldName,
fieldNumber,
new FieldEntry(
dimension,
vectorDataOffset,
@ -126,7 +125,7 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader {
throw new IllegalStateException(
"KNN vectors readers should not be called on fields that don't enable KNN vectors");
}
FieldEntry fieldEntry = fieldEntries.get(field);
FieldEntry fieldEntry = fieldEntries.get(info.number);
if (fieldEntry == null) {
// mirror the handling in Lucene90VectorReader#getVectorValues
// needed to pass TestSimpleTextKnnVectorsFormat#testDeleteAllVectorDocs
@ -159,7 +158,7 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader {
throw new IllegalStateException(
"KNN vectors readers should not be called on fields that don't enable KNN vectors");
}
FieldEntry fieldEntry = fieldEntries.get(field);
FieldEntry fieldEntry = fieldEntries.get(info.number);
if (fieldEntry == null) {
// mirror the handling in Lucene90VectorReader#getVectorValues
// needed to pass TestSimpleTextKnnVectorsFormat#testDeleteAllVectorDocs

View File

@ -17,13 +17,13 @@
package org.apache.lucene.codecs.uniformsplit;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.BLOCK_SIZE;
import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.TermState;
@ -34,7 +34,7 @@ import org.apache.lucene.util.RamUsageEstimator;
/**
* {@link TermState} serializer which encodes each file pointer as a delta relative to a base file
* pointer. It differs from {@link Lucene912PostingsWriter#encodeTerm} which encodes each file
* pointer. It differs from {@link Lucene101PostingsWriter#encodeTerm} which encodes each file
* pointer as a delta relative to the previous file pointer.
*
* <p>It automatically sets the base file pointer to the first valid file pointer for doc start FP,
@ -95,7 +95,7 @@ public class DeltaBaseTermStateSerializer implements Accountable {
/**
* Writes a {@link BlockTermState} to the provided {@link DataOutput}.
*
* <p>Simpler variant of {@link Lucene912PostingsWriter#encodeTerm(DataOutput, FieldInfo,
* <p>Simpler variant of {@link Lucene101PostingsWriter#encodeTerm(DataOutput, FieldInfo,
* BlockTermState, boolean)}.
*/
public void writeTermState(
@ -145,7 +145,7 @@ public class DeltaBaseTermStateSerializer implements Accountable {
/**
* Reads a {@link BlockTermState} from the provided {@link DataInput}.
*
* <p>Simpler variant of {@link Lucene912PostingsReader#decodeTerm(DataInput, FieldInfo,
* <p>Simpler variant of {@link Lucene101PostingsReader#decodeTerm(DataInput, FieldInfo,
* BlockTermState, boolean)}.
*
* @param reuse {@link BlockTermState} to reuse; or null to create a new one.

View File

@ -23,8 +23,8 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
@ -113,7 +113,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state);
boolean success = false;
try {
FieldsConsumer termsWriter =
@ -130,7 +130,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
PostingsReaderBase postingsReader = new Lucene101PostingsReader(state);
boolean success = false;
try {
FieldsProducer termsReader =

View File

@ -28,7 +28,7 @@
* org.apache.lucene.search.PhraseQuery})
* <li>Quite efficient for {@link org.apache.lucene.search.PrefixQuery}
* <li>Not efficient for spell-check and {@link org.apache.lucene.search.FuzzyQuery}, in this case
* prefer {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat}
* prefer {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat}
* </ul>
*/
package org.apache.lucene.codecs.uniformsplit;

View File

@ -22,7 +22,6 @@ import java.io.IOException;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene100.Lucene100Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.KnnByteVectorField;
@ -38,16 +37,12 @@ import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopKnnCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.BaseIndexFileFormatTestCase;
import org.apache.lucene.tests.util.TestUtil;
public class TestHnswBitVectorsFormat extends BaseIndexFileFormatTestCase {
@Override
protected Codec getCodec() {
return new Lucene100Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return new HnswBitVectorsFormat();
}
};
return TestUtil.alwaysKnnVectorsFormat(new HnswBitVectorsFormat());
}
@Override

View File

@ -17,7 +17,7 @@
package org.apache.lucene.codecs.lucene90.tests;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState;
/** Test utility class to create mock {@link IntBlockTermState}. */
public class MockTermStateFactory {

View File

@ -1,4 +1,4 @@
{
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java": "f561578ccb6a95364bb62c5ed86b38ff0b4a009d",
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForDeltaUtil.py": "eea1a71be9da8a13fdd979354dc4a8c6edf21be1"
"lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java": "0ff7fb9159693055d9e4b9468b004166156f6550",
"lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py": "8c55b7aaced028388408c5eb968b1f1197e11142"
}

View File

@ -1,4 +1,4 @@
{
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForUtil.java": "159e82388346fde147924d5e15ca65df4dd63b9a",
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForUtil.py": "66dc8813160feae2a37d8b50474f5f9830b6cb22"
"lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForUtil.java": "10ceb79f031232bc1e4564db7e3ebb16eedd2e0a",
"lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForUtil.py": "d69e734bce30375952046a3776bbb7a5c1edbd51"
}

View File

@ -15,8 +15,6 @@
* limitations under the License.
*/
import org.apache.lucene.codecs.lucene100.Lucene100Codec;
/** Lucene Core. */
@SuppressWarnings("module") // the test framework is compiled after the core...
module org.apache.lucene.core {
@ -33,8 +31,7 @@ module org.apache.lucene.core {
exports org.apache.lucene.codecs.lucene94;
exports org.apache.lucene.codecs.lucene95;
exports org.apache.lucene.codecs.lucene99;
exports org.apache.lucene.codecs.lucene912;
exports org.apache.lucene.codecs.lucene100;
exports org.apache.lucene.codecs.lucene101;
exports org.apache.lucene.codecs.perfield;
exports org.apache.lucene.codecs;
exports org.apache.lucene.document;
@ -73,7 +70,7 @@ module org.apache.lucene.core {
provides org.apache.lucene.analysis.TokenizerFactory with
org.apache.lucene.analysis.standard.StandardTokenizerFactory;
provides org.apache.lucene.codecs.Codec with
Lucene100Codec;
org.apache.lucene.codecs.lucene101.Lucene101Codec;
provides org.apache.lucene.codecs.DocValuesFormat with
org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
provides org.apache.lucene.codecs.KnnVectorsFormat with
@ -81,7 +78,7 @@ module org.apache.lucene.core {
org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat,
org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat;
provides org.apache.lucene.codecs.PostingsFormat with
org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
provides org.apache.lucene.index.SortFieldProvider with
org.apache.lucene.search.SortField.Provider,
org.apache.lucene.search.SortedNumericSortField.Provider,

View File

@ -55,7 +55,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {
return LOADER;
}
static Codec defaultCodec = LOADER.lookup("Lucene100");
static Codec defaultCodec = LOADER.lookup("Lucene101");
}
private final String name;

View File

@ -0,0 +1,525 @@
// This file has been automatically generated, DO NOT EDIT
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene101;
import static org.apache.lucene.codecs.lucene101.ForUtil.*;
import java.io.IOException;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.packed.PackedInts;
/**
* Inspired from https://fulmicoton.com/posts/bitpacking/ Encodes multiple integers in a Java int to
* get SIMD-like speedups. If bitsPerValue &lt;= 4 then we pack 4 ints per Java int else if
* bitsPerValue &lt;= 11 we pack 2 ints per Java int else we use scalar operations.
*/
public final class ForDeltaUtil {
private static final int HALF_BLOCK_SIZE = BLOCK_SIZE / 2;
private static final int ONE_BLOCK_SIZE_FOURTH = BLOCK_SIZE / 4;
private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2;
private static final int THREE_BLOCK_SIZE_FOURTHS = 3 * BLOCK_SIZE / 4;
// IDENTITY_PLUS_ONE[i] == i+1
private static final int[] IDENTITY_PLUS_ONE = new int[ForUtil.BLOCK_SIZE];
static {
for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
IDENTITY_PLUS_ONE[i] = i + 1;
}
}
private static void prefixSumOfOnes(int[] arr, int base) {
System.arraycopy(IDENTITY_PLUS_ONE, 0, arr, 0, ForUtil.BLOCK_SIZE);
// This loop gets auto-vectorized
for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
arr[i] += base;
}
}
private static void prefixSum8(int[] arr, int base) {
// When the number of bits per value is 4 or less, we can sum up all values in a block without
// risking overflowing an 8-bit integer. This allows computing the prefix sum by summing up 4
// values at once.
innerPrefixSum8(arr);
expand8(arr);
final int l0 = base;
final int l1 = l0 + arr[ONE_BLOCK_SIZE_FOURTH - 1];
final int l2 = l1 + arr[TWO_BLOCK_SIZE_FOURTHS - 1];
final int l3 = l2 + arr[THREE_BLOCK_SIZE_FOURTHS - 1];
for (int i = 0; i < ONE_BLOCK_SIZE_FOURTH; ++i) {
arr[i] += l0;
arr[ONE_BLOCK_SIZE_FOURTH + i] += l1;
arr[TWO_BLOCK_SIZE_FOURTHS + i] += l2;
arr[THREE_BLOCK_SIZE_FOURTHS + i] += l3;
}
}
private static void prefixSum16(int[] arr, int base) {
// When the number of bits per value is 11 or less, we can sum up all values in a block without
// risking overflowing a 16-bit integer. This allows computing the prefix sum by summing up 2
// values at once.
innerPrefixSum16(arr);
expand16(arr);
final int l0 = base;
final int l1 = base + arr[HALF_BLOCK_SIZE - 1];
for (int i = 0; i < HALF_BLOCK_SIZE; ++i) {
arr[i] += l0;
arr[HALF_BLOCK_SIZE + i] += l1;
}
}
private static void prefixSum32(int[] arr, int base) {
arr[0] += base;
for (int i = 1; i < BLOCK_SIZE; ++i) {
arr[i] += arr[i - 1];
}
}
// For some reason unrolling seems to help
private static void innerPrefixSum8(int[] arr) {
arr[1] += arr[0];
arr[2] += arr[1];
arr[3] += arr[2];
arr[4] += arr[3];
arr[5] += arr[4];
arr[6] += arr[5];
arr[7] += arr[6];
arr[8] += arr[7];
arr[9] += arr[8];
arr[10] += arr[9];
arr[11] += arr[10];
arr[12] += arr[11];
arr[13] += arr[12];
arr[14] += arr[13];
arr[15] += arr[14];
arr[16] += arr[15];
arr[17] += arr[16];
arr[18] += arr[17];
arr[19] += arr[18];
arr[20] += arr[19];
arr[21] += arr[20];
arr[22] += arr[21];
arr[23] += arr[22];
arr[24] += arr[23];
arr[25] += arr[24];
arr[26] += arr[25];
arr[27] += arr[26];
arr[28] += arr[27];
arr[29] += arr[28];
arr[30] += arr[29];
arr[31] += arr[30];
}
// For some reason unrolling seems to help
private static void innerPrefixSum16(int[] arr) {
arr[1] += arr[0];
arr[2] += arr[1];
arr[3] += arr[2];
arr[4] += arr[3];
arr[5] += arr[4];
arr[6] += arr[5];
arr[7] += arr[6];
arr[8] += arr[7];
arr[9] += arr[8];
arr[10] += arr[9];
arr[11] += arr[10];
arr[12] += arr[11];
arr[13] += arr[12];
arr[14] += arr[13];
arr[15] += arr[14];
arr[16] += arr[15];
arr[17] += arr[16];
arr[18] += arr[17];
arr[19] += arr[18];
arr[20] += arr[19];
arr[21] += arr[20];
arr[22] += arr[21];
arr[23] += arr[22];
arr[24] += arr[23];
arr[25] += arr[24];
arr[26] += arr[25];
arr[27] += arr[26];
arr[28] += arr[27];
arr[29] += arr[28];
arr[30] += arr[29];
arr[31] += arr[30];
arr[32] += arr[31];
arr[33] += arr[32];
arr[34] += arr[33];
arr[35] += arr[34];
arr[36] += arr[35];
arr[37] += arr[36];
arr[38] += arr[37];
arr[39] += arr[38];
arr[40] += arr[39];
arr[41] += arr[40];
arr[42] += arr[41];
arr[43] += arr[42];
arr[44] += arr[43];
arr[45] += arr[44];
arr[46] += arr[45];
arr[47] += arr[46];
arr[48] += arr[47];
arr[49] += arr[48];
arr[50] += arr[49];
arr[51] += arr[50];
arr[52] += arr[51];
arr[53] += arr[52];
arr[54] += arr[53];
arr[55] += arr[54];
arr[56] += arr[55];
arr[57] += arr[56];
arr[58] += arr[57];
arr[59] += arr[58];
arr[60] += arr[59];
arr[61] += arr[60];
arr[62] += arr[61];
arr[63] += arr[62];
}
private final int[] tmp = new int[BLOCK_SIZE];
/**
* Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code
* ints} are expected to be deltas between consecutive values.
*/
void encodeDeltas(int[] ints, DataOutput out) throws IOException {
if (ints[0] == 1 && PForUtil.allEqual(ints)) { // happens with very dense postings
out.writeByte((byte) 0);
} else {
int or = 0;
for (int l : ints) {
or |= l;
}
assert or != 0;
final int bitsPerValue = PackedInts.bitsRequired(or);
out.writeByte((byte) bitsPerValue);
final int primitiveSize;
if (bitsPerValue <= 3) {
primitiveSize = 8;
collapse8(ints);
} else if (bitsPerValue <= 10) {
primitiveSize = 16;
collapse16(ints);
} else {
primitiveSize = 32;
}
encode(ints, bitsPerValue, primitiveSize, out, tmp);
}
}
/** Decode deltas, compute the prefix sum and add {@code base} to all decoded ints. */
void decodeAndPrefixSum(PostingDecodingUtil pdu, int base, int[] ints) throws IOException {
final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte());
if (bitsPerValue == 0) {
prefixSumOfOnes(ints, base);
} else {
decodeAndPrefixSum(bitsPerValue, pdu, base, ints);
}
}
/** Delta-decode 128 integers into {@code ints}. */
void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, int base, int[] ints)
throws IOException {
switch (bitsPerValue) {
case 1:
decode1(pdu, ints);
prefixSum8(ints, base);
break;
case 2:
decode2(pdu, ints);
prefixSum8(ints, base);
break;
case 3:
decode3(pdu, tmp, ints);
prefixSum8(ints, base);
break;
case 4:
decode4To16(pdu, ints);
prefixSum16(ints, base);
break;
case 5:
decode5To16(pdu, tmp, ints);
prefixSum16(ints, base);
break;
case 6:
decode6To16(pdu, tmp, ints);
prefixSum16(ints, base);
break;
case 7:
decode7To16(pdu, tmp, ints);
prefixSum16(ints, base);
break;
case 8:
decode8To16(pdu, ints);
prefixSum16(ints, base);
break;
case 9:
decode9(pdu, tmp, ints);
prefixSum16(ints, base);
break;
case 10:
decode10(pdu, tmp, ints);
prefixSum16(ints, base);
break;
case 11:
decode11To32(pdu, tmp, ints);
prefixSum32(ints, base);
break;
case 12:
decode12To32(pdu, tmp, ints);
prefixSum32(ints, base);
break;
case 13:
decode13To32(pdu, tmp, ints);
prefixSum32(ints, base);
break;
case 14:
decode14To32(pdu, tmp, ints);
prefixSum32(ints, base);
break;
case 15:
decode15To32(pdu, tmp, ints);
prefixSum32(ints, base);
break;
case 16:
decode16To32(pdu, ints);
prefixSum32(ints, base);
break;
case 17:
decode17(pdu, tmp, ints);
prefixSum32(ints, base);
break;
case 18:
decode18(pdu, tmp, ints);
prefixSum32(ints, base);
break;
case 19:
decode19(pdu, tmp, ints);
prefixSum32(ints, base);
break;
case 20:
decode20(pdu, tmp, ints);
prefixSum32(ints, base);
break;
case 21:
decode21(pdu, tmp, ints);
prefixSum32(ints, base);
break;
case 22:
decode22(pdu, tmp, ints);
prefixSum32(ints, base);
break;
case 23:
decode23(pdu, tmp, ints);
prefixSum32(ints, base);
break;
case 24:
decode24(pdu, tmp, ints);
prefixSum32(ints, base);
break;
default:
decodeSlow(bitsPerValue, pdu, tmp, ints);
prefixSum32(ints, base);
break;
}
}
private static void decode4To16(PostingDecodingUtil pdu, int[] ints) throws IOException {
pdu.splitInts(16, ints, 12, 4, MASK16_4, ints, 48, MASK16_4);
}
private static void decode5To16(PostingDecodingUtil pdu, int[] tmp, int[] ints)
throws IOException {
pdu.splitInts(20, ints, 11, 5, MASK16_5, tmp, 0, MASK16_1);
for (int iter = 0, tmpIdx = 0, intsIdx = 60; iter < 4; ++iter, tmpIdx += 5, intsIdx += 1) {
int l0 = tmp[tmpIdx + 0] << 4;
l0 |= tmp[tmpIdx + 1] << 3;
l0 |= tmp[tmpIdx + 2] << 2;
l0 |= tmp[tmpIdx + 3] << 1;
l0 |= tmp[tmpIdx + 4] << 0;
ints[intsIdx + 0] = l0;
}
}
private static void decode6To16(PostingDecodingUtil pdu, int[] tmp, int[] ints)
throws IOException {
pdu.splitInts(24, ints, 10, 6, MASK16_6, tmp, 0, MASK16_4);
for (int iter = 0, tmpIdx = 0, intsIdx = 48; iter < 8; ++iter, tmpIdx += 3, intsIdx += 2) {
int l0 = tmp[tmpIdx + 0] << 2;
l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_2;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 1] & MASK16_2) << 4;
l1 |= tmp[tmpIdx + 2] << 0;
ints[intsIdx + 1] = l1;
}
}
private static void decode7To16(PostingDecodingUtil pdu, int[] tmp, int[] ints)
throws IOException {
pdu.splitInts(28, ints, 9, 7, MASK16_7, tmp, 0, MASK16_2);
for (int iter = 0, tmpIdx = 0, intsIdx = 56; iter < 4; ++iter, tmpIdx += 7, intsIdx += 2) {
int l0 = tmp[tmpIdx + 0] << 5;
l0 |= tmp[tmpIdx + 1] << 3;
l0 |= tmp[tmpIdx + 2] << 1;
l0 |= (tmp[tmpIdx + 3] >>> 1) & MASK16_1;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 3] & MASK16_1) << 6;
l1 |= tmp[tmpIdx + 4] << 4;
l1 |= tmp[tmpIdx + 5] << 2;
l1 |= tmp[tmpIdx + 6] << 0;
ints[intsIdx + 1] = l1;
}
}
private static void decode8To16(PostingDecodingUtil pdu, int[] ints) throws IOException {
pdu.splitInts(32, ints, 8, 8, MASK16_8, ints, 32, MASK16_8);
}
private static void decode11To32(PostingDecodingUtil pdu, int[] tmp, int[] ints)
throws IOException {
pdu.splitInts(44, ints, 21, 11, MASK32_11, tmp, 0, MASK32_10);
for (int iter = 0, tmpIdx = 0, intsIdx = 88; iter < 4; ++iter, tmpIdx += 11, intsIdx += 10) {
int l0 = tmp[tmpIdx + 0] << 1;
l0 |= (tmp[tmpIdx + 1] >>> 9) & MASK32_1;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 1] & MASK32_9) << 2;
l1 |= (tmp[tmpIdx + 2] >>> 8) & MASK32_2;
ints[intsIdx + 1] = l1;
int l2 = (tmp[tmpIdx + 2] & MASK32_8) << 3;
l2 |= (tmp[tmpIdx + 3] >>> 7) & MASK32_3;
ints[intsIdx + 2] = l2;
int l3 = (tmp[tmpIdx + 3] & MASK32_7) << 4;
l3 |= (tmp[tmpIdx + 4] >>> 6) & MASK32_4;
ints[intsIdx + 3] = l3;
int l4 = (tmp[tmpIdx + 4] & MASK32_6) << 5;
l4 |= (tmp[tmpIdx + 5] >>> 5) & MASK32_5;
ints[intsIdx + 4] = l4;
int l5 = (tmp[tmpIdx + 5] & MASK32_5) << 6;
l5 |= (tmp[tmpIdx + 6] >>> 4) & MASK32_6;
ints[intsIdx + 5] = l5;
int l6 = (tmp[tmpIdx + 6] & MASK32_4) << 7;
l6 |= (tmp[tmpIdx + 7] >>> 3) & MASK32_7;
ints[intsIdx + 6] = l6;
int l7 = (tmp[tmpIdx + 7] & MASK32_3) << 8;
l7 |= (tmp[tmpIdx + 8] >>> 2) & MASK32_8;
ints[intsIdx + 7] = l7;
int l8 = (tmp[tmpIdx + 8] & MASK32_2) << 9;
l8 |= (tmp[tmpIdx + 9] >>> 1) & MASK32_9;
ints[intsIdx + 8] = l8;
int l9 = (tmp[tmpIdx + 9] & MASK32_1) << 10;
l9 |= tmp[tmpIdx + 10] << 0;
ints[intsIdx + 9] = l9;
}
}
private static void decode12To32(PostingDecodingUtil pdu, int[] tmp, int[] ints)
throws IOException {
pdu.splitInts(48, ints, 20, 12, MASK32_12, tmp, 0, MASK32_8);
for (int iter = 0, tmpIdx = 0, intsIdx = 96; iter < 16; ++iter, tmpIdx += 3, intsIdx += 2) {
int l0 = tmp[tmpIdx + 0] << 4;
l0 |= (tmp[tmpIdx + 1] >>> 4) & MASK32_4;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 1] & MASK32_4) << 8;
l1 |= tmp[tmpIdx + 2] << 0;
ints[intsIdx + 1] = l1;
}
}
private static void decode13To32(PostingDecodingUtil pdu, int[] tmp, int[] ints)
throws IOException {
pdu.splitInts(52, ints, 19, 13, MASK32_13, tmp, 0, MASK32_6);
for (int iter = 0, tmpIdx = 0, intsIdx = 104; iter < 4; ++iter, tmpIdx += 13, intsIdx += 6) {
int l0 = tmp[tmpIdx + 0] << 7;
l0 |= tmp[tmpIdx + 1] << 1;
l0 |= (tmp[tmpIdx + 2] >>> 5) & MASK32_1;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 2] & MASK32_5) << 8;
l1 |= tmp[tmpIdx + 3] << 2;
l1 |= (tmp[tmpIdx + 4] >>> 4) & MASK32_2;
ints[intsIdx + 1] = l1;
int l2 = (tmp[tmpIdx + 4] & MASK32_4) << 9;
l2 |= tmp[tmpIdx + 5] << 3;
l2 |= (tmp[tmpIdx + 6] >>> 3) & MASK32_3;
ints[intsIdx + 2] = l2;
int l3 = (tmp[tmpIdx + 6] & MASK32_3) << 10;
l3 |= tmp[tmpIdx + 7] << 4;
l3 |= (tmp[tmpIdx + 8] >>> 2) & MASK32_4;
ints[intsIdx + 3] = l3;
int l4 = (tmp[tmpIdx + 8] & MASK32_2) << 11;
l4 |= tmp[tmpIdx + 9] << 5;
l4 |= (tmp[tmpIdx + 10] >>> 1) & MASK32_5;
ints[intsIdx + 4] = l4;
int l5 = (tmp[tmpIdx + 10] & MASK32_1) << 12;
l5 |= tmp[tmpIdx + 11] << 6;
l5 |= tmp[tmpIdx + 12] << 0;
ints[intsIdx + 5] = l5;
}
}
private static void decode14To32(PostingDecodingUtil pdu, int[] tmp, int[] ints)
throws IOException {
pdu.splitInts(56, ints, 18, 14, MASK32_14, tmp, 0, MASK32_4);
for (int iter = 0, tmpIdx = 0, intsIdx = 112; iter < 8; ++iter, tmpIdx += 7, intsIdx += 2) {
int l0 = tmp[tmpIdx + 0] << 10;
l0 |= tmp[tmpIdx + 1] << 6;
l0 |= tmp[tmpIdx + 2] << 2;
l0 |= (tmp[tmpIdx + 3] >>> 2) & MASK32_2;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 3] & MASK32_2) << 12;
l1 |= tmp[tmpIdx + 4] << 8;
l1 |= tmp[tmpIdx + 5] << 4;
l1 |= tmp[tmpIdx + 6] << 0;
ints[intsIdx + 1] = l1;
}
}
private static void decode15To32(PostingDecodingUtil pdu, int[] tmp, int[] ints)
throws IOException {
pdu.splitInts(60, ints, 17, 15, MASK32_15, tmp, 0, MASK32_2);
for (int iter = 0, tmpIdx = 0, intsIdx = 120; iter < 4; ++iter, tmpIdx += 15, intsIdx += 2) {
int l0 = tmp[tmpIdx + 0] << 13;
l0 |= tmp[tmpIdx + 1] << 11;
l0 |= tmp[tmpIdx + 2] << 9;
l0 |= tmp[tmpIdx + 3] << 7;
l0 |= tmp[tmpIdx + 4] << 5;
l0 |= tmp[tmpIdx + 5] << 3;
l0 |= tmp[tmpIdx + 6] << 1;
l0 |= (tmp[tmpIdx + 7] >>> 1) & MASK32_1;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 7] & MASK32_1) << 14;
l1 |= tmp[tmpIdx + 8] << 12;
l1 |= tmp[tmpIdx + 9] << 10;
l1 |= tmp[tmpIdx + 10] << 8;
l1 |= tmp[tmpIdx + 11] << 6;
l1 |= tmp[tmpIdx + 12] << 4;
l1 |= tmp[tmpIdx + 13] << 2;
l1 |= tmp[tmpIdx + 14] << 0;
ints[intsIdx + 1] = l1;
}
}
private static void decode16To32(PostingDecodingUtil pdu, int[] ints) throws IOException {
pdu.splitInts(64, ints, 16, 16, MASK32_16, ints, 64, MASK32_16);
}
}
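To make the contract above concrete: encodeDeltas receives the gaps of a strictly increasing sequence, and decodeAndPrefixSum rebuilds the absolute values by a prefix sum on top of base. The generated code does this in packed 8-, 16- or 32-bit lanes; a plain scalar sketch of the same round trip:

import java.util.Arrays;

class DeltaPrefixSumSketch {
  /** Gaps of a strictly increasing sequence, as ForDeltaUtil#encodeDeltas expects them. */
  static int[] toDeltas(int[] sorted, int base) {
    final int[] deltas = new int[sorted.length];
    int prev = base;
    for (int i = 0; i < sorted.length; i++) {
      deltas[i] = sorted[i] - prev; // strictly increasing input => every delta >= 1
      prev = sorted[i];
    }
    return deltas;
  }

  /** What decodeAndPrefixSum computes, written as a scalar prefix sum. */
  static int[] fromDeltas(int[] deltas, int base) {
    final int[] values = new int[deltas.length];
    int acc = base;
    for (int i = 0; i < deltas.length; i++) {
      acc += deltas[i];
      values[i] = acc;
    }
    return values;
  }

  public static void main(String[] args) {
    int[] docIds = {5, 6, 9, 13, 42};
    int[] deltas = toDeltas(docIds, 0); // {5, 1, 3, 4, 29}
    System.out.println(Arrays.equals(docIds, fromDeltas(deltas, 0))); // true
  }
}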

View File

@ -0,0 +1,841 @@
// This file has been automatically generated, DO NOT EDIT
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene101;
import java.io.IOException;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.DataOutput;
/**
* Inspired from https://fulmicoton.com/posts/bitpacking/ Encodes multiple integers in one Java int to get
* SIMD-like speedups. If bitsPerValue &lt;= 8 then we pack 4 ints per Java int else if bitsPerValue
* &lt;= 16 we pack 2 ints per Java int else we do scalar operations.
*/
public final class ForUtil {
public static final int BLOCK_SIZE = 128;
static final int BLOCK_SIZE_LOG2 = 7;
static int expandMask16(int mask16) {
return mask16 | (mask16 << 16);
}
static int expandMask8(int mask8) {
return expandMask16(mask8 | (mask8 << 8));
}
static int mask32(int bitsPerValue) {
return (1 << bitsPerValue) - 1;
}
static int mask16(int bitsPerValue) {
return expandMask16((1 << bitsPerValue) - 1);
}
static int mask8(int bitsPerValue) {
return expandMask8((1 << bitsPerValue) - 1);
}
static void expand8(int[] arr) {
for (int i = 0; i < 32; ++i) {
int l = arr[i];
arr[i] = (l >>> 24) & 0xFF;
arr[32 + i] = (l >>> 16) & 0xFF;
arr[64 + i] = (l >>> 8) & 0xFF;
arr[96 + i] = l & 0xFF;
}
}
static void collapse8(int[] arr) {
for (int i = 0; i < 32; ++i) {
arr[i] = (arr[i] << 24) | (arr[32 + i] << 16) | (arr[64 + i] << 8) | arr[96 + i];
}
}
static void expand16(int[] arr) {
for (int i = 0; i < 64; ++i) {
int l = arr[i];
arr[i] = (l >>> 16) & 0xFFFF;
arr[64 + i] = l & 0xFFFF;
}
}
static void collapse16(int[] arr) {
for (int i = 0; i < 64; ++i) {
arr[i] = (arr[i] << 16) | arr[64 + i];
}
}
private final int[] tmp = new int[BLOCK_SIZE];
/** Encode 128 integers from {@code ints} into {@code out}. */
void encode(int[] ints, int bitsPerValue, DataOutput out) throws IOException {
final int nextPrimitive;
if (bitsPerValue <= 8) {
nextPrimitive = 8;
collapse8(ints);
} else if (bitsPerValue <= 16) {
nextPrimitive = 16;
collapse16(ints);
} else {
nextPrimitive = 32;
}
encode(ints, bitsPerValue, nextPrimitive, out, tmp);
}
static void encode(int[] ints, int bitsPerValue, int primitiveSize, DataOutput out, int[] tmp)
throws IOException {
final int numInts = BLOCK_SIZE * primitiveSize / Integer.SIZE;
final int numIntsPerShift = bitsPerValue * 4;
int idx = 0;
int shift = primitiveSize - bitsPerValue;
for (int i = 0; i < numIntsPerShift; ++i) {
tmp[i] = ints[idx++] << shift;
}
for (shift = shift - bitsPerValue; shift >= 0; shift -= bitsPerValue) {
for (int i = 0; i < numIntsPerShift; ++i) {
tmp[i] |= ints[idx++] << shift;
}
}
final int remainingBitsPerInt = shift + bitsPerValue;
final int maskRemainingBitsPerInt;
if (primitiveSize == 8) {
maskRemainingBitsPerInt = MASKS8[remainingBitsPerInt];
} else if (primitiveSize == 16) {
maskRemainingBitsPerInt = MASKS16[remainingBitsPerInt];
} else {
maskRemainingBitsPerInt = MASKS32[remainingBitsPerInt];
}
int tmpIdx = 0;
int remainingBitsPerValue = bitsPerValue;
while (idx < numInts) {
if (remainingBitsPerValue >= remainingBitsPerInt) {
remainingBitsPerValue -= remainingBitsPerInt;
tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & maskRemainingBitsPerInt;
if (remainingBitsPerValue == 0) {
idx++;
remainingBitsPerValue = bitsPerValue;
}
} else {
final int mask1, mask2;
if (primitiveSize == 8) {
mask1 = MASKS8[remainingBitsPerValue];
mask2 = MASKS8[remainingBitsPerInt - remainingBitsPerValue];
} else if (primitiveSize == 16) {
mask1 = MASKS16[remainingBitsPerValue];
mask2 = MASKS16[remainingBitsPerInt - remainingBitsPerValue];
} else {
mask1 = MASKS32[remainingBitsPerValue];
mask2 = MASKS32[remainingBitsPerInt - remainingBitsPerValue];
}
tmp[tmpIdx] |= (ints[idx++] & mask1) << (remainingBitsPerInt - remainingBitsPerValue);
remainingBitsPerValue = bitsPerValue - remainingBitsPerInt + remainingBitsPerValue;
tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & mask2;
}
}
for (int i = 0; i < numIntsPerShift; ++i) {
out.writeInt(tmp[i]);
}
}
/** Number of bytes required to encode 128 integers of {@code bitsPerValue} bits per value. */
static int numBytes(int bitsPerValue) {
return bitsPerValue << (BLOCK_SIZE_LOG2 - 3);
}
static void decodeSlow(int bitsPerValue, PostingDecodingUtil pdu, int[] tmp, int[] ints)
throws IOException {
final int numInts = bitsPerValue << 2;
final int mask = MASKS32[bitsPerValue];
pdu.splitInts(numInts, ints, 32 - bitsPerValue, 32, mask, tmp, 0, -1);
final int remainingBitsPerInt = 32 - bitsPerValue;
final int mask32RemainingBitsPerInt = MASKS32[remainingBitsPerInt];
int tmpIdx = 0;
int remainingBits = remainingBitsPerInt;
for (int intsIdx = numInts; intsIdx < BLOCK_SIZE; ++intsIdx) {
int b = bitsPerValue - remainingBits;
int l = (tmp[tmpIdx++] & MASKS32[remainingBits]) << b;
while (b >= remainingBitsPerInt) {
b -= remainingBitsPerInt;
l |= (tmp[tmpIdx++] & mask32RemainingBitsPerInt) << b;
}
if (b > 0) {
l |= (tmp[tmpIdx] >>> (remainingBitsPerInt - b)) & MASKS32[b];
remainingBits = remainingBitsPerInt - b;
} else {
remainingBits = remainingBitsPerInt;
}
ints[intsIdx] = l;
}
}
static final int[] MASKS8 = new int[8];
static final int[] MASKS16 = new int[16];
static final int[] MASKS32 = new int[32];
static {
for (int i = 0; i < 8; ++i) {
MASKS8[i] = mask8(i);
}
for (int i = 0; i < 16; ++i) {
MASKS16[i] = mask16(i);
}
for (int i = 0; i < 32; ++i) {
MASKS32[i] = mask32(i);
}
}
// Mark values in the array as final ints to avoid the cost of reading the array; arrays should
// only be used when the index is a variable.
static final int MASK8_1 = MASKS8[1];
static final int MASK8_2 = MASKS8[2];
static final int MASK8_3 = MASKS8[3];
static final int MASK8_4 = MASKS8[4];
static final int MASK8_5 = MASKS8[5];
static final int MASK8_6 = MASKS8[6];
static final int MASK8_7 = MASKS8[7];
static final int MASK16_1 = MASKS16[1];
static final int MASK16_2 = MASKS16[2];
static final int MASK16_3 = MASKS16[3];
static final int MASK16_4 = MASKS16[4];
static final int MASK16_5 = MASKS16[5];
static final int MASK16_6 = MASKS16[6];
static final int MASK16_7 = MASKS16[7];
static final int MASK16_8 = MASKS16[8];
static final int MASK16_9 = MASKS16[9];
static final int MASK16_10 = MASKS16[10];
static final int MASK16_11 = MASKS16[11];
static final int MASK16_12 = MASKS16[12];
static final int MASK16_13 = MASKS16[13];
static final int MASK16_14 = MASKS16[14];
static final int MASK16_15 = MASKS16[15];
static final int MASK32_1 = MASKS32[1];
static final int MASK32_2 = MASKS32[2];
static final int MASK32_3 = MASKS32[3];
static final int MASK32_4 = MASKS32[4];
static final int MASK32_5 = MASKS32[5];
static final int MASK32_6 = MASKS32[6];
static final int MASK32_7 = MASKS32[7];
static final int MASK32_8 = MASKS32[8];
static final int MASK32_9 = MASKS32[9];
static final int MASK32_10 = MASKS32[10];
static final int MASK32_11 = MASKS32[11];
static final int MASK32_12 = MASKS32[12];
static final int MASK32_13 = MASKS32[13];
static final int MASK32_14 = MASKS32[14];
static final int MASK32_15 = MASKS32[15];
static final int MASK32_16 = MASKS32[16];
static final int MASK32_17 = MASKS32[17];
static final int MASK32_18 = MASKS32[18];
static final int MASK32_19 = MASKS32[19];
static final int MASK32_20 = MASKS32[20];
static final int MASK32_21 = MASKS32[21];
static final int MASK32_22 = MASKS32[22];
static final int MASK32_23 = MASKS32[23];
static final int MASK32_24 = MASKS32[24];
/** Decode 128 integers into {@code ints}. */
void decode(int bitsPerValue, PostingDecodingUtil pdu, int[] ints) throws IOException {
switch (bitsPerValue) {
case 1:
decode1(pdu, ints);
expand8(ints);
break;
case 2:
decode2(pdu, ints);
expand8(ints);
break;
case 3:
decode3(pdu, tmp, ints);
expand8(ints);
break;
case 4:
decode4(pdu, ints);
expand8(ints);
break;
case 5:
decode5(pdu, tmp, ints);
expand8(ints);
break;
case 6:
decode6(pdu, tmp, ints);
expand8(ints);
break;
case 7:
decode7(pdu, tmp, ints);
expand8(ints);
break;
case 8:
decode8(pdu, ints);
expand8(ints);
break;
case 9:
decode9(pdu, tmp, ints);
expand16(ints);
break;
case 10:
decode10(pdu, tmp, ints);
expand16(ints);
break;
case 11:
decode11(pdu, tmp, ints);
expand16(ints);
break;
case 12:
decode12(pdu, tmp, ints);
expand16(ints);
break;
case 13:
decode13(pdu, tmp, ints);
expand16(ints);
break;
case 14:
decode14(pdu, tmp, ints);
expand16(ints);
break;
case 15:
decode15(pdu, tmp, ints);
expand16(ints);
break;
case 16:
decode16(pdu, ints);
expand16(ints);
break;
case 17:
decode17(pdu, tmp, ints);
break;
case 18:
decode18(pdu, tmp, ints);
break;
case 19:
decode19(pdu, tmp, ints);
break;
case 20:
decode20(pdu, tmp, ints);
break;
case 21:
decode21(pdu, tmp, ints);
break;
case 22:
decode22(pdu, tmp, ints);
break;
case 23:
decode23(pdu, tmp, ints);
break;
case 24:
decode24(pdu, tmp, ints);
break;
default:
decodeSlow(bitsPerValue, pdu, tmp, ints);
break;
}
}
static void decode1(PostingDecodingUtil pdu, int[] ints) throws IOException {
pdu.splitInts(4, ints, 7, 1, MASK8_1, ints, 28, MASK8_1);
}
static void decode2(PostingDecodingUtil pdu, int[] ints) throws IOException {
pdu.splitInts(8, ints, 6, 2, MASK8_2, ints, 24, MASK8_2);
}
static void decode3(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(12, ints, 5, 3, MASK8_3, tmp, 0, MASK8_2);
for (int iter = 0, tmpIdx = 0, intsIdx = 24; iter < 4; ++iter, tmpIdx += 3, intsIdx += 2) {
int l0 = tmp[tmpIdx + 0] << 1;
l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_1;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 1] & MASK8_1) << 2;
l1 |= tmp[tmpIdx + 2] << 0;
ints[intsIdx + 1] = l1;
}
}
static void decode4(PostingDecodingUtil pdu, int[] ints) throws IOException {
pdu.splitInts(16, ints, 4, 4, MASK8_4, ints, 16, MASK8_4);
}
static void decode5(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(20, ints, 3, 5, MASK8_5, tmp, 0, MASK8_3);
for (int iter = 0, tmpIdx = 0, intsIdx = 20; iter < 4; ++iter, tmpIdx += 5, intsIdx += 3) {
int l0 = tmp[tmpIdx + 0] << 2;
l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_2;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 1] & MASK8_1) << 4;
l1 |= tmp[tmpIdx + 2] << 1;
l1 |= (tmp[tmpIdx + 3] >>> 2) & MASK8_1;
ints[intsIdx + 1] = l1;
int l2 = (tmp[tmpIdx + 3] & MASK8_2) << 3;
l2 |= tmp[tmpIdx + 4] << 0;
ints[intsIdx + 2] = l2;
}
}
static void decode6(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(24, ints, 2, 6, MASK8_6, tmp, 0, MASK8_2);
for (int iter = 0, tmpIdx = 0, intsIdx = 24; iter < 8; ++iter, tmpIdx += 3, intsIdx += 1) {
int l0 = tmp[tmpIdx + 0] << 4;
l0 |= tmp[tmpIdx + 1] << 2;
l0 |= tmp[tmpIdx + 2] << 0;
ints[intsIdx + 0] = l0;
}
}
static void decode7(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(28, ints, 1, 7, MASK8_7, tmp, 0, MASK8_1);
for (int iter = 0, tmpIdx = 0, intsIdx = 28; iter < 4; ++iter, tmpIdx += 7, intsIdx += 1) {
int l0 = tmp[tmpIdx + 0] << 6;
l0 |= tmp[tmpIdx + 1] << 5;
l0 |= tmp[tmpIdx + 2] << 4;
l0 |= tmp[tmpIdx + 3] << 3;
l0 |= tmp[tmpIdx + 4] << 2;
l0 |= tmp[tmpIdx + 5] << 1;
l0 |= tmp[tmpIdx + 6] << 0;
ints[intsIdx + 0] = l0;
}
}
static void decode8(PostingDecodingUtil pdu, int[] ints) throws IOException {
pdu.in.readInts(ints, 0, 32);
}
static void decode9(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(36, ints, 7, 9, MASK16_9, tmp, 0, MASK16_7);
for (int iter = 0, tmpIdx = 0, intsIdx = 36; iter < 4; ++iter, tmpIdx += 9, intsIdx += 7) {
int l0 = tmp[tmpIdx + 0] << 2;
l0 |= (tmp[tmpIdx + 1] >>> 5) & MASK16_2;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 1] & MASK16_5) << 4;
l1 |= (tmp[tmpIdx + 2] >>> 3) & MASK16_4;
ints[intsIdx + 1] = l1;
int l2 = (tmp[tmpIdx + 2] & MASK16_3) << 6;
l2 |= (tmp[tmpIdx + 3] >>> 1) & MASK16_6;
ints[intsIdx + 2] = l2;
int l3 = (tmp[tmpIdx + 3] & MASK16_1) << 8;
l3 |= tmp[tmpIdx + 4] << 1;
l3 |= (tmp[tmpIdx + 5] >>> 6) & MASK16_1;
ints[intsIdx + 3] = l3;
int l4 = (tmp[tmpIdx + 5] & MASK16_6) << 3;
l4 |= (tmp[tmpIdx + 6] >>> 4) & MASK16_3;
ints[intsIdx + 4] = l4;
int l5 = (tmp[tmpIdx + 6] & MASK16_4) << 5;
l5 |= (tmp[tmpIdx + 7] >>> 2) & MASK16_5;
ints[intsIdx + 5] = l5;
int l6 = (tmp[tmpIdx + 7] & MASK16_2) << 7;
l6 |= tmp[tmpIdx + 8] << 0;
ints[intsIdx + 6] = l6;
}
}
static void decode10(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(40, ints, 6, 10, MASK16_10, tmp, 0, MASK16_6);
for (int iter = 0, tmpIdx = 0, intsIdx = 40; iter < 8; ++iter, tmpIdx += 5, intsIdx += 3) {
int l0 = tmp[tmpIdx + 0] << 4;
l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_4;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 1] & MASK16_2) << 8;
l1 |= tmp[tmpIdx + 2] << 2;
l1 |= (tmp[tmpIdx + 3] >>> 4) & MASK16_2;
ints[intsIdx + 1] = l1;
int l2 = (tmp[tmpIdx + 3] & MASK16_4) << 6;
l2 |= tmp[tmpIdx + 4] << 0;
ints[intsIdx + 2] = l2;
}
}
static void decode11(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(44, ints, 5, 11, MASK16_11, tmp, 0, MASK16_5);
for (int iter = 0, tmpIdx = 0, intsIdx = 44; iter < 4; ++iter, tmpIdx += 11, intsIdx += 5) {
int l0 = tmp[tmpIdx + 0] << 6;
l0 |= tmp[tmpIdx + 1] << 1;
l0 |= (tmp[tmpIdx + 2] >>> 4) & MASK16_1;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 2] & MASK16_4) << 7;
l1 |= tmp[tmpIdx + 3] << 2;
l1 |= (tmp[tmpIdx + 4] >>> 3) & MASK16_2;
ints[intsIdx + 1] = l1;
int l2 = (tmp[tmpIdx + 4] & MASK16_3) << 8;
l2 |= tmp[tmpIdx + 5] << 3;
l2 |= (tmp[tmpIdx + 6] >>> 2) & MASK16_3;
ints[intsIdx + 2] = l2;
int l3 = (tmp[tmpIdx + 6] & MASK16_2) << 9;
l3 |= tmp[tmpIdx + 7] << 4;
l3 |= (tmp[tmpIdx + 8] >>> 1) & MASK16_4;
ints[intsIdx + 3] = l3;
int l4 = (tmp[tmpIdx + 8] & MASK16_1) << 10;
l4 |= tmp[tmpIdx + 9] << 5;
l4 |= tmp[tmpIdx + 10] << 0;
ints[intsIdx + 4] = l4;
}
}
static void decode12(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(48, ints, 4, 12, MASK16_12, tmp, 0, MASK16_4);
for (int iter = 0, tmpIdx = 0, intsIdx = 48; iter < 16; ++iter, tmpIdx += 3, intsIdx += 1) {
int l0 = tmp[tmpIdx + 0] << 8;
l0 |= tmp[tmpIdx + 1] << 4;
l0 |= tmp[tmpIdx + 2] << 0;
ints[intsIdx + 0] = l0;
}
}
static void decode13(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(52, ints, 3, 13, MASK16_13, tmp, 0, MASK16_3);
for (int iter = 0, tmpIdx = 0, intsIdx = 52; iter < 4; ++iter, tmpIdx += 13, intsIdx += 3) {
int l0 = tmp[tmpIdx + 0] << 10;
l0 |= tmp[tmpIdx + 1] << 7;
l0 |= tmp[tmpIdx + 2] << 4;
l0 |= tmp[tmpIdx + 3] << 1;
l0 |= (tmp[tmpIdx + 4] >>> 2) & MASK16_1;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 4] & MASK16_2) << 11;
l1 |= tmp[tmpIdx + 5] << 8;
l1 |= tmp[tmpIdx + 6] << 5;
l1 |= tmp[tmpIdx + 7] << 2;
l1 |= (tmp[tmpIdx + 8] >>> 1) & MASK16_2;
ints[intsIdx + 1] = l1;
int l2 = (tmp[tmpIdx + 8] & MASK16_1) << 12;
l2 |= tmp[tmpIdx + 9] << 9;
l2 |= tmp[tmpIdx + 10] << 6;
l2 |= tmp[tmpIdx + 11] << 3;
l2 |= tmp[tmpIdx + 12] << 0;
ints[intsIdx + 2] = l2;
}
}
static void decode14(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(56, ints, 2, 14, MASK16_14, tmp, 0, MASK16_2);
for (int iter = 0, tmpIdx = 0, intsIdx = 56; iter < 8; ++iter, tmpIdx += 7, intsIdx += 1) {
int l0 = tmp[tmpIdx + 0] << 12;
l0 |= tmp[tmpIdx + 1] << 10;
l0 |= tmp[tmpIdx + 2] << 8;
l0 |= tmp[tmpIdx + 3] << 6;
l0 |= tmp[tmpIdx + 4] << 4;
l0 |= tmp[tmpIdx + 5] << 2;
l0 |= tmp[tmpIdx + 6] << 0;
ints[intsIdx + 0] = l0;
}
}
static void decode15(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(60, ints, 1, 15, MASK16_15, tmp, 0, MASK16_1);
for (int iter = 0, tmpIdx = 0, intsIdx = 60; iter < 4; ++iter, tmpIdx += 15, intsIdx += 1) {
int l0 = tmp[tmpIdx + 0] << 14;
l0 |= tmp[tmpIdx + 1] << 13;
l0 |= tmp[tmpIdx + 2] << 12;
l0 |= tmp[tmpIdx + 3] << 11;
l0 |= tmp[tmpIdx + 4] << 10;
l0 |= tmp[tmpIdx + 5] << 9;
l0 |= tmp[tmpIdx + 6] << 8;
l0 |= tmp[tmpIdx + 7] << 7;
l0 |= tmp[tmpIdx + 8] << 6;
l0 |= tmp[tmpIdx + 9] << 5;
l0 |= tmp[tmpIdx + 10] << 4;
l0 |= tmp[tmpIdx + 11] << 3;
l0 |= tmp[tmpIdx + 12] << 2;
l0 |= tmp[tmpIdx + 13] << 1;
l0 |= tmp[tmpIdx + 14] << 0;
ints[intsIdx + 0] = l0;
}
}
static void decode16(PostingDecodingUtil pdu, int[] ints) throws IOException {
pdu.in.readInts(ints, 0, 64);
}
static void decode17(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(68, ints, 15, 17, MASK32_17, tmp, 0, MASK32_15);
for (int iter = 0, tmpIdx = 0, intsIdx = 68; iter < 4; ++iter, tmpIdx += 17, intsIdx += 15) {
int l0 = tmp[tmpIdx + 0] << 2;
l0 |= (tmp[tmpIdx + 1] >>> 13) & MASK32_2;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 1] & MASK32_13) << 4;
l1 |= (tmp[tmpIdx + 2] >>> 11) & MASK32_4;
ints[intsIdx + 1] = l1;
int l2 = (tmp[tmpIdx + 2] & MASK32_11) << 6;
l2 |= (tmp[tmpIdx + 3] >>> 9) & MASK32_6;
ints[intsIdx + 2] = l2;
int l3 = (tmp[tmpIdx + 3] & MASK32_9) << 8;
l3 |= (tmp[tmpIdx + 4] >>> 7) & MASK32_8;
ints[intsIdx + 3] = l3;
int l4 = (tmp[tmpIdx + 4] & MASK32_7) << 10;
l4 |= (tmp[tmpIdx + 5] >>> 5) & MASK32_10;
ints[intsIdx + 4] = l4;
int l5 = (tmp[tmpIdx + 5] & MASK32_5) << 12;
l5 |= (tmp[tmpIdx + 6] >>> 3) & MASK32_12;
ints[intsIdx + 5] = l5;
int l6 = (tmp[tmpIdx + 6] & MASK32_3) << 14;
l6 |= (tmp[tmpIdx + 7] >>> 1) & MASK32_14;
ints[intsIdx + 6] = l6;
int l7 = (tmp[tmpIdx + 7] & MASK32_1) << 16;
l7 |= tmp[tmpIdx + 8] << 1;
l7 |= (tmp[tmpIdx + 9] >>> 14) & MASK32_1;
ints[intsIdx + 7] = l7;
int l8 = (tmp[tmpIdx + 9] & MASK32_14) << 3;
l8 |= (tmp[tmpIdx + 10] >>> 12) & MASK32_3;
ints[intsIdx + 8] = l8;
int l9 = (tmp[tmpIdx + 10] & MASK32_12) << 5;
l9 |= (tmp[tmpIdx + 11] >>> 10) & MASK32_5;
ints[intsIdx + 9] = l9;
int l10 = (tmp[tmpIdx + 11] & MASK32_10) << 7;
l10 |= (tmp[tmpIdx + 12] >>> 8) & MASK32_7;
ints[intsIdx + 10] = l10;
int l11 = (tmp[tmpIdx + 12] & MASK32_8) << 9;
l11 |= (tmp[tmpIdx + 13] >>> 6) & MASK32_9;
ints[intsIdx + 11] = l11;
int l12 = (tmp[tmpIdx + 13] & MASK32_6) << 11;
l12 |= (tmp[tmpIdx + 14] >>> 4) & MASK32_11;
ints[intsIdx + 12] = l12;
int l13 = (tmp[tmpIdx + 14] & MASK32_4) << 13;
l13 |= (tmp[tmpIdx + 15] >>> 2) & MASK32_13;
ints[intsIdx + 13] = l13;
int l14 = (tmp[tmpIdx + 15] & MASK32_2) << 15;
l14 |= tmp[tmpIdx + 16] << 0;
ints[intsIdx + 14] = l14;
}
}
static void decode18(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(72, ints, 14, 18, MASK32_18, tmp, 0, MASK32_14);
for (int iter = 0, tmpIdx = 0, intsIdx = 72; iter < 8; ++iter, tmpIdx += 9, intsIdx += 7) {
int l0 = tmp[tmpIdx + 0] << 4;
l0 |= (tmp[tmpIdx + 1] >>> 10) & MASK32_4;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 1] & MASK32_10) << 8;
l1 |= (tmp[tmpIdx + 2] >>> 6) & MASK32_8;
ints[intsIdx + 1] = l1;
int l2 = (tmp[tmpIdx + 2] & MASK32_6) << 12;
l2 |= (tmp[tmpIdx + 3] >>> 2) & MASK32_12;
ints[intsIdx + 2] = l2;
int l3 = (tmp[tmpIdx + 3] & MASK32_2) << 16;
l3 |= tmp[tmpIdx + 4] << 2;
l3 |= (tmp[tmpIdx + 5] >>> 12) & MASK32_2;
ints[intsIdx + 3] = l3;
int l4 = (tmp[tmpIdx + 5] & MASK32_12) << 6;
l4 |= (tmp[tmpIdx + 6] >>> 8) & MASK32_6;
ints[intsIdx + 4] = l4;
int l5 = (tmp[tmpIdx + 6] & MASK32_8) << 10;
l5 |= (tmp[tmpIdx + 7] >>> 4) & MASK32_10;
ints[intsIdx + 5] = l5;
int l6 = (tmp[tmpIdx + 7] & MASK32_4) << 14;
l6 |= tmp[tmpIdx + 8] << 0;
ints[intsIdx + 6] = l6;
}
}
static void decode19(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(76, ints, 13, 19, MASK32_19, tmp, 0, MASK32_13);
for (int iter = 0, tmpIdx = 0, intsIdx = 76; iter < 4; ++iter, tmpIdx += 19, intsIdx += 13) {
int l0 = tmp[tmpIdx + 0] << 6;
l0 |= (tmp[tmpIdx + 1] >>> 7) & MASK32_6;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 1] & MASK32_7) << 12;
l1 |= (tmp[tmpIdx + 2] >>> 1) & MASK32_12;
ints[intsIdx + 1] = l1;
int l2 = (tmp[tmpIdx + 2] & MASK32_1) << 18;
l2 |= tmp[tmpIdx + 3] << 5;
l2 |= (tmp[tmpIdx + 4] >>> 8) & MASK32_5;
ints[intsIdx + 2] = l2;
int l3 = (tmp[tmpIdx + 4] & MASK32_8) << 11;
l3 |= (tmp[tmpIdx + 5] >>> 2) & MASK32_11;
ints[intsIdx + 3] = l3;
int l4 = (tmp[tmpIdx + 5] & MASK32_2) << 17;
l4 |= tmp[tmpIdx + 6] << 4;
l4 |= (tmp[tmpIdx + 7] >>> 9) & MASK32_4;
ints[intsIdx + 4] = l4;
int l5 = (tmp[tmpIdx + 7] & MASK32_9) << 10;
l5 |= (tmp[tmpIdx + 8] >>> 3) & MASK32_10;
ints[intsIdx + 5] = l5;
int l6 = (tmp[tmpIdx + 8] & MASK32_3) << 16;
l6 |= tmp[tmpIdx + 9] << 3;
l6 |= (tmp[tmpIdx + 10] >>> 10) & MASK32_3;
ints[intsIdx + 6] = l6;
int l7 = (tmp[tmpIdx + 10] & MASK32_10) << 9;
l7 |= (tmp[tmpIdx + 11] >>> 4) & MASK32_9;
ints[intsIdx + 7] = l7;
int l8 = (tmp[tmpIdx + 11] & MASK32_4) << 15;
l8 |= tmp[tmpIdx + 12] << 2;
l8 |= (tmp[tmpIdx + 13] >>> 11) & MASK32_2;
ints[intsIdx + 8] = l8;
int l9 = (tmp[tmpIdx + 13] & MASK32_11) << 8;
l9 |= (tmp[tmpIdx + 14] >>> 5) & MASK32_8;
ints[intsIdx + 9] = l9;
int l10 = (tmp[tmpIdx + 14] & MASK32_5) << 14;
l10 |= tmp[tmpIdx + 15] << 1;
l10 |= (tmp[tmpIdx + 16] >>> 12) & MASK32_1;
ints[intsIdx + 10] = l10;
int l11 = (tmp[tmpIdx + 16] & MASK32_12) << 7;
l11 |= (tmp[tmpIdx + 17] >>> 6) & MASK32_7;
ints[intsIdx + 11] = l11;
int l12 = (tmp[tmpIdx + 17] & MASK32_6) << 13;
l12 |= tmp[tmpIdx + 18] << 0;
ints[intsIdx + 12] = l12;
}
}
static void decode20(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(80, ints, 12, 20, MASK32_20, tmp, 0, MASK32_12);
for (int iter = 0, tmpIdx = 0, intsIdx = 80; iter < 16; ++iter, tmpIdx += 5, intsIdx += 3) {
int l0 = tmp[tmpIdx + 0] << 8;
l0 |= (tmp[tmpIdx + 1] >>> 4) & MASK32_8;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 1] & MASK32_4) << 16;
l1 |= tmp[tmpIdx + 2] << 4;
l1 |= (tmp[tmpIdx + 3] >>> 8) & MASK32_4;
ints[intsIdx + 1] = l1;
int l2 = (tmp[tmpIdx + 3] & MASK32_8) << 12;
l2 |= tmp[tmpIdx + 4] << 0;
ints[intsIdx + 2] = l2;
}
}
static void decode21(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(84, ints, 11, 21, MASK32_21, tmp, 0, MASK32_11);
for (int iter = 0, tmpIdx = 0, intsIdx = 84; iter < 4; ++iter, tmpIdx += 21, intsIdx += 11) {
int l0 = tmp[tmpIdx + 0] << 10;
l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK32_10;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 1] & MASK32_1) << 20;
l1 |= tmp[tmpIdx + 2] << 9;
l1 |= (tmp[tmpIdx + 3] >>> 2) & MASK32_9;
ints[intsIdx + 1] = l1;
int l2 = (tmp[tmpIdx + 3] & MASK32_2) << 19;
l2 |= tmp[tmpIdx + 4] << 8;
l2 |= (tmp[tmpIdx + 5] >>> 3) & MASK32_8;
ints[intsIdx + 2] = l2;
int l3 = (tmp[tmpIdx + 5] & MASK32_3) << 18;
l3 |= tmp[tmpIdx + 6] << 7;
l3 |= (tmp[tmpIdx + 7] >>> 4) & MASK32_7;
ints[intsIdx + 3] = l3;
int l4 = (tmp[tmpIdx + 7] & MASK32_4) << 17;
l4 |= tmp[tmpIdx + 8] << 6;
l4 |= (tmp[tmpIdx + 9] >>> 5) & MASK32_6;
ints[intsIdx + 4] = l4;
int l5 = (tmp[tmpIdx + 9] & MASK32_5) << 16;
l5 |= tmp[tmpIdx + 10] << 5;
l5 |= (tmp[tmpIdx + 11] >>> 6) & MASK32_5;
ints[intsIdx + 5] = l5;
int l6 = (tmp[tmpIdx + 11] & MASK32_6) << 15;
l6 |= tmp[tmpIdx + 12] << 4;
l6 |= (tmp[tmpIdx + 13] >>> 7) & MASK32_4;
ints[intsIdx + 6] = l6;
int l7 = (tmp[tmpIdx + 13] & MASK32_7) << 14;
l7 |= tmp[tmpIdx + 14] << 3;
l7 |= (tmp[tmpIdx + 15] >>> 8) & MASK32_3;
ints[intsIdx + 7] = l7;
int l8 = (tmp[tmpIdx + 15] & MASK32_8) << 13;
l8 |= tmp[tmpIdx + 16] << 2;
l8 |= (tmp[tmpIdx + 17] >>> 9) & MASK32_2;
ints[intsIdx + 8] = l8;
int l9 = (tmp[tmpIdx + 17] & MASK32_9) << 12;
l9 |= tmp[tmpIdx + 18] << 1;
l9 |= (tmp[tmpIdx + 19] >>> 10) & MASK32_1;
ints[intsIdx + 9] = l9;
int l10 = (tmp[tmpIdx + 19] & MASK32_10) << 11;
l10 |= tmp[tmpIdx + 20] << 0;
ints[intsIdx + 10] = l10;
}
}
static void decode22(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(88, ints, 10, 22, MASK32_22, tmp, 0, MASK32_10);
for (int iter = 0, tmpIdx = 0, intsIdx = 88; iter < 8; ++iter, tmpIdx += 11, intsIdx += 5) {
int l0 = tmp[tmpIdx + 0] << 12;
l0 |= tmp[tmpIdx + 1] << 2;
l0 |= (tmp[tmpIdx + 2] >>> 8) & MASK32_2;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 2] & MASK32_8) << 14;
l1 |= tmp[tmpIdx + 3] << 4;
l1 |= (tmp[tmpIdx + 4] >>> 6) & MASK32_4;
ints[intsIdx + 1] = l1;
int l2 = (tmp[tmpIdx + 4] & MASK32_6) << 16;
l2 |= tmp[tmpIdx + 5] << 6;
l2 |= (tmp[tmpIdx + 6] >>> 4) & MASK32_6;
ints[intsIdx + 2] = l2;
int l3 = (tmp[tmpIdx + 6] & MASK32_4) << 18;
l3 |= tmp[tmpIdx + 7] << 8;
l3 |= (tmp[tmpIdx + 8] >>> 2) & MASK32_8;
ints[intsIdx + 3] = l3;
int l4 = (tmp[tmpIdx + 8] & MASK32_2) << 20;
l4 |= tmp[tmpIdx + 9] << 10;
l4 |= tmp[tmpIdx + 10] << 0;
ints[intsIdx + 4] = l4;
}
}
static void decode23(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(92, ints, 9, 23, MASK32_23, tmp, 0, MASK32_9);
for (int iter = 0, tmpIdx = 0, intsIdx = 92; iter < 4; ++iter, tmpIdx += 23, intsIdx += 9) {
int l0 = tmp[tmpIdx + 0] << 14;
l0 |= tmp[tmpIdx + 1] << 5;
l0 |= (tmp[tmpIdx + 2] >>> 4) & MASK32_5;
ints[intsIdx + 0] = l0;
int l1 = (tmp[tmpIdx + 2] & MASK32_4) << 19;
l1 |= tmp[tmpIdx + 3] << 10;
l1 |= tmp[tmpIdx + 4] << 1;
l1 |= (tmp[tmpIdx + 5] >>> 8) & MASK32_1;
ints[intsIdx + 1] = l1;
int l2 = (tmp[tmpIdx + 5] & MASK32_8) << 15;
l2 |= tmp[tmpIdx + 6] << 6;
l2 |= (tmp[tmpIdx + 7] >>> 3) & MASK32_6;
ints[intsIdx + 2] = l2;
int l3 = (tmp[tmpIdx + 7] & MASK32_3) << 20;
l3 |= tmp[tmpIdx + 8] << 11;
l3 |= tmp[tmpIdx + 9] << 2;
l3 |= (tmp[tmpIdx + 10] >>> 7) & MASK32_2;
ints[intsIdx + 3] = l3;
int l4 = (tmp[tmpIdx + 10] & MASK32_7) << 16;
l4 |= tmp[tmpIdx + 11] << 7;
l4 |= (tmp[tmpIdx + 12] >>> 2) & MASK32_7;
ints[intsIdx + 4] = l4;
int l5 = (tmp[tmpIdx + 12] & MASK32_2) << 21;
l5 |= tmp[tmpIdx + 13] << 12;
l5 |= tmp[tmpIdx + 14] << 3;
l5 |= (tmp[tmpIdx + 15] >>> 6) & MASK32_3;
ints[intsIdx + 5] = l5;
int l6 = (tmp[tmpIdx + 15] & MASK32_6) << 17;
l6 |= tmp[tmpIdx + 16] << 8;
l6 |= (tmp[tmpIdx + 17] >>> 1) & MASK32_8;
ints[intsIdx + 6] = l6;
int l7 = (tmp[tmpIdx + 17] & MASK32_1) << 22;
l7 |= tmp[tmpIdx + 18] << 13;
l7 |= tmp[tmpIdx + 19] << 4;
l7 |= (tmp[tmpIdx + 20] >>> 5) & MASK32_4;
ints[intsIdx + 7] = l7;
int l8 = (tmp[tmpIdx + 20] & MASK32_5) << 18;
l8 |= tmp[tmpIdx + 21] << 9;
l8 |= tmp[tmpIdx + 22] << 0;
ints[intsIdx + 8] = l8;
}
}
static void decode24(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
pdu.splitInts(96, ints, 8, 24, MASK32_24, tmp, 0, MASK32_8);
for (int iter = 0, tmpIdx = 0, intsIdx = 96; iter < 32; ++iter, tmpIdx += 3, intsIdx += 1) {
int l0 = tmp[tmpIdx + 0] << 16;
l0 |= tmp[tmpIdx + 1] << 8;
l0 |= tmp[tmpIdx + 2] << 0;
ints[intsIdx + 0] = l0;
}
}
}
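The bit-packing scheme documented in the ForUtil javadoc above can be illustrated with a small
standalone sketch (not part of this patch): when every value fits in 8 bits, collapse8 stores four
values per Java int, so 128 values occupy 32 ints and numBytes(8) == 8 << (7 - 3) == 128 bytes.

// Standalone sketch mirroring the collapse8/expand8 layout used by ForUtil above.
public class BitPackingSketch {
  public static void main(String[] args) {
    int[] values = new int[128];
    for (int i = 0; i < 128; i++) {
      values[i] = i & 0xFF; // every value fits in 8 bits
    }
    // collapse8: pack values[i], values[32+i], values[64+i], values[96+i] into one int
    int[] packed = new int[32];
    for (int i = 0; i < 32; i++) {
      packed[i] = (values[i] << 24) | (values[32 + i] << 16) | (values[64 + i] << 8) | values[96 + i];
    }
    // expand8: recover the original 128 values from the 32 packed ints
    int[] restored = new int[128];
    for (int i = 0; i < 32; i++) {
      int l = packed[i];
      restored[i] = (l >>> 24) & 0xFF;
      restored[32 + i] = (l >>> 16) & 0xFF;
      restored[64 + i] = (l >>> 8) & 0xFF;
      restored[96 + i] = l & 0xFF;
    }
    System.out.println(java.util.Arrays.equals(values, restored)); // true: the round trip is lossless
    System.out.println(8 << (7 - 3)); // 128 bytes, matching ForUtil.numBytes(8)
  }
}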

View File

@ -0,0 +1,217 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene101;
import java.util.Objects;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat;
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
/**
* Implements the Lucene 10.1 index format
*
* <p>If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}.
*
* @see org.apache.lucene.codecs.lucene101 package documentation for file format details.
* @lucene.experimental
*/
public class Lucene101Codec extends Codec {
/** Configuration option for the codec. */
public enum Mode {
/** Trade compression ratio for retrieval speed. */
BEST_SPEED(Lucene90StoredFieldsFormat.Mode.BEST_SPEED),
/** Trade retrieval speed for compression ratio. */
BEST_COMPRESSION(Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION);
private final Lucene90StoredFieldsFormat.Mode storedMode;
private Mode(Lucene90StoredFieldsFormat.Mode storedMode) {
this.storedMode = Objects.requireNonNull(storedMode);
}
}
private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat();
private final FieldInfosFormat fieldInfosFormat = new Lucene94FieldInfosFormat();
private final SegmentInfoFormat segmentInfosFormat = new Lucene99SegmentInfoFormat();
private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat();
private final CompoundFormat compoundFormat = new Lucene90CompoundFormat();
private final NormsFormat normsFormat = new Lucene90NormsFormat();
private final PostingsFormat defaultPostingsFormat;
private final PostingsFormat postingsFormat =
new PerFieldPostingsFormat() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
return Lucene101Codec.this.getPostingsFormatForField(field);
}
};
private final DocValuesFormat defaultDVFormat;
private final DocValuesFormat docValuesFormat =
new PerFieldDocValuesFormat() {
@Override
public DocValuesFormat getDocValuesFormatForField(String field) {
return Lucene101Codec.this.getDocValuesFormatForField(field);
}
};
private final KnnVectorsFormat defaultKnnVectorsFormat;
private final KnnVectorsFormat knnVectorsFormat =
new PerFieldKnnVectorsFormat() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return Lucene101Codec.this.getKnnVectorsFormatForField(field);
}
};
private final StoredFieldsFormat storedFieldsFormat;
/** Instantiates a new codec. */
public Lucene101Codec() {
this(Mode.BEST_SPEED);
}
/**
* Instantiates a new codec, specifying the stored fields compression mode to use.
*
* @param mode stored fields compression mode to use for newly flushed/merged segments.
*/
public Lucene101Codec(Mode mode) {
super("Lucene101");
this.storedFieldsFormat =
new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
this.defaultPostingsFormat = new Lucene101PostingsFormat();
this.defaultDVFormat = new Lucene90DocValuesFormat();
this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat();
}
@Override
public final StoredFieldsFormat storedFieldsFormat() {
return storedFieldsFormat;
}
@Override
public final TermVectorsFormat termVectorsFormat() {
return vectorsFormat;
}
@Override
public final PostingsFormat postingsFormat() {
return postingsFormat;
}
@Override
public final FieldInfosFormat fieldInfosFormat() {
return fieldInfosFormat;
}
@Override
public final SegmentInfoFormat segmentInfoFormat() {
return segmentInfosFormat;
}
@Override
public final LiveDocsFormat liveDocsFormat() {
return liveDocsFormat;
}
@Override
public final CompoundFormat compoundFormat() {
return compoundFormat;
}
@Override
public final PointsFormat pointsFormat() {
return new Lucene90PointsFormat();
}
@Override
public final KnnVectorsFormat knnVectorsFormat() {
return knnVectorsFormat;
}
/**
* Returns the postings format that should be used for writing new segments of <code>field</code>.
*
* <p>The default implementation always returns "Lucene101".
*
* <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
* future versions of Lucene are only guaranteed to be able to read the default implementation.
*/
public PostingsFormat getPostingsFormatForField(String field) {
return defaultPostingsFormat;
}
/**
* Returns the docvalues format that should be used for writing new segments of <code>field</code>.
*
* <p>The default implementation always returns "Lucene90".
*
* <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
* future versions of Lucene are only guaranteed to be able to read the default implementation.
*/
public DocValuesFormat getDocValuesFormatForField(String field) {
return defaultDVFormat;
}
/**
* Returns the vectors format that should be used for writing new segments of <code>field</code>.
*
* <p>The default implementation always returns "Lucene99HnswVectorsFormat".
*
* <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
* future versions of Lucene are only guaranteed to be able to read the default implementation.
*/
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return defaultKnnVectorsFormat;
}
@Override
public final DocValuesFormat docValuesFormat() {
return docValuesFormat;
}
@Override
public final NormsFormat normsFormat() {
return normsFormat;
}
}
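As the javadoc above notes, per-field formats are chosen by overriding the getXXXForField hooks,
and the codec is then installed on an IndexWriterConfig. A minimal, hypothetical usage sketch (the
overridden hook simply delegates; any field routing decisions are up to the application):

// Hypothetical usage sketch; not part of this patch.
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene101.Lucene101Codec;
import org.apache.lucene.index.IndexWriterConfig;

public class CodecUsageSketch {
  public static void main(String[] args) {
    Lucene101Codec codec =
        new Lucene101Codec(Lucene101Codec.Mode.BEST_COMPRESSION) {
          @Override
          public PostingsFormat getPostingsFormatForField(String field) {
            // Route specific fields to a different postings format here if desired.
            return super.getPostingsFormatForField(field);
          }
        };
    IndexWriterConfig iwc = new IndexWriterConfig();
    iwc.setCodec(codec); // newly flushed/merged segments will use the Lucene101 format
  }
}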

View File

@ -0,0 +1,492 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene101;
import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.TermState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
/**
* Lucene 10.1 postings format, which encodes postings in packed integer blocks for fast decode.
*
* <p>Basic idea:
*
* <ul>
* <li><b>Packed Blocks and VInt Blocks</b>:
* <p>In packed blocks, integers are encoded with the same bit width ({@link PackedInts packed
* format}): the block size (i.e. number of integers inside block) is fixed (currently 128).
* Additionally, blocks in which all values are identical are encoded in an optimized way.
* <p>In VInt blocks, integers are encoded as {@link DataOutput#writeVInt VInt}: the block
* size is variable.
* <li><b>Block structure</b>:
* <p>When the postings are long enough, Lucene101PostingsFormat will try to encode most
* integer data as a packed block.
* <p>Take a term with 259 documents as an example, the first 256 document ids are encoded as
* two packed blocks, while the remaining 3 are encoded as one VInt block.
* <p>Different kinds of data are always encoded separately into different packed blocks, but
* may possibly be interleaved into the same VInt block.
* <p>This strategy is applied to pairs: &lt;document number, frequency&gt;, &lt;position,
* payload length&gt;, &lt;position, offset start, offset length&gt;, and &lt;position,
* payload length, offset start, offset length&gt;.
* <li><b>Skipdata</b>:
* <p>Skipdata is interleaved with blocks on 2 levels. Level 0 skip data is interleaved
* between every packed block. Level 1 skip data is interleaved between every 32 packed
* blocks.
* <li><b>Positions, Payloads, and Offsets</b>:
* <p>A position is an integer indicating where the term occurs within one document. A payload
* is a blob of metadata associated with current position. An offset is a pair of integers
* indicating the tokenized start/end offsets for given term in current position: it is
* essentially a specialized payload.
* <p>When payloads and offsets are not omitted, numPositions==numPayloads==numOffsets
* (assuming a null payload contributes one count). As mentioned in block structure, it is
* possible to encode these three either combined or separately.
* <p>In all cases, payloads and offsets are stored together. When encoded as a packed block,
* position data is separated out as .pos, while payloads and offsets are encoded in .pay
* (payload metadata will also be stored directly in .pay). When encoded as VInt blocks, all
* these three are stored interleaved into the .pos (so is payload metadata).
* <p>With this strategy, the majority of payload and offset data will be outside .pos file.
* So for queries that require only position data, running on a full index with payloads and
* offsets, this reduces disk pre-fetches.
* </ul>
*
* <p>Files and detailed format:
*
* <ul>
* <li><code>.tim</code>: <a href="#Termdictionary">Term Dictionary</a>
* <li><code>.tip</code>: <a href="#Termindex">Term Index</a>
* <li><code>.doc</code>: <a href="#Frequencies">Frequencies and Skip Data</a>
* <li><code>.pos</code>: <a href="#Positions">Positions</a>
* <li><code>.pay</code>: <a href="#Payloads">Payloads and Offsets</a>
* </ul>
*
* <a id="Termdictionary"></a>
*
* <dl>
* <dd><b>Term Dictionary</b>
* <p>The .tim file contains the list of terms in each field along with per-term statistics
* (such as docfreq) and pointers to the frequencies, positions, payload and skip data in the
* .doc, .pos, and .pay files. See {@link Lucene90BlockTreeTermsWriter} for more details on
* the format.
* <p>NOTE: The term dictionary can plug into different postings implementations: the postings
* writer/reader are actually responsible for encoding and decoding the PostingsHeader and
* TermMetadata sections described here:
* <ul>
* <li>PostingsHeader --&gt; Header, PackedBlockSize
* <li>TermMetadata --&gt; (DocFPDelta|SingletonDocID), PosFPDelta?, PosVIntBlockFPDelta?,
* PayFPDelta?
* <li>Header, --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}
* <li>PackedBlockSize, SingletonDocID --&gt; {@link DataOutput#writeVInt VInt}
* <li>DocFPDelta, PosFPDelta, PayFPDelta, PosVIntBlockFPDelta --&gt; {@link
* DataOutput#writeVLong VLong}
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}
* </ul>
* <p>Notes:
* <ul>
* <li>Header is a {@link CodecUtil#writeIndexHeader IndexHeader} storing the version
* information for the postings.
* <li>PackedBlockSize is the fixed block size for packed blocks. In a packed block, the bit
* width is determined by the largest integer. A smaller block size results in smaller variance
* among integer bit widths, hence smaller indexes. A larger block size results in more
* efficient bulk I/O, hence better acceleration. This value should always be a multiple
* of 64, currently fixed at 128 as a tradeoff. It is also the skip interval used to
* accelerate {@link org.apache.lucene.index.PostingsEnum#advance(int)}.
* <li>DocFPDelta determines the position of this term's TermFreqs within the .doc file. In
* particular, it is the difference of file offset between this term's data and previous
* term's data (or zero, for the first term in the block). On disk it is stored as the
* difference from previous value in sequence.
* <li>PosFPDelta determines the position of this term's TermPositions within the .pos file.
* While PayFPDelta determines the position of this term's &lt;TermPayloads,
* TermOffsets?&gt; within the .pay file. Similar to DocFPDelta, it is the difference
* between two file positions (or neglected, for fields that omit payloads and offsets).
* <li>PosVIntBlockFPDelta determines the position of this term's last TermPosition in last
* pos packed block within the .pos file. It is a synonym for PayVIntBlockFPDelta or
* OffsetVIntBlockFPDelta. This is actually used to indicate whether it is necessary to
* load the following payloads and offsets from .pos instead of .pay. Every time a new block
* of positions is to be loaded, the PostingsReader will use this value to check
* whether the current block is in packed format or VInt. For packed format, payloads and
* offsets are fetched from .pay, otherwise from .pos. (This value is ignored when the
* total number of positions, i.e. totalTermFreq, is less than or equal to PackedBlockSize.)
* <li>SingletonDocID is an optimization when a term only appears in one document. In this
* case, instead of writing a file pointer to the .doc file (DocFPDelta), and then a
* VIntBlock at that location, the single document ID is written to the term dictionary.
* </ul>
* </dl>
*
* <a id="Termindex"></a>
*
* <dl>
* <dd><b>Term Index</b>
* <p>The .tip file contains an index into the term dictionary, so that it can be accessed
* randomly. See {@link Lucene90BlockTreeTermsWriter} for more details on the format.
* </dl>
*
* <a id="Frequencies"></a>
*
* <dl>
* <dd><b>Frequencies and Skip Data</b>
* <p>The .doc file contains the lists of documents which contain each term, along with the
* frequency of the term in that document (except when frequencies are omitted: {@link
* IndexOptions#DOCS}). Skip data is saved at the end of each term's postings. The skip data
* is saved once for the entire postings list.
* <ul>
* <li>docFile(.doc) --&gt; Header, &lt;TermFreqs&gt;<sup>TermCount</sup>, Footer
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}
* <li>TermFreqs --&gt; &lt;PackedBlock32&gt; <sup>PackedDocBlockNum/32</sup>, VIntBlock?
* <li>PackedBlock32 --&gt; Level1SkipData, &lt;PackedBlock&gt; <sup>32</sup>
* <li>PackedBlock --&gt; Level0SkipData, PackedDocDeltaBlock, PackedFreqBlock?
* <li>VIntBlock --&gt;
* &lt;DocDelta[,Freq?]&gt;<sup>DocFreq-PackedBlockSize*PackedDocBlockNum</sup>
* <li>Level1SkipData --&gt; DocDelta, DocFPDelta, Skip1NumBytes?, ImpactLength?, Impacts?,
* PosFPDelta?, NextPosUpto?, PayFPDelta?, NextPayByteUpto?
* <li>Level0SkipData --&gt; Skip0NumBytes, DocDelta, DocFPDelta, PackedBlockLength,
* ImpactLength?, Impacts?, PosFPDelta?, NextPosUpto?, PayFPDelta?, NextPayByteUpto?
* <li>PackedFreqBlock --&gt; {@link PackedInts PackedInts}, uses patching
* <li>PackedDocDeltaBlock --&gt; {@link PackedInts PackedInts}, does not use patching
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}
* </ul>
* <p>Notes:
* <ul>
* <li>PackedDocDeltaBlock is theoretically generated from two steps:
* <ol>
* <li>Calculate the difference between each document number and previous one, and get
* a d-gaps list (for the first document, use absolute value);
* <li>For those d-gaps from first one to
* PackedDocBlockNum*PackedBlockSize<sup>th</sup>, separately encode as packed
* blocks.
* </ol>
* If frequencies are not omitted, PackedFreqBlock will be generated without d-gap step.
* <li>VIntBlock stores remaining d-gaps (along with frequencies when possible) with a
* format that encodes DocDelta and Freq:
* <p>DocDelta: if frequencies are indexed, this determines both the document number and
* the frequency. In particular, DocDelta/2 is the difference between this document
* number and the previous document number (or zero when this is the first document in a
* TermFreqs). When DocDelta is odd, the frequency is one. When DocDelta is even, the
* frequency is read as another VInt. If frequencies are omitted, DocDelta contains the
* gap (not multiplied by 2) between document numbers and no frequency information is
* stored.
* <p>For example, the TermFreqs for a term which occurs once in document seven and
* three times in document eleven, with frequencies indexed, would be the following
* sequence of VInts:
* <p>15, 8, 3
* <p>If frequencies were omitted ({@link IndexOptions#DOCS}) it would be this sequence
* of VInts instead:
* <p>7,4
* <li>PackedDocBlockNum is the number of packed blocks for current term's docids or
* frequencies. In particular, PackedDocBlockNum = floor(DocFreq/PackedBlockSize)
* <li>On skip data, DocDelta is the delta between the last doc of the previous block - or
* -1 if there is no previous block - and the last doc of this block. This indicates by
* how much the doc ID should be incremented in case the block gets skipped.
* <li>Skip0Length is the length of skip data at level 0. Encoding it makes it possible to
* quickly skip over skip data when it is not needed, e.g. if only using nextDoc(). It
* is also used when only the first fields of skip data are needed, in order to skip
* over remaining fields without reading them.
* <li>ImpactLength and Impacts are only stored if frequencies are indexed.
* <li>Since positions and payloads are also block encoded, the skip should skip to related
* block first, then fetch the values according to in-block offset. PosFPSkip and
* PayFPSkip record the file offsets of related block in .pos and .pay, respectively.
* While PosBlockOffset indicates which value to fetch inside the related block
* (PayBlockOffset is unnecessary since it is always equal to PosBlockOffset). Same as
* DocFPSkip, the file offsets are relative to the start of current term's TermFreqs,
* and stored as a difference sequence.
* <li>PayByteUpto indicates the start offset of the current payload. It is equivalent to
* the sum of the payload lengths in the current block up to PosBlockOffset.
* <li>ImpactLength is the total length of CompetitiveFreqDelta and CompetitiveNormDelta
* pairs. CompetitiveFreqDelta and CompetitiveNormDelta are used to safely skip score
* calculation for uncompetitive documents; See {@link
* org.apache.lucene.codecs.CompetitiveImpactAccumulator} for more details.
* </ul>
* </dl>
*
* <a id="Positions"></a>
*
* <dl>
* <dd><b>Positions</b>
* <p>The .pos file contains the lists of positions that each term occurs at within documents.
* It also sometimes stores part of payloads and offsets for speedup.
* <ul>
* <li>PosFile(.pos) --&gt; Header, &lt;TermPositions&gt; <sup>TermCount</sup>, Footer
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}
* <li>TermPositions --&gt; &lt;PackedPosDeltaBlock&gt; <sup>PackedPosBlockNum</sup>,
* VIntBlock?
* <li>VIntBlock --&gt; &lt;PositionDelta[, PayloadLength?], PayloadData?, OffsetDelta?,
* OffsetLength?&gt;<sup>PosVIntCount</sup>
* <li>PackedPosDeltaBlock --&gt; {@link PackedInts PackedInts}
* <li>PositionDelta, OffsetDelta, OffsetLength --&gt; {@link DataOutput#writeVInt VInt}
* <li>PayloadData --&gt; {@link DataOutput#writeByte byte}<sup>PayLength</sup>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}
* </ul>
* <p>Notes:
* <ul>
* <li>TermPositions are ordered by term (terms are implicit, from the term dictionary), and
* position values for each term document pair are incremental, and ordered by document
* number.
* <li>PackedPosBlockNum is the number of packed blocks for current term's positions,
* payloads or offsets. In particular, PackedPosBlockNum =
* floor(totalTermFreq/PackedBlockSize)
* <li>PosVIntCount is the number of positions encoded as VInt format. In particular,
* PosVIntCount = totalTermFreq - PackedPosBlockNum*PackedBlockSize
* <li>The procedure by which PackedPosDeltaBlock is generated is the same as for PackedDocDeltaBlock
* in chapter <a href="#Frequencies">Frequencies and Skip Data</a>.
* <li>PositionDelta is, if payloads are disabled for the term's field, the difference
* between the position of the current occurrence in the document and the previous
* occurrence (or zero, if this is the first occurrence in this document). If payloads
* are enabled for the term's field, then PositionDelta/2 is the difference between the
* current and the previous position. If payloads are enabled and PositionDelta is odd,
* then PayloadLength is stored, indicating the length of the payload at the current
* term position.
* <li>For example, the TermPositions for a term which occurs as the fourth term in one
* document, and as the fifth and ninth term in a subsequent document, would be the
* following sequence of VInts (payloads disabled):
* <p>4, 5, 4
* <li>PayloadData is metadata associated with the current term position. If PayloadLength
* is stored at the current position, then it indicates the length of this payload. If
* PayloadLength is not stored, then this payload has the same length as the payload at
* the previous position.
* <li>OffsetDelta/2 is the difference between this position's startOffset and the previous
* occurrence's startOffset (or zero, if this is the first occurrence in this document). If
* OffsetDelta is odd, then the length (endOffset-startOffset) differs from the previous
* occurrence and an OffsetLength follows. Offset data is only written for {@link
* IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
* </ul>
* </dl>
*
* <a id="Payloads"></a>
*
* <dl>
* <dd><b>Payloads and Offsets</b>
* <p>The .pay file will store payloads and offsets associated with certain term-document
* positions. Some payloads and offsets will be separated out into .pos file, for performance
* reasons.
* <ul>
* <li>PayFile(.pay): --&gt; Header, &lt;TermPayloads?, TermOffsets?&gt;
* <sup>TermCount</sup>, Footer
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}
* <li>TermPayloads --&gt; &lt;PackedPayLengthBlock, SumPayLength, PayData&gt;
* <sup>PackedPayBlockNum</sup>
* <li>TermOffsets --&gt; &lt;PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock&gt;
* <sup>PackedPayBlockNum</sup>
* <li>PackedPayLengthBlock, PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock --&gt;
* {@link PackedInts PackedInts}
* <li>SumPayLength --&gt; {@link DataOutput#writeVInt VInt}
* <li>PayData --&gt; {@link DataOutput#writeByte byte}<sup>SumPayLength</sup>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}
* </ul>
* <p>Notes:
* <ul>
* <li>The order of TermPayloads/TermOffsets will be the same as TermPositions, note that
* part of payload/offsets are stored in .pos.
* <li>The procedure by which PackedPayLengthBlock and PackedOffsetLengthBlock are generated is
* the same as for PackedFreqBlock in chapter <a href="#Frequencies">Frequencies and Skip
* Data</a>, while PackedOffsetStartDeltaBlock follows the same procedure as
* PackedDocDeltaBlock.
* <li>PackedPayBlockNum is always equal to PackedPosBlockNum, for the same term. It is also
* a synonym for PackedOffsetBlockNum.
* <li>SumPayLength is the total length of payloads written within one block; it should be the
* sum of PayLengths in one packed block.
* <li>PayLength in PackedPayLengthBlock is the length of each payload associated with the
* current position.
* </ul>
* </dl>
*
* @lucene.experimental
*/
public final class Lucene101PostingsFormat extends PostingsFormat {
/** Filename extension for some small metadata about how postings are encoded. */
public static final String META_EXTENSION = "psm";
/**
* Filename extension for document number, frequencies, and skip data. See chapter: <a
* href="#Frequencies">Frequencies and Skip Data</a>
*/
public static final String DOC_EXTENSION = "doc";
/** Filename extension for positions. See chapter: <a href="#Positions">Positions</a> */
public static final String POS_EXTENSION = "pos";
/**
* Filename extension for payloads and offsets. See chapter: <a href="#Payloads">Payloads and
* Offsets</a>
*/
public static final String PAY_EXTENSION = "pay";
/** Size of blocks. */
public static final int BLOCK_SIZE = ForUtil.BLOCK_SIZE;
public static final int BLOCK_MASK = BLOCK_SIZE - 1;
/** We insert skip data on every block and every SKIP_FACTOR=32 blocks. */
public static final int LEVEL1_FACTOR = 32;
/** Total number of docs covered by level 1 skip data: 32 * 128 = 4,096 */
public static final int LEVEL1_NUM_DOCS = LEVEL1_FACTOR * BLOCK_SIZE;
public static final int LEVEL1_MASK = LEVEL1_NUM_DOCS - 1;
static final String TERMS_CODEC = "Lucene90PostingsWriterTerms";
static final String META_CODEC = "Lucene101PostingsWriterMeta";
static final String DOC_CODEC = "Lucene101PostingsWriterDoc";
static final String POS_CODEC = "Lucene101PostingsWriterPos";
static final String PAY_CODEC = "Lucene101PostingsWriterPay";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
private final int minTermBlockSize;
private final int maxTermBlockSize;
/** Creates {@code Lucene101PostingsFormat} with default settings. */
public Lucene101PostingsFormat() {
this(
Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
/**
* Creates {@code Lucene101PostingsFormat} with custom values for {@code minBlockSize} and {@code
* maxBlockSize} passed to block terms dictionary.
*
* @see
* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
*/
public Lucene101PostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super("Lucene101");
Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
this.minTermBlockSize = minTermBlockSize;
this.maxTermBlockSize = maxTermBlockSize;
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state);
boolean success = false;
try {
FieldsConsumer ret =
new Lucene90BlockTreeTermsWriter(
state, postingsWriter, minTermBlockSize, maxTermBlockSize);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene101PostingsReader(state);
boolean success = false;
try {
FieldsProducer ret = new Lucene90BlockTreeTermsReader(postingsReader, state);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsReader);
}
}
}
/**
* Holds all state required for {@link Lucene101PostingsReader} to produce a {@link
* org.apache.lucene.index.PostingsEnum} without re-seeking the terms dict.
*
* @lucene.internal
*/
public static final class IntBlockTermState extends BlockTermState {
/** file pointer to the start of the doc ids enumeration, in {@link #DOC_EXTENSION} file */
public long docStartFP;
/** file pointer to the start of the positions enumeration, in {@link #POS_EXTENSION} file */
public long posStartFP;
/** file pointer to the start of the payloads enumeration, in {@link #PAY_EXTENSION} file */
public long payStartFP;
/**
* file offset for the last position in the last block, if there are more than {@link
* ForUtil#BLOCK_SIZE} positions; otherwise -1
*
* <p>One might think to use total term frequency to track how many positions are left to read
* as we decode the blocks, and decode the last block differently when num_left_positions &lt;
* BLOCK_SIZE. Unfortunately this won't work since the tracking will be messed up when we skip
* blocks as the skipper will only tell us new position offset (start of block) and number of
* positions to skip for that block, without telling us how many positions it has skipped.
*/
public long lastPosBlockOffset;
/**
* docid when there is a single pulsed posting, otherwise -1. freq is always implicitly
* totalTermFreq in this case.
*/
public int singletonDocID;
/** Sole constructor. */
public IntBlockTermState() {
lastPosBlockOffset = -1;
singletonDocID = -1;
}
@Override
public IntBlockTermState clone() {
IntBlockTermState other = new IntBlockTermState();
other.copyFrom(this);
return other;
}
@Override
public void copyFrom(TermState _other) {
super.copyFrom(_other);
IntBlockTermState other = (IntBlockTermState) _other;
docStartFP = other.docStartFP;
posStartFP = other.posStartFP;
payStartFP = other.payStartFP;
lastPosBlockOffset = other.lastPosBlockOffset;
singletonDocID = other.singletonDocID;
}
@Override
public String toString() {
return super.toString()
+ " docStartFP="
+ docStartFP
+ " posStartFP="
+ posStartFP
+ " payStartFP="
+ payStartFP
+ " lastPosBlockOffset="
+ lastPosBlockOffset
+ " singletonDocID="
+ singletonDocID;
}
}
}
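The DocDelta encoding described in the Frequencies chapter of the javadoc above can be checked
with a few lines of arithmetic. This is an illustrative sketch of the VInt case only (it does not
use the actual writer, and only reproduces the worked example from the documentation):

// Reproduces the worked example from the javadoc: a term occurring once in doc 7 and three times
// in doc 11 is written as the VInts 15, 8, 3 when freqs are indexed, or 7, 4 when freqs are
// omitted (IndexOptions.DOCS).
import java.util.ArrayList;
import java.util.List;

public class DocDeltaSketch {
  static List<Integer> encode(int[] docs, int[] freqs, boolean writeFreqs) {
    List<Integer> out = new ArrayList<>();
    int lastDoc = 0;
    for (int i = 0; i < docs.length; i++) {
      int delta = docs[i] - lastDoc;
      lastDoc = docs[i];
      if (writeFreqs == false) {
        out.add(delta); // gap only, not multiplied by 2
      } else if (freqs[i] == 1) {
        out.add(delta * 2 + 1); // odd DocDelta means freq == 1
      } else {
        out.add(delta * 2); // even DocDelta: the freq follows as another VInt
        out.add(freqs[i]);
      }
    }
    return out;
  }

  public static void main(String[] args) {
    System.out.println(encode(new int[] {7, 11}, new int[] {1, 3}, true));  // [15, 8, 3]
    System.out.println(encode(new int[] {7, 11}, new int[] {1, 3}, false)); // [7, 4]
  }
}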

View File

@ -0,0 +1,681 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene101;
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.DOC_CODEC;
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.LEVEL1_MASK;
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.META_CODEC;
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.PAY_CODEC;
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.POS_CODEC;
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.VERSION_CURRENT;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.PushPostingsWriterBase;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
/** Writer for {@link Lucene101PostingsFormat}. */
public class Lucene101PostingsWriter extends PushPostingsWriterBase {
static final IntBlockTermState EMPTY_STATE = new IntBlockTermState();
IndexOutput metaOut;
IndexOutput docOut;
IndexOutput posOut;
IndexOutput payOut;
IntBlockTermState lastState;
// Holds starting file pointers for current term:
private long docStartFP;
private long posStartFP;
private long payStartFP;
final int[] docDeltaBuffer;
final int[] freqBuffer;
private int docBufferUpto;
final int[] posDeltaBuffer;
final int[] payloadLengthBuffer;
final int[] offsetStartDeltaBuffer;
final int[] offsetLengthBuffer;
private int posBufferUpto;
private byte[] payloadBytes;
private int payloadByteUpto;
private int level0LastDocID;
private long level0LastPosFP;
private long level0LastPayFP;
private int level1LastDocID;
private long level1LastPosFP;
private long level1LastPayFP;
private int docID;
private int lastDocID;
private int lastPosition;
private int lastStartOffset;
private int docCount;
private final PForUtil pforUtil;
private final ForDeltaUtil forDeltaUtil;
private boolean fieldHasNorms;
private NumericDocValues norms;
private final CompetitiveImpactAccumulator level0FreqNormAccumulator =
new CompetitiveImpactAccumulator();
private final CompetitiveImpactAccumulator level1CompetitiveFreqNormAccumulator =
new CompetitiveImpactAccumulator();
private int maxNumImpactsAtLevel0;
private int maxImpactNumBytesAtLevel0;
private int maxNumImpactsAtLevel1;
private int maxImpactNumBytesAtLevel1;
/** Scratch output that we use to be able to prepend the encoded length, e.g. impacts. */
private final ByteBuffersDataOutput scratchOutput = ByteBuffersDataOutput.newResettableInstance();
/**
* Output for a single block. This is useful to be able to prepend skip data before each block,
* which can only be computed once the block is encoded. The content is then typically copied to
* {@link #level1Output}.
*/
private final ByteBuffersDataOutput level0Output = ByteBuffersDataOutput.newResettableInstance();
/**
* Output for groups of 32 blocks. This is useful to prepend skip data for these 32 blocks, which
* can only be done once we have encoded these 32 blocks. The content is then typically copied to
* {@link #docOut}.
*/
private final ByteBuffersDataOutput level1Output = ByteBuffersDataOutput.newResettableInstance();
/** Sole constructor. */
public Lucene101PostingsWriter(SegmentWriteState state) throws IOException {
String metaFileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.META_EXTENSION);
String docFileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.DOC_EXTENSION);
metaOut = state.directory.createOutput(metaFileName, state.context);
IndexOutput posOut = null;
IndexOutput payOut = null;
boolean success = false;
try {
docOut = state.directory.createOutput(docFileName, state.context);
CodecUtil.writeIndexHeader(
metaOut, META_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.writeIndexHeader(
docOut, DOC_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
forDeltaUtil = new ForDeltaUtil();
pforUtil = new PForUtil();
if (state.fieldInfos.hasProx()) {
posDeltaBuffer = new int[BLOCK_SIZE];
String posFileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.POS_EXTENSION);
posOut = state.directory.createOutput(posFileName, state.context);
CodecUtil.writeIndexHeader(
posOut, POS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
if (state.fieldInfos.hasPayloads()) {
payloadBytes = new byte[128];
payloadLengthBuffer = new int[BLOCK_SIZE];
} else {
payloadBytes = null;
payloadLengthBuffer = null;
}
if (state.fieldInfos.hasOffsets()) {
offsetStartDeltaBuffer = new int[BLOCK_SIZE];
offsetLengthBuffer = new int[BLOCK_SIZE];
} else {
offsetStartDeltaBuffer = null;
offsetLengthBuffer = null;
}
if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
String payFileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name,
state.segmentSuffix,
Lucene101PostingsFormat.PAY_EXTENSION);
payOut = state.directory.createOutput(payFileName, state.context);
CodecUtil.writeIndexHeader(
payOut, PAY_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
}
} else {
posDeltaBuffer = null;
payloadLengthBuffer = null;
offsetStartDeltaBuffer = null;
offsetLengthBuffer = null;
payloadBytes = null;
}
this.payOut = payOut;
this.posOut = posOut;
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(metaOut, docOut, posOut, payOut);
}
}
docDeltaBuffer = new int[BLOCK_SIZE];
freqBuffer = new int[BLOCK_SIZE];
}
@Override
public IntBlockTermState newTermState() {
return new IntBlockTermState();
}
@Override
public void init(IndexOutput termsOut, SegmentWriteState state) throws IOException {
CodecUtil.writeIndexHeader(
termsOut, TERMS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
termsOut.writeVInt(BLOCK_SIZE);
}
@Override
public void setField(FieldInfo fieldInfo) {
super.setField(fieldInfo);
lastState = EMPTY_STATE;
fieldHasNorms = fieldInfo.hasNorms();
}
@Override
public void startTerm(NumericDocValues norms) {
docStartFP = docOut.getFilePointer();
if (writePositions) {
posStartFP = posOut.getFilePointer();
level1LastPosFP = level0LastPosFP = posStartFP;
if (writePayloads || writeOffsets) {
payStartFP = payOut.getFilePointer();
level1LastPayFP = level0LastPayFP = payStartFP;
}
}
lastDocID = -1;
level0LastDocID = -1;
level1LastDocID = -1;
this.norms = norms;
if (writeFreqs) {
level0FreqNormAccumulator.clear();
}
}
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
if (docBufferUpto == BLOCK_SIZE) {
flushDocBlock(false);
docBufferUpto = 0;
}
final int docDelta = docID - lastDocID;
if (docID < 0 || docDelta <= 0) {
throw new CorruptIndexException(
"docs out of order (" + docID + " <= " + lastDocID + " )", docOut);
}
docDeltaBuffer[docBufferUpto] = docDelta;
if (writeFreqs) {
freqBuffer[docBufferUpto] = termDocFreq;
}
this.docID = docID;
lastPosition = 0;
lastStartOffset = 0;
if (writeFreqs) {
long norm;
if (fieldHasNorms) {
boolean found = norms.advanceExact(docID);
if (found == false) {
// This can happen if indexing hits a problem after adding a doc to the
// postings but before buffering the norm. Such documents are written
// deleted and will go away on the first merge.
norm = 1L;
} else {
norm = norms.longValue();
assert norm != 0 : docID;
}
} else {
norm = 1L;
}
level0FreqNormAccumulator.add(termDocFreq, norm);
}
}
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset)
throws IOException {
if (position > IndexWriter.MAX_POSITION) {
throw new CorruptIndexException(
"position="
+ position
+ " is too large (> IndexWriter.MAX_POSITION="
+ IndexWriter.MAX_POSITION
+ ")",
docOut);
}
if (position < 0) {
throw new CorruptIndexException("position=" + position + " is < 0", docOut);
}
posDeltaBuffer[posBufferUpto] = position - lastPosition;
if (writePayloads) {
if (payload == null || payload.length == 0) {
// no payload
payloadLengthBuffer[posBufferUpto] = 0;
} else {
payloadLengthBuffer[posBufferUpto] = payload.length;
if (payloadByteUpto + payload.length > payloadBytes.length) {
payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length);
}
System.arraycopy(
payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length);
payloadByteUpto += payload.length;
}
}
if (writeOffsets) {
assert startOffset >= lastStartOffset;
assert endOffset >= startOffset;
offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset;
offsetLengthBuffer[posBufferUpto] = endOffset - startOffset;
lastStartOffset = startOffset;
}
posBufferUpto++;
lastPosition = position;
if (posBufferUpto == BLOCK_SIZE) {
pforUtil.encode(posDeltaBuffer, posOut);
if (writePayloads) {
pforUtil.encode(payloadLengthBuffer, payOut);
payOut.writeVInt(payloadByteUpto);
payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
payloadByteUpto = 0;
}
if (writeOffsets) {
pforUtil.encode(offsetStartDeltaBuffer, payOut);
pforUtil.encode(offsetLengthBuffer, payOut);
}
posBufferUpto = 0;
}
}
@Override
public void finishDoc() {
docBufferUpto++;
docCount++;
lastDocID = docID;
}
/**
* Special vints that are encoded on 2 bytes if they require 15 bits or less. VInt becomes
* especially slow when the number of bytes is variable, so this special layout helps in the case
* when the number likely requires 15 bits or less
*/
static void writeVInt15(DataOutput out, int v) throws IOException {
assert v >= 0;
writeVLong15(out, v);
}
/**
* @see #writeVInt15(DataOutput, int)
*/
static void writeVLong15(DataOutput out, long v) throws IOException {
assert v >= 0;
if ((v & ~0x7FFFL) == 0) {
out.writeShort((short) v);
} else {
out.writeShort((short) (0x8000 | (v & 0x7FFF)));
out.writeVLong(v >> 15);
}
}
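// For example, writeVLong15(out, 0x1234) fits in 15 bits and is written as the single short
// 0x1234, while writeVLong15(out, 0x12345) is written as the short 0xA345 (0x8000 | the low 15
// bits 0x2345) followed by the vLong 0x2 holding the remaining high bits.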
private void flushDocBlock(boolean finishTerm) throws IOException {
assert docBufferUpto != 0;
if (docBufferUpto < BLOCK_SIZE) {
assert finishTerm;
PostingsUtil.writeVIntBlock(
level0Output, docDeltaBuffer, freqBuffer, docBufferUpto, writeFreqs);
} else {
if (writeFreqs) {
List<Impact> impacts = level0FreqNormAccumulator.getCompetitiveFreqNormPairs();
if (impacts.size() > maxNumImpactsAtLevel0) {
maxNumImpactsAtLevel0 = impacts.size();
}
writeImpacts(impacts, scratchOutput);
assert level0Output.size() == 0;
if (scratchOutput.size() > maxImpactNumBytesAtLevel0) {
maxImpactNumBytesAtLevel0 = Math.toIntExact(scratchOutput.size());
}
level0Output.writeVLong(scratchOutput.size());
scratchOutput.copyTo(level0Output);
scratchOutput.reset();
if (writePositions) {
level0Output.writeVLong(posOut.getFilePointer() - level0LastPosFP);
level0Output.writeByte((byte) posBufferUpto);
level0LastPosFP = posOut.getFilePointer();
if (writeOffsets || writePayloads) {
level0Output.writeVLong(payOut.getFilePointer() - level0LastPayFP);
level0Output.writeVInt(payloadByteUpto);
level0LastPayFP = payOut.getFilePointer();
}
}
}
long numSkipBytes = level0Output.size();
forDeltaUtil.encodeDeltas(docDeltaBuffer, level0Output);
if (writeFreqs) {
pforUtil.encode(freqBuffer, level0Output);
}
// docID - lastBlockDocID is at least 128, so it can never fit in a single byte as a vint.
// Even if we subtracted 128, only extremely dense blocks would be eligible for a single byte,
// so let's go with 2 bytes right away.
writeVInt15(scratchOutput, docID - level0LastDocID);
writeVLong15(scratchOutput, level0Output.size());
numSkipBytes += scratchOutput.size();
level1Output.writeVLong(numSkipBytes);
scratchOutput.copyTo(level1Output);
scratchOutput.reset();
}
level0Output.copyTo(level1Output);
level0Output.reset();
level0LastDocID = docID;
if (writeFreqs) {
level1CompetitiveFreqNormAccumulator.addAll(level0FreqNormAccumulator);
level0FreqNormAccumulator.clear();
}
if ((docCount & LEVEL1_MASK) == 0) { // true every 32 blocks (4,096 docs)
writeLevel1SkipData();
level1LastDocID = docID;
level1CompetitiveFreqNormAccumulator.clear();
} else if (finishTerm) {
level1Output.copyTo(docOut);
level1Output.reset();
level1CompetitiveFreqNormAccumulator.clear();
}
}
private void writeLevel1SkipData() throws IOException {
docOut.writeVInt(docID - level1LastDocID);
final long level1End;
if (writeFreqs) {
List<Impact> impacts = level1CompetitiveFreqNormAccumulator.getCompetitiveFreqNormPairs();
if (impacts.size() > maxNumImpactsAtLevel1) {
maxNumImpactsAtLevel1 = impacts.size();
}
writeImpacts(impacts, scratchOutput);
long numImpactBytes = scratchOutput.size();
if (numImpactBytes > maxImpactNumBytesAtLevel1) {
maxImpactNumBytesAtLevel1 = Math.toIntExact(numImpactBytes);
}
if (writePositions) {
scratchOutput.writeVLong(posOut.getFilePointer() - level1LastPosFP);
scratchOutput.writeByte((byte) posBufferUpto);
level1LastPosFP = posOut.getFilePointer();
if (writeOffsets || writePayloads) {
scratchOutput.writeVLong(payOut.getFilePointer() - level1LastPayFP);
scratchOutput.writeVInt(payloadByteUpto);
level1LastPayFP = payOut.getFilePointer();
}
}
final long level1Len = 2 * Short.BYTES + scratchOutput.size() + level1Output.size();
docOut.writeVLong(level1Len);
level1End = docOut.getFilePointer() + level1Len;
// There are at most 128 impacts, that require at most 2 bytes each
assert numImpactBytes <= Short.MAX_VALUE;
// Like impacts plus a few vlongs, still way under the max short value
assert scratchOutput.size() + Short.BYTES <= Short.MAX_VALUE;
docOut.writeShort((short) (scratchOutput.size() + Short.BYTES));
docOut.writeShort((short) numImpactBytes);
scratchOutput.copyTo(docOut);
scratchOutput.reset();
} else {
docOut.writeVLong(level1Output.size());
level1End = docOut.getFilePointer() + level1Output.size();
}
level1Output.copyTo(docOut);
level1Output.reset();
assert docOut.getFilePointer() == level1End : docOut.getFilePointer() + " " + level1End;
}
static void writeImpacts(Collection<Impact> impacts, DataOutput out) throws IOException {
Impact previous = new Impact(0, 0);
for (Impact impact : impacts) {
assert impact.freq > previous.freq;
assert Long.compareUnsigned(impact.norm, previous.norm) > 0;
int freqDelta = impact.freq - previous.freq - 1;
long normDelta = impact.norm - previous.norm - 1;
if (normDelta == 0) {
// most of the time, norm only increases by 1, so we can fold everything into a single byte
out.writeVInt(freqDelta << 1);
} else {
out.writeVInt((freqDelta << 1) | 1);
out.writeZLong(normDelta);
}
previous = impact;
}
}
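// For example, the pairs (freq=1, norm=1), (freq=3, norm=2), (freq=7, norm=10) are encoded as
// vInt 0 (freqDelta=0, norm up by exactly 1), vInt 2 (freqDelta=1, norm up by exactly 1), then
// vInt 7 and zLong 7 (freqDelta=3 with the low bit set, followed by normDelta=7).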
/** Called when we are done adding docs to this term */
@Override
public void finishTerm(BlockTermState _state) throws IOException {
IntBlockTermState state = (IntBlockTermState) _state;
assert state.docFreq > 0;
// TODO: wasteful we are counting this (counting # docs
// for this term) in two places?
assert state.docFreq == docCount : state.docFreq + " vs " + docCount;
// docFreq == 1, don't write the single docid/freq to a separate file along with a pointer to
// it.
final int singletonDocID;
if (state.docFreq == 1) {
// pulse the singleton docid into the term dictionary, freq is implicitly totalTermFreq
singletonDocID = docDeltaBuffer[0] - 1;
} else {
singletonDocID = -1;
flushDocBlock(true);
}
final long lastPosBlockOffset;
if (writePositions) {
// totalTermFreq is just the total number of positions (or payloads, or offsets)
// associated with current term.
assert state.totalTermFreq != -1;
if (state.totalTermFreq > BLOCK_SIZE) {
// record file offset for last pos in last block
lastPosBlockOffset = posOut.getFilePointer() - posStartFP;
} else {
lastPosBlockOffset = -1;
}
if (posBufferUpto > 0) {
assert posBufferUpto < BLOCK_SIZE;
// TODO: should we send offsets/payloads to
// .pay...? seems wasteful (have to store extra
// vLong for low (< BLOCK_SIZE) DF terms = vast vast
// majority)
// vInt encode the remaining positions/payloads/offsets:
int lastPayloadLength = -1; // force first payload length to be written
int lastOffsetLength = -1; // force first offset length to be written
int payloadBytesReadUpto = 0;
for (int i = 0; i < posBufferUpto; i++) {
final int posDelta = posDeltaBuffer[i];
if (writePayloads) {
final int payloadLength = payloadLengthBuffer[i];
if (payloadLength != lastPayloadLength) {
lastPayloadLength = payloadLength;
posOut.writeVInt((posDelta << 1) | 1);
posOut.writeVInt(payloadLength);
} else {
posOut.writeVInt(posDelta << 1);
}
if (payloadLength != 0) {
posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength);
payloadBytesReadUpto += payloadLength;
}
} else {
posOut.writeVInt(posDelta);
}
if (writeOffsets) {
int delta = offsetStartDeltaBuffer[i];
int length = offsetLengthBuffer[i];
if (length == lastOffsetLength) {
posOut.writeVInt(delta << 1);
} else {
posOut.writeVInt(delta << 1 | 1);
posOut.writeVInt(length);
lastOffsetLength = length;
}
}
}
if (writePayloads) {
assert payloadBytesReadUpto == payloadByteUpto;
payloadByteUpto = 0;
}
}
} else {
lastPosBlockOffset = -1;
}
state.docStartFP = docStartFP;
state.posStartFP = posStartFP;
state.payStartFP = payStartFP;
state.singletonDocID = singletonDocID;
state.lastPosBlockOffset = lastPosBlockOffset;
docBufferUpto = 0;
posBufferUpto = 0;
lastDocID = -1;
docCount = 0;
}
@Override
public void encodeTerm(
DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute)
throws IOException {
IntBlockTermState state = (IntBlockTermState) _state;
if (absolute) {
lastState = EMPTY_STATE;
assert lastState.docStartFP == 0;
}
if (lastState.singletonDocID != -1
&& state.singletonDocID != -1
&& state.docStartFP == lastState.docStartFP) {
// With runs of rare values such as ID fields, the increment of pointers in the docs file is
// often 0.
// Furthermore some ID schemes like auto-increment IDs or Flake IDs are monotonic, so we
// encode the delta
// between consecutive doc IDs to save space.
final long delta = (long) state.singletonDocID - lastState.singletonDocID;
out.writeVLong((BitUtil.zigZagEncode(delta) << 1) | 0x01);
} else {
out.writeVLong((state.docStartFP - lastState.docStartFP) << 1);
if (state.singletonDocID != -1) {
out.writeVInt(state.singletonDocID);
}
}
if (writePositions) {
out.writeVLong(state.posStartFP - lastState.posStartFP);
if (writePayloads || writeOffsets) {
out.writeVLong(state.payStartFP - lastState.payStartFP);
}
}
if (writePositions) {
if (state.lastPosBlockOffset != -1) {
out.writeVLong(state.lastPosBlockOffset);
}
}
lastState = state;
}
@Override
public void close() throws IOException {
// TODO: add a finish() at least to PushBase? DV too...?
boolean success = false;
try {
if (docOut != null) {
CodecUtil.writeFooter(docOut);
}
if (posOut != null) {
CodecUtil.writeFooter(posOut);
}
if (payOut != null) {
CodecUtil.writeFooter(payOut);
}
if (metaOut != null) {
metaOut.writeInt(maxNumImpactsAtLevel0);
metaOut.writeInt(maxImpactNumBytesAtLevel0);
metaOut.writeInt(maxNumImpactsAtLevel1);
metaOut.writeInt(maxImpactNumBytesAtLevel1);
metaOut.writeLong(docOut.getFilePointer());
if (posOut != null) {
metaOut.writeLong(posOut.getFilePointer());
if (payOut != null) {
metaOut.writeLong(payOut.getFilePointer());
}
}
CodecUtil.writeFooter(metaOut);
}
success = true;
} finally {
if (success) {
IOUtils.close(metaOut, docOut, posOut, payOut);
} else {
IOUtils.closeWhileHandlingException(metaOut, docOut, posOut, payOut);
}
metaOut = docOut = posOut = payOut = null;
}
}
}

View File

@ -0,0 +1,134 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene101;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.LongHeap;
import org.apache.lucene.util.packed.PackedInts;
/** Utility class to encode sequences of 128 small positive integers. */
final class PForUtil {
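// On-disk layout, as implemented by encode() and decode() below: a single token byte whose high
// 3 bits hold the number of patched exceptions (0-7) and whose low 5 bits hold the number of bits
// per packed value (0 means all 128 values are equal and a single vInt follows instead of packed
// data), then the packed values, then one (index, high bits) byte pair per exception.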
private static final int MAX_EXCEPTIONS = 7;
static boolean allEqual(int[] l) {
for (int i = 1; i < ForUtil.BLOCK_SIZE; ++i) {
if (l[i] != l[0]) {
return false;
}
}
return true;
}
private final ForUtil forUtil = new ForUtil();
static {
assert ForUtil.BLOCK_SIZE <= 256 : "blocksize must fit in one byte. got " + ForUtil.BLOCK_SIZE;
}
/** Encode 128 integers from {@code ints} into {@code out}. */
void encode(int[] ints, DataOutput out) throws IOException {
// Determine the top MAX_EXCEPTIONS + 1 values
final LongHeap top = new LongHeap(MAX_EXCEPTIONS + 1);
for (int i = 0; i <= MAX_EXCEPTIONS; ++i) {
top.push(ints[i]);
}
long topValue = top.top();
for (int i = MAX_EXCEPTIONS + 1; i < ForUtil.BLOCK_SIZE; ++i) {
if (ints[i] > topValue) {
topValue = top.updateTop(ints[i]);
}
}
long max = 0L;
for (int i = 1; i <= top.size(); ++i) {
max = Math.max(max, top.get(i));
}
final int maxBitsRequired = PackedInts.bitsRequired(max);
// We store the patch on a byte, so we can't decrease the number of bits required by more than 8
final int patchedBitsRequired =
Math.max(PackedInts.bitsRequired(topValue), maxBitsRequired - 8);
int numExceptions = 0;
final long maxUnpatchedValue = (1L << patchedBitsRequired) - 1;
for (int i = 2; i <= top.size(); ++i) {
if (top.get(i) > maxUnpatchedValue) {
numExceptions++;
}
}
final byte[] exceptions = new byte[numExceptions * 2];
if (numExceptions > 0) {
int exceptionCount = 0;
for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
if (ints[i] > maxUnpatchedValue) {
exceptions[exceptionCount * 2] = (byte) i;
exceptions[exceptionCount * 2 + 1] = (byte) (ints[i] >>> patchedBitsRequired);
ints[i] &= maxUnpatchedValue;
exceptionCount++;
}
}
assert exceptionCount == numExceptions : exceptionCount + " " + numExceptions;
}
if (allEqual(ints) && maxBitsRequired <= 8) {
for (int i = 0; i < numExceptions; ++i) {
exceptions[2 * i + 1] =
(byte) (Byte.toUnsignedLong(exceptions[2 * i + 1]) << patchedBitsRequired);
}
out.writeByte((byte) (numExceptions << 5));
out.writeVInt(ints[0]);
} else {
final int token = (numExceptions << 5) | patchedBitsRequired;
out.writeByte((byte) token);
forUtil.encode(ints, patchedBitsRequired, out);
}
out.writeBytes(exceptions, exceptions.length);
}
/** Decode 128 integers into {@code ints}. */
void decode(PostingDecodingUtil pdu, int[] ints) throws IOException {
var in = pdu.in;
final int token = Byte.toUnsignedInt(in.readByte());
final int bitsPerValue = token & 0x1f;
if (bitsPerValue == 0) {
Arrays.fill(ints, 0, ForUtil.BLOCK_SIZE, in.readVInt());
} else {
forUtil.decode(bitsPerValue, pdu, ints);
}
final int numExceptions = token >>> 5;
for (int i = 0; i < numExceptions; ++i) {
ints[Byte.toUnsignedInt(in.readByte())] |= Byte.toUnsignedLong(in.readByte()) << bitsPerValue;
}
}
/** Skip 128 integers. */
static void skip(DataInput in) throws IOException {
final int token = Byte.toUnsignedInt(in.readByte());
final int bitsPerValue = token & 0x1f;
final int numExceptions = token >>> 5;
if (bitsPerValue == 0) {
in.readVLong();
in.skipBytes((numExceptions << 1));
} else {
in.skipBytes(ForUtil.numBytes(bitsPerValue) + (numExceptions << 1));
}
}
}

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.codecs.lucene101;
import java.io.IOException;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
@ -42,16 +42,16 @@ public final class PostingIndexInput {
this.postingDecodingUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(in);
}
/** Decode 128 integers stored on {@code bitsPerValues} bits per value into {@code longs}. */
public void decode(int bitsPerValue, long[] longs) throws IOException {
forUtil.decode(bitsPerValue, postingDecodingUtil, longs);
/** Decode 128 integers stored on {@code bitsPerValues} bits per value into {@code ints}. */
public void decode(int bitsPerValue, int[] ints) throws IOException {
forUtil.decode(bitsPerValue, postingDecodingUtil, ints);
}
/**
* Decode 128 integers stored on {@code bitsPerValues} bits per value, compute their prefix sum,
* and store results into {@code longs}.
* and store results into {@code ints}.
*/
public void decodeAndPrefixSum(int bitsPerValue, long base, long[] longs) throws IOException {
forDeltaUtil.decodeAndPrefixSum(bitsPerValue, postingDecodingUtil, base, longs);
public void decodeAndPrefixSum(int bitsPerValue, int base, int[] ints) throws IOException {
forDeltaUtil.decodeAndPrefixSum(bitsPerValue, postingDecodingUtil, base, ints);
}
}

View File

@ -0,0 +1,74 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene101;
import java.io.IOException;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.GroupVIntUtil;
/** Utility class to encode/decode postings block. */
final class PostingsUtil {
/**
* Read values that have been written using variable-length encoding and group-varint encoding
* instead of bit-packing.
*/
static void readVIntBlock(
IndexInput docIn,
int[] docBuffer,
int[] freqBuffer,
int num,
boolean indexHasFreq,
boolean decodeFreq)
throws IOException {
GroupVIntUtil.readGroupVInts(docIn, docBuffer, num);
if (indexHasFreq && decodeFreq) {
for (int i = 0; i < num; ++i) {
freqBuffer[i] = docBuffer[i] & 0x01;
docBuffer[i] >>>= 1;
if (freqBuffer[i] == 0) {
freqBuffer[i] = docIn.readVInt();
}
}
} else if (indexHasFreq) {
for (int i = 0; i < num; ++i) {
docBuffer[i] >>>= 1;
}
}
}
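// Layout shared by readVIntBlock and writeVIntBlock when freqs are indexed: each doc delta is
// shifted left by one and its low bit is set when freq == 1; when the low bit is clear, the
// actual freq follows the group-varint encoded deltas as a vInt. For example, (delta=5, freq=1)
// is stored as 11, while (delta=5, freq=3) is stored as 10 followed by vInt 3.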
/** Write freq buffer with variable-length encoding and doc buffer with group-varint encoding. */
static void writeVIntBlock(
DataOutput docOut, int[] docBuffer, int[] freqBuffer, int num, boolean writeFreqs)
throws IOException {
if (writeFreqs) {
for (int i = 0; i < num; i++) {
docBuffer[i] = (docBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0);
}
}
docOut.writeGroupVInts(docBuffer, num);
if (writeFreqs) {
for (int i = 0; i < num; i++) {
final int freq = freqBuffer[i];
if (freq != 1) {
docOut.writeVInt(freq);
}
}
}
}
}

View File

@ -0,0 +1,377 @@
#! /usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from math import gcd
"""Code generation for ForDeltaUtil.java"""
MAX_SPECIALIZED_BITS_PER_VALUE = 24
OUTPUT_FILE = "ForDeltaUtil.java"
PRIMITIVE_SIZE = [8, 16, 32]
HEADER = """// This file has been automatically generated, DO NOT EDIT
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene101;
import java.io.IOException;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.packed.PackedInts;
import static org.apache.lucene.codecs.lucene101.ForUtil.*;
/**
* Inspired from https://fulmicoton.com/posts/bitpacking/
* Encodes multiple integers in a Java int to get SIMD-like speedups.
* If bitsPerValue &lt;= 3 then we pack 4 ints per Java int
* else if bitsPerValue &lt;= 10 we pack 2 ints per Java int
* else we use scalar operations.
*/
public final class ForDeltaUtil {
private static final int HALF_BLOCK_SIZE = BLOCK_SIZE / 2;
private static final int ONE_BLOCK_SIZE_FOURTH = BLOCK_SIZE / 4;
private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2;
private static final int THREE_BLOCK_SIZE_FOURTHS = 3 * BLOCK_SIZE / 4;
// IDENTITY_PLUS_ONE[i] == i+1
private static final int[] IDENTITY_PLUS_ONE = new int[ForUtil.BLOCK_SIZE];
static {
for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
IDENTITY_PLUS_ONE[i] = i + 1;
}
}
private static void prefixSumOfOnes(int[] arr, int base) {
System.arraycopy(IDENTITY_PLUS_ONE, 0, arr, 0, ForUtil.BLOCK_SIZE);
// This loop gets auto-vectorized
for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
arr[i] += base;
}
}
private static void prefixSum8(int[] arr, int base) {
// When the number of bits per value is 3 or less, we can sum up each quarter of the block (32
// values of at most 7) without risking overflowing an 8-bit integer. This allows computing the
// prefix sum by summing up 4 values at once.
innerPrefixSum8(arr);
expand8(arr);
final int l0 = base;
final int l1 = l0 + arr[ONE_BLOCK_SIZE_FOURTH - 1];
final int l2 = l1 + arr[TWO_BLOCK_SIZE_FOURTHS - 1];
final int l3 = l2 + arr[THREE_BLOCK_SIZE_FOURTHS - 1];
for (int i = 0; i < ONE_BLOCK_SIZE_FOURTH; ++i) {
arr[i] += l0;
arr[ONE_BLOCK_SIZE_FOURTH + i] += l1;
arr[TWO_BLOCK_SIZE_FOURTHS + i] += l2;
arr[THREE_BLOCK_SIZE_FOURTHS + i] += l3;
}
}
private static void prefixSum16(int[] arr, int base) {
// When the number of bits per value is 10 or less, we can sum up each half of the block (64
// values of at most 1023) without risking overflowing a 16-bit integer. This allows computing the
// prefix sum by summing up 2 values at once.
innerPrefixSum16(arr);
expand16(arr);
final int l0 = base;
final int l1 = base + arr[HALF_BLOCK_SIZE - 1];
for (int i = 0; i < HALF_BLOCK_SIZE; ++i) {
arr[i] += l0;
arr[HALF_BLOCK_SIZE + i] += l1;
}
}
private static void prefixSum32(int[] arr, int base) {
arr[0] += base;
for (int i = 1; i < BLOCK_SIZE; ++i) {
arr[i] += arr[i-1];
}
}
// For some reason unrolling seems to help
private static void innerPrefixSum8(int[] arr) {
arr[1] += arr[0];
arr[2] += arr[1];
arr[3] += arr[2];
arr[4] += arr[3];
arr[5] += arr[4];
arr[6] += arr[5];
arr[7] += arr[6];
arr[8] += arr[7];
arr[9] += arr[8];
arr[10] += arr[9];
arr[11] += arr[10];
arr[12] += arr[11];
arr[13] += arr[12];
arr[14] += arr[13];
arr[15] += arr[14];
arr[16] += arr[15];
arr[17] += arr[16];
arr[18] += arr[17];
arr[19] += arr[18];
arr[20] += arr[19];
arr[21] += arr[20];
arr[22] += arr[21];
arr[23] += arr[22];
arr[24] += arr[23];
arr[25] += arr[24];
arr[26] += arr[25];
arr[27] += arr[26];
arr[28] += arr[27];
arr[29] += arr[28];
arr[30] += arr[29];
arr[31] += arr[30];
}
// For some reason unrolling seems to help
private static void innerPrefixSum16(int[] arr) {
arr[1] += arr[0];
arr[2] += arr[1];
arr[3] += arr[2];
arr[4] += arr[3];
arr[5] += arr[4];
arr[6] += arr[5];
arr[7] += arr[6];
arr[8] += arr[7];
arr[9] += arr[8];
arr[10] += arr[9];
arr[11] += arr[10];
arr[12] += arr[11];
arr[13] += arr[12];
arr[14] += arr[13];
arr[15] += arr[14];
arr[16] += arr[15];
arr[17] += arr[16];
arr[18] += arr[17];
arr[19] += arr[18];
arr[20] += arr[19];
arr[21] += arr[20];
arr[22] += arr[21];
arr[23] += arr[22];
arr[24] += arr[23];
arr[25] += arr[24];
arr[26] += arr[25];
arr[27] += arr[26];
arr[28] += arr[27];
arr[29] += arr[28];
arr[30] += arr[29];
arr[31] += arr[30];
arr[32] += arr[31];
arr[33] += arr[32];
arr[34] += arr[33];
arr[35] += arr[34];
arr[36] += arr[35];
arr[37] += arr[36];
arr[38] += arr[37];
arr[39] += arr[38];
arr[40] += arr[39];
arr[41] += arr[40];
arr[42] += arr[41];
arr[43] += arr[42];
arr[44] += arr[43];
arr[45] += arr[44];
arr[46] += arr[45];
arr[47] += arr[46];
arr[48] += arr[47];
arr[49] += arr[48];
arr[50] += arr[49];
arr[51] += arr[50];
arr[52] += arr[51];
arr[53] += arr[52];
arr[54] += arr[53];
arr[55] += arr[54];
arr[56] += arr[55];
arr[57] += arr[56];
arr[58] += arr[57];
arr[59] += arr[58];
arr[60] += arr[59];
arr[61] += arr[60];
arr[62] += arr[61];
arr[63] += arr[62];
}
private final int[] tmp = new int[BLOCK_SIZE];
/**
* Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code
* ints} are expected to be deltas between consecutive values.
*/
void encodeDeltas(int[] ints, DataOutput out) throws IOException {
if (ints[0] == 1 && PForUtil.allEqual(ints)) { // happens with very dense postings
out.writeByte((byte) 0);
} else {
int or = 0;
for (int l : ints) {
or |= l;
}
assert or != 0;
final int bitsPerValue = PackedInts.bitsRequired(or);
out.writeByte((byte) bitsPerValue);
final int primitiveSize;
if (bitsPerValue <= 3) {
primitiveSize = 8;
collapse8(ints);
} else if (bitsPerValue <= 10) {
primitiveSize = 16;
collapse16(ints);
} else {
primitiveSize = 32;
}
encode(ints, bitsPerValue, primitiveSize, out, tmp);
}
}
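// For example, a block of 128 deltas that are all equal to 1 (fully dense postings) is written as
// the single byte 0; any other block is written as one byte holding bitsPerValue followed by the
// bit-packed deltas.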
/** Decode deltas, compute the prefix sum and add {@code base} to all decoded ints. */
void decodeAndPrefixSum(PostingDecodingUtil pdu, int base, int[] ints) throws IOException {
final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte());
if (bitsPerValue == 0) {
prefixSumOfOnes(ints, base);
} else {
decodeAndPrefixSum(bitsPerValue, pdu, base, ints);
}
}
"""
def primitive_size_for_bpv(bpv):
if bpv <= 3:
# If we have 3 bits per value or less then we can compute the prefix sum of 32 ints that store 4 8-bit values each without overflowing.
return 8
elif bpv <= 10:
# If we have 10 bits per value or less then we can compute the prefix sum of 64 ints that store 2 16-bit values each without overflowing.
return 16
else:
# No risk of overflow with 32 bits per value
return 32
def next_primitive(bpv):
if bpv <= 8:
return 8
elif bpv <= 16:
return 16
else:
return 32
def writeRemainder(bpv, next_primitive, remaining_bits_per_int, o, num_values, f):
iteration = 1
num_ints = bpv * num_values / remaining_bits_per_int
while num_ints % 2 == 0 and num_values % 2 == 0:
num_ints /= 2
num_values /= 2
iteration *= 2
f.write(' for (int iter = 0, tmpIdx = 0, intsIdx = %d; iter < %d; ++iter, tmpIdx += %d, intsIdx += %d) {\n' %(o, iteration, num_ints, num_values))
i = 0
remaining_bits = 0
tmp_idx = 0
for i in range(int(num_values)):
b = bpv
if remaining_bits == 0:
b -= remaining_bits_per_int
f.write(' int l%d = tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b))
else:
b -= remaining_bits
f.write(' int l%d = (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits, b))
tmp_idx += 1
while b >= remaining_bits_per_int:
b -= remaining_bits_per_int
f.write(' l%d |= tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b))
tmp_idx += 1
if b > 0:
f.write(' l%d |= (tmp[tmpIdx + %d] >>> %d) & MASK%d_%d;\n' %(i, tmp_idx, remaining_bits_per_int-b, next_primitive, b))
remaining_bits = remaining_bits_per_int-b
f.write(' ints[intsIdx + %d] = l%d;\n' %(i, i))
f.write(' }\n')
def writeDecode(bpv, f):
next_primitive = primitive_size_for_bpv(bpv)
if next_primitive % bpv == 0:
f.write(' private static void decode%dTo%d(PostingDecodingUtil pdu, int[] ints) throws IOException {\n' %(bpv, next_primitive))
else:
f.write(' private static void decode%dTo%d(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {\n' %(bpv, next_primitive))
if bpv == next_primitive:
f.write(' pdu.in.readInts(ints, 0, %d);\n' %(bpv*4))
else:
num_values_per_int = 32 / next_primitive
remaining_bits = next_primitive % bpv
num_iters = (next_primitive - 1) // bpv
o = 4 * bpv * num_iters
if remaining_bits == 0:
f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, ints, %d, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
else:
f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_int - o, f)
f.write(' }\n')
if __name__ == '__main__':
f = open(OUTPUT_FILE, 'w')
f.write(HEADER)
f.write("""
/**
* Delta-decode 128 integers into {@code ints}.
*/
void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, int base, int[] ints) throws IOException {
switch (bitsPerValue) {
""")
for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
primitive_size = primitive_size_for_bpv(bpv)
f.write(' case %d:\n' %bpv)
if next_primitive(bpv) == primitive_size:
if primitive_size % bpv == 0:
f.write(' decode%d(pdu, ints);\n' %bpv)
else:
f.write(' decode%d(pdu, tmp, ints);\n' %bpv)
else:
if primitive_size % bpv == 0:
f.write(' decode%dTo%d(pdu, ints);\n' %(bpv, primitive_size))
else:
f.write(' decode%dTo%d(pdu, tmp, ints);\n' %(bpv, primitive_size))
f.write(' prefixSum%d(ints, base);\n' %primitive_size)
f.write(' break;\n')
f.write(' default:\n')
f.write(' decodeSlow(bitsPerValue, pdu, tmp, ints);\n')
f.write(' prefixSum32(ints, base);\n')
f.write(' break;\n')
f.write(' }\n')
f.write(' }\n')
f.write('\n')
for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
if next_primitive(bpv) != primitive_size_for_bpv(bpv):
writeDecode(bpv, f)
if bpv < MAX_SPECIALIZED_BITS_PER_VALUE:
f.write('\n')
f.write('}\n')

View File

@ -0,0 +1,327 @@
#! /usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from math import gcd
"""Code generation for ForUtil.java"""
MAX_SPECIALIZED_BITS_PER_VALUE = 24
OUTPUT_FILE = "ForUtil.java"
PRIMITIVE_SIZE = [8, 16, 32]
HEADER = """// This file has been automatically generated, DO NOT EDIT
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene101;
import java.io.IOException;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.DataOutput;
/**
* Inspired from https://fulmicoton.com/posts/bitpacking/
* Encodes multiple integers in one Java int to get SIMD-like speedups.
* If bitsPerValue &lt;= 8 then we pack 4 ints per Java int
* else if bitsPerValue &lt;= 16 we pack 2 ints per Java int
* else we do scalar operations.
*/
public final class ForUtil {
public static final int BLOCK_SIZE = 128;
static final int BLOCK_SIZE_LOG2 = 7;
static int expandMask16(int mask16) {
return mask16 | (mask16 << 16);
}
static int expandMask8(int mask8) {
return expandMask16(mask8 | (mask8 << 8));
}
static int mask32(int bitsPerValue) {
return (1 << bitsPerValue) - 1;
}
static int mask16(int bitsPerValue) {
return expandMask16((1 << bitsPerValue) - 1);
}
static int mask8(int bitsPerValue) {
return expandMask8((1 << bitsPerValue) - 1);
}
static void expand8(int[] arr) {
for (int i = 0; i < 32; ++i) {
int l = arr[i];
arr[i] = (l >>> 24) & 0xFF;
arr[32 + i] = (l >>> 16) & 0xFF;
arr[64 + i] = (l >>> 8) & 0xFF;
arr[96 + i] = l & 0xFF;
}
}
static void collapse8(int[] arr) {
for (int i = 0; i < 32; ++i) {
arr[i] =
(arr[i] << 24)
| (arr[32 + i] << 16)
| (arr[64 + i] << 8)
| arr[96 + i];
}
}
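// collapse8 packs values i, 32 + i, 64 + i and 96 + i into the four bytes of ints[i], from most
// significant to least significant; expand8 is the inverse. collapse16/expand16 below do the same
// with two 16-bit halves (values i and 64 + i).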
static void expand16(int[] arr) {
for (int i = 0; i < 64; ++i) {
int l = arr[i];
arr[i] = (l >>> 16) & 0xFFFF;
arr[64 + i] = l & 0xFFFF;
}
}
static void collapse16(int[] arr) {
for (int i = 0; i < 64; ++i) {
arr[i] = (arr[i] << 16) | arr[64 + i];
}
}
private final int[] tmp = new int[BLOCK_SIZE];
/** Encode 128 integers from {@code ints} into {@code out}. */
void encode(int[] ints, int bitsPerValue, DataOutput out) throws IOException {
final int nextPrimitive;
if (bitsPerValue <= 8) {
nextPrimitive = 8;
collapse8(ints);
} else if (bitsPerValue <= 16) {
nextPrimitive = 16;
collapse16(ints);
} else {
nextPrimitive = 32;
}
encode(ints, bitsPerValue, nextPrimitive, out, tmp);
}
static void encode(int[] ints, int bitsPerValue, int primitiveSize, DataOutput out, int[] tmp) throws IOException {
final int numInts = BLOCK_SIZE * primitiveSize / Integer.SIZE;
final int numIntsPerShift = bitsPerValue * 4;
int idx = 0;
int shift = primitiveSize - bitsPerValue;
for (int i = 0; i < numIntsPerShift; ++i) {
tmp[i] = ints[idx++] << shift;
}
for (shift = shift - bitsPerValue; shift >= 0; shift -= bitsPerValue) {
for (int i = 0; i < numIntsPerShift; ++i) {
tmp[i] |= ints[idx++] << shift;
}
}
final int remainingBitsPerInt = shift + bitsPerValue;
final int maskRemainingBitsPerInt;
if (primitiveSize == 8) {
maskRemainingBitsPerInt = MASKS8[remainingBitsPerInt];
} else if (primitiveSize == 16) {
maskRemainingBitsPerInt = MASKS16[remainingBitsPerInt];
} else {
maskRemainingBitsPerInt = MASKS32[remainingBitsPerInt];
}
int tmpIdx = 0;
int remainingBitsPerValue = bitsPerValue;
while (idx < numInts) {
if (remainingBitsPerValue >= remainingBitsPerInt) {
remainingBitsPerValue -= remainingBitsPerInt;
tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & maskRemainingBitsPerInt;
if (remainingBitsPerValue == 0) {
idx++;
remainingBitsPerValue = bitsPerValue;
}
} else {
final int mask1, mask2;
if (primitiveSize == 8) {
mask1 = MASKS8[remainingBitsPerValue];
mask2 = MASKS8[remainingBitsPerInt - remainingBitsPerValue];
} else if (primitiveSize == 16) {
mask1 = MASKS16[remainingBitsPerValue];
mask2 = MASKS16[remainingBitsPerInt - remainingBitsPerValue];
} else {
mask1 = MASKS32[remainingBitsPerValue];
mask2 = MASKS32[remainingBitsPerInt - remainingBitsPerValue];
}
tmp[tmpIdx] |= (ints[idx++] & mask1) << (remainingBitsPerInt - remainingBitsPerValue);
remainingBitsPerValue = bitsPerValue - remainingBitsPerInt + remainingBitsPerValue;
tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & mask2;
}
}
for (int i = 0; i < numIntsPerShift; ++i) {
out.writeInt(tmp[i]);
}
}
/** Number of bytes required to encode 128 integers of {@code bitsPerValue} bits per value. */
static int numBytes(int bitsPerValue) {
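// 128 values of bitsPerValue bits each: bitsPerValue * 128 / 8 == bitsPerValue << 4 bytes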
return bitsPerValue << (BLOCK_SIZE_LOG2 - 3);
}
static void decodeSlow(int bitsPerValue, PostingDecodingUtil pdu, int[] tmp, int[] ints)
throws IOException {
final int numInts = bitsPerValue << 2;
final int mask = MASKS32[bitsPerValue];
pdu.splitInts(numInts, ints, 32 - bitsPerValue, 32, mask, tmp, 0, -1);
final int remainingBitsPerInt = 32 - bitsPerValue;
final int mask32RemainingBitsPerInt = MASKS32[remainingBitsPerInt];
int tmpIdx = 0;
int remainingBits = remainingBitsPerInt;
for (int intsIdx = numInts; intsIdx < BLOCK_SIZE; ++intsIdx) {
int b = bitsPerValue - remainingBits;
int l = (tmp[tmpIdx++] & MASKS32[remainingBits]) << b;
while (b >= remainingBitsPerInt) {
b -= remainingBitsPerInt;
l |= (tmp[tmpIdx++] & mask32RemainingBitsPerInt) << b;
}
if (b > 0) {
l |= (tmp[tmpIdx] >>> (remainingBitsPerInt - b)) & MASKS32[b];
remainingBits = remainingBitsPerInt - b;
} else {
remainingBits = remainingBitsPerInt;
}
ints[intsIdx] = l;
}
}
"""
def writeRemainder(bpv, next_primitive, remaining_bits_per_int, o, num_values, f):
iteration = 1
num_ints = bpv * num_values / remaining_bits_per_int
while num_ints % 2 == 0 and num_values % 2 == 0:
num_ints /= 2
num_values /= 2
iteration *= 2
f.write(' for (int iter = 0, tmpIdx = 0, intsIdx = %d; iter < %d; ++iter, tmpIdx += %d, intsIdx += %d) {\n' %(o, iteration, num_ints, num_values))
i = 0
remaining_bits = 0
tmp_idx = 0
for i in range(int(num_values)):
b = bpv
if remaining_bits == 0:
b -= remaining_bits_per_int
f.write(' int l%d = tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b))
else:
b -= remaining_bits
f.write(' int l%d = (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits, b))
tmp_idx += 1
while b >= remaining_bits_per_int:
b -= remaining_bits_per_int
f.write(' l%d |= tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b))
tmp_idx += 1
if b > 0:
f.write(' l%d |= (tmp[tmpIdx + %d] >>> %d) & MASK%d_%d;\n' %(i, tmp_idx, remaining_bits_per_int-b, next_primitive, b))
remaining_bits = remaining_bits_per_int-b
f.write(' ints[intsIdx + %d] = l%d;\n' %(i, i))
f.write(' }\n')
def writeDecode(bpv, f):
next_primitive = 32
if bpv <= 8:
next_primitive = 8
elif bpv <= 16:
next_primitive = 16
if bpv == next_primitive:
f.write(' static void decode%d(PostingDecodingUtil pdu, int[] ints) throws IOException {\n' %bpv)
f.write(' pdu.in.readInts(ints, 0, %d);\n' %(bpv*4))
else:
num_values_per_int = 32 / next_primitive
remaining_bits = next_primitive % bpv
num_iters = (next_primitive - 1) // bpv
o = 4 * bpv * num_iters
if remaining_bits == 0:
f.write(' static void decode%d(PostingDecodingUtil pdu, int[] ints) throws IOException {\n' %bpv)
f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, ints, %d, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
else:
f.write(' static void decode%d(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {\n' %bpv)
f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_int - o, f)
f.write(' }\n')
if __name__ == '__main__':
f = open(OUTPUT_FILE, 'w')
f.write(HEADER)
for primitive_size in PRIMITIVE_SIZE:
f.write(' static final int[] MASKS%d = new int[%d];\n' %(primitive_size, primitive_size))
f.write('\n')
f.write(' static {\n')
for primitive_size in PRIMITIVE_SIZE:
f.write(' for (int i = 0; i < %d; ++i) {\n' %primitive_size)
f.write(' MASKS%d[i] = mask%d(i);\n' %(primitive_size, primitive_size))
f.write(' }\n')
f.write(' }')
f.write("""
// mark values in array as final ints to avoid the cost of reading array, arrays should only be
// used when the idx is a variable
""")
for primitive_size in PRIMITIVE_SIZE:
for bpv in range(1, min(MAX_SPECIALIZED_BITS_PER_VALUE + 1, primitive_size)):
f.write(' static final int MASK%d_%d = MASKS%d[%d];\n' %(primitive_size, bpv, primitive_size, bpv))
f.write("""
/** Decode 128 integers into {@code ints}. */
void decode(int bitsPerValue, PostingDecodingUtil pdu, int[] ints) throws IOException {
switch (bitsPerValue) {
""")
for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
next_primitive = 32
if bpv <= 8:
next_primitive = 8
elif bpv <= 16:
next_primitive = 16
f.write(' case %d:\n' %bpv)
if next_primitive % bpv == 0:
f.write(' decode%d(pdu, ints);\n' %bpv)
else:
f.write(' decode%d(pdu, tmp, ints);\n' %bpv)
if next_primitive != 32:
f.write(' expand%d(ints);\n' %next_primitive)
f.write(' break;\n')
f.write(' default:\n')
f.write(' decodeSlow(bitsPerValue, pdu, tmp, ints);\n')
f.write(' break;\n')
f.write(' }\n')
f.write(' }\n')
for i in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
writeDecode(i, f)
if i < MAX_SPECIALIZED_BITS_PER_VALUE:
f.write('\n')
f.write('}\n')

View File

@ -16,7 +16,7 @@
*/
/**
* Lucene 10.0 file format.
* Lucene 10.1 file format.
*
* <h2>Apache Lucene - Index File Formats</h2>
*
@ -151,15 +151,15 @@
* field names. These are used to store auxiliary information about the document, such as its
* title, url, or an identifier to access a database. The set of stored fields are what is
* returned for each hit when searching. This is keyed by document number.
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term dictionary}. A
* <li>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term dictionary}. A
* dictionary containing all of the terms used in all of the indexed fields of all of the
* documents. The dictionary also contains the number of documents which contain the term, and
* pointers to the term's frequency and proximity data.
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Frequency data}. For
* <li>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Frequency data}. For
* each term in the dictionary, the numbers of all the documents that contain that term, and
* the frequency of the term in that document, unless frequencies are omitted ({@link
* org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Proximity data}. For
* <li>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Proximity data}. For
* each term in the dictionary, the positions that the term occurs in each document. Note that
* this will not exist if all fields in all documents omit position data.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
@ -255,27 +255,27 @@
* <td>The stored fields for documents</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Dictionary}</td>
* <td>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Dictionary}</td>
* <td>.tim</td>
* <td>The term dictionary, stores term info</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Index}</td>
* <td>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Index}</td>
* <td>.tip</td>
* <td>The index into the Term Dictionary</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Frequencies}</td>
* <td>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Frequencies}</td>
* <td>.doc</td>
* <td>Contains the list of docs which contain each term along with frequency</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Positions}</td>
* <td>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Positions}</td>
* <td>.pos</td>
* <td>Stores position information about where a term occurs in the index</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Payloads}</td>
* <td>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Payloads}</td>
* <td>.pay</td>
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
* </tr>
@ -416,6 +416,8 @@
* <li>In version 9.12, skip data was refactored to have only two levels: every 128 docs and every
* 4,096 docs, and to be inlined in postings lists. This resulted in a speedup for queries that
* need skipping, especially conjunctions.
* <li>In version 10.1, block encoding changed to be optimized for int[] storage instead of
* long[].
* </ul>
*
* <a id="Limitations"></a>
@ -430,4 +432,4 @@
* <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
* VInt} values which have no limit. </div>
*/
package org.apache.lucene.codecs.lucene100;
package org.apache.lucene.codecs.lucene101;

View File

@ -21,8 +21,6 @@ import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.SKIP_IND
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SHIFT;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.BaseTermsEnum;
@ -43,6 +41,7 @@ import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.internal.hppc.IntObjectHashMap;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
@ -59,12 +58,12 @@ import org.apache.lucene.util.packed.DirectReader;
/** reader for {@link Lucene90DocValuesFormat} */
final class Lucene90DocValuesProducer extends DocValuesProducer {
private final Map<String, NumericEntry> numerics;
private final Map<String, BinaryEntry> binaries;
private final Map<String, SortedEntry> sorted;
private final Map<String, SortedSetEntry> sortedSets;
private final Map<String, SortedNumericEntry> sortedNumerics;
private final Map<String, DocValuesSkipperEntry> skippers;
private final IntObjectHashMap<NumericEntry> numerics;
private final IntObjectHashMap<BinaryEntry> binaries;
private final IntObjectHashMap<SortedEntry> sorted;
private final IntObjectHashMap<SortedSetEntry> sortedSets;
private final IntObjectHashMap<SortedNumericEntry> sortedNumerics;
private final IntObjectHashMap<DocValuesSkipperEntry> skippers;
private final IndexInput data;
private final int maxDoc;
private int version = -1;
@ -81,12 +80,12 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
String metaName =
IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
this.maxDoc = state.segmentInfo.maxDoc();
numerics = new HashMap<>();
binaries = new HashMap<>();
sorted = new HashMap<>();
sortedSets = new HashMap<>();
sortedNumerics = new HashMap<>();
skippers = new HashMap<>();
numerics = new IntObjectHashMap<>();
binaries = new IntObjectHashMap<>();
sorted = new IntObjectHashMap<>();
sortedSets = new IntObjectHashMap<>();
sortedNumerics = new IntObjectHashMap<>();
skippers = new IntObjectHashMap<>();
merging = false;
// read in the entries from the metadata file.
@ -149,12 +148,12 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
// Used for cloning
private Lucene90DocValuesProducer(
Map<String, NumericEntry> numerics,
Map<String, BinaryEntry> binaries,
Map<String, SortedEntry> sorted,
Map<String, SortedSetEntry> sortedSets,
Map<String, SortedNumericEntry> sortedNumerics,
Map<String, DocValuesSkipperEntry> skippers,
IntObjectHashMap<NumericEntry> numerics,
IntObjectHashMap<BinaryEntry> binaries,
IntObjectHashMap<SortedEntry> sorted,
IntObjectHashMap<SortedSetEntry> sortedSets,
IntObjectHashMap<SortedNumericEntry> sortedNumerics,
IntObjectHashMap<DocValuesSkipperEntry> skippers,
IndexInput data,
int maxDoc,
int version,
@ -194,18 +193,18 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
}
byte type = meta.readByte();
if (info.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) {
skippers.put(info.name, readDocValueSkipperMeta(meta));
skippers.put(info.number, readDocValueSkipperMeta(meta));
}
if (type == Lucene90DocValuesFormat.NUMERIC) {
numerics.put(info.name, readNumeric(meta));
numerics.put(info.number, readNumeric(meta));
} else if (type == Lucene90DocValuesFormat.BINARY) {
binaries.put(info.name, readBinary(meta));
binaries.put(info.number, readBinary(meta));
} else if (type == Lucene90DocValuesFormat.SORTED) {
sorted.put(info.name, readSorted(meta));
sorted.put(info.number, readSorted(meta));
} else if (type == Lucene90DocValuesFormat.SORTED_SET) {
sortedSets.put(info.name, readSortedSet(meta));
sortedSets.put(info.number, readSortedSet(meta));
} else if (type == Lucene90DocValuesFormat.SORTED_NUMERIC) {
sortedNumerics.put(info.name, readSortedNumeric(meta));
sortedNumerics.put(info.number, readSortedNumeric(meta));
} else {
throw new CorruptIndexException("invalid type: " + type, meta);
}
@ -430,7 +429,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
@Override
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
NumericEntry entry = numerics.get(field.name);
NumericEntry entry = numerics.get(field.number);
return getNumeric(entry);
}
@ -786,7 +785,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
BinaryEntry entry = binaries.get(field.name);
BinaryEntry entry = binaries.get(field.number);
if (entry.docsWithFieldOffset == -2) {
return DocValues.emptyBinary();
@ -887,7 +886,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
SortedEntry entry = sorted.get(field.name);
SortedEntry entry = sorted.get(field.number);
return getSorted(entry);
}
@ -1363,7 +1362,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
SortedNumericEntry entry = sortedNumerics.get(field.name);
SortedNumericEntry entry = sortedNumerics.get(field.number);
return getSortedNumeric(entry);
}
@ -1508,7 +1507,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
SortedSetEntry entry = sortedSets.get(field.name);
SortedSetEntry entry = sortedSets.get(field.number);
if (entry.singleValueEntry != null) {
return DocValues.singleton(getSorted(entry.singleValueEntry));
}
@ -1782,7 +1781,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
@Override
public DocValuesSkipper getSkipper(FieldInfo field) throws IOException {
final DocValuesSkipperEntry entry = skippers.get(field.name);
final DocValuesSkipperEntry entry = skippers.get(field.number);
final IndexInput input = data.slice("doc value skipper", entry.offset, entry.length);
// Prefetch the first page of data. Following pages are expected to get prefetched through

View File

@ -21,8 +21,6 @@ import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readSi
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.hnsw.FlatVectorsReader;
import org.apache.lucene.codecs.hnsw.FlatVectorsScorer;
@ -38,6 +36,7 @@ import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorEncoding;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.internal.hppc.IntObjectHashMap;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
@ -56,13 +55,15 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {
private static final long SHALLOW_SIZE =
RamUsageEstimator.shallowSizeOfInstance(Lucene99FlatVectorsFormat.class);
private final Map<String, FieldEntry> fields = new HashMap<>();
private final IntObjectHashMap<FieldEntry> fields = new IntObjectHashMap<>();
private final IndexInput vectorData;
private final FieldInfos fieldInfos;
public Lucene99FlatVectorsReader(SegmentReadState state, FlatVectorsScorer scorer)
throws IOException {
super(scorer);
int versionMeta = readMetadata(state);
this.fieldInfos = state.fieldInfos;
boolean success = false;
try {
vectorData =
@ -155,15 +156,13 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {
throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
}
FieldEntry fieldEntry = FieldEntry.create(meta, info);
fields.put(info.name, fieldEntry);
fields.put(info.number, fieldEntry);
}
}
@Override
public long ramBytesUsed() {
return Lucene99FlatVectorsReader.SHALLOW_SIZE
+ RamUsageEstimator.sizeOfMap(
fields, RamUsageEstimator.shallowSizeOfInstance(FieldEntry.class));
return Lucene99FlatVectorsReader.SHALLOW_SIZE + fields.ramBytesUsed();
}
@Override
@ -171,21 +170,27 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {
CodecUtil.checksumEntireFile(vectorData);
}
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null) {
private FieldEntry getFieldEntry(String field, VectorEncoding expectedEncoding) {
final FieldInfo info = fieldInfos.fieldInfo(field);
final FieldEntry fieldEntry;
if (info == null || (fieldEntry = fields.get(info.number)) == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
if (fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
if (fieldEntry.vectorEncoding != expectedEncoding) {
throw new IllegalArgumentException(
"field=\""
+ field
+ "\" is encoded as: "
+ fieldEntry.vectorEncoding
+ " expected: "
+ VectorEncoding.FLOAT32);
+ expectedEncoding);
}
return fieldEntry;
}
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32);
return OffHeapFloatVectorValues.load(
fieldEntry.similarityFunction,
vectorScorer,
@ -199,19 +204,7 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {
@Override
public ByteVectorValues getByteVectorValues(String field) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
if (fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
throw new IllegalArgumentException(
"field=\""
+ field
+ "\" is encoded as: "
+ fieldEntry.vectorEncoding
+ " expected: "
+ VectorEncoding.BYTE);
}
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE);
return OffHeapByteVectorValues.load(
fieldEntry.similarityFunction,
vectorScorer,
@ -225,10 +218,7 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {
@Override
public RandomVectorScorer getRandomVectorScorer(String field, float[] target) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
return null;
}
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32);
return vectorScorer.getRandomVectorScorer(
fieldEntry.similarityFunction,
OffHeapFloatVectorValues.load(
@ -245,10 +235,7 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {
@Override
public RandomVectorScorer getRandomVectorScorer(String field, byte[] target) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null || fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
return null;
}
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE);
return vectorScorer.getRandomVectorScorer(
fieldEntry.similarityFunction,
OffHeapByteVectorValues.load(

View File

@ -21,9 +21,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.hnsw.FlatVectorsReader;
@ -37,6 +35,7 @@ import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorEncoding;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.internal.hppc.IntObjectHashMap;
import org.apache.lucene.search.KnnCollector;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
@ -70,7 +69,7 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
private final FlatVectorsReader flatVectorsReader;
private final FieldInfos fieldInfos;
private final Map<String, FieldEntry> fields = new HashMap<>();
private final IntObjectHashMap<FieldEntry> fields = new IntObjectHashMap<>();
private final IndexInput vectorIndex;
public Lucene99HnswVectorsReader(SegmentReadState state, FlatVectorsReader flatVectorsReader)
@ -162,7 +161,7 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
}
FieldEntry fieldEntry = readField(meta, info);
validateFieldEntry(info, fieldEntry);
fields.put(info.name, fieldEntry);
fields.put(info.number, fieldEntry);
}
}
@ -225,8 +224,7 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
@Override
public long ramBytesUsed() {
return Lucene99HnswVectorsReader.SHALLOW_SIZE
+ RamUsageEstimator.sizeOfMap(
fields, RamUsageEstimator.shallowSizeOfInstance(FieldEntry.class))
+ fields.ramBytesUsed()
+ flatVectorsReader.ramBytesUsed();
}
@ -246,25 +244,43 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
return flatVectorsReader.getByteVectorValues(field);
}
private FieldEntry getFieldEntry(String field, VectorEncoding expectedEncoding) {
final FieldInfo info = fieldInfos.fieldInfo(field);
final FieldEntry fieldEntry;
if (info == null || (fieldEntry = fields.get(info.number)) == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
if (fieldEntry.vectorEncoding != expectedEncoding) {
throw new IllegalArgumentException(
"field=\""
+ field
+ "\" is encoded as: "
+ fieldEntry.vectorEncoding
+ " expected: "
+ expectedEncoding);
}
return fieldEntry;
}
@Override
public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs)
throws IOException {
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32);
search(
fields.get(field),
fieldEntry,
knnCollector,
acceptDocs,
VectorEncoding.FLOAT32,
() -> flatVectorsReader.getRandomVectorScorer(field, target));
}
@Override
public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs)
throws IOException {
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE);
search(
fields.get(field),
fieldEntry,
knnCollector,
acceptDocs,
VectorEncoding.BYTE,
() -> flatVectorsReader.getRandomVectorScorer(field, target));
}
@ -272,13 +288,10 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
FieldEntry fieldEntry,
KnnCollector knnCollector,
Bits acceptDocs,
VectorEncoding vectorEncoding,
IOSupplier<RandomVectorScorer> scorerSupplier)
throws IOException {
if (fieldEntry.size() == 0
|| knnCollector.k() == 0
|| fieldEntry.vectorEncoding != vectorEncoding) {
if (fieldEntry.size() == 0 || knnCollector.k() == 0) {
return;
}
final RandomVectorScorer scorer = scorerSupplier.get();
@ -304,12 +317,12 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
@Override
public HnswGraph getGraph(String field) throws IOException {
FieldInfo info = fieldInfos.fieldInfo(field);
if (info == null) {
throw new IllegalArgumentException("No such field '" + field + "'");
final FieldInfo info = fieldInfos.fieldInfo(field);
final FieldEntry entry;
if (info == null || (entry = fields.get(info.number)) == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
FieldEntry entry = fields.get(field);
if (entry != null && entry.vectorIndexLength > 0) {
if (entry.vectorIndexLength > 0) {
return getGraph(entry);
} else {
return HnswGraph.EMPTY;

View File

@ -21,8 +21,6 @@ import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readSi
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.hnsw.FlatVectorsReader;
import org.apache.lucene.codecs.hnsw.FlatVectorsScorer;
@ -36,6 +34,7 @@ import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorEncoding;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.internal.hppc.IntObjectHashMap;
import org.apache.lucene.search.VectorScorer;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IOContext;
@ -59,15 +58,17 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
private static final long SHALLOW_SIZE =
RamUsageEstimator.shallowSizeOfInstance(Lucene99ScalarQuantizedVectorsReader.class);
private final Map<String, FieldEntry> fields = new HashMap<>();
private final IntObjectHashMap<FieldEntry> fields = new IntObjectHashMap<>();
private final IndexInput quantizedVectorData;
private final FlatVectorsReader rawVectorsReader;
private final FieldInfos fieldInfos;
public Lucene99ScalarQuantizedVectorsReader(
SegmentReadState state, FlatVectorsReader rawVectorsReader, FlatVectorsScorer scorer)
throws IOException {
super(scorer);
this.rawVectorsReader = rawVectorsReader;
this.fieldInfos = state.fieldInfos;
int versionMeta = -1;
String metaFileName =
IndexFileNames.segmentFileName(
@ -118,7 +119,7 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
}
FieldEntry fieldEntry = readField(meta, versionMeta, info);
validateFieldEntry(info, fieldEntry);
fields.put(info.name, fieldEntry);
fields.put(info.number, fieldEntry);
}
}
@ -163,10 +164,10 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
CodecUtil.checksumEntireFile(quantizedVectorData);
}
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null) {
private FieldEntry getFieldEntry(String field) {
final FieldInfo info = fieldInfos.fieldInfo(field);
final FieldEntry fieldEntry;
if (info == null || (fieldEntry = fields.get(info.number)) == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
if (fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
@ -178,6 +179,12 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
+ " expected: "
+ VectorEncoding.FLOAT32);
}
return fieldEntry;
}
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
final FieldEntry fieldEntry = getFieldEntry(field);
final FloatVectorValues rawVectorValues = rawVectorsReader.getFloatVectorValues(field);
OffHeapQuantizedByteVectorValues quantizedByteVectorValues =
OffHeapQuantizedByteVectorValues.load(
@ -241,10 +248,7 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
@Override
public RandomVectorScorer getRandomVectorScorer(String field, float[] target) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
return null;
}
final FieldEntry fieldEntry = getFieldEntry(field);
if (fieldEntry.scalarQuantizer == null) {
return rawVectorsReader.getRandomVectorScorer(field, target);
}
@ -275,12 +279,7 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
@Override
public long ramBytesUsed() {
long size = SHALLOW_SIZE;
size +=
RamUsageEstimator.sizeOfMap(
fields, RamUsageEstimator.shallowSizeOfInstance(FieldEntry.class));
size += rawVectorsReader.ramBytesUsed();
return size;
return SHALLOW_SIZE + fields.ramBytesUsed() + rawVectorsReader.ramBytesUsed();
}
private FieldEntry readField(IndexInput input, int versionMeta, FieldInfo info)
@ -301,11 +300,8 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
}
@Override
public QuantizedByteVectorValues getQuantizedVectorValues(String fieldName) throws IOException {
FieldEntry fieldEntry = fields.get(fieldName);
if (fieldEntry == null || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
return null;
}
public QuantizedByteVectorValues getQuantizedVectorValues(String field) throws IOException {
final FieldEntry fieldEntry = getFieldEntry(field);
return OffHeapQuantizedByteVectorValues.load(
fieldEntry.ordToDoc,
fieldEntry.dimension,
@ -320,11 +316,8 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
}
@Override
public ScalarQuantizer getQuantizationState(String fieldName) {
FieldEntry fieldEntry = fields.get(fieldName);
if (fieldEntry == null || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
return null;
}
public ScalarQuantizer getQuantizationState(String field) {
final FieldEntry fieldEntry = getFieldEntry(field);
return fieldEntry.scalarQuantizer;
}

View File

@ -38,6 +38,7 @@ import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.internal.hppc.IntObjectHashMap;
import org.apache.lucene.util.IOUtils;
/**
@ -256,7 +257,7 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
private static class FieldsReader extends DocValuesProducer {
private final Map<String, DocValuesProducer> fields = new HashMap<>();
private final IntObjectHashMap<DocValuesProducer> fields = new IntObjectHashMap<>();
private final Map<String, DocValuesProducer> formats = new HashMap<>();
// clone for merge
@ -270,10 +271,10 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
}
// Then rebuild fields:
for (Map.Entry<String, DocValuesProducer> ent : other.fields.entrySet()) {
DocValuesProducer producer = oldToNew.get(ent.getValue());
for (IntObjectHashMap.IntObjectCursor<DocValuesProducer> ent : other.fields) {
DocValuesProducer producer = oldToNew.get(ent.value);
assert producer != null;
fields.put(ent.getKey(), producer);
fields.put(ent.key, producer);
}
}
@ -302,7 +303,7 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
segmentSuffix,
format.fieldsProducer(new SegmentReadState(readState, segmentSuffix)));
}
fields.put(fieldName, formats.get(segmentSuffix));
fields.put(fi.number, formats.get(segmentSuffix));
}
}
}
@ -316,37 +317,37 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
@Override
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
DocValuesProducer producer = fields.get(field.name);
DocValuesProducer producer = fields.get(field.number);
return producer == null ? null : producer.getNumeric(field);
}
@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
DocValuesProducer producer = fields.get(field.name);
DocValuesProducer producer = fields.get(field.number);
return producer == null ? null : producer.getBinary(field);
}
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
DocValuesProducer producer = fields.get(field.name);
DocValuesProducer producer = fields.get(field.number);
return producer == null ? null : producer.getSorted(field);
}
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
DocValuesProducer producer = fields.get(field.name);
DocValuesProducer producer = fields.get(field.number);
return producer == null ? null : producer.getSortedNumeric(field);
}
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
DocValuesProducer producer = fields.get(field.name);
DocValuesProducer producer = fields.get(field.number);
return producer == null ? null : producer.getSortedSet(field);
}
@Override
public DocValuesSkipper getSkipper(FieldInfo field) throws IOException {
DocValuesProducer producer = fields.get(field.name);
DocValuesProducer producer = fields.get(field.number);
return producer == null ? null : producer.getSkipper(field);
}

View File

@ -19,7 +19,9 @@ package org.apache.lucene.codecs.perfield;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.ServiceLoader;
import org.apache.lucene.codecs.KnnFieldVectorsWriter;
@ -28,11 +30,14 @@ import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.KnnVectorsWriter;
import org.apache.lucene.index.ByteVectorValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Sorter;
import org.apache.lucene.internal.hppc.IntObjectHashMap;
import org.apache.lucene.internal.hppc.ObjectCursor;
import org.apache.lucene.search.KnnCollector;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.IOUtils;
@ -186,7 +191,8 @@ public abstract class PerFieldKnnVectorsFormat extends KnnVectorsFormat {
/** VectorReader that can wrap multiple delegate readers, selected by field. */
public static class FieldsReader extends KnnVectorsReader {
private final Map<String, KnnVectorsReader> fields = new HashMap<>();
private final IntObjectHashMap<KnnVectorsReader> fields = new IntObjectHashMap<>();
private final FieldInfos fieldInfos;
/**
* Create a FieldsReader over a segment, opening VectorReaders for each KnnVectorsFormat
@ -196,7 +202,7 @@ public abstract class PerFieldKnnVectorsFormat extends KnnVectorsFormat {
* @throws IOException if one of the delegate readers throws
*/
public FieldsReader(final SegmentReadState readState) throws IOException {
this.fieldInfos = readState.fieldInfos;
// Init each unique format:
boolean success = false;
Map<String, KnnVectorsReader> formats = new HashMap<>();
@ -221,7 +227,7 @@ public abstract class PerFieldKnnVectorsFormat extends KnnVectorsFormat {
segmentSuffix,
format.fieldsReader(new SegmentReadState(readState, segmentSuffix)));
}
fields.put(fieldName, formats.get(segmentSuffix));
fields.put(fi.number, formats.get(segmentSuffix));
}
}
}
@ -239,51 +245,69 @@ public abstract class PerFieldKnnVectorsFormat extends KnnVectorsFormat {
* @param field the name of a numeric vector field
*/
public KnnVectorsReader getFieldReader(String field) {
return fields.get(field);
final FieldInfo info = fieldInfos.fieldInfo(field);
if (info == null) {
return null;
}
return fields.get(info.number);
}
@Override
public void checkIntegrity() throws IOException {
for (KnnVectorsReader reader : fields.values()) {
reader.checkIntegrity();
for (ObjectCursor<KnnVectorsReader> cursor : fields.values()) {
cursor.value.checkIntegrity();
}
}
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
KnnVectorsReader knnVectorsReader = fields.get(field);
if (knnVectorsReader == null) {
final FieldInfo info = fieldInfos.fieldInfo(field);
final KnnVectorsReader reader;
if (info == null || (reader = fields.get(info.number)) == null) {
return null;
} else {
return knnVectorsReader.getFloatVectorValues(field);
}
return reader.getFloatVectorValues(field);
}
@Override
public ByteVectorValues getByteVectorValues(String field) throws IOException {
KnnVectorsReader knnVectorsReader = fields.get(field);
if (knnVectorsReader == null) {
final FieldInfo info = fieldInfos.fieldInfo(field);
final KnnVectorsReader reader;
if (info == null || (reader = fields.get(info.number)) == null) {
return null;
} else {
return knnVectorsReader.getByteVectorValues(field);
}
return reader.getByteVectorValues(field);
}
@Override
public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs)
throws IOException {
fields.get(field).search(field, target, knnCollector, acceptDocs);
final FieldInfo info = fieldInfos.fieldInfo(field);
final KnnVectorsReader reader;
if (info == null || (reader = fields.get(info.number)) == null) {
return;
}
reader.search(field, target, knnCollector, acceptDocs);
}
@Override
public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs)
throws IOException {
fields.get(field).search(field, target, knnCollector, acceptDocs);
final FieldInfo info = fieldInfos.fieldInfo(field);
final KnnVectorsReader reader;
if (info == null || (reader = fields.get(info.number)) == null) {
return;
}
reader.search(field, target, knnCollector, acceptDocs);
}
@Override
public void close() throws IOException {
IOUtils.close(fields.values());
List<KnnVectorsReader> readers = new ArrayList<>(fields.size());
for (ObjectCursor<KnnVectorsReader> cursor : fields.values()) {
readers.add(cursor.value);
}
IOUtils.close(readers);
}
}

View File

@ -27,7 +27,6 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LeafSimScorer;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
@ -120,7 +119,6 @@ final class FeatureQuery extends Query {
@Override
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
final Weight thisWeight = this;
Terms terms = Terms.getTerms(context.reader(), fieldName);
TermsEnum termsEnum = terms.iterator();
if (termsEnum.seekExact(new BytesRef(featureName)) == false) {
@ -135,10 +133,8 @@ final class FeatureQuery extends Query {
@Override
public Scorer get(long leadCost) throws IOException {
final SimScorer scorer = function.scorer(boost);
final LeafSimScorer simScorer =
new LeafSimScorer(scorer, context.reader(), fieldName, false);
final ImpactsEnum impacts = termsEnum.impacts(PostingsEnum.FREQS);
return new TermScorer(thisWeight, impacts, simScorer, topLevelScoringClause);
return new TermScorer(impacts, scorer, null, topLevelScoringClause);
}
@Override

View File

@ -20,7 +20,6 @@ import static org.apache.lucene.geo.GeoEncodingUtils.encodeLatitude;
import static org.apache.lucene.geo.GeoEncodingUtils.encodeLongitude;
import static org.apache.lucene.geo.GeoUtils.lineCrossesLine;
import static org.apache.lucene.geo.GeoUtils.lineOverlapLine;
import static org.apache.lucene.geo.GeoUtils.orient;
import java.util.ArrayList;
import java.util.HashMap;
@ -215,7 +214,7 @@ public final class Tessellator {
* Creates a circular doubly linked list using polygon points. The order is governed by the
* specified winding order
*/
private static final Node createDoublyLinkedList(
private static Node createDoublyLinkedList(
final double[] x,
final double[] y,
final WindingOrder polyWindingOrder,
@ -243,7 +242,7 @@ public final class Tessellator {
return filterPoints(lastNode, null);
}
private static final Node eliminateHoles(final XYPolygon polygon, Node outerNode) {
private static Node eliminateHoles(final XYPolygon polygon, Node outerNode) {
// Define a list to hold a reference to each filtered hole list.
final List<Node> holeList = new ArrayList<>();
// keep a reference to the hole
@ -273,8 +272,8 @@ public final class Tessellator {
return eliminateHoles(holeList, holeListPolygons, outerNode);
}
/** Links every hole into the outer loop, producing a single-ring polygon without holes. * */
private static final Node eliminateHoles(final Polygon polygon, Node outerNode) {
/** Links every hole into the outer loop, producing a single-ring polygon without holes. */
private static Node eliminateHoles(final Polygon polygon, Node outerNode) {
// Define a list to hold a reference to each filtered hole list.
final List<Node> holeList = new ArrayList<>();
// keep a reference to the hole
@ -304,7 +303,7 @@ public final class Tessellator {
return eliminateHoles(holeList, holeListPolygons, outerNode);
}
private static final Node eliminateHoles(
private static Node eliminateHoles(
List<Node> holeList, final Map<Node, ?> holeListPolygons, Node outerNode) {
// Sort the hole vertices by x coordinate
holeList.sort(
@ -350,30 +349,19 @@ public final class Tessellator {
}
/** Finds a bridge between vertices that connects a hole with an outer ring, and links it */
private static final void eliminateHole(
private static void eliminateHole(
final Node holeNode,
Node outerNode,
double holeMinX,
double holeMaxX,
double holeMinY,
double holeMaxY) {
// Attempt to find a common point between the HoleNode and OuterNode.
Node next = outerNode;
do {
if (Rectangle.containsPoint(
next.getY(), next.getX(), holeMinY, holeMaxY, holeMinX, holeMaxX)) {
Node sharedVertex = getSharedVertex(holeNode, next);
if (sharedVertex != null) {
// Split the resulting polygon.
Node node = splitPolygon(next, sharedVertex, true);
// Filter the split nodes.
filterPoints(node, node.next);
return;
}
}
next = next.next;
} while (next != outerNode);
// Attempt to merge the hole using a common point between them, if one exists.
if (maybeMergeHoleWithSharedVertices(
holeNode, outerNode, holeMinX, holeMaxX, holeMinY, holeMaxY)) {
return;
}
// Attempt to find a logical bridge between the HoleNode and OuterNode.
outerNode = fetchHoleBridge(holeNode, outerNode);
@ -390,12 +378,112 @@ public final class Tessellator {
}
}
/**
* Choose a common vertex between the polygon and the hole if it exists and return true; otherwise
* return false.
*/
private static boolean maybeMergeHoleWithSharedVertices(
final Node holeNode,
Node outerNode,
double holeMinX,
double holeMaxX,
double holeMinY,
double holeMaxY) {
// Attempt to find a common point between the HoleNode and OuterNode.
Node sharedVertex = null;
Node sharedVertexConnection = null;
Node next = outerNode;
do {
if (Rectangle.containsPoint(
next.getY(), next.getX(), holeMinY, holeMaxY, holeMinX, holeMaxX)) {
Node newSharedVertex = getSharedVertex(holeNode, next);
if (newSharedVertex != null) {
if (sharedVertex == null) {
sharedVertex = newSharedVertex;
sharedVertexConnection = next;
} else if (newSharedVertex.equals(sharedVertex)) {
// This can only happen if this vertex has already been used for a bridge. We need to
// choose the right one.
sharedVertexConnection =
getSharedInsideVertex(sharedVertex, sharedVertexConnection, next);
}
}
}
next = next.next;
} while (next != outerNode);
if (sharedVertex != null) {
// Split the resulting polygon.
Node node = splitPolygon(sharedVertexConnection, sharedVertex, true);
// Filter the split nodes.
filterPoints(node, node.next);
return true;
}
return false;
}
/** Check if the provided vertex is in the polygon and return it */
private static Node getSharedVertex(final Node polygon, final Node vertex) {
Node next = polygon;
do {
if (isVertexEquals(next, vertex)) {
return next;
}
next = next.next;
} while (next != polygon);
return null;
}
/** Choose the vertex that has a smaller angle with the hole vertex */
static Node getSharedInsideVertex(Node holeVertex, Node candidateA, Node candidateB) {
assert isVertexEquals(holeVertex, candidateA) && isVertexEquals(holeVertex, candidateB);
// we are joining candidate.prevNode -> holeVertex.node -> holeVertex.nextNode.
// A negative area means a convex angle. If both are convex/reflex, choose the point of
// minimum angle.
final double a1 =
area(
candidateA.previous.getX(),
candidateA.previous.getY(),
holeVertex.getX(),
holeVertex.getY(),
holeVertex.next.getX(),
holeVertex.next.getY());
final double a2 =
area(
candidateB.previous.getX(),
candidateB.previous.getY(),
holeVertex.getX(),
holeVertex.getY(),
holeVertex.next.getX(),
holeVertex.next.getY());
if (a1 < 0 != a2 < 0) {
// one is convex, the other reflex, get the convex one
return a1 < a2 ? candidateA : candidateB;
} else {
// both are convex / reflex, choose the smallest angle
final double angle1 = angle(candidateA.previous, candidateA, holeVertex.next);
final double angle2 = angle(candidateB.previous, candidateB, holeVertex.next);
return angle1 < angle2 ? candidateA : candidateB;
}
}
private static double angle(Node a, Node b, Node c) {
final double ax = a.getX() - b.getX();
final double ay = a.getY() - b.getY();
final double cx = c.getX() - b.getX();
final double cy = c.getY() - b.getY();
final double dotProduct = ax * cx + ay * cy;
final double aLength = Math.sqrt(ax * ax + ay * ay);
final double bLength = Math.sqrt(cx * cx + cy * cy);
return Math.acos(dotProduct / (aLength * bLength));
}
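For reference, angle() above is the standard dot-product formula acos((b->a . b->c) / (|b->a| |b->c|)), i.e. the angle at vertex b. A tiny standalone check with plain coordinates (illustration only, not part of the patch):
// A=(1,0), B=(0,0), C=(0,1): the rays from B are orthogonal, so the angle is ~PI/2.
double ax = 1.0, ay = 0.0;  // vector B->A
double cx = 0.0, cy = 1.0;  // vector B->C
double angle = Math.acos((ax * cx + ay * cy)
    / (Math.sqrt(ax * ax + ay * ay) * Math.sqrt(cx * cx + cy * cy)));
// angle == Math.PI / 2 within floating-point error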
/**
* David Eberly's algorithm for finding a bridge between a hole and outer polygon
*
* <p>see: http://www.geometrictools.com/Documentation/TriangulationByEarClipping.pdf
*/
private static final Node fetchHoleBridge(final Node holeNode, final Node outerNode) {
private static Node fetchHoleBridge(final Node holeNode, final Node outerNode) {
Node p = outerNode;
double qx = Double.NEGATIVE_INFINITY;
final double hx = holeNode.getX();
@ -453,34 +541,8 @@ public final class Tessellator {
return connection;
}
/** Check if the provided vertex is in the polygon and return it * */
private static Node getSharedVertex(final Node polygon, final Node vertex) {
Node next = polygon;
do {
if (isVertexEquals(next, vertex)) {
// make sure we are not crossing the polygon. This might happen when several holes share the
// same polygon vertex.
boolean crosses =
GeoUtils.lineCrossesLine(
next.previous.getX(),
next.previous.getY(),
vertex.next.getX(),
vertex.next.getY(),
next.next.getX(),
next.next.getY(),
vertex.previous.getX(),
vertex.previous.getY());
if (crosses == false) {
return next;
}
}
next = next.next;
} while (next != polygon);
return null;
}
/** Finds the left-most hole of a polygon ring. * */
private static final Node fetchLeftmost(final Node start) {
private static Node fetchLeftmost(final Node start) {
Node node = start;
Node leftMost = start;
do {
@ -502,7 +564,7 @@ public final class Tessellator {
* Main ear slicing loop which triangulates the vertices of a polygon, provided as a doubly-linked
* list. *
*/
private static final List<Triangle> earcutLinkedList(
private static List<Triangle> earcutLinkedList(
Object polygon,
Node currEar,
final List<Triangle> tessellation,
@ -587,7 +649,7 @@ public final class Tessellator {
}
/** Determines whether a polygon node forms a valid ear with adjacent nodes. * */
private static final boolean isEar(final Node ear, final boolean mortonOptimized) {
private static boolean isEar(final Node ear, final boolean mortonOptimized) {
if (mortonOptimized == true) {
return mortonIsEar(ear);
}
@ -623,7 +685,7 @@ public final class Tessellator {
* Uses morton code for speed to determine whether or not a polygon node forms a valid ear with adjacent
* nodes
*/
private static final boolean mortonIsEar(final Node ear) {
private static boolean mortonIsEar(final Node ear) {
// triangle bbox (flip the bits so negative encoded values are < positive encoded values)
int minTX = StrictMath.min(StrictMath.min(ear.previous.x, ear.x), ear.next.x) ^ 0x80000000;
int minTY = StrictMath.min(StrictMath.min(ear.previous.y, ear.y), ear.next.y) ^ 0x80000000;
@ -740,7 +802,7 @@ public final class Tessellator {
}
/** Iterate through all polygon nodes and remove small local self-intersections * */
private static final Node cureLocalIntersections(
private static Node cureLocalIntersections(
Node startNode, final List<Triangle> tessellation, final boolean mortonOptimized) {
Node node = startNode;
Node nextNode;
@ -794,7 +856,7 @@ public final class Tessellator {
* Attempt to split a polygon and independently triangulate each side. Return true if the polygon
* was split
*/
private static final boolean splitEarcut(
private static boolean splitEarcut(
final Object polygon,
final Node start,
final List<Triangle> tessellation,
@ -858,7 +920,7 @@ public final class Tessellator {
* Uses morton code for speed to determine whether or not an edge defined by a and b overlaps
* with a polygon edge
*/
private static final void mortonCheckIntersection(final Node a, final Node b) {
private static void mortonCheckIntersection(final Node a, final Node b) {
// edge bbox (flip the bits so negative encoded values are < positive encoded values)
int minTX = StrictMath.min(a.x, a.next.x) ^ 0x80000000;
int minTY = StrictMath.min(a.y, a.next.y) ^ 0x80000000;
@ -974,7 +1036,7 @@ public final class Tessellator {
* Uses morton code for speed to determine whether or not an edge defined by a and b overlaps
* with a polygon edge
*/
private static final boolean isMortonEdgeFromPolygon(final Node a, final Node b) {
private static boolean isMortonEdgeFromPolygon(final Node a, final Node b) {
// edge bbox (flip the bits so negative encoded values are < positive encoded values)
final int minTX = StrictMath.min(a.x, b.x) ^ 0x80000000;
final int minTY = StrictMath.min(a.y, b.y) ^ 0x80000000;
@ -1060,7 +1122,7 @@ public final class Tessellator {
}
/** Links two polygon vertices using a bridge. * */
private static final Node splitPolygon(final Node a, final Node b, boolean edgeFromPolygon) {
private static Node splitPolygon(final Node a, final Node b, boolean edgeFromPolygon) {
final Node a2 = new Node(a);
final Node b2 = new Node(b);
final Node an = a.next;
@ -1136,7 +1198,7 @@ public final class Tessellator {
return windingSum;
}
private static final boolean isLocallyInside(final Node a, final Node b) {
private static boolean isLocallyInside(final Node a, final Node b) {
double area =
area(
a.previous.getX(), a.previous.getY(), a.getX(), a.getY(), a.next.getX(), a.next.getY());
@ -1156,7 +1218,7 @@ public final class Tessellator {
}
/** Determine whether the middle point of a polygon diagonal is contained within the polygon */
private static final boolean middleInsert(
private static boolean middleInsert(
final Node start, final double x0, final double y0, final double x1, final double y1) {
Node node = start;
Node nextNode;
@ -1179,7 +1241,7 @@ public final class Tessellator {
}
/** Determines if the diagonal of a polygon is intersecting with any polygon elements. * */
private static final boolean isIntersectingPolygon(
private static boolean isIntersectingPolygon(
final Node start, final double x0, final double y0, final double x1, final double y1) {
Node node = start;
Node nextNode;
@ -1198,7 +1260,7 @@ public final class Tessellator {
}
/** Determines whether two line segments intersect. * */
public static final boolean linesIntersect(
public static boolean linesIntersect(
final double aX0,
final double aY0,
final double aX1,
@ -1212,7 +1274,7 @@ public final class Tessellator {
}
/** Interlinks polygon nodes in Z-Order. It resets the z values. */
private static final void sortByMortonWithReset(Node start) {
private static void sortByMortonWithReset(Node start) {
Node next = start;
do {
next.previousZ = next.previous;
@ -1223,7 +1285,7 @@ public final class Tessellator {
}
/** Interlinks polygon nodes in Z-Order. * */
private static final void sortByMorton(Node start) {
private static void sortByMorton(Node start) {
start.previousZ.nextZ = null;
start.previousZ = null;
// Sort the generated ring using Z ordering.
@ -1234,7 +1296,7 @@ public final class Tessellator {
* Simon Tatham's doubly-linked list O(n log n) mergesort see:
* http://www.chiark.greenend.org.uk/~sgtatham/algorithms/listsort.html
*/
private static final void tathamSort(Node list) {
private static void tathamSort(Node list) {
Node p, q, e, tail;
int i, numMerges, pSize, qSize;
int inSize = 1;
@ -1290,7 +1352,7 @@ public final class Tessellator {
}
/** Eliminate colinear/duplicate points from the doubly linked list */
private static final Node filterPoints(final Node start, Node end) {
private static Node filterPoints(final Node start, Node end) {
if (start == null) {
return start;
}
@ -1343,7 +1405,7 @@ public final class Tessellator {
/**
* Creates a node and optionally links it with a previous node in a circular doubly-linked list
*/
private static final Node insertNode(
private static Node insertNode(
final double[] x,
final double[] y,
int index,
@ -1370,7 +1432,7 @@ public final class Tessellator {
}
/** Removes a node from the doubly linked list */
private static final void removeNode(Node node, boolean edgeFromPolygon) {
private static void removeNode(Node node, boolean edgeFromPolygon) {
node.next.previous = node.previous;
node.previous.next = node.next;
node.previous.isNextEdgeFromPolygon = edgeFromPolygon;
@ -1384,16 +1446,16 @@ public final class Tessellator {
}
/** Determines if two point vertices are equal. * */
private static final boolean isVertexEquals(final Node a, final Node b) {
private static boolean isVertexEquals(final Node a, final Node b) {
return isVertexEquals(a, b.getX(), b.getY());
}
/** Determines if two point vertices are equal. * */
private static final boolean isVertexEquals(final Node a, final double x, final double y) {
private static boolean isVertexEquals(final Node a, final double x, final double y) {
return a.getX() == x && a.getY() == y;
}
/** Compute signed area of triangle */
/** Compute signed area of triangle; a negative value means a convex angle, a positive value a reflex angle. */
private static double area(
final double aX,
final double aY,
@ -1419,29 +1481,6 @@ public final class Tessellator {
&& (bx - x) * (cy - y) - (cx - x) * (by - y) >= 0;
}
/** compute whether the given x, y point is in a triangle; uses the winding order method */
public static boolean pointInTriangle(
double x, double y, double ax, double ay, double bx, double by, double cx, double cy) {
double minX = StrictMath.min(ax, StrictMath.min(bx, cx));
double minY = StrictMath.min(ay, StrictMath.min(by, cy));
double maxX = StrictMath.max(ax, StrictMath.max(bx, cx));
double maxY = StrictMath.max(ay, StrictMath.max(by, cy));
// check the bounding box because if the triangle is degenerated, e.g points and lines, we need
// to filter out
// coplanar points that are not part of the triangle.
if (x >= minX && x <= maxX && y >= minY && y <= maxY) {
int a = orient(x, y, ax, ay, bx, by);
int b = orient(x, y, bx, by, cx, cy);
if (a == 0 || b == 0 || a < 0 == b < 0) {
int c = orient(x, y, cx, cy, ax, ay);
return c == 0 || (c < 0 == (b < 0 || a < 0));
}
return false;
} else {
return false;
}
}
/**
* Implementation of this interface will receive calls with internal data at each step of the
* triangulation algorithm. This is of use for debugging complex cases, as well as gaining insight
@ -1508,7 +1547,7 @@ public final class Tessellator {
}
/** Circular Doubly-linked list used for polygon coordinates */
protected static class Node {
static class Node {
// node index in the linked list
private final int idx;
// vertex index in the polygon
@ -1524,9 +1563,9 @@ public final class Tessellator {
private final long morton;
// previous node
private Node previous;
Node previous;
// next node
private Node next;
Node next;
// previous z node
private Node previousZ;
// next z node
@ -1534,7 +1573,7 @@ public final class Tessellator {
// if the edge from this node to the next node is part of the polygon edges
private boolean isNextEdgeFromPolygon;
protected Node(
Node(
final double[] x,
final double[] y,
final int index,
@ -1600,7 +1639,7 @@ public final class Tessellator {
Node[] vertex;
boolean[] edgeFromPolygon;
protected Triangle(
private Triangle(
Node a,
boolean isABfromPolygon,
Node b,
@ -1636,19 +1675,6 @@ public final class Tessellator {
return edgeFromPolygon[startVertex];
}
/** utility method to compute whether the point is in the triangle */
protected boolean containsPoint(double lat, double lon) {
return pointInTriangle(
lon,
lat,
vertex[0].getX(),
vertex[0].getY(),
vertex[1].getX(),
vertex[1].getY(),
vertex[2].getX(),
vertex[2].getY());
}
/** pretty print the triangle vertices */
@Override
public String toString() {

View File

@ -4284,21 +4284,8 @@ public final class CheckIndex implements Closeable {
int level = Integer.parseInt(args[i]);
Level.checkIfLevelInBounds(level);
opts.level = level;
} else if ("-fast".equals(arg)) {
// Deprecated. Remove in Lucene 11.
System.err.println(
"-fast is deprecated, use '-level 1' for explicitly verifying file checksums only. This is also now the default "
+ "behaviour!");
} else if ("-slow".equals(arg)) {
// Deprecated. Remove in Lucene 11.
System.err.println("-slow is deprecated, use '-level 3' instead for slow checks");
opts.level = Level.MIN_LEVEL_FOR_SLOW_CHECKS;
} else if ("-exorcise".equals(arg)) {
opts.doExorcise = true;
} else if ("-crossCheckTermVectors".equals(arg)) {
// Deprecated. Remove in Lucene 11.
System.err.println("-crossCheckTermVectors is deprecated, use '-level 3' instead");
opts.level = Level.MAX_VALUE;
} else if (arg.equals("-verbose")) {
opts.verbose = true;
} else if (arg.equals("-segment")) {
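With the deprecated aliases removed, checksum-only verification stays the default and deeper checks are requested through -level, as the deprecation messages above describe. A hedged invocation sketch (classpath and index path are placeholders, not taken from the patch):
java -cp lucene-core.jar org.apache.lucene.index.CheckIndex /path/to/index -level 3 -verbose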

View File

@ -18,11 +18,10 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.IdentityHashMap;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.internal.hppc.IntObjectHashMap;
import org.apache.lucene.internal.hppc.LongArrayList;
import org.apache.lucene.store.Directory;
@ -32,7 +31,7 @@ import org.apache.lucene.store.Directory;
// producer?
class SegmentDocValuesProducer extends DocValuesProducer {
final Map<String, DocValuesProducer> dvProducersByField = new HashMap<>();
final IntObjectHashMap<DocValuesProducer> dvProducersByField = new IntObjectHashMap<>();
final Set<DocValuesProducer> dvProducers =
Collections.newSetFromMap(new IdentityHashMap<DocValuesProducer, Boolean>());
final LongArrayList dvGens = new LongArrayList();
@ -67,7 +66,7 @@ class SegmentDocValuesProducer extends DocValuesProducer {
dvGens.add(docValuesGen);
dvProducers.add(baseProducer);
}
dvProducersByField.put(fi.name, baseProducer);
dvProducersByField.put(fi.number, baseProducer);
} else {
assert !dvGens.contains(docValuesGen);
// otherwise, producer sees only the one fieldinfo it wrote
@ -76,7 +75,7 @@ class SegmentDocValuesProducer extends DocValuesProducer {
docValuesGen, si, dir, new FieldInfos(new FieldInfo[] {fi}));
dvGens.add(docValuesGen);
dvProducers.add(dvp);
dvProducersByField.put(fi.name, dvp);
dvProducersByField.put(fi.number, dvp);
}
}
} catch (Throwable t) {
@ -91,42 +90,42 @@ class SegmentDocValuesProducer extends DocValuesProducer {
@Override
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
DocValuesProducer dvProducer = dvProducersByField.get(field.name);
DocValuesProducer dvProducer = dvProducersByField.get(field.number);
assert dvProducer != null;
return dvProducer.getNumeric(field);
}
@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
DocValuesProducer dvProducer = dvProducersByField.get(field.name);
DocValuesProducer dvProducer = dvProducersByField.get(field.number);
assert dvProducer != null;
return dvProducer.getBinary(field);
}
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
DocValuesProducer dvProducer = dvProducersByField.get(field.name);
DocValuesProducer dvProducer = dvProducersByField.get(field.number);
assert dvProducer != null;
return dvProducer.getSorted(field);
}
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
DocValuesProducer dvProducer = dvProducersByField.get(field.name);
DocValuesProducer dvProducer = dvProducersByField.get(field.number);
assert dvProducer != null;
return dvProducer.getSortedNumeric(field);
}
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
DocValuesProducer dvProducer = dvProducersByField.get(field.name);
DocValuesProducer dvProducer = dvProducersByField.get(field.number);
assert dvProducer != null;
return dvProducer.getSortedSet(field);
}
@Override
public DocValuesSkipper getSkipper(FieldInfo field) throws IOException {
DocValuesProducer dvProducer = dvProducersByField.get(field.name);
DocValuesProducer dvProducer = dvProducersByField.get(field.number);
assert dvProducer != null;
return dvProducer.getSkipper(field);
}

View File

@ -93,9 +93,8 @@ class StoredFieldsConsumer {
void finish(int maxDoc) throws IOException {
while (lastDoc < maxDoc - 1) {
startDocument(lastDoc);
startDocument(lastDoc + 1);
finishDocument();
++lastDoc;
}
}

View File

@ -197,4 +197,14 @@ final class DefaultVectorUtilSupport implements VectorUtilSupport {
}
return squareSum;
}
@Override
public int findNextGEQ(int[] buffer, int target, int from, int to) {
for (int i = from; i < to; ++i) {
if (buffer[i] >= target) {
return i;
}
}
return to;
}
}

View File

@ -34,19 +34,19 @@ public class PostingDecodingUtil {
* Core methods for decoding blocks of docs / freqs / positions / offsets.
*
* <ul>
* <li>Read {@code count} longs.
* <li>Read {@code count} ints.
* <li>For all {@code i} &gt;= 0 so that {@code bShift - i * dec} &gt; 0, apply shift {@code
* bShift - i * dec} and store the result in {@code b} at offset {@code count * i}.
* <li>Apply mask {@code cMask} and store the result in {@code c} starting at offset {@code
* cIndex}.
* </ul>
*/
public void splitLongs(
int count, long[] b, int bShift, int dec, long bMask, long[] c, int cIndex, long cMask)
public void splitInts(
int count, int[] b, int bShift, int dec, int bMask, int[] c, int cIndex, int cMask)
throws IOException {
// Default implementation, which takes advantage of the C2 compiler's loop unrolling and
// auto-vectorization.
in.readLongs(c, cIndex, count);
in.readInts(c, cIndex, count);
int maxIter = (bShift - 1) / dec;
for (int i = 0; i < count; ++i) {
for (int j = 0; j <= maxIter; ++j) {

View File

@ -44,4 +44,12 @@ public interface VectorUtilSupport {
/** Returns the sum of squared differences of the two byte vectors. */
int squareDistance(byte[] a, byte[] b);
/**
* Given an array {@code buffer} that is sorted between indexes {@code 0} inclusive and {@code to}
* exclusive, find the first array index whose value is greater than or equal to {@code target}.
* This index is guaranteed to be at least {@code from}. If there is no such array index, {@code
* to} is returned.
*/
int findNextGEQ(int[] buffer, int target, int from, int to);
}
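A small self-contained check of the findNextGEQ contract, mirroring the scalar fallback added to DefaultVectorUtilSupport above (class name and method placement here are illustrative):
public class FindNextGEQExample {
  // same linear scan as the default implementation shown in this change
  static int findNextGEQ(int[] buffer, int target, int from, int to) {
    for (int i = from; i < to; ++i) {
      if (buffer[i] >= target) {
        return i;
      }
    }
    return to;
  }

  public static void main(String[] args) {
    int[] buffer = {3, 5, 8, 13, 21};                    // sorted in [0, to)
    System.out.println(findNextGEQ(buffer, 6, 0, 5));    // 2: buffer[2] == 8 is the first value >= 6
    System.out.println(findNextGEQ(buffer, 8, 3, 5));    // 3: the result is never below 'from'
    System.out.println(findNextGEQ(buffer, 100, 0, 5));  // 5: 'to' is returned when nothing qualifies
  }
}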

View File

@ -38,12 +38,16 @@ import org.apache.lucene.util.VectorUtil;
* vectorization modules in the Java runtime this class provides optimized implementations (using
* SIMD) of several algorithms used throughout Apache Lucene.
*
* <p>Expert: set the {@value #UPPER_JAVA_FEATURE_VERSION_SYSPROP} system property to increase the
* set of Java versions this class will provide optimized implementations for.
*
* @lucene.internal
*/
public abstract class VectorizationProvider {
static final OptionalInt TESTS_VECTOR_SIZE;
static final boolean TESTS_FORCE_INTEGER_VECTORS;
static final int UPPER_JAVA_FEATURE_VERSION = getUpperJavaFeatureVersion();
static {
var vs = OptionalInt.empty();
@ -71,6 +75,27 @@ public abstract class VectorizationProvider {
TESTS_FORCE_INTEGER_VECTORS = enforce;
}
private static final String UPPER_JAVA_FEATURE_VERSION_SYSPROP =
"org.apache.lucene.vectorization.upperJavaFeatureVersion";
private static final int DEFAULT_UPPER_JAVA_FEATURE_VERSION = 23;
private static int getUpperJavaFeatureVersion() {
int runtimeVersion = DEFAULT_UPPER_JAVA_FEATURE_VERSION;
try {
String str = System.getProperty(UPPER_JAVA_FEATURE_VERSION_SYSPROP);
if (str != null) {
runtimeVersion = Math.max(Integer.parseInt(str), runtimeVersion);
}
} catch (@SuppressWarnings("unused") NumberFormatException | SecurityException ignored) {
Logger.getLogger(VectorizationProvider.class.getName())
.warning(
"Cannot read sysprop "
+ UPPER_JAVA_FEATURE_VERSION_SYSPROP
+ ", so the default value will be used.");
}
return runtimeVersion;
}
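Because of the Math.max above, the property can only raise the built-in default, never lower it. A usage sketch (the version number is only an example):
java -Dorg.apache.lucene.vectorization.upperJavaFeatureVersion=24 ...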
/**
* Returns the default instance of the provider matching vectorization possibilities of actual
* runtime.
@ -108,7 +133,7 @@ public abstract class VectorizationProvider {
static VectorizationProvider lookup(boolean testMode) {
final int runtimeVersion = Runtime.version().feature();
assert runtimeVersion >= 21;
if (runtimeVersion <= 23) {
if (runtimeVersion <= UPPER_JAVA_FEATURE_VERSION) {
// only use vector module with Hotspot VM
if (!Constants.IS_HOTSPOT_VM) {
LOG.warning(
@ -190,8 +215,8 @@ public abstract class VectorizationProvider {
Set.of(
"org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil",
"org.apache.lucene.util.VectorUtil",
"org.apache.lucene.codecs.lucene912.Lucene912PostingsReader",
"org.apache.lucene.codecs.lucene912.PostingIndexInput");
"org.apache.lucene.codecs.lucene101.Lucene101PostingsReader",
"org.apache.lucene.codecs.lucene101.PostingIndexInput");
private static void ensureCaller() {
final boolean validCaller =

View File

@ -87,6 +87,22 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
return this;
}
/**
* Add a collection of BooleanClauses to this {@link Builder}. Note that the order in which
* clauses are added does not have any impact on matching documents or query performance.
*
* @throws IndexSearcher.TooManyClauses if the new number of clauses exceeds the maximum clause
* number
*/
public Builder add(Collection<BooleanClause> collection) {
// see #addClause(BooleanClause)
if ((clauses.size() + collection.size()) > IndexSearcher.maxClauseCount) {
throw new IndexSearcher.TooManyClauses();
}
clauses.addAll(collection);
return this;
}
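A short usage sketch of the new bulk overload (field names and terms are illustrative; imports omitted):
List<BooleanClause> clauses = List.of(
    new BooleanClause(new TermQuery(new Term("title", "lucene")), BooleanClause.Occur.MUST),
    new BooleanClause(new TermQuery(new Term("body", "search")), BooleanClause.Occur.SHOULD));
BooleanQuery query = new BooleanQuery.Builder()
    .add(clauses)   // single clause-count check for the whole collection
    .build();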
/**
* Add a new clause to this {@link Builder}. Note that the order in which clauses are added does
* not have any impact on matching documents or query performance.
@ -136,7 +152,7 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
}
/** Return the collection of queries for the given {@link Occur}. */
Collection<Query> getClauses(Occur occur) {
public Collection<Query> getClauses(Occur occur) {
return clauseSets.get(occur);
}

View File

@ -20,13 +20,14 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Objects;
import org.apache.lucene.internal.hppc.LongArrayList;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.PriorityQueue;
/**
* {@link BulkScorer} that is used for pure disjunctions and disjunctions that have low values of
* {@link BooleanQuery.Builder#setMinimumNumberShouldMatch(int)} and dense clauses. This scorer
* scores documents by batches of 2048 docs.
* scores documents by batches of 4,096 docs.
*/
final class BooleanScorer extends BulkScorer {
@ -41,71 +42,32 @@ final class BooleanScorer extends BulkScorer {
int freq;
}
private class BulkScorerAndDoc {
final BulkScorer scorer;
final long cost;
int next;
BulkScorerAndDoc(BulkScorer scorer) {
this.scorer = scorer;
this.cost = scorer.cost();
this.next = -1;
}
void advance(int min) throws IOException {
score(orCollector, null, min, min);
}
void score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
next = scorer.score(collector, acceptDocs, min, max);
}
}
// See WANDScorer for an explanation
private static long cost(Collection<BulkScorer> scorers, int minShouldMatch) {
final PriorityQueue<BulkScorer> pq =
new PriorityQueue<BulkScorer>(scorers.size() - minShouldMatch + 1) {
@Override
protected boolean lessThan(BulkScorer a, BulkScorer b) {
return a.cost() > b.cost();
}
};
for (BulkScorer scorer : scorers) {
pq.insertWithOverflow(scorer);
}
long cost = 0;
for (BulkScorer scorer = pq.pop(); scorer != null; scorer = pq.pop()) {
cost += scorer.cost();
}
return cost;
}
static final class HeadPriorityQueue extends PriorityQueue<BulkScorerAndDoc> {
static final class HeadPriorityQueue extends PriorityQueue<DisiWrapper> {
public HeadPriorityQueue(int maxSize) {
super(maxSize);
}
@Override
protected boolean lessThan(BulkScorerAndDoc a, BulkScorerAndDoc b) {
return a.next < b.next;
protected boolean lessThan(DisiWrapper a, DisiWrapper b) {
return a.doc < b.doc;
}
}
static final class TailPriorityQueue extends PriorityQueue<BulkScorerAndDoc> {
static final class TailPriorityQueue extends PriorityQueue<DisiWrapper> {
public TailPriorityQueue(int maxSize) {
super(maxSize);
}
@Override
protected boolean lessThan(BulkScorerAndDoc a, BulkScorerAndDoc b) {
protected boolean lessThan(DisiWrapper a, DisiWrapper b) {
return a.cost < b.cost;
}
public BulkScorerAndDoc get(int i) {
public DisiWrapper get(int i) {
Objects.checkIndex(i, size());
return (BulkScorerAndDoc) getHeapArray()[1 + i];
return (DisiWrapper) getHeapArray()[1 + i];
}
}
@ -115,7 +77,7 @@ final class BooleanScorer extends BulkScorer {
// This is basically an inlined FixedBitSet... seems to help with bound checks
final long[] matching = new long[SET_SIZE];
final BulkScorerAndDoc[] leads;
final DisiWrapper[] leads;
final HeadPriorityQueue head;
final TailPriorityQueue tail;
final Score score = new Score();
@ -123,31 +85,6 @@ final class BooleanScorer extends BulkScorer {
final long cost;
final boolean needsScores;
final class OrCollector implements LeafCollector {
Scorable scorer;
@Override
public void setScorer(Scorable scorer) {
this.scorer = scorer;
}
@Override
public void collect(int doc) throws IOException {
final int i = doc & MASK;
final int idx = i >>> 6;
matching[idx] |= 1L << i;
if (buckets != null) {
final Bucket bucket = buckets[i];
bucket.freq++;
if (needsScores) {
bucket.score += scorer.score();
}
}
}
}
final OrCollector orCollector = new OrCollector();
final class DocIdStreamView extends DocIdStream {
int base;
@ -194,7 +131,7 @@ final class BooleanScorer extends BulkScorer {
private final DocIdStreamView docIdStreamView = new DocIdStreamView();
BooleanScorer(Collection<BulkScorer> scorers, int minShouldMatch, boolean needsScores) {
BooleanScorer(Collection<Scorer> scorers, int minShouldMatch, boolean needsScores) {
if (minShouldMatch < 1 || minShouldMatch > scorers.size()) {
throw new IllegalArgumentException(
"minShouldMatch should be within 1..num_scorers. Got " + minShouldMatch);
@ -211,18 +148,21 @@ final class BooleanScorer extends BulkScorer {
} else {
buckets = null;
}
this.leads = new BulkScorerAndDoc[scorers.size()];
this.leads = new DisiWrapper[scorers.size()];
this.head = new HeadPriorityQueue(scorers.size() - minShouldMatch + 1);
this.tail = new TailPriorityQueue(minShouldMatch - 1);
this.minShouldMatch = minShouldMatch;
this.needsScores = needsScores;
for (BulkScorer scorer : scorers) {
final BulkScorerAndDoc evicted = tail.insertWithOverflow(new BulkScorerAndDoc(scorer));
LongArrayList costs = new LongArrayList(scorers.size());
for (Scorer scorer : scorers) {
DisiWrapper w = new DisiWrapper(scorer);
costs.add(w.cost);
final DisiWrapper evicted = tail.insertWithOverflow(w);
if (evicted != null) {
head.add(evicted);
}
}
this.cost = cost(scorers, minShouldMatch);
this.cost = ScorerUtil.costWithMinShouldMatch(costs.stream(), costs.size(), minShouldMatch);
}
@Override
@ -230,19 +170,49 @@ final class BooleanScorer extends BulkScorer {
return cost;
}
private void scoreDisiWrapperIntoBitSet(DisiWrapper w, Bits acceptDocs, int min, int max)
throws IOException {
boolean needsScores = BooleanScorer.this.needsScores;
long[] matching = BooleanScorer.this.matching;
Bucket[] buckets = BooleanScorer.this.buckets;
DocIdSetIterator it = w.iterator;
Scorer scorer = w.scorer;
int doc = w.doc;
if (doc < min) {
doc = it.advance(min);
}
for (; doc < max; doc = it.nextDoc()) {
if (acceptDocs == null || acceptDocs.get(doc)) {
final int i = doc & MASK;
final int idx = i >> 6;
matching[idx] |= 1L << i;
if (buckets != null) {
final Bucket bucket = buckets[i];
bucket.freq++;
if (needsScores) {
bucket.score += scorer.score();
}
}
}
}
w.doc = doc;
}
private void scoreWindowIntoBitSetAndReplay(
LeafCollector collector,
Bits acceptDocs,
int base,
int min,
int max,
BulkScorerAndDoc[] scorers,
DisiWrapper[] scorers,
int numScorers)
throws IOException {
for (int i = 0; i < numScorers; ++i) {
final BulkScorerAndDoc scorer = scorers[i];
assert scorer.next < max;
scorer.score(orCollector, acceptDocs, min, max);
final DisiWrapper w = scorers[i];
assert w.doc < max;
scoreDisiWrapperIntoBitSet(w, acceptDocs, min, max);
}
docIdStreamView.base = base;
@ -251,20 +221,20 @@ final class BooleanScorer extends BulkScorer {
Arrays.fill(matching, 0L);
}
private BulkScorerAndDoc advance(int min) throws IOException {
private DisiWrapper advance(int min) throws IOException {
assert tail.size() == minShouldMatch - 1;
final HeadPriorityQueue head = this.head;
final TailPriorityQueue tail = this.tail;
BulkScorerAndDoc headTop = head.top();
BulkScorerAndDoc tailTop = tail.top();
while (headTop.next < min) {
DisiWrapper headTop = head.top();
DisiWrapper tailTop = tail.top();
while (headTop.doc < min) {
if (tailTop == null || headTop.cost <= tailTop.cost) {
headTop.advance(min);
headTop.doc = headTop.iterator.advance(min);
headTop = head.updateTop();
} else {
// swap the top of head and tail
final BulkScorerAndDoc previousHeadTop = headTop;
tailTop.advance(min);
final DisiWrapper previousHeadTop = headTop;
tailTop.doc = tailTop.iterator.advance(min);
headTop = head.updateTop(tailTop);
tailTop = tail.updateTop(previousHeadTop);
}
@ -282,9 +252,11 @@ final class BooleanScorer extends BulkScorer {
throws IOException {
while (maxFreq < minShouldMatch && maxFreq + tail.size() >= minShouldMatch) {
// a match is still possible
final BulkScorerAndDoc candidate = tail.pop();
candidate.advance(windowMin);
if (candidate.next < windowMax) {
final DisiWrapper candidate = tail.pop();
if (candidate.doc < windowMin) {
candidate.doc = candidate.iterator.advance(windowMin);
}
if (candidate.doc < windowMax) {
leads[maxFreq++] = candidate;
} else {
head.add(candidate);
@ -304,7 +276,7 @@ final class BooleanScorer extends BulkScorer {
// Push back scorers into head and tail
for (int i = 0; i < maxFreq; ++i) {
final BulkScorerAndDoc evicted = head.insertWithOverflow(leads[i]);
final DisiWrapper evicted = head.insertWithOverflow(leads[i]);
if (evicted != null) {
tail.add(evicted);
}
@ -312,7 +284,7 @@ final class BooleanScorer extends BulkScorer {
}
private void scoreWindowSingleScorer(
BulkScorerAndDoc bulkScorer,
DisiWrapper w,
LeafCollector collector,
Bits acceptDocs,
int windowMin,
@ -320,33 +292,44 @@ final class BooleanScorer extends BulkScorer {
int max)
throws IOException {
assert tail.size() == 0;
final int nextWindowBase = head.top().next & ~MASK;
final int nextWindowBase = head.top().doc & ~MASK;
final int end = Math.max(windowMax, Math.min(max, nextWindowBase));
bulkScorer.score(collector, acceptDocs, windowMin, end);
DocIdSetIterator it = w.iterator;
int doc = w.doc;
if (doc < windowMin) {
doc = it.advance(windowMin);
}
collector.setScorer(w.scorer);
for (; doc < end; doc = it.nextDoc()) {
if (acceptDocs == null || acceptDocs.get(doc)) {
collector.collect(doc);
}
}
w.doc = doc;
// reset the scorer that should be used for the general case
collector.setScorer(score);
}
private BulkScorerAndDoc scoreWindow(
BulkScorerAndDoc top, LeafCollector collector, Bits acceptDocs, int min, int max)
private DisiWrapper scoreWindow(
DisiWrapper top, LeafCollector collector, Bits acceptDocs, int min, int max)
throws IOException {
final int windowBase = top.next & ~MASK; // find the window that the next match belongs to
final int windowBase = top.doc & ~MASK; // find the window that the next match belongs to
final int windowMin = Math.max(min, windowBase);
final int windowMax = Math.min(max, windowBase + SIZE);
// Fill 'leads' with all scorers from 'head' that are in the right window
leads[0] = head.pop();
int maxFreq = 1;
while (head.size() > 0 && head.top().next < windowMax) {
while (head.size() > 0 && head.top().doc < windowMax) {
leads[maxFreq++] = head.pop();
}
if (minShouldMatch == 1 && maxFreq == 1) {
// special case: only one scorer can match in the current window,
// we can collect directly
final BulkScorerAndDoc bulkScorer = leads[0];
final DisiWrapper bulkScorer = leads[0];
scoreWindowSingleScorer(bulkScorer, collector, acceptDocs, windowMin, windowMax, max);
return head.add(bulkScorer);
} else {
@ -360,11 +343,11 @@ final class BooleanScorer extends BulkScorer {
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
collector.setScorer(score);
BulkScorerAndDoc top = advance(min);
while (top.next < max) {
DisiWrapper top = advance(min);
while (top.doc < max) {
top = scoreWindow(top, collector, acceptDocs, min, max);
}
return top.next;
return top.doc;
}
}
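
For reference, a standalone sketch (not part of this patch) of the windowed bit-set bookkeeping BooleanScorer relies on above: docs are bucketed into fixed-size windows, marked in an inlined bit set via doc & MASK, and replayed with the window base added back. SIZE, MASK and SET_SIZE are assumed here to mirror the class constants, which are not shown in this hunk.

import java.util.ArrayList;
import java.util.List;

final class WindowBitSetSketch {
  // assumed to mirror BooleanScorer's constants (not visible in this hunk)
  static final int SIZE = 4096;          // docs per scoring window
  static final int MASK = SIZE - 1;
  static final int SET_SIZE = SIZE / 64; // number of longs backing one window

  public static void main(String[] args) {
    long[] matching = new long[SET_SIZE];
    int[] docs = {70_001, 70_123, 73_800};  // 73_800 belongs to the next window
    int windowBase = docs[0] & ~MASK;       // window that the first match belongs to
    int windowMax = windowBase + SIZE;

    for (int doc : docs) {
      if (doc >= windowMax) {
        break;                              // handled by the next window
      }
      int i = doc & MASK;                   // position inside the window
      // Java masks the shift count to i & 63, so this sets bit (i % 64) of word (i / 64),
      // the same trick scoreDisiWrapperIntoBitSet uses above.
      matching[i >>> 6] |= 1L << i;
    }

    // Replay the window in doc id order, re-adding the window base (cf. DocIdStreamView).
    List<Integer> collected = new ArrayList<>();
    for (int idx = 0; idx < matching.length; idx++) {
      long bits = matching[idx];
      while (bits != 0L) {
        int ntz = Long.numberOfTrailingZeros(bits);
        collected.add(windowBase + (idx << 6) + ntz);
        bits ^= 1L << ntz;
      }
    }
    System.out.println(collected); // [70001, 70123]
  }
}
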

View File

@ -289,9 +289,9 @@ final class BooleanScorerSupplier extends ScorerSupplier {
return new MaxScoreBulkScorer(maxDoc, optionalScorers);
}
List<BulkScorer> optional = new ArrayList<BulkScorer>();
List<Scorer> optional = new ArrayList<Scorer>();
for (ScorerSupplier ss : subs.get(Occur.SHOULD)) {
optional.add(ss.bulkScorer());
optional.add(ss.get(Long.MAX_VALUE));
}
return new BooleanScorer(optional, Math.max(1, minShouldMatch), scoreMode.needsScores());

View File

@ -153,70 +153,6 @@ final class BooleanWeight extends Weight {
return MatchesUtils.fromSubMatches(matches);
}
// Return a BulkScorer for the optional clauses only,
// or null if it is not applicable
// pkg-private for forcing use of BooleanScorer in tests
BulkScorer optionalBulkScorer(LeafReaderContext context) throws IOException {
if (scoreMode == ScoreMode.TOP_SCORES) {
if (!query.isPureDisjunction()) {
return null;
}
List<ScorerSupplier> optional = new ArrayList<>();
for (WeightedBooleanClause wc : weightedClauses) {
Weight w = wc.weight;
BooleanClause c = wc.clause;
if (c.occur() != Occur.SHOULD) {
continue;
}
ScorerSupplier scorer = w.scorerSupplier(context);
if (scorer != null) {
optional.add(scorer);
}
}
if (optional.size() <= 1) {
return null;
}
List<Scorer> optionalScorers = new ArrayList<>();
for (ScorerSupplier ss : optional) {
optionalScorers.add(ss.get(Long.MAX_VALUE));
}
return new MaxScoreBulkScorer(context.reader().maxDoc(), optionalScorers);
}
List<BulkScorer> optional = new ArrayList<BulkScorer>();
for (WeightedBooleanClause wc : weightedClauses) {
Weight w = wc.weight;
BooleanClause c = wc.clause;
if (c.occur() != Occur.SHOULD) {
continue;
}
BulkScorer subScorer = w.bulkScorer(context);
if (subScorer != null) {
optional.add(subScorer);
}
}
if (optional.size() == 0) {
return null;
}
if (query.getMinimumNumberShouldMatch() > optional.size()) {
return null;
}
if (optional.size() == 1) {
return optional.get(0);
}
return new BooleanScorer(
optional, Math.max(1, query.getMinimumNumberShouldMatch()), scoreMode.needsScores());
}
@Override
public int count(LeafReaderContext context) throws IOException {
final int numDocs = context.reader().numDocs();

View File

@ -23,6 +23,7 @@ import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import org.apache.lucene.index.LeafReaderContext;
/**
@ -44,6 +45,7 @@ public final class DisjunctionMaxQuery extends Query implements Iterable<Query>
/* The subqueries */
private final Multiset<Query> disjuncts = new Multiset<>();
private final List<Query> orderedQueries; // used for toString()
/* Multiple of the non-max disjunct scores added into our final score. Non-zero values support tie-breaking. */
private final float tieBreakerMultiplier;
@ -65,6 +67,7 @@ public final class DisjunctionMaxQuery extends Query implements Iterable<Query>
}
this.tieBreakerMultiplier = tieBreakerMultiplier;
this.disjuncts.addAll(disjuncts);
this.orderedQueries = new ArrayList<>(disjuncts); // order from the caller
}
/**
@ -295,24 +298,19 @@ public final class DisjunctionMaxQuery extends Query implements Iterable<Query>
*/
@Override
public String toString(String field) {
StringBuilder buffer = new StringBuilder();
buffer.append("(");
Iterator<Query> it = disjuncts.iterator();
for (int i = 0; it.hasNext(); i++) {
Query subquery = it.next();
if (subquery instanceof BooleanQuery) { // wrap sub-bools in parens
buffer.append("(");
buffer.append(subquery.toString(field));
buffer.append(")");
} else buffer.append(subquery.toString(field));
if (i != disjuncts.size() - 1) buffer.append(" | ");
}
buffer.append(")");
if (tieBreakerMultiplier != 0.0f) {
buffer.append("~");
buffer.append(tieBreakerMultiplier);
}
return buffer.toString();
return this.orderedQueries.stream()
.map(
subquery -> {
if (subquery instanceof BooleanQuery) { // wrap sub-bools in parens
return "(" + subquery.toString(field) + ")";
}
return subquery.toString(field);
})
.collect(
Collectors.joining(
" | ",
"(",
")" + ((tieBreakerMultiplier != 0.0f) ? "~" + tieBreakerMultiplier : "")));
}
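
The new orderedQueries list exists so that toString() reflects the order in which the caller passed the disjuncts (the Multiset does not preserve it). A small sketch (not part of this patch, with made-up query strings) of the Collectors.joining pattern used above and the output it produces:

import java.util.List;
import java.util.stream.Collectors;

final class DisMaxToStringSketch {
  public static void main(String[] args) {
    List<String> disjuncts = List.of("title:foo", "body:foo"); // made-up sub-query strings
    float tieBreakerMultiplier = 0.3f;
    String s =
        disjuncts.stream()
            .collect(
                Collectors.joining(
                    " | ",
                    "(",
                    ")" + ((tieBreakerMultiplier != 0.0f) ? "~" + tieBreakerMultiplier : "")));
    System.out.println(s); // (title:foo | body:foo)~0.3
  }
}
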
/**

View File

@ -1,147 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.util.concurrent.atomic.LongAdder;
/** Used for defining custom algorithms to allow searches to early terminate */
abstract class HitsThresholdChecker {
/** Implementation of HitsThresholdChecker which allows global hit counting */
private static class GlobalHitsThresholdChecker extends HitsThresholdChecker {
private final LongAdder globalHitCount = new LongAdder();
// Cache whether the threshold has been reached already. It is not volatile or synchronized on
// purpose to contain the overhead of reading the value similarly to what String#hashCode()
// does. This does not affect correctness.
private boolean thresholdReached = false;
GlobalHitsThresholdChecker(int totalHitsThreshold) {
super(totalHitsThreshold);
assert totalHitsThreshold != Integer.MAX_VALUE;
}
@Override
void incrementHitCount() {
if (thresholdReached == false) {
globalHitCount.increment();
}
}
@Override
boolean isThresholdReached() {
if (thresholdReached) {
return true;
}
if (globalHitCount.longValue() > getHitsThreshold()) {
thresholdReached = true;
return true;
}
return false;
}
@Override
ScoreMode scoreMode() {
return ScoreMode.TOP_SCORES;
}
}
/** Default implementation of HitsThresholdChecker to be used for single threaded execution */
private static class LocalHitsThresholdChecker extends HitsThresholdChecker {
private int hitCount;
LocalHitsThresholdChecker(int totalHitsThreshold) {
super(totalHitsThreshold);
assert totalHitsThreshold != Integer.MAX_VALUE;
}
@Override
void incrementHitCount() {
++hitCount;
}
@Override
boolean isThresholdReached() {
return hitCount > getHitsThreshold();
}
@Override
ScoreMode scoreMode() {
return ScoreMode.TOP_SCORES;
}
}
/**
* No-op implementation of {@link HitsThresholdChecker} that does no counting, as the threshold
* can never be reached. This is useful for cases where early termination is never desired, so
* that the overhead of counting hits can be avoided.
*/
private static final HitsThresholdChecker EXACT_HITS_COUNT_THRESHOLD_CHECKER =
new HitsThresholdChecker(Integer.MAX_VALUE) {
@Override
void incrementHitCount() {
// noop
}
@Override
boolean isThresholdReached() {
return false;
}
@Override
ScoreMode scoreMode() {
return ScoreMode.COMPLETE;
}
};
/*
* Returns a threshold checker that is useful for single threaded searches
*/
static HitsThresholdChecker create(final int totalHitsThreshold) {
return totalHitsThreshold == Integer.MAX_VALUE
? HitsThresholdChecker.EXACT_HITS_COUNT_THRESHOLD_CHECKER
: new LocalHitsThresholdChecker(totalHitsThreshold);
}
/*
* Returns a threshold checker that is based on a shared counter
*/
static HitsThresholdChecker createShared(final int totalHitsThreshold) {
return totalHitsThreshold == Integer.MAX_VALUE
? HitsThresholdChecker.EXACT_HITS_COUNT_THRESHOLD_CHECKER
: new GlobalHitsThresholdChecker(totalHitsThreshold);
}
private final int totalHitsThreshold;
HitsThresholdChecker(int totalHitsThreshold) {
if (totalHitsThreshold < 0) {
throw new IllegalArgumentException(
"totalHitsThreshold must be >= 0, got " + totalHitsThreshold);
}
this.totalHitsThreshold = totalHitsThreshold;
}
final int getHitsThreshold() {
return totalHitsThreshold;
}
abstract boolean isThresholdReached();
abstract ScoreMode scoreMode();
abstract void incrementHitCount();
}
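
The deleted GlobalHitsThresholdChecker leans on a deliberate benign data race: the cached flag is neither volatile nor synchronized, so a reader may see a stale false and pay for one extra LongAdder read, but can never see a wrong true. A self-contained sketch of that pattern (illustrative class name); in this patch the checker itself is replaced by plain totalHitsThreshold comparisons in the collectors further below:

import java.util.concurrent.atomic.LongAdder;

final class BenignRaceFlagSketch {
  private final LongAdder hitCount = new LongAdder();
  private final long threshold;
  // Deliberately neither volatile nor synchronized: a thread may read a stale 'false' and pay for
  // one more LongAdder read, but it can never read a wrong 'true'. Same benign race as
  // String#hashCode().
  private boolean thresholdReached = false;

  BenignRaceFlagSketch(long threshold) {
    this.threshold = threshold;
  }

  void incrementHitCount() {
    if (thresholdReached == false) {
      hitCount.increment();
    }
  }

  boolean isThresholdReached() {
    if (thresholdReached) {
      return true;
    }
    if (hitCount.longValue() > threshold) {
      thresholdReached = true;
      return true;
    }
    return false;
  }
}
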

View File

@ -106,6 +106,10 @@ public final class ImpactsDISI extends DocIdSetIterator {
@Override
public int nextDoc() throws IOException {
DocIdSetIterator in = this.in;
if (in.docID() < upTo) {
return in.nextDoc();
}
return advance(in.docID() + 1);
}

View File

@ -27,7 +27,6 @@ import java.util.Objects;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.Executor;
import java.util.function.Function;
import java.util.function.Supplier;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
@ -115,13 +114,7 @@ public class IndexSearcher {
protected final IndexReaderContext readerContext;
protected final List<LeafReaderContext> leafContexts;
/**
* Used with executor - LeafSlice supplier where each slice holds a set of leafs executed within
* one thread. We are caching it instead of creating it eagerly to avoid calling a protected
* method from constructor, which is a bad practice. Always non-null, regardless of whether an
* executor is provided or not.
*/
private final Supplier<LeafSlice[]> leafSlicesSupplier;
private volatile LeafSlice[] leafSlices;
// Used internally for load balancing threads executing for the query
private final TaskExecutor taskExecutor;
@ -230,20 +223,18 @@ public class IndexSearcher {
executor == null ? new TaskExecutor(Runnable::run) : new TaskExecutor(executor);
this.readerContext = context;
leafContexts = context.leaves();
Function<List<LeafReaderContext>, LeafSlice[]> slicesProvider =
executor == null
? leaves ->
leaves.isEmpty()
? new LeafSlice[0]
: new LeafSlice[] {
new LeafSlice(
new ArrayList<>(
leaves.stream()
.map(LeafReaderContextPartition::createForEntireSegment)
.toList()))
}
: this::slices;
leafSlicesSupplier = new CachingLeafSlicesSupplier(slicesProvider, leafContexts);
if (executor == null) {
leafSlices =
leafContexts.isEmpty()
? new LeafSlice[0]
: new LeafSlice[] {
new LeafSlice(
new ArrayList<>(
leafContexts.stream()
.map(LeafReaderContextPartition::createForEntireSegment)
.toList()))
};
}
}
/**
@ -540,7 +531,43 @@ public class IndexSearcher {
* @lucene.experimental
*/
public final LeafSlice[] getSlices() {
return leafSlicesSupplier.get();
LeafSlice[] res = leafSlices;
if (res == null) {
res = computeAndCacheSlices();
}
return res;
}
private synchronized LeafSlice[] computeAndCacheSlices() {
LeafSlice[] res = leafSlices;
if (res == null) {
res = slices(leafContexts);
/*
* Enforce that there aren't multiple leaf partitions within the same leaf slice pointing to the
* same leaf context. It is a requirement that {@link Collector#getLeafCollector(LeafReaderContext)}
* gets called once per leaf context. Also, it does not make sense to partition a segment to then search
* those partitions as part of the same slice, because the goal of partitioning is parallel searching
* which happens at the slice level.
*/
for (LeafSlice leafSlice : res) {
if (leafSlice.partitions.length <= 1) {
continue;
}
enforceDistinctLeaves(leafSlice);
}
leafSlices = res;
}
return res;
}
private static void enforceDistinctLeaves(LeafSlice leafSlice) {
Set<LeafReaderContext> distinctLeaves = new HashSet<>();
for (LeafReaderContextPartition leafPartition : leafSlice.partitions) {
if (distinctLeaves.add(leafPartition.ctx) == false) {
throw new IllegalStateException(
"The same slice targets multiple leaf partitions of the same leaf reader context. A physical segment should rather get partitioned to be searched concurrently from as many slices as the number of leaf partitions it is split into.");
}
}
}
/**
@ -564,10 +591,8 @@ public class IndexSearcher {
}
final int cappedNumHits = Math.min(numHits, limit);
final boolean supportsConcurrency = getSlices().length > 1;
CollectorManager<TopScoreDocCollector, TopDocs> manager =
new TopScoreDocCollectorManager(
cappedNumHits, after, TOTAL_HITS_THRESHOLD, supportsConcurrency);
new TopScoreDocCollectorManager(cappedNumHits, after, TOTAL_HITS_THRESHOLD);
return search(query, manager);
}
@ -699,12 +724,9 @@ public class IndexSearcher {
}
final int cappedNumHits = Math.min(numHits, limit);
final Sort rewrittenSort = sort.rewrite(this);
final LeafSlice[] leafSlices = getSlices();
final boolean supportsConcurrency = leafSlices.length > 1;
final CollectorManager<TopFieldCollector, TopFieldDocs> manager =
new TopFieldCollectorManager(
rewrittenSort, cappedNumHits, after, TOTAL_HITS_THRESHOLD, supportsConcurrency);
new TopFieldCollectorManager(rewrittenSort, cappedNumHits, after, TOTAL_HITS_THRESHOLD);
TopFieldDocs topDocs = search(query, manager);
if (doDocScores) {
@ -1169,60 +1191,4 @@ public class IndexSearcher {
+ IndexSearcher.getMaxClauseCount());
}
}
/**
* Supplier for {@link LeafSlice} slices which computes and caches the value on first invocation
* and returns cached value on subsequent invocation. If the passed in provider for slice
* computation throws exception then same will be passed to the caller of this supplier on each
* invocation. If the provider returns null then {@link NullPointerException} will be thrown to
* the caller.
*
* <p>NOTE: To provide thread safe caching mechanism this class is implementing the (subtle) <a
* href="https://shipilev.net/blog/2014/safe-public-construction/">double-checked locking
* idiom</a>
*/
private static class CachingLeafSlicesSupplier implements Supplier<LeafSlice[]> {
private volatile LeafSlice[] leafSlices;
private final Function<List<LeafReaderContext>, LeafSlice[]> sliceProvider;
private final List<LeafReaderContext> leaves;
private CachingLeafSlicesSupplier(
Function<List<LeafReaderContext>, LeafSlice[]> provider, List<LeafReaderContext> leaves) {
this.sliceProvider = Objects.requireNonNull(provider, "leaf slice provider cannot be null");
this.leaves = Objects.requireNonNull(leaves, "list of LeafReaderContext cannot be null");
}
@Override
public LeafSlice[] get() {
if (leafSlices == null) {
synchronized (this) {
if (leafSlices == null) {
leafSlices =
Objects.requireNonNull(
sliceProvider.apply(leaves), "slices computed by the provider is null");
/*
* Enforce that there aren't multiple leaf partitions within the same leaf slice pointing to the
* same leaf context. It is a requirement that {@link Collector#getLeafCollector(LeafReaderContext)}
* gets called once per leaf context. Also, it does not make sense to partition a segment to then search
* those partitions as part of the same slice, because the goal of partitioning is parallel searching
* which happens at the slice level.
*/
for (LeafSlice leafSlice : leafSlices) {
Set<LeafReaderContext> distinctLeaves = new HashSet<>();
for (LeafReaderContextPartition leafPartition : leafSlice.partitions) {
distinctLeaves.add(leafPartition.ctx);
}
if (leafSlice.partitions.length != distinctLeaves.size()) {
throw new IllegalStateException(
"The same slice targets multiple leaf partitions of the same leaf reader context. A physical segment should rather get partitioned to be searched concurrently from as many slices as the number of leaf partitions it is split into.");
}
}
}
}
}
return leafSlices;
}
}
}

View File

@ -186,10 +186,44 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
@Override
public int count(LeafReaderContext context) throws IOException {
if (context.reader().hasDeletions() == false) {
IteratorAndCount itAndCount = getDocIdSetIteratorOrNull(context);
if (lowerValue > upperValue) {
return 0;
}
IteratorAndCount itAndCount = null;
LeafReader reader = context.reader();
// first use bkd optimization if possible
SortedNumericDocValues sortedNumericValues = DocValues.getSortedNumeric(reader, field);
NumericDocValues numericValues = DocValues.unwrapSingleton(sortedNumericValues);
PointValues pointValues = reader.getPointValues(field);
if (pointValues != null && pointValues.getDocCount() == reader.maxDoc()) {
itAndCount = getDocIdSetIteratorOrNullFromBkd(context, numericValues);
}
if (itAndCount != null && itAndCount.count != -1) {
return itAndCount.count;
}
// use index sort optimization if possible
Sort indexSort = reader.getMetaData().sort();
if (indexSort != null
&& indexSort.getSort().length > 0
&& indexSort.getSort()[0].getField().equals(field)) {
final SortField sortField = indexSort.getSort()[0];
final SortField.Type sortFieldType = getSortFieldType(sortField);
// The index sort optimization is only supported for Type.INT and Type.LONG
if (sortFieldType == Type.INT || sortFieldType == Type.LONG) {
Object missingValue = sortField.getMissingValue();
final long missingLongValue = missingValue == null ? 0L : (long) missingValue;
// all documents have docValues or missing value falls outside the range
if ((pointValues != null && pointValues.getDocCount() == reader.maxDoc())
|| (missingLongValue < lowerValue || missingLongValue > upperValue)) {
itAndCount = getDocIdSetIterator(sortField, sortFieldType, context, numericValues);
}
if (itAndCount != null && itAndCount.count != -1) {
return itAndCount.count;
}
}
}
}
return fallbackWeight.count(context);
}

View File

@ -1,72 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.Objects;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
/** {@link SimScorer} on a specific {@link LeafReader}. */
public final class LeafSimScorer {
private final SimScorer scorer;
private final NumericDocValues norms;
/** Sole constructor: Score documents of {@code reader} with {@code scorer}. */
public LeafSimScorer(SimScorer scorer, LeafReader reader, String field, boolean needsScores)
throws IOException {
this.scorer = Objects.requireNonNull(scorer);
norms = needsScores ? reader.getNormValues(field) : null;
}
/** Return the wrapped {@link SimScorer}. */
public SimScorer getSimScorer() {
return scorer;
}
private long getNormValue(int doc) throws IOException {
if (norms != null) {
boolean found = norms.advanceExact(doc);
assert found;
return norms.longValue();
} else {
return 1L; // default norm
}
}
/**
* Score the provided document assuming the given term document frequency. This method must be
* called on non-decreasing sequences of doc ids.
*
* @see SimScorer#score(float, long)
*/
public float score(int doc, float freq) throws IOException {
return scorer.score(freq, getNormValue(doc));
}
/**
* Explain the score for the provided document assuming the given term document frequency. This
* method must be called on non-decreasing sequences of doc ids.
*
* @see SimScorer#explain(Explanation, long)
*/
public Explanation explain(int doc, Explanation freqExpl) throws IOException {
return scorer.explain(freqExpl, getNormValue(doc));
}
}
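
With LeafSimScorer removed, every call site in this patch inlines the same steps: fetch the field's norms once per segment, advanceExact to the current doc, default the norm to 1L when absent, and call SimScorer.score(freq, norm) directly. A minimal sketch of that recurring pattern (the helper name is made up; the real call sites inline it):

import java.io.IOException;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.similarities.Similarity.SimScorer;

final class NormInliningSketch {
  private NormInliningSketch() {}

  /**
   * The steps repeated at each former LeafSimScorer call site: resolve the norm of the current
   * doc (defaulting to 1L when the segment has no norms or the doc has no value) and hand
   * freq + norm straight to the SimScorer. Like LeafSimScorer, this must be called on
   * non-decreasing doc ids because of advanceExact.
   */
  static float score(SimScorer scorer, NumericDocValues norms, int doc, float freq)
      throws IOException {
    long norm = 1L;
    if (norms != null && norms.advanceExact(doc)) {
      norm = norms.longValue();
    }
    return scorer.score(freq, norm);
  }
}
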

View File

@ -66,6 +66,15 @@ final class MaxScoreBulkScorer extends BulkScorer {
maxScoreSums = new double[allScorers.length];
}
// Number of outer windows that have been evaluated
private int numOuterWindows;
// Number of candidate matches so far
private int numCandidates;
// Minimum window size. See #computeOuterWindowMax where we have heuristics that adjust the
// minimum window size based on the average number of candidate matches per outer window, to keep
// the per-window overhead under control.
private int minWindowSize = 1;
@Override
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
collector.setScorer(scorable);
@ -124,6 +133,7 @@ final class MaxScoreBulkScorer extends BulkScorer {
}
outerWindowMin = Math.min(top.doc, outerWindowMax);
++numOuterWindows;
}
return nextCandidate(max);
@ -278,6 +288,23 @@ final class MaxScoreBulkScorer extends BulkScorer {
windowMax = (int) Math.min(windowMax, upTo + 1L); // upTo is inclusive
}
if (allScorers.length - firstWindowLead > 1) {
// The more clauses we consider to compute outer windows, the higher chances that one of these
// clauses has a block boundary in the next few doc IDs. This situation can result in more
// time spent computing maximum scores per outer window than evaluating hits. To avoid such
// situations, we target at least 32 candidate matches per clause per outer window on average,
// to make sure we amortize the cost of computing maximum scores.
long threshold = numOuterWindows * 32L * allScorers.length;
if (numCandidates < threshold) {
minWindowSize = Math.min(minWindowSize << 1, INNER_WINDOW_SIZE);
} else {
minWindowSize = 1;
}
int minWindowMax = (int) Math.min(Integer.MAX_VALUE, (long) windowMin + minWindowSize);
windowMax = Math.max(windowMax, minWindowMax);
}
return windowMax;
}
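
A small simulation (not part of this patch) of the window-growing heuristic above, assuming 4 clauses and outer windows that each yield only 20 candidates; the INNER_WINDOW_SIZE cap is assumed since the constant's value is not shown in this hunk:

final class WindowSizeHeuristicSketch {
  static final int INNER_WINDOW_SIZE = 1 << 12; // assumed cap; the real constant is not shown here

  public static void main(String[] args) {
    int numClauses = 4;
    int minWindowSize = 1;
    long numCandidates = 0;
    // Each simulated outer window yields only 20 candidates, far below the target of
    // 32 candidates per clause per window, so the minimum window size keeps doubling.
    for (long numOuterWindows = 1; numOuterWindows <= 5; numOuterWindows++) {
      numCandidates += 20;
      long threshold = numOuterWindows * 32L * numClauses; // 128, 256, 384, 512, 640
      if (numCandidates < threshold) {
        minWindowSize = Math.min(minWindowSize << 1, INNER_WINDOW_SIZE);
      } else {
        minWindowSize = 1;
      }
      System.out.println("window " + numOuterWindows + " -> minWindowSize " + minWindowSize);
    }
    // prints 2, 4, 8, 16, 32: windows grow until candidates catch up with the target
  }
}
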
@ -300,6 +327,9 @@ final class MaxScoreBulkScorer extends BulkScorer {
private void scoreNonEssentialClauses(
LeafCollector collector, int doc, double essentialScore, int numNonEssentialClauses)
throws IOException {
++numCandidates;
double score = essentialScore;
for (int i = numNonEssentialClauses - 1; i >= 0; --i) {
float maxPossibleScore =

View File

@ -21,8 +21,8 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
@ -399,10 +399,10 @@ public class PhraseQuery extends Query {
/**
* A guess of the average number of simple operations for the initial seek and buffer refill per
* document for the positions of a term. See also {@link
* Lucene912PostingsReader.BlockImpactsPostingsEnum#nextPosition()}.
* Lucene101PostingsReader.BlockImpactsPostingsEnum#nextPosition()}.
*
* <p>Aside: Instead of being constant this could depend among others on {@link
* Lucene912PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
* Lucene101PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
* TermsEnum#totalTermFreq()}, {@link DocIdSetIterator#cost()} (expected number of matching docs),
* {@link LeafReader#maxDoc()} (total number of docs in the segment), and the seek time and block
* size of the device storing the index.
@ -411,7 +411,7 @@ public class PhraseQuery extends Query {
/**
* Number of simple operations in {@link
* Lucene912PostingsReader.BlockImpactsPostingsEnum#nextPosition()} when no seek or buffer refill
* Lucene101PostingsReader.BlockImpactsPostingsEnum#nextPosition()} when no seek or buffer refill
* is done.
*/
private static final int TERM_OPS_PER_POS = 7;
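
A hypothetical sketch (not part of this patch) of how per-term constants like these could combine into a match-cost estimate; the stand-in seek cost and the combination formula are assumptions, as PhraseQuery's actual computation is not shown in this hunk:

final class PhraseMatchCostSketch {
  static final int TERM_OPS_PER_POS = 7;     // shown in the hunk above
  static final int SEEK_OPS_PER_DOC = 128;   // hypothetical stand-in for TERM_POSNS_SEEK_OPS_PER_DOC

  /** One plausible per-term estimate: a seek per matching doc plus a batch of ops per position. */
  static float termPositionsCost(long docFreq, long totalTermFreq) {
    float expectedPositionsPerDoc = (float) totalTermFreq / Math.max(1L, docFreq);
    return SEEK_OPS_PER_DOC + expectedPositionsPerDoc * TERM_OPS_PER_POS;
  }

  public static void main(String[] args) {
    // a term that occurs 3 times on average in each of its 1,000 matching docs
    System.out.println(termPositionsCost(1_000, 3_000)); // 128 + 3 * 7 = 149.0
  }
}
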

View File

@ -18,6 +18,8 @@
package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
class PhraseScorer extends Scorer {
@ -26,16 +28,19 @@ class PhraseScorer extends Scorer {
final MaxScoreCache maxScoreCache;
final PhraseMatcher matcher;
final ScoreMode scoreMode;
private final LeafSimScorer simScorer;
private final SimScorer simScorer;
private final NumericDocValues norms;
final float matchCost;
private float minCompetitiveScore = 0;
private float freq = 0;
PhraseScorer(PhraseMatcher matcher, ScoreMode scoreMode, LeafSimScorer simScorer) {
PhraseScorer(
PhraseMatcher matcher, ScoreMode scoreMode, SimScorer simScorer, NumericDocValues norms) {
this.matcher = matcher;
this.scoreMode = scoreMode;
this.simScorer = simScorer;
this.norms = norms;
this.matchCost = matcher.getMatchCost();
this.approximation = matcher.approximation();
this.impactsApproximation = matcher.impactsApproximation();
@ -50,7 +55,11 @@ class PhraseScorer extends Scorer {
matcher.reset();
if (scoreMode == ScoreMode.TOP_SCORES && minCompetitiveScore > 0) {
float maxFreq = matcher.maxFreq();
if (simScorer.score(docID(), maxFreq) < minCompetitiveScore) {
long norm = 1L;
if (norms != null && norms.advanceExact(docID())) {
norm = norms.longValue();
}
if (simScorer.score(maxFreq, norm) < minCompetitiveScore) {
// The maximum score we could get is less than the min competitive score
return false;
}
@ -79,7 +88,11 @@ class PhraseScorer extends Scorer {
freq += matcher.sloppyWeight();
}
}
return simScorer.score(docID(), freq);
long norm = 1L;
if (norms != null && norms.advanceExact(docID())) {
norm = norms.longValue();
}
return simScorer.score(freq, norm);
}
@Override

View File

@ -19,6 +19,7 @@ package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
@ -63,9 +64,8 @@ public abstract class PhraseWeight extends Weight {
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
PhraseMatcher matcher = getPhraseMatcher(context, stats, false);
if (matcher == null) return null;
LeafSimScorer simScorer =
new LeafSimScorer(stats, context.reader(), field, scoreMode.needsScores());
final var scorer = new PhraseScorer(matcher, scoreMode, simScorer);
NumericDocValues norms = scoreMode.needsScores() ? context.reader().getNormValues(field) : null;
final var scorer = new PhraseScorer(matcher, scoreMode, stats, norms);
return new DefaultScorerSupplier(scorer);
}
@ -83,10 +83,13 @@ public abstract class PhraseWeight extends Weight {
while (matcher.nextMatch()) {
freq += matcher.sloppyWeight();
}
LeafSimScorer docScorer =
new LeafSimScorer(stats, context.reader(), field, scoreMode.needsScores());
Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq);
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
NumericDocValues norms = scoreMode.needsScores() ? context.reader().getNormValues(field) : null;
long norm = 1L;
if (norms != null && norms.advanceExact(doc)) {
norm = norms.longValue();
}
Explanation scoreExplanation = stats.explain(freqExplanation, norm);
return Explanation.match(
scoreExplanation.getValue(),
"weight("

View File

@ -46,9 +46,7 @@ public class SortRescorer extends Rescorer {
List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
TopFieldCollector collector =
new TopFieldCollectorManager(
sort, topN, null, Integer.MAX_VALUE, searcher.getSlices().length > 1)
.newCollector();
new TopFieldCollectorManager(sort, topN, null, Integer.MAX_VALUE).newCollector();
// Now merge sort docIDs from hits, with reader's leaves:
int hitUpto = 0;

View File

@ -30,6 +30,7 @@ import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.ImpactsSource;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.index.Term;
@ -38,6 +39,7 @@ import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOSupplier;
import org.apache.lucene.util.PriorityQueue;
@ -259,9 +261,13 @@ public final class SynonymQuery extends Query {
assert scorer instanceof TermScorer;
freq = ((TermScorer) scorer).freq();
}
LeafSimScorer docScorer = new LeafSimScorer(simWeight, context.reader(), field, true);
Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq);
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
NumericDocValues norms = context.reader().getNormValues(field);
long norm = 1L;
if (norms != null && norms.advanceExact(doc)) {
norm = norms.longValue();
}
Explanation scoreExplanation = simWeight.explain(freqExplanation, norm);
return Explanation.match(
scoreExplanation.getValue(),
"weight("
@ -334,27 +340,27 @@ public final class SynonymQuery extends Query {
return new ConstantScoreScorer(0f, scoreMode, DocIdSetIterator.empty());
}
LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), field, true);
NumericDocValues norms = context.reader().getNormValues(field);
// we must optimize this case (term not in segment), disjunctions require >= 2 subs
if (iterators.size() == 1) {
final TermScorer scorer;
if (scoreMode == ScoreMode.TOP_SCORES) {
scorer = new TermScorer(impacts.get(0), simScorer);
scorer = new TermScorer(impacts.get(0), simWeight, norms);
} else {
scorer = new TermScorer(iterators.get(0), simScorer);
scorer = new TermScorer(iterators.get(0), simWeight, norms);
}
float boost = termBoosts.get(0);
return scoreMode == ScoreMode.COMPLETE_NO_SCORES || boost == 1f
? scorer
: new FreqBoostTermScorer(boost, scorer, simScorer);
: new FreqBoostTermScorer(boost, scorer, simWeight, norms);
} else {
// we use termscorers + disjunction as an impl detail
DisiPriorityQueue queue = new DisiPriorityQueue(iterators.size());
for (int i = 0; i < iterators.size(); i++) {
PostingsEnum postings = iterators.get(i);
final TermScorer termScorer = new TermScorer(postings, simScorer);
final TermScorer termScorer = new TermScorer(postings, simWeight, norms);
float boost = termBoosts.get(i);
final DisiWrapperFreq wrapper = new DisiWrapperFreq(termScorer, boost);
queue.add(wrapper);
@ -368,8 +374,7 @@ public final class SynonymQuery extends Query {
boosts[i] = termBoosts.get(i);
}
ImpactsSource impactsSource = mergeImpacts(impacts.toArray(new ImpactsEnum[0]), boosts);
MaxScoreCache maxScoreCache =
new MaxScoreCache(impactsSource, simScorer.getSimScorer());
MaxScoreCache maxScoreCache = new MaxScoreCache(impactsSource, simWeight);
ImpactsDISI impactsDisi = new ImpactsDISI(iterator, maxScoreCache);
if (scoreMode == ScoreMode.TOP_SCORES) {
@ -379,7 +384,7 @@ public final class SynonymQuery extends Query {
iterator = impactsDisi;
}
return new SynonymScorer(queue, iterator, impactsDisi, simScorer);
return new SynonymScorer(queue, iterator, impactsDisi, simWeight, norms);
}
}
@ -575,18 +580,21 @@ public final class SynonymQuery extends Query {
private final DocIdSetIterator iterator;
private final MaxScoreCache maxScoreCache;
private final ImpactsDISI impactsDisi;
private final LeafSimScorer simScorer;
private final SimScorer scorer;
private final NumericDocValues norms;
SynonymScorer(
DisiPriorityQueue queue,
DocIdSetIterator iterator,
ImpactsDISI impactsDisi,
LeafSimScorer simScorer) {
SimScorer scorer,
NumericDocValues norms) {
this.queue = queue;
this.iterator = iterator;
this.maxScoreCache = impactsDisi.getMaxScoreCache();
this.impactsDisi = impactsDisi;
this.simScorer = simScorer;
this.scorer = scorer;
this.norms = norms;
}
@Override
@ -605,7 +613,11 @@ public final class SynonymQuery extends Query {
@Override
public float score() throws IOException {
return simScorer.score(iterator.docID(), freq());
long norm = 1L;
if (norms != null && norms.advanceExact(iterator.docID())) {
norm = norms.longValue();
}
return scorer.score(freq(), norm);
}
@Override
@ -647,9 +659,11 @@ public final class SynonymQuery extends Query {
private static class FreqBoostTermScorer extends FilterScorer {
final float boost;
final TermScorer in;
final LeafSimScorer docScorer;
final SimScorer scorer;
final NumericDocValues norms;
public FreqBoostTermScorer(float boost, TermScorer in, LeafSimScorer docScorer) {
public FreqBoostTermScorer(
float boost, TermScorer in, SimScorer scorer, NumericDocValues norms) {
super(in);
if (Float.isNaN(boost) || Float.compare(boost, 0f) < 0 || Float.compare(boost, 1f) > 0) {
throw new IllegalArgumentException(
@ -657,7 +671,8 @@ public final class SynonymQuery extends Query {
}
this.boost = boost;
this.in = in;
this.docScorer = docScorer;
this.scorer = scorer;
this.norms = norms;
}
float freq() throws IOException {
@ -666,8 +681,11 @@ public final class SynonymQuery extends Query {
@Override
public float score() throws IOException {
assert docID() != DocIdSetIterator.NO_MORE_DOCS;
return docScorer.score(in.docID(), freq());
long norm = 1L;
if (norms != null && norms.advanceExact(in.docID())) {
norm = norms.longValue();
}
return scorer.score(freq(), norm);
}
@Override

View File

@ -22,6 +22,7 @@ import java.util.Objects;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;
@ -150,19 +151,17 @@ public class TermQuery extends Query {
return new ConstantScoreScorer(0f, scoreMode, DocIdSetIterator.empty());
}
LeafSimScorer scorer =
new LeafSimScorer(simScorer, context.reader(), term.field(), scoreMode.needsScores());
NumericDocValues norms = null;
if (scoreMode.needsScores()) {
norms = context.reader().getNormValues(term.field());
}
if (scoreMode == ScoreMode.TOP_SCORES) {
return new TermScorer(
TermWeight.this,
termsEnum.impacts(PostingsEnum.FREQS),
scorer,
topLevelScoringClause);
termsEnum.impacts(PostingsEnum.FREQS), simScorer, norms, topLevelScoringClause);
} else {
return new TermScorer(
termsEnum.postings(
null, scoreMode.needsScores() ? PostingsEnum.FREQS : PostingsEnum.NONE),
scorer);
int flags = scoreMode.needsScores() ? PostingsEnum.FREQS : PostingsEnum.NONE;
return new TermScorer(termsEnum.postings(null, flags), simScorer, norms);
}
}
@ -223,11 +222,14 @@ public class TermQuery extends Query {
int newDoc = scorer.iterator().advance(doc);
if (newDoc == doc) {
float freq = ((TermScorer) scorer).freq();
LeafSimScorer docScorer =
new LeafSimScorer(simScorer, context.reader(), term.field(), true);
NumericDocValues norms = context.reader().getNormValues(term.field());
long norm = 1L;
if (norms != null && norms.advanceExact(doc)) {
norm = norms.longValue();
}
Explanation freqExplanation =
Explanation.match(freq, "freq, occurrences of term within document");
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
Explanation scoreExplanation = simScorer.explain(freqExplanation, norm);
return Explanation.match(
scoreExplanation.getValue(),
"weight("

View File

@ -18,8 +18,10 @@ package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
/**
* Expert: A <code>Scorer</code> for documents matching a <code>Term</code>.
@ -29,17 +31,19 @@ import org.apache.lucene.index.SlowImpactsEnum;
public final class TermScorer extends Scorer {
private final PostingsEnum postingsEnum;
private final DocIdSetIterator iterator;
private final LeafSimScorer docScorer;
private final SimScorer scorer;
private final NumericDocValues norms;
private final ImpactsDISI impactsDisi;
private final MaxScoreCache maxScoreCache;
/** Construct a {@link TermScorer} that will iterate all documents. */
public TermScorer(PostingsEnum postingsEnum, LeafSimScorer docScorer) {
public TermScorer(PostingsEnum postingsEnum, SimScorer scorer, NumericDocValues norms) {
iterator = this.postingsEnum = postingsEnum;
ImpactsEnum impactsEnum = new SlowImpactsEnum(postingsEnum);
maxScoreCache = new MaxScoreCache(impactsEnum, docScorer.getSimScorer());
maxScoreCache = new MaxScoreCache(impactsEnum, scorer);
impactsDisi = null;
this.docScorer = docScorer;
this.scorer = scorer;
this.norms = norms;
}
/**
@ -47,12 +51,12 @@ public final class TermScorer extends Scorer {
* documents.
*/
public TermScorer(
Weight weight,
ImpactsEnum impactsEnum,
LeafSimScorer docScorer,
SimScorer scorer,
NumericDocValues norms,
boolean topLevelScoringClause) {
postingsEnum = impactsEnum;
maxScoreCache = new MaxScoreCache(impactsEnum, docScorer.getSimScorer());
maxScoreCache = new MaxScoreCache(impactsEnum, scorer);
if (topLevelScoringClause) {
impactsDisi = new ImpactsDISI(impactsEnum, maxScoreCache);
iterator = impactsDisi;
@ -60,7 +64,8 @@ public final class TermScorer extends Scorer {
impactsDisi = null;
iterator = impactsEnum;
}
this.docScorer = docScorer;
this.scorer = scorer;
this.norms = norms;
}
@Override
@ -80,13 +85,23 @@ public final class TermScorer extends Scorer {
@Override
public float score() throws IOException {
assert docID() != DocIdSetIterator.NO_MORE_DOCS;
return docScorer.score(postingsEnum.docID(), postingsEnum.freq());
var postingsEnum = this.postingsEnum;
var norms = this.norms;
long norm = 1L;
if (norms != null && norms.advanceExact(postingsEnum.docID())) {
norm = norms.longValue();
}
return scorer.score(postingsEnum.freq(), norm);
}
@Override
public float smoothingScore(int docId) throws IOException {
return docScorer.score(docId, 0);
long norm = 1L;
if (norms != null && norms.advanceExact(docId)) {
norm = norms.longValue();
}
return scorer.score(0, norm);
}
@Override

View File

@ -71,15 +71,14 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
}
void countHit(int doc) throws IOException {
++totalHits;
hitsThresholdChecker.incrementHitCount();
int hitCountSoFar = ++totalHits;
if (minScoreAcc != null && (totalHits & minScoreAcc.modInterval) == 0) {
if (minScoreAcc != null && (hitCountSoFar & minScoreAcc.modInterval) == 0) {
updateGlobalMinCompetitiveScore(scorer);
}
if (scoreMode.isExhaustive() == false
&& totalHitsRelation == TotalHits.Relation.EQUAL_TO
&& hitsThresholdChecker.isThresholdReached()) {
&& totalHits > totalHitsThreshold) {
// for the first time hitsThreshold is reached, notify comparator about this
comparator.setHitsThresholdReached();
totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO;
@ -92,7 +91,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
// this document is larger than anything else in the queue, and
// therefore not competitive.
if (searchSortPartOfIndexSort) {
if (hitsThresholdChecker.isThresholdReached()) {
if (totalHits > totalHitsThreshold) {
totalHitsRelation = Relation.GREATER_THAN_OR_EQUAL_TO;
throw new CollectionTerminatedException();
} else {
@ -180,9 +179,9 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
Sort sort,
FieldValueHitQueue<Entry> queue,
int numHits,
HitsThresholdChecker hitsThresholdChecker,
int totalHitsThreshold,
MaxScoreAccumulator minScoreAcc) {
super(queue, numHits, hitsThresholdChecker, sort.needsScores(), minScoreAcc);
super(queue, numHits, totalHitsThreshold, sort.needsScores(), minScoreAcc);
this.sort = sort;
this.queue = queue;
}
@ -235,9 +234,9 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
FieldValueHitQueue<Entry> queue,
FieldDoc after,
int numHits,
HitsThresholdChecker hitsThresholdChecker,
int totalHitsThreshold,
MaxScoreAccumulator minScoreAcc) {
super(queue, numHits, hitsThresholdChecker, sort.needsScores(), minScoreAcc);
super(queue, numHits, totalHitsThreshold, sort.needsScores(), minScoreAcc);
this.sort = sort;
this.queue = queue;
this.after = after;
@ -301,7 +300,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
private static final ScoreDoc[] EMPTY_SCOREDOCS = new ScoreDoc[0];
final int numHits;
final HitsThresholdChecker hitsThresholdChecker;
final int totalHitsThreshold;
final FieldComparator<?> firstComparator;
final boolean canSetMinScore;
@ -327,25 +326,25 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
private TopFieldCollector(
FieldValueHitQueue<Entry> pq,
int numHits,
HitsThresholdChecker hitsThresholdChecker,
int totalHitsThreshold,
boolean needsScores,
MaxScoreAccumulator minScoreAcc) {
super(pq);
this.needsScores = needsScores;
this.numHits = numHits;
this.hitsThresholdChecker = hitsThresholdChecker;
this.totalHitsThreshold = Math.max(totalHitsThreshold, numHits);
this.numComparators = pq.getComparators().length;
this.firstComparator = pq.getComparators()[0];
int reverseMul = pq.reverseMul[0];
if (firstComparator.getClass().equals(FieldComparator.RelevanceComparator.class)
&& reverseMul == 1 // if the natural sort is preserved (sort by descending relevance)
&& hitsThresholdChecker.getHitsThreshold() != Integer.MAX_VALUE) {
&& totalHitsThreshold != Integer.MAX_VALUE) {
scoreMode = ScoreMode.TOP_SCORES;
canSetMinScore = true;
} else {
canSetMinScore = false;
if (hitsThresholdChecker.getHitsThreshold() != Integer.MAX_VALUE) {
if (totalHitsThreshold != Integer.MAX_VALUE) {
scoreMode = needsScores ? ScoreMode.TOP_DOCS_WITH_SCORES : ScoreMode.TOP_DOCS;
} else {
scoreMode = needsScores ? ScoreMode.COMPLETE : ScoreMode.COMPLETE_NO_SCORES;
@ -361,10 +360,10 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
protected void updateGlobalMinCompetitiveScore(Scorable scorer) throws IOException {
assert minScoreAcc != null;
if (canSetMinScore && hitsThresholdChecker.isThresholdReached()) {
// we can start checking the global maximum score even
// if the local queue is not full because the threshold
// is reached.
if (canSetMinScore) {
// we can start checking the global maximum score even if the local queue is not full or if
// the threshold is not reached on the local competitor: the fact that there is a shared min
// competitive score implies that one of the collectors hit its totalHitsThreshold already
long maxMinScore = minScoreAcc.getRaw();
float score;
if (maxMinScore != Long.MIN_VALUE
@ -377,7 +376,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
}
protected void updateMinCompetitiveScore(Scorable scorer) throws IOException {
if (canSetMinScore && queueFull && hitsThresholdChecker.isThresholdReached()) {
if (canSetMinScore && queueFull && totalHits > totalHitsThreshold) {
assert bottom != null;
float minScore = (float) firstComparator.value(bottom.slot);
if (minScore > minCompetitiveScore) {

Some files were not shown because too many files have changed in this diff.