mirror of https://github.com/apache/lucene.git
Merge branch 'main' into optimize_prefix_query
This commit is contained in:
commit
cdf2d5a2aa
|
@ -23,7 +23,7 @@ configure(project(":lucene:core")) {
|
|||
description "Regenerate gen_ForUtil.py"
|
||||
group "generation"
|
||||
|
||||
def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
|
||||
def genDir = file("src/java/org/apache/lucene/codecs/lucene101")
|
||||
def genScript = file("${genDir}/gen_ForUtil.py")
|
||||
def genOutput = file("${genDir}/ForUtil.java")
|
||||
|
||||
|
@ -48,7 +48,7 @@ configure(project(":lucene:core")) {
|
|||
description "Regenerate gen_ForDeltaUtil.py"
|
||||
group "generation"
|
||||
|
||||
def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
|
||||
def genDir = file("src/java/org/apache/lucene/codecs/lucene101")
|
||||
def genScript = file("${genDir}/gen_ForDeltaUtil.py")
|
||||
def genOutput = file("${genDir}/ForDeltaUtil.java")
|
||||
|
||||
|
@ -68,6 +68,7 @@ configure(project(":lucene:core")) {
|
|||
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
|
||||
mustRunBefore: [ "compileJava" ]
|
||||
])
|
||||
|
||||
}
|
||||
|
||||
configure(project(":lucene:backward-codecs")) {
|
||||
|
@ -146,5 +147,55 @@ configure(project(":lucene:backward-codecs")) {
|
|||
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
|
||||
mustRunBefore: [ "compileJava" ]
|
||||
])
|
||||
|
||||
task generateForUtil912Internal() {
|
||||
description "Regenerate gen_ForUtil.py"
|
||||
group "generation"
|
||||
|
||||
def genDir = file("src/java/org/apache/lucene/backward_codecs/lucene912")
|
||||
def genScript = file("${genDir}/gen_ForUtil.py")
|
||||
def genOutput = file("${genDir}/ForUtil.java")
|
||||
|
||||
inputs.file genScript
|
||||
outputs.file genOutput
|
||||
|
||||
doLast {
|
||||
quietExec {
|
||||
workingDir genDir
|
||||
executable project.externalTool("python3")
|
||||
args = [ '-B', genScript ]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
regenerate.dependsOn wrapWithPersistentChecksums(generateForUtil912Internal, [
|
||||
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
|
||||
mustRunBefore: [ "compileJava" ]
|
||||
])
|
||||
|
||||
task generateForDeltaUtil912Internal() {
|
||||
description "Regenerate gen_ForDeltaUtil.py"
|
||||
group "generation"
|
||||
|
||||
def genDir = file("src/java/org/apache/lucene/backward_codecs/lucene912")
|
||||
def genScript = file("${genDir}/gen_ForDeltaUtil.py")
|
||||
def genOutput = file("${genDir}/ForDeltaUtil.java")
|
||||
|
||||
inputs.file genScript
|
||||
outputs.file genOutput
|
||||
|
||||
doLast {
|
||||
quietExec {
|
||||
workingDir genDir
|
||||
executable project.externalTool("python3")
|
||||
args = [ '-B', genScript ]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
regenerate.dependsOn wrapWithPersistentChecksums(generateForDeltaUtil912Internal, [
|
||||
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
|
||||
mustRunBefore: [ "compileJava" ]
|
||||
])
|
||||
}
|
||||
|
||||
|
|
|
@ -128,8 +128,14 @@ allprojects {
|
|||
jvmArgs '--add-modules', 'jdk.management'
|
||||
|
||||
// Enable the vector incubator module on supported Java versions:
|
||||
if (rootProject.vectorIncubatorJavaVersions.contains(rootProject.runtimeJavaVersion)) {
|
||||
def prop = propertyOrDefault("org.apache.lucene.vectorization.upperJavaFeatureVersion", "1") as String
|
||||
def v = JavaVersion.toVersion(Integer.parseInt(prop)).majorVersion
|
||||
if (rootProject.vectorIncubatorJavaVersions.contains(rootProject.runtimeJavaVersion) ||
|
||||
rootProject.runtimeJavaVersion.majorVersion <= v) {
|
||||
jvmArgs '--add-modules', 'jdk.incubator.vector'
|
||||
if (rootProject.runtimeJavaVersion.majorVersion <= v) {
|
||||
systemProperty 'org.apache.lucene.vectorization.upperJavaFeatureVersion', v
|
||||
}
|
||||
}
|
||||
|
||||
jvmArgs '--enable-native-access=' + (project.path in [
|
||||
|
|
|
@ -7,7 +7,7 @@ http://s.apache.org/luceneversions
|
|||
|
||||
API Changes
|
||||
---------------------
|
||||
(No changes)
|
||||
* GITHUB#11023: Removing deprecated parameters from CheckIndex. (Jakub Slowinski)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
|
@ -36,6 +36,10 @@ API Changes
|
|||
|
||||
* GITHUB#13859: Allow open-ended ranges in Intervals range queries. (Mayya Sharipova)
|
||||
|
||||
* GITHUB#13950: Make BooleanQuery#getClauses public and add #add(Collection<BooleanClause>) to BQ builder. (Shubham Chaudhary)
|
||||
|
||||
* GITHUB#13957: Removed LeafSimScorer class, to save its overhead. Scorers now
|
||||
compute scores directly from a SimScorer, postings and norms. (Adrien Grand)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
|
@ -43,7 +47,11 @@ New Features
|
|||
|
||||
Improvements
|
||||
---------------------
|
||||
(No changes)
|
||||
|
||||
* GITHUB#13986: Allow easier configuration of the Panama Vectorization provider with
|
||||
newer Java versions. Set the `org.apache.lucene.vectorization.upperJavaFeatureVersion`
|
||||
system property to increase the set of Java versions that Panama Vectorization will
|
||||
provide optimized implementations for. (Chris Hegarty)
|
||||
|
||||
Optimizations
|
||||
---------------------
|
||||
|
@ -53,12 +61,48 @@ Optimizations
|
|||
* GITHUB#13800: MaxScoreBulkScorer now recomputes scorer partitions when the
|
||||
minimum competitive allows for a more favorable partitioning. (Adrien Grand)
|
||||
|
||||
* GITHUB#13930: Use growNoCopy when copying bytes in BytesRefBuilder. (Ignacio Vera)
|
||||
|
||||
* GITHUB#13931: Refactored `BooleanScorer` to evaluate matches of sub clauses
|
||||
using the `Scorer` abstraction rather than the `BulkScorer` abstraction. This
|
||||
speeds up exhaustive evaluation of disjunctions of term queries.
|
||||
(Adrien Grand)
|
||||
|
||||
* GITHUB#13941: Optimized computation of top-hits on disjunctive queries with
|
||||
many clauses. (Adrien Grand)
|
||||
|
||||
* GITHUB#13954: Disabled exchanging scores across slices for exhaustive
|
||||
top-hits evaluation. (Adrien Grand)
|
||||
|
||||
* GITHUB#13899: Check ahead if we can get the count. (Lu Xugang)
|
||||
|
||||
* GITHUB#13943: Removed shared `HitsThresholdChecker`, which reduces overhead
|
||||
but may delay a bit when dynamic pruning kicks in. (Adrien Grand)
|
||||
|
||||
* GITHUB#13961: Replace Map<String,Object> with IntObjectHashMap for DV producer. (Pan Guixin)
|
||||
|
||||
* GITHUB#13963: Speed up nextDoc() implementations in Lucene912PostingsReader.
|
||||
(Adrien Grand)
|
||||
|
||||
* GITHUB#13958: Speed up advancing within a block. (Adrien Grand)
|
||||
|
||||
* GITHUB#13763: Replace Map<String,Object> with IntObjectHashMap for KnnVectorsReader (Pan Guixin)
|
||||
|
||||
* GITHUB#13968: Switch postings from storing doc IDs in a long[] to an int[].
|
||||
Lucene 8.4 had moved to a long[] to help speed up block decoding by using
|
||||
longs that would pack two integers. We are now moving back to integers to be
|
||||
able to take advantage of 2x more lanes with the vector API. (Adrien Grand)
|
||||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended
|
||||
when they were not sorted by startOffset. (Seunghan Jung)
|
||||
* GITHUB#13884: Remove broken .toArray from Long/CharObjectHashMap entirely. (Pan Guixin)
|
||||
* GITHUB#12686: Added support for highlighting IndexOrDocValuesQuery. (Prudhvi Godithi)
|
||||
* GITHUB#13927: Fix StoredFieldsConsumer finish. (linfn)
|
||||
* GITHUB#13944: Ensure deterministic order of clauses for `DisjunctionMaxQuery#toString`. (Laurent Jakubina)
|
||||
* GITHUB#13841: Improve Tessellatorlogic when two holes share the same vertex with the polygon which was failing
|
||||
in valid polygons. (Ignacio Vera)
|
||||
|
||||
Build
|
||||
---------------------
|
||||
|
@ -67,7 +111,7 @@ Build
|
|||
|
||||
Other
|
||||
---------------------
|
||||
(No changes)
|
||||
* GITHUB#13982: Remove duplicate test code. (Lu Xugang)
|
||||
|
||||
======================== Lucene 10.0.1 =======================
|
||||
|
||||
|
@ -295,6 +339,8 @@ Bug Fixes
|
|||
* GITHUB#12878: Fix the declared Exceptions of Expression#evaluate() to match those
|
||||
of DoubleValues#doubleValue(). (Uwe Schindler)
|
||||
|
||||
* GITHUB#13498: Avoid performance regression by constructing lazily the PointTree in NumericComparator, (Ignacio Vera)
|
||||
|
||||
Changes in Runtime Behavior
|
||||
---------------------
|
||||
|
||||
|
|
|
@ -19,6 +19,13 @@
|
|||
|
||||
## Migration from Lucene 9.x to Lucene 10.0
|
||||
|
||||
### DataInput#readVLong() may now read negative vlongs
|
||||
|
||||
LUCENE-10376 started allowing `DataInput#readVLong()` to read negative vlongs.
|
||||
In particular, this feature is used by the `DataInput#readZLong()` method. A
|
||||
practical implication is that `DataInput#readVLong()` may now read up to 10
|
||||
bytes, while it would never read more than 9 bytes in Lucene 9.x.
|
||||
|
||||
### Changes to DataInput.readGroupVInt and readGroupVInts methods
|
||||
|
||||
As part of GITHUB#13820, GITHUB#13825, GITHUB#13830, this issue corrects DataInput.readGroupVInts
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
{
|
||||
"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/ForDeltaUtil.java": "b81961f0b277b1458ca259e0d23ccc4eeeb47fe7",
|
||||
"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/gen_ForDeltaUtil.py": "3191d7591309b7876c5c709fb9375af5b87c2ef8"
|
||||
}
|
|
@ -0,0 +1,4 @@
|
|||
{
|
||||
"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/ForUtil.java": "e6db3c665dfebca8b93eb6b4651d2eb3af637b02",
|
||||
"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/gen_ForUtil.py": "993ecc9cf7ea821963384070669695257b16e040"
|
||||
}
|
|
@ -37,6 +37,7 @@ module org.apache.lucene.backward_codecs {
|
|||
exports org.apache.lucene.backward_codecs.lucene95;
|
||||
exports org.apache.lucene.backward_codecs.lucene99;
|
||||
exports org.apache.lucene.backward_codecs.lucene912;
|
||||
exports org.apache.lucene.backward_codecs.lucene100;
|
||||
exports org.apache.lucene.backward_codecs.packed;
|
||||
exports org.apache.lucene.backward_codecs.store;
|
||||
|
||||
|
@ -46,7 +47,8 @@ module org.apache.lucene.backward_codecs {
|
|||
org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat,
|
||||
org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat,
|
||||
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat,
|
||||
org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat;
|
||||
org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat,
|
||||
org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat;
|
||||
provides org.apache.lucene.codecs.KnnVectorsFormat with
|
||||
org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat,
|
||||
org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsFormat,
|
||||
|
@ -64,5 +66,6 @@ module org.apache.lucene.backward_codecs {
|
|||
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec,
|
||||
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec,
|
||||
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec,
|
||||
org.apache.lucene.backward_codecs.lucene912.Lucene912Codec;
|
||||
org.apache.lucene.backward_codecs.lucene912.Lucene912Codec,
|
||||
org.apache.lucene.backward_codecs.lucene100.Lucene100Codec;
|
||||
}
|
||||
|
|
|
@ -14,9 +14,10 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene100;
|
||||
package org.apache.lucene.backward_codecs.lucene100;
|
||||
|
||||
import java.util.Objects;
|
||||
import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.CompoundFormat;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
|
@ -37,7 +38,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
|
|||
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
|
||||
|
@ -50,7 +50,7 @@ import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
|||
*
|
||||
* <p>If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}.
|
||||
*
|
||||
* @see org.apache.lucene.codecs.lucene100 package documentation for file format details.
|
||||
* @see org.apache.lucene.backward_codecs.lucene100 package documentation for file format details.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class Lucene100Codec extends Codec {
|
|
@ -15,5 +15,5 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/** Lucene 9.12 file format. */
|
||||
package org.apache.lucene.codecs.lucene912;
|
||||
/** Lucene 10.0 file format. */
|
||||
package org.apache.lucene.backward_codecs.lucene100;
|
|
@ -17,8 +17,6 @@
|
|||
package org.apache.lucene.backward_codecs.lucene80;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import org.apache.lucene.backward_codecs.packed.LegacyDirectMonotonicReader;
|
||||
import org.apache.lucene.backward_codecs.packed.LegacyDirectReader;
|
||||
import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil;
|
||||
|
@ -41,6 +39,7 @@ import org.apache.lucene.index.SortedNumericDocValues;
|
|||
import org.apache.lucene.index.SortedSetDocValues;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.TermsEnum.SeekStatus;
|
||||
import org.apache.lucene.internal.hppc.IntObjectHashMap;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
|
@ -53,11 +52,11 @@ import org.apache.lucene.util.compress.LZ4;
|
|||
|
||||
/** reader for {@link Lucene80DocValuesFormat} */
|
||||
final class Lucene80DocValuesProducer extends DocValuesProducer {
|
||||
private final Map<String, NumericEntry> numerics = new HashMap<>();
|
||||
private final Map<String, BinaryEntry> binaries = new HashMap<>();
|
||||
private final Map<String, SortedEntry> sorted = new HashMap<>();
|
||||
private final Map<String, SortedSetEntry> sortedSets = new HashMap<>();
|
||||
private final Map<String, SortedNumericEntry> sortedNumerics = new HashMap<>();
|
||||
private final IntObjectHashMap<NumericEntry> numerics = new IntObjectHashMap<>();
|
||||
private final IntObjectHashMap<BinaryEntry> binaries = new IntObjectHashMap<>();
|
||||
private final IntObjectHashMap<SortedEntry> sorted = new IntObjectHashMap<>();
|
||||
private final IntObjectHashMap<SortedSetEntry> sortedSets = new IntObjectHashMap<>();
|
||||
private final IntObjectHashMap<SortedNumericEntry> sortedNumerics = new IntObjectHashMap<>();
|
||||
private final IndexInput data;
|
||||
private final int maxDoc;
|
||||
private int version = -1;
|
||||
|
@ -139,7 +138,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
|
|||
}
|
||||
byte type = meta.readByte();
|
||||
if (type == Lucene80DocValuesFormat.NUMERIC) {
|
||||
numerics.put(info.name, readNumeric(meta));
|
||||
numerics.put(info.number, readNumeric(meta));
|
||||
} else if (type == Lucene80DocValuesFormat.BINARY) {
|
||||
final boolean compressed;
|
||||
if (version >= Lucene80DocValuesFormat.VERSION_CONFIGURABLE_COMPRESSION) {
|
||||
|
@ -158,13 +157,13 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
|
|||
} else {
|
||||
compressed = version >= Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED;
|
||||
}
|
||||
binaries.put(info.name, readBinary(meta, compressed));
|
||||
binaries.put(info.number, readBinary(meta, compressed));
|
||||
} else if (type == Lucene80DocValuesFormat.SORTED) {
|
||||
sorted.put(info.name, readSorted(meta));
|
||||
sorted.put(info.number, readSorted(meta));
|
||||
} else if (type == Lucene80DocValuesFormat.SORTED_SET) {
|
||||
sortedSets.put(info.name, readSortedSet(meta));
|
||||
sortedSets.put(info.number, readSortedSet(meta));
|
||||
} else if (type == Lucene80DocValuesFormat.SORTED_NUMERIC) {
|
||||
sortedNumerics.put(info.name, readSortedNumeric(meta));
|
||||
sortedNumerics.put(info.number, readSortedNumeric(meta));
|
||||
} else {
|
||||
throw new CorruptIndexException("invalid type: " + type, meta);
|
||||
}
|
||||
|
@ -426,7 +425,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
|
||||
NumericEntry entry = numerics.get(field.name);
|
||||
NumericEntry entry = numerics.get(field.number);
|
||||
return getNumeric(entry);
|
||||
}
|
||||
|
||||
|
@ -915,7 +914,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
|
||||
BinaryEntry entry = binaries.get(field.name);
|
||||
BinaryEntry entry = binaries.get(field.number);
|
||||
if (entry.compressed) {
|
||||
return getCompressedBinary(entry);
|
||||
} else {
|
||||
|
@ -973,7 +972,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public SortedDocValues getSorted(FieldInfo field) throws IOException {
|
||||
SortedEntry entry = sorted.get(field.name);
|
||||
SortedEntry entry = sorted.get(field.number);
|
||||
return getSorted(entry);
|
||||
}
|
||||
|
||||
|
@ -1407,7 +1406,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
|
||||
SortedNumericEntry entry = sortedNumerics.get(field.name);
|
||||
SortedNumericEntry entry = sortedNumerics.get(field.number);
|
||||
if (entry.numValues == entry.numDocsWithField) {
|
||||
return DocValues.singleton(getNumeric(entry));
|
||||
}
|
||||
|
@ -1543,7 +1542,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
|
||||
SortedSetEntry entry = sortedSets.get(field.name);
|
||||
SortedSetEntry entry = sortedSets.get(field.number);
|
||||
if (entry.singleValueEntry != null) {
|
||||
return DocValues.singleton(getSorted(entry.singleValueEntry));
|
||||
}
|
||||
|
|
|
@ -20,8 +20,6 @@ package org.apache.lucene.backward_codecs.lucene90;
|
|||
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.SplittableRandom;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.KnnVectorsReader;
|
||||
|
@ -33,6 +31,7 @@ import org.apache.lucene.index.FloatVectorValues;
|
|||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||
import org.apache.lucene.internal.hppc.IntObjectHashMap;
|
||||
import org.apache.lucene.search.KnnCollector;
|
||||
import org.apache.lucene.search.VectorScorer;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
|
@ -50,14 +49,16 @@ import org.apache.lucene.util.hnsw.NeighborQueue;
|
|||
*/
|
||||
public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
|
||||
|
||||
private final Map<String, FieldEntry> fields = new HashMap<>();
|
||||
private final IntObjectHashMap<FieldEntry> fields = new IntObjectHashMap<>();
|
||||
private final IndexInput vectorData;
|
||||
private final IndexInput vectorIndex;
|
||||
private final long checksumSeed;
|
||||
private final FieldInfos fieldInfos;
|
||||
|
||||
Lucene90HnswVectorsReader(SegmentReadState state) throws IOException {
|
||||
int versionMeta = readMetadata(state);
|
||||
long[] checksumRef = new long[1];
|
||||
this.fieldInfos = state.fieldInfos;
|
||||
boolean success = false;
|
||||
try {
|
||||
vectorData =
|
||||
|
@ -158,7 +159,7 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
|
|||
|
||||
FieldEntry fieldEntry = readField(meta, info);
|
||||
validateFieldEntry(info, fieldEntry);
|
||||
fields.put(info.name, fieldEntry);
|
||||
fields.put(info.number, fieldEntry);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -218,13 +219,18 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
|
|||
CodecUtil.checksumEntireFile(vectorIndex);
|
||||
}
|
||||
|
||||
@Override
|
||||
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
if (fieldEntry == null) {
|
||||
private FieldEntry getFieldEntry(String field) {
|
||||
final FieldInfo info = fieldInfos.fieldInfo(field);
|
||||
final FieldEntry fieldEntry;
|
||||
if (info == null || (fieldEntry = fields.get(info.number)) == null) {
|
||||
throw new IllegalArgumentException("field=\"" + field + "\" not found");
|
||||
}
|
||||
return getOffHeapVectorValues(fieldEntry);
|
||||
return fieldEntry;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
|
||||
return getOffHeapVectorValues(getFieldEntry(field));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -235,8 +241,7 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
|
|||
@Override
|
||||
public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs)
|
||||
throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
|
||||
final FieldEntry fieldEntry = getFieldEntry(field);
|
||||
if (fieldEntry.size() == 0) {
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -21,8 +21,6 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.function.IntUnaryOperator;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.KnnVectorsReader;
|
||||
|
@ -35,6 +33,7 @@ import org.apache.lucene.index.FloatVectorValues;
|
|||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||
import org.apache.lucene.internal.hppc.IntObjectHashMap;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.KnnCollector;
|
||||
import org.apache.lucene.search.VectorScorer;
|
||||
|
@ -55,13 +54,15 @@ import org.apache.lucene.util.hnsw.RandomVectorScorer;
|
|||
*/
|
||||
public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
|
||||
|
||||
private final Map<String, FieldEntry> fields = new HashMap<>();
|
||||
private final IntObjectHashMap<FieldEntry> fields = new IntObjectHashMap<>();
|
||||
private final IndexInput vectorData;
|
||||
private final IndexInput vectorIndex;
|
||||
private final DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer();
|
||||
private final FieldInfos fieldInfos;
|
||||
|
||||
Lucene91HnswVectorsReader(SegmentReadState state) throws IOException {
|
||||
int versionMeta = readMetadata(state);
|
||||
this.fieldInfos = state.fieldInfos;
|
||||
boolean success = false;
|
||||
try {
|
||||
vectorData =
|
||||
|
@ -154,7 +155,7 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
|
|||
}
|
||||
FieldEntry fieldEntry = readField(meta, info);
|
||||
validateFieldEntry(info, fieldEntry);
|
||||
fields.put(info.name, fieldEntry);
|
||||
fields.put(info.number, fieldEntry);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -214,13 +215,18 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
|
|||
CodecUtil.checksumEntireFile(vectorIndex);
|
||||
}
|
||||
|
||||
@Override
|
||||
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
if (fieldEntry == null) {
|
||||
private FieldEntry getFieldEntry(String field) {
|
||||
final FieldInfo info = fieldInfos.fieldInfo(field);
|
||||
final FieldEntry fieldEntry;
|
||||
if (info == null || (fieldEntry = fields.get(info.number)) == null) {
|
||||
throw new IllegalArgumentException("field=\"" + field + "\" not found");
|
||||
}
|
||||
return getOffHeapVectorValues(fieldEntry);
|
||||
return fieldEntry;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
|
||||
return getOffHeapVectorValues(getFieldEntry(field));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -231,8 +237,7 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
|
|||
@Override
|
||||
public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs)
|
||||
throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
|
||||
final FieldEntry fieldEntry = getFieldEntry(field);
|
||||
if (fieldEntry.size() == 0) {
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -16,13 +16,13 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene912;
|
||||
package org.apache.lucene.backward_codecs.lucene912;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene912.ForUtil.*;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.ForUtil.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/**
|
||||
|
@ -30,7 +30,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* SIMD-like speedups. If bitsPerValue <= 4 then we pack 8 ints per long else if bitsPerValue
|
||||
* <= 11 we pack 4 ints per long else we pack 2 ints per long
|
||||
*/
|
||||
public final class ForDeltaUtil {
|
||||
final class ForDeltaUtil {
|
||||
|
||||
private static final int ONE_BLOCK_SIZE_FOURTH = BLOCK_SIZE / 4;
|
||||
private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2;
|
||||
|
@ -272,125 +272,124 @@ public final class ForDeltaUtil {
|
|||
}
|
||||
|
||||
/** Decode deltas, compute the prefix sum and add {@code base} to all decoded longs. */
|
||||
void decodeAndPrefixSum(PostingDecodingUtil pdu, long base, long[] longs) throws IOException {
|
||||
final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte());
|
||||
void decodeAndPrefixSum(IndexInput in, long base, long[] longs) throws IOException {
|
||||
final int bitsPerValue = Byte.toUnsignedInt(in.readByte());
|
||||
if (bitsPerValue == 0) {
|
||||
prefixSumOfOnes(longs, base);
|
||||
} else {
|
||||
decodeAndPrefixSum(bitsPerValue, pdu, base, longs);
|
||||
decodeAndPrefixSum(bitsPerValue, in, base, longs);
|
||||
}
|
||||
}
|
||||
|
||||
/** Delta-decode 128 integers into {@code longs}. */
|
||||
void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, long base, long[] longs)
|
||||
void decodeAndPrefixSum(int bitsPerValue, IndexInput in, long base, long[] longs)
|
||||
throws IOException {
|
||||
switch (bitsPerValue) {
|
||||
case 1:
|
||||
decode1(pdu, tmp, longs);
|
||||
decode1(in, longs);
|
||||
prefixSum8(longs, base);
|
||||
break;
|
||||
case 2:
|
||||
decode2(pdu, tmp, longs);
|
||||
decode2(in, longs);
|
||||
prefixSum8(longs, base);
|
||||
break;
|
||||
case 3:
|
||||
decode3(pdu, tmp, longs);
|
||||
decode3(in, tmp, longs);
|
||||
prefixSum8(longs, base);
|
||||
break;
|
||||
case 4:
|
||||
decode4(pdu, tmp, longs);
|
||||
decode4(in, longs);
|
||||
prefixSum8(longs, base);
|
||||
break;
|
||||
case 5:
|
||||
decode5To16(pdu, tmp, longs);
|
||||
decode5To16(in, tmp, longs);
|
||||
prefixSum16(longs, base);
|
||||
break;
|
||||
case 6:
|
||||
decode6To16(pdu, tmp, longs);
|
||||
decode6To16(in, tmp, longs);
|
||||
prefixSum16(longs, base);
|
||||
break;
|
||||
case 7:
|
||||
decode7To16(pdu, tmp, longs);
|
||||
decode7To16(in, tmp, longs);
|
||||
prefixSum16(longs, base);
|
||||
break;
|
||||
case 8:
|
||||
decode8To16(pdu, tmp, longs);
|
||||
decode8To16(in, longs);
|
||||
prefixSum16(longs, base);
|
||||
break;
|
||||
case 9:
|
||||
decode9(pdu, tmp, longs);
|
||||
decode9(in, tmp, longs);
|
||||
prefixSum16(longs, base);
|
||||
break;
|
||||
case 10:
|
||||
decode10(pdu, tmp, longs);
|
||||
decode10(in, tmp, longs);
|
||||
prefixSum16(longs, base);
|
||||
break;
|
||||
case 11:
|
||||
decode11(pdu, tmp, longs);
|
||||
decode11(in, tmp, longs);
|
||||
prefixSum16(longs, base);
|
||||
break;
|
||||
case 12:
|
||||
decode12To32(pdu, tmp, longs);
|
||||
decode12To32(in, tmp, longs);
|
||||
prefixSum32(longs, base);
|
||||
break;
|
||||
case 13:
|
||||
decode13To32(pdu, tmp, longs);
|
||||
decode13To32(in, tmp, longs);
|
||||
prefixSum32(longs, base);
|
||||
break;
|
||||
case 14:
|
||||
decode14To32(pdu, tmp, longs);
|
||||
decode14To32(in, tmp, longs);
|
||||
prefixSum32(longs, base);
|
||||
break;
|
||||
case 15:
|
||||
decode15To32(pdu, tmp, longs);
|
||||
decode15To32(in, tmp, longs);
|
||||
prefixSum32(longs, base);
|
||||
break;
|
||||
case 16:
|
||||
decode16To32(pdu, tmp, longs);
|
||||
decode16To32(in, longs);
|
||||
prefixSum32(longs, base);
|
||||
break;
|
||||
case 17:
|
||||
decode17(pdu, tmp, longs);
|
||||
decode17(in, tmp, longs);
|
||||
prefixSum32(longs, base);
|
||||
break;
|
||||
case 18:
|
||||
decode18(pdu, tmp, longs);
|
||||
decode18(in, tmp, longs);
|
||||
prefixSum32(longs, base);
|
||||
break;
|
||||
case 19:
|
||||
decode19(pdu, tmp, longs);
|
||||
decode19(in, tmp, longs);
|
||||
prefixSum32(longs, base);
|
||||
break;
|
||||
case 20:
|
||||
decode20(pdu, tmp, longs);
|
||||
decode20(in, tmp, longs);
|
||||
prefixSum32(longs, base);
|
||||
break;
|
||||
case 21:
|
||||
decode21(pdu, tmp, longs);
|
||||
decode21(in, tmp, longs);
|
||||
prefixSum32(longs, base);
|
||||
break;
|
||||
case 22:
|
||||
decode22(pdu, tmp, longs);
|
||||
decode22(in, tmp, longs);
|
||||
prefixSum32(longs, base);
|
||||
break;
|
||||
case 23:
|
||||
decode23(pdu, tmp, longs);
|
||||
decode23(in, tmp, longs);
|
||||
prefixSum32(longs, base);
|
||||
break;
|
||||
case 24:
|
||||
decode24(pdu, tmp, longs);
|
||||
decode24(in, tmp, longs);
|
||||
prefixSum32(longs, base);
|
||||
break;
|
||||
default:
|
||||
decodeSlow(bitsPerValue, pdu, tmp, longs);
|
||||
decodeSlow(bitsPerValue, in, tmp, longs);
|
||||
prefixSum32(longs, base);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
private static void decode5To16(PostingDecodingUtil pdu, long[] tmp, long[] longs)
|
||||
throws IOException {
|
||||
pdu.splitLongs(10, longs, 11, 5, MASK16_5, tmp, 0, MASK16_1);
|
||||
private static void decode5To16(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 10, longs, 11, 5, MASK16_5, tmp, 0, MASK16_1);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 30; iter < 2; ++iter, tmpIdx += 5, longsIdx += 1) {
|
||||
long l0 = tmp[tmpIdx + 0] << 4;
|
||||
l0 |= tmp[tmpIdx + 1] << 3;
|
||||
|
@ -401,9 +400,8 @@ public final class ForDeltaUtil {
|
|||
}
|
||||
}
|
||||
|
||||
private static void decode6To16(PostingDecodingUtil pdu, long[] tmp, long[] longs)
|
||||
throws IOException {
|
||||
pdu.splitLongs(12, longs, 10, 6, MASK16_6, tmp, 0, MASK16_4);
|
||||
private static void decode6To16(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 12, longs, 10, 6, MASK16_6, tmp, 0, MASK16_4);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 24; iter < 4; ++iter, tmpIdx += 3, longsIdx += 2) {
|
||||
long l0 = tmp[tmpIdx + 0] << 2;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_2;
|
||||
|
@ -414,9 +412,8 @@ public final class ForDeltaUtil {
|
|||
}
|
||||
}
|
||||
|
||||
private static void decode7To16(PostingDecodingUtil pdu, long[] tmp, long[] longs)
|
||||
throws IOException {
|
||||
pdu.splitLongs(14, longs, 9, 7, MASK16_7, tmp, 0, MASK16_2);
|
||||
private static void decode7To16(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 14, longs, 9, 7, MASK16_7, tmp, 0, MASK16_2);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 28; iter < 2; ++iter, tmpIdx += 7, longsIdx += 2) {
|
||||
long l0 = tmp[tmpIdx + 0] << 5;
|
||||
l0 |= tmp[tmpIdx + 1] << 3;
|
||||
|
@ -431,14 +428,12 @@ public final class ForDeltaUtil {
|
|||
}
|
||||
}
|
||||
|
||||
private static void decode8To16(PostingDecodingUtil pdu, long[] tmp, long[] longs)
|
||||
throws IOException {
|
||||
pdu.splitLongs(16, longs, 8, 8, MASK16_8, longs, 16, MASK16_8);
|
||||
private static void decode8To16(IndexInput in, long[] longs) throws IOException {
|
||||
splitLongs(in, 16, longs, 8, 8, MASK16_8, longs, 16, MASK16_8);
|
||||
}
|
||||
|
||||
private static void decode12To32(PostingDecodingUtil pdu, long[] tmp, long[] longs)
|
||||
throws IOException {
|
||||
pdu.splitLongs(24, longs, 20, 12, MASK32_12, tmp, 0, MASK32_8);
|
||||
private static void decode12To32(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 24, longs, 20, 12, MASK32_12, tmp, 0, MASK32_8);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 48; iter < 8; ++iter, tmpIdx += 3, longsIdx += 2) {
|
||||
long l0 = tmp[tmpIdx + 0] << 4;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 4) & MASK32_4;
|
||||
|
@ -449,9 +444,8 @@ public final class ForDeltaUtil {
|
|||
}
|
||||
}
|
||||
|
||||
private static void decode13To32(PostingDecodingUtil pdu, long[] tmp, long[] longs)
|
||||
throws IOException {
|
||||
pdu.splitLongs(26, longs, 19, 13, MASK32_13, tmp, 0, MASK32_6);
|
||||
private static void decode13To32(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 26, longs, 19, 13, MASK32_13, tmp, 0, MASK32_6);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 52; iter < 2; ++iter, tmpIdx += 13, longsIdx += 6) {
|
||||
long l0 = tmp[tmpIdx + 0] << 7;
|
||||
l0 |= tmp[tmpIdx + 1] << 1;
|
||||
|
@ -480,9 +474,8 @@ public final class ForDeltaUtil {
|
|||
}
|
||||
}
|
||||
|
||||
private static void decode14To32(PostingDecodingUtil pdu, long[] tmp, long[] longs)
|
||||
throws IOException {
|
||||
pdu.splitLongs(28, longs, 18, 14, MASK32_14, tmp, 0, MASK32_4);
|
||||
private static void decode14To32(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 28, longs, 18, 14, MASK32_14, tmp, 0, MASK32_4);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 56; iter < 4; ++iter, tmpIdx += 7, longsIdx += 2) {
|
||||
long l0 = tmp[tmpIdx + 0] << 10;
|
||||
l0 |= tmp[tmpIdx + 1] << 6;
|
||||
|
@ -497,9 +490,8 @@ public final class ForDeltaUtil {
|
|||
}
|
||||
}
|
||||
|
||||
private static void decode15To32(PostingDecodingUtil pdu, long[] tmp, long[] longs)
|
||||
throws IOException {
|
||||
pdu.splitLongs(30, longs, 17, 15, MASK32_15, tmp, 0, MASK32_2);
|
||||
private static void decode15To32(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 30, longs, 17, 15, MASK32_15, tmp, 0, MASK32_2);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 60; iter < 2; ++iter, tmpIdx += 15, longsIdx += 2) {
|
||||
long l0 = tmp[tmpIdx + 0] << 13;
|
||||
l0 |= tmp[tmpIdx + 1] << 11;
|
||||
|
@ -522,8 +514,7 @@ public final class ForDeltaUtil {
|
|||
}
|
||||
}
|
||||
|
||||
private static void decode16To32(PostingDecodingUtil pdu, long[] tmp, long[] longs)
|
||||
throws IOException {
|
||||
pdu.splitLongs(32, longs, 16, 16, MASK32_16, longs, 32, MASK32_16);
|
||||
private static void decode16To32(IndexInput in, long[] longs) throws IOException {
|
||||
splitLongs(in, 32, longs, 16, 16, MASK32_16, longs, 32, MASK32_16);
|
||||
}
|
||||
}
|
|
@ -16,18 +16,18 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene912;
|
||||
package org.apache.lucene.backward_codecs.lucene912;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
|
||||
/**
|
||||
* Inspired from https://fulmicoton.com/posts/bitpacking/ Encodes multiple integers in a long to get
|
||||
* SIMD-like speedups. If bitsPerValue <= 8 then we pack 8 ints per long else if bitsPerValue
|
||||
* <= 16 we pack 4 ints per long else we pack 2 ints per long
|
||||
*/
|
||||
public final class ForUtil {
|
||||
final class ForUtil {
|
||||
|
||||
public static final int BLOCK_SIZE = 128;
|
||||
static final int BLOCK_SIZE_LOG2 = 7;
|
||||
|
@ -196,11 +196,11 @@ public final class ForUtil {
|
|||
return bitsPerValue << (BLOCK_SIZE_LOG2 - 3);
|
||||
}
|
||||
|
||||
static void decodeSlow(int bitsPerValue, PostingDecodingUtil pdu, long[] tmp, long[] longs)
|
||||
static void decodeSlow(int bitsPerValue, IndexInput in, long[] tmp, long[] longs)
|
||||
throws IOException {
|
||||
final int numLongs = bitsPerValue << 1;
|
||||
final long mask = MASKS32[bitsPerValue];
|
||||
pdu.splitLongs(numLongs, longs, 32 - bitsPerValue, 32, mask, tmp, 0, -1L);
|
||||
splitLongs(in, numLongs, longs, 32 - bitsPerValue, 32, mask, tmp, 0, -1L);
|
||||
final int remainingBitsPerLong = 32 - bitsPerValue;
|
||||
final long mask32RemainingBitsPerLong = MASKS32[remainingBitsPerLong];
|
||||
int tmpIdx = 0;
|
||||
|
@ -222,6 +222,28 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void splitLongs(
|
||||
IndexInput in,
|
||||
int count,
|
||||
long[] b,
|
||||
int bShift,
|
||||
int dec,
|
||||
long bMask,
|
||||
long[] c,
|
||||
int cIndex,
|
||||
long cMask)
|
||||
throws IOException {
|
||||
// takes advantage of the C2 compiler's loop unrolling and auto-vectorization.
|
||||
in.readLongs(c, cIndex, count);
|
||||
int maxIter = (bShift - 1) / dec;
|
||||
for (int i = 0; i < count; ++i) {
|
||||
for (int j = 0; j <= maxIter; ++j) {
|
||||
b[count * j + i] = (c[cIndex + i] >>> (bShift - j * dec)) & bMask;
|
||||
}
|
||||
c[cIndex + i] &= cMask;
|
||||
}
|
||||
}
|
||||
|
||||
static final long[] MASKS8 = new long[8];
|
||||
static final long[] MASKS16 = new long[16];
|
||||
static final long[] MASKS32 = new long[32];
|
||||
|
@ -288,121 +310,121 @@ public final class ForUtil {
|
|||
static final long MASK32_24 = MASKS32[24];
|
||||
|
||||
/** Decode 128 integers into {@code longs}. */
|
||||
void decode(int bitsPerValue, PostingDecodingUtil pdu, long[] longs) throws IOException {
|
||||
void decode(int bitsPerValue, IndexInput in, long[] longs) throws IOException {
|
||||
switch (bitsPerValue) {
|
||||
case 1:
|
||||
decode1(pdu, tmp, longs);
|
||||
decode1(in, longs);
|
||||
expand8(longs);
|
||||
break;
|
||||
case 2:
|
||||
decode2(pdu, tmp, longs);
|
||||
decode2(in, longs);
|
||||
expand8(longs);
|
||||
break;
|
||||
case 3:
|
||||
decode3(pdu, tmp, longs);
|
||||
decode3(in, tmp, longs);
|
||||
expand8(longs);
|
||||
break;
|
||||
case 4:
|
||||
decode4(pdu, tmp, longs);
|
||||
decode4(in, longs);
|
||||
expand8(longs);
|
||||
break;
|
||||
case 5:
|
||||
decode5(pdu, tmp, longs);
|
||||
decode5(in, tmp, longs);
|
||||
expand8(longs);
|
||||
break;
|
||||
case 6:
|
||||
decode6(pdu, tmp, longs);
|
||||
decode6(in, tmp, longs);
|
||||
expand8(longs);
|
||||
break;
|
||||
case 7:
|
||||
decode7(pdu, tmp, longs);
|
||||
decode7(in, tmp, longs);
|
||||
expand8(longs);
|
||||
break;
|
||||
case 8:
|
||||
decode8(pdu, tmp, longs);
|
||||
decode8(in, longs);
|
||||
expand8(longs);
|
||||
break;
|
||||
case 9:
|
||||
decode9(pdu, tmp, longs);
|
||||
decode9(in, tmp, longs);
|
||||
expand16(longs);
|
||||
break;
|
||||
case 10:
|
||||
decode10(pdu, tmp, longs);
|
||||
decode10(in, tmp, longs);
|
||||
expand16(longs);
|
||||
break;
|
||||
case 11:
|
||||
decode11(pdu, tmp, longs);
|
||||
decode11(in, tmp, longs);
|
||||
expand16(longs);
|
||||
break;
|
||||
case 12:
|
||||
decode12(pdu, tmp, longs);
|
||||
decode12(in, tmp, longs);
|
||||
expand16(longs);
|
||||
break;
|
||||
case 13:
|
||||
decode13(pdu, tmp, longs);
|
||||
decode13(in, tmp, longs);
|
||||
expand16(longs);
|
||||
break;
|
||||
case 14:
|
||||
decode14(pdu, tmp, longs);
|
||||
decode14(in, tmp, longs);
|
||||
expand16(longs);
|
||||
break;
|
||||
case 15:
|
||||
decode15(pdu, tmp, longs);
|
||||
decode15(in, tmp, longs);
|
||||
expand16(longs);
|
||||
break;
|
||||
case 16:
|
||||
decode16(pdu, tmp, longs);
|
||||
decode16(in, longs);
|
||||
expand16(longs);
|
||||
break;
|
||||
case 17:
|
||||
decode17(pdu, tmp, longs);
|
||||
decode17(in, tmp, longs);
|
||||
expand32(longs);
|
||||
break;
|
||||
case 18:
|
||||
decode18(pdu, tmp, longs);
|
||||
decode18(in, tmp, longs);
|
||||
expand32(longs);
|
||||
break;
|
||||
case 19:
|
||||
decode19(pdu, tmp, longs);
|
||||
decode19(in, tmp, longs);
|
||||
expand32(longs);
|
||||
break;
|
||||
case 20:
|
||||
decode20(pdu, tmp, longs);
|
||||
decode20(in, tmp, longs);
|
||||
expand32(longs);
|
||||
break;
|
||||
case 21:
|
||||
decode21(pdu, tmp, longs);
|
||||
decode21(in, tmp, longs);
|
||||
expand32(longs);
|
||||
break;
|
||||
case 22:
|
||||
decode22(pdu, tmp, longs);
|
||||
decode22(in, tmp, longs);
|
||||
expand32(longs);
|
||||
break;
|
||||
case 23:
|
||||
decode23(pdu, tmp, longs);
|
||||
decode23(in, tmp, longs);
|
||||
expand32(longs);
|
||||
break;
|
||||
case 24:
|
||||
decode24(pdu, tmp, longs);
|
||||
decode24(in, tmp, longs);
|
||||
expand32(longs);
|
||||
break;
|
||||
default:
|
||||
decodeSlow(bitsPerValue, pdu, tmp, longs);
|
||||
decodeSlow(bitsPerValue, in, tmp, longs);
|
||||
expand32(longs);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode1(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(2, longs, 7, 1, MASK8_1, longs, 14, MASK8_1);
|
||||
static void decode1(IndexInput in, long[] longs) throws IOException {
|
||||
splitLongs(in, 2, longs, 7, 1, MASK8_1, longs, 14, MASK8_1);
|
||||
}
|
||||
|
||||
static void decode2(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(4, longs, 6, 2, MASK8_2, longs, 12, MASK8_2);
|
||||
static void decode2(IndexInput in, long[] longs) throws IOException {
|
||||
splitLongs(in, 4, longs, 6, 2, MASK8_2, longs, 12, MASK8_2);
|
||||
}
|
||||
|
||||
static void decode3(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(6, longs, 5, 3, MASK8_3, tmp, 0, MASK8_2);
|
||||
static void decode3(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 6, longs, 5, 3, MASK8_3, tmp, 0, MASK8_2);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 12; iter < 2; ++iter, tmpIdx += 3, longsIdx += 2) {
|
||||
long l0 = tmp[tmpIdx + 0] << 1;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_1;
|
||||
|
@ -413,12 +435,12 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void decode4(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(8, longs, 4, 4, MASK8_4, longs, 8, MASK8_4);
|
||||
static void decode4(IndexInput in, long[] longs) throws IOException {
|
||||
splitLongs(in, 8, longs, 4, 4, MASK8_4, longs, 8, MASK8_4);
|
||||
}
|
||||
|
||||
static void decode5(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(10, longs, 3, 5, MASK8_5, tmp, 0, MASK8_3);
|
||||
static void decode5(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 10, longs, 3, 5, MASK8_5, tmp, 0, MASK8_3);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 10; iter < 2; ++iter, tmpIdx += 5, longsIdx += 3) {
|
||||
long l0 = tmp[tmpIdx + 0] << 2;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_2;
|
||||
|
@ -433,8 +455,8 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void decode6(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(12, longs, 2, 6, MASK8_6, tmp, 0, MASK8_2);
|
||||
static void decode6(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 12, longs, 2, 6, MASK8_6, tmp, 0, MASK8_2);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 12; iter < 4; ++iter, tmpIdx += 3, longsIdx += 1) {
|
||||
long l0 = tmp[tmpIdx + 0] << 4;
|
||||
l0 |= tmp[tmpIdx + 1] << 2;
|
||||
|
@ -443,8 +465,8 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void decode7(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(14, longs, 1, 7, MASK8_7, tmp, 0, MASK8_1);
|
||||
static void decode7(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 14, longs, 1, 7, MASK8_7, tmp, 0, MASK8_1);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 14; iter < 2; ++iter, tmpIdx += 7, longsIdx += 1) {
|
||||
long l0 = tmp[tmpIdx + 0] << 6;
|
||||
l0 |= tmp[tmpIdx + 1] << 5;
|
||||
|
@ -457,12 +479,12 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void decode8(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.in.readLongs(longs, 0, 16);
|
||||
static void decode8(IndexInput in, long[] longs) throws IOException {
|
||||
in.readLongs(longs, 0, 16);
|
||||
}
|
||||
|
||||
static void decode9(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(18, longs, 7, 9, MASK16_9, tmp, 0, MASK16_7);
|
||||
static void decode9(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 18, longs, 7, 9, MASK16_9, tmp, 0, MASK16_7);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 18; iter < 2; ++iter, tmpIdx += 9, longsIdx += 7) {
|
||||
long l0 = tmp[tmpIdx + 0] << 2;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 5) & MASK16_2;
|
||||
|
@ -489,8 +511,8 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void decode10(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(20, longs, 6, 10, MASK16_10, tmp, 0, MASK16_6);
|
||||
static void decode10(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 20, longs, 6, 10, MASK16_10, tmp, 0, MASK16_6);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 20; iter < 4; ++iter, tmpIdx += 5, longsIdx += 3) {
|
||||
long l0 = tmp[tmpIdx + 0] << 4;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_4;
|
||||
|
@ -505,8 +527,8 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void decode11(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(22, longs, 5, 11, MASK16_11, tmp, 0, MASK16_5);
|
||||
static void decode11(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 22, longs, 5, 11, MASK16_11, tmp, 0, MASK16_5);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 22; iter < 2; ++iter, tmpIdx += 11, longsIdx += 5) {
|
||||
long l0 = tmp[tmpIdx + 0] << 6;
|
||||
l0 |= tmp[tmpIdx + 1] << 1;
|
||||
|
@ -531,8 +553,8 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void decode12(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(24, longs, 4, 12, MASK16_12, tmp, 0, MASK16_4);
|
||||
static void decode12(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 24, longs, 4, 12, MASK16_12, tmp, 0, MASK16_4);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 24; iter < 8; ++iter, tmpIdx += 3, longsIdx += 1) {
|
||||
long l0 = tmp[tmpIdx + 0] << 8;
|
||||
l0 |= tmp[tmpIdx + 1] << 4;
|
||||
|
@ -541,8 +563,8 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void decode13(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(26, longs, 3, 13, MASK16_13, tmp, 0, MASK16_3);
|
||||
static void decode13(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 26, longs, 3, 13, MASK16_13, tmp, 0, MASK16_3);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 26; iter < 2; ++iter, tmpIdx += 13, longsIdx += 3) {
|
||||
long l0 = tmp[tmpIdx + 0] << 10;
|
||||
l0 |= tmp[tmpIdx + 1] << 7;
|
||||
|
@ -565,8 +587,8 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void decode14(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(28, longs, 2, 14, MASK16_14, tmp, 0, MASK16_2);
|
||||
static void decode14(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 28, longs, 2, 14, MASK16_14, tmp, 0, MASK16_2);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 28; iter < 4; ++iter, tmpIdx += 7, longsIdx += 1) {
|
||||
long l0 = tmp[tmpIdx + 0] << 12;
|
||||
l0 |= tmp[tmpIdx + 1] << 10;
|
||||
|
@ -579,8 +601,8 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void decode15(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(30, longs, 1, 15, MASK16_15, tmp, 0, MASK16_1);
|
||||
static void decode15(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 30, longs, 1, 15, MASK16_15, tmp, 0, MASK16_1);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 30; iter < 2; ++iter, tmpIdx += 15, longsIdx += 1) {
|
||||
long l0 = tmp[tmpIdx + 0] << 14;
|
||||
l0 |= tmp[tmpIdx + 1] << 13;
|
||||
|
@ -601,12 +623,12 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void decode16(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.in.readLongs(longs, 0, 32);
|
||||
static void decode16(IndexInput in, long[] longs) throws IOException {
|
||||
in.readLongs(longs, 0, 32);
|
||||
}
|
||||
|
||||
static void decode17(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(34, longs, 15, 17, MASK32_17, tmp, 0, MASK32_15);
|
||||
static void decode17(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 34, longs, 15, 17, MASK32_17, tmp, 0, MASK32_15);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 34; iter < 2; ++iter, tmpIdx += 17, longsIdx += 15) {
|
||||
long l0 = tmp[tmpIdx + 0] << 2;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 13) & MASK32_2;
|
||||
|
@ -657,8 +679,8 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void decode18(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(36, longs, 14, 18, MASK32_18, tmp, 0, MASK32_14);
|
||||
static void decode18(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 36, longs, 14, 18, MASK32_18, tmp, 0, MASK32_14);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 36; iter < 4; ++iter, tmpIdx += 9, longsIdx += 7) {
|
||||
long l0 = tmp[tmpIdx + 0] << 4;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 10) & MASK32_4;
|
||||
|
@ -685,8 +707,8 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void decode19(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(38, longs, 13, 19, MASK32_19, tmp, 0, MASK32_13);
|
||||
static void decode19(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 38, longs, 13, 19, MASK32_19, tmp, 0, MASK32_13);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 38; iter < 2; ++iter, tmpIdx += 19, longsIdx += 13) {
|
||||
long l0 = tmp[tmpIdx + 0] << 6;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 7) & MASK32_6;
|
||||
|
@ -735,8 +757,8 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void decode20(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(40, longs, 12, 20, MASK32_20, tmp, 0, MASK32_12);
|
||||
static void decode20(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 40, longs, 12, 20, MASK32_20, tmp, 0, MASK32_12);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 40; iter < 8; ++iter, tmpIdx += 5, longsIdx += 3) {
|
||||
long l0 = tmp[tmpIdx + 0] << 8;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 4) & MASK32_8;
|
||||
|
@ -751,8 +773,8 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void decode21(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(42, longs, 11, 21, MASK32_21, tmp, 0, MASK32_11);
|
||||
static void decode21(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 42, longs, 11, 21, MASK32_21, tmp, 0, MASK32_11);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 42; iter < 2; ++iter, tmpIdx += 21, longsIdx += 11) {
|
||||
long l0 = tmp[tmpIdx + 0] << 10;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK32_10;
|
||||
|
@ -799,8 +821,8 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void decode22(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(44, longs, 10, 22, MASK32_22, tmp, 0, MASK32_10);
|
||||
static void decode22(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 44, longs, 10, 22, MASK32_22, tmp, 0, MASK32_10);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 44; iter < 4; ++iter, tmpIdx += 11, longsIdx += 5) {
|
||||
long l0 = tmp[tmpIdx + 0] << 12;
|
||||
l0 |= tmp[tmpIdx + 1] << 2;
|
||||
|
@ -825,8 +847,8 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void decode23(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(46, longs, 9, 23, MASK32_23, tmp, 0, MASK32_9);
|
||||
static void decode23(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 46, longs, 9, 23, MASK32_23, tmp, 0, MASK32_9);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 46; iter < 2; ++iter, tmpIdx += 23, longsIdx += 9) {
|
||||
long l0 = tmp[tmpIdx + 0] << 14;
|
||||
l0 |= tmp[tmpIdx + 1] << 5;
|
||||
|
@ -871,8 +893,8 @@ public final class ForUtil {
|
|||
}
|
||||
}
|
||||
|
||||
static void decode24(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
|
||||
pdu.splitLongs(48, longs, 8, 24, MASK32_24, tmp, 0, MASK32_8);
|
||||
static void decode24(IndexInput in, long[] tmp, long[] longs) throws IOException {
|
||||
splitLongs(in, 48, longs, 8, 24, MASK32_24, tmp, 0, MASK32_8);
|
||||
for (int iter = 0, tmpIdx = 0, longsIdx = 48; iter < 16; ++iter, tmpIdx += 3, longsIdx += 1) {
|
||||
long l0 = tmp[tmpIdx + 0] << 16;
|
||||
l0 |= tmp[tmpIdx + 1] << 8;
|
|
@ -37,7 +37,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
|
|||
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene912;
|
||||
package org.apache.lucene.backward_codecs.lucene912;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.codecs.BlockTermState;
|
||||
|
@ -23,7 +23,6 @@ import org.apache.lucene.codecs.FieldsConsumer;
|
|||
import org.apache.lucene.codecs.FieldsProducer;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.PostingsReaderBase;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
|
@ -318,7 +317,7 @@ import org.apache.lucene.util.packed.PackedInts;
*
* @lucene.experimental
*/
public final class Lucene912PostingsFormat extends PostingsFormat {
public class Lucene912PostingsFormat extends PostingsFormat {

/** Filename extension for some small metadata about how postings are encoded. */
public static final String META_EXTENSION = "psm";
@ -341,7 +340,7 @@ public final class Lucene912PostingsFormat extends PostingsFormat {
/** Size of blocks. */
public static final int BLOCK_SIZE = ForUtil.BLOCK_SIZE;

public static final int BLOCK_MASK = BLOCK_SIZE - 1;
static final int BLOCK_MASK = BLOCK_SIZE - 1;

/** We insert skip data on every block and every SKIP_FACTOR=32 blocks. */
public static final int LEVEL1_FACTOR = 32;
@ -349,7 +348,7 @@ public final class Lucene912PostingsFormat extends PostingsFormat {
/** Total number of docs covered by level 1 skip data: 32 * 128 = 4,096 */
public static final int LEVEL1_NUM_DOCS = LEVEL1_FACTOR * BLOCK_SIZE;

public static final int LEVEL1_MASK = LEVEL1_NUM_DOCS - 1;
static final int LEVEL1_MASK = LEVEL1_NUM_DOCS - 1;

static final String TERMS_CODEC = "Lucene90PostingsWriterTerms";
static final String META_CODEC = "Lucene912PostingsWriterMeta";
|
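For illustration only: the constants above describe a two-level skip structure, 128-doc blocks with level-1 skip data every 32 blocks. The helper below is a hypothetical sketch that restates that arithmetic; it is not part of this commit.

class SkipLayoutSketch {
  static final int BLOCK_SIZE = 128;                             // docs per level-0 block
  static final int LEVEL1_FACTOR = 32;                           // level-0 blocks per level-1 entry
  static final int LEVEL1_NUM_DOCS = LEVEL1_FACTOR * BLOCK_SIZE; // 4,096 docs

  // Which level-0 block and level-1 window the n-th doc of a posting list falls into.
  static int level0Block(int n) { return n / BLOCK_SIZE; }       // e.g. n = 10_000 -> block 78
  static int level1Window(int n) { return n / LEVEL1_NUM_DOCS; } // e.g. n = 10_000 -> window 2
}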
@ -360,45 +359,15 @@ public final class Lucene912PostingsFormat extends PostingsFormat {
|
|||
static final int VERSION_START = 0;
|
||||
static final int VERSION_CURRENT = VERSION_START;
|
||||
|
||||
private final int minTermBlockSize;
|
||||
private final int maxTermBlockSize;
|
||||
|
||||
/** Creates {@code Lucene912PostingsFormat} with default settings. */
|
||||
public Lucene912PostingsFormat() {
|
||||
this(
|
||||
Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
|
||||
Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates {@code Lucene912PostingsFormat} with custom values for {@code minBlockSize} and {@code
|
||||
* maxBlockSize} passed to block terms dictionary.
|
||||
*
|
||||
* @see
|
||||
* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
|
||||
*/
|
||||
public Lucene912PostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
|
||||
super("Lucene912");
|
||||
Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
|
||||
this.minTermBlockSize = minTermBlockSize;
|
||||
this.maxTermBlockSize = maxTermBlockSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
|
||||
boolean success = false;
|
||||
try {
|
||||
FieldsConsumer ret =
|
||||
new Lucene90BlockTreeTermsWriter(
|
||||
state, postingsWriter, minTermBlockSize, maxTermBlockSize);
|
||||
success = true;
|
||||
return ret;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(postingsWriter);
|
||||
}
|
||||
}
|
||||
throw new UnsupportedOperationException(
|
||||
"This postings format may not be used for writing, use the current postings format");
|
||||
}
|
||||
|
||||
@Override
|
|
@ -14,17 +14,17 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene912;
|
||||
package org.apache.lucene.backward_codecs.lucene912;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene912.ForUtil.BLOCK_SIZE;
|
||||
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.DOC_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.LEVEL1_NUM_DOCS;
|
||||
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.META_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.PAY_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.POS_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.TERMS_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.VERSION_CURRENT;
|
||||
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.VERSION_START;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.ForUtil.BLOCK_SIZE;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.DOC_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.LEVEL1_NUM_DOCS;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.META_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.PAY_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.POS_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.TERMS_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.VERSION_CURRENT;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.VERSION_START;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.AbstractList;
|
||||
|
@ -32,10 +32,10 @@ import java.util.Arrays;
|
|||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.RandomAccess;
|
||||
import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
|
||||
import org.apache.lucene.codecs.BlockTermState;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.PostingsReaderBase;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.Impact;
|
||||
import org.apache.lucene.index.Impacts;
|
||||
|
@ -45,8 +45,6 @@ import org.apache.lucene.index.IndexOptions;
|
|||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SlowImpactsEnum;
|
||||
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
|
||||
import org.apache.lucene.internal.vectorization.VectorizationProvider;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
|
@ -64,7 +62,6 @@ import org.apache.lucene.util.IOUtils;
|
|||
*/
|
||||
public final class Lucene912PostingsReader extends PostingsReaderBase {
|
||||
|
||||
static final VectorizationProvider VECTORIZATION_PROVIDER = VectorizationProvider.getInstance();
|
||||
// Dummy impacts, composed of the maximum possible term frequency and the lowest possible
|
||||
// (unsigned) norm value. This is typically used on tail blocks, which don't actually record
|
||||
// impacts as the storage overhead would not be worth any query evaluation speedup, since there's
|
||||
|
@ -215,15 +212,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
}
|
||||
|
||||
static int findFirstGreater(long[] buffer, int target, int from) {
|
||||
for (int i = from; i < BLOCK_SIZE; ++i) {
|
||||
if (buffer[i] >= target) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return BLOCK_SIZE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BlockTermState newTermState() {
|
||||
return new IntBlockTermState();
|
||||
|
@ -357,10 +345,10 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
protected int docCountUpto; // number of docs in or before the current block
|
||||
protected long prevDocID; // last doc ID of the previous block
|
||||
|
||||
protected int docBufferSize;
|
||||
protected int docBufferUpto;
|
||||
|
||||
protected IndexInput docIn;
|
||||
protected PostingDecodingUtil docInUtil;
|
||||
|
||||
protected AbstractPostingsEnum(FieldInfo fieldInfo) {
|
||||
indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
|
||||
|
@ -381,7 +369,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
if (docIn == null) {
|
||||
// lazy init
|
||||
docIn = Lucene912PostingsReader.this.docIn.clone();
|
||||
docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn);
|
||||
}
|
||||
prefetchPostings(docIn, termState);
|
||||
}
|
||||
|
@ -402,6 +389,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
level1DocEndFP = termState.docStartFP;
|
||||
}
|
||||
level1DocCountUpto = 0;
|
||||
docBufferSize = BLOCK_SIZE;
|
||||
docBufferUpto = BLOCK_SIZE;
|
||||
return this;
|
||||
}
|
||||
|
@ -427,7 +415,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException {
|
||||
resetIndexInput(termState);
|
||||
if (pforUtil == null && docFreq >= BLOCK_SIZE) {
|
||||
pforUtil = new PForUtil(new ForUtil());
|
||||
pforUtil = new PForUtil();
|
||||
forDeltaUtil = new ForDeltaUtil();
|
||||
}
|
||||
totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq;
|
||||
|
@ -446,7 +434,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
public int freq() throws IOException {
|
||||
if (freqFP != -1) {
|
||||
docIn.seek(freqFP);
|
||||
pforUtil.decode(docInUtil, freqBuffer);
|
||||
pforUtil.decode(docIn, freqBuffer);
|
||||
freqFP = -1;
|
||||
}
|
||||
|
||||
|
@ -476,7 +464,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
private void refillFullBlock() throws IOException {
|
||||
assert docFreq - docCountUpto >= BLOCK_SIZE;
|
||||
|
||||
forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
|
||||
forDeltaUtil.decodeAndPrefixSum(docIn, prevDocID, docBuffer);
|
||||
|
||||
if (indexHasFreq) {
|
||||
if (needsFreq) {
|
||||
|
@ -487,7 +475,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
docCountUpto += BLOCK_SIZE;
|
||||
prevDocID = docBuffer[BLOCK_SIZE - 1];
|
||||
docBufferUpto = 0;
|
||||
assert docBuffer[BLOCK_SIZE] == NO_MORE_DOCS;
|
||||
assert docBuffer[docBufferSize] == NO_MORE_DOCS;
|
||||
}
|
||||
|
||||
private void refillRemainder() throws IOException {
|
||||
|
@ -508,6 +496,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
docCountUpto += left;
|
||||
}
|
||||
docBufferUpto = 0;
|
||||
docBufferSize = left;
|
||||
freqFP = -1;
|
||||
}
|
||||
|
||||
|
@ -580,7 +569,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
if (doc == level0LastDocID) { // advance skip data on level 0
|
||||
if (docBufferUpto == BLOCK_SIZE) { // advance skip data on level 0
|
||||
moveToNextLevel0Block();
|
||||
}
|
||||
|
||||
|
@ -604,7 +593,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
}
|
||||
|
||||
int next = findFirstGreater(docBuffer, target, docBufferUpto);
|
||||
int next = findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize);
|
||||
this.doc = (int) docBuffer[next];
|
||||
docBufferUpto = next + 1;
|
||||
return doc;
|
||||
|
@ -636,9 +625,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
private int posBufferUpto;
|
||||
|
||||
final IndexInput posIn;
|
||||
final PostingDecodingUtil posInUtil;
|
||||
final IndexInput payIn;
|
||||
final PostingDecodingUtil payInUtil;
|
||||
final BytesRef payload;
|
||||
|
||||
final boolean indexHasOffsets;
|
||||
|
@ -681,13 +668,10 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads;
|
||||
|
||||
this.posIn = Lucene912PostingsReader.this.posIn.clone();
|
||||
posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn);
|
||||
if (indexHasOffsetsOrPayloads) {
|
||||
this.payIn = Lucene912PostingsReader.this.payIn.clone();
|
||||
payInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(payIn);
|
||||
} else {
|
||||
this.payIn = null;
|
||||
payInUtil = null;
|
||||
}
|
||||
if (indexHasOffsets) {
|
||||
offsetStartDeltaBuffer = new long[BLOCK_SIZE];
|
||||
|
@ -727,7 +711,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
totalTermFreq = termState.totalTermFreq;
|
||||
if (pforUtil == null && totalTermFreq >= BLOCK_SIZE) {
|
||||
pforUtil = new PForUtil(new ForUtil());
|
||||
pforUtil = new PForUtil();
|
||||
}
|
||||
// Where this term's postings start in the .pos file:
|
||||
final long posTermStartFP = termState.posStartFP;
|
||||
|
@ -774,24 +758,26 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
assert left >= 0;
|
||||
|
||||
if (left >= BLOCK_SIZE) {
|
||||
forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
|
||||
pforUtil.decode(docInUtil, freqBuffer);
|
||||
forDeltaUtil.decodeAndPrefixSum(docIn, prevDocID, docBuffer);
|
||||
pforUtil.decode(docIn, freqBuffer);
|
||||
docCountUpto += BLOCK_SIZE;
|
||||
} else if (docFreq == 1) {
|
||||
docBuffer[0] = singletonDocID;
|
||||
freqBuffer[0] = totalTermFreq;
|
||||
docBuffer[1] = NO_MORE_DOCS;
|
||||
docCountUpto++;
|
||||
docBufferSize = 1;
|
||||
} else {
|
||||
// Read vInts:
|
||||
PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true);
|
||||
prefixSum(docBuffer, left, prevDocID);
|
||||
docBuffer[left] = NO_MORE_DOCS;
|
||||
docCountUpto += left;
|
||||
docBufferSize = left;
|
||||
}
|
||||
prevDocID = docBuffer[BLOCK_SIZE - 1];
|
||||
docBufferUpto = 0;
|
||||
assert docBuffer[BLOCK_SIZE] == NO_MORE_DOCS;
|
||||
assert docBuffer[docBufferSize] == NO_MORE_DOCS;
|
||||
}
|
||||
|
||||
private void skipLevel1To(int target) throws IOException {
|
||||
|
@ -875,7 +861,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
if (doc == level0LastDocID) { // advance level 0 skip data
|
||||
if (docBufferUpto == BLOCK_SIZE) { // advance level 0 skip data
|
||||
moveToNextLevel0Block();
|
||||
}
|
||||
|
||||
|
@ -951,7 +937,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
refillDocs();
|
||||
}
|
||||
|
||||
int next = findFirstGreater(docBuffer, target, docBufferUpto);
|
||||
int next = findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize);
|
||||
posPendingCount += sumOverRange(freqBuffer, docBufferUpto, next + 1);
|
||||
this.freq = (int) freqBuffer[next];
|
||||
this.docBufferUpto = next + 1;
|
||||
|
@ -1045,11 +1031,11 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
payloadByteUpto = 0;
|
||||
} else {
|
||||
pforUtil.decode(posInUtil, posDeltaBuffer);
|
||||
pforUtil.decode(posIn, posDeltaBuffer);
|
||||
|
||||
if (indexHasPayloads) {
|
||||
if (needsPayloads) {
|
||||
pforUtil.decode(payInUtil, payloadLengthBuffer);
|
||||
pforUtil.decode(payIn, payloadLengthBuffer);
|
||||
int numBytes = payIn.readVInt();
|
||||
|
||||
if (numBytes > payloadBytes.length) {
|
||||
|
@ -1068,8 +1054,8 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
|
||||
if (indexHasOffsets) {
|
||||
if (needsOffsets) {
|
||||
pforUtil.decode(payInUtil, offsetStartDeltaBuffer);
|
||||
pforUtil.decode(payInUtil, offsetLengthBuffer);
|
||||
pforUtil.decode(payIn, offsetStartDeltaBuffer);
|
||||
pforUtil.decode(payIn, offsetLengthBuffer);
|
||||
} else {
|
||||
// this works, because when writing a vint block we always force the first length to be
|
||||
// written
|
||||
|
@ -1142,7 +1128,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
private abstract class BlockImpactsEnum extends ImpactsEnum {
|
||||
|
||||
protected final ForDeltaUtil forDeltaUtil = new ForDeltaUtil();
|
||||
protected final PForUtil pforUtil = new PForUtil(new ForUtil());
|
||||
protected final PForUtil pforUtil = new PForUtil();
|
||||
|
||||
protected final long[] docBuffer = new long[BLOCK_SIZE + 1];
|
||||
protected final long[] freqBuffer = new long[BLOCK_SIZE];
|
||||
|
@ -1150,11 +1136,11 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
protected final int docFreq; // number of docs in this posting list
|
||||
|
||||
protected final IndexInput docIn;
|
||||
protected final PostingDecodingUtil docInUtil;
|
||||
|
||||
protected int docCountUpto; // number of docs in or before the current block
|
||||
protected int doc = -1; // doc we last read
|
||||
protected long prevDocID = -1; // last doc ID of the previous block
|
||||
protected int docBufferSize = BLOCK_SIZE;
|
||||
protected int docBufferUpto = BLOCK_SIZE;
|
||||
|
||||
// true if we shallow-advanced to a new block that we have not decoded yet
|
||||
|
@ -1175,7 +1161,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
private BlockImpactsEnum(IntBlockTermState termState) throws IOException {
|
||||
this.docFreq = termState.docFreq;
|
||||
this.docIn = Lucene912PostingsReader.this.docIn.clone();
|
||||
this.docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn);
|
||||
prefetchPostings(docIn, termState);
|
||||
level0SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel0);
|
||||
level1SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel1);
|
||||
|
@ -1279,7 +1264,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
public int freq() throws IOException {
|
||||
if (freqFP != -1) {
|
||||
docIn.seek(freqFP);
|
||||
pforUtil.decode(docInUtil, freqBuffer);
|
||||
pforUtil.decode(docIn, freqBuffer);
|
||||
freqFP = -1;
|
||||
}
|
||||
return (int) freqBuffer[docBufferUpto - 1];
|
||||
|
@ -1295,7 +1280,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
assert left >= 0;
|
||||
|
||||
if (left >= BLOCK_SIZE) {
|
||||
forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
|
||||
forDeltaUtil.decodeAndPrefixSum(docIn, prevDocID, docBuffer);
|
||||
freqFP = docIn.getFilePointer();
|
||||
PForUtil.skip(docIn);
|
||||
docCountUpto += BLOCK_SIZE;
|
||||
|
@ -1306,10 +1291,11 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
docBuffer[left] = NO_MORE_DOCS;
|
||||
freqFP = -1;
|
||||
docCountUpto += left;
|
||||
docBufferSize = left;
|
||||
}
|
||||
prevDocID = docBuffer[BLOCK_SIZE - 1];
|
||||
docBufferUpto = 0;
|
||||
assert docBuffer[BLOCK_SIZE] == NO_MORE_DOCS;
|
||||
assert docBuffer[docBufferSize] == NO_MORE_DOCS;
|
||||
}
|
||||
|
||||
private void skipLevel1To(int target) throws IOException {
|
||||
|
@ -1417,11 +1403,13 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
if (doc == level0LastDocID) {
|
||||
moveToNextLevel0Block();
|
||||
} else if (needsRefilling) {
|
||||
refillDocs();
|
||||
needsRefilling = false;
|
||||
if (docBufferUpto == BLOCK_SIZE) {
|
||||
if (needsRefilling) {
|
||||
refillDocs();
|
||||
needsRefilling = false;
|
||||
} else {
|
||||
moveToNextLevel0Block();
|
||||
}
|
||||
}
|
||||
|
||||
return this.doc = (int) docBuffer[docBufferUpto++];
|
||||
|
@ -1435,7 +1423,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
needsRefilling = false;
|
||||
}
|
||||
|
||||
int next = findFirstGreater(docBuffer, target, docBufferUpto);
|
||||
int next = findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize);
|
||||
this.doc = (int) docBuffer[next];
|
||||
docBufferUpto = next + 1;
|
||||
return doc;
|
||||
|
@ -1447,7 +1435,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
|
||||
private int posBufferUpto;
|
||||
final IndexInput posIn;
|
||||
final PostingDecodingUtil posInUtil;
|
||||
|
||||
final boolean indexHasFreq;
|
||||
final boolean indexHasOffsets;
|
||||
|
@ -1488,7 +1475,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads;
|
||||
|
||||
this.posIn = Lucene912PostingsReader.this.posIn.clone();
|
||||
posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn);
|
||||
|
||||
// Where this term's postings start in the .pos file:
|
||||
final long posTermStartFP = termState.posStartFP;
|
||||
|
@ -1519,8 +1505,8 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
assert left >= 0;
|
||||
|
||||
if (left >= BLOCK_SIZE) {
|
||||
forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
|
||||
pforUtil.decode(docInUtil, freqBuffer);
|
||||
forDeltaUtil.decodeAndPrefixSum(docIn, prevDocID, docBuffer);
|
||||
pforUtil.decode(docIn, freqBuffer);
|
||||
docCountUpto += BLOCK_SIZE;
|
||||
} else if (docFreq == 1) {
|
||||
docBuffer[0] = singletonDocID;
|
||||
|
@ -1533,10 +1519,11 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
prefixSum(docBuffer, left, prevDocID);
|
||||
docBuffer[left] = NO_MORE_DOCS;
|
||||
docCountUpto += left;
|
||||
docBufferSize = left;
|
||||
}
|
||||
prevDocID = docBuffer[BLOCK_SIZE - 1];
|
||||
docBufferUpto = 0;
|
||||
assert docBuffer[BLOCK_SIZE] == NO_MORE_DOCS;
|
||||
assert docBuffer[docBufferSize] == NO_MORE_DOCS;
|
||||
}
|
||||
|
||||
private void skipLevel1To(int target) throws IOException {
|
||||
|
@ -1644,8 +1631,9 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
advanceShallow(doc + 1);
|
||||
if (needsRefilling) {
|
||||
if (docBufferUpto == BLOCK_SIZE) {
|
||||
advanceShallow(doc + 1);
|
||||
assert needsRefilling;
|
||||
refillDocs();
|
||||
needsRefilling = false;
|
||||
}
|
||||
|
@ -1666,7 +1654,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
needsRefilling = false;
|
||||
}
|
||||
|
||||
int next = findFirstGreater(docBuffer, target, docBufferUpto);
|
||||
int next = findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize);
|
||||
posPendingCount += sumOverRange(freqBuffer, docBufferUpto, next + 1);
|
||||
freq = (int) freqBuffer[next];
|
||||
docBufferUpto = next + 1;
|
||||
|
@ -1724,7 +1712,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
}
|
||||
} else {
|
||||
pforUtil.decode(posInUtil, posDeltaBuffer);
|
||||
pforUtil.decode(posIn, posDeltaBuffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1749,9 +1737,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @see Lucene912PostingsWriter#writeVInt15(org.apache.lucene.store.DataOutput, int)
|
||||
*/
|
||||
static int readVInt15(DataInput in) throws IOException {
|
||||
short s = in.readShort();
|
||||
if (s >= 0) {
|
||||
|
@ -1761,9 +1746,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @see Lucene912PostingsWriter#writeVLong15(org.apache.lucene.store.DataOutput, long)
|
||||
*/
|
||||
static long readVLong15(DataInput in) throws IOException {
|
||||
short s = in.readShort();
|
||||
if (s >= 0) {
|
||||
|
@ -1773,6 +1755,15 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
}
}

private static int findNextGEQ(long[] buffer, long target, int from, int to) {
for (int i = from; i < to; ++i) {
if (buffer[i] >= target) {
return i;
}
}
return to;
}
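As a usage sketch only (buffer contents are made up), this is how the new findNextGEQ above supports advance(target): scan the decoded doc-ID buffer from the current position and take the first entry greater than or equal to the target.

// Sketch, as if written inside Lucene912PostingsReader:
long[] docBuffer = {3, 7, 15, 42, 57, 123};          // hypothetical decoded doc IDs
int docBufferUpto = 1;                                // next unread slot
int docBufferSize = docBuffer.length;                 // tail blocks hold fewer than 128 docs
int next = findNextGEQ(docBuffer, 40, docBufferUpto, docBufferSize); // -> 3, docBuffer[3] == 42
int doc = (int) docBuffer[next];                      // advance(40) returns 42, docBufferUpto becomes 4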
|
||||
|
||||
private static void prefetchPostings(IndexInput docIn, IntBlockTermState state)
|
||||
throws IOException {
|
||||
assert state.docFreq > 1; // Singletons are inlined in the terms dict, nothing to prefetch
|
|
@ -14,13 +14,13 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene912;
|
||||
package org.apache.lucene.backward_codecs.lucene912;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.LongHeap;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
|
@ -38,11 +38,10 @@ final class PForUtil {
return true;
}

private final ForUtil forUtil;
private final ForUtil forUtil = new ForUtil();

PForUtil(ForUtil forUtil) {
static {
assert ForUtil.BLOCK_SIZE <= 256 : "blocksize must fit in one byte. got " + ForUtil.BLOCK_SIZE;
this.forUtil = forUtil;
}

/** Encode 128 integers from {@code longs} into {@code out}. */
@ -105,18 +104,18 @@ final class PForUtil {
}

/** Decode 128 integers into {@code ints}. */
void decode(PostingDecodingUtil pdu, long[] longs) throws IOException {
final int token = Byte.toUnsignedInt(pdu.in.readByte());
void decode(IndexInput in, long[] longs) throws IOException {
final int token = Byte.toUnsignedInt(in.readByte());
final int bitsPerValue = token & 0x1f;
final int numExceptions = token >>> 5;
if (bitsPerValue == 0) {
Arrays.fill(longs, 0, ForUtil.BLOCK_SIZE, pdu.in.readVLong());
Arrays.fill(longs, 0, ForUtil.BLOCK_SIZE, in.readVLong());
} else {
forUtil.decode(bitsPerValue, pdu, longs);
forUtil.decode(bitsPerValue, in, longs);
}
final int numExceptions = token >>> 5;
for (int i = 0; i < numExceptions; ++i) {
longs[Byte.toUnsignedInt(pdu.in.readByte())] |=
Byte.toUnsignedLong(pdu.in.readByte()) << bitsPerValue;
longs[Byte.toUnsignedInt(in.readByte())] |=
Byte.toUnsignedLong(in.readByte()) << bitsPerValue;
}
}
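The decode above starts from a one-byte token: the low 5 bits carry bitsPerValue and the high 3 bits carry the number of patched exceptions, each exception being a (position, high-bits) byte pair OR-ed back in above bitsPerValue. The writer side is not shown in this hunk, so the encoding helper below is only inferred from the reader, as a sketch:

// Hypothetical sketch of the token layout consumed by PForUtil.decode above.
static int pforToken(int bitsPerValue, int numExceptions) {
  assert bitsPerValue <= 31 && numExceptions <= 7;  // 5 bits + 3 bits
  return (numExceptions << 5) | bitsPerValue;
}
// Reading it back, exactly as in the method above:
//   int bitsPerValue  = token & 0x1f;
//   int numExceptions = token >>> 5;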
|
||||
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene912;
|
||||
package org.apache.lucene.backward_codecs.lucene912;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.DataOutput;
|
|
@ -40,15 +40,14 @@ HEADER = """// This file has been automatically generated, DO NOT EDIT
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene912;
|
||||
package org.apache.lucene.backward_codecs.lucene912;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene912.ForUtil.*;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.ForUtil.*;
|
||||
|
||||
/**
|
||||
* Inspired from https://fulmicoton.com/posts/bitpacking/
|
||||
|
@ -57,7 +56,7 @@ import static org.apache.lucene.codecs.lucene912.ForUtil.*;
|
|||
* else if bitsPerValue <= 11 we pack 4 ints per long
|
||||
* else we pack 2 ints per long
|
||||
*/
|
||||
public final class ForDeltaUtil {
|
||||
final class ForDeltaUtil {
|
||||
|
||||
private static final int ONE_BLOCK_SIZE_FOURTH = BLOCK_SIZE / 4;
|
||||
private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2;
|
||||
|
@ -299,12 +298,12 @@ public final class ForDeltaUtil {
}

/** Decode deltas, compute the prefix sum and add {@code base} to all decoded longs. */
void decodeAndPrefixSum(PostingDecodingUtil pdu, long base, long[] longs) throws IOException {
final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte());
void decodeAndPrefixSum(IndexInput in, long base, long[] longs) throws IOException {
final int bitsPerValue = Byte.toUnsignedInt(in.readByte());
if (bitsPerValue == 0) {
prefixSumOfOnes(longs, base);
} else {
decodeAndPrefixSum(bitsPerValue, pdu, base, longs);
decodeAndPrefixSum(bitsPerValue, in, base, longs);
}
}
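decodeAndPrefixSum turns a block of packed deltas back into absolute doc IDs by adding a running sum on top of base, the last doc ID of the previous block. A minimal sketch of that final step, assuming the deltas are already unpacked into a plain long[] (the bit-unpacking and prefixSumOfOnes paths are not reproduced here):

// Sketch only: deltas -> absolute doc IDs, given the previous block's last doc ID.
static void prefixSum(long[] deltas, int len, long base) {
  deltas[0] += base;                  // first doc ID = base + first delta
  for (int i = 1; i < len; ++i) {
    deltas[i] += deltas[i - 1];       // running sum turns each delta into a doc ID
  }
}
// e.g. base = 100 and deltas {2, 1, 5} become doc IDs {102, 103, 108}.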
|
||||
|
||||
|
@ -361,18 +360,21 @@ def writeRemainder(bpv, next_primitive, remaining_bits_per_long, o, num_values,
|
|||
|
||||
def writeDecode(bpv, f):
|
||||
next_primitive = primitive_size_for_bpv(bpv)
|
||||
f.write(' private static void decode%dTo%d(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {\n' %(bpv, next_primitive))
|
||||
if next_primitive % bpv == 0:
|
||||
f.write(' private static void decode%dTo%d(IndexInput in, long[] longs) throws IOException {\n' %(bpv, next_primitive))
|
||||
else:
|
||||
f.write(' private static void decode%dTo%d(IndexInput in, long[] tmp, long[] longs) throws IOException {\n' %(bpv, next_primitive))
|
||||
if bpv == next_primitive:
|
||||
f.write(' pdu.in.readLongs(longs, 0, %d);\n' %(bpv*2))
|
||||
f.write(' in.readLongs(longs, 0, %d);\n' %(bpv*2))
|
||||
else:
|
||||
num_values_per_long = 64 / next_primitive
|
||||
remaining_bits = next_primitive % bpv
|
||||
num_iters = (next_primitive - 1) // bpv
|
||||
o = 2 * bpv * num_iters
|
||||
if remaining_bits == 0:
|
||||
f.write(' pdu.splitLongs(%d, longs, %d, %d, MASK%d_%d, longs, %d, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
|
||||
f.write(' splitLongs(in, %d, longs, %d, %d, MASK%d_%d, longs, %d, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
|
||||
else:
|
||||
f.write(' pdu.splitLongs(%d, longs, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
|
||||
f.write(' splitLongs(in, %d, longs, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
|
||||
writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_long - o, f)
|
||||
f.write(' }\n')
|
||||
|
||||
|
@ -383,20 +385,26 @@ if __name__ == '__main__':
|
|||
/**
|
||||
* Delta-decode 128 integers into {@code longs}.
|
||||
*/
|
||||
void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, long base, long[] longs) throws IOException {
|
||||
void decodeAndPrefixSum(int bitsPerValue, IndexInput in, long base, long[] longs) throws IOException {
|
||||
switch (bitsPerValue) {
|
||||
""")
|
||||
for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
|
||||
primitive_size = primitive_size_for_bpv(bpv)
|
||||
f.write(' case %d:\n' %bpv)
|
||||
if next_primitive(bpv) == primitive_size:
|
||||
f.write(' decode%d(pdu, tmp, longs);\n' %bpv)
|
||||
if primitive_size % bpv == 0:
|
||||
f.write(' decode%d(in, longs);\n' %bpv)
|
||||
else:
|
||||
f.write(' decode%d(in, tmp, longs);\n' %bpv)
|
||||
else:
|
||||
f.write(' decode%dTo%d(pdu, tmp, longs);\n' %(bpv, primitive_size))
|
||||
if primitive_size % bpv == 0:
|
||||
f.write(' decode%dTo%d(in, longs);\n' %(bpv, primitive_size))
|
||||
else:
|
||||
f.write(' decode%dTo%d(in, tmp, longs);\n' %(bpv, primitive_size))
|
||||
f.write(' prefixSum%d(longs, base);\n' %primitive_size)
|
||||
f.write(' break;\n')
|
||||
f.write(' default:\n')
|
||||
f.write(' decodeSlow(bitsPerValue, pdu, tmp, longs);\n')
|
||||
f.write(' decodeSlow(bitsPerValue, in, tmp, longs);\n')
|
||||
f.write(' prefixSum32(longs, base);\n')
|
||||
f.write(' break;\n')
|
||||
f.write(' }\n')
|
|
@ -40,10 +40,9 @@ HEADER = """// This file has been automatically generated, DO NOT EDIT
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene912;
|
||||
package org.apache.lucene.backward_codecs.lucene912;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
|
||||
|
@ -54,7 +53,7 @@ import org.apache.lucene.store.IndexInput;
|
|||
* else if bitsPerValue <= 16 we pack 4 ints per long
|
||||
* else we pack 2 ints per long
|
||||
*/
|
||||
public final class ForUtil {
|
||||
final class ForUtil {
|
||||
|
||||
public static final int BLOCK_SIZE = 128;
|
||||
static final int BLOCK_SIZE_LOG2 = 7;
|
||||
|
@ -222,11 +221,11 @@ public final class ForUtil {
|
|||
return bitsPerValue << (BLOCK_SIZE_LOG2 - 3);
|
||||
}
|
||||
|
||||
static void decodeSlow(int bitsPerValue, PostingDecodingUtil pdu, long[] tmp, long[] longs)
|
||||
static void decodeSlow(int bitsPerValue, IndexInput in, long[] tmp, long[] longs)
|
||||
throws IOException {
|
||||
final int numLongs = bitsPerValue << 1;
|
||||
final long mask = MASKS32[bitsPerValue];
|
||||
pdu.splitLongs(numLongs, longs, 32 - bitsPerValue, 32, mask, tmp, 0, -1L);
|
||||
splitLongs(in, numLongs, longs, 32 - bitsPerValue, 32, mask, tmp, 0, -1L);
|
||||
final int remainingBitsPerLong = 32 - bitsPerValue;
|
||||
final long mask32RemainingBitsPerLong = MASKS32[remainingBitsPerLong];
|
||||
int tmpIdx = 0;
|
||||
|
@ -248,6 +247,20 @@ public final class ForUtil {
}
}

static void splitLongs(
IndexInput in, int count, long[] b, int bShift, int dec, long bMask, long[] c, int cIndex, long cMask)
throws IOException {
// takes advantage of the C2 compiler's loop unrolling and auto-vectorization.
in.readLongs(c, cIndex, count);
int maxIter = (bShift - 1) / dec;
for (int i = 0; i < count; ++i) {
for (int j = 0; j <= maxIter; ++j) {
b[count * j + i] = (c[cIndex + i] >>> (bShift - j * dec)) & bMask;
}
c[cIndex + i] &= cMask;
}
}

"""
|
||||
|
||||
def writeRemainder(bpv, next_primitive, remaining_bits_per_long, o, num_values, f):
|
||||
|
@ -287,18 +300,20 @@ def writeDecode(bpv, f):
|
|||
next_primitive = 8
|
||||
elif bpv <= 16:
|
||||
next_primitive = 16
|
||||
f.write(' static void decode%d(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {\n' %bpv)
|
||||
if bpv == next_primitive:
|
||||
f.write(' pdu.in.readLongs(longs, 0, %d);\n' %(bpv*2))
|
||||
f.write(' static void decode%d(IndexInput in, long[] longs) throws IOException {\n' %bpv)
|
||||
f.write(' in.readLongs(longs, 0, %d);\n' %(bpv*2))
|
||||
else:
|
||||
num_values_per_long = 64 / next_primitive
|
||||
remaining_bits = next_primitive % bpv
|
||||
num_iters = (next_primitive - 1) // bpv
|
||||
o = 2 * bpv * num_iters
|
||||
if remaining_bits == 0:
|
||||
f.write(' pdu.splitLongs(%d, longs, %d, %d, MASK%d_%d, longs, %d, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
|
||||
f.write(' static void decode%d(IndexInput in, long[] longs) throws IOException {\n' %bpv)
|
||||
f.write(' splitLongs(in, %d, longs, %d, %d, MASK%d_%d, longs, %d, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
|
||||
else:
|
||||
f.write(' pdu.splitLongs(%d, longs, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
|
||||
f.write(' static void decode%d(IndexInput in, long[] tmp, long[] longs) throws IOException {\n' %bpv)
|
||||
f.write(' splitLongs(in, %d, longs, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
|
||||
writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_long - o, f)
|
||||
f.write(' }\n')
|
||||
|
||||
|
@ -324,7 +339,7 @@ if __name__ == '__main__':
|
|||
|
||||
f.write("""
|
||||
/** Decode 128 integers into {@code longs}. */
|
||||
void decode(int bitsPerValue, PostingDecodingUtil pdu, long[] longs) throws IOException {
|
||||
void decode(int bitsPerValue, IndexInput in, long[] longs) throws IOException {
|
||||
switch (bitsPerValue) {
|
||||
""")
|
||||
for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
|
||||
|
@ -334,11 +349,14 @@ if __name__ == '__main__':
|
|||
elif bpv <= 16:
|
||||
next_primitive = 16
|
||||
f.write(' case %d:\n' %bpv)
|
||||
f.write(' decode%d(pdu, tmp, longs);\n' %bpv)
|
||||
if next_primitive % bpv == 0:
|
||||
f.write(' decode%d(in, longs);\n' %bpv)
|
||||
else:
|
||||
f.write(' decode%d(in, tmp, longs);\n' %bpv)
|
||||
f.write(' expand%d(longs);\n' %next_primitive)
|
||||
f.write(' break;\n')
|
||||
f.write(' default:\n')
|
||||
f.write(' decodeSlow(bitsPerValue, pdu, tmp, longs);\n')
|
||||
f.write(' decodeSlow(bitsPerValue, in, tmp, longs);\n')
|
||||
f.write(' expand32(longs);\n')
|
||||
f.write(' break;\n')
|
||||
f.write(' }\n')
|
|
@ -15,419 +15,5 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Lucene 9.12 file format.
|
||||
*
|
||||
* <h2>Apache Lucene - Index File Formats</h2>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <ul>
|
||||
* <li><a href="#Introduction">Introduction</a>
|
||||
* <li><a href="#Definitions">Definitions</a>
|
||||
* <ul>
|
||||
* <li><a href="#Inverted_Indexing">Inverted Indexing</a>
|
||||
* <li><a href="#Types_of_Fields">Types of Fields</a>
|
||||
* <li><a href="#Segments">Segments</a>
|
||||
* <li><a href="#Document_Numbers">Document Numbers</a>
|
||||
* </ul>
|
||||
* <li><a href="#Overview">Index Structure Overview</a>
|
||||
* <li><a href="#File_Naming">File Naming</a>
|
||||
* <li><a href="#file-names">Summary of File Extensions</a>
|
||||
* <ul>
|
||||
* <li><a href="#Lock_File">Lock File</a>
|
||||
* <li><a href="#History">History</a>
|
||||
* <li><a href="#Limitations">Limitations</a>
|
||||
* </ul>
|
||||
* </ul>
|
||||
*
|
||||
* </div> <a id="Introduction"></a>
|
||||
*
|
||||
* <h3>Introduction</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>This document defines the index file formats used in this version of Lucene. If you are using
|
||||
* a different version of Lucene, please consult the copy of <code>docs/</code> that was distributed
|
||||
* with the version you are using.
|
||||
*
|
||||
* <p>This document attempts to provide a high-level definition of the Apache Lucene file formats.
|
||||
* </div> <a id="Definitions"></a>
|
||||
*
|
||||
* <h3>Definitions</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>The fundamental concepts in Lucene are index, document, field and term.
|
||||
*
|
||||
* <p>An index contains a sequence of documents.
|
||||
*
|
||||
* <ul>
|
||||
* <li>A document is a sequence of fields.
|
||||
* <li>A field is a named sequence of terms.
|
||||
* <li>A term is a sequence of bytes.
|
||||
* </ul>
|
||||
*
|
||||
* <p>The same sequence of bytes in two different fields is considered a different term. Thus terms
|
||||
* are represented as a pair: the string naming the field, and the bytes within the field. <a
|
||||
* id="Inverted_Indexing"></a>
|
||||
*
|
||||
* <h4>Inverted Indexing</h4>
|
||||
*
|
||||
* <p>Lucene's index stores terms and statistics about those terms in order to make term-based
|
||||
* search more efficient. Lucene's terms index falls into the family of indexes known as an
|
||||
* <i>inverted index.</i> This is because it can list, for a term, the documents that contain it.
|
||||
* This is the inverse of the natural relationship, in which documents list terms. <a
|
||||
* id="Types_of_Fields"></a>
|
||||
*
|
||||
* <h4>Types of Fields</h4>
*
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored in the index
* literally, in a non-inverted manner. Fields that are inverted are called <i>indexed</i>. A field
* may be both stored and indexed.
*
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the text of a field
* may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
* useful for certain identifier fields to be indexed literally.
*
* <p>See the {@link org.apache.lucene.document.Field Field} java docs for more information on
* Fields. <a id="Segments"></a>
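The stored / indexed / tokenized distinctions above map onto the Field subclasses in org.apache.lucene.document. A small illustrative sketch, with made-up field names and values:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;

class FieldKindsSketch {
  static Document example() {
    Document doc = new Document();
    // Indexed and tokenized: the analyzer splits the text into terms.
    doc.add(new TextField("body", "Lucene stores terms in an inverted index", Field.Store.NO));
    // Indexed literally as a single term (useful for identifiers), and also stored.
    doc.add(new StringField("id", "doc-42", Field.Store.YES));
    // Stored only: returned with search hits, never searched on.
    doc.add(new StoredField("url", "https://example.invalid/doc-42"));
    return doc;
  }
}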
|
||||
*
|
||||
* <h4>Segments</h4>
|
||||
*
|
||||
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>. Each segment is a
|
||||
* fully independent index, which could be searched separately. Indexes evolve by:
|
||||
*
|
||||
* <ol>
|
||||
* <li>Creating new segments for newly added documents.
|
||||
* <li>Merging existing segments.
|
||||
* </ol>
|
||||
*
|
||||
* <p>Searches may involve multiple segments and/or multiple indexes, each index potentially
|
||||
* composed of a set of segments. <a id="Document_Numbers"></a>
|
||||
*
|
||||
* <h4>Document Numbers</h4>
*
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>. The first
* document added to an index is numbered zero, and each subsequent document added gets a number one
* greater than the previous.
*
* <p>Note that a document's number may change, so caution should be taken when storing these
* numbers outside of Lucene. In particular, numbers may change in the following situations:
*
* <ul>
* <li>
* <p>The numbers stored in each segment are unique only within the segment, and must be
* converted before they can be used in a larger context. The standard technique is to
* allocate each segment a range of values, based on the range of numbers used in that
* segment. To convert a document number from a segment to an external value, the segment's
* <i>base</i> document number is added. To convert an external value back to a
* segment-specific value, the segment is identified by the range that the external value is
* in, and the segment's base value is subtracted. For example, two five-document segments
* might be combined, so that the first segment has a base value of zero, and the second of
* five. Document three from the second segment would have an external value of eight.
* <li>
* <p>When documents are deleted, gaps are created in the numbering. These are eventually
* removed as the index evolves through merging. Deleted documents are dropped when segments
* are merged. A freshly-merged segment thus has no gaps in its numbering.
* </ul>
*
* </div> <a id="Overview"></a>
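A sketch of the base-offset arithmetic described under Document Numbers above, using the two five-document segments from the example; the helpers are hypothetical, not Lucene API:

class DocNumberSketch {
  // Two five-document segments, bases 0 and 5, as in the example above.
  static final int[] BASES = {0, 5};

  static int toGlobal(int segment, int localDoc) {
    return BASES[segment] + localDoc;        // doc 3 of segment 1 -> global doc 8
  }

  static int segmentOf(int globalDoc) {
    for (int s = BASES.length - 1; s >= 0; --s) {
      if (globalDoc >= BASES[s]) {
        return s;                            // global doc 8 -> segment 1
      }
    }
    throw new IllegalArgumentException("doc " + globalDoc);
  }

  static int toLocal(int globalDoc) {
    return globalDoc - BASES[segmentOf(globalDoc)];  // global doc 8 -> doc 3 of segment 1
  }
}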
|
||||
*
|
||||
* <h3>Index Structure Overview</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>Each segment index maintains the following:
|
||||
*
|
||||
* <ul>
|
||||
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This
|
||||
* contains metadata about a segment, such as the number of documents, what files it uses, and
|
||||
* information about how the segment is sorted
|
||||
* <li>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
|
||||
* contains metadata about the set of named fields used in the index.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
|
||||
* This contains, for each document, a list of attribute-value pairs, where the attributes are
|
||||
* field names. These are used to store auxiliary information about the document, such as its
|
||||
* title, url, or an identifier to access a database. The set of stored fields are what is
|
||||
* returned for each hit when searching. This is keyed by document number.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term dictionary}. A
|
||||
* dictionary containing all of the terms used in all of the indexed fields of all of the
|
||||
* documents. The dictionary also contains the number of documents which contain the term, and
|
||||
* pointers to the term's frequency and proximity data.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Frequency data}. For
|
||||
* each term in the dictionary, the numbers of all the documents that contain that term, and
|
||||
* the frequency of the term in that document, unless frequencies are omitted ({@link
|
||||
* org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
|
||||
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Proximity data}. For
|
||||
* each term in the dictionary, the positions that the term occurs in each document. Note that
|
||||
* this will not exist if all fields in all documents omit position data.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
|
||||
* each field in each document, a value is stored that is multiplied into the score for hits
|
||||
* on that field.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
|
||||
* field in each document, the term vector (sometimes called document vector) may be stored. A
|
||||
* term vector consists of term text and term frequency. To add Term Vectors to your index see
|
||||
* the {@link org.apache.lucene.document.Field Field} constructors
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
|
||||
* stored values, these are also keyed by document number, but are generally intended to be
|
||||
* loaded into main memory for fast access. Whereas stored values are generally intended for
|
||||
* summary results from searches, per-document values are useful for things like scoring
|
||||
* factors.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
|
||||
* optional file indicating which documents are live.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
|
||||
* of files, recording dimensionally indexed fields, to enable fast numeric range filtering
|
||||
* and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
|
||||
* intersection (2D, 3D).
|
||||
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The
|
||||
* vector format stores numeric vectors in a format optimized for random access and
|
||||
* computation, supporting high-dimensional nearest-neighbor search.
|
||||
* </ul>
|
||||
*
|
||||
* <p>Details on each of these are provided in their linked pages. </div> <a id="File_Naming"></a>
|
||||
*
|
||||
* <h3>File Naming</h3>
*
* <div>
*
* <p>All files belonging to a segment have the same name with varying extensions. The extensions
* correspond to the different file formats described below. When using the Compound File format
* (default for small segments) these files (except for the Segment info file, the Lock file, and
* Deleted documents file) are collapsed into a single .cfs file (see below for details).
*
* <p>Typically, all segments in an index are stored in a single directory, although this is not
* required.
*
* <p>File names are never re-used. That is, when any file is saved to the Directory it is given a
* never before used filename. This is achieved using a simple generations approach. For example,
* the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
* integer represented in alpha-numeric (base 36) form. </div> <a id="file-names"></a>
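A one-line sketch of the generation-based naming just described; the generation is rendered in base 36, so the counter stays compact even for long-lived indexes:

// Sketch only: successive commits produce segments_1, segments_2, ..., segments_a, ...
static String segmentsFileName(long generation) {
  return "segments_" + Long.toString(generation, Character.MAX_RADIX); // MAX_RADIX == 36
}
// segmentsFileName(1)  -> "segments_1"
// segmentsFileName(36) -> "segments_10"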
|
||||
*
|
||||
* <h3>Summary of File Extensions</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>The following table summarizes the names and extensions of the files in Lucene:
|
||||
*
|
||||
* <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
|
||||
* <caption>lucene filenames by extension</caption>
|
||||
* <tr>
|
||||
* <th>Name</th>
|
||||
* <th>Extension</th>
|
||||
* <th>Brief Description</th>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
|
||||
* <td>segments_N</td>
|
||||
* <td>Stores information about a commit point</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td><a href="#Lock_File">Lock File</a></td>
|
||||
* <td>write.lock</td>
|
||||
* <td>The Write lock prevents multiple IndexWriters from writing to the same
|
||||
* file.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info}</td>
|
||||
* <td>.si</td>
|
||||
* <td>Stores metadata about a segment</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}</td>
|
||||
* <td>.cfs, .cfe</td>
|
||||
* <td>An optional "virtual" file consisting of all the other index files for
|
||||
* systems that frequently run out of file handles.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}</td>
|
||||
* <td>.fnm</td>
|
||||
* <td>Stores information about the fields</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}</td>
|
||||
* <td>.fdx</td>
|
||||
* <td>Contains pointers to field data</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}</td>
|
||||
* <td>.fdt</td>
|
||||
* <td>The stored fields for documents</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Dictionary}</td>
|
||||
* <td>.tim</td>
|
||||
* <td>The term dictionary, stores term info</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Index}</td>
|
||||
* <td>.tip</td>
|
||||
* <td>The index into the Term Dictionary</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Frequencies}</td>
|
||||
* <td>.doc</td>
|
||||
* <td>Contains the list of docs which contain each term along with frequency</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Positions}</td>
|
||||
* <td>.pos</td>
|
||||
* <td>Stores position information about where a term occurs in the index</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Payloads}</td>
|
||||
* <td>.pay</td>
|
||||
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}</td>
|
||||
* <td>.nvd, .nvm</td>
|
||||
* <td>Encodes length and boost factors for docs and fields</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}</td>
|
||||
* <td>.dvd, .dvm</td>
|
||||
* <td>Encodes additional scoring factors or other per-document information.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}</td>
|
||||
* <td>.tvx</td>
|
||||
* <td>Stores offset into the document data file</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}</td>
|
||||
* <td>.tvd</td>
|
||||
* <td>Contains term vector data.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}</td>
|
||||
* <td>.liv</td>
|
||||
* <td>Info about what documents are live</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}</td>
|
||||
* <td>.kdd, .kdi, .kdm</td>
|
||||
* <td>Holds indexed points</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}</td>
|
||||
* <td>.vec, .vem, .veq, .vex</td>
|
||||
* <td>Holds indexed vectors; <code>.vec</code> files contain the raw vector data,
|
||||
* <code>.vem</code> the vector metadata, <code>.veq</code> the quantized vector data, and <code>.vex</code> the
|
||||
* hnsw graph data.</td>
|
||||
* </tr>
|
||||
* </table>
|
||||
*
|
||||
* </div> <a id="Lock_File"></a>
|
||||
*
|
||||
* <h3>Lock File</h3>
|
||||
*
|
||||
* The write lock, which is stored in the index directory by default, is named "write.lock". If the
|
||||
* lock directory is different from the index directory then the write lock will be named
|
||||
* "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
|
||||
* directory. When this file is present, a writer is currently modifying the index (adding or
|
||||
* removing documents). This lock file ensures that only one writer is modifying the index at a
|
||||
* time. <a id="History"></a>
|
||||
*
|
||||
* <h3>History</h3>
|
||||
*
|
||||
* <p>Compatibility notes are provided in this document, describing how file formats have changed
|
||||
* from prior versions:
|
||||
*
|
||||
* <ul>
|
||||
* <li>In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit
|
||||
* lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
|
||||
* or adding/deleting of docs. When the new segments file is saved (committed), it will be
|
||||
* written in the new file format (meaning no specific "upgrade" process is needed). But note
|
||||
* that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
|
||||
* <li>In version 2.3, the file format was changed to allow segments to share a single set of doc
|
||||
* store (vectors & stored fields) files. This allows for faster indexing in certain
|
||||
* cases. The change is fully backwards compatible (in the same way as the lock-less commits
|
||||
* change in 2.1).
|
||||
* <li>In version 2.4, Strings are now written as true UTF-8 byte sequence, not Java's modified
|
||||
* UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">LUCENE-510</a> for
|
||||
* details.
|
||||
* <li>In version 2.9, an optional opaque Map<String,String> CommitUserData may be passed to
|
||||
* IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
|
||||
* file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">LUCENE-1382</a> for
|
||||
* details. Also, diagnostics were added to each segment written recording details about why
|
||||
* it was written (due to flush, merge; which OS/JRE was used; etc.). See issue <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.
|
||||
* <li>In version 3.0, compressed fields are no longer written to the index (they can still be
|
||||
* read, but on merge the new segment will write them, uncompressed). See issue <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a> for details.
|
||||
* <li>In version 3.1, segments record the code version that created them. See <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
|
||||
* Additionally segments track explicitly whether or not they have term vectors. See <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a> for details.
|
||||
* <li>In version 3.2, numeric fields are written natively to the stored fields file; previously
|
||||
* they were stored in text format only.
|
||||
* <li>In version 3.4, fields can omit position data while still indexing term frequencies.
|
||||
* <li>In version 4.0, the format of the inverted index became extensible via the {@link
|
||||
* org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
|
||||
* was introduced. Normalization factors need no longer be a single byte, they can be any
|
||||
* {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
|
||||
* unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
|
||||
* the postings lists. Payloads can be stored in the term vectors.
|
||||
* <li>In version 4.1, the format of the postings list changed to use either of FOR compression or
|
||||
* variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
|
||||
* were changed to inline directly into the term dictionary. Stored fields are compressed by
|
||||
* default.
|
||||
* <li>In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
|
||||
* type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
|
||||
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.
|
||||
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
|
||||
* allow updating NumericDocValues fields.
|
||||
* <li>In version 4.8, checksum footers were added to the end of each index file for improved data
|
||||
* integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
|
||||
* checksum of the file.
|
||||
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
|
||||
* suitable for faceting/sorting/analytics.
|
||||
* <li>In version 5.4, DocValues have been improved to store more information on disk: addresses
|
||||
* for binary fields and ord indexes for multi-valued fields.
|
||||
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
|
||||
* <li>In version 6.2, new Segment info format that reads/writes the index sort, to support index
|
||||
* sorting.
|
||||
* <li>In version 7.0, DocValues have been improved to better support sparse doc values thanks to
|
||||
* an iterator API.
|
||||
* <li>In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
|
||||
* freq, normalization factor) pairs that may trigger the maximum score of the block. This
|
||||
* information is recorded alongside skip data in order to be able to skip blocks of doc ids
|
||||
* if they may not produce high enough scores. Additionally doc values and norms has been
|
||||
* extended with jump-tables to make access O(1) instead of O(n), where n is the number of
|
||||
* elements to skip when advancing in the data.
|
||||
* <li>In version 8.4, postings, positions, offsets and payload lengths have move to a more
|
||||
* performant encoding that is vectorized.
|
||||
* <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
|
||||
* user-defined sorts to be used
|
||||
* <li>In version 8.6, points fields split the index tree and leaf data into separate files, to
|
||||
* allow for different access patterns to the different data structures
|
||||
* <li>In version 8.7, stored fields compression became adaptive to better handle documents with
|
||||
* smaller stored fields.
|
||||
* <li>In version 9.0, vector-valued fields were added.
|
||||
* <li>In version 9.1, vector-valued fields were modified to add a graph hierarchy.
|
||||
* <li>In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by
|
||||
* IndexDISI. ordToDoc mappings was added to .vem.
|
||||
* <li>In version 9.5, HNSW graph connections were changed to be delta-encoded with vints.
|
||||
* Additionally, metadata file size improvements were made by delta-encoding nodes by graph
|
||||
* layer and not writing the node ids for the zeroth layer.
|
||||
* <li>In version 9.9, Vector scalar quantization support was added. Allowing the HNSW vector
|
||||
* format to utilize int8 quantized vectors for float32 vector search.
|
||||
* <li>In version 9.12, skip data was refactored to have only two levels: every 128 docs and every
|
||||
* 4,06 docs, and to be inlined in postings lists. This resulted in a speedup for queries that
|
||||
* need skipping, especially conjunctions.
|
||||
* </ul>
|
||||
*
|
||||
* <a id="Limitations"></a>
|
||||
*
|
||||
* <h3>Limitations</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>Lucene uses a Java <code>int</code> to refer to document numbers, and the index file format
|
||||
* uses an <code>Int32</code> on-disk to store document numbers. This is a limitation of both the
|
||||
* index file format and the current implementation. Eventually these should be replaced with either
|
||||
* <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
|
||||
* VInt} values which have no limit. </div>
|
||||
*/
|
||||
/** Lucene 9.12 file format. */
|
||||
package org.apache.lucene.backward_codecs.lucene912;
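
For reference, a minimal sketch of how the per-file checksum footer described in the javadoc above (added in version 4.8) can be verified; the directory variable and the file name used here are hypothetical, not part of this patch:

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;

// Sketch: re-read an index file and compare its computed CRC32 against the stored
// 8-byte footer. CodecUtil.checksumEntireFile throws CorruptIndexException on mismatch.
static void verifyFooter(Directory dir, String fileName) throws java.io.IOException {
  try (IndexInput in = dir.openInput(fileName, IOContext.READONCE)) {
    CodecUtil.checksumEntireFile(in);
  }
}
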
@ -21,8 +21,6 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.KnnVectorsReader;
|
||||
import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
|
||||
|
@ -34,6 +32,7 @@ import org.apache.lucene.index.FloatVectorValues;
|
|||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||
import org.apache.lucene.internal.hppc.IntObjectHashMap;
|
||||
import org.apache.lucene.search.KnnCollector;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
|
@ -53,13 +52,15 @@ import org.apache.lucene.util.packed.DirectMonotonicReader;
|
|||
*/
|
||||
public final class Lucene92HnswVectorsReader extends KnnVectorsReader {
|
||||
|
||||
private final Map<String, FieldEntry> fields = new HashMap<>();
|
||||
private final IntObjectHashMap<FieldEntry> fields = new IntObjectHashMap<>();
|
||||
private final IndexInput vectorData;
|
||||
private final IndexInput vectorIndex;
|
||||
private final DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer();
|
||||
private final FieldInfos fieldInfos;
|
||||
|
||||
Lucene92HnswVectorsReader(SegmentReadState state) throws IOException {
|
||||
int versionMeta = readMetadata(state);
|
||||
this.fieldInfos = state.fieldInfos;
|
||||
boolean success = false;
|
||||
try {
|
||||
vectorData =
|
||||
|
@ -152,7 +153,7 @@ public final class Lucene92HnswVectorsReader extends KnnVectorsReader {
|
|||
}
|
||||
FieldEntry fieldEntry = readField(meta, info);
|
||||
validateFieldEntry(info, fieldEntry);
|
||||
fields.put(info.name, fieldEntry);
|
||||
fields.put(info.number, fieldEntry);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -212,13 +213,18 @@ public final class Lucene92HnswVectorsReader extends KnnVectorsReader {
|
|||
CodecUtil.checksumEntireFile(vectorIndex);
|
||||
}
|
||||
|
||||
@Override
|
||||
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
if (fieldEntry == null) {
|
||||
private FieldEntry getFieldEntry(String field) {
|
||||
final FieldInfo info = fieldInfos.fieldInfo(field);
|
||||
final FieldEntry fieldEntry;
|
||||
if (info == null || (fieldEntry = fields.get(info.number)) == null) {
|
||||
throw new IllegalArgumentException("field=\"" + field + "\" not found");
|
||||
}
|
||||
return OffHeapFloatVectorValues.load(fieldEntry, vectorData);
|
||||
return fieldEntry;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
|
||||
return OffHeapFloatVectorValues.load(getFieldEntry(field), vectorData);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -229,8 +235,7 @@ public final class Lucene92HnswVectorsReader extends KnnVectorsReader {
|
|||
@Override
|
||||
public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs)
|
||||
throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
|
||||
final FieldEntry fieldEntry = getFieldEntry(field);
|
||||
if (fieldEntry.size() == 0) {
|
||||
return;
|
||||
}
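
The hunks above, and the matching ones in the Lucene94/Lucene95 readers further down, all follow the same refactoring: per-field metadata is now keyed by the field's number in an IntObjectHashMap instead of by its name, with a small helper that resolves the name through FieldInfos first. A rough, self-contained sketch of that lookup pattern (the FieldEntryLookup class and its type parameter are made up for illustration; the real readers inline this logic in getFieldEntry):

import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.internal.hppc.IntObjectHashMap;

// Illustrative helper, not part of the patch: map per-field entries by field number.
class FieldEntryLookup<E> {
  private final FieldInfos fieldInfos;
  private final IntObjectHashMap<E> fields = new IntObjectHashMap<>();

  FieldEntryLookup(FieldInfos fieldInfos) {
    this.fieldInfos = fieldInfos;
  }

  // Register an entry under the field's number rather than its name.
  void put(FieldInfo info, E entry) {
    fields.put(info.number, entry);
  }

  // Resolve the name to a FieldInfo, then look the entry up by number,
  // failing loudly if either step comes back empty.
  E get(String field) {
    final FieldInfo info = fieldInfos.fieldInfo(field);
    final E entry;
    if (info == null || (entry = fields.get(info.number)) == null) {
      throw new IllegalArgumentException("field=\"" + field + "\" not found");
    }
    return entry;
  }
}
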
@ -21,8 +21,6 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.KnnVectorsReader;
|
||||
import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
|
||||
|
@ -35,6 +33,7 @@ import org.apache.lucene.index.IndexFileNames;
|
|||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.VectorEncoding;
|
||||
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||
import org.apache.lucene.internal.hppc.IntObjectHashMap;
|
||||
import org.apache.lucene.search.KnnCollector;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
|
@ -54,13 +53,15 @@ import org.apache.lucene.util.packed.DirectMonotonicReader;
|
|||
*/
|
||||
public final class Lucene94HnswVectorsReader extends KnnVectorsReader {
|
||||
|
||||
private final Map<String, FieldEntry> fields = new HashMap<>();
|
||||
private final IntObjectHashMap<FieldEntry> fields = new IntObjectHashMap<>();
|
||||
private final IndexInput vectorData;
|
||||
private final IndexInput vectorIndex;
|
||||
private final DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer();
|
||||
private final FieldInfos fieldInfos;
|
||||
|
||||
Lucene94HnswVectorsReader(SegmentReadState state) throws IOException {
|
||||
int versionMeta = readMetadata(state);
|
||||
this.fieldInfos = state.fieldInfos;
|
||||
boolean success = false;
|
||||
try {
|
||||
vectorData =
|
||||
|
@ -153,7 +154,7 @@ public final class Lucene94HnswVectorsReader extends KnnVectorsReader {
|
|||
}
|
||||
FieldEntry fieldEntry = readField(meta, info);
|
||||
validateFieldEntry(info, fieldEntry);
|
||||
fields.put(info.name, fieldEntry);
|
||||
fields.put(info.number, fieldEntry);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -230,48 +231,41 @@ public final class Lucene94HnswVectorsReader extends KnnVectorsReader {
|
|||
CodecUtil.checksumEntireFile(vectorIndex);
|
||||
}
|
||||
|
||||
@Override
|
||||
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
if (fieldEntry == null) {
|
||||
private FieldEntry getFieldEntry(String field, VectorEncoding expectedEncoding) {
|
||||
final FieldInfo info = fieldInfos.fieldInfo(field);
|
||||
final FieldEntry fieldEntry;
|
||||
if (info == null || (fieldEntry = fields.get(info.number)) == null) {
|
||||
throw new IllegalArgumentException("field=\"" + field + "\" not found");
|
||||
}
|
||||
if (fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
|
||||
if (fieldEntry.vectorEncoding != expectedEncoding) {
|
||||
throw new IllegalArgumentException(
|
||||
"field=\""
|
||||
+ field
|
||||
+ "\" is encoded as: "
|
||||
+ fieldEntry.vectorEncoding
|
||||
+ " expected: "
|
||||
+ VectorEncoding.FLOAT32);
|
||||
+ expectedEncoding);
|
||||
}
|
||||
return fieldEntry;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
|
||||
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32);
|
||||
return OffHeapFloatVectorValues.load(fieldEntry, vectorData);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ByteVectorValues getByteVectorValues(String field) throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
if (fieldEntry == null) {
|
||||
throw new IllegalArgumentException("field=\"" + field + "\" not found");
|
||||
}
|
||||
if (fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
|
||||
throw new IllegalArgumentException(
|
||||
"field=\""
|
||||
+ field
|
||||
+ "\" is encoded as: "
|
||||
+ fieldEntry.vectorEncoding
|
||||
+ " expected: "
|
||||
+ VectorEncoding.BYTE);
|
||||
}
|
||||
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE);
|
||||
return OffHeapByteVectorValues.load(fieldEntry, vectorData);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs)
|
||||
throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
|
||||
if (fieldEntry.size() == 0 || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
|
||||
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32);
|
||||
if (fieldEntry.size() == 0 || knnCollector.k() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -289,9 +283,8 @@ public final class Lucene94HnswVectorsReader extends KnnVectorsReader {
|
|||
@Override
|
||||
public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs)
|
||||
throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
|
||||
if (fieldEntry.size() == 0 || fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
|
||||
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE);
|
||||
if (fieldEntry.size() == 0 || knnCollector.k() == 0) {
|
||||
return;
|
||||
}
@ -21,8 +21,6 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.KnnVectorsReader;
|
||||
import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
|
||||
|
@ -39,6 +37,7 @@ import org.apache.lucene.index.IndexFileNames;
|
|||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.VectorEncoding;
|
||||
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||
import org.apache.lucene.internal.hppc.IntObjectHashMap;
|
||||
import org.apache.lucene.search.KnnCollector;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
|
@ -61,7 +60,7 @@ import org.apache.lucene.util.packed.DirectMonotonicReader;
|
|||
public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements HnswGraphProvider {
|
||||
|
||||
private final FieldInfos fieldInfos;
|
||||
private final Map<String, FieldEntry> fields = new HashMap<>();
|
||||
private final IntObjectHashMap<FieldEntry> fields = new IntObjectHashMap<>();
|
||||
private final IndexInput vectorData;
|
||||
private final IndexInput vectorIndex;
|
||||
private final DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer();
|
||||
|
@ -161,7 +160,7 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
|
|||
}
|
||||
FieldEntry fieldEntry = readField(meta, info);
|
||||
validateFieldEntry(info, fieldEntry);
|
||||
fields.put(info.name, fieldEntry);
|
||||
fields.put(info.number, fieldEntry);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -238,21 +237,27 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
|
|||
CodecUtil.checksumEntireFile(vectorIndex);
|
||||
}
|
||||
|
||||
@Override
|
||||
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
if (fieldEntry == null) {
|
||||
private FieldEntry getFieldEntry(String field, VectorEncoding expectedEncoding) {
|
||||
final FieldInfo info = fieldInfos.fieldInfo(field);
|
||||
final FieldEntry fieldEntry;
|
||||
if (info == null || (fieldEntry = fields.get(info.number)) == null) {
|
||||
throw new IllegalArgumentException("field=\"" + field + "\" not found");
|
||||
}
|
||||
if (fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
|
||||
if (fieldEntry.vectorEncoding != expectedEncoding) {
|
||||
throw new IllegalArgumentException(
|
||||
"field=\""
|
||||
+ field
|
||||
+ "\" is encoded as: "
|
||||
+ fieldEntry.vectorEncoding
|
||||
+ " expected: "
|
||||
+ VectorEncoding.FLOAT32);
|
||||
+ expectedEncoding);
|
||||
}
|
||||
return fieldEntry;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
|
||||
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32);
|
||||
return OffHeapFloatVectorValues.load(
|
||||
fieldEntry.similarityFunction,
|
||||
defaultFlatVectorScorer,
|
||||
|
@ -266,19 +271,7 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
|
|||
|
||||
@Override
|
||||
public ByteVectorValues getByteVectorValues(String field) throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
if (fieldEntry == null) {
|
||||
throw new IllegalArgumentException("field=\"" + field + "\" not found");
|
||||
}
|
||||
if (fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
|
||||
throw new IllegalArgumentException(
|
||||
"field=\""
|
||||
+ field
|
||||
+ "\" is encoded as: "
|
||||
+ fieldEntry.vectorEncoding
|
||||
+ " expected: "
|
||||
+ VectorEncoding.BYTE);
|
||||
}
|
||||
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE);
|
||||
return OffHeapByteVectorValues.load(
|
||||
fieldEntry.similarityFunction,
|
||||
defaultFlatVectorScorer,
|
||||
|
@ -293,11 +286,8 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
|
|||
@Override
|
||||
public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs)
|
||||
throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
|
||||
if (fieldEntry.size() == 0
|
||||
|| knnCollector.k() == 0
|
||||
|| fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
|
||||
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32);
|
||||
if (fieldEntry.size() == 0 || knnCollector.k() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -324,11 +314,8 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
|
|||
@Override
|
||||
public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs)
|
||||
throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
|
||||
if (fieldEntry.size() == 0
|
||||
|| knnCollector.k() == 0
|
||||
|| fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
|
||||
final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE);
|
||||
if (fieldEntry.size() == 0 || knnCollector.k() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -355,12 +342,12 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
|
|||
/** Get knn graph values; used for testing */
|
||||
@Override
|
||||
public HnswGraph getGraph(String field) throws IOException {
|
||||
FieldInfo info = fieldInfos.fieldInfo(field);
|
||||
if (info == null) {
|
||||
throw new IllegalArgumentException("No such field '" + field + "'");
|
||||
final FieldInfo info = fieldInfos.fieldInfo(field);
|
||||
final FieldEntry entry;
|
||||
if (info == null || (entry = fields.get(info.number)) == null) {
|
||||
throw new IllegalArgumentException("field=\"" + field + "\" not found");
|
||||
}
|
||||
FieldEntry entry = fields.get(field);
|
||||
if (entry != null && entry.vectorIndexLength > 0) {
|
||||
if (entry.vectorIndexLength > 0) {
|
||||
return getGraph(entry);
|
||||
} else {
|
||||
return HnswGraph.EMPTY;
@ -17,6 +17,7 @@
|
|||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import java.util.Objects;
|
||||
import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.CompoundFormat;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
|
@ -37,7 +38,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
|
|||
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
@ -24,3 +24,4 @@ org.apache.lucene.backward_codecs.lucene94.Lucene94Codec
|
|||
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec
|
||||
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec
|
||||
org.apache.lucene.backward_codecs.lucene912.Lucene912Codec
|
||||
org.apache.lucene.backward_codecs.lucene100.Lucene100Codec
@ -17,3 +17,4 @@ org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat
|
|||
org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat
|
||||
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat
|
||||
org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat
|
||||
org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat
@ -14,25 +14,25 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene912;
|
||||
package org.apache.lucene.backward_codecs.lucene912;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.BLOCK_SIZE;
|
||||
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.DOC_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.LEVEL1_MASK;
|
||||
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.META_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.PAY_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.POS_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.TERMS_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.VERSION_CURRENT;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.BLOCK_SIZE;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.DOC_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.LEVEL1_MASK;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.META_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.PAY_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.POS_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.TERMS_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.VERSION_CURRENT;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
|
||||
import org.apache.lucene.codecs.BlockTermState;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
|
||||
import org.apache.lucene.codecs.PushPostingsWriterBase;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.Impact;
|
||||
|
@ -142,9 +142,8 @@ public class Lucene912PostingsWriter extends PushPostingsWriterBase {
|
|||
metaOut, META_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
CodecUtil.writeIndexHeader(
|
||||
docOut, DOC_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
final ForUtil forUtil = new ForUtil();
|
||||
forDeltaUtil = new ForDeltaUtil();
|
||||
pforUtil = new PForUtil(forUtil);
|
||||
pforUtil = new PForUtil();
|
||||
if (state.fieldInfos.hasProx()) {
|
||||
posDeltaBuffer = new long[BLOCK_SIZE];
|
||||
String posFileName =
@ -0,0 +1,69 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.backward_codecs.lucene912;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.codecs.FieldsConsumer;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/** Read-write impersonation of {@link Lucene912PostingsFormat}. */
|
||||
public final class Lucene912RWPostingsFormat extends Lucene912PostingsFormat {
|
||||
|
||||
private final int minTermBlockSize;
|
||||
private final int maxTermBlockSize;
|
||||
|
||||
/** Creates {@code Lucene912PostingsFormat} with default settings. */
|
||||
public Lucene912RWPostingsFormat() {
|
||||
this(
|
||||
Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
|
||||
Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates {@code Lucene912PostingsFormat} with custom values for {@code minBlockSize} and {@code
|
||||
* maxBlockSize} passed to block terms dictionary.
|
||||
*
|
||||
* @see
|
||||
* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
|
||||
*/
|
||||
public Lucene912RWPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
|
||||
super();
|
||||
Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
|
||||
this.minTermBlockSize = minTermBlockSize;
|
||||
this.maxTermBlockSize = maxTermBlockSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
|
||||
boolean success = false;
|
||||
try {
|
||||
FieldsConsumer ret =
|
||||
new Lucene90BlockTreeTermsWriter(
|
||||
state, postingsWriter, minTermBlockSize, maxTermBlockSize);
|
||||
success = true;
|
||||
return ret;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(postingsWriter);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
@ -14,12 +14,11 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene912;
|
||||
package org.apache.lucene.backward_codecs.lucene912;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
|
||||
import org.apache.lucene.store.ByteBuffersDirectory;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
|
@ -65,13 +64,11 @@ public class TestForDeltaUtil extends LuceneTestCase {
|
|||
{
|
||||
// decode
|
||||
IndexInput in = d.openInput("test.bin", IOContext.READONCE);
|
||||
PostingDecodingUtil pdu =
|
||||
Lucene912PostingsReader.VECTORIZATION_PROVIDER.newPostingDecodingUtil(in);
|
||||
ForDeltaUtil forDeltaUtil = new ForDeltaUtil();
|
||||
for (int i = 0; i < iterations; ++i) {
|
||||
long base = 0;
|
||||
final long[] restored = new long[ForUtil.BLOCK_SIZE];
|
||||
forDeltaUtil.decodeAndPrefixSum(pdu, base, restored);
|
||||
forDeltaUtil.decodeAndPrefixSum(in, base, restored);
|
||||
final long[] expected = new long[ForUtil.BLOCK_SIZE];
|
||||
for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
|
||||
expected[j] = values[i * ForUtil.BLOCK_SIZE + j];
@ -0,0 +1,94 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.backward_codecs.lucene912;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import org.apache.lucene.store.ByteBuffersDirectory;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
import org.apache.lucene.tests.util.TestUtil;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
public class TestForUtil extends LuceneTestCase {
|
||||
|
||||
public void testEncodeDecode() throws IOException {
|
||||
final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000);
|
||||
final int[] values = new int[iterations * ForUtil.BLOCK_SIZE];
|
||||
|
||||
for (int i = 0; i < iterations; ++i) {
|
||||
final int bpv = TestUtil.nextInt(random(), 1, 31);
|
||||
for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
|
||||
values[i * ForUtil.BLOCK_SIZE + j] =
|
||||
RandomNumbers.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv));
|
||||
}
|
||||
}
|
||||
|
||||
final Directory d = new ByteBuffersDirectory();
|
||||
final long endPointer;
|
||||
|
||||
{
|
||||
// encode
|
||||
IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT);
|
||||
final ForUtil forUtil = new ForUtil();
|
||||
|
||||
for (int i = 0; i < iterations; ++i) {
|
||||
long[] source = new long[ForUtil.BLOCK_SIZE];
|
||||
long or = 0;
|
||||
for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
|
||||
source[j] = values[i * ForUtil.BLOCK_SIZE + j];
|
||||
or |= source[j];
|
||||
}
|
||||
final int bpv = PackedInts.bitsRequired(or);
|
||||
out.writeByte((byte) bpv);
|
||||
forUtil.encode(source, bpv, out);
|
||||
}
|
||||
endPointer = out.getFilePointer();
|
||||
out.close();
|
||||
}
|
||||
|
||||
{
|
||||
// decode
|
||||
IndexInput in = d.openInput("test.bin", IOContext.READONCE);
|
||||
ForUtil forUtil = new ForUtil();
|
||||
for (int i = 0; i < iterations; ++i) {
|
||||
final int bitsPerValue = in.readByte();
|
||||
final long currentFilePointer = in.getFilePointer();
|
||||
final long[] restored = new long[ForUtil.BLOCK_SIZE];
|
||||
forUtil.decode(bitsPerValue, in, restored);
|
||||
int[] ints = new int[ForUtil.BLOCK_SIZE];
|
||||
for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
|
||||
ints[j] = Math.toIntExact(restored[j]);
|
||||
}
|
||||
assertArrayEquals(
|
||||
Arrays.toString(ints),
|
||||
ArrayUtil.copyOfSubArray(values, i * ForUtil.BLOCK_SIZE, (i + 1) * ForUtil.BLOCK_SIZE),
|
||||
ints);
|
||||
assertEquals(ForUtil.numBytes(bitsPerValue), in.getFilePointer() - currentFilePointer);
|
||||
}
|
||||
assertEquals(endPointer, in.getFilePointer());
|
||||
in.close();
|
||||
}
|
||||
|
||||
d.close();
|
||||
}
|
||||
}
@ -14,17 +14,17 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene912;
|
||||
package org.apache.lucene.backward_codecs.lucene912;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsReader.MutableImpactList;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.Stats;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader.MutableImpactList;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
|
@ -45,7 +45,7 @@ public class TestLucene912PostingsFormat extends BasePostingsFormatTestCase {
|
|||
|
||||
@Override
|
||||
protected Codec getCodec() {
|
||||
return TestUtil.alwaysPostingsFormat(new Lucene912PostingsFormat());
|
||||
return TestUtil.alwaysPostingsFormat(new Lucene912RWPostingsFormat());
|
||||
}
|
||||
|
||||
public void testVInt15() throws IOException {
@ -0,0 +1,104 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.backward_codecs.lucene912;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import org.apache.lucene.store.ByteBuffersDirectory;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
import org.apache.lucene.tests.util.TestUtil;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
public class TestPForUtil extends LuceneTestCase {
|
||||
|
||||
public void testEncodeDecode() throws IOException {
|
||||
final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000);
|
||||
final int[] values = createTestData(iterations, 31);
|
||||
|
||||
final Directory d = new ByteBuffersDirectory();
|
||||
final long endPointer = encodeTestData(iterations, values, d);
|
||||
|
||||
IndexInput in = d.openInput("test.bin", IOContext.READONCE);
|
||||
final PForUtil pforUtil = new PForUtil();
|
||||
for (int i = 0; i < iterations; ++i) {
|
||||
if (random().nextInt(5) == 0) {
|
||||
PForUtil.skip(in);
|
||||
continue;
|
||||
}
|
||||
final long[] restored = new long[ForUtil.BLOCK_SIZE];
|
||||
pforUtil.decode(in, restored);
|
||||
int[] ints = new int[ForUtil.BLOCK_SIZE];
|
||||
for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
|
||||
ints[j] = Math.toIntExact(restored[j]);
|
||||
}
|
||||
assertArrayEquals(
|
||||
Arrays.toString(ints),
|
||||
ArrayUtil.copyOfSubArray(values, i * ForUtil.BLOCK_SIZE, (i + 1) * ForUtil.BLOCK_SIZE),
|
||||
ints);
|
||||
}
|
||||
assertEquals(endPointer, in.getFilePointer());
|
||||
in.close();
|
||||
|
||||
d.close();
|
||||
}
|
||||
|
||||
private int[] createTestData(int iterations, int maxBpv) {
|
||||
final int[] values = new int[iterations * ForUtil.BLOCK_SIZE];
|
||||
|
||||
for (int i = 0; i < iterations; ++i) {
|
||||
final int bpv = TestUtil.nextInt(random(), 0, maxBpv);
|
||||
for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
|
||||
values[i * ForUtil.BLOCK_SIZE + j] =
|
||||
RandomNumbers.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv));
|
||||
if (random().nextInt(100) == 0) {
|
||||
final int exceptionBpv;
|
||||
if (random().nextInt(10) == 0) {
|
||||
exceptionBpv = Math.min(bpv + TestUtil.nextInt(random(), 9, 16), maxBpv);
|
||||
} else {
|
||||
exceptionBpv = Math.min(bpv + TestUtil.nextInt(random(), 1, 8), maxBpv);
|
||||
}
|
||||
values[i * ForUtil.BLOCK_SIZE + j] |= random().nextInt(1 << (exceptionBpv - bpv)) << bpv;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return values;
|
||||
}
|
||||
|
||||
private long encodeTestData(int iterations, int[] values, Directory d) throws IOException {
|
||||
IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT);
|
||||
final PForUtil pforUtil = new PForUtil();
|
||||
|
||||
for (int i = 0; i < iterations; ++i) {
|
||||
long[] source = new long[ForUtil.BLOCK_SIZE];
|
||||
for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
|
||||
source[j] = values[i * ForUtil.BLOCK_SIZE + j];
|
||||
}
|
||||
pforUtil.encode(source, out);
|
||||
}
|
||||
final long endPointer = out.getFilePointer();
|
||||
out.close();
|
||||
|
||||
return endPointer;
|
||||
}
|
||||
}
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene912;
|
||||
package org.apache.lucene.backward_codecs.lucene912;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.Directory;
@ -18,17 +18,12 @@
|
|||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||
import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase;
|
||||
import org.apache.lucene.tests.util.TestUtil;
|
||||
|
||||
public class TestLucene99HnswScalarQuantizedVectorsFormat extends BaseKnnVectorsFormatTestCase {
|
||||
@Override
|
||||
protected Codec getCodec() {
|
||||
return new Lucene99Codec() {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return new Lucene99RWHnswScalarQuantizationVectorsFormat();
|
||||
}
|
||||
};
|
||||
return TestUtil.alwaysKnnVectorsFormat(new Lucene99RWHnswScalarQuantizationVectorsFormat());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,9 +20,7 @@ import static org.apache.lucene.backward_index.TestBasicBackwardsCompatibility.a
|
|||
|
||||
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.KnnVectorsReader;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
|
||||
|
@ -69,14 +67,10 @@ public class TestInt7HnswBackwardsCompatibility extends BackwardsCompatibilityTe
|
|||
}
|
||||
|
||||
protected Codec getCodec() {
|
||||
return new Lucene99Codec() {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return new Lucene99HnswScalarQuantizedVectorsFormat(
|
||||
return TestUtil.alwaysKnnVectorsFormat(
|
||||
new Lucene99HnswScalarQuantizedVectorsFormat(
|
||||
Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN,
|
||||
Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH);
|
||||
}
|
||||
};
|
||||
Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH));
|
||||
}
|
||||
|
||||
@Override
@ -16,6 +16,9 @@
|
|||
*/
|
||||
|
||||
/** Lucene JMH benchmarks. */
|
||||
|
||||
// jmh.core is not modularized and causes a warning. Suppressing it until it is modularized.
|
||||
@SuppressWarnings("requires-automatic")
|
||||
module org.apache.lucene.benchmark.jmh {
|
||||
requires jmh.core;
|
||||
requires jdk.unsupported;
@ -0,0 +1,180 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.benchmark.jmh;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.util.VectorUtil;
|
||||
import org.openjdk.jmh.annotations.Benchmark;
|
||||
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||
import org.openjdk.jmh.annotations.CompilerControl;
|
||||
import org.openjdk.jmh.annotations.Fork;
|
||||
import org.openjdk.jmh.annotations.Level;
|
||||
import org.openjdk.jmh.annotations.Measurement;
|
||||
import org.openjdk.jmh.annotations.Mode;
|
||||
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||
import org.openjdk.jmh.annotations.Scope;
|
||||
import org.openjdk.jmh.annotations.Setup;
|
||||
import org.openjdk.jmh.annotations.State;
|
||||
import org.openjdk.jmh.annotations.Warmup;
|
||||
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@OutputTimeUnit(TimeUnit.MILLISECONDS)
|
||||
@State(Scope.Benchmark)
|
||||
@Warmup(iterations = 5, time = 1)
|
||||
@Measurement(iterations = 5, time = 1)
|
||||
@Fork(
|
||||
value = 3,
|
||||
jvmArgsAppend = {
|
||||
"-Xmx1g",
|
||||
"-Xms1g",
|
||||
"-XX:+AlwaysPreTouch",
|
||||
"--add-modules",
|
||||
"jdk.incubator.vector"
|
||||
})
|
||||
public class AdvanceBenchmark {
|
||||
|
||||
private final int[] values = new int[129];
|
||||
private final int[] startIndexes = new int[1_000];
|
||||
private final int[] targets = new int[startIndexes.length];
|
||||
|
||||
@Setup(Level.Trial)
|
||||
public void setup() throws Exception {
|
||||
for (int i = 0; i < 128; ++i) {
|
||||
values[i] = i;
|
||||
}
|
||||
values[128] = DocIdSetIterator.NO_MORE_DOCS;
|
||||
Random r = new Random(0);
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
startIndexes[i] = r.nextInt(64);
|
||||
targets[i] = startIndexes[i] + 1 + r.nextInt(1 << r.nextInt(7));
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void binarySearch() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
binarySearch(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int binarySearch(int[] values, int target, int startIndex) {
|
||||
// Standard binary search
|
||||
int i = Arrays.binarySearch(values, startIndex, values.length, target);
|
||||
if (i < 0) {
|
||||
i = -1 - i;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void inlinedBranchlessBinarySearch() {
|
||||
for (int i = 0; i < targets.length; ++i) {
|
||||
inlinedBranchlessBinarySearch(values, targets[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int inlinedBranchlessBinarySearch(int[] values, int target) {
|
||||
// This compiles to cmov instructions.
|
||||
int start = 0;
|
||||
|
||||
if (values[63] < target) {
|
||||
start += 64;
|
||||
}
|
||||
if (values[start + 31] < target) {
|
||||
start += 32;
|
||||
}
|
||||
if (values[start + 15] < target) {
|
||||
start += 16;
|
||||
}
|
||||
if (values[start + 7] < target) {
|
||||
start += 8;
|
||||
}
|
||||
if (values[start + 3] < target) {
|
||||
start += 4;
|
||||
}
|
||||
if (values[start + 1] < target) {
|
||||
start += 2;
|
||||
}
|
||||
if (values[start] < target) {
|
||||
start += 1;
|
||||
}
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void linearSearch() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
linearSearch(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int linearSearch(int[] values, long target, int startIndex) {
|
||||
// Naive linear search.
|
||||
for (int i = startIndex; i < values.length; ++i) {
|
||||
if (values[i] >= target) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return values.length;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void vectorUtilSearch() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
VectorUtil.findNextGEQ(values, targets[i], startIndexes[i], 128);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int vectorUtilSearch(int[] values, int target, int startIndex) {
|
||||
return VectorUtil.findNextGEQ(values, target, startIndex, 128);
|
||||
}
|
||||
|
||||
private static void assertEquals(int expected, int actual) {
|
||||
if (expected != actual) {
|
||||
throw new AssertionError("Expected: " + expected + ", got " + actual);
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
// For testing purposes
|
||||
int[] values = new int[129];
|
||||
for (int i = 0; i < 128; ++i) {
|
||||
values[i] = i;
|
||||
}
|
||||
values[128] = DocIdSetIterator.NO_MORE_DOCS;
|
||||
for (int start = 0; start < 128; ++start) {
|
||||
for (int targetIndex = start; targetIndex < 128; ++targetIndex) {
|
||||
int actualIndex = binarySearch(values, values[targetIndex], start);
|
||||
assertEquals(targetIndex, actualIndex);
|
||||
actualIndex = inlinedBranchlessBinarySearch(values, values[targetIndex]);
|
||||
assertEquals(targetIndex, actualIndex);
|
||||
actualIndex = linearSearch(values, values[targetIndex], start);
|
||||
assertEquals(targetIndex, actualIndex);
|
||||
actualIndex = vectorUtilSearch(values, values[targetIndex], start);
|
||||
assertEquals(targetIndex, actualIndex);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -21,9 +21,9 @@ import java.nio.file.Files;
|
|||
import java.nio.file.Path;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.apache.lucene.codecs.lucene912.ForDeltaUtil;
|
||||
import org.apache.lucene.codecs.lucene912.ForUtil;
|
||||
import org.apache.lucene.codecs.lucene912.PostingIndexInput;
|
||||
import org.apache.lucene.codecs.lucene101.ForDeltaUtil;
|
||||
import org.apache.lucene.codecs.lucene101.ForUtil;
|
||||
import org.apache.lucene.codecs.lucene101.PostingIndexInput;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
|
@ -61,7 +61,7 @@ public class PostingIndexInputBenchmark {
|
|||
private PostingIndexInput postingIn;
|
||||
private final ForUtil forUtil = new ForUtil();
|
||||
private final ForDeltaUtil forDeltaUtil = new ForDeltaUtil();
|
||||
private final long[] values = new long[128];
|
||||
private final int[] values = new int[ForUtil.BLOCK_SIZE];
|
||||
|
||||
@Param({"2", "3", "4", "5", "6", "7", "8", "9", "10"})
|
||||
public int bpv;
|
||||
|
|
|
@ -112,8 +112,7 @@ public abstract class ReadTask extends PerfTask {
|
|||
// pulling the Weight ourselves:
|
||||
int totalHitsThreshold = withTotalHits() ? Integer.MAX_VALUE : 1;
|
||||
TopFieldCollectorManager collectorManager =
|
||||
new TopFieldCollectorManager(
|
||||
sort, numHits, null, totalHitsThreshold, searcher.getSlices().length > 1);
|
||||
new TopFieldCollectorManager(sort, numHits, null, totalHitsThreshold);
|
||||
hits = searcher.search(q, collectorManager);
|
||||
} else {
|
||||
hits = searcher.search(q, numHits);
|
||||
|
|
|
@ -22,14 +22,14 @@ import org.apache.lucene.codecs.FieldsProducer;
|
|||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.PostingsReaderBase;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
|
||||
import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene912PostingsWriter}. */
|
||||
/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene101PostingsWriter}. */
|
||||
public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
|
||||
|
||||
private final int minTermBlockSize;
|
||||
|
@ -67,7 +67,7 @@ public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
|
|||
|
||||
@Override
|
||||
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
|
||||
PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state);
|
||||
|
||||
boolean success = false;
|
||||
try {
|
||||
|
@ -84,7 +84,7 @@ public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
|
|||
|
||||
@Override
|
||||
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||
PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
|
||||
PostingsReaderBase postingsReader = new Lucene101PostingsReader(state);
|
||||
boolean success = false;
|
||||
try {
|
||||
FieldsProducer ret = new OrdsBlockTreeTermsReader(postingsReader, state);
|
||||
|
|
|
@ -24,7 +24,7 @@ import java.util.TreeMap;
|
|||
import org.apache.lucene.codecs.FieldsConsumer;
|
||||
import org.apache.lucene.codecs.FieldsProducer;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
|
||||
import org.apache.lucene.index.BaseTermsEnum;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.Fields;
|
||||
|
@ -54,7 +54,7 @@ import org.apache.lucene.util.automaton.TransitionAccessor;
|
|||
// - or: longer dense skip lists than just next byte?
|
||||
|
||||
/**
|
||||
* Wraps {@link Lucene912PostingsFormat} format for on-disk storage, but then at read time loads and
|
||||
* Wraps {@link Lucene101PostingsFormat} format for on-disk storage, but then at read time loads and
|
||||
* stores all terms and postings directly in RAM as byte[], int[].
|
||||
*
|
||||
* <p><b>WARNING</b>: This is exceptionally RAM intensive: it makes no effort to compress the
|
||||
|
@ -97,12 +97,12 @@ public final class DirectPostingsFormat extends PostingsFormat {
|
|||
|
||||
@Override
|
||||
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
return PostingsFormat.forName("Lucene912").fieldsConsumer(state);
|
||||
return PostingsFormat.forName("Lucene101").fieldsConsumer(state);
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||
FieldsProducer postings = PostingsFormat.forName("Lucene912").fieldsProducer(state);
|
||||
FieldsProducer postings = PostingsFormat.forName("Lucene101").fieldsProducer(state);
|
||||
if (state.context.context() != IOContext.Context.MERGE) {
|
||||
FieldsProducer loadedPostings;
|
||||
try {
|
||||
|
|
|
@ -22,8 +22,8 @@ import org.apache.lucene.codecs.FieldsProducer;
|
|||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.PostingsReaderBase;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
|
||||
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
|
||||
import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
@ -41,7 +41,7 @@ public final class FSTPostingsFormat extends PostingsFormat {
|
|||
|
||||
@Override
|
||||
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
|
||||
PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state);
|
||||
|
||||
boolean success = false;
|
||||
try {
|
||||
|
@ -57,7 +57,7 @@ public final class FSTPostingsFormat extends PostingsFormat {
|
|||
|
||||
@Override
|
||||
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||
PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
|
||||
PostingsReaderBase postingsReader = new Lucene101PostingsReader(state);
|
||||
boolean success = false;
|
||||
try {
|
||||
FieldsProducer ret = new FSTTermsReader(state, postingsReader);
|
||||
|
|
|
@ -26,8 +26,6 @@ import static org.apache.lucene.codecs.simpletext.SimpleTextKnnVectorsWriter.VEC
|
|||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import org.apache.lucene.codecs.KnnVectorsReader;
|
||||
import org.apache.lucene.index.ByteVectorValues;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
|
@ -36,6 +34,7 @@ import org.apache.lucene.index.FloatVectorValues;
|
|||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||
import org.apache.lucene.internal.hppc.IntObjectHashMap;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.KnnCollector;
|
||||
import org.apache.lucene.search.VectorScorer;
|
||||
|
@ -63,7 +62,7 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader {
|
|||
private final SegmentReadState readState;
|
||||
private final IndexInput dataIn;
|
||||
private final BytesRefBuilder scratch = new BytesRefBuilder();
|
||||
private final Map<String, FieldEntry> fieldEntries = new HashMap<>();
|
||||
private final IntObjectHashMap<FieldEntry> fieldEntries = new IntObjectHashMap<>();
|
||||
|
||||
SimpleTextKnnVectorsReader(SegmentReadState readState) throws IOException {
|
||||
this.readState = readState;
|
||||
|
@ -91,9 +90,9 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader {
|
|||
for (int i = 0; i < size; i++) {
|
||||
docIds[i] = readInt(in, EMPTY);
|
||||
}
|
||||
assert fieldEntries.containsKey(fieldName) == false;
|
||||
assert fieldEntries.containsKey(fieldNumber) == false;
|
||||
fieldEntries.put(
|
||||
fieldName,
|
||||
fieldNumber,
|
||||
new FieldEntry(
|
||||
dimension,
|
||||
vectorDataOffset,
|
||||
|
@ -126,7 +125,7 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader {
|
|||
throw new IllegalStateException(
|
||||
"KNN vectors readers should not be called on fields that don't enable KNN vectors");
|
||||
}
|
||||
FieldEntry fieldEntry = fieldEntries.get(field);
|
||||
FieldEntry fieldEntry = fieldEntries.get(info.number);
|
||||
if (fieldEntry == null) {
|
||||
// mirror the handling in Lucene90VectorReader#getVectorValues
|
||||
// needed to pass TestSimpleTextKnnVectorsFormat#testDeleteAllVectorDocs
|
||||
|
@ -159,7 +158,7 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader {
|
|||
throw new IllegalStateException(
|
||||
"KNN vectors readers should not be called on fields that don't enable KNN vectors");
|
||||
}
|
||||
FieldEntry fieldEntry = fieldEntries.get(field);
|
||||
FieldEntry fieldEntry = fieldEntries.get(info.number);
|
||||
if (fieldEntry == null) {
|
||||
// mirror the handling in Lucene90VectorReader#getVectorValues
|
||||
// needed to pass TestSimpleTextKnnVectorsFormat#testDeleteAllVectorDocs
|
||||
|
|
|
@ -17,13 +17,13 @@

package org.apache.lucene.codecs.uniformsplit;

import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.BLOCK_SIZE;

import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.TermState;
@ -34,7 +34,7 @@ import org.apache.lucene.util.RamUsageEstimator;

/**
* {@link TermState} serializer which encodes each file pointer as a delta relative to a base file
* pointer. It differs from {@link Lucene912PostingsWriter#encodeTerm} which encodes each file
* pointer. It differs from {@link Lucene101PostingsWriter#encodeTerm} which encodes each file
* pointer as a delta relative to the previous file pointer.
*
* <p>It automatically sets the base file pointer to the first valid file pointer for doc start FP,
@ -95,7 +95,7 @@ public class DeltaBaseTermStateSerializer implements Accountable {
/**
* Writes a {@link BlockTermState} to the provided {@link DataOutput}.
*
* <p>Simpler variant of {@link Lucene912PostingsWriter#encodeTerm(DataOutput, FieldInfo,
* <p>Simpler variant of {@link Lucene101PostingsWriter#encodeTerm(DataOutput, FieldInfo,
* BlockTermState, boolean)}.
*/
public void writeTermState(
@ -145,7 +145,7 @@ public class DeltaBaseTermStateSerializer implements Accountable {
/**
* Reads a {@link BlockTermState} from the provided {@link DataInput}.
*
* <p>Simpler variant of {@link Lucene912PostingsReader#decodeTerm(DataInput, FieldInfo,
* <p>Simpler variant of {@link Lucene101PostingsReader#decodeTerm(DataInput, FieldInfo,
* BlockTermState, boolean)}.
*
* @param reuse {@link BlockTermState} to reuse; or null to create a new one.

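The javadoc above contrasts two delta encodings for term-state file pointers. As a minimal, standalone sketch (illustrative only, not the Lucene implementation; class and method names are made up), this is the difference between encoding against a fixed base and encoding against the previous pointer:

import java.util.Arrays;

class FilePointerDeltaSketch {
  // Delta relative to a fixed base: every entry can be decoded independently of its neighbors.
  static long[] encodeFromBase(long base, long[] filePointers) {
    long[] deltas = new long[filePointers.length];
    for (int i = 0; i < filePointers.length; i++) {
      deltas[i] = filePointers[i] - base;
    }
    return deltas;
  }

  // Delta relative to the previous pointer: smaller numbers, but decoding is strictly sequential.
  static long[] encodeFromPrevious(long[] filePointers) {
    long[] deltas = new long[filePointers.length];
    long previous = 0;
    for (int i = 0; i < filePointers.length; i++) {
      deltas[i] = filePointers[i] - previous;
      previous = filePointers[i];
    }
    return deltas;
  }

  public static void main(String[] args) {
    long[] fps = {1024, 1100, 1180, 1300};
    System.out.println(Arrays.toString(encodeFromBase(1024, fps)));  // [0, 76, 156, 276]
    System.out.println(Arrays.toString(encodeFromPrevious(fps)));    // [1024, 76, 80, 120]
  }
}
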
@ -23,8 +23,8 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
@ -113,7 +113,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat {

@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state);
boolean success = false;
try {
FieldsConsumer termsWriter =
@ -130,7 +130,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat {

@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
PostingsReaderBase postingsReader = new Lucene101PostingsReader(state);
boolean success = false;
try {
FieldsProducer termsReader =

@ -28,7 +28,7 @@
* org.apache.lucene.search.PhraseQuery})
* <li>Quite efficient for {@link org.apache.lucene.search.PrefixQuery}
* <li>Not efficient for spell-check and {@link org.apache.lucene.search.FuzzyQuery}, in this case
* prefer {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat}
* prefer {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat}
* </ul>
*/
package org.apache.lucene.codecs.uniformsplit;

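The package javadoc above recommends the default Lucene101 postings for fuzzy/spell-check-heavy fields, while UniformSplit favors prefix-style access. A hedged sketch of wiring that up per field, assuming Lucene101Codec exposes the usual getPostingsFormatForField override hook (an assumption; check the codec's API before relying on it):

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene101.Lucene101Codec;
import org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat;
import org.apache.lucene.index.IndexWriterConfig;

class PerFieldPostingsSketch {
  static IndexWriterConfig configWithUniformSplitFor(String prefixHeavyField) {
    Codec codec =
        new Lucene101Codec() {
          // Hypothetical wiring: one UniformSplit instance shared across matching fields.
          final PostingsFormat uniformSplit = new UniformSplitPostingsFormat();

          @Override
          public PostingsFormat getPostingsFormatForField(String field) {
            // Prefix-heavy field gets UniformSplit; everything else keeps the codec default.
            return field.equals(prefixHeavyField)
                ? uniformSplit
                : super.getPostingsFormatForField(field);
          }
        };
    return new IndexWriterConfig().setCodec(codec);
  }
}
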
@ -22,7 +22,6 @@ import java.io.IOException;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene100.Lucene100Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.KnnByteVectorField;
@ -38,16 +37,12 @@ import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopKnnCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.BaseIndexFileFormatTestCase;
import org.apache.lucene.tests.util.TestUtil;

public class TestHnswBitVectorsFormat extends BaseIndexFileFormatTestCase {
@Override
protected Codec getCodec() {
return new Lucene100Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return new HnswBitVectorsFormat();
}
};
return TestUtil.alwaysKnnVectorsFormat(new HnswBitVectorsFormat());
}

@Override

@ -17,7 +17,7 @@

package org.apache.lucene.codecs.lucene90.tests;

import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState;

/** Test utility class to create mock {@link IntBlockTermState}. */
public class MockTermStateFactory {

@ -1,4 +1,4 @@
{
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java": "f561578ccb6a95364bb62c5ed86b38ff0b4a009d",
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForDeltaUtil.py": "eea1a71be9da8a13fdd979354dc4a8c6edf21be1"
"lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java": "0ff7fb9159693055d9e4b9468b004166156f6550",
"lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py": "8c55b7aaced028388408c5eb968b1f1197e11142"
}

@ -1,4 +1,4 @@
{
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForUtil.java": "159e82388346fde147924d5e15ca65df4dd63b9a",
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForUtil.py": "66dc8813160feae2a37d8b50474f5f9830b6cb22"
"lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForUtil.java": "10ceb79f031232bc1e4564db7e3ebb16eedd2e0a",
"lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForUtil.py": "d69e734bce30375952046a3776bbb7a5c1edbd51"
}

@ -15,8 +15,6 @@
* limitations under the License.
*/

import org.apache.lucene.codecs.lucene100.Lucene100Codec;

/** Lucene Core. */
@SuppressWarnings("module") // the test framework is compiled after the core...
module org.apache.lucene.core {
@ -33,8 +31,7 @@ module org.apache.lucene.core {
exports org.apache.lucene.codecs.lucene94;
exports org.apache.lucene.codecs.lucene95;
exports org.apache.lucene.codecs.lucene99;
exports org.apache.lucene.codecs.lucene912;
exports org.apache.lucene.codecs.lucene100;
exports org.apache.lucene.codecs.lucene101;
exports org.apache.lucene.codecs.perfield;
exports org.apache.lucene.codecs;
exports org.apache.lucene.document;
@ -73,7 +70,7 @@ module org.apache.lucene.core {
provides org.apache.lucene.analysis.TokenizerFactory with
org.apache.lucene.analysis.standard.StandardTokenizerFactory;
provides org.apache.lucene.codecs.Codec with
Lucene100Codec;
org.apache.lucene.codecs.lucene101.Lucene101Codec;
provides org.apache.lucene.codecs.DocValuesFormat with
org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
provides org.apache.lucene.codecs.KnnVectorsFormat with
@ -81,7 +78,7 @@ module org.apache.lucene.core {
org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat,
org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat;
provides org.apache.lucene.codecs.PostingsFormat with
org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
provides org.apache.lucene.index.SortFieldProvider with
org.apache.lucene.search.SortField.Provider,
org.apache.lucene.search.SortedNumericSortField.Provider,

@ -55,7 +55,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {
return LOADER;
}

static Codec defaultCodec = LOADER.lookup("Lucene100");
static Codec defaultCodec = LOADER.lookup("Lucene101");
}

private final String name;

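With the default codec lookup switched to "Lucene101", callers that rely on the SPI default pick it up automatically. A small sketch, assuming Codec.getDefault() and IndexWriterConfig behave as in previous releases:

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.IndexWriterConfig;

class DefaultCodecSketch {
  public static void main(String[] args) {
    // The SPI default now resolves to the Lucene101 codec (per the change above).
    Codec defaultCodec = Codec.getDefault();
    System.out.println("default codec: " + defaultCodec.getName());

    // Writers that don't set a codec explicitly fall back to the same default.
    IndexWriterConfig config = new IndexWriterConfig();
    System.out.println("writer codec: " + config.getCodec().getName());
  }
}
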
@ -0,0 +1,525 @@
// This file has been automatically generated, DO NOT EDIT

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene101;

import static org.apache.lucene.codecs.lucene101.ForUtil.*;

import java.io.IOException;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.packed.PackedInts;

/**
* Inspired from https://fulmicoton.com/posts/bitpacking/ Encodes multiple integers in a Java int to
* get SIMD-like speedups. If bitsPerValue <= 4 then we pack 4 ints per Java int else if
* bitsPerValue <= 11 we pack 2 ints per Java int else we use scalar operations.
*/
public final class ForDeltaUtil {

private static final int HALF_BLOCK_SIZE = BLOCK_SIZE / 2;
private static final int ONE_BLOCK_SIZE_FOURTH = BLOCK_SIZE / 4;
private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2;
private static final int THREE_BLOCK_SIZE_FOURTHS = 3 * BLOCK_SIZE / 4;

// IDENTITY_PLUS_ONE[i] == i+1
private static final int[] IDENTITY_PLUS_ONE = new int[ForUtil.BLOCK_SIZE];

static {
for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
IDENTITY_PLUS_ONE[i] = i + 1;
}
}

private static void prefixSumOfOnes(int[] arr, int base) {
System.arraycopy(IDENTITY_PLUS_ONE, 0, arr, 0, ForUtil.BLOCK_SIZE);
// This loop gets auto-vectorized
for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
arr[i] += base;
}
}

private static void prefixSum8(int[] arr, int base) {
// When the number of bits per value is 4 or less, we can sum up all values in a block without
// risking overflowing an 8-bits integer. This allows computing the prefix sum by summing up 4
// values at once.
innerPrefixSum8(arr);
expand8(arr);
final int l0 = base;
final int l1 = l0 + arr[ONE_BLOCK_SIZE_FOURTH - 1];
final int l2 = l1 + arr[TWO_BLOCK_SIZE_FOURTHS - 1];
final int l3 = l2 + arr[THREE_BLOCK_SIZE_FOURTHS - 1];

for (int i = 0; i < ONE_BLOCK_SIZE_FOURTH; ++i) {
arr[i] += l0;
arr[ONE_BLOCK_SIZE_FOURTH + i] += l1;
arr[TWO_BLOCK_SIZE_FOURTHS + i] += l2;
arr[THREE_BLOCK_SIZE_FOURTHS + i] += l3;
}
}

private static void prefixSum16(int[] arr, int base) {
// When the number of bits per value is 11 or less, we can sum up all values in a block without
// risking overflowing an 16-bits integer. This allows computing the prefix sum by summing up 2
// values at once.
innerPrefixSum16(arr);
expand16(arr);
final int l0 = base;
final int l1 = base + arr[HALF_BLOCK_SIZE - 1];
for (int i = 0; i < HALF_BLOCK_SIZE; ++i) {
arr[i] += l0;
arr[HALF_BLOCK_SIZE + i] += l1;
}
}

private static void prefixSum32(int[] arr, int base) {
arr[0] += base;
for (int i = 1; i < BLOCK_SIZE; ++i) {
arr[i] += arr[i - 1];
}
}

// For some reason unrolling seems to help
|
||||
private static void innerPrefixSum8(int[] arr) {
|
||||
arr[1] += arr[0];
|
||||
arr[2] += arr[1];
|
||||
arr[3] += arr[2];
|
||||
arr[4] += arr[3];
|
||||
arr[5] += arr[4];
|
||||
arr[6] += arr[5];
|
||||
arr[7] += arr[6];
|
||||
arr[8] += arr[7];
|
||||
arr[9] += arr[8];
|
||||
arr[10] += arr[9];
|
||||
arr[11] += arr[10];
|
||||
arr[12] += arr[11];
|
||||
arr[13] += arr[12];
|
||||
arr[14] += arr[13];
|
||||
arr[15] += arr[14];
|
||||
arr[16] += arr[15];
|
||||
arr[17] += arr[16];
|
||||
arr[18] += arr[17];
|
||||
arr[19] += arr[18];
|
||||
arr[20] += arr[19];
|
||||
arr[21] += arr[20];
|
||||
arr[22] += arr[21];
|
||||
arr[23] += arr[22];
|
||||
arr[24] += arr[23];
|
||||
arr[25] += arr[24];
|
||||
arr[26] += arr[25];
|
||||
arr[27] += arr[26];
|
||||
arr[28] += arr[27];
|
||||
arr[29] += arr[28];
|
||||
arr[30] += arr[29];
|
||||
arr[31] += arr[30];
|
||||
}
|
||||
|
||||
// For some reason unrolling seems to help
|
||||
private static void innerPrefixSum16(int[] arr) {
|
||||
arr[1] += arr[0];
|
||||
arr[2] += arr[1];
|
||||
arr[3] += arr[2];
|
||||
arr[4] += arr[3];
|
||||
arr[5] += arr[4];
|
||||
arr[6] += arr[5];
|
||||
arr[7] += arr[6];
|
||||
arr[8] += arr[7];
|
||||
arr[9] += arr[8];
|
||||
arr[10] += arr[9];
|
||||
arr[11] += arr[10];
|
||||
arr[12] += arr[11];
|
||||
arr[13] += arr[12];
|
||||
arr[14] += arr[13];
|
||||
arr[15] += arr[14];
|
||||
arr[16] += arr[15];
|
||||
arr[17] += arr[16];
|
||||
arr[18] += arr[17];
|
||||
arr[19] += arr[18];
|
||||
arr[20] += arr[19];
|
||||
arr[21] += arr[20];
|
||||
arr[22] += arr[21];
|
||||
arr[23] += arr[22];
|
||||
arr[24] += arr[23];
|
||||
arr[25] += arr[24];
|
||||
arr[26] += arr[25];
|
||||
arr[27] += arr[26];
|
||||
arr[28] += arr[27];
|
||||
arr[29] += arr[28];
|
||||
arr[30] += arr[29];
|
||||
arr[31] += arr[30];
|
||||
arr[32] += arr[31];
|
||||
arr[33] += arr[32];
|
||||
arr[34] += arr[33];
|
||||
arr[35] += arr[34];
|
||||
arr[36] += arr[35];
|
||||
arr[37] += arr[36];
|
||||
arr[38] += arr[37];
|
||||
arr[39] += arr[38];
|
||||
arr[40] += arr[39];
|
||||
arr[41] += arr[40];
|
||||
arr[42] += arr[41];
|
||||
arr[43] += arr[42];
|
||||
arr[44] += arr[43];
|
||||
arr[45] += arr[44];
|
||||
arr[46] += arr[45];
|
||||
arr[47] += arr[46];
|
||||
arr[48] += arr[47];
|
||||
arr[49] += arr[48];
|
||||
arr[50] += arr[49];
|
||||
arr[51] += arr[50];
|
||||
arr[52] += arr[51];
|
||||
arr[53] += arr[52];
|
||||
arr[54] += arr[53];
|
||||
arr[55] += arr[54];
|
||||
arr[56] += arr[55];
|
||||
arr[57] += arr[56];
|
||||
arr[58] += arr[57];
|
||||
arr[59] += arr[58];
|
||||
arr[60] += arr[59];
|
||||
arr[61] += arr[60];
|
||||
arr[62] += arr[61];
|
||||
arr[63] += arr[62];
|
||||
}
|
||||
|
||||
private final int[] tmp = new int[BLOCK_SIZE];

/**
* Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code
* ints} are expected to be deltas between consecutive values.
*/
void encodeDeltas(int[] ints, DataOutput out) throws IOException {
if (ints[0] == 1 && PForUtil.allEqual(ints)) { // happens with very dense postings
out.writeByte((byte) 0);
} else {
int or = 0;
for (int l : ints) {
or |= l;
}
assert or != 0;
final int bitsPerValue = PackedInts.bitsRequired(or);
out.writeByte((byte) bitsPerValue);

final int primitiveSize;
if (bitsPerValue <= 3) {
primitiveSize = 8;
collapse8(ints);
} else if (bitsPerValue <= 10) {
primitiveSize = 16;
collapse16(ints);
} else {
primitiveSize = 32;
}
encode(ints, bitsPerValue, primitiveSize, out, tmp);
}
}

/** Decode deltas, compute the prefix sum and add {@code base} to all decoded ints. */
void decodeAndPrefixSum(PostingDecodingUtil pdu, int base, int[] ints) throws IOException {
final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte());
if (bitsPerValue == 0) {
prefixSumOfOnes(ints, base);
} else {
decodeAndPrefixSum(bitsPerValue, pdu, base, ints);
}
}

/** Delta-decode 128 integers into {@code ints}. */
|
||||
void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, int base, int[] ints)
|
||||
throws IOException {
|
||||
switch (bitsPerValue) {
|
||||
case 1:
|
||||
decode1(pdu, ints);
|
||||
prefixSum8(ints, base);
|
||||
break;
|
||||
case 2:
|
||||
decode2(pdu, ints);
|
||||
prefixSum8(ints, base);
|
||||
break;
|
||||
case 3:
|
||||
decode3(pdu, tmp, ints);
|
||||
prefixSum8(ints, base);
|
||||
break;
|
||||
case 4:
|
||||
decode4To16(pdu, ints);
|
||||
prefixSum16(ints, base);
|
||||
break;
|
||||
case 5:
|
||||
decode5To16(pdu, tmp, ints);
|
||||
prefixSum16(ints, base);
|
||||
break;
|
||||
case 6:
|
||||
decode6To16(pdu, tmp, ints);
|
||||
prefixSum16(ints, base);
|
||||
break;
|
||||
case 7:
|
||||
decode7To16(pdu, tmp, ints);
|
||||
prefixSum16(ints, base);
|
||||
break;
|
||||
case 8:
|
||||
decode8To16(pdu, ints);
|
||||
prefixSum16(ints, base);
|
||||
break;
|
||||
case 9:
|
||||
decode9(pdu, tmp, ints);
|
||||
prefixSum16(ints, base);
|
||||
break;
|
||||
case 10:
|
||||
decode10(pdu, tmp, ints);
|
||||
prefixSum16(ints, base);
|
||||
break;
|
||||
case 11:
|
||||
decode11To32(pdu, tmp, ints);
|
||||
prefixSum32(ints, base);
|
||||
break;
|
||||
case 12:
|
||||
decode12To32(pdu, tmp, ints);
|
||||
prefixSum32(ints, base);
|
||||
break;
|
||||
case 13:
|
||||
decode13To32(pdu, tmp, ints);
|
||||
prefixSum32(ints, base);
|
||||
break;
|
||||
case 14:
|
||||
decode14To32(pdu, tmp, ints);
|
||||
prefixSum32(ints, base);
|
||||
break;
|
||||
case 15:
|
||||
decode15To32(pdu, tmp, ints);
|
||||
prefixSum32(ints, base);
|
||||
break;
|
||||
case 16:
|
||||
decode16To32(pdu, ints);
|
||||
prefixSum32(ints, base);
|
||||
break;
|
||||
case 17:
|
||||
decode17(pdu, tmp, ints);
|
||||
prefixSum32(ints, base);
|
||||
break;
|
||||
case 18:
|
||||
decode18(pdu, tmp, ints);
|
||||
prefixSum32(ints, base);
|
||||
break;
|
||||
case 19:
|
||||
decode19(pdu, tmp, ints);
|
||||
prefixSum32(ints, base);
|
||||
break;
|
||||
case 20:
|
||||
decode20(pdu, tmp, ints);
|
||||
prefixSum32(ints, base);
|
||||
break;
|
||||
case 21:
|
||||
decode21(pdu, tmp, ints);
|
||||
prefixSum32(ints, base);
|
||||
break;
|
||||
case 22:
|
||||
decode22(pdu, tmp, ints);
|
||||
prefixSum32(ints, base);
|
||||
break;
|
||||
case 23:
|
||||
decode23(pdu, tmp, ints);
|
||||
prefixSum32(ints, base);
|
||||
break;
|
||||
case 24:
|
||||
decode24(pdu, tmp, ints);
|
||||
prefixSum32(ints, base);
|
||||
break;
|
||||
default:
|
||||
decodeSlow(bitsPerValue, pdu, tmp, ints);
|
||||
prefixSum32(ints, base);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
private static void decode4To16(PostingDecodingUtil pdu, int[] ints) throws IOException {
|
||||
pdu.splitInts(16, ints, 12, 4, MASK16_4, ints, 48, MASK16_4);
|
||||
}
|
||||
|
||||
private static void decode5To16(PostingDecodingUtil pdu, int[] tmp, int[] ints)
|
||||
throws IOException {
|
||||
pdu.splitInts(20, ints, 11, 5, MASK16_5, tmp, 0, MASK16_1);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 60; iter < 4; ++iter, tmpIdx += 5, intsIdx += 1) {
|
||||
int l0 = tmp[tmpIdx + 0] << 4;
|
||||
l0 |= tmp[tmpIdx + 1] << 3;
|
||||
l0 |= tmp[tmpIdx + 2] << 2;
|
||||
l0 |= tmp[tmpIdx + 3] << 1;
|
||||
l0 |= tmp[tmpIdx + 4] << 0;
|
||||
ints[intsIdx + 0] = l0;
|
||||
}
|
||||
}
|
||||
|
||||
private static void decode6To16(PostingDecodingUtil pdu, int[] tmp, int[] ints)
|
||||
throws IOException {
|
||||
pdu.splitInts(24, ints, 10, 6, MASK16_6, tmp, 0, MASK16_4);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 48; iter < 8; ++iter, tmpIdx += 3, intsIdx += 2) {
|
||||
int l0 = tmp[tmpIdx + 0] << 2;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_2;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 1] & MASK16_2) << 4;
|
||||
l1 |= tmp[tmpIdx + 2] << 0;
|
||||
ints[intsIdx + 1] = l1;
|
||||
}
|
||||
}
|
||||
|
||||
private static void decode7To16(PostingDecodingUtil pdu, int[] tmp, int[] ints)
|
||||
throws IOException {
|
||||
pdu.splitInts(28, ints, 9, 7, MASK16_7, tmp, 0, MASK16_2);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 56; iter < 4; ++iter, tmpIdx += 7, intsIdx += 2) {
|
||||
int l0 = tmp[tmpIdx + 0] << 5;
|
||||
l0 |= tmp[tmpIdx + 1] << 3;
|
||||
l0 |= tmp[tmpIdx + 2] << 1;
|
||||
l0 |= (tmp[tmpIdx + 3] >>> 1) & MASK16_1;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 3] & MASK16_1) << 6;
|
||||
l1 |= tmp[tmpIdx + 4] << 4;
|
||||
l1 |= tmp[tmpIdx + 5] << 2;
|
||||
l1 |= tmp[tmpIdx + 6] << 0;
|
||||
ints[intsIdx + 1] = l1;
|
||||
}
|
||||
}
|
||||
|
||||
private static void decode8To16(PostingDecodingUtil pdu, int[] ints) throws IOException {
|
||||
pdu.splitInts(32, ints, 8, 8, MASK16_8, ints, 32, MASK16_8);
|
||||
}
|
||||
|
||||
private static void decode11To32(PostingDecodingUtil pdu, int[] tmp, int[] ints)
|
||||
throws IOException {
|
||||
pdu.splitInts(44, ints, 21, 11, MASK32_11, tmp, 0, MASK32_10);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 88; iter < 4; ++iter, tmpIdx += 11, intsIdx += 10) {
|
||||
int l0 = tmp[tmpIdx + 0] << 1;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 9) & MASK32_1;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 1] & MASK32_9) << 2;
|
||||
l1 |= (tmp[tmpIdx + 2] >>> 8) & MASK32_2;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 2] & MASK32_8) << 3;
|
||||
l2 |= (tmp[tmpIdx + 3] >>> 7) & MASK32_3;
|
||||
ints[intsIdx + 2] = l2;
|
||||
int l3 = (tmp[tmpIdx + 3] & MASK32_7) << 4;
|
||||
l3 |= (tmp[tmpIdx + 4] >>> 6) & MASK32_4;
|
||||
ints[intsIdx + 3] = l3;
|
||||
int l4 = (tmp[tmpIdx + 4] & MASK32_6) << 5;
|
||||
l4 |= (tmp[tmpIdx + 5] >>> 5) & MASK32_5;
|
||||
ints[intsIdx + 4] = l4;
|
||||
int l5 = (tmp[tmpIdx + 5] & MASK32_5) << 6;
|
||||
l5 |= (tmp[tmpIdx + 6] >>> 4) & MASK32_6;
|
||||
ints[intsIdx + 5] = l5;
|
||||
int l6 = (tmp[tmpIdx + 6] & MASK32_4) << 7;
|
||||
l6 |= (tmp[tmpIdx + 7] >>> 3) & MASK32_7;
|
||||
ints[intsIdx + 6] = l6;
|
||||
int l7 = (tmp[tmpIdx + 7] & MASK32_3) << 8;
|
||||
l7 |= (tmp[tmpIdx + 8] >>> 2) & MASK32_8;
|
||||
ints[intsIdx + 7] = l7;
|
||||
int l8 = (tmp[tmpIdx + 8] & MASK32_2) << 9;
|
||||
l8 |= (tmp[tmpIdx + 9] >>> 1) & MASK32_9;
|
||||
ints[intsIdx + 8] = l8;
|
||||
int l9 = (tmp[tmpIdx + 9] & MASK32_1) << 10;
|
||||
l9 |= tmp[tmpIdx + 10] << 0;
|
||||
ints[intsIdx + 9] = l9;
|
||||
}
|
||||
}
|
||||
|
||||
private static void decode12To32(PostingDecodingUtil pdu, int[] tmp, int[] ints)
|
||||
throws IOException {
|
||||
pdu.splitInts(48, ints, 20, 12, MASK32_12, tmp, 0, MASK32_8);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 96; iter < 16; ++iter, tmpIdx += 3, intsIdx += 2) {
|
||||
int l0 = tmp[tmpIdx + 0] << 4;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 4) & MASK32_4;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 1] & MASK32_4) << 8;
|
||||
l1 |= tmp[tmpIdx + 2] << 0;
|
||||
ints[intsIdx + 1] = l1;
|
||||
}
|
||||
}
|
||||
|
||||
private static void decode13To32(PostingDecodingUtil pdu, int[] tmp, int[] ints)
|
||||
throws IOException {
|
||||
pdu.splitInts(52, ints, 19, 13, MASK32_13, tmp, 0, MASK32_6);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 104; iter < 4; ++iter, tmpIdx += 13, intsIdx += 6) {
|
||||
int l0 = tmp[tmpIdx + 0] << 7;
|
||||
l0 |= tmp[tmpIdx + 1] << 1;
|
||||
l0 |= (tmp[tmpIdx + 2] >>> 5) & MASK32_1;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 2] & MASK32_5) << 8;
|
||||
l1 |= tmp[tmpIdx + 3] << 2;
|
||||
l1 |= (tmp[tmpIdx + 4] >>> 4) & MASK32_2;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 4] & MASK32_4) << 9;
|
||||
l2 |= tmp[tmpIdx + 5] << 3;
|
||||
l2 |= (tmp[tmpIdx + 6] >>> 3) & MASK32_3;
|
||||
ints[intsIdx + 2] = l2;
|
||||
int l3 = (tmp[tmpIdx + 6] & MASK32_3) << 10;
|
||||
l3 |= tmp[tmpIdx + 7] << 4;
|
||||
l3 |= (tmp[tmpIdx + 8] >>> 2) & MASK32_4;
|
||||
ints[intsIdx + 3] = l3;
|
||||
int l4 = (tmp[tmpIdx + 8] & MASK32_2) << 11;
|
||||
l4 |= tmp[tmpIdx + 9] << 5;
|
||||
l4 |= (tmp[tmpIdx + 10] >>> 1) & MASK32_5;
|
||||
ints[intsIdx + 4] = l4;
|
||||
int l5 = (tmp[tmpIdx + 10] & MASK32_1) << 12;
|
||||
l5 |= tmp[tmpIdx + 11] << 6;
|
||||
l5 |= tmp[tmpIdx + 12] << 0;
|
||||
ints[intsIdx + 5] = l5;
|
||||
}
|
||||
}
|
||||
|
||||
private static void decode14To32(PostingDecodingUtil pdu, int[] tmp, int[] ints)
|
||||
throws IOException {
|
||||
pdu.splitInts(56, ints, 18, 14, MASK32_14, tmp, 0, MASK32_4);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 112; iter < 8; ++iter, tmpIdx += 7, intsIdx += 2) {
|
||||
int l0 = tmp[tmpIdx + 0] << 10;
|
||||
l0 |= tmp[tmpIdx + 1] << 6;
|
||||
l0 |= tmp[tmpIdx + 2] << 2;
|
||||
l0 |= (tmp[tmpIdx + 3] >>> 2) & MASK32_2;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 3] & MASK32_2) << 12;
|
||||
l1 |= tmp[tmpIdx + 4] << 8;
|
||||
l1 |= tmp[tmpIdx + 5] << 4;
|
||||
l1 |= tmp[tmpIdx + 6] << 0;
|
||||
ints[intsIdx + 1] = l1;
|
||||
}
|
||||
}
|
||||
|
||||
private static void decode15To32(PostingDecodingUtil pdu, int[] tmp, int[] ints)
|
||||
throws IOException {
|
||||
pdu.splitInts(60, ints, 17, 15, MASK32_15, tmp, 0, MASK32_2);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 120; iter < 4; ++iter, tmpIdx += 15, intsIdx += 2) {
|
||||
int l0 = tmp[tmpIdx + 0] << 13;
|
||||
l0 |= tmp[tmpIdx + 1] << 11;
|
||||
l0 |= tmp[tmpIdx + 2] << 9;
|
||||
l0 |= tmp[tmpIdx + 3] << 7;
|
||||
l0 |= tmp[tmpIdx + 4] << 5;
|
||||
l0 |= tmp[tmpIdx + 5] << 3;
|
||||
l0 |= tmp[tmpIdx + 6] << 1;
|
||||
l0 |= (tmp[tmpIdx + 7] >>> 1) & MASK32_1;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 7] & MASK32_1) << 14;
|
||||
l1 |= tmp[tmpIdx + 8] << 12;
|
||||
l1 |= tmp[tmpIdx + 9] << 10;
|
||||
l1 |= tmp[tmpIdx + 10] << 8;
|
||||
l1 |= tmp[tmpIdx + 11] << 6;
|
||||
l1 |= tmp[tmpIdx + 12] << 4;
|
||||
l1 |= tmp[tmpIdx + 13] << 2;
|
||||
l1 |= tmp[tmpIdx + 14] << 0;
|
||||
ints[intsIdx + 1] = l1;
|
||||
}
|
||||
}
|
||||
|
||||
private static void decode16To32(PostingDecodingUtil pdu, int[] ints) throws IOException {
|
||||
pdu.splitInts(64, ints, 16, 16, MASK32_16, ints, 64, MASK32_16);
|
||||
}
|
||||
}
|
|
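ForDeltaUtil above decodes a block of packed deltas and then applies a prefix sum plus a base. As a scalar reference for what decodeAndPrefixSum computes (illustrative only; the generated code reaches the same result via packed 8-/16-bit lanes):

class PrefixSumReferenceSketch {
  static final int BLOCK_SIZE = 128;

  // Turns a block of deltas into absolute values: out[i] = base + deltas[0] + ... + deltas[i].
  static void prefixSum(int[] deltas, int base) {
    deltas[0] += base;
    for (int i = 1; i < BLOCK_SIZE; ++i) {
      deltas[i] += deltas[i - 1];
    }
  }

  public static void main(String[] args) {
    int[] deltas = new int[BLOCK_SIZE];
    java.util.Arrays.fill(deltas, 1); // the "very dense postings" case that the 0-bits path handles
    prefixSum(deltas, 41);
    System.out.println(deltas[0] + " .. " + deltas[BLOCK_SIZE - 1]); // 42 .. 169
  }
}
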
@ -0,0 +1,841 @@
// This file has been automatically generated, DO NOT EDIT

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene101;

import java.io.IOException;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.DataOutput;

/**
* Inspired from https://fulmicoton.com/posts/bitpacking/ Encodes multiple integers in one to get
* SIMD-like speedups. If bitsPerValue <= 8 then we pack 4 ints per Java int else if bitsPerValue
* <= 16 we pack 2 ints per Java int else we do scalar operations.
*/
public final class ForUtil {

public static final int BLOCK_SIZE = 128;
static final int BLOCK_SIZE_LOG2 = 7;

static int expandMask16(int mask16) {
return mask16 | (mask16 << 16);
}

static int expandMask8(int mask8) {
return expandMask16(mask8 | (mask8 << 8));
}

static int mask32(int bitsPerValue) {
return (1 << bitsPerValue) - 1;
}

static int mask16(int bitsPerValue) {
return expandMask16((1 << bitsPerValue) - 1);
}

static int mask8(int bitsPerValue) {
return expandMask8((1 << bitsPerValue) - 1);
}

static void expand8(int[] arr) {
for (int i = 0; i < 32; ++i) {
int l = arr[i];
arr[i] = (l >>> 24) & 0xFF;
arr[32 + i] = (l >>> 16) & 0xFF;
arr[64 + i] = (l >>> 8) & 0xFF;
arr[96 + i] = l & 0xFF;
}
}

static void collapse8(int[] arr) {
for (int i = 0; i < 32; ++i) {
arr[i] = (arr[i] << 24) | (arr[32 + i] << 16) | (arr[64 + i] << 8) | arr[96 + i];
}
}

static void expand16(int[] arr) {
for (int i = 0; i < 64; ++i) {
int l = arr[i];
arr[i] = (l >>> 16) & 0xFFFF;
arr[64 + i] = l & 0xFFFF;
}
}

static void collapse16(int[] arr) {
for (int i = 0; i < 64; ++i) {
arr[i] = (arr[i] << 16) | arr[64 + i];
}
}

private final int[] tmp = new int[BLOCK_SIZE];

/** Encode 128 integers from {@code ints} into {@code out}. */
void encode(int[] ints, int bitsPerValue, DataOutput out) throws IOException {
final int nextPrimitive;
if (bitsPerValue <= 8) {
nextPrimitive = 8;
collapse8(ints);
} else if (bitsPerValue <= 16) {
nextPrimitive = 16;
collapse16(ints);
} else {
nextPrimitive = 32;
}
encode(ints, bitsPerValue, nextPrimitive, out, tmp);
}

static void encode(int[] ints, int bitsPerValue, int primitiveSize, DataOutput out, int[] tmp)
|
||||
throws IOException {
|
||||
final int numInts = BLOCK_SIZE * primitiveSize / Integer.SIZE;
|
||||
|
||||
final int numIntsPerShift = bitsPerValue * 4;
|
||||
int idx = 0;
|
||||
int shift = primitiveSize - bitsPerValue;
|
||||
for (int i = 0; i < numIntsPerShift; ++i) {
|
||||
tmp[i] = ints[idx++] << shift;
|
||||
}
|
||||
for (shift = shift - bitsPerValue; shift >= 0; shift -= bitsPerValue) {
|
||||
for (int i = 0; i < numIntsPerShift; ++i) {
|
||||
tmp[i] |= ints[idx++] << shift;
|
||||
}
|
||||
}
|
||||
|
||||
final int remainingBitsPerInt = shift + bitsPerValue;
|
||||
final int maskRemainingBitsPerInt;
|
||||
if (primitiveSize == 8) {
|
||||
maskRemainingBitsPerInt = MASKS8[remainingBitsPerInt];
|
||||
} else if (primitiveSize == 16) {
|
||||
maskRemainingBitsPerInt = MASKS16[remainingBitsPerInt];
|
||||
} else {
|
||||
maskRemainingBitsPerInt = MASKS32[remainingBitsPerInt];
|
||||
}
|
||||
|
||||
int tmpIdx = 0;
|
||||
int remainingBitsPerValue = bitsPerValue;
|
||||
while (idx < numInts) {
|
||||
if (remainingBitsPerValue >= remainingBitsPerInt) {
|
||||
remainingBitsPerValue -= remainingBitsPerInt;
|
||||
tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & maskRemainingBitsPerInt;
|
||||
if (remainingBitsPerValue == 0) {
|
||||
idx++;
|
||||
remainingBitsPerValue = bitsPerValue;
|
||||
}
|
||||
} else {
|
||||
final int mask1, mask2;
|
||||
if (primitiveSize == 8) {
|
||||
mask1 = MASKS8[remainingBitsPerValue];
|
||||
mask2 = MASKS8[remainingBitsPerInt - remainingBitsPerValue];
|
||||
} else if (primitiveSize == 16) {
|
||||
mask1 = MASKS16[remainingBitsPerValue];
|
||||
mask2 = MASKS16[remainingBitsPerInt - remainingBitsPerValue];
|
||||
} else {
|
||||
mask1 = MASKS32[remainingBitsPerValue];
|
||||
mask2 = MASKS32[remainingBitsPerInt - remainingBitsPerValue];
|
||||
}
|
||||
tmp[tmpIdx] |= (ints[idx++] & mask1) << (remainingBitsPerInt - remainingBitsPerValue);
|
||||
remainingBitsPerValue = bitsPerValue - remainingBitsPerInt + remainingBitsPerValue;
|
||||
tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & mask2;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < numIntsPerShift; ++i) {
|
||||
out.writeInt(tmp[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/** Number of bytes required to encode 128 integers of {@code bitsPerValue} bits per value. */
|
||||
static int numBytes(int bitsPerValue) {
|
||||
return bitsPerValue << (BLOCK_SIZE_LOG2 - 3);
|
||||
}
|
||||
|
||||
static void decodeSlow(int bitsPerValue, PostingDecodingUtil pdu, int[] tmp, int[] ints)
|
||||
throws IOException {
|
||||
final int numInts = bitsPerValue << 2;
|
||||
final int mask = MASKS32[bitsPerValue];
|
||||
pdu.splitInts(numInts, ints, 32 - bitsPerValue, 32, mask, tmp, 0, -1);
|
||||
final int remainingBitsPerInt = 32 - bitsPerValue;
|
||||
final int mask32RemainingBitsPerInt = MASKS32[remainingBitsPerInt];
|
||||
int tmpIdx = 0;
|
||||
int remainingBits = remainingBitsPerInt;
|
||||
for (int intsIdx = numInts; intsIdx < BLOCK_SIZE; ++intsIdx) {
|
||||
int b = bitsPerValue - remainingBits;
|
||||
int l = (tmp[tmpIdx++] & MASKS32[remainingBits]) << b;
|
||||
while (b >= remainingBitsPerInt) {
|
||||
b -= remainingBitsPerInt;
|
||||
l |= (tmp[tmpIdx++] & mask32RemainingBitsPerInt) << b;
|
||||
}
|
||||
if (b > 0) {
|
||||
l |= (tmp[tmpIdx] >>> (remainingBitsPerInt - b)) & MASKS32[b];
|
||||
remainingBits = remainingBitsPerInt - b;
|
||||
} else {
|
||||
remainingBits = remainingBitsPerInt;
|
||||
}
|
||||
ints[intsIdx] = l;
|
||||
}
|
||||
}
|
||||
|
||||
static final int[] MASKS8 = new int[8];
|
||||
static final int[] MASKS16 = new int[16];
|
||||
static final int[] MASKS32 = new int[32];
|
||||
|
||||
static {
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
MASKS8[i] = mask8(i);
|
||||
}
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
MASKS16[i] = mask16(i);
|
||||
}
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
MASKS32[i] = mask32(i);
|
||||
}
|
||||
}
|
||||
|
||||
// mark values in array as final ints to avoid the cost of reading array, arrays should only be
|
||||
// used when the idx is a variable
|
||||
static final int MASK8_1 = MASKS8[1];
|
||||
static final int MASK8_2 = MASKS8[2];
|
||||
static final int MASK8_3 = MASKS8[3];
|
||||
static final int MASK8_4 = MASKS8[4];
|
||||
static final int MASK8_5 = MASKS8[5];
|
||||
static final int MASK8_6 = MASKS8[6];
|
||||
static final int MASK8_7 = MASKS8[7];
|
||||
static final int MASK16_1 = MASKS16[1];
|
||||
static final int MASK16_2 = MASKS16[2];
|
||||
static final int MASK16_3 = MASKS16[3];
|
||||
static final int MASK16_4 = MASKS16[4];
|
||||
static final int MASK16_5 = MASKS16[5];
|
||||
static final int MASK16_6 = MASKS16[6];
|
||||
static final int MASK16_7 = MASKS16[7];
|
||||
static final int MASK16_8 = MASKS16[8];
|
||||
static final int MASK16_9 = MASKS16[9];
|
||||
static final int MASK16_10 = MASKS16[10];
|
||||
static final int MASK16_11 = MASKS16[11];
|
||||
static final int MASK16_12 = MASKS16[12];
|
||||
static final int MASK16_13 = MASKS16[13];
|
||||
static final int MASK16_14 = MASKS16[14];
|
||||
static final int MASK16_15 = MASKS16[15];
|
||||
static final int MASK32_1 = MASKS32[1];
|
||||
static final int MASK32_2 = MASKS32[2];
|
||||
static final int MASK32_3 = MASKS32[3];
|
||||
static final int MASK32_4 = MASKS32[4];
|
||||
static final int MASK32_5 = MASKS32[5];
|
||||
static final int MASK32_6 = MASKS32[6];
|
||||
static final int MASK32_7 = MASKS32[7];
|
||||
static final int MASK32_8 = MASKS32[8];
|
||||
static final int MASK32_9 = MASKS32[9];
|
||||
static final int MASK32_10 = MASKS32[10];
|
||||
static final int MASK32_11 = MASKS32[11];
|
||||
static final int MASK32_12 = MASKS32[12];
|
||||
static final int MASK32_13 = MASKS32[13];
|
||||
static final int MASK32_14 = MASKS32[14];
|
||||
static final int MASK32_15 = MASKS32[15];
|
||||
static final int MASK32_16 = MASKS32[16];
|
||||
static final int MASK32_17 = MASKS32[17];
|
||||
static final int MASK32_18 = MASKS32[18];
|
||||
static final int MASK32_19 = MASKS32[19];
|
||||
static final int MASK32_20 = MASKS32[20];
|
||||
static final int MASK32_21 = MASKS32[21];
|
||||
static final int MASK32_22 = MASKS32[22];
|
||||
static final int MASK32_23 = MASKS32[23];
|
||||
static final int MASK32_24 = MASKS32[24];
|
||||
|
||||
/** Decode 128 integers into {@code ints}. */
|
||||
void decode(int bitsPerValue, PostingDecodingUtil pdu, int[] ints) throws IOException {
|
||||
switch (bitsPerValue) {
|
||||
case 1:
|
||||
decode1(pdu, ints);
|
||||
expand8(ints);
|
||||
break;
|
||||
case 2:
|
||||
decode2(pdu, ints);
|
||||
expand8(ints);
|
||||
break;
|
||||
case 3:
|
||||
decode3(pdu, tmp, ints);
|
||||
expand8(ints);
|
||||
break;
|
||||
case 4:
|
||||
decode4(pdu, ints);
|
||||
expand8(ints);
|
||||
break;
|
||||
case 5:
|
||||
decode5(pdu, tmp, ints);
|
||||
expand8(ints);
|
||||
break;
|
||||
case 6:
|
||||
decode6(pdu, tmp, ints);
|
||||
expand8(ints);
|
||||
break;
|
||||
case 7:
|
||||
decode7(pdu, tmp, ints);
|
||||
expand8(ints);
|
||||
break;
|
||||
case 8:
|
||||
decode8(pdu, ints);
|
||||
expand8(ints);
|
||||
break;
|
||||
case 9:
|
||||
decode9(pdu, tmp, ints);
|
||||
expand16(ints);
|
||||
break;
|
||||
case 10:
|
||||
decode10(pdu, tmp, ints);
|
||||
expand16(ints);
|
||||
break;
|
||||
case 11:
|
||||
decode11(pdu, tmp, ints);
|
||||
expand16(ints);
|
||||
break;
|
||||
case 12:
|
||||
decode12(pdu, tmp, ints);
|
||||
expand16(ints);
|
||||
break;
|
||||
case 13:
|
||||
decode13(pdu, tmp, ints);
|
||||
expand16(ints);
|
||||
break;
|
||||
case 14:
|
||||
decode14(pdu, tmp, ints);
|
||||
expand16(ints);
|
||||
break;
|
||||
case 15:
|
||||
decode15(pdu, tmp, ints);
|
||||
expand16(ints);
|
||||
break;
|
||||
case 16:
|
||||
decode16(pdu, ints);
|
||||
expand16(ints);
|
||||
break;
|
||||
case 17:
|
||||
decode17(pdu, tmp, ints);
|
||||
break;
|
||||
case 18:
|
||||
decode18(pdu, tmp, ints);
|
||||
break;
|
||||
case 19:
|
||||
decode19(pdu, tmp, ints);
|
||||
break;
|
||||
case 20:
|
||||
decode20(pdu, tmp, ints);
|
||||
break;
|
||||
case 21:
|
||||
decode21(pdu, tmp, ints);
|
||||
break;
|
||||
case 22:
|
||||
decode22(pdu, tmp, ints);
|
||||
break;
|
||||
case 23:
|
||||
decode23(pdu, tmp, ints);
|
||||
break;
|
||||
case 24:
|
||||
decode24(pdu, tmp, ints);
|
||||
break;
|
||||
default:
|
||||
decodeSlow(bitsPerValue, pdu, tmp, ints);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode1(PostingDecodingUtil pdu, int[] ints) throws IOException {
|
||||
pdu.splitInts(4, ints, 7, 1, MASK8_1, ints, 28, MASK8_1);
|
||||
}
|
||||
|
||||
static void decode2(PostingDecodingUtil pdu, int[] ints) throws IOException {
|
||||
pdu.splitInts(8, ints, 6, 2, MASK8_2, ints, 24, MASK8_2);
|
||||
}
|
||||
|
||||
static void decode3(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(12, ints, 5, 3, MASK8_3, tmp, 0, MASK8_2);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 24; iter < 4; ++iter, tmpIdx += 3, intsIdx += 2) {
|
||||
int l0 = tmp[tmpIdx + 0] << 1;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_1;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 1] & MASK8_1) << 2;
|
||||
l1 |= tmp[tmpIdx + 2] << 0;
|
||||
ints[intsIdx + 1] = l1;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode4(PostingDecodingUtil pdu, int[] ints) throws IOException {
|
||||
pdu.splitInts(16, ints, 4, 4, MASK8_4, ints, 16, MASK8_4);
|
||||
}
|
||||
|
||||
static void decode5(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(20, ints, 3, 5, MASK8_5, tmp, 0, MASK8_3);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 20; iter < 4; ++iter, tmpIdx += 5, intsIdx += 3) {
|
||||
int l0 = tmp[tmpIdx + 0] << 2;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_2;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 1] & MASK8_1) << 4;
|
||||
l1 |= tmp[tmpIdx + 2] << 1;
|
||||
l1 |= (tmp[tmpIdx + 3] >>> 2) & MASK8_1;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 3] & MASK8_2) << 3;
|
||||
l2 |= tmp[tmpIdx + 4] << 0;
|
||||
ints[intsIdx + 2] = l2;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode6(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(24, ints, 2, 6, MASK8_6, tmp, 0, MASK8_2);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 24; iter < 8; ++iter, tmpIdx += 3, intsIdx += 1) {
|
||||
int l0 = tmp[tmpIdx + 0] << 4;
|
||||
l0 |= tmp[tmpIdx + 1] << 2;
|
||||
l0 |= tmp[tmpIdx + 2] << 0;
|
||||
ints[intsIdx + 0] = l0;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode7(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(28, ints, 1, 7, MASK8_7, tmp, 0, MASK8_1);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 28; iter < 4; ++iter, tmpIdx += 7, intsIdx += 1) {
|
||||
int l0 = tmp[tmpIdx + 0] << 6;
|
||||
l0 |= tmp[tmpIdx + 1] << 5;
|
||||
l0 |= tmp[tmpIdx + 2] << 4;
|
||||
l0 |= tmp[tmpIdx + 3] << 3;
|
||||
l0 |= tmp[tmpIdx + 4] << 2;
|
||||
l0 |= tmp[tmpIdx + 5] << 1;
|
||||
l0 |= tmp[tmpIdx + 6] << 0;
|
||||
ints[intsIdx + 0] = l0;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode8(PostingDecodingUtil pdu, int[] ints) throws IOException {
|
||||
pdu.in.readInts(ints, 0, 32);
|
||||
}
|
||||
|
||||
static void decode9(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(36, ints, 7, 9, MASK16_9, tmp, 0, MASK16_7);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 36; iter < 4; ++iter, tmpIdx += 9, intsIdx += 7) {
|
||||
int l0 = tmp[tmpIdx + 0] << 2;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 5) & MASK16_2;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 1] & MASK16_5) << 4;
|
||||
l1 |= (tmp[tmpIdx + 2] >>> 3) & MASK16_4;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 2] & MASK16_3) << 6;
|
||||
l2 |= (tmp[tmpIdx + 3] >>> 1) & MASK16_6;
|
||||
ints[intsIdx + 2] = l2;
|
||||
int l3 = (tmp[tmpIdx + 3] & MASK16_1) << 8;
|
||||
l3 |= tmp[tmpIdx + 4] << 1;
|
||||
l3 |= (tmp[tmpIdx + 5] >>> 6) & MASK16_1;
|
||||
ints[intsIdx + 3] = l3;
|
||||
int l4 = (tmp[tmpIdx + 5] & MASK16_6) << 3;
|
||||
l4 |= (tmp[tmpIdx + 6] >>> 4) & MASK16_3;
|
||||
ints[intsIdx + 4] = l4;
|
||||
int l5 = (tmp[tmpIdx + 6] & MASK16_4) << 5;
|
||||
l5 |= (tmp[tmpIdx + 7] >>> 2) & MASK16_5;
|
||||
ints[intsIdx + 5] = l5;
|
||||
int l6 = (tmp[tmpIdx + 7] & MASK16_2) << 7;
|
||||
l6 |= tmp[tmpIdx + 8] << 0;
|
||||
ints[intsIdx + 6] = l6;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode10(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(40, ints, 6, 10, MASK16_10, tmp, 0, MASK16_6);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 40; iter < 8; ++iter, tmpIdx += 5, intsIdx += 3) {
|
||||
int l0 = tmp[tmpIdx + 0] << 4;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_4;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 1] & MASK16_2) << 8;
|
||||
l1 |= tmp[tmpIdx + 2] << 2;
|
||||
l1 |= (tmp[tmpIdx + 3] >>> 4) & MASK16_2;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 3] & MASK16_4) << 6;
|
||||
l2 |= tmp[tmpIdx + 4] << 0;
|
||||
ints[intsIdx + 2] = l2;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode11(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(44, ints, 5, 11, MASK16_11, tmp, 0, MASK16_5);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 44; iter < 4; ++iter, tmpIdx += 11, intsIdx += 5) {
|
||||
int l0 = tmp[tmpIdx + 0] << 6;
|
||||
l0 |= tmp[tmpIdx + 1] << 1;
|
||||
l0 |= (tmp[tmpIdx + 2] >>> 4) & MASK16_1;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 2] & MASK16_4) << 7;
|
||||
l1 |= tmp[tmpIdx + 3] << 2;
|
||||
l1 |= (tmp[tmpIdx + 4] >>> 3) & MASK16_2;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 4] & MASK16_3) << 8;
|
||||
l2 |= tmp[tmpIdx + 5] << 3;
|
||||
l2 |= (tmp[tmpIdx + 6] >>> 2) & MASK16_3;
|
||||
ints[intsIdx + 2] = l2;
|
||||
int l3 = (tmp[tmpIdx + 6] & MASK16_2) << 9;
|
||||
l3 |= tmp[tmpIdx + 7] << 4;
|
||||
l3 |= (tmp[tmpIdx + 8] >>> 1) & MASK16_4;
|
||||
ints[intsIdx + 3] = l3;
|
||||
int l4 = (tmp[tmpIdx + 8] & MASK16_1) << 10;
|
||||
l4 |= tmp[tmpIdx + 9] << 5;
|
||||
l4 |= tmp[tmpIdx + 10] << 0;
|
||||
ints[intsIdx + 4] = l4;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode12(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(48, ints, 4, 12, MASK16_12, tmp, 0, MASK16_4);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 48; iter < 16; ++iter, tmpIdx += 3, intsIdx += 1) {
|
||||
int l0 = tmp[tmpIdx + 0] << 8;
|
||||
l0 |= tmp[tmpIdx + 1] << 4;
|
||||
l0 |= tmp[tmpIdx + 2] << 0;
|
||||
ints[intsIdx + 0] = l0;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode13(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(52, ints, 3, 13, MASK16_13, tmp, 0, MASK16_3);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 52; iter < 4; ++iter, tmpIdx += 13, intsIdx += 3) {
|
||||
int l0 = tmp[tmpIdx + 0] << 10;
|
||||
l0 |= tmp[tmpIdx + 1] << 7;
|
||||
l0 |= tmp[tmpIdx + 2] << 4;
|
||||
l0 |= tmp[tmpIdx + 3] << 1;
|
||||
l0 |= (tmp[tmpIdx + 4] >>> 2) & MASK16_1;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 4] & MASK16_2) << 11;
|
||||
l1 |= tmp[tmpIdx + 5] << 8;
|
||||
l1 |= tmp[tmpIdx + 6] << 5;
|
||||
l1 |= tmp[tmpIdx + 7] << 2;
|
||||
l1 |= (tmp[tmpIdx + 8] >>> 1) & MASK16_2;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 8] & MASK16_1) << 12;
|
||||
l2 |= tmp[tmpIdx + 9] << 9;
|
||||
l2 |= tmp[tmpIdx + 10] << 6;
|
||||
l2 |= tmp[tmpIdx + 11] << 3;
|
||||
l2 |= tmp[tmpIdx + 12] << 0;
|
||||
ints[intsIdx + 2] = l2;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode14(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(56, ints, 2, 14, MASK16_14, tmp, 0, MASK16_2);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 56; iter < 8; ++iter, tmpIdx += 7, intsIdx += 1) {
|
||||
int l0 = tmp[tmpIdx + 0] << 12;
|
||||
l0 |= tmp[tmpIdx + 1] << 10;
|
||||
l0 |= tmp[tmpIdx + 2] << 8;
|
||||
l0 |= tmp[tmpIdx + 3] << 6;
|
||||
l0 |= tmp[tmpIdx + 4] << 4;
|
||||
l0 |= tmp[tmpIdx + 5] << 2;
|
||||
l0 |= tmp[tmpIdx + 6] << 0;
|
||||
ints[intsIdx + 0] = l0;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode15(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(60, ints, 1, 15, MASK16_15, tmp, 0, MASK16_1);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 60; iter < 4; ++iter, tmpIdx += 15, intsIdx += 1) {
|
||||
int l0 = tmp[tmpIdx + 0] << 14;
|
||||
l0 |= tmp[tmpIdx + 1] << 13;
|
||||
l0 |= tmp[tmpIdx + 2] << 12;
|
||||
l0 |= tmp[tmpIdx + 3] << 11;
|
||||
l0 |= tmp[tmpIdx + 4] << 10;
|
||||
l0 |= tmp[tmpIdx + 5] << 9;
|
||||
l0 |= tmp[tmpIdx + 6] << 8;
|
||||
l0 |= tmp[tmpIdx + 7] << 7;
|
||||
l0 |= tmp[tmpIdx + 8] << 6;
|
||||
l0 |= tmp[tmpIdx + 9] << 5;
|
||||
l0 |= tmp[tmpIdx + 10] << 4;
|
||||
l0 |= tmp[tmpIdx + 11] << 3;
|
||||
l0 |= tmp[tmpIdx + 12] << 2;
|
||||
l0 |= tmp[tmpIdx + 13] << 1;
|
||||
l0 |= tmp[tmpIdx + 14] << 0;
|
||||
ints[intsIdx + 0] = l0;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode16(PostingDecodingUtil pdu, int[] ints) throws IOException {
|
||||
pdu.in.readInts(ints, 0, 64);
|
||||
}
|
||||
|
||||
static void decode17(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(68, ints, 15, 17, MASK32_17, tmp, 0, MASK32_15);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 68; iter < 4; ++iter, tmpIdx += 17, intsIdx += 15) {
|
||||
int l0 = tmp[tmpIdx + 0] << 2;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 13) & MASK32_2;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 1] & MASK32_13) << 4;
|
||||
l1 |= (tmp[tmpIdx + 2] >>> 11) & MASK32_4;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 2] & MASK32_11) << 6;
|
||||
l2 |= (tmp[tmpIdx + 3] >>> 9) & MASK32_6;
|
||||
ints[intsIdx + 2] = l2;
|
||||
int l3 = (tmp[tmpIdx + 3] & MASK32_9) << 8;
|
||||
l3 |= (tmp[tmpIdx + 4] >>> 7) & MASK32_8;
|
||||
ints[intsIdx + 3] = l3;
|
||||
int l4 = (tmp[tmpIdx + 4] & MASK32_7) << 10;
|
||||
l4 |= (tmp[tmpIdx + 5] >>> 5) & MASK32_10;
|
||||
ints[intsIdx + 4] = l4;
|
||||
int l5 = (tmp[tmpIdx + 5] & MASK32_5) << 12;
|
||||
l5 |= (tmp[tmpIdx + 6] >>> 3) & MASK32_12;
|
||||
ints[intsIdx + 5] = l5;
|
||||
int l6 = (tmp[tmpIdx + 6] & MASK32_3) << 14;
|
||||
l6 |= (tmp[tmpIdx + 7] >>> 1) & MASK32_14;
|
||||
ints[intsIdx + 6] = l6;
|
||||
int l7 = (tmp[tmpIdx + 7] & MASK32_1) << 16;
|
||||
l7 |= tmp[tmpIdx + 8] << 1;
|
||||
l7 |= (tmp[tmpIdx + 9] >>> 14) & MASK32_1;
|
||||
ints[intsIdx + 7] = l7;
|
||||
int l8 = (tmp[tmpIdx + 9] & MASK32_14) << 3;
|
||||
l8 |= (tmp[tmpIdx + 10] >>> 12) & MASK32_3;
|
||||
ints[intsIdx + 8] = l8;
|
||||
int l9 = (tmp[tmpIdx + 10] & MASK32_12) << 5;
|
||||
l9 |= (tmp[tmpIdx + 11] >>> 10) & MASK32_5;
|
||||
ints[intsIdx + 9] = l9;
|
||||
int l10 = (tmp[tmpIdx + 11] & MASK32_10) << 7;
|
||||
l10 |= (tmp[tmpIdx + 12] >>> 8) & MASK32_7;
|
||||
ints[intsIdx + 10] = l10;
|
||||
int l11 = (tmp[tmpIdx + 12] & MASK32_8) << 9;
|
||||
l11 |= (tmp[tmpIdx + 13] >>> 6) & MASK32_9;
|
||||
ints[intsIdx + 11] = l11;
|
||||
int l12 = (tmp[tmpIdx + 13] & MASK32_6) << 11;
|
||||
l12 |= (tmp[tmpIdx + 14] >>> 4) & MASK32_11;
|
||||
ints[intsIdx + 12] = l12;
|
||||
int l13 = (tmp[tmpIdx + 14] & MASK32_4) << 13;
|
||||
l13 |= (tmp[tmpIdx + 15] >>> 2) & MASK32_13;
|
||||
ints[intsIdx + 13] = l13;
|
||||
int l14 = (tmp[tmpIdx + 15] & MASK32_2) << 15;
|
||||
l14 |= tmp[tmpIdx + 16] << 0;
|
||||
ints[intsIdx + 14] = l14;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode18(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(72, ints, 14, 18, MASK32_18, tmp, 0, MASK32_14);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 72; iter < 8; ++iter, tmpIdx += 9, intsIdx += 7) {
|
||||
int l0 = tmp[tmpIdx + 0] << 4;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 10) & MASK32_4;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 1] & MASK32_10) << 8;
|
||||
l1 |= (tmp[tmpIdx + 2] >>> 6) & MASK32_8;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 2] & MASK32_6) << 12;
|
||||
l2 |= (tmp[tmpIdx + 3] >>> 2) & MASK32_12;
|
||||
ints[intsIdx + 2] = l2;
|
||||
int l3 = (tmp[tmpIdx + 3] & MASK32_2) << 16;
|
||||
l3 |= tmp[tmpIdx + 4] << 2;
|
||||
l3 |= (tmp[tmpIdx + 5] >>> 12) & MASK32_2;
|
||||
ints[intsIdx + 3] = l3;
|
||||
int l4 = (tmp[tmpIdx + 5] & MASK32_12) << 6;
|
||||
l4 |= (tmp[tmpIdx + 6] >>> 8) & MASK32_6;
|
||||
ints[intsIdx + 4] = l4;
|
||||
int l5 = (tmp[tmpIdx + 6] & MASK32_8) << 10;
|
||||
l5 |= (tmp[tmpIdx + 7] >>> 4) & MASK32_10;
|
||||
ints[intsIdx + 5] = l5;
|
||||
int l6 = (tmp[tmpIdx + 7] & MASK32_4) << 14;
|
||||
l6 |= tmp[tmpIdx + 8] << 0;
|
||||
ints[intsIdx + 6] = l6;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode19(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(76, ints, 13, 19, MASK32_19, tmp, 0, MASK32_13);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 76; iter < 4; ++iter, tmpIdx += 19, intsIdx += 13) {
|
||||
int l0 = tmp[tmpIdx + 0] << 6;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 7) & MASK32_6;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 1] & MASK32_7) << 12;
|
||||
l1 |= (tmp[tmpIdx + 2] >>> 1) & MASK32_12;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 2] & MASK32_1) << 18;
|
||||
l2 |= tmp[tmpIdx + 3] << 5;
|
||||
l2 |= (tmp[tmpIdx + 4] >>> 8) & MASK32_5;
|
||||
ints[intsIdx + 2] = l2;
|
||||
int l3 = (tmp[tmpIdx + 4] & MASK32_8) << 11;
|
||||
l3 |= (tmp[tmpIdx + 5] >>> 2) & MASK32_11;
|
||||
ints[intsIdx + 3] = l3;
|
||||
int l4 = (tmp[tmpIdx + 5] & MASK32_2) << 17;
|
||||
l4 |= tmp[tmpIdx + 6] << 4;
|
||||
l4 |= (tmp[tmpIdx + 7] >>> 9) & MASK32_4;
|
||||
ints[intsIdx + 4] = l4;
|
||||
int l5 = (tmp[tmpIdx + 7] & MASK32_9) << 10;
|
||||
l5 |= (tmp[tmpIdx + 8] >>> 3) & MASK32_10;
|
||||
ints[intsIdx + 5] = l5;
|
||||
int l6 = (tmp[tmpIdx + 8] & MASK32_3) << 16;
|
||||
l6 |= tmp[tmpIdx + 9] << 3;
|
||||
l6 |= (tmp[tmpIdx + 10] >>> 10) & MASK32_3;
|
||||
ints[intsIdx + 6] = l6;
|
||||
int l7 = (tmp[tmpIdx + 10] & MASK32_10) << 9;
|
||||
l7 |= (tmp[tmpIdx + 11] >>> 4) & MASK32_9;
|
||||
ints[intsIdx + 7] = l7;
|
||||
int l8 = (tmp[tmpIdx + 11] & MASK32_4) << 15;
|
||||
l8 |= tmp[tmpIdx + 12] << 2;
|
||||
l8 |= (tmp[tmpIdx + 13] >>> 11) & MASK32_2;
|
||||
ints[intsIdx + 8] = l8;
|
||||
int l9 = (tmp[tmpIdx + 13] & MASK32_11) << 8;
|
||||
l9 |= (tmp[tmpIdx + 14] >>> 5) & MASK32_8;
|
||||
ints[intsIdx + 9] = l9;
|
||||
int l10 = (tmp[tmpIdx + 14] & MASK32_5) << 14;
|
||||
l10 |= tmp[tmpIdx + 15] << 1;
|
||||
l10 |= (tmp[tmpIdx + 16] >>> 12) & MASK32_1;
|
||||
ints[intsIdx + 10] = l10;
|
||||
int l11 = (tmp[tmpIdx + 16] & MASK32_12) << 7;
|
||||
l11 |= (tmp[tmpIdx + 17] >>> 6) & MASK32_7;
|
||||
ints[intsIdx + 11] = l11;
|
||||
int l12 = (tmp[tmpIdx + 17] & MASK32_6) << 13;
|
||||
l12 |= tmp[tmpIdx + 18] << 0;
|
||||
ints[intsIdx + 12] = l12;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode20(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(80, ints, 12, 20, MASK32_20, tmp, 0, MASK32_12);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 80; iter < 16; ++iter, tmpIdx += 5, intsIdx += 3) {
|
||||
int l0 = tmp[tmpIdx + 0] << 8;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 4) & MASK32_8;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 1] & MASK32_4) << 16;
|
||||
l1 |= tmp[tmpIdx + 2] << 4;
|
||||
l1 |= (tmp[tmpIdx + 3] >>> 8) & MASK32_4;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 3] & MASK32_8) << 12;
|
||||
l2 |= tmp[tmpIdx + 4] << 0;
|
||||
ints[intsIdx + 2] = l2;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode21(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(84, ints, 11, 21, MASK32_21, tmp, 0, MASK32_11);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 84; iter < 4; ++iter, tmpIdx += 21, intsIdx += 11) {
|
||||
int l0 = tmp[tmpIdx + 0] << 10;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK32_10;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 1] & MASK32_1) << 20;
|
||||
l1 |= tmp[tmpIdx + 2] << 9;
|
||||
l1 |= (tmp[tmpIdx + 3] >>> 2) & MASK32_9;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 3] & MASK32_2) << 19;
|
||||
l2 |= tmp[tmpIdx + 4] << 8;
|
||||
l2 |= (tmp[tmpIdx + 5] >>> 3) & MASK32_8;
|
||||
ints[intsIdx + 2] = l2;
|
||||
int l3 = (tmp[tmpIdx + 5] & MASK32_3) << 18;
|
||||
l3 |= tmp[tmpIdx + 6] << 7;
|
||||
l3 |= (tmp[tmpIdx + 7] >>> 4) & MASK32_7;
|
||||
ints[intsIdx + 3] = l3;
|
||||
int l4 = (tmp[tmpIdx + 7] & MASK32_4) << 17;
|
||||
l4 |= tmp[tmpIdx + 8] << 6;
|
||||
l4 |= (tmp[tmpIdx + 9] >>> 5) & MASK32_6;
|
||||
ints[intsIdx + 4] = l4;
|
||||
int l5 = (tmp[tmpIdx + 9] & MASK32_5) << 16;
|
||||
l5 |= tmp[tmpIdx + 10] << 5;
|
||||
l5 |= (tmp[tmpIdx + 11] >>> 6) & MASK32_5;
|
||||
ints[intsIdx + 5] = l5;
|
||||
int l6 = (tmp[tmpIdx + 11] & MASK32_6) << 15;
|
||||
l6 |= tmp[tmpIdx + 12] << 4;
|
||||
l6 |= (tmp[tmpIdx + 13] >>> 7) & MASK32_4;
|
||||
ints[intsIdx + 6] = l6;
|
||||
int l7 = (tmp[tmpIdx + 13] & MASK32_7) << 14;
|
||||
l7 |= tmp[tmpIdx + 14] << 3;
|
||||
l7 |= (tmp[tmpIdx + 15] >>> 8) & MASK32_3;
|
||||
ints[intsIdx + 7] = l7;
|
||||
int l8 = (tmp[tmpIdx + 15] & MASK32_8) << 13;
|
||||
l8 |= tmp[tmpIdx + 16] << 2;
|
||||
l8 |= (tmp[tmpIdx + 17] >>> 9) & MASK32_2;
|
||||
ints[intsIdx + 8] = l8;
|
||||
int l9 = (tmp[tmpIdx + 17] & MASK32_9) << 12;
|
||||
l9 |= tmp[tmpIdx + 18] << 1;
|
||||
l9 |= (tmp[tmpIdx + 19] >>> 10) & MASK32_1;
|
||||
ints[intsIdx + 9] = l9;
|
||||
int l10 = (tmp[tmpIdx + 19] & MASK32_10) << 11;
|
||||
l10 |= tmp[tmpIdx + 20] << 0;
|
||||
ints[intsIdx + 10] = l10;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode22(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(88, ints, 10, 22, MASK32_22, tmp, 0, MASK32_10);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 88; iter < 8; ++iter, tmpIdx += 11, intsIdx += 5) {
|
||||
int l0 = tmp[tmpIdx + 0] << 12;
|
||||
l0 |= tmp[tmpIdx + 1] << 2;
|
||||
l0 |= (tmp[tmpIdx + 2] >>> 8) & MASK32_2;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 2] & MASK32_8) << 14;
|
||||
l1 |= tmp[tmpIdx + 3] << 4;
|
||||
l1 |= (tmp[tmpIdx + 4] >>> 6) & MASK32_4;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 4] & MASK32_6) << 16;
|
||||
l2 |= tmp[tmpIdx + 5] << 6;
|
||||
l2 |= (tmp[tmpIdx + 6] >>> 4) & MASK32_6;
|
||||
ints[intsIdx + 2] = l2;
|
||||
int l3 = (tmp[tmpIdx + 6] & MASK32_4) << 18;
|
||||
l3 |= tmp[tmpIdx + 7] << 8;
|
||||
l3 |= (tmp[tmpIdx + 8] >>> 2) & MASK32_8;
|
||||
ints[intsIdx + 3] = l3;
|
||||
int l4 = (tmp[tmpIdx + 8] & MASK32_2) << 20;
|
||||
l4 |= tmp[tmpIdx + 9] << 10;
|
||||
l4 |= tmp[tmpIdx + 10] << 0;
|
||||
ints[intsIdx + 4] = l4;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode23(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(92, ints, 9, 23, MASK32_23, tmp, 0, MASK32_9);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 92; iter < 4; ++iter, tmpIdx += 23, intsIdx += 9) {
|
||||
int l0 = tmp[tmpIdx + 0] << 14;
|
||||
l0 |= tmp[tmpIdx + 1] << 5;
|
||||
l0 |= (tmp[tmpIdx + 2] >>> 4) & MASK32_5;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 2] & MASK32_4) << 19;
|
||||
l1 |= tmp[tmpIdx + 3] << 10;
|
||||
l1 |= tmp[tmpIdx + 4] << 1;
|
||||
l1 |= (tmp[tmpIdx + 5] >>> 8) & MASK32_1;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 5] & MASK32_8) << 15;
|
||||
l2 |= tmp[tmpIdx + 6] << 6;
|
||||
l2 |= (tmp[tmpIdx + 7] >>> 3) & MASK32_6;
|
||||
ints[intsIdx + 2] = l2;
|
||||
int l3 = (tmp[tmpIdx + 7] & MASK32_3) << 20;
|
||||
l3 |= tmp[tmpIdx + 8] << 11;
|
||||
l3 |= tmp[tmpIdx + 9] << 2;
|
||||
l3 |= (tmp[tmpIdx + 10] >>> 7) & MASK32_2;
|
||||
ints[intsIdx + 3] = l3;
|
||||
int l4 = (tmp[tmpIdx + 10] & MASK32_7) << 16;
|
||||
l4 |= tmp[tmpIdx + 11] << 7;
|
||||
l4 |= (tmp[tmpIdx + 12] >>> 2) & MASK32_7;
|
||||
ints[intsIdx + 4] = l4;
|
||||
int l5 = (tmp[tmpIdx + 12] & MASK32_2) << 21;
|
||||
l5 |= tmp[tmpIdx + 13] << 12;
|
||||
l5 |= tmp[tmpIdx + 14] << 3;
|
||||
l5 |= (tmp[tmpIdx + 15] >>> 6) & MASK32_3;
|
||||
ints[intsIdx + 5] = l5;
|
||||
int l6 = (tmp[tmpIdx + 15] & MASK32_6) << 17;
|
||||
l6 |= tmp[tmpIdx + 16] << 8;
|
||||
l6 |= (tmp[tmpIdx + 17] >>> 1) & MASK32_8;
|
||||
ints[intsIdx + 6] = l6;
|
||||
int l7 = (tmp[tmpIdx + 17] & MASK32_1) << 22;
|
||||
l7 |= tmp[tmpIdx + 18] << 13;
|
||||
l7 |= tmp[tmpIdx + 19] << 4;
|
||||
l7 |= (tmp[tmpIdx + 20] >>> 5) & MASK32_4;
|
||||
ints[intsIdx + 7] = l7;
|
||||
int l8 = (tmp[tmpIdx + 20] & MASK32_5) << 18;
|
||||
l8 |= tmp[tmpIdx + 21] << 9;
|
||||
l8 |= tmp[tmpIdx + 22] << 0;
|
||||
ints[intsIdx + 8] = l8;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode24(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(96, ints, 8, 24, MASK32_24, tmp, 0, MASK32_8);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 96; iter < 32; ++iter, tmpIdx += 3, intsIdx += 1) {
|
||||
int l0 = tmp[tmpIdx + 0] << 16;
|
||||
l0 |= tmp[tmpIdx + 1] << 8;
|
||||
l0 |= tmp[tmpIdx + 2] << 0;
|
||||
ints[intsIdx + 0] = l0;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,217 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene101;
|
||||
|
||||
import java.util.Objects;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.CompoundFormat;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.LiveDocsFormat;
|
||||
import org.apache.lucene.codecs.NormsFormat;
|
||||
import org.apache.lucene.codecs.PointsFormat;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
||||
|
||||
/**
|
||||
* Implements the Lucene 10.1 index format
|
||||
*
|
||||
* <p>If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}.
|
||||
*
|
||||
* @see org.apache.lucene.codecs.lucene101 package documentation for file format details.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class Lucene101Codec extends Codec {
|
||||
|
||||
/** Configuration option for the codec. */
|
||||
public enum Mode {
|
||||
/** Trade compression ratio for retrieval speed. */
|
||||
BEST_SPEED(Lucene90StoredFieldsFormat.Mode.BEST_SPEED),
|
||||
/** Trade retrieval speed for compression ratio. */
|
||||
BEST_COMPRESSION(Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION);
|
||||
|
||||
private final Lucene90StoredFieldsFormat.Mode storedMode;
|
||||
|
||||
private Mode(Lucene90StoredFieldsFormat.Mode storedMode) {
|
||||
this.storedMode = Objects.requireNonNull(storedMode);
|
||||
}
|
||||
}
|
||||
|
||||
private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat();
|
||||
private final FieldInfosFormat fieldInfosFormat = new Lucene94FieldInfosFormat();
|
||||
private final SegmentInfoFormat segmentInfosFormat = new Lucene99SegmentInfoFormat();
|
||||
private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat();
|
||||
private final CompoundFormat compoundFormat = new Lucene90CompoundFormat();
|
||||
private final NormsFormat normsFormat = new Lucene90NormsFormat();
|
||||
|
||||
private final PostingsFormat defaultPostingsFormat;
|
||||
private final PostingsFormat postingsFormat =
|
||||
new PerFieldPostingsFormat() {
|
||||
@Override
|
||||
public PostingsFormat getPostingsFormatForField(String field) {
|
||||
return Lucene101Codec.this.getPostingsFormatForField(field);
|
||||
}
|
||||
};
|
||||
|
||||
private final DocValuesFormat defaultDVFormat;
|
||||
private final DocValuesFormat docValuesFormat =
|
||||
new PerFieldDocValuesFormat() {
|
||||
@Override
|
||||
public DocValuesFormat getDocValuesFormatForField(String field) {
|
||||
return Lucene101Codec.this.getDocValuesFormatForField(field);
|
||||
}
|
||||
};
|
||||
|
||||
private final KnnVectorsFormat defaultKnnVectorsFormat;
|
||||
private final KnnVectorsFormat knnVectorsFormat =
|
||||
new PerFieldKnnVectorsFormat() {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return Lucene101Codec.this.getKnnVectorsFormatForField(field);
|
||||
}
|
||||
};
|
||||
|
||||
private final StoredFieldsFormat storedFieldsFormat;
|
||||
|
||||
/** Instantiates a new codec. */
|
||||
public Lucene101Codec() {
|
||||
this(Mode.BEST_SPEED);
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new codec, specifying the stored fields compression mode to use.
|
||||
*
|
||||
* @param mode stored fields compression mode to use for newly flushed/merged segments.
|
||||
*/
|
||||
public Lucene101Codec(Mode mode) {
|
||||
super("Lucene101");
|
||||
this.storedFieldsFormat =
|
||||
new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
|
||||
this.defaultPostingsFormat = new Lucene101PostingsFormat();
|
||||
this.defaultDVFormat = new Lucene90DocValuesFormat();
|
||||
this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat();
|
||||
}
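// Usage sketch (illustrative, not part of the original file): the Mode chosen here is normally
// picked when configuring an IndexWriter, e.g.
//   IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
//   iwc.setCodec(new Lucene101Codec(Lucene101Codec.Mode.BEST_COMPRESSION));
// BEST_SPEED (the no-argument default) favors retrieval speed, BEST_COMPRESSION favors index size.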
|
||||
|
||||
@Override
|
||||
public final StoredFieldsFormat storedFieldsFormat() {
|
||||
return storedFieldsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final TermVectorsFormat termVectorsFormat() {
|
||||
return vectorsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final PostingsFormat postingsFormat() {
|
||||
return postingsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final FieldInfosFormat fieldInfosFormat() {
|
||||
return fieldInfosFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final SegmentInfoFormat segmentInfoFormat() {
|
||||
return segmentInfosFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final LiveDocsFormat liveDocsFormat() {
|
||||
return liveDocsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final CompoundFormat compoundFormat() {
|
||||
return compoundFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final PointsFormat pointsFormat() {
|
||||
return new Lucene90PointsFormat();
|
||||
}
|
||||
|
||||
@Override
|
||||
public final KnnVectorsFormat knnVectorsFormat() {
|
||||
return knnVectorsFormat;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the postings format that should be used for writing new segments of <code>field</code>.
|
||||
*
|
||||
* <p>The default implementation always returns "Lucene101".
|
||||
*
|
||||
* <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
|
||||
* future versions of Lucene are only guaranteed to be able to read the default implementation.
|
||||
*/
|
||||
public PostingsFormat getPostingsFormatForField(String field) {
|
||||
return defaultPostingsFormat;
|
||||
}
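// Usage sketch (illustrative, not part of the original file): per-field postings formats are
// selected by overriding this hook in a subclass, e.g.
//   Codec codec =
//       new Lucene101Codec() {
//         @Override
//         public PostingsFormat getPostingsFormatForField(String field) {
//           // "id" is a hypothetical application field name used only for illustration
//           return "id".equals(field)
//               ? PostingsFormat.forName("Lucene101")
//               : super.getPostingsFormatForField(field);
//         }
//       };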
|
||||
|
||||
/**
|
||||
* Returns the docvalues format that should be used for writing new segments of <code>field</code>
|
||||
* .
|
||||
*
|
||||
* <p>The default implementation always returns "Lucene90".
|
||||
*
|
||||
* <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
|
||||
* future versions of Lucene are only guaranteed to be able to read the default implementation.
|
||||
*/
|
||||
public DocValuesFormat getDocValuesFormatForField(String field) {
|
||||
return defaultDVFormat;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the vectors format that should be used for writing new segments of <code>field</code>
|
||||
*
|
||||
* <p>The default implementation always returns "Lucene99HnswVectorsFormat".
|
||||
*
|
||||
* <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
|
||||
* future versions of Lucene are only guaranteed to be able to read the default implementation.
|
||||
*/
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return defaultKnnVectorsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final DocValuesFormat docValuesFormat() {
|
||||
return docValuesFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final NormsFormat normsFormat() {
|
||||
return normsFormat;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,492 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene101;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.codecs.BlockTermState;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.FieldsConsumer;
|
||||
import org.apache.lucene.codecs.FieldsProducer;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.PostingsReaderBase;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/**
|
||||
* Lucene 10.1 postings format, which encodes postings in packed integer blocks for fast decode.
|
||||
*
|
||||
* <p>Basic idea:
|
||||
*
|
||||
* <ul>
|
||||
* <li><b>Packed Blocks and VInt Blocks</b>:
|
||||
* <p>In packed blocks, integers are encoded with the same bit width ({@link PackedInts packed
|
||||
* format}): the block size (i.e. number of integers inside block) is fixed (currently 128).
|
||||
* Additionally, blocks that are all the same value are encoded in an optimized way.
|
||||
* <p>In VInt blocks, integers are encoded as {@link DataOutput#writeVInt VInt}: the block
|
||||
* size is variable.
|
||||
* <li><b>Block structure</b>:
|
||||
* <p>When the postings are long enough, Lucene101PostingsFormat will try to encode most
|
||||
* integer data as a packed block.
|
||||
* <p>Take a term with 259 documents as an example: the first 256 document ids are encoded as
|
||||
* two packed blocks, while the remaining 3 are encoded as one VInt block.
|
||||
* <p>Different kinds of data are always encoded separately into different packed blocks, but
|
||||
* may possibly be interleaved into the same VInt block.
|
||||
* <p>This strategy is applied to pairs: <document number, frequency>, <position,
|
||||
* payload length>, <position, offset start, offset length>, and <position,
|
||||
* payload length, offset start, offset length&gt;.
|
||||
* <li><b>Skipdata</b>:
|
||||
* <p>Skipdata is interleaved with blocks on 2 levels. Level 0 skip data is interleaved
|
||||
* between every packed block. Level 1 skip data is interleaved between every 32 packed
|
||||
* blocks.
|
||||
* <li><b>Positions, Payloads, and Offsets</b>:
|
||||
* <p>A position is an integer indicating where the term occurs within one document. A payload
|
||||
* is a blob of metadata associated with current position. An offset is a pair of integers
|
||||
* indicating the tokenized start/end offsets for given term in current position: it is
|
||||
* essentially a specialized payload.
|
||||
* <p>When payloads and offsets are not omitted, numPositions==numPayloads==numOffsets
|
||||
* (assuming a null payload contributes one count). As mentioned in block structure, it is
|
||||
* possible to encode these three either combined or separately.
|
||||
* <p>In all cases, payloads and offsets are stored together. When encoded as a packed block,
|
||||
* position data is separated out as .pos, while payloads and offsets are encoded in .pay
|
||||
* (payload metadata will also be stored directly in .pay). When encoded as VInt blocks, all
|
||||
* these three are stored interleaved into the .pos (so is payload metadata).
|
||||
* <p>With this strategy, the majority of payload and offset data will be outside .pos file.
|
||||
* So for queries that require only position data, running on a full index with payloads and
|
||||
* offsets, this reduces disk pre-fetches.
|
||||
* </ul>
|
||||
*
|
||||
* <p>Files and detailed format:
|
||||
*
|
||||
* <ul>
|
||||
* <li><code>.tim</code>: <a href="#Termdictionary">Term Dictionary</a>
|
||||
* <li><code>.tip</code>: <a href="#Termindex">Term Index</a>
|
||||
* <li><code>.doc</code>: <a href="#Frequencies">Frequencies and Skip Data</a>
|
||||
* <li><code>.pos</code>: <a href="#Positions">Positions</a>
|
||||
* <li><code>.pay</code>: <a href="#Payloads">Payloads and Offsets</a>
|
||||
* </ul>
|
||||
*
|
||||
* <a id="Termdictionary"></a>
|
||||
*
|
||||
* <dl>
|
||||
* <dd><b>Term Dictionary</b>
|
||||
* <p>The .tim file contains the list of terms in each field along with per-term statistics
|
||||
* (such as docfreq) and pointers to the frequencies, positions, payload and skip data in the
|
||||
* .doc, .pos, and .pay files. See {@link Lucene90BlockTreeTermsWriter} for more details on
|
||||
* the format.
|
||||
* <p>NOTE: The term dictionary can plug into different postings implementations: the postings
|
||||
* writer/reader are actually responsible for encoding and decoding the PostingsHeader and
|
||||
* TermMetadata sections described here:
|
||||
* <ul>
|
||||
* <li>PostingsHeader --> Header, PackedBlockSize
|
||||
* <li>TermMetadata --> (DocFPDelta|SingletonDocID), PosFPDelta?, PosVIntBlockFPDelta?,
|
||||
* PayFPDelta?
|
||||
* <li>Header, --> {@link CodecUtil#writeIndexHeader IndexHeader}
|
||||
* <li>PackedBlockSize, SingletonDocID --> {@link DataOutput#writeVInt VInt}
|
||||
* <li>DocFPDelta, PosFPDelta, PayFPDelta, PosVIntBlockFPDelta --> {@link
|
||||
* DataOutput#writeVLong VLong}
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}
|
||||
* </ul>
|
||||
* <p>Notes:
|
||||
* <ul>
|
||||
* <li>Header is a {@link CodecUtil#writeIndexHeader IndexHeader} storing the version
|
||||
* information for the postings.
|
||||
* <li>PackedBlockSize is the fixed block size for packed blocks. In packed block, bit width
|
||||
* is determined by the largest integer. Smaller block sizes result in smaller variance
* among widths of integers, hence smaller indexes. Larger block sizes result in more
* efficient bulk I/O, hence better acceleration. This value should always be a multiple
|
||||
* of 64, currently fixed as 128 as a tradeoff. It is also the skip interval used to
|
||||
* accelerate {@link org.apache.lucene.index.PostingsEnum#advance(int)}.
|
||||
* <li>DocFPDelta determines the position of this term's TermFreqs within the .doc file. In
|
||||
* particular, it is the difference of file offset between this term's data and previous
|
||||
* term's data (or zero, for the first term in the block). On disk it is stored as the
|
||||
* difference from previous value in sequence.
|
||||
* <li>PosFPDelta determines the position of this term's TermPositions within the .pos file.
|
||||
* While PayFPDelta determines the position of this term's <TermPayloads,
|
||||
* TermOffsets?> within the .pay file. Similar to DocFPDelta, it is the difference
|
||||
* between two file positions (or neglected, for fields that omit payloads and offsets).
|
||||
* <li>PosVIntBlockFPDelta determines the position of this term's last TermPosition in last
|
||||
* pos packed block within the .pos file. It is a synonym for PayVIntBlockFPDelta or
|
||||
* OffsetVIntBlockFPDelta. This is actually used to indicate whether it is necessary to
|
||||
* load following payloads and offsets from .pos instead of .pay. Every time a new block
|
||||
* of positions is to be loaded, the PostingsReader will use this value to check
|
||||
* whether the current block is in packed format or VInt. When in packed format, payloads and
|
||||
* offsets are fetched from .pay, otherwise from .pos. (this value is neglected when
|
||||
* total number of positions i.e. totalTermFreq is less or equal to PackedBlockSize).
|
||||
* <li>SingletonDocID is an optimization when a term only appears in one document. In this
|
||||
* case, instead of writing a file pointer to the .doc file (DocFPDelta), and then a
|
||||
* VIntBlock at that location, the single document ID is written to the term dictionary.
|
||||
* </ul>
|
||||
* </dl>
|
||||
*
|
||||
* <a id="Termindex"></a>
|
||||
*
|
||||
* <dl>
|
||||
* <dd><b>Term Index</b>
|
||||
* <p>The .tip file contains an index into the term dictionary, so that it can be accessed
|
||||
* randomly. See {@link Lucene90BlockTreeTermsWriter} for more details on the format.
|
||||
* </dl>
|
||||
*
|
||||
* <a id="Frequencies"></a>
|
||||
*
|
||||
* <dl>
|
||||
* <dd><b>Frequencies and Skip Data</b>
|
||||
* <p>The .doc file contains the lists of documents which contain each term, along with the
|
||||
* frequency of the term in that document (except when frequencies are omitted: {@link
|
||||
* IndexOptions#DOCS}). Skip data is saved at the end of each term's postings. The skip data
|
||||
* is saved once for the entire postings list.
|
||||
* <ul>
|
||||
* <li>docFile(.doc) --> Header, <TermFreqs><sup>TermCount</sup>, Footer
|
||||
* <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
|
||||
* <li>TermFreqs --> <PackedBlock32> <sup>PackedDocBlockNum/32</sup>, VIntBlock?
|
||||
* <li>PackedBlock32 --> Level1SkipData, <PackedBlock> <sup>32</sup>
|
||||
* <li>PackedBlock --> Level0SkipData, PackedDocDeltaBlock, PackedFreqBlock?
|
||||
* <li>VIntBlock -->
|
||||
* <DocDelta[,Freq?]><sup>DocFreq-PackedBlockSize*PackedDocBlockNum</sup>
|
||||
* <li>Level1SkipData --> DocDelta, DocFPDelta, Skip1NumBytes?, ImpactLength?, Impacts?,
|
||||
* PosFPDelta?, NextPosUpto?, PayFPDelta?, NextPayByteUpto?
|
||||
* <li>Level0SkipData --> Skip0NumBytes, DocDelta, DocFPDelta, PackedBlockLength,
|
||||
* ImpactLength?, Impacts?, PosFPDelta?, NextPosUpto?, PayFPDelta?, NextPayByteUpto?
|
||||
* <li>PackedFreqBlock --> {@link PackedInts PackedInts}, uses patching
|
||||
* <li>PackedDocDeltaBlock --> {@link PackedInts PackedInts}, does not use patching
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}
|
||||
* </ul>
|
||||
* <p>Notes:
|
||||
* <ul>
|
||||
* <li>PackedDocDeltaBlock is theoretically generated from two steps:
|
||||
* <ol>
|
||||
* <li>Calculate the difference between each document number and previous one, and get
|
||||
* a d-gaps list (for the first document, use absolute value);
|
||||
* <li>For those d-gaps from first one to
|
||||
* PackedDocBlockNum*PackedBlockSize<sup>th</sup>, separately encode as packed
|
||||
* blocks.
|
||||
* </ol>
|
||||
* If frequencies are not omitted, PackedFreqBlock will be generated without d-gap step.
|
||||
* <li>VIntBlock stores remaining d-gaps (along with frequencies when possible) with a
|
||||
* format that encodes DocDelta and Freq:
|
||||
* <p>DocDelta: if frequencies are indexed, this determines both the document number and
|
||||
* the frequency. In particular, DocDelta/2 is the difference between this document
|
||||
* number and the previous document number (or zero when this is the first document in a
|
||||
* TermFreqs). When DocDelta is odd, the frequency is one. When DocDelta is even, the
|
||||
* frequency is read as another VInt. If frequencies are omitted, DocDelta contains the
|
||||
* gap (not multiplied by 2) between document numbers and no frequency information is
|
||||
* stored.
|
||||
* <p>For example, the TermFreqs for a term which occurs once in document seven and
|
||||
* three times in document eleven, with frequencies indexed, would be the following
|
||||
* sequence of VInts:
|
||||
* <p>15, 8, 3 (15 = 7*2 + 1, an odd delta encoding doc 7 with freq 1; 8 = (11 - 7)*2 is even, so the freq 3 follows as its own VInt)
|
||||
* <p>If frequencies were omitted ({@link IndexOptions#DOCS}) it would be this sequence
|
||||
* of VInts instead:
|
||||
* <p>7, 4
|
||||
* <li>PackedDocBlockNum is the number of packed blocks for current term's docids or
|
||||
* frequencies. In particular, PackedDocBlockNum = floor(DocFreq/PackedBlockSize)
|
||||
* <li>On skip data, DocDelta is the delta between the last doc of the previous block - or
|
||||
* -1 if there is no previous block - and the last doc of this block. This helps determine by
|
||||
* how much the doc ID should be incremented in case the block gets skipped.
|
||||
* <li>Skip0Length is the length of skip data at level 0. Encoding it is useful when skip
|
||||
* data is never needed, to quickly skip over skip data, e.g. if only using nextDoc(). It
|
||||
* is also used when only the first fields of skip data are needed, in order to skip
|
||||
* over remaining fields without reading them.
|
||||
* <li>ImpactLength and Impacts are only stored if frequencies are indexed.
|
||||
* <li>Since positions and payloads are also block encoded, the skip should skip to related
|
||||
* block first, then fetch the values according to in-block offset. PosFPSkip and
|
||||
* PayFPSkip record the file offsets of related block in .pos and .pay, respectively.
|
||||
* While PosBlockOffset indicates which value to fetch inside the related block
|
||||
* (PayBlockOffset is unnecessary since it is always equal to PosBlockOffset). Same as
|
||||
* DocFPSkip, the file offsets are relative to the start of current term's TermFreqs,
|
||||
* and stored as a difference sequence.
|
||||
* <li>PayByteUpto indicates the start offset of the current payload. It is equivalent to
|
||||
* the sum of the payload lengths in the current block up to PosBlockOffset
|
||||
* <li>ImpactLength is the total length of CompetitiveFreqDelta and CompetitiveNormDelta
|
||||
* pairs. CompetitiveFreqDelta and CompetitiveNormDelta are used to safely skip score
|
||||
* calculation for uncompetitive documents; See {@link
|
||||
* org.apache.lucene.codecs.CompetitiveImpactAccumulator} for more details.
|
||||
* </ul>
|
||||
* </dl>
|
||||
*
|
||||
* <a id="Positions"></a>
|
||||
*
|
||||
* <dl>
|
||||
* <dd><b>Positions</b>
|
||||
* <p>The .pos file contains the lists of positions that each term occurs at within documents.
|
||||
* It also sometimes stores part of payloads and offsets for speedup.
|
||||
* <ul>
|
||||
* <li>PosFile(.pos) --> Header, <TermPositions> <sup>TermCount</sup>, Footer
|
||||
* <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
|
||||
* <li>TermPositions --> <PackedPosDeltaBlock> <sup>PackedPosBlockNum</sup>,
|
||||
* VIntBlock?
|
||||
* <li>VIntBlock --> <PositionDelta[, PayloadLength?], PayloadData?, OffsetDelta?,
|
||||
* OffsetLength?><sup>PosVIntCount</sup>
|
||||
* <li>PackedPosDeltaBlock --> {@link PackedInts PackedInts}
|
||||
* <li>PositionDelta, OffsetDelta, OffsetLength --> {@link DataOutput#writeVInt VInt}
|
||||
* <li>PayloadData --> {@link DataOutput#writeByte byte}<sup>PayLength</sup>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}
|
||||
* </ul>
|
||||
* <p>Notes:
|
||||
* <ul>
|
||||
* <li>TermPositions are ordered by term (terms are implicit, from the term dictionary), and
|
||||
* position values for each term document pair are incremental, and ordered by document
|
||||
* number.
|
||||
* <li>PackedPosBlockNum is the number of packed blocks for current term's positions,
|
||||
* payloads or offsets. In particular, PackedPosBlockNum =
|
||||
* floor(totalTermFreq/PackedBlockSize)
|
||||
* <li>PosVIntCount is the number of positions encoded as VInt format. In particular,
|
||||
* PosVIntCount = totalTermFreq - PackedPosBlockNum*PackedBlockSize
|
||||
* <li>The procedure by which PackedPosDeltaBlock is generated is the same as for PackedDocDeltaBlock
|
||||
* in chapter <a href="#Frequencies">Frequencies and Skip Data</a>.
|
||||
* <li>PositionDelta is, if payloads are disabled for the term's field, the difference
|
||||
* between the position of the current occurrence in the document and the previous
|
||||
* occurrence (or zero, if this is the first occurrence in this document). If payloads
|
||||
* are enabled for the term's field, then PositionDelta/2 is the difference between the
|
||||
* current and the previous position. If payloads are enabled and PositionDelta is odd,
|
||||
* then PayloadLength is stored, indicating the length of the payload at the current
|
||||
* term position.
|
||||
* <li>For example, the TermPositions for a term which occurs as the fourth term in one
|
||||
* document, and as the fifth and ninth term in a subsequent document, would be the
|
||||
* following sequence of VInts (payloads disabled):
|
||||
* <p>4, 5, 4
|
||||
* <li>PayloadData is metadata associated with the current term position. If PayloadLength
|
||||
* is stored at the current position, then it indicates the length of this payload. If
|
||||
* PayloadLength is not stored, then this payload has the same length as the payload at
|
||||
* the previous position.
|
||||
* <li>OffsetDelta/2 is the difference between this position's startOffset and that of the previous
|
||||
* occurrence (or zero, if this is the first occurrence in this document). If
|
||||
* OffsetDelta is odd, then the length (endOffset-startOffset) differs from the previous
|
||||
* occurrence and an OffsetLength follows. Offset data is only written for {@link
|
||||
* IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
|
||||
* </ul>
|
||||
* </dl>
|
||||
*
|
||||
* <a id="Payloads"></a>
|
||||
*
|
||||
* <dl>
|
||||
* <dd><b>Payloads and Offsets</b>
|
||||
* <p>The .pay file will store payloads and offsets associated with certain term-document
|
||||
* positions. Some payloads and offsets will be separated out into .pos file, for performance
|
||||
* reasons.
|
||||
* <ul>
|
||||
* <li>PayFile(.pay): --> Header, <TermPayloads?, TermOffsets?>
|
||||
* <sup>TermCount</sup>, Footer
|
||||
* <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
|
||||
* <li>TermPayloads --> <PackedPayLengthBlock, SumPayLength, PayData>
|
||||
* <sup>PackedPayBlockNum</sup>
|
||||
* <li>TermOffsets --> <PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock>
|
||||
* <sup>PackedPayBlockNum</sup>
|
||||
* <li>PackedPayLengthBlock, PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock -->
|
||||
* {@link PackedInts PackedInts}
|
||||
* <li>SumPayLength --> {@link DataOutput#writeVInt VInt}
|
||||
* <li>PayData --> {@link DataOutput#writeByte byte}<sup>SumPayLength</sup>
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}
|
||||
* </ul>
|
||||
* <p>Notes:
|
||||
* <ul>
|
||||
* <li>The order of TermPayloads/TermOffsets will be the same as TermPositions, note that
|
||||
* part of payload/offsets are stored in .pos.
|
||||
* <li>The procedure by which PackedPayLengthBlock and PackedOffsetLengthBlock are generated is
* the same as for PackedFreqBlock in chapter <a href="#Frequencies">Frequencies and Skip
* Data</a>, while PackedOffsetStartDeltaBlock follows the same procedure as
* PackedDocDeltaBlock.
|
||||
* <li>PackedPayBlockNum is always equal to PackedPosBlockNum, for the same term. It is also
|
||||
* a synonym for PackedOffsetBlockNum.
|
||||
* <li>SumPayLength is the total length of payloads written within one block; it should be the
|
||||
* sum of PayLengths in one packed block.
|
||||
* <li>PayLength in PackedPayLengthBlock is the length of each payload associated with the
|
||||
* current position.
|
||||
* </ul>
|
||||
* </dl>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class Lucene101PostingsFormat extends PostingsFormat {
|
||||
|
||||
/** Filename extension for some small metadata about how postings are encoded. */
|
||||
public static final String META_EXTENSION = "psm";
|
||||
|
||||
/**
|
||||
* Filename extension for document number, frequencies, and skip data. See chapter: <a
|
||||
* href="#Frequencies">Frequencies and Skip Data</a>
|
||||
*/
|
||||
public static final String DOC_EXTENSION = "doc";
|
||||
|
||||
/** Filename extension for positions. See chapter: <a href="#Positions">Positions</a> */
|
||||
public static final String POS_EXTENSION = "pos";
|
||||
|
||||
/**
|
||||
* Filename extension for payloads and offsets. See chapter: <a href="#Payloads">Payloads and
|
||||
* Offsets</a>
|
||||
*/
|
||||
public static final String PAY_EXTENSION = "pay";
|
||||
|
||||
/** Size of blocks. */
|
||||
public static final int BLOCK_SIZE = ForUtil.BLOCK_SIZE;
|
||||
|
||||
public static final int BLOCK_MASK = BLOCK_SIZE - 1;
|
||||
|
||||
/** We insert skip data on every block and every SKIP_FACTOR=32 blocks. */
|
||||
public static final int LEVEL1_FACTOR = 32;
|
||||
|
||||
/** Total number of docs covered by level 1 skip data: 32 * 128 = 4,096 */
|
||||
public static final int LEVEL1_NUM_DOCS = LEVEL1_FACTOR * BLOCK_SIZE;
|
||||
|
||||
public static final int LEVEL1_MASK = LEVEL1_NUM_DOCS - 1;
|
||||
|
||||
static final String TERMS_CODEC = "Lucene90PostingsWriterTerms";
|
||||
static final String META_CODEC = "Lucene101PostingsWriterMeta";
|
||||
static final String DOC_CODEC = "Lucene101PostingsWriterDoc";
|
||||
static final String POS_CODEC = "Lucene101PostingsWriterPos";
|
||||
static final String PAY_CODEC = "Lucene101PostingsWriterPay";
|
||||
|
||||
static final int VERSION_START = 0;
|
||||
static final int VERSION_CURRENT = VERSION_START;
|
||||
|
||||
private final int minTermBlockSize;
|
||||
private final int maxTermBlockSize;
|
||||
|
||||
/** Creates {@code Lucene101PostingsFormat} with default settings. */
|
||||
public Lucene101PostingsFormat() {
|
||||
this(
|
||||
Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
|
||||
Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates {@code Lucene101PostingsFormat} with custom values for {@code minBlockSize} and {@code
|
||||
* maxBlockSize} passed to block terms dictionary.
|
||||
*
|
||||
* @see
|
||||
* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
|
||||
*/
|
||||
public Lucene101PostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
|
||||
super("Lucene101");
|
||||
Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
|
||||
this.minTermBlockSize = minTermBlockSize;
|
||||
this.maxTermBlockSize = maxTermBlockSize;
|
||||
}
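// Usage sketch (illustrative, not part of the original file): the two-argument constructor only
// tunes the block tree terms dictionary, e.g.
//   PostingsFormat pf = new Lucene101PostingsFormat(25, 48);
// which, assuming 25/48 are Lucene90BlockTreeTermsWriter's default min/max block sizes, behaves
// like the no-argument constructor; larger blocks generally shrink the terms index at the cost
// of scanning more entries per block.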
|
||||
|
||||
@Override
|
||||
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state);
|
||||
boolean success = false;
|
||||
try {
|
||||
FieldsConsumer ret =
|
||||
new Lucene90BlockTreeTermsWriter(
|
||||
state, postingsWriter, minTermBlockSize, maxTermBlockSize);
|
||||
success = true;
|
||||
return ret;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(postingsWriter);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||
PostingsReaderBase postingsReader = new Lucene101PostingsReader(state);
|
||||
boolean success = false;
|
||||
try {
|
||||
FieldsProducer ret = new Lucene90BlockTreeTermsReader(postingsReader, state);
|
||||
success = true;
|
||||
return ret;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(postingsReader);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds all state required for {@link Lucene101PostingsReader} to produce a {@link
|
||||
* org.apache.lucene.index.PostingsEnum} without re-seeking the terms dict.
|
||||
*
|
||||
* @lucene.internal
|
||||
*/
|
||||
public static final class IntBlockTermState extends BlockTermState {
|
||||
/** file pointer to the start of the doc ids enumeration, in {@link #DOC_EXTENSION} file */
|
||||
public long docStartFP;
|
||||
|
||||
/** file pointer to the start of the positions enumeration, in {@link #POS_EXTENSION} file */
|
||||
public long posStartFP;
|
||||
|
||||
/** file pointer to the start of the payloads enumeration, in {@link #PAY_EXTENSION} file */
|
||||
public long payStartFP;
|
||||
|
||||
/**
|
||||
* file offset for the last position in the last block, if there are more than {@link
|
||||
* ForUtil#BLOCK_SIZE} positions; otherwise -1
|
||||
*
|
||||
* <p>One might think to use total term frequency to track how many positions are left to read
|
||||
* as we decode the blocks, and decode the last block differently when num_left_positions <
|
||||
* BLOCK_SIZE. Unfortunately this won't work since the tracking will be messed up when we skip
|
||||
* blocks as the skipper will only tell us new position offset (start of block) and number of
|
||||
* positions to skip for that block, without telling us how many positions it has skipped.
|
||||
*/
|
||||
public long lastPosBlockOffset;
|
||||
|
||||
/**
|
||||
* docid when there is a single pulsed posting, otherwise -1. freq is always implicitly
|
||||
* totalTermFreq in this case.
|
||||
*/
|
||||
public int singletonDocID;
|
||||
|
||||
/** Sole constructor. */
|
||||
public IntBlockTermState() {
|
||||
lastPosBlockOffset = -1;
|
||||
singletonDocID = -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntBlockTermState clone() {
|
||||
IntBlockTermState other = new IntBlockTermState();
|
||||
other.copyFrom(this);
|
||||
return other;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void copyFrom(TermState _other) {
|
||||
super.copyFrom(_other);
|
||||
IntBlockTermState other = (IntBlockTermState) _other;
|
||||
docStartFP = other.docStartFP;
|
||||
posStartFP = other.posStartFP;
|
||||
payStartFP = other.payStartFP;
|
||||
lastPosBlockOffset = other.lastPosBlockOffset;
|
||||
singletonDocID = other.singletonDocID;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return super.toString()
|
||||
+ " docStartFP="
|
||||
+ docStartFP
|
||||
+ " posStartFP="
|
||||
+ posStartFP
|
||||
+ " payStartFP="
|
||||
+ payStartFP
|
||||
+ " lastPosBlockOffset="
|
||||
+ lastPosBlockOffset
|
||||
+ " singletonDocID="
|
||||
+ singletonDocID;
|
||||
}
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
@ -0,0 +1,681 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene101;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.BLOCK_SIZE;
|
||||
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.DOC_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.LEVEL1_MASK;
|
||||
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.META_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.PAY_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.POS_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.TERMS_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.VERSION_CURRENT;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.codecs.BlockTermState;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
|
||||
import org.apache.lucene.codecs.PushPostingsWriterBase;
|
||||
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.Impact;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.store.ByteBuffersDataOutput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BitUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/** Writer for {@link Lucene101PostingsFormat}. */
|
||||
public class Lucene101PostingsWriter extends PushPostingsWriterBase {
|
||||
|
||||
static final IntBlockTermState EMPTY_STATE = new IntBlockTermState();
|
||||
|
||||
IndexOutput metaOut;
|
||||
IndexOutput docOut;
|
||||
IndexOutput posOut;
|
||||
IndexOutput payOut;
|
||||
|
||||
IntBlockTermState lastState;
|
||||
|
||||
// Holds starting file pointers for current term:
|
||||
private long docStartFP;
|
||||
private long posStartFP;
|
||||
private long payStartFP;
|
||||
|
||||
final int[] docDeltaBuffer;
|
||||
final int[] freqBuffer;
|
||||
private int docBufferUpto;
|
||||
|
||||
final int[] posDeltaBuffer;
|
||||
final int[] payloadLengthBuffer;
|
||||
final int[] offsetStartDeltaBuffer;
|
||||
final int[] offsetLengthBuffer;
|
||||
private int posBufferUpto;
|
||||
|
||||
private byte[] payloadBytes;
|
||||
private int payloadByteUpto;
|
||||
|
||||
private int level0LastDocID;
|
||||
private long level0LastPosFP;
|
||||
private long level0LastPayFP;
|
||||
|
||||
private int level1LastDocID;
|
||||
private long level1LastPosFP;
|
||||
private long level1LastPayFP;
|
||||
|
||||
private int docID;
|
||||
private int lastDocID;
|
||||
private int lastPosition;
|
||||
private int lastStartOffset;
|
||||
private int docCount;
|
||||
|
||||
private final PForUtil pforUtil;
|
||||
private final ForDeltaUtil forDeltaUtil;
|
||||
|
||||
private boolean fieldHasNorms;
|
||||
private NumericDocValues norms;
|
||||
private final CompetitiveImpactAccumulator level0FreqNormAccumulator =
|
||||
new CompetitiveImpactAccumulator();
|
||||
private final CompetitiveImpactAccumulator level1CompetitiveFreqNormAccumulator =
|
||||
new CompetitiveImpactAccumulator();
|
||||
|
||||
private int maxNumImpactsAtLevel0;
|
||||
private int maxImpactNumBytesAtLevel0;
|
||||
private int maxNumImpactsAtLevel1;
|
||||
private int maxImpactNumBytesAtLevel1;
|
||||
|
||||
/** Scratch output that we use to be able to prepend the encoded length, e.g. impacts. */
|
||||
private final ByteBuffersDataOutput scratchOutput = ByteBuffersDataOutput.newResettableInstance();
|
||||
|
||||
/**
|
||||
* Output for a single block. This is useful to be able to prepend skip data before each block,
|
||||
* which can only be computed once the block is encoded. The content is then typically copied to
|
||||
* {@link #level1Output}.
|
||||
*/
|
||||
private final ByteBuffersDataOutput level0Output = ByteBuffersDataOutput.newResettableInstance();
|
||||
|
||||
/**
|
||||
* Output for groups of 32 blocks. This is useful to prepend skip data for these 32 blocks, which
|
||||
* can only be done once we have encoded these 32 blocks. The content is then typically copied to
|
||||
* {@link #docOut}.
|
||||
*/
|
||||
private final ByteBuffersDataOutput level1Output = ByteBuffersDataOutput.newResettableInstance();
|
||||
|
||||
/** Sole constructor. */
|
||||
public Lucene101PostingsWriter(SegmentWriteState state) throws IOException {
|
||||
String metaFileName =
|
||||
IndexFileNames.segmentFileName(
|
||||
state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.META_EXTENSION);
|
||||
String docFileName =
|
||||
IndexFileNames.segmentFileName(
|
||||
state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.DOC_EXTENSION);
|
||||
metaOut = state.directory.createOutput(metaFileName, state.context);
|
||||
IndexOutput posOut = null;
|
||||
IndexOutput payOut = null;
|
||||
boolean success = false;
|
||||
try {
|
||||
docOut = state.directory.createOutput(docFileName, state.context);
|
||||
CodecUtil.writeIndexHeader(
|
||||
metaOut, META_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
CodecUtil.writeIndexHeader(
|
||||
docOut, DOC_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
forDeltaUtil = new ForDeltaUtil();
|
||||
pforUtil = new PForUtil();
|
||||
if (state.fieldInfos.hasProx()) {
|
||||
posDeltaBuffer = new int[BLOCK_SIZE];
|
||||
String posFileName =
|
||||
IndexFileNames.segmentFileName(
|
||||
state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.POS_EXTENSION);
|
||||
posOut = state.directory.createOutput(posFileName, state.context);
|
||||
CodecUtil.writeIndexHeader(
|
||||
posOut, POS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
|
||||
if (state.fieldInfos.hasPayloads()) {
|
||||
payloadBytes = new byte[128];
|
||||
payloadLengthBuffer = new int[BLOCK_SIZE];
|
||||
} else {
|
||||
payloadBytes = null;
|
||||
payloadLengthBuffer = null;
|
||||
}
|
||||
|
||||
if (state.fieldInfos.hasOffsets()) {
|
||||
offsetStartDeltaBuffer = new int[BLOCK_SIZE];
|
||||
offsetLengthBuffer = new int[BLOCK_SIZE];
|
||||
} else {
|
||||
offsetStartDeltaBuffer = null;
|
||||
offsetLengthBuffer = null;
|
||||
}
|
||||
|
||||
if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
|
||||
String payFileName =
|
||||
IndexFileNames.segmentFileName(
|
||||
state.segmentInfo.name,
|
||||
state.segmentSuffix,
|
||||
Lucene101PostingsFormat.PAY_EXTENSION);
|
||||
payOut = state.directory.createOutput(payFileName, state.context);
|
||||
CodecUtil.writeIndexHeader(
|
||||
payOut, PAY_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
}
|
||||
} else {
|
||||
posDeltaBuffer = null;
|
||||
payloadLengthBuffer = null;
|
||||
offsetStartDeltaBuffer = null;
|
||||
offsetLengthBuffer = null;
|
||||
payloadBytes = null;
|
||||
}
|
||||
this.payOut = payOut;
|
||||
this.posOut = posOut;
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(metaOut, docOut, posOut, payOut);
|
||||
}
|
||||
}
|
||||
|
||||
docDeltaBuffer = new int[BLOCK_SIZE];
|
||||
freqBuffer = new int[BLOCK_SIZE];
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntBlockTermState newTermState() {
|
||||
return new IntBlockTermState();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void init(IndexOutput termsOut, SegmentWriteState state) throws IOException {
|
||||
CodecUtil.writeIndexHeader(
|
||||
termsOut, TERMS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
termsOut.writeVInt(BLOCK_SIZE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setField(FieldInfo fieldInfo) {
|
||||
super.setField(fieldInfo);
|
||||
lastState = EMPTY_STATE;
|
||||
fieldHasNorms = fieldInfo.hasNorms();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startTerm(NumericDocValues norms) {
|
||||
docStartFP = docOut.getFilePointer();
|
||||
if (writePositions) {
|
||||
posStartFP = posOut.getFilePointer();
|
||||
level1LastPosFP = level0LastPosFP = posStartFP;
|
||||
if (writePayloads || writeOffsets) {
|
||||
payStartFP = payOut.getFilePointer();
|
||||
level1LastPayFP = level0LastPayFP = payStartFP;
|
||||
}
|
||||
}
|
||||
lastDocID = -1;
|
||||
level0LastDocID = -1;
|
||||
level1LastDocID = -1;
|
||||
this.norms = norms;
|
||||
if (writeFreqs) {
|
||||
level0FreqNormAccumulator.clear();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startDoc(int docID, int termDocFreq) throws IOException {
|
||||
if (docBufferUpto == BLOCK_SIZE) {
|
||||
flushDocBlock(false);
|
||||
docBufferUpto = 0;
|
||||
}
|
||||
|
||||
final int docDelta = docID - lastDocID;
|
||||
|
||||
if (docID < 0 || docDelta <= 0) {
|
||||
throw new CorruptIndexException(
|
||||
"docs out of order (" + docID + " <= " + lastDocID + " )", docOut);
|
||||
}
|
||||
|
||||
docDeltaBuffer[docBufferUpto] = docDelta;
|
||||
if (writeFreqs) {
|
||||
freqBuffer[docBufferUpto] = termDocFreq;
|
||||
}
|
||||
|
||||
this.docID = docID;
|
||||
lastPosition = 0;
|
||||
lastStartOffset = 0;
|
||||
|
||||
if (writeFreqs) {
|
||||
long norm;
|
||||
if (fieldHasNorms) {
|
||||
boolean found = norms.advanceExact(docID);
|
||||
if (found == false) {
|
||||
// This can happen if indexing hits a problem after adding a doc to the
|
||||
// postings but before buffering the norm. Such documents are written
|
||||
// deleted and will go away on the first merge.
|
||||
norm = 1L;
|
||||
} else {
|
||||
norm = norms.longValue();
|
||||
assert norm != 0 : docID;
|
||||
}
|
||||
} else {
|
||||
norm = 1L;
|
||||
}
|
||||
|
||||
level0FreqNormAccumulator.add(termDocFreq, norm);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset)
|
||||
throws IOException {
|
||||
if (position > IndexWriter.MAX_POSITION) {
|
||||
throw new CorruptIndexException(
|
||||
"position="
|
||||
+ position
|
||||
+ " is too large (> IndexWriter.MAX_POSITION="
|
||||
+ IndexWriter.MAX_POSITION
|
||||
+ ")",
|
||||
docOut);
|
||||
}
|
||||
if (position < 0) {
|
||||
throw new CorruptIndexException("position=" + position + " is < 0", docOut);
|
||||
}
|
||||
posDeltaBuffer[posBufferUpto] = position - lastPosition;
|
||||
if (writePayloads) {
|
||||
if (payload == null || payload.length == 0) {
|
||||
// no payload
|
||||
payloadLengthBuffer[posBufferUpto] = 0;
|
||||
} else {
|
||||
payloadLengthBuffer[posBufferUpto] = payload.length;
|
||||
if (payloadByteUpto + payload.length > payloadBytes.length) {
|
||||
payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length);
|
||||
}
|
||||
System.arraycopy(
|
||||
payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length);
|
||||
payloadByteUpto += payload.length;
|
||||
}
|
||||
}
|
||||
|
||||
if (writeOffsets) {
|
||||
assert startOffset >= lastStartOffset;
|
||||
assert endOffset >= startOffset;
|
||||
offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset;
|
||||
offsetLengthBuffer[posBufferUpto] = endOffset - startOffset;
|
||||
lastStartOffset = startOffset;
|
||||
}
|
||||
|
||||
posBufferUpto++;
|
||||
lastPosition = position;
|
||||
if (posBufferUpto == BLOCK_SIZE) {
|
||||
pforUtil.encode(posDeltaBuffer, posOut);
|
||||
|
||||
if (writePayloads) {
|
||||
pforUtil.encode(payloadLengthBuffer, payOut);
|
||||
payOut.writeVInt(payloadByteUpto);
|
||||
payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
|
||||
payloadByteUpto = 0;
|
||||
}
|
||||
if (writeOffsets) {
|
||||
pforUtil.encode(offsetStartDeltaBuffer, payOut);
|
||||
pforUtil.encode(offsetLengthBuffer, payOut);
|
||||
}
|
||||
posBufferUpto = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void finishDoc() {
|
||||
docBufferUpto++;
|
||||
docCount++;
|
||||
|
||||
lastDocID = docID;
|
||||
}
|
||||
|
||||
/**
|
||||
* Special vints that are encoded in 2 bytes if they require 15 bits or less. VInt becomes
|
||||
* especially slow when the number of bytes is variable, so this special layout helps in the case
|
||||
* when the number likely requires 15 bits or less
|
||||
*/
|
||||
static void writeVInt15(DataOutput out, int v) throws IOException {
|
||||
assert v >= 0;
|
||||
writeVLong15(out, v);
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #writeVInt15(DataOutput, int)
|
||||
*/
|
||||
static void writeVLong15(DataOutput out, long v) throws IOException {
|
||||
assert v >= 0;
|
||||
if ((v & ~0x7FFFL) == 0) {
|
||||
out.writeShort((short) v);
|
||||
} else {
|
||||
out.writeShort((short) (0x8000 | (v & 0x7FFF)));
|
||||
out.writeVLong(v >> 15);
|
||||
}
|
||||
}
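// Worked example (illustrative, not part of the original file) of the 15-bit layout above:
//   v = 0x1234 has no bits above bit 14, so it is written as the single short 0x1234 (2 bytes);
//   v = 0x12345 does, so the short 0x8000 | (0x12345 & 0x7FFF) = 0xA345 is written first,
//   followed by writeVLong(0x12345 >>> 15) = writeVLong(2), one extra byte for this value.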
|
||||
|
||||
private void flushDocBlock(boolean finishTerm) throws IOException {
|
||||
assert docBufferUpto != 0;
|
||||
|
||||
if (docBufferUpto < BLOCK_SIZE) {
|
||||
assert finishTerm;
|
||||
PostingsUtil.writeVIntBlock(
|
||||
level0Output, docDeltaBuffer, freqBuffer, docBufferUpto, writeFreqs);
|
||||
} else {
|
||||
if (writeFreqs) {
|
||||
List<Impact> impacts = level0FreqNormAccumulator.getCompetitiveFreqNormPairs();
|
||||
if (impacts.size() > maxNumImpactsAtLevel0) {
|
||||
maxNumImpactsAtLevel0 = impacts.size();
|
||||
}
|
||||
writeImpacts(impacts, scratchOutput);
|
||||
assert level0Output.size() == 0;
|
||||
if (scratchOutput.size() > maxImpactNumBytesAtLevel0) {
|
||||
maxImpactNumBytesAtLevel0 = Math.toIntExact(scratchOutput.size());
|
||||
}
|
||||
level0Output.writeVLong(scratchOutput.size());
|
||||
scratchOutput.copyTo(level0Output);
|
||||
scratchOutput.reset();
|
||||
if (writePositions) {
|
||||
level0Output.writeVLong(posOut.getFilePointer() - level0LastPosFP);
|
||||
level0Output.writeByte((byte) posBufferUpto);
|
||||
level0LastPosFP = posOut.getFilePointer();
|
||||
|
||||
if (writeOffsets || writePayloads) {
|
||||
level0Output.writeVLong(payOut.getFilePointer() - level0LastPayFP);
|
||||
level0Output.writeVInt(payloadByteUpto);
|
||||
level0LastPayFP = payOut.getFilePointer();
|
||||
}
|
||||
}
|
||||
}
|
||||
long numSkipBytes = level0Output.size();
|
||||
forDeltaUtil.encodeDeltas(docDeltaBuffer, level0Output);
|
||||
if (writeFreqs) {
|
||||
pforUtil.encode(freqBuffer, level0Output);
|
||||
}
|
||||
|
||||
// docID - lastBlockDocID is at least 128, so it can never fit a single byte with a vint
|
||||
// Even if we subtracted 128, only extremely dense blocks would be eligible for a single byte
|
||||
// so let's go with 2 bytes right away
|
||||
writeVInt15(scratchOutput, docID - level0LastDocID);
|
||||
writeVLong15(scratchOutput, level0Output.size());
|
||||
numSkipBytes += scratchOutput.size();
|
||||
level1Output.writeVLong(numSkipBytes);
|
||||
scratchOutput.copyTo(level1Output);
|
||||
scratchOutput.reset();
|
||||
}
|
||||
|
||||
level0Output.copyTo(level1Output);
|
||||
level0Output.reset();
|
||||
level0LastDocID = docID;
|
||||
if (writeFreqs) {
|
||||
level1CompetitiveFreqNormAccumulator.addAll(level0FreqNormAccumulator);
|
||||
level0FreqNormAccumulator.clear();
|
||||
}
|
||||
|
||||
if ((docCount & LEVEL1_MASK) == 0) { // true every 32 blocks (4,096 docs)
|
||||
writeLevel1SkipData();
|
||||
level1LastDocID = docID;
|
||||
level1CompetitiveFreqNormAccumulator.clear();
|
||||
} else if (finishTerm) {
|
||||
level1Output.copyTo(docOut);
|
||||
level1Output.reset();
|
||||
level1CompetitiveFreqNormAccumulator.clear();
|
||||
}
|
||||
}
|
||||
|
||||
private void writeLevel1SkipData() throws IOException {
|
||||
docOut.writeVInt(docID - level1LastDocID);
|
||||
final long level1End;
|
||||
if (writeFreqs) {
|
||||
List<Impact> impacts = level1CompetitiveFreqNormAccumulator.getCompetitiveFreqNormPairs();
|
||||
if (impacts.size() > maxNumImpactsAtLevel1) {
|
||||
maxNumImpactsAtLevel1 = impacts.size();
|
||||
}
|
||||
writeImpacts(impacts, scratchOutput);
|
||||
long numImpactBytes = scratchOutput.size();
|
||||
if (numImpactBytes > maxImpactNumBytesAtLevel1) {
|
||||
maxImpactNumBytesAtLevel1 = Math.toIntExact(numImpactBytes);
|
||||
}
|
||||
if (writePositions) {
|
||||
scratchOutput.writeVLong(posOut.getFilePointer() - level1LastPosFP);
|
||||
scratchOutput.writeByte((byte) posBufferUpto);
|
||||
level1LastPosFP = posOut.getFilePointer();
|
||||
if (writeOffsets || writePayloads) {
|
||||
scratchOutput.writeVLong(payOut.getFilePointer() - level1LastPayFP);
|
||||
scratchOutput.writeVInt(payloadByteUpto);
|
||||
level1LastPayFP = payOut.getFilePointer();
|
||||
}
|
||||
}
|
||||
final long level1Len = 2 * Short.BYTES + scratchOutput.size() + level1Output.size();
|
||||
docOut.writeVLong(level1Len);
|
||||
level1End = docOut.getFilePointer() + level1Len;
|
||||
// There are at most 128 impacts, which require at most 2 bytes each
|
||||
assert numImpactBytes <= Short.MAX_VALUE;
|
||||
// Like impacts plus a few vlongs, still way under the max short value
|
||||
assert scratchOutput.size() + Short.BYTES <= Short.MAX_VALUE;
|
||||
docOut.writeShort((short) (scratchOutput.size() + Short.BYTES));
|
||||
docOut.writeShort((short) numImpactBytes);
|
||||
scratchOutput.copyTo(docOut);
|
||||
scratchOutput.reset();
|
||||
} else {
|
||||
docOut.writeVLong(level1Output.size());
|
||||
level1End = docOut.getFilePointer() + level1Output.size();
|
||||
}
|
||||
level1Output.copyTo(docOut);
|
||||
level1Output.reset();
|
||||
assert docOut.getFilePointer() == level1End : docOut.getFilePointer() + " " + level1End;
|
||||
}
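// A hedged summary of the bytes each level-1 skip entry above occupies when freqs are enabled
// (field names as in the code; this is only a reading aid, not produced by this patch):
//   vint   docID - level1LastDocID
//   vlong  level1Len = 2 * Short.BYTES + scratchOutput.size() + level1Output.size()
//   short  scratchOutput.size() + Short.BYTES  (lets readers skip impacts and prox pointers at once)
//   short  numImpactBytes                      (lets readers skip just the impacts)
//   bytes  impacts in the writeImpacts format, then the position/payload file pointers
//   bytes  the accumulated level-0 skip data and doc blocks (level1Output)
// Without freqs, only the vint delta, a vlong length and level1Output are written.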
|
||||
|
||||
static void writeImpacts(Collection<Impact> impacts, DataOutput out) throws IOException {
|
||||
Impact previous = new Impact(0, 0);
|
||||
for (Impact impact : impacts) {
|
||||
assert impact.freq > previous.freq;
|
||||
assert Long.compareUnsigned(impact.norm, previous.norm) > 0;
|
||||
int freqDelta = impact.freq - previous.freq - 1;
|
||||
long normDelta = impact.norm - previous.norm - 1;
|
||||
if (normDelta == 0) {
|
||||
// most of the time, the norm only increases by 1, so we can fold everything into a single byte
|
||||
out.writeVInt(freqDelta << 1);
|
||||
} else {
|
||||
out.writeVInt((freqDelta << 1) | 1);
|
||||
out.writeZLong(normDelta);
|
||||
}
|
||||
previous = impact;
|
||||
}
|
||||
}
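// Hedged illustration (not part of this patch; java.util.List/ArrayList and DataInput imports
// assumed): the inverse of writeImpacts, assuming the caller knows how many impacts were
// written. The shipped readers decode this differently; this only shows how the deltas unfold.
static List<Impact> readImpactsSketch(DataInput in, int numImpacts) throws IOException {
  List<Impact> impacts = new ArrayList<>(numImpacts);
  int freq = 0;
  long norm = 0;
  for (int i = 0; i < numImpacts; i++) {
    int token = in.readVInt();
    freq += 1 + (token >>> 1); // freqDelta was written as (freq - previous.freq - 1) << 1
    if ((token & 1) == 0) {
      norm += 1; // common case: the norm increased by exactly 1
    } else {
      norm += 1 + in.readZLong(); // explicit norm delta, written as a zig-zag encoded vlong
    }
    impacts.add(new Impact(freq, norm));
  }
  return impacts;
}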
|
||||
|
||||
/** Called when we are done adding docs to this term */
|
||||
@Override
|
||||
public void finishTerm(BlockTermState _state) throws IOException {
|
||||
IntBlockTermState state = (IntBlockTermState) _state;
|
||||
assert state.docFreq > 0;
|
||||
|
||||
// TODO: wasteful we are counting this (counting # docs
|
||||
// for this term) in two places?
|
||||
assert state.docFreq == docCount : state.docFreq + " vs " + docCount;
|
||||
|
||||
// docFreq == 1, don't write the single docid/freq to a separate file along with a pointer to
|
||||
// it.
|
||||
final int singletonDocID;
|
||||
if (state.docFreq == 1) {
|
||||
// pulse the singleton docid into the term dictionary, freq is implicitly totalTermFreq
|
||||
singletonDocID = docDeltaBuffer[0] - 1;
|
||||
} else {
|
||||
singletonDocID = -1;
|
||||
flushDocBlock(true);
|
||||
}
|
||||
|
||||
final long lastPosBlockOffset;
|
||||
|
||||
if (writePositions) {
|
||||
// totalTermFreq is just the total number of positions (or payloads, or offsets)
// associated with the current term.
|
||||
assert state.totalTermFreq != -1;
|
||||
if (state.totalTermFreq > BLOCK_SIZE) {
|
||||
// record file offset for last pos in last block
|
||||
lastPosBlockOffset = posOut.getFilePointer() - posStartFP;
|
||||
} else {
|
||||
lastPosBlockOffset = -1;
|
||||
}
|
||||
if (posBufferUpto > 0) {
|
||||
assert posBufferUpto < BLOCK_SIZE;
|
||||
// TODO: should we send offsets/payloads to
|
||||
// .pay...? seems wasteful (have to store extra
|
||||
// vLong for low (< BLOCK_SIZE) DF terms = vast vast
|
||||
// majority)
|
||||
|
||||
// vInt encode the remaining positions/payloads/offsets:
|
||||
int lastPayloadLength = -1; // force first payload length to be written
|
||||
int lastOffsetLength = -1; // force first offset length to be written
|
||||
int payloadBytesReadUpto = 0;
|
||||
for (int i = 0; i < posBufferUpto; i++) {
|
||||
final int posDelta = posDeltaBuffer[i];
|
||||
if (writePayloads) {
|
||||
final int payloadLength = payloadLengthBuffer[i];
|
||||
if (payloadLength != lastPayloadLength) {
|
||||
lastPayloadLength = payloadLength;
|
||||
posOut.writeVInt((posDelta << 1) | 1);
|
||||
posOut.writeVInt(payloadLength);
|
||||
} else {
|
||||
posOut.writeVInt(posDelta << 1);
|
||||
}
|
||||
|
||||
if (payloadLength != 0) {
|
||||
posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength);
|
||||
payloadBytesReadUpto += payloadLength;
|
||||
}
|
||||
} else {
|
||||
posOut.writeVInt(posDelta);
|
||||
}
|
||||
|
||||
if (writeOffsets) {
|
||||
int delta = offsetStartDeltaBuffer[i];
|
||||
int length = offsetLengthBuffer[i];
|
||||
if (length == lastOffsetLength) {
|
||||
posOut.writeVInt(delta << 1);
|
||||
} else {
|
||||
posOut.writeVInt(delta << 1 | 1);
|
||||
posOut.writeVInt(length);
|
||||
lastOffsetLength = length;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (writePayloads) {
|
||||
assert payloadBytesReadUpto == payloadByteUpto;
|
||||
payloadByteUpto = 0;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
lastPosBlockOffset = -1;
|
||||
}
|
||||
|
||||
state.docStartFP = docStartFP;
|
||||
state.posStartFP = posStartFP;
|
||||
state.payStartFP = payStartFP;
|
||||
state.singletonDocID = singletonDocID;
|
||||
|
||||
state.lastPosBlockOffset = lastPosBlockOffset;
|
||||
docBufferUpto = 0;
|
||||
posBufferUpto = 0;
|
||||
lastDocID = -1;
|
||||
docCount = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void encodeTerm(
|
||||
DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute)
|
||||
throws IOException {
|
||||
IntBlockTermState state = (IntBlockTermState) _state;
|
||||
if (absolute) {
|
||||
lastState = EMPTY_STATE;
|
||||
assert lastState.docStartFP == 0;
|
||||
}
|
||||
|
||||
if (lastState.singletonDocID != -1
|
||||
&& state.singletonDocID != -1
|
||||
&& state.docStartFP == lastState.docStartFP) {
|
||||
// With runs of rare values such as ID fields, the increment of pointers in the docs file is
|
||||
// often 0.
|
||||
// Furthermore some ID schemes like auto-increment IDs or Flake IDs are monotonic, so we
|
||||
// encode the delta
|
||||
// between consecutive doc IDs to save space.
|
||||
final long delta = (long) state.singletonDocID - lastState.singletonDocID;
|
||||
out.writeVLong((BitUtil.zigZagEncode(delta) << 1) | 0x01);
|
||||
} else {
|
||||
out.writeVLong((state.docStartFP - lastState.docStartFP) << 1);
|
||||
if (state.singletonDocID != -1) {
|
||||
out.writeVInt(state.singletonDocID);
|
||||
}
|
||||
}
|
||||
|
||||
if (writePositions) {
|
||||
out.writeVLong(state.posStartFP - lastState.posStartFP);
|
||||
if (writePayloads || writeOffsets) {
|
||||
out.writeVLong(state.payStartFP - lastState.payStartFP);
|
||||
}
|
||||
}
|
||||
if (writePositions) {
|
||||
if (state.lastPosBlockOffset != -1) {
|
||||
out.writeVLong(state.lastPosBlockOffset);
|
||||
}
|
||||
}
|
||||
lastState = state;
|
||||
}
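// Hedged reading-side sketch (illustrative only, not part of this patch): how the first vlong
// written by encodeTerm above could be interpreted. lastDocStartFP stands for the reader's
// previously decoded state; the real decoder lives in the postings reader.
static long decodeDocStartFPSketch(DataInput in, long lastDocStartFP) throws IOException {
  long code = in.readVLong();
  if ((code & 1) == 1) {
    // Consecutive singleton-doc terms sharing the same doc file pointer: the low bit flags the
    // case and the remaining bits carry a zig-zag encoded delta between the singleton doc IDs.
    return lastDocStartFP;
  }
  // Otherwise the remaining bits are the doc file pointer delta; for docFreq == 1 terms the
  // singleton doc ID follows as a plain vint.
  return lastDocStartFP + (code >>> 1);
}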
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
// TODO: add a finish() at least to PushBase? DV too...?
|
||||
boolean success = false;
|
||||
try {
|
||||
if (docOut != null) {
|
||||
CodecUtil.writeFooter(docOut);
|
||||
}
|
||||
if (posOut != null) {
|
||||
CodecUtil.writeFooter(posOut);
|
||||
}
|
||||
if (payOut != null) {
|
||||
CodecUtil.writeFooter(payOut);
|
||||
}
|
||||
if (metaOut != null) {
|
||||
metaOut.writeInt(maxNumImpactsAtLevel0);
|
||||
metaOut.writeInt(maxImpactNumBytesAtLevel0);
|
||||
metaOut.writeInt(maxNumImpactsAtLevel1);
|
||||
metaOut.writeInt(maxImpactNumBytesAtLevel1);
|
||||
metaOut.writeLong(docOut.getFilePointer());
|
||||
if (posOut != null) {
|
||||
metaOut.writeLong(posOut.getFilePointer());
|
||||
if (payOut != null) {
|
||||
metaOut.writeLong(payOut.getFilePointer());
|
||||
}
|
||||
}
|
||||
CodecUtil.writeFooter(metaOut);
|
||||
}
|
||||
success = true;
|
||||
} finally {
|
||||
if (success) {
|
||||
IOUtils.close(metaOut, docOut, posOut, payOut);
|
||||
} else {
|
||||
IOUtils.closeWhileHandlingException(metaOut, docOut, posOut, payOut);
|
||||
}
|
||||
metaOut = docOut = posOut = payOut = null;
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,134 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene101;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.LongHeap;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/** Utility class to encode sequences of 128 small positive integers. */
|
||||
final class PForUtil {
|
||||
|
||||
private static final int MAX_EXCEPTIONS = 7;
|
||||
|
||||
static boolean allEqual(int[] l) {
|
||||
for (int i = 1; i < ForUtil.BLOCK_SIZE; ++i) {
|
||||
if (l[i] != l[0]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private final ForUtil forUtil = new ForUtil();
|
||||
|
||||
static {
|
||||
assert ForUtil.BLOCK_SIZE <= 256 : "blocksize must fit in one byte. got " + ForUtil.BLOCK_SIZE;
|
||||
}
|
||||
|
||||
/** Encode 128 integers from {@code ints} into {@code out}. */
|
||||
void encode(int[] ints, DataOutput out) throws IOException {
|
||||
// Determine the top MAX_EXCEPTIONS + 1 values
|
||||
final LongHeap top = new LongHeap(MAX_EXCEPTIONS + 1);
|
||||
for (int i = 0; i <= MAX_EXCEPTIONS; ++i) {
|
||||
top.push(ints[i]);
|
||||
}
|
||||
long topValue = top.top();
|
||||
for (int i = MAX_EXCEPTIONS + 1; i < ForUtil.BLOCK_SIZE; ++i) {
|
||||
if (ints[i] > topValue) {
|
||||
topValue = top.updateTop(ints[i]);
|
||||
}
|
||||
}
|
||||
|
||||
long max = 0L;
|
||||
for (int i = 1; i <= top.size(); ++i) {
|
||||
max = Math.max(max, top.get(i));
|
||||
}
|
||||
|
||||
final int maxBitsRequired = PackedInts.bitsRequired(max);
|
||||
// We store the patch on a byte, so we can't decrease the number of bits required by more than 8
|
||||
final int patchedBitsRequired =
|
||||
Math.max(PackedInts.bitsRequired(topValue), maxBitsRequired - 8);
|
||||
int numExceptions = 0;
|
||||
final long maxUnpatchedValue = (1L << patchedBitsRequired) - 1;
|
||||
for (int i = 2; i <= top.size(); ++i) {
|
||||
if (top.get(i) > maxUnpatchedValue) {
|
||||
numExceptions++;
|
||||
}
|
||||
}
|
||||
final byte[] exceptions = new byte[numExceptions * 2];
|
||||
if (numExceptions > 0) {
|
||||
int exceptionCount = 0;
|
||||
for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
|
||||
if (ints[i] > maxUnpatchedValue) {
|
||||
exceptions[exceptionCount * 2] = (byte) i;
|
||||
exceptions[exceptionCount * 2 + 1] = (byte) (ints[i] >>> patchedBitsRequired);
|
||||
ints[i] &= maxUnpatchedValue;
|
||||
exceptionCount++;
|
||||
}
|
||||
}
|
||||
assert exceptionCount == numExceptions : exceptionCount + " " + numExceptions;
|
||||
}
|
||||
|
||||
if (allEqual(ints) && maxBitsRequired <= 8) {
|
||||
for (int i = 0; i < numExceptions; ++i) {
|
||||
exceptions[2 * i + 1] =
|
||||
(byte) (Byte.toUnsignedLong(exceptions[2 * i + 1]) << patchedBitsRequired);
|
||||
}
|
||||
out.writeByte((byte) (numExceptions << 5));
|
||||
out.writeVInt(ints[0]);
|
||||
} else {
|
||||
final int token = (numExceptions << 5) | patchedBitsRequired;
|
||||
out.writeByte((byte) token);
|
||||
forUtil.encode(ints, patchedBitsRequired, out);
|
||||
}
|
||||
out.writeBytes(exceptions, exceptions.length);
|
||||
}
|
||||
|
||||
/** Decode 128 integers into {@code ints}. */
|
||||
void decode(PostingDecodingUtil pdu, int[] ints) throws IOException {
|
||||
var in = pdu.in;
|
||||
final int token = Byte.toUnsignedInt(in.readByte());
|
||||
final int bitsPerValue = token & 0x1f;
|
||||
if (bitsPerValue == 0) {
|
||||
Arrays.fill(ints, 0, ForUtil.BLOCK_SIZE, in.readVInt());
|
||||
} else {
|
||||
forUtil.decode(bitsPerValue, pdu, ints);
|
||||
}
|
||||
final int numExceptions = token >>> 5;
|
||||
for (int i = 0; i < numExceptions; ++i) {
|
||||
ints[Byte.toUnsignedInt(in.readByte())] |= Byte.toUnsignedLong(in.readByte()) << bitsPerValue;
|
||||
}
|
||||
}
|
||||
|
||||
/** Skip 128 integers. */
|
||||
static void skip(DataInput in) throws IOException {
|
||||
final int token = Byte.toUnsignedInt(in.readByte());
|
||||
final int bitsPerValue = token & 0x1f;
|
||||
final int numExceptions = token >>> 5;
|
||||
if (bitsPerValue == 0) {
|
||||
in.readVLong();
|
||||
in.skipBytes((numExceptions << 1));
|
||||
} else {
|
||||
in.skipBytes(ForUtil.numBytes(bitsPerValue) + (numExceptions << 1));
|
||||
}
|
||||
}
|
||||
}
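A hedged reading aid for the block layout produced by encode above (not part of the patch, and it assumes same-package access to the package-private ForUtil): a single header byte packs the exception count into its high 3 bits and the bit width into its low 5 bits, followed by either a vint (all-equal case) or a ForUtil-packed payload, then two bytes per exception. skip relies on exactly this arithmetic:

final class PForUtilLayoutSketch {
  /** Bytes occupied by one encoded block after the header byte, for the bit-packed case. */
  static int packedBodyBytes(int token) {
    int bitsPerValue = token & 0x1f; // low 5 bits; 0 would mean "all values equal" (vint body)
    int numExceptions = token >>> 5; // high 3 bits, at most MAX_EXCEPTIONS == 7
    assert bitsPerValue > 0;
    return ForUtil.numBytes(bitsPerValue) + 2 * numExceptions;
  }
}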
|
|
@@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene912;
|
||||
package org.apache.lucene.codecs.lucene101;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
|
||||
|
@@ -42,16 +42,16 @@ public final class PostingIndexInput {
|
|||
this.postingDecodingUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(in);
|
||||
}
|
||||
|
||||
/** Decode 128 integers stored on {@code bitsPerValues} bits per value into {@code longs}. */
|
||||
public void decode(int bitsPerValue, long[] longs) throws IOException {
|
||||
forUtil.decode(bitsPerValue, postingDecodingUtil, longs);
|
||||
/** Decode 128 integers stored on {@code bitsPerValues} bits per value into {@code ints}. */
|
||||
public void decode(int bitsPerValue, int[] ints) throws IOException {
|
||||
forUtil.decode(bitsPerValue, postingDecodingUtil, ints);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode 128 integers stored on {@code bitsPerValues} bits per value, compute their prefix sum,
|
||||
* and store results into {@code longs}.
|
||||
* and store results into {@code ints}.
|
||||
*/
|
||||
public void decodeAndPrefixSum(int bitsPerValue, long base, long[] longs) throws IOException {
|
||||
forDeltaUtil.decodeAndPrefixSum(bitsPerValue, postingDecodingUtil, base, longs);
|
||||
public void decodeAndPrefixSum(int bitsPerValue, int base, int[] ints) throws IOException {
|
||||
forDeltaUtil.decodeAndPrefixSum(bitsPerValue, postingDecodingUtil, base, ints);
|
||||
}
|
||||
}
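A hedged usage sketch of the int[]-based API above (illustrative only; the PostingIndexInput instance, the bitsPerValue read from the block header, and the IOException import are assumed to come from the surrounding postings reader):

final class PostingBlockReadSketch {
  static void readOneDocBlock(PostingIndexInput postings, int bitsPerValue, int prevBlockLastDocID)
      throws IOException {
    int[] docBuffer = new int[ForUtil.BLOCK_SIZE];
    // Decode 128 packed doc-ID deltas and prefix-sum them into absolute doc IDs in one pass.
    postings.decodeAndPrefixSum(bitsPerValue, prevBlockLastDocID, docBuffer);
  }
}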
|
|
@@ -0,0 +1,74 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene101;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.GroupVIntUtil;
|
||||
|
||||
/** Utility class to encode/decode postings block. */
|
||||
final class PostingsUtil {
|
||||
|
||||
/**
|
||||
* Read values that have been written using variable-length encoding and group-varint encoding
|
||||
* instead of bit-packing.
|
||||
*/
|
||||
static void readVIntBlock(
|
||||
IndexInput docIn,
|
||||
int[] docBuffer,
|
||||
int[] freqBuffer,
|
||||
int num,
|
||||
boolean indexHasFreq,
|
||||
boolean decodeFreq)
|
||||
throws IOException {
|
||||
GroupVIntUtil.readGroupVInts(docIn, docBuffer, num);
|
||||
if (indexHasFreq && decodeFreq) {
|
||||
for (int i = 0; i < num; ++i) {
|
||||
freqBuffer[i] = docBuffer[i] & 0x01;
|
||||
docBuffer[i] >>>= 1;
|
||||
if (freqBuffer[i] == 0) {
|
||||
freqBuffer[i] = docIn.readVInt();
|
||||
}
|
||||
}
|
||||
} else if (indexHasFreq) {
|
||||
for (int i = 0; i < num; ++i) {
|
||||
docBuffer[i] >>>= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Write freq buffer with variable-length encoding and doc buffer with group-varint encoding. */
|
||||
static void writeVIntBlock(
|
||||
DataOutput docOut, int[] docBuffer, int[] freqBuffer, int num, boolean writeFreqs)
|
||||
throws IOException {
|
||||
if (writeFreqs) {
|
||||
for (int i = 0; i < num; i++) {
|
||||
docBuffer[i] = (docBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0);
|
||||
}
|
||||
}
|
||||
docOut.writeGroupVInts(docBuffer, num);
|
||||
if (writeFreqs) {
|
||||
for (int i = 0; i < num; i++) {
|
||||
final int freq = freqBuffer[i];
|
||||
if (freq != 1) {
|
||||
docOut.writeVInt(freq);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
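A hedged round-trip sketch for the tail-block helpers above (illustrative only, not part of the patch; it assumes same-package access to the package-private PostingsUtil, the usual store/IOException imports, and uses ByteBuffersDataOutput/ByteBuffersIndexInput purely as a convenient in-memory harness):

final class VIntBlockRoundTripSketch {
  static void roundTrip() throws IOException {
    int[] docDeltas = {3, 1, 7}; // gaps between consecutive doc IDs
    int[] freqs = {1, 4, 1};     // a freq of 1 is folded into the low bit of its doc delta
    ByteBuffersDataOutput out = new ByteBuffersDataOutput();
    PostingsUtil.writeVIntBlock(out, docDeltas.clone(), freqs, 3, true); // clone: the buffer is shifted in place
    IndexInput in = new ByteBuffersIndexInput(out.toDataInput(), "sketch");
    int[] docBuffer = new int[3];
    int[] freqBuffer = new int[3];
    PostingsUtil.readVIntBlock(in, docBuffer, freqBuffer, 3, true, true);
    // docBuffer is {3, 1, 7} again and freqBuffer is {1, 4, 1}; only the freq of 4 cost an extra vint.
  }
}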
|
|
@@ -0,0 +1,377 @@
|
|||
#! /usr/bin/env python
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from math import gcd
|
||||
|
||||
"""Code generation for ForDeltaUtil.java"""
|
||||
|
||||
MAX_SPECIALIZED_BITS_PER_VALUE = 24
|
||||
OUTPUT_FILE = "ForDeltaUtil.java"
|
||||
PRIMITIVE_SIZE = [8, 16, 32]
|
||||
HEADER = """// This file has been automatically generated, DO NOT EDIT
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene101;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene101.ForUtil.*;
|
||||
|
||||
/**
|
||||
* Inspired from https://fulmicoton.com/posts/bitpacking/
|
||||
* Encodes multiple integers in a Java int to get SIMD-like speedups.
|
||||
* If bitsPerValue <= 4 then we pack 4 ints per Java int
|
||||
* else if bitsPerValue <= 11 we pack 2 ints per Java int
|
||||
* else we use scalar operations.
|
||||
*/
|
||||
public final class ForDeltaUtil {
|
||||
|
||||
private static final int HALF_BLOCK_SIZE = BLOCK_SIZE / 2;
|
||||
private static final int ONE_BLOCK_SIZE_FOURTH = BLOCK_SIZE / 4;
|
||||
private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2;
|
||||
private static final int THREE_BLOCK_SIZE_FOURTHS = 3 * BLOCK_SIZE / 4;
|
||||
|
||||
// IDENTITY_PLUS_ONE[i] == i+1
|
||||
private static final int[] IDENTITY_PLUS_ONE = new int[ForUtil.BLOCK_SIZE];
|
||||
|
||||
static {
|
||||
for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
|
||||
IDENTITY_PLUS_ONE[i] = i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
private static void prefixSumOfOnes(int[] arr, int base) {
|
||||
System.arraycopy(IDENTITY_PLUS_ONE, 0, arr, 0, ForUtil.BLOCK_SIZE);
|
||||
// This loop gets auto-vectorized
|
||||
for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
|
||||
arr[i] += base;
|
||||
}
|
||||
}
|
||||
|
||||
private static void prefixSum8(int[] arr, int base) {
|
||||
// When the number of bits per value is 4 or less, we can sum up all values in a block without
// risking overflowing an 8-bit integer. This allows computing the prefix sum by summing up 4
// values at once.
|
||||
innerPrefixSum8(arr);
|
||||
expand8(arr);
|
||||
final int l0 = base;
|
||||
final int l1 = l0 + arr[ONE_BLOCK_SIZE_FOURTH - 1];
|
||||
final int l2 = l1 + arr[TWO_BLOCK_SIZE_FOURTHS - 1];
|
||||
final int l3 = l2 + arr[THREE_BLOCK_SIZE_FOURTHS - 1];
|
||||
|
||||
for (int i = 0; i < ONE_BLOCK_SIZE_FOURTH; ++i) {
|
||||
arr[i] += l0;
|
||||
arr[ONE_BLOCK_SIZE_FOURTH + i] += l1;
|
||||
arr[TWO_BLOCK_SIZE_FOURTHS + i] += l2;
|
||||
arr[THREE_BLOCK_SIZE_FOURTHS + i] += l3;
|
||||
}
|
||||
}
|
||||
|
||||
private static void prefixSum16(int[] arr, int base) {
|
||||
// When the number of bits per value is 11 or less, we can sum up all values in a block without
// risking overflowing a 16-bit integer. This allows computing the prefix sum by summing up 2
// values at once.
|
||||
innerPrefixSum16(arr);
|
||||
expand16(arr);
|
||||
final int l0 = base;
|
||||
final int l1 = base + arr[HALF_BLOCK_SIZE - 1];
|
||||
for (int i = 0; i < HALF_BLOCK_SIZE; ++i) {
|
||||
arr[i] += l0;
|
||||
arr[HALF_BLOCK_SIZE + i] += l1;
|
||||
}
|
||||
}
|
||||
|
||||
private static void prefixSum32(int[] arr, int base) {
|
||||
arr[0] += base;
|
||||
for (int i = 1; i < BLOCK_SIZE; ++i) {
|
||||
arr[i] += arr[i-1];
|
||||
}
|
||||
}
|
||||
|
||||
// For some reason unrolling seems to help
|
||||
private static void innerPrefixSum8(int[] arr) {
|
||||
arr[1] += arr[0];
|
||||
arr[2] += arr[1];
|
||||
arr[3] += arr[2];
|
||||
arr[4] += arr[3];
|
||||
arr[5] += arr[4];
|
||||
arr[6] += arr[5];
|
||||
arr[7] += arr[6];
|
||||
arr[8] += arr[7];
|
||||
arr[9] += arr[8];
|
||||
arr[10] += arr[9];
|
||||
arr[11] += arr[10];
|
||||
arr[12] += arr[11];
|
||||
arr[13] += arr[12];
|
||||
arr[14] += arr[13];
|
||||
arr[15] += arr[14];
|
||||
arr[16] += arr[15];
|
||||
arr[17] += arr[16];
|
||||
arr[18] += arr[17];
|
||||
arr[19] += arr[18];
|
||||
arr[20] += arr[19];
|
||||
arr[21] += arr[20];
|
||||
arr[22] += arr[21];
|
||||
arr[23] += arr[22];
|
||||
arr[24] += arr[23];
|
||||
arr[25] += arr[24];
|
||||
arr[26] += arr[25];
|
||||
arr[27] += arr[26];
|
||||
arr[28] += arr[27];
|
||||
arr[29] += arr[28];
|
||||
arr[30] += arr[29];
|
||||
arr[31] += arr[30];
|
||||
}
|
||||
|
||||
// For some reason unrolling seems to help
|
||||
private static void innerPrefixSum16(int[] arr) {
|
||||
arr[1] += arr[0];
|
||||
arr[2] += arr[1];
|
||||
arr[3] += arr[2];
|
||||
arr[4] += arr[3];
|
||||
arr[5] += arr[4];
|
||||
arr[6] += arr[5];
|
||||
arr[7] += arr[6];
|
||||
arr[8] += arr[7];
|
||||
arr[9] += arr[8];
|
||||
arr[10] += arr[9];
|
||||
arr[11] += arr[10];
|
||||
arr[12] += arr[11];
|
||||
arr[13] += arr[12];
|
||||
arr[14] += arr[13];
|
||||
arr[15] += arr[14];
|
||||
arr[16] += arr[15];
|
||||
arr[17] += arr[16];
|
||||
arr[18] += arr[17];
|
||||
arr[19] += arr[18];
|
||||
arr[20] += arr[19];
|
||||
arr[21] += arr[20];
|
||||
arr[22] += arr[21];
|
||||
arr[23] += arr[22];
|
||||
arr[24] += arr[23];
|
||||
arr[25] += arr[24];
|
||||
arr[26] += arr[25];
|
||||
arr[27] += arr[26];
|
||||
arr[28] += arr[27];
|
||||
arr[29] += arr[28];
|
||||
arr[30] += arr[29];
|
||||
arr[31] += arr[30];
|
||||
arr[32] += arr[31];
|
||||
arr[33] += arr[32];
|
||||
arr[34] += arr[33];
|
||||
arr[35] += arr[34];
|
||||
arr[36] += arr[35];
|
||||
arr[37] += arr[36];
|
||||
arr[38] += arr[37];
|
||||
arr[39] += arr[38];
|
||||
arr[40] += arr[39];
|
||||
arr[41] += arr[40];
|
||||
arr[42] += arr[41];
|
||||
arr[43] += arr[42];
|
||||
arr[44] += arr[43];
|
||||
arr[45] += arr[44];
|
||||
arr[46] += arr[45];
|
||||
arr[47] += arr[46];
|
||||
arr[48] += arr[47];
|
||||
arr[49] += arr[48];
|
||||
arr[50] += arr[49];
|
||||
arr[51] += arr[50];
|
||||
arr[52] += arr[51];
|
||||
arr[53] += arr[52];
|
||||
arr[54] += arr[53];
|
||||
arr[55] += arr[54];
|
||||
arr[56] += arr[55];
|
||||
arr[57] += arr[56];
|
||||
arr[58] += arr[57];
|
||||
arr[59] += arr[58];
|
||||
arr[60] += arr[59];
|
||||
arr[61] += arr[60];
|
||||
arr[62] += arr[61];
|
||||
arr[63] += arr[62];
|
||||
}
|
||||
|
||||
private final int[] tmp = new int[BLOCK_SIZE];
|
||||
|
||||
/**
|
||||
* Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code
|
||||
* ints} are expected to be deltas between consecutive values.
|
||||
*/
|
||||
void encodeDeltas(int[] ints, DataOutput out) throws IOException {
|
||||
if (ints[0] == 1 && PForUtil.allEqual(ints)) { // happens with very dense postings
|
||||
out.writeByte((byte) 0);
|
||||
} else {
|
||||
int or = 0;
|
||||
for (int l : ints) {
|
||||
or |= l;
|
||||
}
|
||||
assert or != 0;
|
||||
final int bitsPerValue = PackedInts.bitsRequired(or);
|
||||
out.writeByte((byte) bitsPerValue);
|
||||
|
||||
final int primitiveSize;
|
||||
if (bitsPerValue <= 3) {
|
||||
primitiveSize = 8;
|
||||
collapse8(ints);
|
||||
} else if (bitsPerValue <= 10) {
|
||||
primitiveSize = 16;
|
||||
collapse16(ints);
|
||||
} else {
|
||||
primitiveSize = 32;
|
||||
}
|
||||
encode(ints, bitsPerValue, primitiveSize, out, tmp);
|
||||
}
|
||||
}
|
||||
|
||||
/** Decode deltas, compute the prefix sum and add {@code base} to all decoded ints. */
|
||||
void decodeAndPrefixSum(PostingDecodingUtil pdu, int base, int[] ints) throws IOException {
|
||||
final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte());
|
||||
if (bitsPerValue == 0) {
|
||||
prefixSumOfOnes(ints, base);
|
||||
} else {
|
||||
decodeAndPrefixSum(bitsPerValue, pdu, base, ints);
|
||||
}
|
||||
}
|
||||
|
||||
"""
|
||||
|
||||
def primitive_size_for_bpv(bpv):
|
||||
if bpv <= 3:
|
||||
# If we have 4 bits per value or less then we can compute the prefix sum of 32 ints that store 4 8-bit values each without overflowing.
|
||||
return 8
|
||||
elif bpv <= 10:
|
||||
# If we have 10 bits per value or less then we can compute the prefix sum of 64 ints that store 2 16-bit values each without overflowing.
|
||||
return 16
|
||||
else:
|
||||
# No risk of overflow with 32 bits per value
|
||||
return 32
|
||||
|
||||
def next_primitive(bpv):
|
||||
if bpv <= 8:
|
||||
return 8
|
||||
elif bpv <= 16:
|
||||
return 16
|
||||
else:
|
||||
return 32
|
||||
|
||||
def writeRemainder(bpv, next_primitive, remaining_bits_per_int, o, num_values, f):
|
||||
iteration = 1
|
||||
num_ints = bpv * num_values / remaining_bits_per_int
|
||||
while num_ints % 2 == 0 and num_values % 2 == 0:
|
||||
num_ints /= 2
|
||||
num_values /= 2
|
||||
iteration *= 2
|
||||
f.write(' for (int iter = 0, tmpIdx = 0, intsIdx = %d; iter < %d; ++iter, tmpIdx += %d, intsIdx += %d) {\n' %(o, iteration, num_ints, num_values))
|
||||
i = 0
|
||||
remaining_bits = 0
|
||||
tmp_idx = 0
|
||||
for i in range(int(num_values)):
|
||||
b = bpv
|
||||
if remaining_bits == 0:
|
||||
b -= remaining_bits_per_int
|
||||
f.write(' int l%d = tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b))
|
||||
else:
|
||||
b -= remaining_bits
|
||||
f.write(' int l%d = (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits, b))
|
||||
tmp_idx += 1
|
||||
while b >= remaining_bits_per_int:
|
||||
b -= remaining_bits_per_int
|
||||
f.write(' l%d |= tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b))
|
||||
tmp_idx += 1
|
||||
if b > 0:
|
||||
f.write(' l%d |= (tmp[tmpIdx + %d] >>> %d) & MASK%d_%d;\n' %(i, tmp_idx, remaining_bits_per_int-b, next_primitive, b))
|
||||
remaining_bits = remaining_bits_per_int-b
|
||||
f.write(' ints[intsIdx + %d] = l%d;\n' %(i, i))
|
||||
f.write(' }\n')
|
||||
|
||||
def writeDecode(bpv, f):
|
||||
next_primitive = primitive_size_for_bpv(bpv)
|
||||
if next_primitive % bpv == 0:
|
||||
f.write(' private static void decode%dTo%d(PostingDecodingUtil pdu, int[] ints) throws IOException {\n' %(bpv, next_primitive))
|
||||
else:
|
||||
f.write(' private static void decode%dTo%d(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {\n' %(bpv, next_primitive))
|
||||
if bpv == next_primitive:
|
||||
f.write(' pdu.in.readInts(ints, 0, %d);\n' %(bpv*4))
|
||||
else:
|
||||
num_values_per_int = 32 / next_primitive
|
||||
remaining_bits = next_primitive % bpv
|
||||
num_iters = (next_primitive - 1) // bpv
|
||||
o = 4 * bpv * num_iters
|
||||
if remaining_bits == 0:
|
||||
f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, ints, %d, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
|
||||
else:
|
||||
f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
|
||||
writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_int - o, f)
|
||||
f.write(' }\n')
|
||||
|
||||
if __name__ == '__main__':
|
||||
f = open(OUTPUT_FILE, 'w')
|
||||
f.write(HEADER)
|
||||
f.write("""
|
||||
/**
|
||||
* Delta-decode 128 integers into {@code ints}.
|
||||
*/
|
||||
void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, int base, int[] ints) throws IOException {
|
||||
switch (bitsPerValue) {
|
||||
""")
|
||||
for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
|
||||
primitive_size = primitive_size_for_bpv(bpv)
|
||||
f.write(' case %d:\n' %bpv)
|
||||
if next_primitive(bpv) == primitive_size:
|
||||
if primitive_size % bpv == 0:
|
||||
f.write(' decode%d(pdu, ints);\n' %bpv)
|
||||
else:
|
||||
f.write(' decode%d(pdu, tmp, ints);\n' %bpv)
|
||||
else:
|
||||
if primitive_size % bpv == 0:
|
||||
f.write(' decode%dTo%d(pdu, ints);\n' %(bpv, primitive_size))
|
||||
else:
|
||||
f.write(' decode%dTo%d(pdu, tmp, ints);\n' %(bpv, primitive_size))
|
||||
f.write(' prefixSum%d(ints, base);\n' %primitive_size)
|
||||
f.write(' break;\n')
|
||||
f.write(' default:\n')
|
||||
f.write(' decodeSlow(bitsPerValue, pdu, tmp, ints);\n')
|
||||
f.write(' prefixSum32(ints, base);\n')
|
||||
f.write(' break;\n')
|
||||
f.write(' }\n')
|
||||
f.write(' }\n')
|
||||
|
||||
f.write('\n')
|
||||
for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
|
||||
if next_primitive(bpv) != primitive_size_for_bpv(bpv):
|
||||
writeDecode(bpv, f)
|
||||
if bpv < MAX_SPECIALIZED_BITS_PER_VALUE:
|
||||
f.write('\n')
|
||||
|
||||
f.write('}\n')
|
|
@@ -0,0 +1,327 @@
|
|||
#! /usr/bin/env python
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from math import gcd
|
||||
|
||||
"""Code generation for ForUtil.java"""
|
||||
|
||||
MAX_SPECIALIZED_BITS_PER_VALUE = 24
|
||||
OUTPUT_FILE = "ForUtil.java"
|
||||
PRIMITIVE_SIZE = [8, 16, 32]
|
||||
HEADER = """// This file has been automatically generated, DO NOT EDIT
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene101;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
|
||||
/**
|
||||
* Inspired from https://fulmicoton.com/posts/bitpacking/
|
||||
* Encodes multiple integers in a Java int to get SIMD-like speedups.
|
||||
* If bitsPerValue <= 8 then we pack 4 ints per Java int
|
||||
* else if bitsPerValue <= 16 we pack 2 ints per Java int
|
||||
* else we do scalar operations.
|
||||
*/
|
||||
public final class ForUtil {
|
||||
|
||||
public static final int BLOCK_SIZE = 128;
|
||||
static final int BLOCK_SIZE_LOG2 = 7;
|
||||
|
||||
static int expandMask16(int mask16) {
|
||||
return mask16 | (mask16 << 16);
|
||||
}
|
||||
|
||||
static int expandMask8(int mask8) {
|
||||
return expandMask16(mask8 | (mask8 << 8));
|
||||
}
|
||||
|
||||
static int mask32(int bitsPerValue) {
|
||||
return (1 << bitsPerValue) - 1;
|
||||
}
|
||||
|
||||
static int mask16(int bitsPerValue) {
|
||||
return expandMask16((1 << bitsPerValue) - 1);
|
||||
}
|
||||
|
||||
static int mask8(int bitsPerValue) {
|
||||
return expandMask8((1 << bitsPerValue) - 1);
|
||||
}
|
||||
|
||||
static void expand8(int[] arr) {
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
int l = arr[i];
|
||||
arr[i] = (l >>> 24) & 0xFF;
|
||||
arr[32 + i] = (l >>> 16) & 0xFF;
|
||||
arr[64 + i] = (l >>> 8) & 0xFF;
|
||||
arr[96 + i] = l & 0xFF;
|
||||
}
|
||||
}
|
||||
|
||||
static void collapse8(int[] arr) {
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
arr[i] =
|
||||
(arr[i] << 24)
|
||||
| (arr[32 + i] << 16)
|
||||
| (arr[64 + i] << 8)
|
||||
| arr[96 + i];
|
||||
}
|
||||
}
|
||||
|
||||
static void expand16(int[] arr) {
|
||||
for (int i = 0; i < 64; ++i) {
|
||||
int l = arr[i];
|
||||
arr[i] = (l >>> 16) & 0xFFFF;
|
||||
arr[64 + i] = l & 0xFFFF;
|
||||
}
|
||||
}
|
||||
|
||||
static void collapse16(int[] arr) {
|
||||
for (int i = 0; i < 64; ++i) {
|
||||
arr[i] = (arr[i] << 16) | arr[64 + i];
|
||||
}
|
||||
}
|
||||
|
||||
private final int[] tmp = new int[BLOCK_SIZE];
|
||||
|
||||
/** Encode 128 integers from {@code ints} into {@code out}. */
|
||||
void encode(int[] ints, int bitsPerValue, DataOutput out) throws IOException {
|
||||
final int nextPrimitive;
|
||||
if (bitsPerValue <= 8) {
|
||||
nextPrimitive = 8;
|
||||
collapse8(ints);
|
||||
} else if (bitsPerValue <= 16) {
|
||||
nextPrimitive = 16;
|
||||
collapse16(ints);
|
||||
} else {
|
||||
nextPrimitive = 32;
|
||||
}
|
||||
encode(ints, bitsPerValue, nextPrimitive, out, tmp);
|
||||
}
|
||||
|
||||
static void encode(int[] ints, int bitsPerValue, int primitiveSize, DataOutput out, int[] tmp) throws IOException {
|
||||
final int numInts = BLOCK_SIZE * primitiveSize / Integer.SIZE;
|
||||
|
||||
final int numIntsPerShift = bitsPerValue * 4;
|
||||
int idx = 0;
|
||||
int shift = primitiveSize - bitsPerValue;
|
||||
for (int i = 0; i < numIntsPerShift; ++i) {
|
||||
tmp[i] = ints[idx++] << shift;
|
||||
}
|
||||
for (shift = shift - bitsPerValue; shift >= 0; shift -= bitsPerValue) {
|
||||
for (int i = 0; i < numIntsPerShift; ++i) {
|
||||
tmp[i] |= ints[idx++] << shift;
|
||||
}
|
||||
}
|
||||
|
||||
final int remainingBitsPerInt = shift + bitsPerValue;
|
||||
final int maskRemainingBitsPerInt;
|
||||
if (primitiveSize == 8) {
|
||||
maskRemainingBitsPerInt = MASKS8[remainingBitsPerInt];
|
||||
} else if (primitiveSize == 16) {
|
||||
maskRemainingBitsPerInt = MASKS16[remainingBitsPerInt];
|
||||
} else {
|
||||
maskRemainingBitsPerInt = MASKS32[remainingBitsPerInt];
|
||||
}
|
||||
|
||||
int tmpIdx = 0;
|
||||
int remainingBitsPerValue = bitsPerValue;
|
||||
while (idx < numInts) {
|
||||
if (remainingBitsPerValue >= remainingBitsPerInt) {
|
||||
remainingBitsPerValue -= remainingBitsPerInt;
|
||||
tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & maskRemainingBitsPerInt;
|
||||
if (remainingBitsPerValue == 0) {
|
||||
idx++;
|
||||
remainingBitsPerValue = bitsPerValue;
|
||||
}
|
||||
} else {
|
||||
final int mask1, mask2;
|
||||
if (primitiveSize == 8) {
|
||||
mask1 = MASKS8[remainingBitsPerValue];
|
||||
mask2 = MASKS8[remainingBitsPerInt - remainingBitsPerValue];
|
||||
} else if (primitiveSize == 16) {
|
||||
mask1 = MASKS16[remainingBitsPerValue];
|
||||
mask2 = MASKS16[remainingBitsPerInt - remainingBitsPerValue];
|
||||
} else {
|
||||
mask1 = MASKS32[remainingBitsPerValue];
|
||||
mask2 = MASKS32[remainingBitsPerInt - remainingBitsPerValue];
|
||||
}
|
||||
tmp[tmpIdx] |= (ints[idx++] & mask1) << (remainingBitsPerInt - remainingBitsPerValue);
|
||||
remainingBitsPerValue = bitsPerValue - remainingBitsPerInt + remainingBitsPerValue;
|
||||
tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & mask2;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < numIntsPerShift; ++i) {
|
||||
out.writeInt(tmp[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/** Number of bytes required to encode 128 integers of {@code bitsPerValue} bits per value. */
|
||||
static int numBytes(int bitsPerValue) {
|
||||
return bitsPerValue << (BLOCK_SIZE_LOG2 - 3);
|
||||
}
|
||||
|
||||
static void decodeSlow(int bitsPerValue, PostingDecodingUtil pdu, int[] tmp, int[] ints)
|
||||
throws IOException {
|
||||
final int numInts = bitsPerValue << 2;
|
||||
final int mask = MASKS32[bitsPerValue];
|
||||
pdu.splitInts(numInts, ints, 32 - bitsPerValue, 32, mask, tmp, 0, -1);
|
||||
final int remainingBitsPerInt = 32 - bitsPerValue;
|
||||
final int mask32RemainingBitsPerInt = MASKS32[remainingBitsPerInt];
|
||||
int tmpIdx = 0;
|
||||
int remainingBits = remainingBitsPerInt;
|
||||
for (int intsIdx = numInts; intsIdx < BLOCK_SIZE; ++intsIdx) {
|
||||
int b = bitsPerValue - remainingBits;
|
||||
int l = (tmp[tmpIdx++] & MASKS32[remainingBits]) << b;
|
||||
while (b >= remainingBitsPerInt) {
|
||||
b -= remainingBitsPerInt;
|
||||
l |= (tmp[tmpIdx++] & mask32RemainingBitsPerInt) << b;
|
||||
}
|
||||
if (b > 0) {
|
||||
l |= (tmp[tmpIdx] >>> (remainingBitsPerInt - b)) & MASKS32[b];
|
||||
remainingBits = remainingBitsPerInt - b;
|
||||
} else {
|
||||
remainingBits = remainingBitsPerInt;
|
||||
}
|
||||
ints[intsIdx] = l;
|
||||
}
|
||||
}
|
||||
|
||||
"""
|
||||
|
||||
def writeRemainder(bpv, next_primitive, remaining_bits_per_int, o, num_values, f):
|
||||
iteration = 1
|
||||
num_ints = bpv * num_values / remaining_bits_per_int
|
||||
while num_ints % 2 == 0 and num_values % 2 == 0:
|
||||
num_ints /= 2
|
||||
num_values /= 2
|
||||
iteration *= 2
|
||||
f.write(' for (int iter = 0, tmpIdx = 0, intsIdx = %d; iter < %d; ++iter, tmpIdx += %d, intsIdx += %d) {\n' %(o, iteration, num_ints, num_values))
|
||||
i = 0
|
||||
remaining_bits = 0
|
||||
tmp_idx = 0
|
||||
for i in range(int(num_values)):
|
||||
b = bpv
|
||||
if remaining_bits == 0:
|
||||
b -= remaining_bits_per_int
|
||||
f.write(' int l%d = tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b))
|
||||
else:
|
||||
b -= remaining_bits
|
||||
f.write(' int l%d = (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits, b))
|
||||
tmp_idx += 1
|
||||
while b >= remaining_bits_per_int:
|
||||
b -= remaining_bits_per_int
|
||||
f.write(' l%d |= tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b))
|
||||
tmp_idx += 1
|
||||
if b > 0:
|
||||
f.write(' l%d |= (tmp[tmpIdx + %d] >>> %d) & MASK%d_%d;\n' %(i, tmp_idx, remaining_bits_per_int-b, next_primitive, b))
|
||||
remaining_bits = remaining_bits_per_int-b
|
||||
f.write(' ints[intsIdx + %d] = l%d;\n' %(i, i))
|
||||
f.write(' }\n')
|
||||
|
||||
|
||||
def writeDecode(bpv, f):
|
||||
next_primitive = 32
|
||||
if bpv <= 8:
|
||||
next_primitive = 8
|
||||
elif bpv <= 16:
|
||||
next_primitive = 16
|
||||
if bpv == next_primitive:
|
||||
f.write(' static void decode%d(PostingDecodingUtil pdu, int[] ints) throws IOException {\n' %bpv)
|
||||
f.write(' pdu.in.readInts(ints, 0, %d);\n' %(bpv*4))
|
||||
else:
|
||||
num_values_per_int = 32 / next_primitive
|
||||
remaining_bits = next_primitive % bpv
|
||||
num_iters = (next_primitive - 1) // bpv
|
||||
o = 4 * bpv * num_iters
|
||||
if remaining_bits == 0:
|
||||
f.write(' static void decode%d(PostingDecodingUtil pdu, int[] ints) throws IOException {\n' %bpv)
|
||||
f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, ints, %d, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
|
||||
else:
|
||||
f.write(' static void decode%d(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {\n' %bpv)
|
||||
f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
|
||||
writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_int - o, f)
|
||||
f.write(' }\n')
|
||||
|
||||
if __name__ == '__main__':
|
||||
f = open(OUTPUT_FILE, 'w')
|
||||
f.write(HEADER)
|
||||
for primitive_size in PRIMITIVE_SIZE:
|
||||
f.write(' static final int[] MASKS%d = new int[%d];\n' %(primitive_size, primitive_size))
|
||||
f.write('\n')
|
||||
f.write(' static {\n')
|
||||
for primitive_size in PRIMITIVE_SIZE:
|
||||
f.write(' for (int i = 0; i < %d; ++i) {\n' %primitive_size)
|
||||
f.write(' MASKS%d[i] = mask%d(i);\n' %(primitive_size, primitive_size))
|
||||
f.write(' }\n')
|
||||
f.write(' }')
|
||||
f.write("""
|
||||
// mark values in array as final ints to avoid the cost of reading array, arrays should only be
|
||||
// used when the idx is a variable
|
||||
""")
|
||||
for primitive_size in PRIMITIVE_SIZE:
|
||||
for bpv in range(1, min(MAX_SPECIALIZED_BITS_PER_VALUE + 1, primitive_size)):
|
||||
f.write(' static final int MASK%d_%d = MASKS%d[%d];\n' %(primitive_size, bpv, primitive_size, bpv))
|
||||
|
||||
f.write("""
|
||||
/** Decode 128 integers into {@code ints}. */
|
||||
void decode(int bitsPerValue, PostingDecodingUtil pdu, int[] ints) throws IOException {
|
||||
switch (bitsPerValue) {
|
||||
""")
|
||||
for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
|
||||
next_primitive = 32
|
||||
if bpv <= 8:
|
||||
next_primitive = 8
|
||||
elif bpv <= 16:
|
||||
next_primitive = 16
|
||||
f.write(' case %d:\n' %bpv)
|
||||
if next_primitive % bpv == 0:
|
||||
f.write(' decode%d(pdu, ints);\n' %bpv)
|
||||
else:
|
||||
f.write(' decode%d(pdu, tmp, ints);\n' %bpv)
|
||||
if next_primitive != 32:
|
||||
f.write(' expand%d(ints);\n' %next_primitive)
|
||||
f.write(' break;\n')
|
||||
f.write(' default:\n')
|
||||
f.write(' decodeSlow(bitsPerValue, pdu, tmp, ints);\n')
|
||||
f.write(' break;\n')
|
||||
f.write(' }\n')
|
||||
f.write(' }\n')
|
||||
|
||||
for i in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
|
||||
writeDecode(i, f)
|
||||
if i < MAX_SPECIALIZED_BITS_PER_VALUE:
|
||||
f.write('\n')
|
||||
|
||||
f.write('}\n')
|
|
@@ -16,7 +16,7 @@
|
|||
*/
|
||||
|
||||
/**
|
||||
* Lucene 10.0 file format.
|
||||
* Lucene 10.1 file format.
|
||||
*
|
||||
* <h2>Apache Lucene - Index File Formats</h2>
|
||||
*
|
||||
|
@@ -151,15 +151,15 @@
|
|||
* field names. These are used to store auxiliary information about the document, such as its
|
||||
* title, url, or an identifier to access a database. The set of stored fields are what is
|
||||
* returned for each hit when searching. This is keyed by document number.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term dictionary}. A
|
||||
* <li>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term dictionary}. A
|
||||
* dictionary containing all of the terms used in all of the indexed fields of all of the
|
||||
* documents. The dictionary also contains the number of documents which contain the term, and
|
||||
* pointers to the term's frequency and proximity data.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Frequency data}. For
|
||||
* <li>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Frequency data}. For
|
||||
* each term in the dictionary, the numbers of all the documents that contain that term, and
|
||||
* the frequency of the term in that document, unless frequencies are omitted ({@link
|
||||
* org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
|
||||
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Proximity data}. For
|
||||
* <li>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Proximity data}. For
|
||||
* each term in the dictionary, the positions that the term occurs in each document. Note that
|
||||
* this will not exist if all fields in all documents omit position data.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
|
||||
|
@@ -255,27 +255,27 @@
|
|||
* <td>The stored fields for documents</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Dictionary}</td>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Dictionary}</td>
|
||||
* <td>.tim</td>
|
||||
* <td>The term dictionary, stores term info</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Index}</td>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Index}</td>
|
||||
* <td>.tip</td>
|
||||
* <td>The index into the Term Dictionary</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Frequencies}</td>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Frequencies}</td>
|
||||
* <td>.doc</td>
|
||||
* <td>Contains the list of docs which contain each term along with frequency</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Positions}</td>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Positions}</td>
|
||||
* <td>.pos</td>
|
||||
* <td>Stores position information about where a term occurs in the index</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Payloads}</td>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Payloads}</td>
|
||||
* <td>.pay</td>
|
||||
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
|
||||
* </tr>
|
||||
|
@@ -416,6 +416,8 @@
|
|||
* <li>In version 9.12, skip data was refactored to have only two levels: every 128 docs and every
|
||||
* 4,096 docs, and to be inlined in postings lists. This resulted in a speedup for queries that
|
||||
* need skipping, especially conjunctions.
|
||||
* <li>In version 10.1, block encoding changed to be optimized for int[] storage instead of
|
||||
* long[].
|
||||
* </ul>
|
||||
*
|
||||
* <a id="Limitations"></a>
|
||||
|
@@ -430,4 +432,4 @@
|
|||
* <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
|
||||
* VInt} values which have no limit. </div>
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene100;
|
||||
package org.apache.lucene.codecs.lucene101;
|
|
@@ -21,8 +21,6 @@ import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.SKIP_IND
|
|||
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SHIFT;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.DocValuesProducer;
|
||||
import org.apache.lucene.index.BaseTermsEnum;
|
||||
|
@@ -43,6 +41,7 @@ import org.apache.lucene.index.SortedNumericDocValues;
|
|||
import org.apache.lucene.index.SortedSetDocValues;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.TermsEnum.SeekStatus;
|
||||
import org.apache.lucene.internal.hppc.IntObjectHashMap;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
|
@@ -59,12 +58,12 @@ import org.apache.lucene.util.packed.DirectReader;
|
|||
|
||||
/** reader for {@link Lucene90DocValuesFormat} */
|
||||
final class Lucene90DocValuesProducer extends DocValuesProducer {
|
||||
private final Map<String, NumericEntry> numerics;
|
||||
private final Map<String, BinaryEntry> binaries;
|
||||
private final Map<String, SortedEntry> sorted;
|
||||
private final Map<String, SortedSetEntry> sortedSets;
|
||||
private final Map<String, SortedNumericEntry> sortedNumerics;
|
||||
private final Map<String, DocValuesSkipperEntry> skippers;
|
||||
private final IntObjectHashMap<NumericEntry> numerics;
|
||||
private final IntObjectHashMap<BinaryEntry> binaries;
|
||||
private final IntObjectHashMap<SortedEntry> sorted;
|
||||
private final IntObjectHashMap<SortedSetEntry> sortedSets;
|
||||
private final IntObjectHashMap<SortedNumericEntry> sortedNumerics;
|
||||
private final IntObjectHashMap<DocValuesSkipperEntry> skippers;
|
||||
private final IndexInput data;
|
||||
private final int maxDoc;
|
||||
private int version = -1;
|
||||
|
@@ -81,12 +80,12 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
String metaName =
|
||||
IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
|
||||
this.maxDoc = state.segmentInfo.maxDoc();
|
||||
numerics = new HashMap<>();
|
||||
binaries = new HashMap<>();
|
||||
sorted = new HashMap<>();
|
||||
sortedSets = new HashMap<>();
|
||||
sortedNumerics = new HashMap<>();
|
||||
skippers = new HashMap<>();
|
||||
numerics = new IntObjectHashMap<>();
|
||||
binaries = new IntObjectHashMap<>();
|
||||
sorted = new IntObjectHashMap<>();
|
||||
sortedSets = new IntObjectHashMap<>();
|
||||
sortedNumerics = new IntObjectHashMap<>();
|
||||
skippers = new IntObjectHashMap<>();
|
||||
merging = false;
|
||||
|
||||
// read in the entries from the metadata file.
|
||||
|
@@ -149,12 +148,12 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
// Used for cloning
|
||||
private Lucene90DocValuesProducer(
|
||||
Map<String, NumericEntry> numerics,
|
||||
Map<String, BinaryEntry> binaries,
|
||||
Map<String, SortedEntry> sorted,
|
||||
Map<String, SortedSetEntry> sortedSets,
|
||||
Map<String, SortedNumericEntry> sortedNumerics,
|
||||
Map<String, DocValuesSkipperEntry> skippers,
|
||||
IntObjectHashMap<NumericEntry> numerics,
|
||||
IntObjectHashMap<BinaryEntry> binaries,
|
||||
IntObjectHashMap<SortedEntry> sorted,
|
||||
IntObjectHashMap<SortedSetEntry> sortedSets,
|
||||
IntObjectHashMap<SortedNumericEntry> sortedNumerics,
|
||||
IntObjectHashMap<DocValuesSkipperEntry> skippers,
|
||||
IndexInput data,
|
||||
int maxDoc,
|
||||
int version,
|
||||
|
@@ -194,18 +193,18 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
}
|
||||
byte type = meta.readByte();
|
||||
if (info.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) {
|
||||
skippers.put(info.name, readDocValueSkipperMeta(meta));
|
||||
skippers.put(info.number, readDocValueSkipperMeta(meta));
|
||||
}
|
||||
if (type == Lucene90DocValuesFormat.NUMERIC) {
|
||||
numerics.put(info.name, readNumeric(meta));
|
||||
numerics.put(info.number, readNumeric(meta));
|
||||
} else if (type == Lucene90DocValuesFormat.BINARY) {
|
||||
binaries.put(info.name, readBinary(meta));
|
||||
binaries.put(info.number, readBinary(meta));
|
||||
} else if (type == Lucene90DocValuesFormat.SORTED) {
|
||||
sorted.put(info.name, readSorted(meta));
|
||||
sorted.put(info.number, readSorted(meta));
|
||||
} else if (type == Lucene90DocValuesFormat.SORTED_SET) {
|
||||
sortedSets.put(info.name, readSortedSet(meta));
|
||||
sortedSets.put(info.number, readSortedSet(meta));
|
||||
} else if (type == Lucene90DocValuesFormat.SORTED_NUMERIC) {
|
||||
sortedNumerics.put(info.name, readSortedNumeric(meta));
|
||||
sortedNumerics.put(info.number, readSortedNumeric(meta));
|
||||
} else {
|
||||
throw new CorruptIndexException("invalid type: " + type, meta);
|
||||
}
|
||||
|
@@ -430,7 +429,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
|
||||
NumericEntry entry = numerics.get(field.name);
|
||||
NumericEntry entry = numerics.get(field.number);
|
||||
return getNumeric(entry);
|
||||
}
|
||||
|
||||
|
@@ -786,7 +785,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
|
||||
BinaryEntry entry = binaries.get(field.name);
|
||||
BinaryEntry entry = binaries.get(field.number);
|
||||
|
||||
if (entry.docsWithFieldOffset == -2) {
|
||||
return DocValues.emptyBinary();
|
||||
|
@@ -887,7 +886,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public SortedDocValues getSorted(FieldInfo field) throws IOException {
|
||||
SortedEntry entry = sorted.get(field.name);
|
||||
SortedEntry entry = sorted.get(field.number);
|
||||
return getSorted(entry);
|
||||
}
|
||||
|
||||
|
@@ -1363,7 +1362,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
|
||||
SortedNumericEntry entry = sortedNumerics.get(field.name);
|
||||
SortedNumericEntry entry = sortedNumerics.get(field.number);
|
||||
return getSortedNumeric(entry);
|
||||
}
|
||||
|
||||
|
@@ -1508,7 +1507,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
|
||||
SortedSetEntry entry = sortedSets.get(field.name);
|
||||
SortedSetEntry entry = sortedSets.get(field.number);
|
||||
if (entry.singleValueEntry != null) {
|
||||
return DocValues.singleton(getSorted(entry.singleValueEntry));
|
||||
}
|
||||
|
@@ -1782,7 +1781,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public DocValuesSkipper getSkipper(FieldInfo field) throws IOException {
|
||||
final DocValuesSkipperEntry entry = skippers.get(field.name);
|
||||
final DocValuesSkipperEntry entry = skippers.get(field.number);
|
||||
|
||||
final IndexInput input = data.slice("doc value skipper", entry.offset, entry.length);
|
||||
// Prefetch the first page of data. Following pages are expected to get prefetched through
|
||||
|
|
|
@@ -21,8 +21,6 @@ import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readSi
|
|||
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.hnsw.FlatVectorsReader;
|
||||
import org.apache.lucene.codecs.hnsw.FlatVectorsScorer;
|
||||
|
@@ -38,6 +36,7 @@ import org.apache.lucene.index.IndexFileNames;
|
|||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.VectorEncoding;
|
||||
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||
import org.apache.lucene.internal.hppc.IntObjectHashMap;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
|
@ -56,13 +55,15 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {
|
|||
private static final long SHALLOW_SIZE =
|
||||
RamUsageEstimator.shallowSizeOfInstance(Lucene99FlatVectorsFormat.class);
|
||||
|
||||
private final Map<String, FieldEntry> fields = new HashMap<>();
|
||||
private final IntObjectHashMap<FieldEntry> fields = new IntObjectHashMap<>();
|
||||
private final IndexInput vectorData;
|
||||
private final FieldInfos fieldInfos;
|
||||
|
||||
public Lucene99FlatVectorsReader(SegmentReadState state, FlatVectorsScorer scorer)
|
||||
throws IOException {
|
||||
super(scorer);
|
||||
int versionMeta = readMetadata(state);
|
||||
this.fieldInfos = state.fieldInfos;
|
||||
boolean success = false;
|
||||
try {
|
||||
vectorData =
|
||||
|
@@ -155,15 +156,13 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {
         throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
       }
       FieldEntry fieldEntry = FieldEntry.create(meta, info);
-      fields.put(info.name, fieldEntry);
+      fields.put(info.number, fieldEntry);
     }
   }

   @Override
   public long ramBytesUsed() {
-    return Lucene99FlatVectorsReader.SHALLOW_SIZE
-        + RamUsageEstimator.sizeOfMap(
-            fields, RamUsageEstimator.shallowSizeOfInstance(FieldEntry.class));
+    return Lucene99FlatVectorsReader.SHALLOW_SIZE + fields.ramBytesUsed();
   }

   @Override
@@ -171,21 +170,27 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {
     CodecUtil.checksumEntireFile(vectorData);
   }

-  @Override
-  public FloatVectorValues getFloatVectorValues(String field) throws IOException {
-    FieldEntry fieldEntry = fields.get(field);
-    if (fieldEntry == null) {
+  private FieldEntry getFieldEntry(String field, VectorEncoding expectedEncoding) {
+    final FieldInfo info = fieldInfos.fieldInfo(field);
+    final FieldEntry fieldEntry;
+    if (info == null || (fieldEntry = fields.get(info.number)) == null) {
       throw new IllegalArgumentException("field=\"" + field + "\" not found");
     }
-    if (fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
+    if (fieldEntry.vectorEncoding != expectedEncoding) {
       throw new IllegalArgumentException(
           "field=\""
               + field
               + "\" is encoded as: "
               + fieldEntry.vectorEncoding
               + " expected: "
-              + VectorEncoding.FLOAT32);
+              + expectedEncoding);
     }
+    return fieldEntry;
+  }
+
+  @Override
+  public FloatVectorValues getFloatVectorValues(String field) throws IOException {
+    final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32);
     return OffHeapFloatVectorValues.load(
         fieldEntry.similarityFunction,
         vectorScorer,
@@ -199,19 +204,7 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {

   @Override
   public ByteVectorValues getByteVectorValues(String field) throws IOException {
-    FieldEntry fieldEntry = fields.get(field);
-    if (fieldEntry == null) {
-      throw new IllegalArgumentException("field=\"" + field + "\" not found");
-    }
-    if (fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
-      throw new IllegalArgumentException(
-          "field=\""
-              + field
-              + "\" is encoded as: "
-              + fieldEntry.vectorEncoding
-              + " expected: "
-              + VectorEncoding.BYTE);
-    }
+    final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE);
     return OffHeapByteVectorValues.load(
         fieldEntry.similarityFunction,
         vectorScorer,

@@ -225,10 +218,7 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {

   @Override
   public RandomVectorScorer getRandomVectorScorer(String field, float[] target) throws IOException {
-    FieldEntry fieldEntry = fields.get(field);
-    if (fieldEntry == null || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
-      return null;
-    }
+    final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32);
     return vectorScorer.getRandomVectorScorer(
         fieldEntry.similarityFunction,
         OffHeapFloatVectorValues.load(

@@ -245,10 +235,7 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {

   @Override
   public RandomVectorScorer getRandomVectorScorer(String field, byte[] target) throws IOException {
-    FieldEntry fieldEntry = fields.get(field);
-    if (fieldEntry == null || fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
-      return null;
-    }
+    final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE);
     return vectorScorer.getRandomVectorScorer(
         fieldEntry.similarityFunction,
         OffHeapByteVectorValues.load(
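One behavioral consequence of routing every lookup through the shared getFieldEntry helper: getRandomVectorScorer and the value accessors in this reader now throw IllegalArgumentException for an unknown field or a mismatched encoding, where the scorer paths previously returned null. A hedged caller-side sketch under that assumption; the helper name and the field name are invented for illustration:

    import java.io.IOException;
    import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsReader;
    import org.apache.lucene.util.hnsw.RandomVectorScorer;

    final class OptionalScorerSketch {
      // Callers that used to probe for a null scorer can translate the exception back
      // into "no scorer available" if that is the behavior they want to keep.
      static RandomVectorScorer scorerOrNull(
          Lucene99FlatVectorsReader reader, String field, float[] query) throws IOException {
        try {
          return reader.getRandomVectorScorer(field, query);
        } catch (IllegalArgumentException e) {
          return null; // field absent in this segment, or not FLOAT32-encoded
        }
      }
    }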
@@ -21,9 +21,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

 import java.io.IOException;
 import java.util.Arrays;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.KnnVectorsReader;
 import org.apache.lucene.codecs.hnsw.FlatVectorsReader;

@@ -37,6 +35,7 @@ import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.VectorEncoding;
 import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.internal.hppc.IntObjectHashMap;
 import org.apache.lucene.search.KnnCollector;
 import org.apache.lucene.store.ChecksumIndexInput;
 import org.apache.lucene.store.DataInput;

@@ -70,7 +69,7 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader

   private final FlatVectorsReader flatVectorsReader;
   private final FieldInfos fieldInfos;
-  private final Map<String, FieldEntry> fields = new HashMap<>();
+  private final IntObjectHashMap<FieldEntry> fields = new IntObjectHashMap<>();
   private final IndexInput vectorIndex;

   public Lucene99HnswVectorsReader(SegmentReadState state, FlatVectorsReader flatVectorsReader)

@@ -162,7 +161,7 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
       }
       FieldEntry fieldEntry = readField(meta, info);
       validateFieldEntry(info, fieldEntry);
-      fields.put(info.name, fieldEntry);
+      fields.put(info.number, fieldEntry);
     }
   }

@@ -225,8 +224,7 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
   @Override
   public long ramBytesUsed() {
     return Lucene99HnswVectorsReader.SHALLOW_SIZE
-        + RamUsageEstimator.sizeOfMap(
-            fields, RamUsageEstimator.shallowSizeOfInstance(FieldEntry.class))
+        + fields.ramBytesUsed()
         + flatVectorsReader.ramBytesUsed();
   }

@@ -246,25 +244,43 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
     return flatVectorsReader.getByteVectorValues(field);
   }

+  private FieldEntry getFieldEntry(String field, VectorEncoding expectedEncoding) {
+    final FieldInfo info = fieldInfos.fieldInfo(field);
+    final FieldEntry fieldEntry;
+    if (info == null || (fieldEntry = fields.get(info.number)) == null) {
+      throw new IllegalArgumentException("field=\"" + field + "\" not found");
+    }
+    if (fieldEntry.vectorEncoding != expectedEncoding) {
+      throw new IllegalArgumentException(
+          "field=\""
+              + field
+              + "\" is encoded as: "
+              + fieldEntry.vectorEncoding
+              + " expected: "
+              + expectedEncoding);
+    }
+    return fieldEntry;
+  }
+
   @Override
   public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs)
       throws IOException {
+    final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32);
     search(
-        fields.get(field),
+        fieldEntry,
         knnCollector,
         acceptDocs,
-        VectorEncoding.FLOAT32,
         () -> flatVectorsReader.getRandomVectorScorer(field, target));
   }

   @Override
   public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs)
       throws IOException {
+    final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE);
     search(
-        fields.get(field),
+        fieldEntry,
         knnCollector,
         acceptDocs,
-        VectorEncoding.BYTE,
         () -> flatVectorsReader.getRandomVectorScorer(field, target));
   }

@@ -272,13 +288,10 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
       FieldEntry fieldEntry,
       KnnCollector knnCollector,
       Bits acceptDocs,
-      VectorEncoding vectorEncoding,
       IOSupplier<RandomVectorScorer> scorerSupplier)
       throws IOException {

-    if (fieldEntry.size() == 0
-        || knnCollector.k() == 0
-        || fieldEntry.vectorEncoding != vectorEncoding) {
+    if (fieldEntry.size() == 0 || knnCollector.k() == 0) {
       return;
     }
     final RandomVectorScorer scorer = scorerSupplier.get();
@@ -304,12 +317,12 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader

   @Override
   public HnswGraph getGraph(String field) throws IOException {
-    FieldInfo info = fieldInfos.fieldInfo(field);
-    if (info == null) {
-      throw new IllegalArgumentException("No such field '" + field + "'");
+    final FieldInfo info = fieldInfos.fieldInfo(field);
+    final FieldEntry entry;
+    if (info == null || (entry = fields.get(info.number)) == null) {
+      throw new IllegalArgumentException("field=\"" + field + "\" not found");
     }
-    FieldEntry entry = fields.get(field);
-    if (entry != null && entry.vectorIndexLength > 0) {
+    if (entry.vectorIndexLength > 0) {
       return getGraph(entry);
     } else {
       return HnswGraph.EMPTY;
@ -21,8 +21,6 @@ import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readSi
|
|||
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.hnsw.FlatVectorsReader;
|
||||
import org.apache.lucene.codecs.hnsw.FlatVectorsScorer;
|
||||
|
@ -36,6 +34,7 @@ import org.apache.lucene.index.IndexFileNames;
|
|||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.VectorEncoding;
|
||||
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||
import org.apache.lucene.internal.hppc.IntObjectHashMap;
|
||||
import org.apache.lucene.search.VectorScorer;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
|
@ -59,15 +58,17 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
|
|||
private static final long SHALLOW_SIZE =
|
||||
RamUsageEstimator.shallowSizeOfInstance(Lucene99ScalarQuantizedVectorsReader.class);
|
||||
|
||||
private final Map<String, FieldEntry> fields = new HashMap<>();
|
||||
private final IntObjectHashMap<FieldEntry> fields = new IntObjectHashMap<>();
|
||||
private final IndexInput quantizedVectorData;
|
||||
private final FlatVectorsReader rawVectorsReader;
|
||||
private final FieldInfos fieldInfos;
|
||||
|
||||
public Lucene99ScalarQuantizedVectorsReader(
|
||||
SegmentReadState state, FlatVectorsReader rawVectorsReader, FlatVectorsScorer scorer)
|
||||
throws IOException {
|
||||
super(scorer);
|
||||
this.rawVectorsReader = rawVectorsReader;
|
||||
this.fieldInfos = state.fieldInfos;
|
||||
int versionMeta = -1;
|
||||
String metaFileName =
|
||||
IndexFileNames.segmentFileName(
|
||||
|
@ -118,7 +119,7 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
|
|||
}
|
||||
FieldEntry fieldEntry = readField(meta, versionMeta, info);
|
||||
validateFieldEntry(info, fieldEntry);
|
||||
fields.put(info.name, fieldEntry);
|
||||
fields.put(info.number, fieldEntry);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -163,10 +164,10 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
|
|||
CodecUtil.checksumEntireFile(quantizedVectorData);
|
||||
}
|
||||
|
||||
@Override
|
||||
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
if (fieldEntry == null) {
|
||||
private FieldEntry getFieldEntry(String field) {
|
||||
final FieldInfo info = fieldInfos.fieldInfo(field);
|
||||
final FieldEntry fieldEntry;
|
||||
if (info == null || (fieldEntry = fields.get(info.number)) == null) {
|
||||
throw new IllegalArgumentException("field=\"" + field + "\" not found");
|
||||
}
|
||||
if (fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
|
||||
|
@ -178,6 +179,12 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
|
|||
+ " expected: "
|
||||
+ VectorEncoding.FLOAT32);
|
||||
}
|
||||
return fieldEntry;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
|
||||
final FieldEntry fieldEntry = getFieldEntry(field);
|
||||
final FloatVectorValues rawVectorValues = rawVectorsReader.getFloatVectorValues(field);
|
||||
OffHeapQuantizedByteVectorValues quantizedByteVectorValues =
|
||||
OffHeapQuantizedByteVectorValues.load(
|
||||
|
@ -241,10 +248,7 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
|
|||
|
||||
@Override
|
||||
public RandomVectorScorer getRandomVectorScorer(String field, float[] target) throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
if (fieldEntry == null || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
|
||||
return null;
|
||||
}
|
||||
final FieldEntry fieldEntry = getFieldEntry(field);
|
||||
if (fieldEntry.scalarQuantizer == null) {
|
||||
return rawVectorsReader.getRandomVectorScorer(field, target);
|
||||
}
|
||||
|
@ -275,12 +279,7 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
|
|||
|
||||
@Override
|
||||
public long ramBytesUsed() {
|
||||
long size = SHALLOW_SIZE;
|
||||
size +=
|
||||
RamUsageEstimator.sizeOfMap(
|
||||
fields, RamUsageEstimator.shallowSizeOfInstance(FieldEntry.class));
|
||||
size += rawVectorsReader.ramBytesUsed();
|
||||
return size;
|
||||
return SHALLOW_SIZE + fields.ramBytesUsed() + rawVectorsReader.ramBytesUsed();
|
||||
}
|
||||
|
||||
private FieldEntry readField(IndexInput input, int versionMeta, FieldInfo info)
|
||||
|
@ -301,11 +300,8 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
|
|||
}
|
||||
|
||||
@Override
|
||||
public QuantizedByteVectorValues getQuantizedVectorValues(String fieldName) throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(fieldName);
|
||||
if (fieldEntry == null || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
|
||||
return null;
|
||||
}
|
||||
public QuantizedByteVectorValues getQuantizedVectorValues(String field) throws IOException {
|
||||
final FieldEntry fieldEntry = getFieldEntry(field);
|
||||
return OffHeapQuantizedByteVectorValues.load(
|
||||
fieldEntry.ordToDoc,
|
||||
fieldEntry.dimension,
|
||||
|
@ -320,11 +316,8 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
|
|||
}
|
||||
|
||||
@Override
|
||||
public ScalarQuantizer getQuantizationState(String fieldName) {
|
||||
FieldEntry fieldEntry = fields.get(fieldName);
|
||||
if (fieldEntry == null || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
|
||||
return null;
|
||||
}
|
||||
public ScalarQuantizer getQuantizationState(String field) {
|
||||
final FieldEntry fieldEntry = getFieldEntry(field);
|
||||
return fieldEntry.scalarQuantizer;
|
||||
}
|
||||
|
||||
|
|
|
@ -38,6 +38,7 @@ import org.apache.lucene.index.SegmentWriteState;
|
|||
import org.apache.lucene.index.SortedDocValues;
|
||||
import org.apache.lucene.index.SortedNumericDocValues;
|
||||
import org.apache.lucene.index.SortedSetDocValues;
|
||||
import org.apache.lucene.internal.hppc.IntObjectHashMap;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/**
|
||||
|
@ -256,7 +257,7 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
|
|||
|
||||
private static class FieldsReader extends DocValuesProducer {
|
||||
|
||||
private final Map<String, DocValuesProducer> fields = new HashMap<>();
|
||||
private final IntObjectHashMap<DocValuesProducer> fields = new IntObjectHashMap<>();
|
||||
private final Map<String, DocValuesProducer> formats = new HashMap<>();
|
||||
|
||||
// clone for merge
|
||||
|
@ -270,10 +271,10 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
|
|||
}
|
||||
|
||||
// Then rebuild fields:
|
||||
for (Map.Entry<String, DocValuesProducer> ent : other.fields.entrySet()) {
|
||||
DocValuesProducer producer = oldToNew.get(ent.getValue());
|
||||
for (IntObjectHashMap.IntObjectCursor<DocValuesProducer> ent : other.fields) {
|
||||
DocValuesProducer producer = oldToNew.get(ent.value);
|
||||
assert producer != null;
|
||||
fields.put(ent.getKey(), producer);
|
||||
fields.put(ent.key, producer);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -302,7 +303,7 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
|
|||
segmentSuffix,
|
||||
format.fieldsProducer(new SegmentReadState(readState, segmentSuffix)));
|
||||
}
|
||||
fields.put(fieldName, formats.get(segmentSuffix));
|
||||
fields.put(fi.number, formats.get(segmentSuffix));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -316,37 +317,37 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
|
|||
|
||||
@Override
|
||||
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
|
||||
DocValuesProducer producer = fields.get(field.name);
|
||||
DocValuesProducer producer = fields.get(field.number);
|
||||
return producer == null ? null : producer.getNumeric(field);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
|
||||
DocValuesProducer producer = fields.get(field.name);
|
||||
DocValuesProducer producer = fields.get(field.number);
|
||||
return producer == null ? null : producer.getBinary(field);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SortedDocValues getSorted(FieldInfo field) throws IOException {
|
||||
DocValuesProducer producer = fields.get(field.name);
|
||||
DocValuesProducer producer = fields.get(field.number);
|
||||
return producer == null ? null : producer.getSorted(field);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
|
||||
DocValuesProducer producer = fields.get(field.name);
|
||||
DocValuesProducer producer = fields.get(field.number);
|
||||
return producer == null ? null : producer.getSortedNumeric(field);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
|
||||
DocValuesProducer producer = fields.get(field.name);
|
||||
DocValuesProducer producer = fields.get(field.number);
|
||||
return producer == null ? null : producer.getSortedSet(field);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocValuesSkipper getSkipper(FieldInfo field) throws IOException {
|
||||
DocValuesProducer producer = fields.get(field.name);
|
||||
DocValuesProducer producer = fields.get(field.number);
|
||||
return producer == null ? null : producer.getSkipper(field);
|
||||
}
|
||||
|
||||
|
|
|
@ -19,7 +19,9 @@ package org.apache.lucene.codecs.perfield;
|
|||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.ServiceLoader;
|
||||
import org.apache.lucene.codecs.KnnFieldVectorsWriter;
|
||||
|
@ -28,11 +30,14 @@ import org.apache.lucene.codecs.KnnVectorsReader;
|
|||
import org.apache.lucene.codecs.KnnVectorsWriter;
|
||||
import org.apache.lucene.index.ByteVectorValues;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.FloatVectorValues;
|
||||
import org.apache.lucene.index.MergeState;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.index.Sorter;
|
||||
import org.apache.lucene.internal.hppc.IntObjectHashMap;
|
||||
import org.apache.lucene.internal.hppc.ObjectCursor;
|
||||
import org.apache.lucene.search.KnnCollector;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
@ -186,7 +191,8 @@ public abstract class PerFieldKnnVectorsFormat extends KnnVectorsFormat {
|
|||
/** VectorReader that can wrap multiple delegate readers, selected by field. */
|
||||
public static class FieldsReader extends KnnVectorsReader {
|
||||
|
||||
private final Map<String, KnnVectorsReader> fields = new HashMap<>();
|
||||
private final IntObjectHashMap<KnnVectorsReader> fields = new IntObjectHashMap<>();
|
||||
private final FieldInfos fieldInfos;
|
||||
|
||||
/**
|
||||
* Create a FieldsReader over a segment, opening VectorReaders for each KnnVectorsFormat
|
||||
|
@ -196,7 +202,7 @@ public abstract class PerFieldKnnVectorsFormat extends KnnVectorsFormat {
|
|||
* @throws IOException if one of the delegate readers throws
|
||||
*/
|
||||
public FieldsReader(final SegmentReadState readState) throws IOException {
|
||||
|
||||
this.fieldInfos = readState.fieldInfos;
|
||||
// Init each unique format:
|
||||
boolean success = false;
|
||||
Map<String, KnnVectorsReader> formats = new HashMap<>();
|
||||
|
@ -221,7 +227,7 @@ public abstract class PerFieldKnnVectorsFormat extends KnnVectorsFormat {
|
|||
segmentSuffix,
|
||||
format.fieldsReader(new SegmentReadState(readState, segmentSuffix)));
|
||||
}
|
||||
fields.put(fieldName, formats.get(segmentSuffix));
|
||||
fields.put(fi.number, formats.get(segmentSuffix));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -239,51 +245,69 @@ public abstract class PerFieldKnnVectorsFormat extends KnnVectorsFormat {
|
|||
* @param field the name of a numeric vector field
|
||||
*/
|
||||
public KnnVectorsReader getFieldReader(String field) {
|
||||
return fields.get(field);
|
||||
final FieldInfo info = fieldInfos.fieldInfo(field);
|
||||
if (info == null) {
|
||||
return null;
|
||||
}
|
||||
return fields.get(info.number);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void checkIntegrity() throws IOException {
|
||||
for (KnnVectorsReader reader : fields.values()) {
|
||||
reader.checkIntegrity();
|
||||
for (ObjectCursor<KnnVectorsReader> cursor : fields.values()) {
|
||||
cursor.value.checkIntegrity();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
|
||||
KnnVectorsReader knnVectorsReader = fields.get(field);
|
||||
if (knnVectorsReader == null) {
|
||||
final FieldInfo info = fieldInfos.fieldInfo(field);
|
||||
final KnnVectorsReader reader;
|
||||
if (info == null || (reader = fields.get(info.number)) == null) {
|
||||
return null;
|
||||
} else {
|
||||
return knnVectorsReader.getFloatVectorValues(field);
|
||||
}
|
||||
return reader.getFloatVectorValues(field);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ByteVectorValues getByteVectorValues(String field) throws IOException {
|
||||
KnnVectorsReader knnVectorsReader = fields.get(field);
|
||||
if (knnVectorsReader == null) {
|
||||
final FieldInfo info = fieldInfos.fieldInfo(field);
|
||||
final KnnVectorsReader reader;
|
||||
if (info == null || (reader = fields.get(info.number)) == null) {
|
||||
return null;
|
||||
} else {
|
||||
return knnVectorsReader.getByteVectorValues(field);
|
||||
}
|
||||
return reader.getByteVectorValues(field);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs)
|
||||
throws IOException {
|
||||
fields.get(field).search(field, target, knnCollector, acceptDocs);
|
||||
final FieldInfo info = fieldInfos.fieldInfo(field);
|
||||
final KnnVectorsReader reader;
|
||||
if (info == null || (reader = fields.get(info.number)) == null) {
|
||||
return;
|
||||
}
|
||||
reader.search(field, target, knnCollector, acceptDocs);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs)
|
||||
throws IOException {
|
||||
fields.get(field).search(field, target, knnCollector, acceptDocs);
|
||||
final FieldInfo info = fieldInfos.fieldInfo(field);
|
||||
final KnnVectorsReader reader;
|
||||
if (info == null || (reader = fields.get(info.number)) == null) {
|
||||
return;
|
||||
}
|
||||
reader.search(field, target, knnCollector, acceptDocs);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
IOUtils.close(fields.values());
|
||||
List<KnnVectorsReader> readers = new ArrayList<>(fields.size());
|
||||
for (ObjectCursor<KnnVectorsReader> cursor : fields.values()) {
|
||||
readers.add(cursor.value);
|
||||
}
|
||||
IOUtils.close(readers);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@@ -27,7 +27,6 @@ import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.LeafSimScorer;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.QueryVisitor;
 import org.apache.lucene.search.ScoreMode;

@@ -120,7 +119,6 @@ final class FeatureQuery extends Query {

   @Override
   public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
-    final Weight thisWeight = this;
     Terms terms = Terms.getTerms(context.reader(), fieldName);
     TermsEnum termsEnum = terms.iterator();
     if (termsEnum.seekExact(new BytesRef(featureName)) == false) {

@@ -135,10 +133,8 @@ final class FeatureQuery extends Query {
       @Override
       public Scorer get(long leadCost) throws IOException {
         final SimScorer scorer = function.scorer(boost);
-        final LeafSimScorer simScorer =
-            new LeafSimScorer(scorer, context.reader(), fieldName, false);
         final ImpactsEnum impacts = termsEnum.impacts(PostingsEnum.FREQS);
-        return new TermScorer(thisWeight, impacts, simScorer, topLevelScoringClause);
+        return new TermScorer(impacts, scorer, null, topLevelScoringClause);
       }

       @Override
@ -20,7 +20,6 @@ import static org.apache.lucene.geo.GeoEncodingUtils.encodeLatitude;
|
|||
import static org.apache.lucene.geo.GeoEncodingUtils.encodeLongitude;
|
||||
import static org.apache.lucene.geo.GeoUtils.lineCrossesLine;
|
||||
import static org.apache.lucene.geo.GeoUtils.lineOverlapLine;
|
||||
import static org.apache.lucene.geo.GeoUtils.orient;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
|
@ -215,7 +214,7 @@ public final class Tessellator {
|
|||
* Creates a circular doubly linked list using polygon points. The order is governed by the
|
||||
* specified winding order
|
||||
*/
|
||||
private static final Node createDoublyLinkedList(
|
||||
private static Node createDoublyLinkedList(
|
||||
final double[] x,
|
||||
final double[] y,
|
||||
final WindingOrder polyWindingOrder,
|
||||
|
@ -243,7 +242,7 @@ public final class Tessellator {
|
|||
return filterPoints(lastNode, null);
|
||||
}
|
||||
|
||||
private static final Node eliminateHoles(final XYPolygon polygon, Node outerNode) {
|
||||
private static Node eliminateHoles(final XYPolygon polygon, Node outerNode) {
|
||||
// Define a list to hole a reference to each filtered hole list.
|
||||
final List<Node> holeList = new ArrayList<>();
|
||||
// keep a reference to the hole
|
||||
|
@ -273,8 +272,8 @@ public final class Tessellator {
|
|||
return eliminateHoles(holeList, holeListPolygons, outerNode);
|
||||
}
|
||||
|
||||
/** Links every hole into the outer loop, producing a single-ring polygon without holes. * */
|
||||
private static final Node eliminateHoles(final Polygon polygon, Node outerNode) {
|
||||
/** Links every hole into the outer loop, producing a single-ring polygon without holes. */
|
||||
private static Node eliminateHoles(final Polygon polygon, Node outerNode) {
|
||||
// Define a list to hole a reference to each filtered hole list.
|
||||
final List<Node> holeList = new ArrayList<>();
|
||||
// keep a reference to the hole
|
||||
|
@ -304,7 +303,7 @@ public final class Tessellator {
|
|||
return eliminateHoles(holeList, holeListPolygons, outerNode);
|
||||
}
|
||||
|
||||
private static final Node eliminateHoles(
|
||||
private static Node eliminateHoles(
|
||||
List<Node> holeList, final Map<Node, ?> holeListPolygons, Node outerNode) {
|
||||
// Sort the hole vertices by x coordinate
|
||||
holeList.sort(
|
||||
|
@ -350,30 +349,19 @@ public final class Tessellator {
|
|||
}
|
||||
|
||||
/** Finds a bridge between vertices that connects a hole with an outer ring, and links it */
|
||||
private static final void eliminateHole(
|
||||
private static void eliminateHole(
|
||||
final Node holeNode,
|
||||
Node outerNode,
|
||||
double holeMinX,
|
||||
double holeMaxX,
|
||||
double holeMinY,
|
||||
double holeMaxY) {
|
||||
// Attempt to find a common point between the HoleNode and OuterNode.
|
||||
Node next = outerNode;
|
||||
do {
|
||||
if (Rectangle.containsPoint(
|
||||
next.getY(), next.getX(), holeMinY, holeMaxY, holeMinX, holeMaxX)) {
|
||||
Node sharedVertex = getSharedVertex(holeNode, next);
|
||||
if (sharedVertex != null) {
|
||||
// Split the resulting polygon.
|
||||
Node node = splitPolygon(next, sharedVertex, true);
|
||||
// Filter the split nodes.
|
||||
filterPoints(node, node.next);
|
||||
return;
|
||||
}
|
||||
}
|
||||
next = next.next;
|
||||
} while (next != outerNode);
|
||||
|
||||
// Attempt to merge the hole using a common point between if it exists.
|
||||
if (maybeMergeHoleWithSharedVertices(
|
||||
holeNode, outerNode, holeMinX, holeMaxX, holeMinY, holeMaxY)) {
|
||||
return;
|
||||
}
|
||||
// Attempt to find a logical bridge between the HoleNode and OuterNode.
|
||||
outerNode = fetchHoleBridge(holeNode, outerNode);
|
||||
|
||||
|
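The block that follows adds maybeMergeHoleWithSharedVertices and getSharedInsideVertex: when a hole touches the outer ring at a vertex that has already been used for a bridge, the code has to pick which of the coincident outer-ring corners to connect through, preferring a convex corner and, failing that, the smaller opening angle. A reduced, self-contained sketch of that selection rule; the helper names echo the patch but this is not the patch itself, and the signed-area sign convention (negative = convex) is only asserted for this sketch:

    // Sketch of the vertex-selection rule used when a hole shares a vertex with the
    // outer ring more than once: prefer a convex corner (negative signed area here);
    // if both candidates are on the same side, prefer the smaller opening angle.
    public class SharedVertexChoiceDemo {
      // Signed area of triangle (a, b, c); negative means a convex corner in this convention.
      static double area(double ax, double ay, double bx, double by, double cx, double cy) {
        return (by - ay) * (cx - bx) - (bx - ax) * (cy - by);
      }

      // Opening angle at b between rays b->a and b->c.
      static double angle(double ax, double ay, double bx, double by, double cx, double cy) {
        double v1x = ax - bx, v1y = ay - by, v2x = cx - bx, v2y = cy - by;
        return Math.acos(
            (v1x * v2x + v1y * v2y)
                / (Math.sqrt(v1x * v1x + v1y * v1y) * Math.sqrt(v2x * v2x + v2y * v2y)));
      }

      public static void main(String[] args) {
        // Shared hole vertex h at the origin, its successor n at (1, 0); the two
        // candidate connections differ only in their previous vertex.
        double hx = 0, hy = 0, nx = 1, ny = 0;
        double a1 = area(-1, 1, hx, hy, nx, ny);   // candidate A, previous vertex (-1, 1)
        double a2 = area(-1, -1, hx, hy, nx, ny);  // candidate B, previous vertex (-1, -1)
        boolean pickA =
            (a1 < 0) != (a2 < 0)
                ? a1 < a2
                : angle(-1, 1, hx, hy, nx, ny) < angle(-1, -1, hx, hy, nx, ny);
        System.out.println(pickA ? "choose candidate A" : "choose candidate B");
      }
    }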
@ -390,12 +378,112 @@ public final class Tessellator {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Choose a common vertex between the polygon and the hole if it exists and return true, otherwise
|
||||
* return false
|
||||
*/
|
||||
private static boolean maybeMergeHoleWithSharedVertices(
|
||||
final Node holeNode,
|
||||
Node outerNode,
|
||||
double holeMinX,
|
||||
double holeMaxX,
|
||||
double holeMinY,
|
||||
double holeMaxY) {
|
||||
// Attempt to find a common point between the HoleNode and OuterNode.
|
||||
Node sharedVertex = null;
|
||||
Node sharedVertexConnection = null;
|
||||
Node next = outerNode;
|
||||
do {
|
||||
if (Rectangle.containsPoint(
|
||||
next.getY(), next.getX(), holeMinY, holeMaxY, holeMinX, holeMaxX)) {
|
||||
Node newSharedVertex = getSharedVertex(holeNode, next);
|
||||
if (newSharedVertex != null) {
|
||||
if (sharedVertex == null) {
|
||||
sharedVertex = newSharedVertex;
|
||||
sharedVertexConnection = next;
|
||||
} else if (newSharedVertex.equals(sharedVertex)) {
|
||||
// This can only happen if this vertex has been already used for a bridge. We need to
|
||||
// choose the right one.
|
||||
sharedVertexConnection =
|
||||
getSharedInsideVertex(sharedVertex, sharedVertexConnection, next);
|
||||
}
|
||||
}
|
||||
}
|
||||
next = next.next;
|
||||
} while (next != outerNode);
|
||||
if (sharedVertex != null) {
|
||||
// Split the resulting polygon.
|
||||
Node node = splitPolygon(sharedVertexConnection, sharedVertex, true);
|
||||
// Filter the split nodes.
|
||||
filterPoints(node, node.next);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Check if the provided vertex is in the polygon and return it */
|
||||
private static Node getSharedVertex(final Node polygon, final Node vertex) {
|
||||
Node next = polygon;
|
||||
do {
|
||||
if (isVertexEquals(next, vertex)) {
|
||||
return next;
|
||||
}
|
||||
next = next.next;
|
||||
} while (next != polygon);
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Choose the vertex that has a smaller angle with the hole vertex */
|
||||
static Node getSharedInsideVertex(Node holeVertex, Node candidateA, Node candidateB) {
|
||||
assert isVertexEquals(holeVertex, candidateA) && isVertexEquals(holeVertex, candidateB);
|
||||
// we are joining candidate.prevNode -> holeVertex.node -> holeVertex.nextNode.
|
||||
// A negative area means a convex angle. if both are convex/reflex choose the point of
|
||||
// minimum angle
|
||||
final double a1 =
|
||||
area(
|
||||
candidateA.previous.getX(),
|
||||
candidateA.previous.getY(),
|
||||
holeVertex.getX(),
|
||||
holeVertex.getY(),
|
||||
holeVertex.next.getX(),
|
||||
holeVertex.next.getY());
|
||||
final double a2 =
|
||||
area(
|
||||
candidateB.previous.getX(),
|
||||
candidateB.previous.getY(),
|
||||
holeVertex.getX(),
|
||||
holeVertex.getY(),
|
||||
holeVertex.next.getX(),
|
||||
holeVertex.next.getY());
|
||||
|
||||
if (a1 < 0 != a2 < 0) {
|
||||
// one is convex, the other reflex, get the convex one
|
||||
return a1 < a2 ? candidateA : candidateB;
|
||||
} else {
|
||||
// both are convex / reflex, choose the smallest angle
|
||||
final double angle1 = angle(candidateA.previous, candidateA, holeVertex.next);
|
||||
final double angle2 = angle(candidateB.previous, candidateB, holeVertex.next);
|
||||
return angle1 < angle2 ? candidateA : candidateB;
|
||||
}
|
||||
}
|
||||
|
||||
private static double angle(Node a, Node b, Node c) {
|
||||
final double ax = a.getX() - b.getX();
|
||||
final double ay = a.getY() - b.getY();
|
||||
final double cx = c.getX() - b.getX();
|
||||
final double cy = c.getY() - b.getY();
|
||||
final double dotProduct = ax * cx + ay * cy;
|
||||
final double aLength = Math.sqrt(ax * ax + ay * ay);
|
||||
final double bLength = Math.sqrt(cx * cx + cy * cy);
|
||||
return Math.acos(dotProduct / (aLength * bLength));
|
||||
}
|
||||
|
||||
/**
|
||||
* David Eberly's algorithm for finding a bridge between a hole and outer polygon
|
||||
*
|
||||
* <p>see: http://www.geometrictools.com/Documentation/TriangulationByEarClipping.pdf
|
||||
*/
|
||||
private static final Node fetchHoleBridge(final Node holeNode, final Node outerNode) {
|
||||
private static Node fetchHoleBridge(final Node holeNode, final Node outerNode) {
|
||||
Node p = outerNode;
|
||||
double qx = Double.NEGATIVE_INFINITY;
|
||||
final double hx = holeNode.getX();
|
||||
|
@ -453,34 +541,8 @@ public final class Tessellator {
|
|||
return connection;
|
||||
}
|
||||
|
||||
/** Check if the provided vertex is in the polygon and return it * */
|
||||
private static Node getSharedVertex(final Node polygon, final Node vertex) {
|
||||
Node next = polygon;
|
||||
do {
|
||||
if (isVertexEquals(next, vertex)) {
|
||||
// make sure we are not crossing the polygon. This might happen when several holes share the
|
||||
// same polygon vertex.
|
||||
boolean crosses =
|
||||
GeoUtils.lineCrossesLine(
|
||||
next.previous.getX(),
|
||||
next.previous.getY(),
|
||||
vertex.next.getX(),
|
||||
vertex.next.getY(),
|
||||
next.next.getX(),
|
||||
next.next.getY(),
|
||||
vertex.previous.getX(),
|
||||
vertex.previous.getY());
|
||||
if (crosses == false) {
|
||||
return next;
|
||||
}
|
||||
}
|
||||
next = next.next;
|
||||
} while (next != polygon);
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Finds the left-most hole of a polygon ring. * */
|
||||
private static final Node fetchLeftmost(final Node start) {
|
||||
private static Node fetchLeftmost(final Node start) {
|
||||
Node node = start;
|
||||
Node leftMost = start;
|
||||
do {
|
||||
|
@ -502,7 +564,7 @@ public final class Tessellator {
|
|||
* Main ear slicing loop which triangulates the vertices of a polygon, provided as a doubly-linked
|
||||
* list. *
|
||||
*/
|
||||
private static final List<Triangle> earcutLinkedList(
|
||||
private static List<Triangle> earcutLinkedList(
|
||||
Object polygon,
|
||||
Node currEar,
|
||||
final List<Triangle> tessellation,
|
||||
|
@ -587,7 +649,7 @@ public final class Tessellator {
|
|||
}
|
||||
|
||||
/** Determines whether a polygon node forms a valid ear with adjacent nodes. * */
|
||||
private static final boolean isEar(final Node ear, final boolean mortonOptimized) {
|
||||
private static boolean isEar(final Node ear, final boolean mortonOptimized) {
|
||||
if (mortonOptimized == true) {
|
||||
return mortonIsEar(ear);
|
||||
}
|
||||
|
@ -623,7 +685,7 @@ public final class Tessellator {
|
|||
* Uses morton code for speed to determine whether or a polygon node forms a valid ear w/ adjacent
|
||||
* nodes
|
||||
*/
|
||||
private static final boolean mortonIsEar(final Node ear) {
|
||||
private static boolean mortonIsEar(final Node ear) {
|
||||
// triangle bbox (flip the bits so negative encoded values are < positive encoded values)
|
||||
int minTX = StrictMath.min(StrictMath.min(ear.previous.x, ear.x), ear.next.x) ^ 0x80000000;
|
||||
int minTY = StrictMath.min(StrictMath.min(ear.previous.y, ear.y), ear.next.y) ^ 0x80000000;
|
||||
|
@ -740,7 +802,7 @@ public final class Tessellator {
|
|||
}
|
||||
|
||||
/** Iterate through all polygon nodes and remove small local self-intersections * */
|
||||
private static final Node cureLocalIntersections(
|
||||
private static Node cureLocalIntersections(
|
||||
Node startNode, final List<Triangle> tessellation, final boolean mortonOptimized) {
|
||||
Node node = startNode;
|
||||
Node nextNode;
|
||||
|
@ -794,7 +856,7 @@ public final class Tessellator {
|
|||
* Attempt to split a polygon and independently triangulate each side. Return true if the polygon
|
||||
* was splitted *
|
||||
*/
|
||||
private static final boolean splitEarcut(
|
||||
private static boolean splitEarcut(
|
||||
final Object polygon,
|
||||
final Node start,
|
||||
final List<Triangle> tessellation,
|
||||
|
@ -858,7 +920,7 @@ public final class Tessellator {
|
|||
* Uses morton code for speed to determine whether or not and edge defined by a and b overlaps
|
||||
* with a polygon edge
|
||||
*/
|
||||
private static final void mortonCheckIntersection(final Node a, final Node b) {
|
||||
private static void mortonCheckIntersection(final Node a, final Node b) {
|
||||
// edge bbox (flip the bits so negative encoded values are < positive encoded values)
|
||||
int minTX = StrictMath.min(a.x, a.next.x) ^ 0x80000000;
|
||||
int minTY = StrictMath.min(a.y, a.next.y) ^ 0x80000000;
|
||||
|
@ -974,7 +1036,7 @@ public final class Tessellator {
|
|||
* Uses morton code for speed to determine whether or not and edge defined by a and b overlaps
|
||||
* with a polygon edge
|
||||
*/
|
||||
private static final boolean isMortonEdgeFromPolygon(final Node a, final Node b) {
|
||||
private static boolean isMortonEdgeFromPolygon(final Node a, final Node b) {
|
||||
// edge bbox (flip the bits so negative encoded values are < positive encoded values)
|
||||
final int minTX = StrictMath.min(a.x, b.x) ^ 0x80000000;
|
||||
final int minTY = StrictMath.min(a.y, b.y) ^ 0x80000000;
|
||||
|
@ -1060,7 +1122,7 @@ public final class Tessellator {
|
|||
}
|
||||
|
||||
/** Links two polygon vertices using a bridge. * */
|
||||
private static final Node splitPolygon(final Node a, final Node b, boolean edgeFromPolygon) {
|
||||
private static Node splitPolygon(final Node a, final Node b, boolean edgeFromPolygon) {
|
||||
final Node a2 = new Node(a);
|
||||
final Node b2 = new Node(b);
|
||||
final Node an = a.next;
|
||||
|
@ -1136,7 +1198,7 @@ public final class Tessellator {
|
|||
return windingSum;
|
||||
}
|
||||
|
||||
private static final boolean isLocallyInside(final Node a, final Node b) {
|
||||
private static boolean isLocallyInside(final Node a, final Node b) {
|
||||
double area =
|
||||
area(
|
||||
a.previous.getX(), a.previous.getY(), a.getX(), a.getY(), a.next.getX(), a.next.getY());
|
||||
|
@ -1156,7 +1218,7 @@ public final class Tessellator {
|
|||
}
|
||||
|
||||
/** Determine whether the middle point of a polygon diagonal is contained within the polygon */
|
||||
private static final boolean middleInsert(
|
||||
private static boolean middleInsert(
|
||||
final Node start, final double x0, final double y0, final double x1, final double y1) {
|
||||
Node node = start;
|
||||
Node nextNode;
|
||||
|
@ -1179,7 +1241,7 @@ public final class Tessellator {
|
|||
}
|
||||
|
||||
/** Determines if the diagonal of a polygon is intersecting with any polygon elements. * */
|
||||
private static final boolean isIntersectingPolygon(
|
||||
private static boolean isIntersectingPolygon(
|
||||
final Node start, final double x0, final double y0, final double x1, final double y1) {
|
||||
Node node = start;
|
||||
Node nextNode;
|
||||
|
@ -1198,7 +1260,7 @@ public final class Tessellator {
|
|||
}
|
||||
|
||||
/** Determines whether two line segments intersect. * */
|
||||
public static final boolean linesIntersect(
|
||||
public static boolean linesIntersect(
|
||||
final double aX0,
|
||||
final double aY0,
|
||||
final double aX1,
|
||||
|
@ -1212,7 +1274,7 @@ public final class Tessellator {
|
|||
}
|
||||
|
||||
/** Interlinks polygon nodes in Z-Order. It reset the values on the z values* */
|
||||
private static final void sortByMortonWithReset(Node start) {
|
||||
private static void sortByMortonWithReset(Node start) {
|
||||
Node next = start;
|
||||
do {
|
||||
next.previousZ = next.previous;
|
||||
|
@ -1223,7 +1285,7 @@ public final class Tessellator {
|
|||
}
|
||||
|
||||
/** Interlinks polygon nodes in Z-Order. * */
|
||||
private static final void sortByMorton(Node start) {
|
||||
private static void sortByMorton(Node start) {
|
||||
start.previousZ.nextZ = null;
|
||||
start.previousZ = null;
|
||||
// Sort the generated ring using Z ordering.
|
||||
|
@ -1234,7 +1296,7 @@ public final class Tessellator {
|
|||
* Simon Tatham's doubly-linked list O(n log n) mergesort see:
|
||||
* http://www.chiark.greenend.org.uk/~sgtatham/algorithms/listsort.html
|
||||
*/
|
||||
private static final void tathamSort(Node list) {
|
||||
private static void tathamSort(Node list) {
|
||||
Node p, q, e, tail;
|
||||
int i, numMerges, pSize, qSize;
|
||||
int inSize = 1;
|
||||
|
@ -1290,7 +1352,7 @@ public final class Tessellator {
|
|||
}
|
||||
|
||||
/** Eliminate colinear/duplicate points from the doubly linked list */
|
||||
private static final Node filterPoints(final Node start, Node end) {
|
||||
private static Node filterPoints(final Node start, Node end) {
|
||||
if (start == null) {
|
||||
return start;
|
||||
}
|
||||
|
@ -1343,7 +1405,7 @@ public final class Tessellator {
|
|||
/**
|
||||
* Creates a node and optionally links it with a previous node in a circular doubly-linked list
|
||||
*/
|
||||
private static final Node insertNode(
|
||||
private static Node insertNode(
|
||||
final double[] x,
|
||||
final double[] y,
|
||||
int index,
|
||||
|
@ -1370,7 +1432,7 @@ public final class Tessellator {
|
|||
}
|
||||
|
||||
/** Removes a node from the doubly linked list */
|
||||
private static final void removeNode(Node node, boolean edgeFromPolygon) {
|
||||
private static void removeNode(Node node, boolean edgeFromPolygon) {
|
||||
node.next.previous = node.previous;
|
||||
node.previous.next = node.next;
|
||||
node.previous.isNextEdgeFromPolygon = edgeFromPolygon;
|
||||
|
@ -1384,16 +1446,16 @@ public final class Tessellator {
|
|||
}
|
||||
|
||||
/** Determines if two point vertices are equal. * */
|
||||
private static final boolean isVertexEquals(final Node a, final Node b) {
|
||||
private static boolean isVertexEquals(final Node a, final Node b) {
|
||||
return isVertexEquals(a, b.getX(), b.getY());
|
||||
}
|
||||
|
||||
/** Determines if two point vertices are equal. * */
|
||||
private static final boolean isVertexEquals(final Node a, final double x, final double y) {
|
||||
private static boolean isVertexEquals(final Node a, final double x, final double y) {
|
||||
return a.getX() == x && a.getY() == y;
|
||||
}
|
||||
|
||||
/** Compute signed area of triangle */
|
||||
/** Compute signed area of triangle, negative means convex angle and positive reflex angle. */
|
||||
private static double area(
|
||||
final double aX,
|
||||
final double aY,
|
||||
|
@ -1419,29 +1481,6 @@ public final class Tessellator {
|
|||
&& (bx - x) * (cy - y) - (cx - x) * (by - y) >= 0;
|
||||
}
|
||||
|
||||
/** compute whether the given x, y point is in a triangle; uses the winding order method */
|
||||
public static boolean pointInTriangle(
|
||||
double x, double y, double ax, double ay, double bx, double by, double cx, double cy) {
|
||||
double minX = StrictMath.min(ax, StrictMath.min(bx, cx));
|
||||
double minY = StrictMath.min(ay, StrictMath.min(by, cy));
|
||||
double maxX = StrictMath.max(ax, StrictMath.max(bx, cx));
|
||||
double maxY = StrictMath.max(ay, StrictMath.max(by, cy));
|
||||
// check the bounding box because if the triangle is degenerated, e.g points and lines, we need
|
||||
// to filter out
|
||||
// coplanar points that are not part of the triangle.
|
||||
if (x >= minX && x <= maxX && y >= minY && y <= maxY) {
|
||||
int a = orient(x, y, ax, ay, bx, by);
|
||||
int b = orient(x, y, bx, by, cx, cy);
|
||||
if (a == 0 || b == 0 || a < 0 == b < 0) {
|
||||
int c = orient(x, y, cx, cy, ax, ay);
|
||||
return c == 0 || (c < 0 == (b < 0 || a < 0));
|
||||
}
|
||||
return false;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Implementation of this interface will receive calls with internal data at each step of the
|
||||
* triangulation algorithm. This is of use for debugging complex cases, as well as gaining insight
|
||||
|
@ -1508,7 +1547,7 @@ public final class Tessellator {
|
|||
}
|
||||
|
||||
/** Circular Doubly-linked list used for polygon coordinates */
|
||||
protected static class Node {
|
||||
static class Node {
|
||||
// node index in the linked list
|
||||
private final int idx;
|
||||
// vertex index in the polygon
|
||||
|
@ -1524,9 +1563,9 @@ public final class Tessellator {
|
|||
private final long morton;
|
||||
|
||||
// previous node
|
||||
private Node previous;
|
||||
Node previous;
|
||||
// next node
|
||||
private Node next;
|
||||
Node next;
|
||||
// previous z node
|
||||
private Node previousZ;
|
||||
// next z node
|
||||
|
@ -1534,7 +1573,7 @@ public final class Tessellator {
|
|||
// if the edge from this node to the next node is part of the polygon edges
|
||||
private boolean isNextEdgeFromPolygon;
|
||||
|
||||
protected Node(
|
||||
Node(
|
||||
final double[] x,
|
||||
final double[] y,
|
||||
final int index,
|
||||
|
@ -1600,7 +1639,7 @@ public final class Tessellator {
|
|||
Node[] vertex;
|
||||
boolean[] edgeFromPolygon;
|
||||
|
||||
protected Triangle(
|
||||
private Triangle(
|
||||
Node a,
|
||||
boolean isABfromPolygon,
|
||||
Node b,
|
||||
|
@ -1636,19 +1675,6 @@ public final class Tessellator {
|
|||
return edgeFromPolygon[startVertex];
|
||||
}
|
||||
|
||||
/** utility method to compute whether the point is in the triangle */
|
||||
protected boolean containsPoint(double lat, double lon) {
|
||||
return pointInTriangle(
|
||||
lon,
|
||||
lat,
|
||||
vertex[0].getX(),
|
||||
vertex[0].getY(),
|
||||
vertex[1].getX(),
|
||||
vertex[1].getY(),
|
||||
vertex[2].getX(),
|
||||
vertex[2].getY());
|
||||
}
|
||||
|
||||
/** pretty print the triangle vertices */
|
||||
@Override
|
||||
public String toString() {
|
||||
|
|
|
@@ -4284,21 +4284,8 @@ public final class CheckIndex implements Closeable {
         int level = Integer.parseInt(args[i]);
         Level.checkIfLevelInBounds(level);
         opts.level = level;
-      } else if ("-fast".equals(arg)) {
-        // Deprecated. Remove in Lucene 11.
-        System.err.println(
-            "-fast is deprecated, use '-level 1' for explicitly verifying file checksums only. This is also now the default "
-                + "behaviour!");
-      } else if ("-slow".equals(arg)) {
-        // Deprecated. Remove in Lucene 11.
-        System.err.println("-slow is deprecated, use '-level 3' instead for slow checks");
-        opts.level = Level.MIN_LEVEL_FOR_SLOW_CHECKS;
       } else if ("-exorcise".equals(arg)) {
         opts.doExorcise = true;
-      } else if ("-crossCheckTermVectors".equals(arg)) {
-        // Deprecated. Remove in Lucene 11.
-        System.err.println("-crossCheckTermVectors is deprecated, use '-level 3' instead");
-        opts.level = Level.MAX_VALUE;
       } else if (arg.equals("-verbose")) {
         opts.verbose = true;
       } else if (arg.equals("-segment")) {
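With the deprecated -fast, -slow and -crossCheckTermVectors aliases removed, the single -level option is the way to choose the depth of checking (per the deleted messages, 1 is checksum-only and the default, 3 enables the slow checks). A small sketch of driving the tool programmatically; the index path is a placeholder:

    // Illustrative: run CheckIndex with the consolidated -level option instead of the
    // removed -fast/-slow/-crossCheckTermVectors flags. The path below is made up.
    public class CheckIndexLevelDemo {
      public static void main(String[] args) throws Exception {
        org.apache.lucene.index.CheckIndex.main(
            new String[] {"/path/to/index", "-level", "3"});
      }
    }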
@ -18,11 +18,10 @@ package org.apache.lucene.index;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.IdentityHashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import org.apache.lucene.codecs.DocValuesProducer;
|
||||
import org.apache.lucene.internal.hppc.IntObjectHashMap;
|
||||
import org.apache.lucene.internal.hppc.LongArrayList;
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
||||
|
@ -32,7 +31,7 @@ import org.apache.lucene.store.Directory;
|
|||
// producer?
|
||||
class SegmentDocValuesProducer extends DocValuesProducer {
|
||||
|
||||
final Map<String, DocValuesProducer> dvProducersByField = new HashMap<>();
|
||||
final IntObjectHashMap<DocValuesProducer> dvProducersByField = new IntObjectHashMap<>();
|
||||
final Set<DocValuesProducer> dvProducers =
|
||||
Collections.newSetFromMap(new IdentityHashMap<DocValuesProducer, Boolean>());
|
||||
final LongArrayList dvGens = new LongArrayList();
|
||||
|
@ -67,7 +66,7 @@ class SegmentDocValuesProducer extends DocValuesProducer {
|
|||
dvGens.add(docValuesGen);
|
||||
dvProducers.add(baseProducer);
|
||||
}
|
||||
dvProducersByField.put(fi.name, baseProducer);
|
||||
dvProducersByField.put(fi.number, baseProducer);
|
||||
} else {
|
||||
assert !dvGens.contains(docValuesGen);
|
||||
// otherwise, producer sees only the one fieldinfo it wrote
|
||||
|
@ -76,7 +75,7 @@ class SegmentDocValuesProducer extends DocValuesProducer {
|
|||
docValuesGen, si, dir, new FieldInfos(new FieldInfo[] {fi}));
|
||||
dvGens.add(docValuesGen);
|
||||
dvProducers.add(dvp);
|
||||
dvProducersByField.put(fi.name, dvp);
|
||||
dvProducersByField.put(fi.number, dvp);
|
||||
}
|
||||
}
|
||||
} catch (Throwable t) {
|
||||
|
@ -91,42 +90,42 @@ class SegmentDocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
|
||||
DocValuesProducer dvProducer = dvProducersByField.get(field.name);
|
||||
DocValuesProducer dvProducer = dvProducersByField.get(field.number);
|
||||
assert dvProducer != null;
|
||||
return dvProducer.getNumeric(field);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
|
||||
DocValuesProducer dvProducer = dvProducersByField.get(field.name);
|
||||
DocValuesProducer dvProducer = dvProducersByField.get(field.number);
|
||||
assert dvProducer != null;
|
||||
return dvProducer.getBinary(field);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SortedDocValues getSorted(FieldInfo field) throws IOException {
|
||||
DocValuesProducer dvProducer = dvProducersByField.get(field.name);
|
||||
DocValuesProducer dvProducer = dvProducersByField.get(field.number);
|
||||
assert dvProducer != null;
|
||||
return dvProducer.getSorted(field);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
|
||||
DocValuesProducer dvProducer = dvProducersByField.get(field.name);
|
||||
DocValuesProducer dvProducer = dvProducersByField.get(field.number);
|
||||
assert dvProducer != null;
|
||||
return dvProducer.getSortedNumeric(field);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
|
||||
DocValuesProducer dvProducer = dvProducersByField.get(field.name);
|
||||
DocValuesProducer dvProducer = dvProducersByField.get(field.number);
|
||||
assert dvProducer != null;
|
||||
return dvProducer.getSortedSet(field);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocValuesSkipper getSkipper(FieldInfo field) throws IOException {
|
||||
DocValuesProducer dvProducer = dvProducersByField.get(field.name);
|
||||
DocValuesProducer dvProducer = dvProducersByField.get(field.number);
|
||||
assert dvProducer != null;
|
||||
return dvProducer.getSkipper(field);
|
||||
}
|
||||
|
|
|
@@ -93,9 +93,8 @@ class StoredFieldsConsumer {

   void finish(int maxDoc) throws IOException {
     while (lastDoc < maxDoc - 1) {
-      startDocument(lastDoc);
+      startDocument(lastDoc + 1);
       finishDocument();
       ++lastDoc;
     }
   }

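The finish() fix above is an off-by-one in the loop that pads out documents with no stored fields: the document to create is the one after lastDoc, not lastDoc itself. A standalone illustration of which document numbers the corrected loop fills, with invented values for lastDoc and maxDoc:

    // Illustrative only: which documents the fixed padding loop creates.
    public class FillMissingDocsDemo {
      public static void main(String[] args) {
        int lastDoc = 3, maxDoc = 6; // docs 4 and 5 still need empty stored-fields entries
        while (lastDoc < maxDoc - 1) {
          System.out.println("fill empty doc " + (lastDoc + 1)); // the old code effectively reused lastDoc here
          ++lastDoc;
        }
      }
    }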
@@ -197,4 +197,14 @@ final class DefaultVectorUtilSupport implements VectorUtilSupport {
     }
     return squareSum;
   }
+
+  @Override
+  public int findNextGEQ(int[] buffer, int target, int from, int to) {
+    for (int i = from; i < to; ++i) {
+      if (buffer[i] >= target) {
+        return i;
+      }
+    }
+    return to;
+  }
 }
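findNextGEQ is a plain linear scan in this default implementation; the reason for putting it behind VectorUtilSupport is that the vectorized provider can supply a SIMD variant, presumably for advancing to a target value inside a decoded block of sorted integers such as doc ids. A self-contained sketch of the contract with a small worked example (the data is invented):

    // Sketch of the findNextGEQ contract on a buffer sorted up to `to` (exclusive).
    public class FindNextGeqDemo {
      static int findNextGEQ(int[] buffer, int target, int from, int to) {
        for (int i = from; i < to; ++i) {
          if (buffer[i] >= target) {
            return i;
          }
        }
        return to; // no element >= target: the exclusive end is returned
      }

      public static void main(String[] args) {
        int[] docs = {3, 7, 9, 14, 21, 40};
        System.out.println(findNextGEQ(docs, 10, 0, docs.length)); // 3, since docs[3] == 14
        System.out.println(findNextGEQ(docs, 50, 0, docs.length)); // 6, i.e. `to`, nothing matches
      }
    }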
@@ -34,19 +34,19 @@ public class PostingDecodingUtil {
    * Core methods for decoding blocks of docs / freqs / positions / offsets.
    *
    * <ul>
-   *   <li>Read {@code count} longs.
+   *   <li>Read {@code count} ints.
    *   <li>For all {@code i} >= 0 so that {@code bShift - i * dec} > 0, apply shift {@code
    *       bShift - i * dec} and store the result in {@code b} at offset {@code count * i}.
    *   <li>Apply mask {@code cMask} and store the result in {@code c} starting at offset {@code
    *       cIndex}.
    * </ul>
    */
-  public void splitLongs(
-      int count, long[] b, int bShift, int dec, long bMask, long[] c, int cIndex, long cMask)
+  public void splitInts(
+      int count, int[] b, int bShift, int dec, int bMask, int[] c, int cIndex, int cMask)
       throws IOException {
     // Default implementation, which takes advantage of the C2 compiler's loop unrolling and
     // auto-vectorization.
-    in.readLongs(c, cIndex, count);
+    in.readInts(c, cIndex, count);
     int maxIter = (bShift - 1) / dec;
     for (int i = 0; i < count; ++i) {
       for (int j = 0; j <= maxIter; ++j) {
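Read literally, the rewritten contract says that each of the count input ints is split into dec-bit slices: the slices at shifts bShift, bShift - dec, and so on go into b (grouped so that slice j of all inputs sits contiguously at offset count * j), while the cMask-masked low bits go into c. A scalar sketch of that contract, separate from the Lucene class, with one worked value; parameter names follow the javadoc and the bit widths are chosen only for the demo:

    // Scalar sketch of the splitInts contract: high bits in dec-wide slices to b,
    // cMask-masked low bits to c. Not the Lucene implementation itself.
    public class SplitIntsDemo {
      static void splitInts(
          int[] in, int count, int[] b, int bShift, int dec, int bMask, int[] c, int cIndex, int cMask) {
        int maxIter = (bShift - 1) / dec;
        for (int i = 0; i < count; ++i) {
          c[cIndex + i] = in[i] & cMask;
          for (int j = 0; j <= maxIter; ++j) {
            b[count * j + i] = (in[i] >>> (bShift - j * dec)) & bMask;
          }
        }
      }

      public static void main(String[] args) {
        int[] in = {0b10110101}; // one packed value
        int[] b = new int[2];
        int[] c = new int[1];
        // bShift = 6, dec = 3: slices taken at shifts 6 and 3, each masked to 3 bits; low 3 bits go to c.
        splitInts(in, 1, b, 6, 3, 0b111, c, 0, 0b111);
        System.out.println(b[0] + " " + b[1] + " " + c[0]); // prints: 2 6 5
      }
    }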
@@ -44,4 +44,12 @@ public interface VectorUtilSupport {

   /** Returns the sum of squared differences of the two byte vectors. */
   int squareDistance(byte[] a, byte[] b);
+
+  /**
+   * Given an array {@code buffer} that is sorted between indexes {@code 0} inclusive and {@code to}
+   * exclusive, find the first array index whose value is greater than or equal to {@code target}.
+   * This index is guaranteed to be at least {@code from}. If there is no such array index, {@code
+   * to} is returned.
+   */
+  int findNextGEQ(int[] buffer, int target, int from, int to);
 }
@ -38,12 +38,16 @@ import org.apache.lucene.util.VectorUtil;
* vectorization modules in the Java runtime this class provides optimized implementations (using
* SIMD) of several algorithms used throughout Apache Lucene.
*
* <p>Expert: set the {@value #UPPER_JAVA_FEATURE_VERSION_SYSPROP} system property to increase the
* set of Java versions this class will provide optimized implementations for.
*
* @lucene.internal
*/
public abstract class VectorizationProvider {

static final OptionalInt TESTS_VECTOR_SIZE;
static final boolean TESTS_FORCE_INTEGER_VECTORS;
static final int UPPER_JAVA_FEATURE_VERSION = getUpperJavaFeatureVersion();

static {
var vs = OptionalInt.empty();
@ -71,6 +75,27 @@ public abstract class VectorizationProvider {
TESTS_FORCE_INTEGER_VECTORS = enforce;
}

private static final String UPPER_JAVA_FEATURE_VERSION_SYSPROP =
"org.apache.lucene.vectorization.upperJavaFeatureVersion";
private static final int DEFAULT_UPPER_JAVA_FEATURE_VERSION = 23;

private static int getUpperJavaFeatureVersion() {
int runtimeVersion = DEFAULT_UPPER_JAVA_FEATURE_VERSION;
try {
String str = System.getProperty(UPPER_JAVA_FEATURE_VERSION_SYSPROP);
if (str != null) {
runtimeVersion = Math.max(Integer.parseInt(str), runtimeVersion);
}
} catch (@SuppressWarnings("unused") NumberFormatException | SecurityException ignored) {
Logger.getLogger(VectorizationProvider.class.getName())
.warning(
"Cannot read sysprop "
+ UPPER_JAVA_FEATURE_VERSION_SYSPROP
+ ", so the default value will be used.");
}
return runtimeVersion;
}

/**
* Returns the default instance of the provider matching vectorization possibilities of actual
* runtime.
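A hedged example of how a user might raise the cap. It assumes the property is set before VectorizationProvider is first loaded, since the value is read in the static initializer above; the version number 24 is only an illustration. The next hunk shows lookup() comparing the runtime feature version against this cap.

// Illustrative only: opt a newer JDK into the optimized provider.
// Equivalent to passing -Dorg.apache.lucene.vectorization.upperJavaFeatureVersion=24 on the JVM command line.
public class RaiseVectorizationCap {
  public static void main(String[] args) {
    // Must run before VectorizationProvider initializes.
    System.setProperty("org.apache.lucene.vectorization.upperJavaFeatureVersion", "24");
    System.out.println(Runtime.version().feature()); // the value lookup() compares against the cap
  }
}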
@ -108,7 +133,7 @@ public abstract class VectorizationProvider {
static VectorizationProvider lookup(boolean testMode) {
final int runtimeVersion = Runtime.version().feature();
assert runtimeVersion >= 21;
if (runtimeVersion <= 23) {
if (runtimeVersion <= UPPER_JAVA_FEATURE_VERSION) {
// only use vector module with Hotspot VM
if (!Constants.IS_HOTSPOT_VM) {
LOG.warning(
@ -190,8 +215,8 @@ public abstract class VectorizationProvider {
Set.of(
"org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil",
"org.apache.lucene.util.VectorUtil",
"org.apache.lucene.codecs.lucene912.Lucene912PostingsReader",
"org.apache.lucene.codecs.lucene912.PostingIndexInput");
"org.apache.lucene.codecs.lucene101.Lucene101PostingsReader",
"org.apache.lucene.codecs.lucene101.PostingIndexInput");

private static void ensureCaller() {
final boolean validCaller =
@ -87,6 +87,22 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a collection of BooleanClauses to this {@link Builder}. Note that the order in which
|
||||
* clauses are added does not have any impact on matching documents or query performance.
|
||||
*
|
||||
* @throws IndexSearcher.TooManyClauses if the new number of clauses exceeds the maximum clause
|
||||
* number
|
||||
*/
|
||||
public Builder add(Collection<BooleanClause> collection) {
|
||||
// see #addClause(BooleanClause)
|
||||
if ((clauses.size() + collection.size()) > IndexSearcher.maxClauseCount) {
|
||||
throw new IndexSearcher.TooManyClauses();
|
||||
}
|
||||
clauses.addAll(collection);
|
||||
return this;
|
||||
}
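A short usage sketch of the new add(Collection) overload introduced above. The field names and terms are invented, and the sketch simply assumes the usual BooleanClause/TermQuery classes from this package:

import java.util.List;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;

class AddClausesExample {
  static BooleanQuery build() {
    // Build the clauses up front, then hand them to the builder in one call.
    List<BooleanClause> clauses =
        List.of(
            new BooleanClause(new TermQuery(new Term("body", "lucene")), Occur.SHOULD),
            new BooleanClause(new TermQuery(new Term("body", "search")), Occur.SHOULD));
    return new BooleanQuery.Builder().add(clauses).build();
  }
}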
|
||||
|
||||
/**
|
||||
* Add a new clause to this {@link Builder}. Note that the order in which clauses are added does
|
||||
* not have any impact on matching documents or query performance.
|
||||
|
@ -136,7 +152,7 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
|
|||
}
|
||||
|
||||
/** Return the collection of queries for the given {@link Occur}. */
|
||||
Collection<Query> getClauses(Occur occur) {
|
||||
public Collection<Query> getClauses(Occur occur) {
|
||||
return clauseSets.get(occur);
|
||||
}
|
||||
|
||||
|
|
|
@ -20,13 +20,14 @@ import java.io.IOException;
|
|||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Objects;
|
||||
import org.apache.lucene.internal.hppc.LongArrayList;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
/**
|
||||
* {@link BulkScorer} that is used for pure disjunctions and disjunctions that have low values of
|
||||
* {@link BooleanQuery.Builder#setMinimumNumberShouldMatch(int)} and dense clauses. This scorer
|
||||
* scores documents by batches of 2048 docs.
|
||||
* scores documents by batches of 4,096 docs.
|
||||
*/
|
||||
final class BooleanScorer extends BulkScorer {
|
||||
|
||||
|
@ -41,71 +42,32 @@ final class BooleanScorer extends BulkScorer {
|
|||
int freq;
|
||||
}
|
||||
|
||||
private class BulkScorerAndDoc {
|
||||
final BulkScorer scorer;
|
||||
final long cost;
|
||||
int next;
|
||||
|
||||
BulkScorerAndDoc(BulkScorer scorer) {
|
||||
this.scorer = scorer;
|
||||
this.cost = scorer.cost();
|
||||
this.next = -1;
|
||||
}
|
||||
|
||||
void advance(int min) throws IOException {
|
||||
score(orCollector, null, min, min);
|
||||
}
|
||||
|
||||
void score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
|
||||
next = scorer.score(collector, acceptDocs, min, max);
|
||||
}
|
||||
}
|
||||
|
||||
// See WANDScorer for an explanation
|
||||
private static long cost(Collection<BulkScorer> scorers, int minShouldMatch) {
|
||||
final PriorityQueue<BulkScorer> pq =
|
||||
new PriorityQueue<BulkScorer>(scorers.size() - minShouldMatch + 1) {
|
||||
@Override
|
||||
protected boolean lessThan(BulkScorer a, BulkScorer b) {
|
||||
return a.cost() > b.cost();
|
||||
}
|
||||
};
|
||||
for (BulkScorer scorer : scorers) {
|
||||
pq.insertWithOverflow(scorer);
|
||||
}
|
||||
long cost = 0;
|
||||
for (BulkScorer scorer = pq.pop(); scorer != null; scorer = pq.pop()) {
|
||||
cost += scorer.cost();
|
||||
}
|
||||
return cost;
|
||||
}
|
||||
|
||||
static final class HeadPriorityQueue extends PriorityQueue<BulkScorerAndDoc> {
|
||||
static final class HeadPriorityQueue extends PriorityQueue<DisiWrapper> {
|
||||
|
||||
public HeadPriorityQueue(int maxSize) {
|
||||
super(maxSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean lessThan(BulkScorerAndDoc a, BulkScorerAndDoc b) {
|
||||
return a.next < b.next;
|
||||
protected boolean lessThan(DisiWrapper a, DisiWrapper b) {
|
||||
return a.doc < b.doc;
|
||||
}
|
||||
}
|
||||
|
||||
static final class TailPriorityQueue extends PriorityQueue<BulkScorerAndDoc> {
|
||||
static final class TailPriorityQueue extends PriorityQueue<DisiWrapper> {
|
||||
|
||||
public TailPriorityQueue(int maxSize) {
|
||||
super(maxSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean lessThan(BulkScorerAndDoc a, BulkScorerAndDoc b) {
|
||||
protected boolean lessThan(DisiWrapper a, DisiWrapper b) {
|
||||
return a.cost < b.cost;
|
||||
}
|
||||
|
||||
public BulkScorerAndDoc get(int i) {
|
||||
public DisiWrapper get(int i) {
|
||||
Objects.checkIndex(i, size());
|
||||
return (BulkScorerAndDoc) getHeapArray()[1 + i];
|
||||
return (DisiWrapper) getHeapArray()[1 + i];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -115,7 +77,7 @@ final class BooleanScorer extends BulkScorer {
|
|||
// This is basically an inlined FixedBitSet... seems to help with bound checks
|
||||
final long[] matching = new long[SET_SIZE];
|
||||
|
||||
final BulkScorerAndDoc[] leads;
|
||||
final DisiWrapper[] leads;
|
||||
final HeadPriorityQueue head;
|
||||
final TailPriorityQueue tail;
|
||||
final Score score = new Score();
|
||||
|
@ -123,31 +85,6 @@ final class BooleanScorer extends BulkScorer {
|
|||
final long cost;
|
||||
final boolean needsScores;
|
||||
|
||||
final class OrCollector implements LeafCollector {
|
||||
Scorable scorer;
|
||||
|
||||
@Override
|
||||
public void setScorer(Scorable scorer) {
|
||||
this.scorer = scorer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void collect(int doc) throws IOException {
|
||||
final int i = doc & MASK;
|
||||
final int idx = i >>> 6;
|
||||
matching[idx] |= 1L << i;
|
||||
if (buckets != null) {
|
||||
final Bucket bucket = buckets[i];
|
||||
bucket.freq++;
|
||||
if (needsScores) {
|
||||
bucket.score += scorer.score();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final OrCollector orCollector = new OrCollector();
|
||||
|
||||
final class DocIdStreamView extends DocIdStream {
|
||||
|
||||
int base;
|
||||
|
@ -194,7 +131,7 @@ final class BooleanScorer extends BulkScorer {
|
|||
|
||||
private final DocIdStreamView docIdStreamView = new DocIdStreamView();
|
||||
|
||||
BooleanScorer(Collection<BulkScorer> scorers, int minShouldMatch, boolean needsScores) {
|
||||
BooleanScorer(Collection<Scorer> scorers, int minShouldMatch, boolean needsScores) {
|
||||
if (minShouldMatch < 1 || minShouldMatch > scorers.size()) {
|
||||
throw new IllegalArgumentException(
|
||||
"minShouldMatch should be within 1..num_scorers. Got " + minShouldMatch);
|
||||
|
@ -211,18 +148,21 @@ final class BooleanScorer extends BulkScorer {
|
|||
} else {
|
||||
buckets = null;
|
||||
}
|
||||
this.leads = new BulkScorerAndDoc[scorers.size()];
|
||||
this.leads = new DisiWrapper[scorers.size()];
|
||||
this.head = new HeadPriorityQueue(scorers.size() - minShouldMatch + 1);
|
||||
this.tail = new TailPriorityQueue(minShouldMatch - 1);
|
||||
this.minShouldMatch = minShouldMatch;
|
||||
this.needsScores = needsScores;
|
||||
for (BulkScorer scorer : scorers) {
|
||||
final BulkScorerAndDoc evicted = tail.insertWithOverflow(new BulkScorerAndDoc(scorer));
|
||||
LongArrayList costs = new LongArrayList(scorers.size());
|
||||
for (Scorer scorer : scorers) {
|
||||
DisiWrapper w = new DisiWrapper(scorer);
|
||||
costs.add(w.cost);
|
||||
final DisiWrapper evicted = tail.insertWithOverflow(w);
|
||||
if (evicted != null) {
|
||||
head.add(evicted);
|
||||
}
|
||||
}
|
||||
this.cost = cost(scorers, minShouldMatch);
|
||||
this.cost = ScorerUtil.costWithMinShouldMatch(costs.stream(), costs.size(), minShouldMatch);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -230,19 +170,49 @@ final class BooleanScorer extends BulkScorer {
|
|||
return cost;
|
||||
}
|
||||
|
||||
private void scoreDisiWrapperIntoBitSet(DisiWrapper w, Bits acceptDocs, int min, int max)
|
||||
throws IOException {
|
||||
boolean needsScores = BooleanScorer.this.needsScores;
|
||||
long[] matching = BooleanScorer.this.matching;
|
||||
Bucket[] buckets = BooleanScorer.this.buckets;
|
||||
|
||||
DocIdSetIterator it = w.iterator;
|
||||
Scorer scorer = w.scorer;
|
||||
int doc = w.doc;
|
||||
if (doc < min) {
|
||||
doc = it.advance(min);
|
||||
}
|
||||
for (; doc < max; doc = it.nextDoc()) {
|
||||
if (acceptDocs == null || acceptDocs.get(doc)) {
|
||||
final int i = doc & MASK;
|
||||
final int idx = i >> 6;
|
||||
matching[idx] |= 1L << i;
|
||||
if (buckets != null) {
|
||||
final Bucket bucket = buckets[i];
|
||||
bucket.freq++;
|
||||
if (needsScores) {
|
||||
bucket.score += scorer.score();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
w.doc = doc;
|
||||
}
|
||||
|
||||
private void scoreWindowIntoBitSetAndReplay(
|
||||
LeafCollector collector,
|
||||
Bits acceptDocs,
|
||||
int base,
|
||||
int min,
|
||||
int max,
|
||||
BulkScorerAndDoc[] scorers,
|
||||
DisiWrapper[] scorers,
|
||||
int numScorers)
|
||||
throws IOException {
|
||||
for (int i = 0; i < numScorers; ++i) {
|
||||
final BulkScorerAndDoc scorer = scorers[i];
|
||||
assert scorer.next < max;
|
||||
scorer.score(orCollector, acceptDocs, min, max);
|
||||
final DisiWrapper w = scorers[i];
|
||||
assert w.doc < max;
|
||||
scoreDisiWrapperIntoBitSet(w, acceptDocs, min, max);
|
||||
}
|
||||
|
||||
docIdStreamView.base = base;
|
||||
|
@ -251,20 +221,20 @@ final class BooleanScorer extends BulkScorer {
|
|||
Arrays.fill(matching, 0L);
|
||||
}
|
||||
|
||||
private BulkScorerAndDoc advance(int min) throws IOException {
|
||||
private DisiWrapper advance(int min) throws IOException {
|
||||
assert tail.size() == minShouldMatch - 1;
|
||||
final HeadPriorityQueue head = this.head;
|
||||
final TailPriorityQueue tail = this.tail;
|
||||
BulkScorerAndDoc headTop = head.top();
|
||||
BulkScorerAndDoc tailTop = tail.top();
|
||||
while (headTop.next < min) {
|
||||
DisiWrapper headTop = head.top();
|
||||
DisiWrapper tailTop = tail.top();
|
||||
while (headTop.doc < min) {
|
||||
if (tailTop == null || headTop.cost <= tailTop.cost) {
|
||||
headTop.advance(min);
|
||||
headTop.doc = headTop.iterator.advance(min);
|
||||
headTop = head.updateTop();
|
||||
} else {
|
||||
// swap the top of head and tail
|
||||
final BulkScorerAndDoc previousHeadTop = headTop;
|
||||
tailTop.advance(min);
|
||||
final DisiWrapper previousHeadTop = headTop;
|
||||
tailTop.doc = tailTop.iterator.advance(min);
|
||||
headTop = head.updateTop(tailTop);
|
||||
tailTop = tail.updateTop(previousHeadTop);
|
||||
}
|
||||
|
@ -282,9 +252,11 @@ final class BooleanScorer extends BulkScorer {
|
|||
throws IOException {
|
||||
while (maxFreq < minShouldMatch && maxFreq + tail.size() >= minShouldMatch) {
|
||||
// a match is still possible
|
||||
final BulkScorerAndDoc candidate = tail.pop();
|
||||
candidate.advance(windowMin);
|
||||
if (candidate.next < windowMax) {
|
||||
final DisiWrapper candidate = tail.pop();
|
||||
if (candidate.doc < windowMin) {
|
||||
candidate.doc = candidate.iterator.advance(windowMin);
|
||||
}
|
||||
if (candidate.doc < windowMax) {
|
||||
leads[maxFreq++] = candidate;
|
||||
} else {
|
||||
head.add(candidate);
|
||||
|
@ -304,7 +276,7 @@ final class BooleanScorer extends BulkScorer {
|
|||
|
||||
// Push back scorers into head and tail
|
||||
for (int i = 0; i < maxFreq; ++i) {
|
||||
final BulkScorerAndDoc evicted = head.insertWithOverflow(leads[i]);
|
||||
final DisiWrapper evicted = head.insertWithOverflow(leads[i]);
|
||||
if (evicted != null) {
|
||||
tail.add(evicted);
|
||||
}
|
||||
|
@ -312,7 +284,7 @@ final class BooleanScorer extends BulkScorer {
|
|||
}
|
||||
|
||||
private void scoreWindowSingleScorer(
|
||||
BulkScorerAndDoc bulkScorer,
|
||||
DisiWrapper w,
|
||||
LeafCollector collector,
|
||||
Bits acceptDocs,
|
||||
int windowMin,
|
||||
|
@ -320,33 +292,44 @@ final class BooleanScorer extends BulkScorer {
|
|||
int max)
|
||||
throws IOException {
|
||||
assert tail.size() == 0;
|
||||
final int nextWindowBase = head.top().next & ~MASK;
|
||||
final int nextWindowBase = head.top().doc & ~MASK;
|
||||
final int end = Math.max(windowMax, Math.min(max, nextWindowBase));
|
||||
|
||||
bulkScorer.score(collector, acceptDocs, windowMin, end);
|
||||
DocIdSetIterator it = w.iterator;
|
||||
int doc = w.doc;
|
||||
if (doc < windowMin) {
|
||||
doc = it.advance(windowMin);
|
||||
}
|
||||
collector.setScorer(w.scorer);
|
||||
for (; doc < end; doc = it.nextDoc()) {
|
||||
if (acceptDocs == null || acceptDocs.get(doc)) {
|
||||
collector.collect(doc);
|
||||
}
|
||||
}
|
||||
w.doc = doc;
|
||||
|
||||
// reset the scorer that should be used for the general case
|
||||
collector.setScorer(score);
|
||||
}
|
||||
|
||||
private BulkScorerAndDoc scoreWindow(
|
||||
BulkScorerAndDoc top, LeafCollector collector, Bits acceptDocs, int min, int max)
|
||||
private DisiWrapper scoreWindow(
|
||||
DisiWrapper top, LeafCollector collector, Bits acceptDocs, int min, int max)
|
||||
throws IOException {
|
||||
final int windowBase = top.next & ~MASK; // find the window that the next match belongs to
|
||||
final int windowBase = top.doc & ~MASK; // find the window that the next match belongs to
|
||||
final int windowMin = Math.max(min, windowBase);
|
||||
final int windowMax = Math.min(max, windowBase + SIZE);
|
||||
|
||||
// Fill 'leads' with all scorers from 'head' that are in the right window
|
||||
leads[0] = head.pop();
|
||||
int maxFreq = 1;
|
||||
while (head.size() > 0 && head.top().next < windowMax) {
|
||||
while (head.size() > 0 && head.top().doc < windowMax) {
|
||||
leads[maxFreq++] = head.pop();
|
||||
}
|
||||
|
||||
if (minShouldMatch == 1 && maxFreq == 1) {
|
||||
// special case: only one scorer can match in the current window,
|
||||
// we can collect directly
|
||||
final BulkScorerAndDoc bulkScorer = leads[0];
|
||||
final DisiWrapper bulkScorer = leads[0];
|
||||
scoreWindowSingleScorer(bulkScorer, collector, acceptDocs, windowMin, windowMax, max);
|
||||
return head.add(bulkScorer);
|
||||
} else {
|
||||
|
@ -360,11 +343,11 @@ final class BooleanScorer extends BulkScorer {
|
|||
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
|
||||
collector.setScorer(score);
|
||||
|
||||
BulkScorerAndDoc top = advance(min);
|
||||
while (top.next < max) {
|
||||
DisiWrapper top = advance(min);
|
||||
while (top.doc < max) {
|
||||
top = scoreWindow(top, collector, acceptDocs, min, max);
|
||||
}
|
||||
|
||||
return top.next;
|
||||
return top.doc;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -289,9 +289,9 @@ final class BooleanScorerSupplier extends ScorerSupplier {
|
|||
return new MaxScoreBulkScorer(maxDoc, optionalScorers);
|
||||
}
|
||||
|
||||
List<BulkScorer> optional = new ArrayList<BulkScorer>();
|
||||
List<Scorer> optional = new ArrayList<Scorer>();
|
||||
for (ScorerSupplier ss : subs.get(Occur.SHOULD)) {
|
||||
optional.add(ss.bulkScorer());
|
||||
optional.add(ss.get(Long.MAX_VALUE));
|
||||
}
|
||||
|
||||
return new BooleanScorer(optional, Math.max(1, minShouldMatch), scoreMode.needsScores());
|
||||
|
|
|
@ -153,70 +153,6 @@ final class BooleanWeight extends Weight {
|
|||
return MatchesUtils.fromSubMatches(matches);
|
||||
}
|
||||
|
||||
// Return a BulkScorer for the optional clauses only,
|
||||
// or null if it is not applicable
|
||||
// pkg-private for forcing use of BooleanScorer in tests
|
||||
BulkScorer optionalBulkScorer(LeafReaderContext context) throws IOException {
|
||||
if (scoreMode == ScoreMode.TOP_SCORES) {
|
||||
if (!query.isPureDisjunction()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
List<ScorerSupplier> optional = new ArrayList<>();
|
||||
for (WeightedBooleanClause wc : weightedClauses) {
|
||||
Weight w = wc.weight;
|
||||
BooleanClause c = wc.clause;
|
||||
if (c.occur() != Occur.SHOULD) {
|
||||
continue;
|
||||
}
|
||||
ScorerSupplier scorer = w.scorerSupplier(context);
|
||||
if (scorer != null) {
|
||||
optional.add(scorer);
|
||||
}
|
||||
}
|
||||
|
||||
if (optional.size() <= 1) {
|
||||
return null;
|
||||
}
|
||||
|
||||
List<Scorer> optionalScorers = new ArrayList<>();
|
||||
for (ScorerSupplier ss : optional) {
|
||||
optionalScorers.add(ss.get(Long.MAX_VALUE));
|
||||
}
|
||||
|
||||
return new MaxScoreBulkScorer(context.reader().maxDoc(), optionalScorers);
|
||||
}
|
||||
|
||||
List<BulkScorer> optional = new ArrayList<BulkScorer>();
|
||||
for (WeightedBooleanClause wc : weightedClauses) {
|
||||
Weight w = wc.weight;
|
||||
BooleanClause c = wc.clause;
|
||||
if (c.occur() != Occur.SHOULD) {
|
||||
continue;
|
||||
}
|
||||
BulkScorer subScorer = w.bulkScorer(context);
|
||||
|
||||
if (subScorer != null) {
|
||||
optional.add(subScorer);
|
||||
}
|
||||
}
|
||||
|
||||
if (optional.size() == 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (query.getMinimumNumberShouldMatch() > optional.size()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (optional.size() == 1) {
|
||||
return optional.get(0);
|
||||
}
|
||||
|
||||
return new BooleanScorer(
|
||||
optional, Math.max(1, query.getMinimumNumberShouldMatch()), scoreMode.needsScores());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int count(LeafReaderContext context) throws IOException {
|
||||
final int numDocs = context.reader().numDocs();
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.util.Collections;
|
|||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
|
||||
/**
|
||||
|
@ -44,6 +45,7 @@ public final class DisjunctionMaxQuery extends Query implements Iterable<Query>
|
|||
|
||||
/* The subqueries */
|
||||
private final Multiset<Query> disjuncts = new Multiset<>();
|
||||
private final List<Query> orderedQueries; // used for toString()
|
||||
|
||||
/* Multiple of the non-max disjunct scores added into our final score. Non-zero values support tie-breaking. */
|
||||
private final float tieBreakerMultiplier;
|
||||
|
@ -65,6 +67,7 @@ public final class DisjunctionMaxQuery extends Query implements Iterable<Query>
|
|||
}
|
||||
this.tieBreakerMultiplier = tieBreakerMultiplier;
|
||||
this.disjuncts.addAll(disjuncts);
|
||||
this.orderedQueries = new ArrayList<>(disjuncts); // order from the caller
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -295,24 +298,19 @@ public final class DisjunctionMaxQuery extends Query implements Iterable<Query>
|
|||
*/
|
||||
@Override
|
||||
public String toString(String field) {
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
buffer.append("(");
|
||||
Iterator<Query> it = disjuncts.iterator();
|
||||
for (int i = 0; it.hasNext(); i++) {
|
||||
Query subquery = it.next();
|
||||
if (subquery instanceof BooleanQuery) { // wrap sub-bools in parens
|
||||
buffer.append("(");
|
||||
buffer.append(subquery.toString(field));
|
||||
buffer.append(")");
|
||||
} else buffer.append(subquery.toString(field));
|
||||
if (i != disjuncts.size() - 1) buffer.append(" | ");
|
||||
}
|
||||
buffer.append(")");
|
||||
if (tieBreakerMultiplier != 0.0f) {
|
||||
buffer.append("~");
|
||||
buffer.append(tieBreakerMultiplier);
|
||||
}
|
||||
return buffer.toString();
|
||||
return this.orderedQueries.stream()
|
||||
.map(
|
||||
subquery -> {
|
||||
if (subquery instanceof BooleanQuery) { // wrap sub-bools in parens
|
||||
return "(" + subquery.toString(field) + ")";
|
||||
}
|
||||
return subquery.toString(field);
|
||||
})
|
||||
.collect(
|
||||
Collectors.joining(
|
||||
" | ",
|
||||
"(",
|
||||
")" + ((tieBreakerMultiplier != 0.0f) ? "~" + tieBreakerMultiplier : "")));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -1,147 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.search;
|
||||
|
||||
import java.util.concurrent.atomic.LongAdder;
|
||||
|
||||
/** Used for defining custom algorithms to allow searches to early terminate */
|
||||
abstract class HitsThresholdChecker {
|
||||
/** Implementation of HitsThresholdChecker which allows global hit counting */
|
||||
private static class GlobalHitsThresholdChecker extends HitsThresholdChecker {
|
||||
private final LongAdder globalHitCount = new LongAdder();
|
||||
// Cache whether the threshold has been reached already. It is not volatile or synchronized on
|
||||
// purpose to contain the overhead of reading the value similarly to what String#hashCode()
|
||||
// does. This does not affect correctness.
|
||||
private boolean thresholdReached = false;
|
||||
|
||||
GlobalHitsThresholdChecker(int totalHitsThreshold) {
|
||||
super(totalHitsThreshold);
|
||||
assert totalHitsThreshold != Integer.MAX_VALUE;
|
||||
}
|
||||
|
||||
@Override
|
||||
void incrementHitCount() {
|
||||
if (thresholdReached == false) {
|
||||
globalHitCount.increment();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
boolean isThresholdReached() {
|
||||
if (thresholdReached) {
|
||||
return true;
|
||||
}
|
||||
if (globalHitCount.longValue() > getHitsThreshold()) {
|
||||
thresholdReached = true;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
ScoreMode scoreMode() {
|
||||
return ScoreMode.TOP_SCORES;
|
||||
}
|
||||
}
|
||||
|
||||
/** Default implementation of HitsThresholdChecker to be used for single threaded execution */
|
||||
private static class LocalHitsThresholdChecker extends HitsThresholdChecker {
|
||||
private int hitCount;
|
||||
|
||||
LocalHitsThresholdChecker(int totalHitsThreshold) {
|
||||
super(totalHitsThreshold);
|
||||
assert totalHitsThreshold != Integer.MAX_VALUE;
|
||||
}
|
||||
|
||||
@Override
|
||||
void incrementHitCount() {
|
||||
++hitCount;
|
||||
}
|
||||
|
||||
@Override
|
||||
boolean isThresholdReached() {
|
||||
return hitCount > getHitsThreshold();
|
||||
}
|
||||
|
||||
@Override
|
||||
ScoreMode scoreMode() {
|
||||
return ScoreMode.TOP_SCORES;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* No-op implementation of {@link HitsThresholdChecker} that does no counting, as the threshold
|
||||
* can never be reached. This is useful for cases where early termination is never desired, so
|
||||
* that the overhead of counting hits can be avoided.
|
||||
*/
|
||||
private static final HitsThresholdChecker EXACT_HITS_COUNT_THRESHOLD_CHECKER =
|
||||
new HitsThresholdChecker(Integer.MAX_VALUE) {
|
||||
@Override
|
||||
void incrementHitCount() {
|
||||
// noop
|
||||
}
|
||||
|
||||
@Override
|
||||
boolean isThresholdReached() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
ScoreMode scoreMode() {
|
||||
return ScoreMode.COMPLETE;
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
* Returns a threshold checker that is useful for single threaded searches
|
||||
*/
|
||||
static HitsThresholdChecker create(final int totalHitsThreshold) {
|
||||
return totalHitsThreshold == Integer.MAX_VALUE
|
||||
? HitsThresholdChecker.EXACT_HITS_COUNT_THRESHOLD_CHECKER
|
||||
: new LocalHitsThresholdChecker(totalHitsThreshold);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns a threshold checker that is based on a shared counter
|
||||
*/
|
||||
static HitsThresholdChecker createShared(final int totalHitsThreshold) {
|
||||
return totalHitsThreshold == Integer.MAX_VALUE
|
||||
? HitsThresholdChecker.EXACT_HITS_COUNT_THRESHOLD_CHECKER
|
||||
: new GlobalHitsThresholdChecker(totalHitsThreshold);
|
||||
}
|
||||
|
||||
private final int totalHitsThreshold;
|
||||
|
||||
HitsThresholdChecker(int totalHitsThreshold) {
|
||||
if (totalHitsThreshold < 0) {
|
||||
throw new IllegalArgumentException(
|
||||
"totalHitsThreshold must be >= 0, got " + totalHitsThreshold);
|
||||
}
|
||||
this.totalHitsThreshold = totalHitsThreshold;
|
||||
}
|
||||
|
||||
final int getHitsThreshold() {
|
||||
return totalHitsThreshold;
|
||||
}
|
||||
|
||||
abstract boolean isThresholdReached();
|
||||
|
||||
abstract ScoreMode scoreMode();
|
||||
|
||||
abstract void incrementHitCount();
|
||||
}
|
|
@ -106,6 +106,10 @@ public final class ImpactsDISI extends DocIdSetIterator {
|
|||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
DocIdSetIterator in = this.in;
|
||||
if (in.docID() < upTo) {
|
||||
return in.nextDoc();
|
||||
}
|
||||
return advance(in.docID() + 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -27,7 +27,6 @@ import java.util.Objects;
|
|||
import java.util.Set;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.Executor;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Supplier;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
@ -115,13 +114,7 @@ public class IndexSearcher {
|
|||
protected final IndexReaderContext readerContext;
|
||||
protected final List<LeafReaderContext> leafContexts;
|
||||
|
||||
/**
|
||||
* Used with executor - LeafSlice supplier where each slice holds a set of leafs executed within
|
||||
* one thread. We are caching it instead of creating it eagerly to avoid calling a protected
|
||||
* method from constructor, which is a bad practice. Always non-null, regardless of whether an
|
||||
* executor is provided or not.
|
||||
*/
|
||||
private final Supplier<LeafSlice[]> leafSlicesSupplier;
|
||||
private volatile LeafSlice[] leafSlices;
|
||||
|
||||
// Used internally for load balancing threads executing for the query
|
||||
private final TaskExecutor taskExecutor;
|
||||
|
@ -230,20 +223,18 @@ public class IndexSearcher {
|
|||
executor == null ? new TaskExecutor(Runnable::run) : new TaskExecutor(executor);
|
||||
this.readerContext = context;
|
||||
leafContexts = context.leaves();
|
||||
Function<List<LeafReaderContext>, LeafSlice[]> slicesProvider =
|
||||
executor == null
|
||||
? leaves ->
|
||||
leaves.isEmpty()
|
||||
? new LeafSlice[0]
|
||||
: new LeafSlice[] {
|
||||
new LeafSlice(
|
||||
new ArrayList<>(
|
||||
leaves.stream()
|
||||
.map(LeafReaderContextPartition::createForEntireSegment)
|
||||
.toList()))
|
||||
}
|
||||
: this::slices;
|
||||
leafSlicesSupplier = new CachingLeafSlicesSupplier(slicesProvider, leafContexts);
|
||||
if (executor == null) {
|
||||
leafSlices =
|
||||
leafContexts.isEmpty()
|
||||
? new LeafSlice[0]
|
||||
: new LeafSlice[] {
|
||||
new LeafSlice(
|
||||
new ArrayList<>(
|
||||
leafContexts.stream()
|
||||
.map(LeafReaderContextPartition::createForEntireSegment)
|
||||
.toList()))
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -540,7 +531,43 @@ public class IndexSearcher {
|
|||
* @lucene.experimental
|
||||
*/
|
||||
public final LeafSlice[] getSlices() {
|
||||
return leafSlicesSupplier.get();
|
||||
LeafSlice[] res = leafSlices;
|
||||
if (res == null) {
|
||||
res = computeAndCacheSlices();
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
private synchronized LeafSlice[] computeAndCacheSlices() {
|
||||
LeafSlice[] res = leafSlices;
|
||||
if (res == null) {
|
||||
res = slices(leafContexts);
|
||||
/*
|
||||
* Enforce that there aren't multiple leaf partitions within the same leaf slice pointing to the
|
||||
* same leaf context. It is a requirement that {@link Collector#getLeafCollector(LeafReaderContext)}
|
||||
* gets called once per leaf context. Also, it does not make sense to partition a segment to then search
|
||||
* those partitions as part of the same slice, because the goal of partitioning is parallel searching
|
||||
* which happens at the slice level.
|
||||
*/
|
||||
for (LeafSlice leafSlice : res) {
|
||||
if (leafSlice.partitions.length <= 1) {
|
||||
continue;
|
||||
}
|
||||
enforceDistinctLeaves(leafSlice);
|
||||
}
|
||||
leafSlices = res;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
private static void enforceDistinctLeaves(LeafSlice leafSlice) {
|
||||
Set<LeafReaderContext> distinctLeaves = new HashSet<>();
|
||||
for (LeafReaderContextPartition leafPartition : leafSlice.partitions) {
|
||||
if (distinctLeaves.add(leafPartition.ctx) == false) {
|
||||
throw new IllegalStateException(
|
||||
"The same slice targets multiple leaf partitions of the same leaf reader context. A physical segment should rather get partitioned to be searched concurrently from as many slices as the number of leaf partitions it is split into.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -564,10 +591,8 @@ public class IndexSearcher {
|
|||
}
|
||||
|
||||
final int cappedNumHits = Math.min(numHits, limit);
|
||||
final boolean supportsConcurrency = getSlices().length > 1;
|
||||
CollectorManager<TopScoreDocCollector, TopDocs> manager =
|
||||
new TopScoreDocCollectorManager(
|
||||
cappedNumHits, after, TOTAL_HITS_THRESHOLD, supportsConcurrency);
|
||||
new TopScoreDocCollectorManager(cappedNumHits, after, TOTAL_HITS_THRESHOLD);
|
||||
|
||||
return search(query, manager);
|
||||
}
|
||||
|
@ -699,12 +724,9 @@ public class IndexSearcher {
|
|||
}
|
||||
final int cappedNumHits = Math.min(numHits, limit);
|
||||
final Sort rewrittenSort = sort.rewrite(this);
|
||||
final LeafSlice[] leafSlices = getSlices();
|
||||
|
||||
final boolean supportsConcurrency = leafSlices.length > 1;
|
||||
final CollectorManager<TopFieldCollector, TopFieldDocs> manager =
|
||||
new TopFieldCollectorManager(
|
||||
rewrittenSort, cappedNumHits, after, TOTAL_HITS_THRESHOLD, supportsConcurrency);
|
||||
new TopFieldCollectorManager(rewrittenSort, cappedNumHits, after, TOTAL_HITS_THRESHOLD);
|
||||
|
||||
TopFieldDocs topDocs = search(query, manager);
|
||||
if (doDocScores) {
|
||||
|
@ -1169,60 +1191,4 @@ public class IndexSearcher {
|
|||
+ IndexSearcher.getMaxClauseCount());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Supplier for {@link LeafSlice} slices which computes and caches the value on first invocation
|
||||
* and returns cached value on subsequent invocation. If the passed in provider for slice
|
||||
* computation throws exception then same will be passed to the caller of this supplier on each
|
||||
* invocation. If the provider returns null then {@link NullPointerException} will be thrown to
|
||||
* the caller.
|
||||
*
|
||||
* <p>NOTE: To provide thread safe caching mechanism this class is implementing the (subtle) <a
|
||||
* href="https://shipilev.net/blog/2014/safe-public-construction/">double-checked locking
|
||||
* idiom</a>
|
||||
*/
|
||||
private static class CachingLeafSlicesSupplier implements Supplier<LeafSlice[]> {
|
||||
private volatile LeafSlice[] leafSlices;
|
||||
|
||||
private final Function<List<LeafReaderContext>, LeafSlice[]> sliceProvider;
|
||||
|
||||
private final List<LeafReaderContext> leaves;
|
||||
|
||||
private CachingLeafSlicesSupplier(
|
||||
Function<List<LeafReaderContext>, LeafSlice[]> provider, List<LeafReaderContext> leaves) {
|
||||
this.sliceProvider = Objects.requireNonNull(provider, "leaf slice provider cannot be null");
|
||||
this.leaves = Objects.requireNonNull(leaves, "list of LeafReaderContext cannot be null");
|
||||
}
|
||||
|
||||
@Override
|
||||
public LeafSlice[] get() {
|
||||
if (leafSlices == null) {
|
||||
synchronized (this) {
|
||||
if (leafSlices == null) {
|
||||
leafSlices =
|
||||
Objects.requireNonNull(
|
||||
sliceProvider.apply(leaves), "slices computed by the provider is null");
|
||||
/*
|
||||
* Enforce that there aren't multiple leaf partitions within the same leaf slice pointing to the
|
||||
* same leaf context. It is a requirement that {@link Collector#getLeafCollector(LeafReaderContext)}
|
||||
* gets called once per leaf context. Also, it does not make sense to partition a segment to then search
|
||||
* those partitions as part of the same slice, because the goal of partitioning is parallel searching
|
||||
* which happens at the slice level.
|
||||
*/
|
||||
for (LeafSlice leafSlice : leafSlices) {
|
||||
Set<LeafReaderContext> distinctLeaves = new HashSet<>();
|
||||
for (LeafReaderContextPartition leafPartition : leafSlice.partitions) {
|
||||
distinctLeaves.add(leafPartition.ctx);
|
||||
}
|
||||
if (leafSlice.partitions.length != distinctLeaves.size()) {
|
||||
throw new IllegalStateException(
|
||||
"The same slice targets multiple leaf partitions of the same leaf reader context. A physical segment should rather get partitioned to be searched concurrently from as many slices as the number of leaf partitions it is split into.");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return leafSlices;
|
||||
}
|
||||
}
|
||||
}
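The getSlices/computeAndCacheSlices replacement earlier in this change keeps the same thread-safety idea as the removed supplier with less machinery: one volatile read on the fast path and a synchronized re-check on the slow path. A generic sketch of that double-checked caching pattern (class and field names are invented, not Lucene API):

import java.util.function.Supplier;

final class LazyCache<T> {
  private volatile T value;
  private final Supplier<T> compute;

  LazyCache(Supplier<T> compute) {
    this.compute = compute;
  }

  T get() {
    T v = value;             // single volatile read on the fast path
    if (v == null) {
      v = computeAndCache(); // slow path: synchronize and re-check
    }
    return v;
  }

  private synchronized T computeAndCache() {
    T v = value;
    if (v == null) {
      v = compute.get();
      value = v; // publish safely via the volatile write
    }
    return v;
  }
}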
@ -186,10 +186,44 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
|
|||
@Override
|
||||
public int count(LeafReaderContext context) throws IOException {
|
||||
if (context.reader().hasDeletions() == false) {
|
||||
IteratorAndCount itAndCount = getDocIdSetIteratorOrNull(context);
|
||||
if (lowerValue > upperValue) {
|
||||
return 0;
|
||||
}
|
||||
IteratorAndCount itAndCount = null;
|
||||
LeafReader reader = context.reader();
|
||||
|
||||
// first use bkd optimization if possible
|
||||
SortedNumericDocValues sortedNumericValues = DocValues.getSortedNumeric(reader, field);
|
||||
NumericDocValues numericValues = DocValues.unwrapSingleton(sortedNumericValues);
|
||||
PointValues pointValues = reader.getPointValues(field);
|
||||
if (pointValues != null && pointValues.getDocCount() == reader.maxDoc()) {
|
||||
itAndCount = getDocIdSetIteratorOrNullFromBkd(context, numericValues);
|
||||
}
|
||||
if (itAndCount != null && itAndCount.count != -1) {
|
||||
return itAndCount.count;
|
||||
}
|
||||
|
||||
// use index sort optimization if possible
|
||||
Sort indexSort = reader.getMetaData().sort();
|
||||
if (indexSort != null
|
||||
&& indexSort.getSort().length > 0
|
||||
&& indexSort.getSort()[0].getField().equals(field)) {
|
||||
final SortField sortField = indexSort.getSort()[0];
|
||||
final SortField.Type sortFieldType = getSortFieldType(sortField);
|
||||
// The index sort optimization is only supported for Type.INT and Type.LONG
|
||||
if (sortFieldType == Type.INT || sortFieldType == Type.LONG) {
|
||||
Object missingValue = sortField.getMissingValue();
|
||||
final long missingLongValue = missingValue == null ? 0L : (long) missingValue;
|
||||
// all documents have docValues or missing value falls outside the range
|
||||
if ((pointValues != null && pointValues.getDocCount() == reader.maxDoc())
|
||||
|| (missingLongValue < lowerValue || missingLongValue > upperValue)) {
|
||||
itAndCount = getDocIdSetIterator(sortField, sortFieldType, context, numericValues);
|
||||
}
|
||||
if (itAndCount != null && itAndCount.count != -1) {
|
||||
return itAndCount.count;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return fallbackWeight.count(context);
|
||||
}
|
||||
|
|
|
@ -1,72 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||
|
||||
/** {@link SimScorer} on a specific {@link LeafReader}. */
|
||||
public final class LeafSimScorer {
|
||||
|
||||
private final SimScorer scorer;
|
||||
private final NumericDocValues norms;
|
||||
|
||||
/** Sole constructor: Score documents of {@code reader} with {@code scorer}. */
|
||||
public LeafSimScorer(SimScorer scorer, LeafReader reader, String field, boolean needsScores)
|
||||
throws IOException {
|
||||
this.scorer = Objects.requireNonNull(scorer);
|
||||
norms = needsScores ? reader.getNormValues(field) : null;
|
||||
}
|
||||
|
||||
/** Return the wrapped {@link SimScorer}. */
|
||||
public SimScorer getSimScorer() {
|
||||
return scorer;
|
||||
}
|
||||
|
||||
private long getNormValue(int doc) throws IOException {
|
||||
if (norms != null) {
|
||||
boolean found = norms.advanceExact(doc);
|
||||
assert found;
|
||||
return norms.longValue();
|
||||
} else {
|
||||
return 1L; // default norm
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Score the provided document assuming the given term document frequency. This method must be
|
||||
* called on non-decreasing sequences of doc ids.
|
||||
*
|
||||
* @see SimScorer#score(float, long)
|
||||
*/
|
||||
public float score(int doc, float freq) throws IOException {
|
||||
return scorer.score(freq, getNormValue(doc));
|
||||
}
|
||||
|
||||
/**
|
||||
* Explain the score for the provided document assuming the given term document frequency. This
|
||||
* method must be called on non-decreasing sequences of doc ids.
|
||||
*
|
||||
* @see SimScorer#explain(Explanation, long)
|
||||
*/
|
||||
public Explanation explain(int doc, Explanation freqExpl) throws IOException {
|
||||
return scorer.explain(freqExpl, getNormValue(doc));
|
||||
}
|
||||
}
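With LeafSimScorer removed, the call sites in this patch (PhraseScorer, TermScorer, SynonymQuery, TermQuery below) inline the norm lookup and call SimScorer.score(freq, norm) directly. A condensed sketch of that recurring pattern; the helper class and method names are illustrative only:

import java.io.IOException;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.similarities.Similarity.SimScorer;

final class NormLookupSketch {
  // Mirrors the pattern used throughout this change: read the per-document norm
  // (defaulting to 1 when norms are absent or the doc has no value) and delegate to SimScorer.
  static float scoreDoc(SimScorer scorer, NumericDocValues norms, int doc, float freq)
      throws IOException {
    long norm = 1L;
    if (norms != null && norms.advanceExact(doc)) {
      norm = norms.longValue();
    }
    return scorer.score(freq, norm);
  }
}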
@ -66,6 +66,15 @@ final class MaxScoreBulkScorer extends BulkScorer {
|
|||
maxScoreSums = new double[allScorers.length];
|
||||
}
|
||||
|
||||
// Number of outer windows that have been evaluated
|
||||
private int numOuterWindows;
|
||||
// Number of candidate matches so far
|
||||
private int numCandidates;
|
||||
// Minimum window size. See #computeOuterWindowMax where we have heuristics that adjust the
|
||||
// minimum window size based on the average number of candidate matches per outer window, to keep
|
||||
// the per-window overhead under control.
|
||||
private int minWindowSize = 1;
|
||||
|
||||
@Override
|
||||
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
|
||||
collector.setScorer(scorable);
|
||||
|
@ -124,6 +133,7 @@ final class MaxScoreBulkScorer extends BulkScorer {
|
|||
}
|
||||
|
||||
outerWindowMin = Math.min(top.doc, outerWindowMax);
|
||||
++numOuterWindows;
|
||||
}
|
||||
|
||||
return nextCandidate(max);
|
||||
|
@ -278,6 +288,23 @@ final class MaxScoreBulkScorer extends BulkScorer {
|
|||
windowMax = (int) Math.min(windowMax, upTo + 1L); // upTo is inclusive
|
||||
}
|
||||
|
||||
if (allScorers.length - firstWindowLead > 1) {
|
||||
// The more clauses we consider to compute outer windows, the higher chances that one of these
|
||||
// clauses has a block boundary in the next few doc IDs. This situation can result in more
|
||||
// time spent computing maximum scores per outer window than evaluating hits. To avoid such
|
||||
// situations, we target at least 32 candidate matches per clause per outer window on average,
|
||||
// to make sure we amortize the cost of computing maximum scores.
|
||||
long threshold = numOuterWindows * 32L * allScorers.length;
|
||||
if (numCandidates < threshold) {
|
||||
minWindowSize = Math.min(minWindowSize << 1, INNER_WINDOW_SIZE);
|
||||
} else {
|
||||
minWindowSize = 1;
|
||||
}
|
||||
|
||||
int minWindowMax = (int) Math.min(Integer.MAX_VALUE, (long) windowMin + minWindowSize);
|
||||
windowMax = Math.max(windowMax, minWindowMax);
|
||||
}
|
||||
|
||||
return windowMax;
|
||||
}
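To make the heuristic above concrete, a small hedged calculation with invented numbers; the inner window size constant is assumed here, not taken from the patch:

// Illustrative only: the window-growth decision for 4 clauses after 10 outer windows.
class WindowSizeSketch {
  static final int INNER_WINDOW_SIZE = 1 << 12; // assumption for this sketch

  public static void main(String[] args) {
    int numOuterWindows = 10, numClauses = 4, numCandidates = 900, minWindowSize = 1;
    long threshold = numOuterWindows * 32L * numClauses; // 10 * 32 * 4 = 1280
    // Fewer candidates than the target doubles the minimum window size (capped), else reset to 1.
    minWindowSize = numCandidates < threshold ? Math.min(minWindowSize << 1, INNER_WINDOW_SIZE) : 1;
    System.out.println(threshold + " " + minWindowSize); // prints "1280 2"
  }
}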
@ -300,6 +327,9 @@ final class MaxScoreBulkScorer extends BulkScorer {
|
|||
private void scoreNonEssentialClauses(
|
||||
LeafCollector collector, int doc, double essentialScore, int numNonEssentialClauses)
|
||||
throws IOException {
|
||||
|
||||
++numCandidates;
|
||||
|
||||
double score = essentialScore;
|
||||
for (int i = numNonEssentialClauses - 1; i >= 0; --i) {
|
||||
float maxPossibleScore =
|
||||
|
|
|
@ -21,8 +21,8 @@ import java.util.ArrayList;
|
|||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
|
||||
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
|
||||
import org.apache.lucene.index.ImpactsEnum;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
|
@ -399,10 +399,10 @@ public class PhraseQuery extends Query {
|
|||
/**
|
||||
* A guess of the average number of simple operations for the initial seek and buffer refill per
|
||||
* document for the positions of a term. See also {@link
|
||||
* Lucene912PostingsReader.BlockImpactsPostingsEnum#nextPosition()}.
|
||||
* Lucene101PostingsReader.BlockImpactsPostingsEnum#nextPosition()}.
|
||||
*
|
||||
* <p>Aside: Instead of being constant this could depend among others on {@link
|
||||
* Lucene912PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
|
||||
* Lucene101PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
|
||||
* TermsEnum#totalTermFreq()}, {@link DocIdSetIterator#cost()} (expected number of matching docs),
|
||||
* {@link LeafReader#maxDoc()} (total number of docs in the segment), and the seek time and block
|
||||
* size of the device storing the index.
|
||||
|
@ -411,7 +411,7 @@ public class PhraseQuery extends Query {
|
|||
|
||||
/**
|
||||
* Number of simple operations in {@link
|
||||
* Lucene912PostingsReader.BlockImpactsPostingsEnum#nextPosition()} when no seek or buffer refill
|
||||
* Lucene101PostingsReader.BlockImpactsPostingsEnum#nextPosition()} when no seek or buffer refill
|
||||
* is done.
|
||||
*/
|
||||
private static final int TERM_OPS_PER_POS = 7;
|
||||
|
|
|
@ -18,6 +18,8 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||
|
||||
class PhraseScorer extends Scorer {
|
||||
|
||||
|
@ -26,16 +28,19 @@ class PhraseScorer extends Scorer {
|
|||
final MaxScoreCache maxScoreCache;
|
||||
final PhraseMatcher matcher;
|
||||
final ScoreMode scoreMode;
|
||||
private final LeafSimScorer simScorer;
|
||||
private final SimScorer simScorer;
|
||||
private final NumericDocValues norms;
|
||||
final float matchCost;
|
||||
|
||||
private float minCompetitiveScore = 0;
|
||||
private float freq = 0;
|
||||
|
||||
PhraseScorer(PhraseMatcher matcher, ScoreMode scoreMode, LeafSimScorer simScorer) {
|
||||
PhraseScorer(
|
||||
PhraseMatcher matcher, ScoreMode scoreMode, SimScorer simScorer, NumericDocValues norms) {
|
||||
this.matcher = matcher;
|
||||
this.scoreMode = scoreMode;
|
||||
this.simScorer = simScorer;
|
||||
this.norms = norms;
|
||||
this.matchCost = matcher.getMatchCost();
|
||||
this.approximation = matcher.approximation();
|
||||
this.impactsApproximation = matcher.impactsApproximation();
|
||||
|
@ -50,7 +55,11 @@ class PhraseScorer extends Scorer {
|
|||
matcher.reset();
|
||||
if (scoreMode == ScoreMode.TOP_SCORES && minCompetitiveScore > 0) {
|
||||
float maxFreq = matcher.maxFreq();
|
||||
if (simScorer.score(docID(), maxFreq) < minCompetitiveScore) {
|
||||
long norm = 1L;
|
||||
if (norms != null && norms.advanceExact(docID())) {
|
||||
norm = norms.longValue();
|
||||
}
|
||||
if (simScorer.score(maxFreq, norm) < minCompetitiveScore) {
|
||||
// The maximum score we could get is less than the min competitive score
|
||||
return false;
|
||||
}
|
||||
|
@ -79,7 +88,11 @@ class PhraseScorer extends Scorer {
|
|||
freq += matcher.sloppyWeight();
|
||||
}
|
||||
}
|
||||
return simScorer.score(docID(), freq);
|
||||
long norm = 1L;
|
||||
if (norms != null && norms.advanceExact(docID())) {
|
||||
norm = norms.longValue();
|
||||
}
|
||||
return simScorer.score(freq, norm);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.search;
|
|||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||
|
||||
|
@ -63,9 +64,8 @@ public abstract class PhraseWeight extends Weight {
|
|||
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
|
||||
PhraseMatcher matcher = getPhraseMatcher(context, stats, false);
|
||||
if (matcher == null) return null;
|
||||
LeafSimScorer simScorer =
|
||||
new LeafSimScorer(stats, context.reader(), field, scoreMode.needsScores());
|
||||
final var scorer = new PhraseScorer(matcher, scoreMode, simScorer);
|
||||
NumericDocValues norms = scoreMode.needsScores() ? context.reader().getNormValues(field) : null;
|
||||
final var scorer = new PhraseScorer(matcher, scoreMode, stats, norms);
|
||||
return new DefaultScorerSupplier(scorer);
|
||||
}
|
||||
|
||||
|
@ -83,10 +83,13 @@ public abstract class PhraseWeight extends Weight {
|
|||
while (matcher.nextMatch()) {
|
||||
freq += matcher.sloppyWeight();
|
||||
}
|
||||
LeafSimScorer docScorer =
|
||||
new LeafSimScorer(stats, context.reader(), field, scoreMode.needsScores());
|
||||
Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq);
|
||||
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
|
||||
NumericDocValues norms = scoreMode.needsScores() ? context.reader().getNormValues(field) : null;
|
||||
long norm = 1L;
|
||||
if (norms != null && norms.advanceExact(doc)) {
|
||||
norm = norms.longValue();
|
||||
}
|
||||
Explanation scoreExplanation = stats.explain(freqExplanation, norm);
|
||||
return Explanation.match(
|
||||
scoreExplanation.getValue(),
|
||||
"weight("
|
||||
|
|
|
@ -46,9 +46,7 @@ public class SortRescorer extends Rescorer {
|
|||
List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
|
||||
|
||||
TopFieldCollector collector =
|
||||
new TopFieldCollectorManager(
|
||||
sort, topN, null, Integer.MAX_VALUE, searcher.getSlices().length > 1)
|
||||
.newCollector();
|
||||
new TopFieldCollectorManager(sort, topN, null, Integer.MAX_VALUE).newCollector();
|
||||
|
||||
// Now merge sort docIDs from hits, with reader's leaves:
|
||||
int hitUpto = 0;
|
||||
|
|
|
@ -30,6 +30,7 @@ import org.apache.lucene.index.Impacts;
|
|||
import org.apache.lucene.index.ImpactsEnum;
|
||||
import org.apache.lucene.index.ImpactsSource;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.SlowImpactsEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
|
@ -38,6 +39,7 @@ import org.apache.lucene.index.TermStates;
|
|||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOSupplier;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
@ -259,9 +261,13 @@ public final class SynonymQuery extends Query {
|
|||
assert scorer instanceof TermScorer;
|
||||
freq = ((TermScorer) scorer).freq();
|
||||
}
|
||||
LeafSimScorer docScorer = new LeafSimScorer(simWeight, context.reader(), field, true);
|
||||
Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq);
|
||||
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
|
||||
NumericDocValues norms = context.reader().getNormValues(field);
|
||||
long norm = 1L;
|
||||
if (norms != null && norms.advanceExact(doc)) {
|
||||
norm = norms.longValue();
|
||||
}
|
||||
Explanation scoreExplanation = simWeight.explain(freqExplanation, norm);
|
||||
return Explanation.match(
|
||||
scoreExplanation.getValue(),
|
||||
"weight("
|
||||
|
@ -334,27 +340,27 @@ public final class SynonymQuery extends Query {
|
|||
return new ConstantScoreScorer(0f, scoreMode, DocIdSetIterator.empty());
|
||||
}
|
||||
|
||||
LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), field, true);
|
||||
NumericDocValues norms = context.reader().getNormValues(field);
|
||||
|
||||
// we must optimize this case (term not in segment), disjunctions require >= 2 subs
|
||||
if (iterators.size() == 1) {
|
||||
final TermScorer scorer;
|
||||
if (scoreMode == ScoreMode.TOP_SCORES) {
|
||||
scorer = new TermScorer(impacts.get(0), simScorer);
|
||||
scorer = new TermScorer(impacts.get(0), simWeight, norms);
|
||||
} else {
|
||||
scorer = new TermScorer(iterators.get(0), simScorer);
|
||||
scorer = new TermScorer(iterators.get(0), simWeight, norms);
|
||||
}
|
||||
float boost = termBoosts.get(0);
|
||||
return scoreMode == ScoreMode.COMPLETE_NO_SCORES || boost == 1f
|
||||
? scorer
|
||||
: new FreqBoostTermScorer(boost, scorer, simScorer);
|
||||
: new FreqBoostTermScorer(boost, scorer, simWeight, norms);
|
||||
} else {
|
||||
|
||||
// we use termscorers + disjunction as an impl detail
|
||||
DisiPriorityQueue queue = new DisiPriorityQueue(iterators.size());
|
||||
for (int i = 0; i < iterators.size(); i++) {
|
||||
PostingsEnum postings = iterators.get(i);
|
||||
final TermScorer termScorer = new TermScorer(postings, simScorer);
|
||||
final TermScorer termScorer = new TermScorer(postings, simWeight, norms);
|
||||
float boost = termBoosts.get(i);
|
||||
final DisiWrapperFreq wrapper = new DisiWrapperFreq(termScorer, boost);
|
||||
queue.add(wrapper);
|
||||
|
@ -368,8 +374,7 @@ public final class SynonymQuery extends Query {
|
|||
boosts[i] = termBoosts.get(i);
|
||||
}
|
||||
ImpactsSource impactsSource = mergeImpacts(impacts.toArray(new ImpactsEnum[0]), boosts);
|
||||
MaxScoreCache maxScoreCache =
|
||||
new MaxScoreCache(impactsSource, simScorer.getSimScorer());
|
||||
MaxScoreCache maxScoreCache = new MaxScoreCache(impactsSource, simWeight);
|
||||
ImpactsDISI impactsDisi = new ImpactsDISI(iterator, maxScoreCache);
|
||||
|
||||
if (scoreMode == ScoreMode.TOP_SCORES) {
|
||||
|
@ -379,7 +384,7 @@ public final class SynonymQuery extends Query {
|
|||
iterator = impactsDisi;
|
||||
}
|
||||
|
||||
return new SynonymScorer(queue, iterator, impactsDisi, simScorer);
|
||||
return new SynonymScorer(queue, iterator, impactsDisi, simWeight, norms);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -575,18 +580,21 @@ public final class SynonymQuery extends Query {
|
|||
private final DocIdSetIterator iterator;
|
||||
private final MaxScoreCache maxScoreCache;
|
||||
private final ImpactsDISI impactsDisi;
|
||||
private final LeafSimScorer simScorer;
|
||||
private final SimScorer scorer;
|
||||
private final NumericDocValues norms;
|
||||
|
||||
SynonymScorer(
|
||||
DisiPriorityQueue queue,
|
||||
DocIdSetIterator iterator,
|
||||
ImpactsDISI impactsDisi,
|
||||
LeafSimScorer simScorer) {
|
||||
SimScorer scorer,
|
||||
NumericDocValues norms) {
|
||||
this.queue = queue;
|
||||
this.iterator = iterator;
|
||||
this.maxScoreCache = impactsDisi.getMaxScoreCache();
|
||||
this.impactsDisi = impactsDisi;
|
||||
this.simScorer = simScorer;
|
||||
this.scorer = scorer;
|
||||
this.norms = norms;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -605,7 +613,11 @@ public final class SynonymQuery extends Query {
|
|||
|
||||
@Override
|
||||
public float score() throws IOException {
|
||||
return simScorer.score(iterator.docID(), freq());
|
||||
long norm = 1L;
|
||||
if (norms != null && norms.advanceExact(iterator.docID())) {
|
||||
norm = norms.longValue();
|
||||
}
|
||||
return scorer.score(freq(), norm);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -647,9 +659,11 @@ public final class SynonymQuery extends Query {
|
|||
private static class FreqBoostTermScorer extends FilterScorer {
|
||||
final float boost;
|
||||
final TermScorer in;
|
||||
final LeafSimScorer docScorer;
|
||||
final SimScorer scorer;
|
||||
final NumericDocValues norms;
|
||||
|
||||
public FreqBoostTermScorer(float boost, TermScorer in, LeafSimScorer docScorer) {
|
||||
public FreqBoostTermScorer(
|
||||
float boost, TermScorer in, SimScorer scorer, NumericDocValues norms) {
|
||||
super(in);
|
||||
if (Float.isNaN(boost) || Float.compare(boost, 0f) < 0 || Float.compare(boost, 1f) > 0) {
|
||||
throw new IllegalArgumentException(
|
||||
|
@ -657,7 +671,8 @@ public final class SynonymQuery extends Query {
|
|||
}
|
||||
this.boost = boost;
|
||||
this.in = in;
|
||||
this.docScorer = docScorer;
|
||||
this.scorer = scorer;
|
||||
this.norms = norms;
|
||||
}
|
||||
|
||||
float freq() throws IOException {
|
||||
|
@ -666,8 +681,11 @@ public final class SynonymQuery extends Query {
|
|||
|
||||
@Override
|
||||
public float score() throws IOException {
|
||||
assert docID() != DocIdSetIterator.NO_MORE_DOCS;
|
||||
return docScorer.score(in.docID(), freq());
|
||||
long norm = 1L;
|
||||
if (norms != null && norms.advanceExact(in.docID())) {
|
||||
norm = norms.longValue();
|
||||
}
|
||||
return scorer.score(freq(), norm);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@@ -22,6 +22,7 @@ import java.util.Objects;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;

@@ -150,19 +151,17 @@ public class TermQuery extends Query {
return new ConstantScoreScorer(0f, scoreMode, DocIdSetIterator.empty());
}

LeafSimScorer scorer =
new LeafSimScorer(simScorer, context.reader(), term.field(), scoreMode.needsScores());
NumericDocValues norms = null;
if (scoreMode.needsScores()) {
norms = context.reader().getNormValues(term.field());
}

if (scoreMode == ScoreMode.TOP_SCORES) {
return new TermScorer(
TermWeight.this,
termsEnum.impacts(PostingsEnum.FREQS),
scorer,
topLevelScoringClause);
termsEnum.impacts(PostingsEnum.FREQS), simScorer, norms, topLevelScoringClause);
} else {
return new TermScorer(
termsEnum.postings(
null, scoreMode.needsScores() ? PostingsEnum.FREQS : PostingsEnum.NONE),
scorer);
int flags = scoreMode.needsScores() ? PostingsEnum.FREQS : PostingsEnum.NONE;
return new TermScorer(termsEnum.postings(null, flags), simScorer, norms);
}
}

@@ -223,11 +222,14 @@ public class TermQuery extends Query {
int newDoc = scorer.iterator().advance(doc);
if (newDoc == doc) {
float freq = ((TermScorer) scorer).freq();
LeafSimScorer docScorer =
new LeafSimScorer(simScorer, context.reader(), term.field(), true);
NumericDocValues norms = context.reader().getNormValues(term.field());
long norm = 1L;
if (norms != null && norms.advanceExact(doc)) {
norm = norms.longValue();
}
Explanation freqExplanation =
Explanation.match(freq, "freq, occurrences of term within document");
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
Explanation scoreExplanation = simScorer.explain(freqExplanation, norm);
return Explanation.match(
scoreExplanation.getValue(),
"weight("
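In the TermQuery hunks above, explain() now resolves the norm straight from the leaf reader and hands it to SimScorer#explain instead of going through LeafSimScorer. A hypothetical, self-contained sketch of that path; the class and method names are illustrative.

import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.similarities.Similarity.SimScorer;

// Illustrative only: mirrors the rewritten explain() logic above.
final class ExplainWithNorms {
  static Explanation explain(
      SimScorer simScorer, LeafReaderContext context, String field, int doc, float freq)
      throws IOException {
    NumericDocValues norms = context.reader().getNormValues(field); // null if norms are omitted
    long norm = 1L;
    if (norms != null && norms.advanceExact(doc)) {
      norm = norms.longValue();
    }
    Explanation freqExplanation =
        Explanation.match(freq, "freq, occurrences of term within document");
    return simScorer.explain(freqExplanation, norm);
  }
}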
@@ -18,8 +18,10 @@ package org.apache.lucene.search;

import java.io.IOException;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.search.similarities.Similarity.SimScorer;

/**
* Expert: A <code>Scorer</code> for documents matching a <code>Term</code>.

@@ -29,17 +31,19 @@ import org.apache.lucene.index.SlowImpactsEnum;
public final class TermScorer extends Scorer {
private final PostingsEnum postingsEnum;
private final DocIdSetIterator iterator;
private final LeafSimScorer docScorer;
private final SimScorer scorer;
private final NumericDocValues norms;
private final ImpactsDISI impactsDisi;
private final MaxScoreCache maxScoreCache;

/** Construct a {@link TermScorer} that will iterate all documents. */
public TermScorer(PostingsEnum postingsEnum, LeafSimScorer docScorer) {
public TermScorer(PostingsEnum postingsEnum, SimScorer scorer, NumericDocValues norms) {
iterator = this.postingsEnum = postingsEnum;
ImpactsEnum impactsEnum = new SlowImpactsEnum(postingsEnum);
maxScoreCache = new MaxScoreCache(impactsEnum, docScorer.getSimScorer());
maxScoreCache = new MaxScoreCache(impactsEnum, scorer);
impactsDisi = null;
this.docScorer = docScorer;
this.scorer = scorer;
this.norms = norms;
}

/**

@@ -47,12 +51,12 @@ public final class TermScorer extends Scorer {
* documents.
*/
public TermScorer(
Weight weight,
ImpactsEnum impactsEnum,
LeafSimScorer docScorer,
SimScorer scorer,
NumericDocValues norms,
boolean topLevelScoringClause) {
postingsEnum = impactsEnum;
maxScoreCache = new MaxScoreCache(impactsEnum, docScorer.getSimScorer());
maxScoreCache = new MaxScoreCache(impactsEnum, scorer);
if (topLevelScoringClause) {
impactsDisi = new ImpactsDISI(impactsEnum, maxScoreCache);
iterator = impactsDisi;

@@ -60,7 +64,8 @@ public final class TermScorer extends Scorer {
impactsDisi = null;
iterator = impactsEnum;
}
this.docScorer = docScorer;
this.scorer = scorer;
this.norms = norms;
}

@Override

@@ -80,13 +85,23 @@ public final class TermScorer extends Scorer {

@Override
public float score() throws IOException {
assert docID() != DocIdSetIterator.NO_MORE_DOCS;
return docScorer.score(postingsEnum.docID(), postingsEnum.freq());
var postingsEnum = this.postingsEnum;
var norms = this.norms;

long norm = 1L;
if (norms != null && norms.advanceExact(postingsEnum.docID())) {
norm = norms.longValue();
}
return scorer.score(postingsEnum.freq(), norm);
}

@Override
public float smoothingScore(int docId) throws IOException {
return docScorer.score(docId, 0);
long norm = 1L;
if (norms != null && norms.advanceExact(docId)) {
norm = norms.longValue();
}
return scorer.score(0, norm);
}

@Override
@@ -71,15 +71,14 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
}

void countHit(int doc) throws IOException {
++totalHits;
hitsThresholdChecker.incrementHitCount();
int hitCountSoFar = ++totalHits;

if (minScoreAcc != null && (totalHits & minScoreAcc.modInterval) == 0) {
if (minScoreAcc != null && (hitCountSoFar & minScoreAcc.modInterval) == 0) {
updateGlobalMinCompetitiveScore(scorer);
}
if (scoreMode.isExhaustive() == false
&& totalHitsRelation == TotalHits.Relation.EQUAL_TO
&& hitsThresholdChecker.isThresholdReached()) {
&& totalHits > totalHitsThreshold) {
// for the first time hitsThreshold is reached, notify comparator about this
comparator.setHitsThresholdReached();
totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO;

@@ -92,7 +91,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
// this document is larger than anything else in the queue, and
// therefore not competitive.
if (searchSortPartOfIndexSort) {
if (hitsThresholdChecker.isThresholdReached()) {
if (totalHits > totalHitsThreshold) {
totalHitsRelation = Relation.GREATER_THAN_OR_EQUAL_TO;
throw new CollectionTerminatedException();
} else {

@@ -180,9 +179,9 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
Sort sort,
FieldValueHitQueue<Entry> queue,
int numHits,
HitsThresholdChecker hitsThresholdChecker,
int totalHitsThreshold,
MaxScoreAccumulator minScoreAcc) {
super(queue, numHits, hitsThresholdChecker, sort.needsScores(), minScoreAcc);
super(queue, numHits, totalHitsThreshold, sort.needsScores(), minScoreAcc);
this.sort = sort;
this.queue = queue;
}

@@ -235,9 +234,9 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
FieldValueHitQueue<Entry> queue,
FieldDoc after,
int numHits,
HitsThresholdChecker hitsThresholdChecker,
int totalHitsThreshold,
MaxScoreAccumulator minScoreAcc) {
super(queue, numHits, hitsThresholdChecker, sort.needsScores(), minScoreAcc);
super(queue, numHits, totalHitsThreshold, sort.needsScores(), minScoreAcc);
this.sort = sort;
this.queue = queue;
this.after = after;

@@ -301,7 +300,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
private static final ScoreDoc[] EMPTY_SCOREDOCS = new ScoreDoc[0];

final int numHits;
final HitsThresholdChecker hitsThresholdChecker;
final int totalHitsThreshold;
final FieldComparator<?> firstComparator;
final boolean canSetMinScore;

@@ -327,25 +326,25 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
private TopFieldCollector(
FieldValueHitQueue<Entry> pq,
int numHits,
HitsThresholdChecker hitsThresholdChecker,
int totalHitsThreshold,
boolean needsScores,
MaxScoreAccumulator minScoreAcc) {
super(pq);
this.needsScores = needsScores;
this.numHits = numHits;
this.hitsThresholdChecker = hitsThresholdChecker;
this.totalHitsThreshold = Math.max(totalHitsThreshold, numHits);
this.numComparators = pq.getComparators().length;
this.firstComparator = pq.getComparators()[0];
int reverseMul = pq.reverseMul[0];

if (firstComparator.getClass().equals(FieldComparator.RelevanceComparator.class)
&& reverseMul == 1 // if the natural sort is preserved (sort by descending relevance)
&& hitsThresholdChecker.getHitsThreshold() != Integer.MAX_VALUE) {
&& totalHitsThreshold != Integer.MAX_VALUE) {
scoreMode = ScoreMode.TOP_SCORES;
canSetMinScore = true;
} else {
canSetMinScore = false;
if (hitsThresholdChecker.getHitsThreshold() != Integer.MAX_VALUE) {
if (totalHitsThreshold != Integer.MAX_VALUE) {
scoreMode = needsScores ? ScoreMode.TOP_DOCS_WITH_SCORES : ScoreMode.TOP_DOCS;
} else {
scoreMode = needsScores ? ScoreMode.COMPLETE : ScoreMode.COMPLETE_NO_SCORES;

@@ -361,10 +360,10 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {

protected void updateGlobalMinCompetitiveScore(Scorable scorer) throws IOException {
assert minScoreAcc != null;
if (canSetMinScore && hitsThresholdChecker.isThresholdReached()) {
// we can start checking the global maximum score even
// if the local queue is not full because the threshold
// is reached.
if (canSetMinScore) {
// we can start checking the global maximum score even if the local queue is not full or if
// the threshold is not reached on the local competitor: the fact that there is a shared min
// competitive score implies that one of the collectors hit its totalHitsThreshold already
long maxMinScore = minScoreAcc.getRaw();
float score;
if (maxMinScore != Long.MIN_VALUE

@@ -377,7 +376,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
}

protected void updateMinCompetitiveScore(Scorable scorer) throws IOException {
if (canSetMinScore && queueFull && hitsThresholdChecker.isThresholdReached()) {
if (canSetMinScore && queueFull && totalHits > totalHitsThreshold) {
assert bottom != null;
float minScore = (float) firstComparator.value(bottom.slot);
if (minScore > minCompetitiveScore) {
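The TopFieldCollector hunks above drop the HitsThresholdChecker indirection in favor of a plain totalHitsThreshold counter comparison, clamped to at least numHits. A hypothetical, stripped-down sketch of that bookkeeping; the class and method names are illustrative.

// Illustrative only: the counter-based threshold check used above instead of
// HitsThresholdChecker. Once totalHits exceeds the threshold, the collector may
// switch the total-hit relation to GREATER_THAN_OR_EQUAL_TO and start skipping hits.
final class HitCountThreshold {
  private final int totalHitsThreshold;
  private int totalHits;

  HitCountThreshold(int numHits, int totalHitsThreshold) {
    // same clamping as the private TopFieldCollector constructor above
    this.totalHitsThreshold = Math.max(totalHitsThreshold, numHits);
  }

  boolean countAndCheck() {
    ++totalHits;
    return totalHits > totalHitsThreshold; // true once an exact count is no longer required
  }
}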
Some files were not shown because too many files have changed in this diff.