mirror of https://github.com/apache/lucene.git
Move postings back to int[] to take advantage of having more lanes per vector. (#13968)
In Lucene 8.4, we updated postings to work on long[] arrays internally. This allowed us to work around the lack of explicit vectorization support in the JVM (auto-vectorization doesn't detect all the scenarios that we would like to handle), for instance by summing up two integers in one operation. With explicit vectorization now available, it looks like we can get more benefit from the ability to compare multiple integers in one operation than from summing up two integers in one operation. Moving back to ints lets us compare twice as many integers at once as we could with longs.
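As a rough sketch of the lane-count argument (not code from this commit; it assumes the standard jdk.incubator.vector incubator API, run with --add-modules jdk.incubator.vector), a SIMD register holds twice as many int lanes as long lanes, so one vector comparison covers twice as many doc IDs when postings are ints:

// Hedged illustration only. On 256-bit AVX2 hardware this prints 8 vs 4.
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.LongVector;

public class LaneCount {
  public static void main(String[] args) {
    // Number of elements one preferred-width hardware vector holds per type.
    System.out.println("int lanes:  " + IntVector.SPECIES_PREFERRED.length());
    System.out.println("long lanes: " + LongVector.SPECIES_PREFERRED.length());
  }
}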
parent 494b16063e
commit cfdd20f5bc
@@ -23,7 +23,7 @@ configure(project(":lucene:core")) {
description "Regenerate gen_ForUtil.py"
group "generation"

-def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
+def genDir = file("src/java/org/apache/lucene/codecs/lucene101")
def genScript = file("${genDir}/gen_ForUtil.py")
def genOutput = file("${genDir}/ForUtil.java")
@@ -48,7 +48,7 @@ configure(project(":lucene:core")) {
description "Regenerate gen_ForDeltaUtil.py"
group "generation"

-def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
+def genDir = file("src/java/org/apache/lucene/codecs/lucene101")
def genScript = file("${genDir}/gen_ForDeltaUtil.py")
def genOutput = file("${genDir}/ForDeltaUtil.java")
@@ -68,6 +68,7 @@ configure(project(":lucene:core")) {
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])

}

configure(project(":lucene:backward-codecs")) {
@@ -146,5 +147,55 @@ configure(project(":lucene:backward-codecs")) {
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])

+task generateForUtil912Internal() {
+description "Regenerate gen_ForUtil.py"
+group "generation"
+
+def genDir = file("src/java/org/apache/lucene/backward_codecs/lucene912")
+def genScript = file("${genDir}/gen_ForUtil.py")
+def genOutput = file("${genDir}/ForUtil.java")
+
+inputs.file genScript
+outputs.file genOutput
+
+doLast {
+quietExec {
+workingDir genDir
+executable project.externalTool("python3")
+args = [ '-B', genScript ]
+}
+}
+}
+
+regenerate.dependsOn wrapWithPersistentChecksums(generateForUtil912Internal, [
+andThenTasks: ["spotlessJava", "spotlessJavaApply"],
+mustRunBefore: [ "compileJava" ]
+])
+
+task generateForDeltaUtil912Internal() {
+description "Regenerate gen_ForDeltaUtil.py"
+group "generation"
+
+def genDir = file("src/java/org/apache/lucene/backward_codecs/lucene912")
+def genScript = file("${genDir}/gen_ForDeltaUtil.py")
+def genOutput = file("${genDir}/ForDeltaUtil.java")
+
+inputs.file genScript
+outputs.file genOutput
+
+doLast {
+quietExec {
+workingDir genDir
+executable project.externalTool("python3")
+args = [ '-B', genScript ]
+}
+}
+}
+
+regenerate.dependsOn wrapWithPersistentChecksums(generateForDeltaUtil912Internal, [
+andThenTasks: ["spotlessJava", "spotlessJavaApply"],
+mustRunBefore: [ "compileJava" ]
+])
}
@@ -84,6 +84,11 @@ Optimizations

* GITHUB#13763: Replace Map<String,Object> with IntObjectHashMap for KnnVectorsReader (Pan Guixin)

+* GITHUB#13968: Switch postings from storing doc IDs in a long[] to an int[].
+Lucene 8.4 had moved to a long[] to help speed up block decoding by using
+longs that would pack two integers. We are now moving back to integers to be
+able to take advantage of 2x more lanes with the vector API. (Adrien Grand)
+
Bug Fixes
---------------------
* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended
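For context, a small sketch (mine, not part of the diff) of the Lucene 8.4-era trick the entry above refers to: two 32-bit values packed into one long can be summed pairwise with a single long addition, as long as neither 32-bit half overflows into the other. The values below are hypothetical.

long a = (2L << 32) | 3L;    // packs the pair (2, 3)
long b = (10L << 32) | 20L;  // packs the pair (10, 20)
long sum = a + b;            // one long addition sums both pairs: (12, 23)
int hi = (int) (sum >>> 32); // 12
int lo = (int) sum;          // 23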
@@ -0,0 +1,4 @@
+{
+"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/ForDeltaUtil.java": "b81961f0b277b1458ca259e0d23ccc4eeeb47fe7",
+"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/gen_ForDeltaUtil.py": "3191d7591309b7876c5c709fb9375af5b87c2ef8"
+}
@@ -0,0 +1,4 @@
+{
+"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/ForUtil.java": "e6db3c665dfebca8b93eb6b4651d2eb3af637b02",
+"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/gen_ForUtil.py": "993ecc9cf7ea821963384070669695257b16e040"
+}
@@ -37,6 +37,7 @@ module org.apache.lucene.backward_codecs {
exports org.apache.lucene.backward_codecs.lucene95;
exports org.apache.lucene.backward_codecs.lucene99;
exports org.apache.lucene.backward_codecs.lucene912;
+exports org.apache.lucene.backward_codecs.lucene100;
exports org.apache.lucene.backward_codecs.packed;
exports org.apache.lucene.backward_codecs.store;
@@ -46,7 +47,8 @@ module org.apache.lucene.backward_codecs {
org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat,
org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat,
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat,
-org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat;
+org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat,
+org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat;
provides org.apache.lucene.codecs.KnnVectorsFormat with
org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat,
org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsFormat,
@@ -64,5 +66,6 @@ module org.apache.lucene.backward_codecs {
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec,
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec,
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec,
-org.apache.lucene.backward_codecs.lucene912.Lucene912Codec;
+org.apache.lucene.backward_codecs.lucene912.Lucene912Codec,
+org.apache.lucene.backward_codecs.lucene100.Lucene100Codec;
}
@@ -14,9 +14,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene100;
+package org.apache.lucene.backward_codecs.lucene100;

import java.util.Objects;
+import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
@@ -37,7 +38,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
-import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
@@ -50,7 +50,7 @@ import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
*
* <p>If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}.
*
-* @see org.apache.lucene.codecs.lucene100 package documentation for file format details.
+* @see org.apache.lucene.backward_codecs.lucene100 package documentation for file format details.
* @lucene.experimental
*/
public class Lucene100Codec extends Codec {
@@ -15,5 +15,5 @@
* limitations under the License.
*/

-/** Lucene 9.12 file format. */
-package org.apache.lucene.codecs.lucene912;
+/** Lucene 10.0 file format. */
+package org.apache.lucene.backward_codecs.lucene100;
@@ -16,13 +16,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene912;
+package org.apache.lucene.backward_codecs.lucene912;

-import static org.apache.lucene.codecs.lucene912.ForUtil.*;
+import static org.apache.lucene.backward_codecs.lucene912.ForUtil.*;

import java.io.IOException;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.packed.PackedInts;

/**
@@ -30,7 +30,7 @@ import org.apache.lucene.util.packed.PackedInts;
* SIMD-like speedups. If bitsPerValue <= 4 then we pack 8 ints per long else if bitsPerValue
* <= 11 we pack 4 ints per long else we pack 2 ints per long
*/
-public final class ForDeltaUtil {
+final class ForDeltaUtil {

private static final int ONE_BLOCK_SIZE_FOURTH = BLOCK_SIZE / 4;
private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2;
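To make the packing scheme in the class comment above concrete, a small hand-rolled sketch (mine, with hypothetical values): with bitsPerValue <= 4, eight values fit in one long at 8-bit spacing, which is the layout that the 8-ints-per-long paths such as prefixSum8 operate on.

// Pack eight small values into one long, 8 bits apart, then unpack them.
int[] values = {1, 2, 3, 4, 5, 6, 7, 8};
long packed = 0L;
for (int i = 0; i < 8; i++) {
  packed |= (long) values[i] << (56 - 8 * i);
}
for (int i = 0; i < 8; i++) {
  int v = (int) ((packed >>> (56 - 8 * i)) & 0xFF);
  assert v == values[i];
}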
@@ -272,125 +272,124 @@ public final class ForDeltaUtil {
}

/** Decode deltas, compute the prefix sum and add {@code base} to all decoded longs. */
-void decodeAndPrefixSum(PostingDecodingUtil pdu, long base, long[] longs) throws IOException {
-final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte());
+void decodeAndPrefixSum(IndexInput in, long base, long[] longs) throws IOException {
+final int bitsPerValue = Byte.toUnsignedInt(in.readByte());
if (bitsPerValue == 0) {
prefixSumOfOnes(longs, base);
} else {
-decodeAndPrefixSum(bitsPerValue, pdu, base, longs);
+decodeAndPrefixSum(bitsPerValue, in, base, longs);
}
}

/** Delta-decode 128 integers into {@code longs}. */
-void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, long base, long[] longs)
+void decodeAndPrefixSum(int bitsPerValue, IndexInput in, long base, long[] longs)
throws IOException {
switch (bitsPerValue) {
case 1:
-decode1(pdu, longs);
+decode1(in, longs);
prefixSum8(longs, base);
break;
case 2:
-decode2(pdu, longs);
+decode2(in, longs);
prefixSum8(longs, base);
break;
case 3:
-decode3(pdu, tmp, longs);
+decode3(in, tmp, longs);
prefixSum8(longs, base);
break;
case 4:
-decode4(pdu, longs);
+decode4(in, longs);
prefixSum8(longs, base);
break;
case 5:
-decode5To16(pdu, tmp, longs);
+decode5To16(in, tmp, longs);
prefixSum16(longs, base);
break;
case 6:
-decode6To16(pdu, tmp, longs);
+decode6To16(in, tmp, longs);
prefixSum16(longs, base);
break;
case 7:
-decode7To16(pdu, tmp, longs);
+decode7To16(in, tmp, longs);
prefixSum16(longs, base);
break;
case 8:
-decode8To16(pdu, longs);
+decode8To16(in, longs);
prefixSum16(longs, base);
break;
case 9:
-decode9(pdu, tmp, longs);
+decode9(in, tmp, longs);
prefixSum16(longs, base);
break;
case 10:
-decode10(pdu, tmp, longs);
+decode10(in, tmp, longs);
prefixSum16(longs, base);
break;
case 11:
-decode11(pdu, tmp, longs);
+decode11(in, tmp, longs);
prefixSum16(longs, base);
break;
case 12:
-decode12To32(pdu, tmp, longs);
+decode12To32(in, tmp, longs);
prefixSum32(longs, base);
break;
case 13:
-decode13To32(pdu, tmp, longs);
+decode13To32(in, tmp, longs);
prefixSum32(longs, base);
break;
case 14:
-decode14To32(pdu, tmp, longs);
+decode14To32(in, tmp, longs);
prefixSum32(longs, base);
break;
case 15:
-decode15To32(pdu, tmp, longs);
+decode15To32(in, tmp, longs);
prefixSum32(longs, base);
break;
case 16:
-decode16To32(pdu, longs);
+decode16To32(in, longs);
prefixSum32(longs, base);
break;
case 17:
-decode17(pdu, tmp, longs);
+decode17(in, tmp, longs);
prefixSum32(longs, base);
break;
case 18:
-decode18(pdu, tmp, longs);
+decode18(in, tmp, longs);
prefixSum32(longs, base);
break;
case 19:
-decode19(pdu, tmp, longs);
+decode19(in, tmp, longs);
prefixSum32(longs, base);
break;
case 20:
-decode20(pdu, tmp, longs);
+decode20(in, tmp, longs);
prefixSum32(longs, base);
break;
case 21:
-decode21(pdu, tmp, longs);
+decode21(in, tmp, longs);
prefixSum32(longs, base);
break;
case 22:
-decode22(pdu, tmp, longs);
+decode22(in, tmp, longs);
prefixSum32(longs, base);
break;
case 23:
-decode23(pdu, tmp, longs);
+decode23(in, tmp, longs);
prefixSum32(longs, base);
break;
case 24:
-decode24(pdu, tmp, longs);
+decode24(in, tmp, longs);
prefixSum32(longs, base);
break;
default:
-decodeSlow(bitsPerValue, pdu, tmp, longs);
+decodeSlow(bitsPerValue, in, tmp, longs);
prefixSum32(longs, base);
break;
}
}

-private static void decode5To16(PostingDecodingUtil pdu, long[] tmp, long[] longs)
-throws IOException {
-pdu.splitLongs(10, longs, 11, 5, MASK16_5, tmp, 0, MASK16_1);
+private static void decode5To16(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 10, longs, 11, 5, MASK16_5, tmp, 0, MASK16_1);
for (int iter = 0, tmpIdx = 0, longsIdx = 30; iter < 2; ++iter, tmpIdx += 5, longsIdx += 1) {
long l0 = tmp[tmpIdx + 0] << 4;
l0 |= tmp[tmpIdx + 1] << 3;
@@ -401,9 +400,8 @@ public final class ForDeltaUtil {
}
}

-private static void decode6To16(PostingDecodingUtil pdu, long[] tmp, long[] longs)
-throws IOException {
-pdu.splitLongs(12, longs, 10, 6, MASK16_6, tmp, 0, MASK16_4);
+private static void decode6To16(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 12, longs, 10, 6, MASK16_6, tmp, 0, MASK16_4);
for (int iter = 0, tmpIdx = 0, longsIdx = 24; iter < 4; ++iter, tmpIdx += 3, longsIdx += 2) {
long l0 = tmp[tmpIdx + 0] << 2;
l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_2;
@@ -414,9 +412,8 @@ public final class ForDeltaUtil {
}
}

-private static void decode7To16(PostingDecodingUtil pdu, long[] tmp, long[] longs)
-throws IOException {
-pdu.splitLongs(14, longs, 9, 7, MASK16_7, tmp, 0, MASK16_2);
+private static void decode7To16(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 14, longs, 9, 7, MASK16_7, tmp, 0, MASK16_2);
for (int iter = 0, tmpIdx = 0, longsIdx = 28; iter < 2; ++iter, tmpIdx += 7, longsIdx += 2) {
long l0 = tmp[tmpIdx + 0] << 5;
l0 |= tmp[tmpIdx + 1] << 3;
@@ -431,13 +428,12 @@ public final class ForDeltaUtil {
}
}

-private static void decode8To16(PostingDecodingUtil pdu, long[] longs) throws IOException {
-pdu.splitLongs(16, longs, 8, 8, MASK16_8, longs, 16, MASK16_8);
+private static void decode8To16(IndexInput in, long[] longs) throws IOException {
+splitLongs(in, 16, longs, 8, 8, MASK16_8, longs, 16, MASK16_8);
}

-private static void decode12To32(PostingDecodingUtil pdu, long[] tmp, long[] longs)
-throws IOException {
-pdu.splitLongs(24, longs, 20, 12, MASK32_12, tmp, 0, MASK32_8);
+private static void decode12To32(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 24, longs, 20, 12, MASK32_12, tmp, 0, MASK32_8);
for (int iter = 0, tmpIdx = 0, longsIdx = 48; iter < 8; ++iter, tmpIdx += 3, longsIdx += 2) {
long l0 = tmp[tmpIdx + 0] << 4;
l0 |= (tmp[tmpIdx + 1] >>> 4) & MASK32_4;
@@ -448,9 +444,8 @@ public final class ForDeltaUtil {
}
}

-private static void decode13To32(PostingDecodingUtil pdu, long[] tmp, long[] longs)
-throws IOException {
-pdu.splitLongs(26, longs, 19, 13, MASK32_13, tmp, 0, MASK32_6);
+private static void decode13To32(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 26, longs, 19, 13, MASK32_13, tmp, 0, MASK32_6);
for (int iter = 0, tmpIdx = 0, longsIdx = 52; iter < 2; ++iter, tmpIdx += 13, longsIdx += 6) {
long l0 = tmp[tmpIdx + 0] << 7;
l0 |= tmp[tmpIdx + 1] << 1;
@@ -479,9 +474,8 @@ public final class ForDeltaUtil {
}
}

-private static void decode14To32(PostingDecodingUtil pdu, long[] tmp, long[] longs)
-throws IOException {
-pdu.splitLongs(28, longs, 18, 14, MASK32_14, tmp, 0, MASK32_4);
+private static void decode14To32(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 28, longs, 18, 14, MASK32_14, tmp, 0, MASK32_4);
for (int iter = 0, tmpIdx = 0, longsIdx = 56; iter < 4; ++iter, tmpIdx += 7, longsIdx += 2) {
long l0 = tmp[tmpIdx + 0] << 10;
l0 |= tmp[tmpIdx + 1] << 6;
@@ -496,9 +490,8 @@ public final class ForDeltaUtil {
}
}

-private static void decode15To32(PostingDecodingUtil pdu, long[] tmp, long[] longs)
-throws IOException {
-pdu.splitLongs(30, longs, 17, 15, MASK32_15, tmp, 0, MASK32_2);
+private static void decode15To32(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 30, longs, 17, 15, MASK32_15, tmp, 0, MASK32_2);
for (int iter = 0, tmpIdx = 0, longsIdx = 60; iter < 2; ++iter, tmpIdx += 15, longsIdx += 2) {
long l0 = tmp[tmpIdx + 0] << 13;
l0 |= tmp[tmpIdx + 1] << 11;
@@ -521,7 +514,7 @@ public final class ForDeltaUtil {
}
}

-private static void decode16To32(PostingDecodingUtil pdu, long[] longs) throws IOException {
-pdu.splitLongs(32, longs, 16, 16, MASK32_16, longs, 32, MASK32_16);
+private static void decode16To32(IndexInput in, long[] longs) throws IOException {
+splitLongs(in, 32, longs, 16, 16, MASK32_16, longs, 32, MASK32_16);
}
}
@@ -16,18 +16,18 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene912;
+package org.apache.lucene.backward_codecs.lucene912;

import java.io.IOException;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;

/**
* Inspired from https://fulmicoton.com/posts/bitpacking/ Encodes multiple integers in a long to get
* SIMD-like speedups. If bitsPerValue <= 8 then we pack 8 ints per long else if bitsPerValue
* <= 16 we pack 4 ints per long else we pack 2 ints per long
*/
-public final class ForUtil {
+final class ForUtil {

public static final int BLOCK_SIZE = 128;
static final int BLOCK_SIZE_LOG2 = 7;
@@ -196,11 +196,11 @@ public final class ForUtil {
return bitsPerValue << (BLOCK_SIZE_LOG2 - 3);
}

-static void decodeSlow(int bitsPerValue, PostingDecodingUtil pdu, long[] tmp, long[] longs)
+static void decodeSlow(int bitsPerValue, IndexInput in, long[] tmp, long[] longs)
throws IOException {
final int numLongs = bitsPerValue << 1;
final long mask = MASKS32[bitsPerValue];
-pdu.splitLongs(numLongs, longs, 32 - bitsPerValue, 32, mask, tmp, 0, -1L);
+splitLongs(in, numLongs, longs, 32 - bitsPerValue, 32, mask, tmp, 0, -1L);
final int remainingBitsPerLong = 32 - bitsPerValue;
final long mask32RemainingBitsPerLong = MASKS32[remainingBitsPerLong];
int tmpIdx = 0;
@@ -222,6 +222,28 @@ public final class ForUtil {
}
}

+static void splitLongs(
+IndexInput in,
+int count,
+long[] b,
+int bShift,
+int dec,
+long bMask,
+long[] c,
+int cIndex,
+long cMask)
+throws IOException {
+// takes advantage of the C2 compiler's loop unrolling and auto-vectorization.
+in.readLongs(c, cIndex, count);
+int maxIter = (bShift - 1) / dec;
+for (int i = 0; i < count; ++i) {
+for (int j = 0; j <= maxIter; ++j) {
+b[count * j + i] = (c[cIndex + i] >>> (bShift - j * dec)) & bMask;
+}
+c[cIndex + i] &= cMask;
+}
+}
+
static final long[] MASKS8 = new long[8];
static final long[] MASKS16 = new long[16];
static final long[] MASKS32 = new long[32];
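A standalone sketch of what the splitLongs helper added above computes, with the IndexInput read replaced by an in-memory array (demo values are hypothetical): each input long is sliced into bMask-wide chunks written to b, and the low remainder is masked in place in c.

public class SplitLongsDemo {
  // Same loop as the helper above, minus the IndexInput.
  static void splitLongs(long[] src, int count, long[] b, int bShift, int dec,
      long bMask, long[] c, int cIndex, long cMask) {
    System.arraycopy(src, 0, c, cIndex, count); // stands in for in.readLongs
    int maxIter = (bShift - 1) / dec;
    for (int i = 0; i < count; ++i) {
      for (int j = 0; j <= maxIter; ++j) {
        b[count * j + i] = (c[cIndex + i] >>> (bShift - j * dec)) & bMask;
      }
      c[cIndex + i] &= cMask;
    }
  }

  public static void main(String[] args) {
    long[] src = {0xABL};
    long[] b = new long[1];
    long[] c = new long[1];
    // Split one byte into its high nibble (lands in b) and low nibble (stays in c).
    splitLongs(src, 1, b, 4, 4, 0xFL, c, 0, 0xFL);
    System.out.println(Long.toHexString(b[0]) + " " + Long.toHexString(c[0])); // prints: a b
  }
}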
@@ -288,121 +310,121 @@ public final class ForUtil {
static final long MASK32_24 = MASKS32[24];

/** Decode 128 integers into {@code longs}. */
-void decode(int bitsPerValue, PostingDecodingUtil pdu, long[] longs) throws IOException {
+void decode(int bitsPerValue, IndexInput in, long[] longs) throws IOException {
switch (bitsPerValue) {
case 1:
-decode1(pdu, longs);
+decode1(in, longs);
expand8(longs);
break;
case 2:
-decode2(pdu, longs);
+decode2(in, longs);
expand8(longs);
break;
case 3:
-decode3(pdu, tmp, longs);
+decode3(in, tmp, longs);
expand8(longs);
break;
case 4:
-decode4(pdu, longs);
+decode4(in, longs);
expand8(longs);
break;
case 5:
-decode5(pdu, tmp, longs);
+decode5(in, tmp, longs);
expand8(longs);
break;
case 6:
-decode6(pdu, tmp, longs);
+decode6(in, tmp, longs);
expand8(longs);
break;
case 7:
-decode7(pdu, tmp, longs);
+decode7(in, tmp, longs);
expand8(longs);
break;
case 8:
-decode8(pdu, longs);
+decode8(in, longs);
expand8(longs);
break;
case 9:
-decode9(pdu, tmp, longs);
+decode9(in, tmp, longs);
expand16(longs);
break;
case 10:
-decode10(pdu, tmp, longs);
+decode10(in, tmp, longs);
expand16(longs);
break;
case 11:
-decode11(pdu, tmp, longs);
+decode11(in, tmp, longs);
expand16(longs);
break;
case 12:
-decode12(pdu, tmp, longs);
+decode12(in, tmp, longs);
expand16(longs);
break;
case 13:
-decode13(pdu, tmp, longs);
+decode13(in, tmp, longs);
expand16(longs);
break;
case 14:
-decode14(pdu, tmp, longs);
+decode14(in, tmp, longs);
expand16(longs);
break;
case 15:
-decode15(pdu, tmp, longs);
+decode15(in, tmp, longs);
expand16(longs);
break;
case 16:
-decode16(pdu, longs);
+decode16(in, longs);
expand16(longs);
break;
case 17:
-decode17(pdu, tmp, longs);
+decode17(in, tmp, longs);
expand32(longs);
break;
case 18:
-decode18(pdu, tmp, longs);
+decode18(in, tmp, longs);
expand32(longs);
break;
case 19:
-decode19(pdu, tmp, longs);
+decode19(in, tmp, longs);
expand32(longs);
break;
case 20:
-decode20(pdu, tmp, longs);
+decode20(in, tmp, longs);
expand32(longs);
break;
case 21:
-decode21(pdu, tmp, longs);
+decode21(in, tmp, longs);
expand32(longs);
break;
case 22:
-decode22(pdu, tmp, longs);
+decode22(in, tmp, longs);
expand32(longs);
break;
case 23:
-decode23(pdu, tmp, longs);
+decode23(in, tmp, longs);
expand32(longs);
break;
case 24:
-decode24(pdu, tmp, longs);
+decode24(in, tmp, longs);
expand32(longs);
break;
default:
-decodeSlow(bitsPerValue, pdu, tmp, longs);
+decodeSlow(bitsPerValue, in, tmp, longs);
expand32(longs);
break;
}
}

-static void decode1(PostingDecodingUtil pdu, long[] longs) throws IOException {
-pdu.splitLongs(2, longs, 7, 1, MASK8_1, longs, 14, MASK8_1);
+static void decode1(IndexInput in, long[] longs) throws IOException {
+splitLongs(in, 2, longs, 7, 1, MASK8_1, longs, 14, MASK8_1);
}

-static void decode2(PostingDecodingUtil pdu, long[] longs) throws IOException {
-pdu.splitLongs(4, longs, 6, 2, MASK8_2, longs, 12, MASK8_2);
+static void decode2(IndexInput in, long[] longs) throws IOException {
+splitLongs(in, 4, longs, 6, 2, MASK8_2, longs, 12, MASK8_2);
}

-static void decode3(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(6, longs, 5, 3, MASK8_3, tmp, 0, MASK8_2);
+static void decode3(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 6, longs, 5, 3, MASK8_3, tmp, 0, MASK8_2);
for (int iter = 0, tmpIdx = 0, longsIdx = 12; iter < 2; ++iter, tmpIdx += 3, longsIdx += 2) {
long l0 = tmp[tmpIdx + 0] << 1;
l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_1;
@@ -413,12 +435,12 @@ public final class ForUtil {
}
}

-static void decode4(PostingDecodingUtil pdu, long[] longs) throws IOException {
-pdu.splitLongs(8, longs, 4, 4, MASK8_4, longs, 8, MASK8_4);
+static void decode4(IndexInput in, long[] longs) throws IOException {
+splitLongs(in, 8, longs, 4, 4, MASK8_4, longs, 8, MASK8_4);
}

-static void decode5(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(10, longs, 3, 5, MASK8_5, tmp, 0, MASK8_3);
+static void decode5(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 10, longs, 3, 5, MASK8_5, tmp, 0, MASK8_3);
for (int iter = 0, tmpIdx = 0, longsIdx = 10; iter < 2; ++iter, tmpIdx += 5, longsIdx += 3) {
long l0 = tmp[tmpIdx + 0] << 2;
l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_2;
@@ -433,8 +455,8 @@ public final class ForUtil {
}
}

-static void decode6(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(12, longs, 2, 6, MASK8_6, tmp, 0, MASK8_2);
+static void decode6(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 12, longs, 2, 6, MASK8_6, tmp, 0, MASK8_2);
for (int iter = 0, tmpIdx = 0, longsIdx = 12; iter < 4; ++iter, tmpIdx += 3, longsIdx += 1) {
long l0 = tmp[tmpIdx + 0] << 4;
l0 |= tmp[tmpIdx + 1] << 2;
@@ -443,8 +465,8 @@ public final class ForUtil {
}
}

-static void decode7(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(14, longs, 1, 7, MASK8_7, tmp, 0, MASK8_1);
+static void decode7(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 14, longs, 1, 7, MASK8_7, tmp, 0, MASK8_1);
for (int iter = 0, tmpIdx = 0, longsIdx = 14; iter < 2; ++iter, tmpIdx += 7, longsIdx += 1) {
long l0 = tmp[tmpIdx + 0] << 6;
l0 |= tmp[tmpIdx + 1] << 5;
@@ -457,12 +479,12 @@ public final class ForUtil {
}
}

-static void decode8(PostingDecodingUtil pdu, long[] longs) throws IOException {
-pdu.in.readLongs(longs, 0, 16);
+static void decode8(IndexInput in, long[] longs) throws IOException {
+in.readLongs(longs, 0, 16);
}

-static void decode9(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(18, longs, 7, 9, MASK16_9, tmp, 0, MASK16_7);
+static void decode9(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 18, longs, 7, 9, MASK16_9, tmp, 0, MASK16_7);
for (int iter = 0, tmpIdx = 0, longsIdx = 18; iter < 2; ++iter, tmpIdx += 9, longsIdx += 7) {
long l0 = tmp[tmpIdx + 0] << 2;
l0 |= (tmp[tmpIdx + 1] >>> 5) & MASK16_2;
@@ -489,8 +511,8 @@ public final class ForUtil {
}
}

-static void decode10(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(20, longs, 6, 10, MASK16_10, tmp, 0, MASK16_6);
+static void decode10(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 20, longs, 6, 10, MASK16_10, tmp, 0, MASK16_6);
for (int iter = 0, tmpIdx = 0, longsIdx = 20; iter < 4; ++iter, tmpIdx += 5, longsIdx += 3) {
long l0 = tmp[tmpIdx + 0] << 4;
l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_4;
@@ -505,8 +527,8 @@ public final class ForUtil {
}
}

-static void decode11(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(22, longs, 5, 11, MASK16_11, tmp, 0, MASK16_5);
+static void decode11(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 22, longs, 5, 11, MASK16_11, tmp, 0, MASK16_5);
for (int iter = 0, tmpIdx = 0, longsIdx = 22; iter < 2; ++iter, tmpIdx += 11, longsIdx += 5) {
long l0 = tmp[tmpIdx + 0] << 6;
l0 |= tmp[tmpIdx + 1] << 1;
@@ -531,8 +553,8 @@ public final class ForUtil {
}
}

-static void decode12(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(24, longs, 4, 12, MASK16_12, tmp, 0, MASK16_4);
+static void decode12(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 24, longs, 4, 12, MASK16_12, tmp, 0, MASK16_4);
for (int iter = 0, tmpIdx = 0, longsIdx = 24; iter < 8; ++iter, tmpIdx += 3, longsIdx += 1) {
long l0 = tmp[tmpIdx + 0] << 8;
l0 |= tmp[tmpIdx + 1] << 4;
@@ -541,8 +563,8 @@ public final class ForUtil {
}
}

-static void decode13(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(26, longs, 3, 13, MASK16_13, tmp, 0, MASK16_3);
+static void decode13(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 26, longs, 3, 13, MASK16_13, tmp, 0, MASK16_3);
for (int iter = 0, tmpIdx = 0, longsIdx = 26; iter < 2; ++iter, tmpIdx += 13, longsIdx += 3) {
long l0 = tmp[tmpIdx + 0] << 10;
l0 |= tmp[tmpIdx + 1] << 7;
@@ -565,8 +587,8 @@ public final class ForUtil {
}
}

-static void decode14(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(28, longs, 2, 14, MASK16_14, tmp, 0, MASK16_2);
+static void decode14(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 28, longs, 2, 14, MASK16_14, tmp, 0, MASK16_2);
for (int iter = 0, tmpIdx = 0, longsIdx = 28; iter < 4; ++iter, tmpIdx += 7, longsIdx += 1) {
long l0 = tmp[tmpIdx + 0] << 12;
l0 |= tmp[tmpIdx + 1] << 10;
@@ -579,8 +601,8 @@ public final class ForUtil {
}
}

-static void decode15(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(30, longs, 1, 15, MASK16_15, tmp, 0, MASK16_1);
+static void decode15(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 30, longs, 1, 15, MASK16_15, tmp, 0, MASK16_1);
for (int iter = 0, tmpIdx = 0, longsIdx = 30; iter < 2; ++iter, tmpIdx += 15, longsIdx += 1) {
long l0 = tmp[tmpIdx + 0] << 14;
l0 |= tmp[tmpIdx + 1] << 13;
@@ -601,12 +623,12 @@ public final class ForUtil {
}
}

-static void decode16(PostingDecodingUtil pdu, long[] longs) throws IOException {
-pdu.in.readLongs(longs, 0, 32);
+static void decode16(IndexInput in, long[] longs) throws IOException {
+in.readLongs(longs, 0, 32);
}

-static void decode17(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(34, longs, 15, 17, MASK32_17, tmp, 0, MASK32_15);
+static void decode17(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 34, longs, 15, 17, MASK32_17, tmp, 0, MASK32_15);
for (int iter = 0, tmpIdx = 0, longsIdx = 34; iter < 2; ++iter, tmpIdx += 17, longsIdx += 15) {
long l0 = tmp[tmpIdx + 0] << 2;
l0 |= (tmp[tmpIdx + 1] >>> 13) & MASK32_2;
@@ -657,8 +679,8 @@ public final class ForUtil {
}
}

-static void decode18(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(36, longs, 14, 18, MASK32_18, tmp, 0, MASK32_14);
+static void decode18(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 36, longs, 14, 18, MASK32_18, tmp, 0, MASK32_14);
for (int iter = 0, tmpIdx = 0, longsIdx = 36; iter < 4; ++iter, tmpIdx += 9, longsIdx += 7) {
long l0 = tmp[tmpIdx + 0] << 4;
l0 |= (tmp[tmpIdx + 1] >>> 10) & MASK32_4;
@@ -685,8 +707,8 @@ public final class ForUtil {
}
}

-static void decode19(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(38, longs, 13, 19, MASK32_19, tmp, 0, MASK32_13);
+static void decode19(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 38, longs, 13, 19, MASK32_19, tmp, 0, MASK32_13);
for (int iter = 0, tmpIdx = 0, longsIdx = 38; iter < 2; ++iter, tmpIdx += 19, longsIdx += 13) {
long l0 = tmp[tmpIdx + 0] << 6;
l0 |= (tmp[tmpIdx + 1] >>> 7) & MASK32_6;
@@ -735,8 +757,8 @@ public final class ForUtil {
}
}

-static void decode20(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(40, longs, 12, 20, MASK32_20, tmp, 0, MASK32_12);
+static void decode20(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 40, longs, 12, 20, MASK32_20, tmp, 0, MASK32_12);
for (int iter = 0, tmpIdx = 0, longsIdx = 40; iter < 8; ++iter, tmpIdx += 5, longsIdx += 3) {
long l0 = tmp[tmpIdx + 0] << 8;
l0 |= (tmp[tmpIdx + 1] >>> 4) & MASK32_8;
@@ -751,8 +773,8 @@ public final class ForUtil {
}
}

-static void decode21(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(42, longs, 11, 21, MASK32_21, tmp, 0, MASK32_11);
+static void decode21(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 42, longs, 11, 21, MASK32_21, tmp, 0, MASK32_11);
for (int iter = 0, tmpIdx = 0, longsIdx = 42; iter < 2; ++iter, tmpIdx += 21, longsIdx += 11) {
long l0 = tmp[tmpIdx + 0] << 10;
l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK32_10;
@@ -799,8 +821,8 @@ public final class ForUtil {
}
}

-static void decode22(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(44, longs, 10, 22, MASK32_22, tmp, 0, MASK32_10);
+static void decode22(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 44, longs, 10, 22, MASK32_22, tmp, 0, MASK32_10);
for (int iter = 0, tmpIdx = 0, longsIdx = 44; iter < 4; ++iter, tmpIdx += 11, longsIdx += 5) {
long l0 = tmp[tmpIdx + 0] << 12;
l0 |= tmp[tmpIdx + 1] << 2;
@@ -825,8 +847,8 @@ public final class ForUtil {
}
}

-static void decode23(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(46, longs, 9, 23, MASK32_23, tmp, 0, MASK32_9);
+static void decode23(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 46, longs, 9, 23, MASK32_23, tmp, 0, MASK32_9);
for (int iter = 0, tmpIdx = 0, longsIdx = 46; iter < 2; ++iter, tmpIdx += 23, longsIdx += 9) {
long l0 = tmp[tmpIdx + 0] << 14;
l0 |= tmp[tmpIdx + 1] << 5;
@@ -871,8 +893,8 @@ public final class ForUtil {
}
}

-static void decode24(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {
-pdu.splitLongs(48, longs, 8, 24, MASK32_24, tmp, 0, MASK32_8);
+static void decode24(IndexInput in, long[] tmp, long[] longs) throws IOException {
+splitLongs(in, 48, longs, 8, 24, MASK32_24, tmp, 0, MASK32_8);
for (int iter = 0, tmpIdx = 0, longsIdx = 48; iter < 16; ++iter, tmpIdx += 3, longsIdx += 1) {
long l0 = tmp[tmpIdx + 0] << 16;
l0 |= tmp[tmpIdx + 1] << 8;
@@ -37,7 +37,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
-import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene912;
+package org.apache.lucene.backward_codecs.lucene912;

import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
@@ -23,7 +23,6 @@ import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
-import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.index.IndexOptions;
@@ -318,7 +317,7 @@ import org.apache.lucene.util.packed.PackedInts;
*
* @lucene.experimental
*/
-public final class Lucene912PostingsFormat extends PostingsFormat {
+public class Lucene912PostingsFormat extends PostingsFormat {

/** Filename extension for some small metadata about how postings are encoded. */
public static final String META_EXTENSION = "psm";
@@ -341,7 +340,7 @@ public final class Lucene912PostingsFormat extends PostingsFormat {
/** Size of blocks. */
public static final int BLOCK_SIZE = ForUtil.BLOCK_SIZE;

-public static final int BLOCK_MASK = BLOCK_SIZE - 1;
+static final int BLOCK_MASK = BLOCK_SIZE - 1;

/** We insert skip data on every block and every SKIP_FACTOR=32 blocks. */
public static final int LEVEL1_FACTOR = 32;
@@ -349,7 +348,7 @@ public final class Lucene912PostingsFormat extends PostingsFormat {
/** Total number of docs covered by level 1 skip data: 32 * 128 = 4,096 */
public static final int LEVEL1_NUM_DOCS = LEVEL1_FACTOR * BLOCK_SIZE;

-public static final int LEVEL1_MASK = LEVEL1_NUM_DOCS - 1;
+static final int LEVEL1_MASK = LEVEL1_NUM_DOCS - 1;

static final String TERMS_CODEC = "Lucene90PostingsWriterTerms";
static final String META_CODEC = "Lucene912PostingsWriterMeta";
@@ -360,45 +359,15 @@ public final class Lucene912PostingsFormat extends PostingsFormat {
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;

-private final int minTermBlockSize;
-private final int maxTermBlockSize;
-
/** Creates {@code Lucene912PostingsFormat} with default settings. */
public Lucene912PostingsFormat() {
-this(
-Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
-Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
-}
-
-/**
-* Creates {@code Lucene912PostingsFormat} with custom values for {@code minBlockSize} and {@code
-* maxBlockSize} passed to block terms dictionary.
-*
-* @see
-* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
-*/
-public Lucene912PostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super("Lucene912");
-Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
-this.minTermBlockSize = minTermBlockSize;
-this.maxTermBlockSize = maxTermBlockSize;
}

@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
-boolean success = false;
-try {
-FieldsConsumer ret =
-new Lucene90BlockTreeTermsWriter(
-state, postingsWriter, minTermBlockSize, maxTermBlockSize);
-success = true;
-return ret;
-} finally {
-if (!success) {
-IOUtils.closeWhileHandlingException(postingsWriter);
-}
-}
+throw new UnsupportedOperationException(
+"This postings format may not be used for writing, use the current postings format");
}

@Override
@@ -14,17 +14,17 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene912;
+package org.apache.lucene.backward_codecs.lucene912;

-import static org.apache.lucene.codecs.lucene912.ForUtil.BLOCK_SIZE;
-import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.DOC_CODEC;
-import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.LEVEL1_NUM_DOCS;
-import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.META_CODEC;
-import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.PAY_CODEC;
-import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.POS_CODEC;
-import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.TERMS_CODEC;
-import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.VERSION_CURRENT;
-import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.VERSION_START;
+import static org.apache.lucene.backward_codecs.lucene912.ForUtil.BLOCK_SIZE;
+import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.DOC_CODEC;
+import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.LEVEL1_NUM_DOCS;
+import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.META_CODEC;
+import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.PAY_CODEC;
+import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.POS_CODEC;
+import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.TERMS_CODEC;
+import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.VERSION_CURRENT;
+import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.VERSION_START;

import java.io.IOException;
import java.util.AbstractList;
@@ -32,10 +32,10 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.RandomAccess;
+import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsReaderBase;
-import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.Impacts;
@@ -45,9 +45,6 @@ import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SlowImpactsEnum;
-import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
-import org.apache.lucene.internal.vectorization.VectorUtilSupport;
-import org.apache.lucene.internal.vectorization.VectorizationProvider;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
@@ -65,9 +62,6 @@ import org.apache.lucene.util.IOUtils;
*/
public final class Lucene912PostingsReader extends PostingsReaderBase {

-static final VectorizationProvider VECTORIZATION_PROVIDER = VectorizationProvider.getInstance();
-private static final VectorUtilSupport VECTOR_SUPPORT =
-VECTORIZATION_PROVIDER.getVectorUtilSupport();
// Dummy impacts, composed of the maximum possible term frequency and the lowest possible
// (unsigned) norm value. This is typically used on tail blocks, which don't actually record
// impacts as the storage overhead would not be worth any query evaluation speedup, since there's
@@ -355,7 +349,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
protected int docBufferUpto;

protected IndexInput docIn;
-protected PostingDecodingUtil docInUtil;

protected AbstractPostingsEnum(FieldInfo fieldInfo) {
indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
@@ -376,7 +369,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
if (docIn == null) {
// lazy init
docIn = Lucene912PostingsReader.this.docIn.clone();
-docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn);
}
prefetchPostings(docIn, termState);
}
@@ -442,7 +434,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
public int freq() throws IOException {
if (freqFP != -1) {
docIn.seek(freqFP);
-pforUtil.decode(docInUtil, freqBuffer);
+pforUtil.decode(docIn, freqBuffer);
freqFP = -1;
}
@@ -472,7 +464,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
private void refillFullBlock() throws IOException {
assert docFreq - docCountUpto >= BLOCK_SIZE;

-forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
+forDeltaUtil.decodeAndPrefixSum(docIn, prevDocID, docBuffer);

if (indexHasFreq) {
if (needsFreq) {
@@ -601,7 +593,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
}
}

-int next = VECTOR_SUPPORT.findNextGEQ(docBuffer, docBufferSize, target, docBufferUpto);
+int next = findNextGEQ(docBuffer, docBufferSize, target, docBufferUpto);
this.doc = (int) docBuffer[next];
docBufferUpto = next + 1;
return doc;
@@ -633,9 +625,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
private int posBufferUpto;

final IndexInput posIn;
-final PostingDecodingUtil posInUtil;
final IndexInput payIn;
-final PostingDecodingUtil payInUtil;
final BytesRef payload;

final boolean indexHasOffsets;
@@ -678,13 +668,10 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads;

this.posIn = Lucene912PostingsReader.this.posIn.clone();
-posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn);
if (indexHasOffsetsOrPayloads) {
this.payIn = Lucene912PostingsReader.this.payIn.clone();
-payInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(payIn);
} else {
this.payIn = null;
-payInUtil = null;
}
if (indexHasOffsets) {
offsetStartDeltaBuffer = new long[BLOCK_SIZE];
@@ -771,8 +758,8 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
assert left >= 0;

if (left >= BLOCK_SIZE) {
-forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
-pforUtil.decode(docInUtil, freqBuffer);
+forDeltaUtil.decodeAndPrefixSum(docIn, prevDocID, docBuffer);
+pforUtil.decode(docIn, freqBuffer);
docCountUpto += BLOCK_SIZE;
} else if (docFreq == 1) {
docBuffer[0] = singletonDocID;
@@ -950,7 +937,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
refillDocs();
}

-int next = VECTOR_SUPPORT.findNextGEQ(docBuffer, docBufferSize, target, docBufferUpto);
+int next = findNextGEQ(docBuffer, docBufferSize, target, docBufferUpto);
posPendingCount += sumOverRange(freqBuffer, docBufferUpto, next + 1);
this.freq = (int) freqBuffer[next];
this.docBufferUpto = next + 1;
@@ -1044,11 +1031,11 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
}
payloadByteUpto = 0;
} else {
-pforUtil.decode(posInUtil, posDeltaBuffer);
+pforUtil.decode(posIn, posDeltaBuffer);

if (indexHasPayloads) {
if (needsPayloads) {
-pforUtil.decode(payInUtil, payloadLengthBuffer);
+pforUtil.decode(payIn, payloadLengthBuffer);
int numBytes = payIn.readVInt();

if (numBytes > payloadBytes.length) {
@@ -1067,8 +1054,8 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {

if (indexHasOffsets) {
if (needsOffsets) {
-pforUtil.decode(payInUtil, offsetStartDeltaBuffer);
-pforUtil.decode(payInUtil, offsetLengthBuffer);
+pforUtil.decode(payIn, offsetStartDeltaBuffer);
+pforUtil.decode(payIn, offsetLengthBuffer);
} else {
// this works, because when writing a vint block we always force the first length to be
// written
@@ -1149,7 +1136,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
protected final int docFreq; // number of docs in this posting list

protected final IndexInput docIn;
-protected final PostingDecodingUtil docInUtil;

protected int docCountUpto; // number of docs in or before the current block
protected int doc = -1; // doc we last read
@@ -1175,7 +1161,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
private BlockImpactsEnum(IntBlockTermState termState) throws IOException {
this.docFreq = termState.docFreq;
this.docIn = Lucene912PostingsReader.this.docIn.clone();
-this.docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn);
prefetchPostings(docIn, termState);
level0SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel0);
level1SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel1);
@@ -1279,7 +1264,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
public int freq() throws IOException {
if (freqFP != -1) {
docIn.seek(freqFP);
-pforUtil.decode(docInUtil, freqBuffer);
+pforUtil.decode(docIn, freqBuffer);
freqFP = -1;
}
return (int) freqBuffer[docBufferUpto - 1];
@@ -1295,7 +1280,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
assert left >= 0;

if (left >= BLOCK_SIZE) {
-forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
+forDeltaUtil.decodeAndPrefixSum(docIn, prevDocID, docBuffer);
freqFP = docIn.getFilePointer();
PForUtil.skip(docIn);
docCountUpto += BLOCK_SIZE;
@@ -1438,7 +1423,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
needsRefilling = false;
}

-int next = VECTOR_SUPPORT.findNextGEQ(docBuffer, docBufferSize, target, docBufferUpto);
+int next = findNextGEQ(docBuffer, docBufferSize, target, docBufferUpto);
this.doc = (int) docBuffer[next];
docBufferUpto = next + 1;
return doc;
@@ -1450,7 +1435,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {

private int posBufferUpto;
final IndexInput posIn;
-final PostingDecodingUtil posInUtil;

final boolean indexHasFreq;
final boolean indexHasOffsets;
@@ -1491,7 +1475,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads;

this.posIn = Lucene912PostingsReader.this.posIn.clone();
-posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn);

// Where this term's postings start in the .pos file:
final long posTermStartFP = termState.posStartFP;
@@ -1522,8 +1505,8 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
assert left >= 0;

if (left >= BLOCK_SIZE) {
-forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
-pforUtil.decode(docInUtil, freqBuffer);
+forDeltaUtil.decodeAndPrefixSum(docIn, prevDocID, docBuffer);
+pforUtil.decode(docIn, freqBuffer);
docCountUpto += BLOCK_SIZE;
} else if (docFreq == 1) {
docBuffer[0] = singletonDocID;
@@ -1671,7 +1654,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
needsRefilling = false;
}

-int next = VECTOR_SUPPORT.findNextGEQ(docBuffer, docBufferSize, target, docBufferUpto);
+int next = findNextGEQ(docBuffer, docBufferSize, target, docBufferUpto);
posPendingCount += sumOverRange(freqBuffer, docBufferUpto, next + 1);
freq = (int) freqBuffer[next];
docBufferUpto = next + 1;
@@ -1729,7 +1712,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
}
}
} else {
-pforUtil.decode(posInUtil, posDeltaBuffer);
+pforUtil.decode(posIn, posDeltaBuffer);
}
}
@@ -1754,9 +1737,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
}
}

-/**
-* @see Lucene912PostingsWriter#writeVInt15(org.apache.lucene.store.DataOutput, int)
-*/
static int readVInt15(DataInput in) throws IOException {
short s = in.readShort();
if (s >= 0) {
@@ -1766,9 +1746,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
}
}

-/**
-* @see Lucene912PostingsWriter#writeVLong15(org.apache.lucene.store.DataOutput, long)
-*/
static long readVLong15(DataInput in) throws IOException {
short s = in.readShort();
if (s >= 0) {
@@ -1778,6 +1755,15 @@ public final class Lucene912PostingsReader extends PostingsReaderBase {
}
}

+private static int findNextGEQ(long[] buffer, int length, long target, int from) {
+for (int i = from; i < length; ++i) {
+if (buffer[i] >= target) {
+return i;
+}
+}
+return length;
+}
+
private static void prefetchPostings(IndexInput docIn, IntBlockTermState state)
throws IOException {
assert state.docFreq > 1; // Singletons are inlined in the terms dict, nothing to prefetch
@@ -14,13 +14,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene912;
+package org.apache.lucene.backward_codecs.lucene912;

import java.io.IOException;
import java.util.Arrays;
-import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.LongHeap;
import org.apache.lucene.util.packed.PackedInts;
@ -104,14 +104,13 @@ final class PForUtil {
}

/** Decode 128 integers into {@code ints}. */
void decode(PostingDecodingUtil pdu, long[] longs) throws IOException {
  var in = pdu.in;
void decode(IndexInput in, long[] longs) throws IOException {
  final int token = Byte.toUnsignedInt(in.readByte());
  final int bitsPerValue = token & 0x1f;
  if (bitsPerValue == 0) {
    Arrays.fill(longs, 0, ForUtil.BLOCK_SIZE, in.readVLong());
  } else {
    forUtil.decode(bitsPerValue, pdu, longs);
    forUtil.decode(bitsPerValue, in, longs);
  }
  final int numExceptions = token >>> 5;
  for (int i = 0; i < numExceptions; ++i) {
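The token byte read above packs two fields: the low 5 bits carry bitsPerValue and the high 3 bits carry the number of patched exceptions, which is exactly what the two mask operations in the shown code extract. A worked example:

// Example of the token split used by decode(): token = 0x45 = 0b010_00101.
int token = 0x45;
int bitsPerValue = token & 0x1f; // 0b00101 = 5 bits per packed value
int numExceptions = token >>> 5; // 0b010 = 2 exception values follow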
@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;

import java.io.IOException;
import org.apache.lucene.store.DataOutput;
@ -40,15 +40,14 @@ HEADER = """// This file has been automatically generated, DO NOT EDIT
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;

import java.io.IOException;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.packed.PackedInts;

import static org.apache.lucene.codecs.lucene912.ForUtil.*;
import static org.apache.lucene.backward_codecs.lucene912.ForUtil.*;

/**
* Inspired from https://fulmicoton.com/posts/bitpacking/

@ -57,7 +56,7 @@ import static org.apache.lucene.codecs.lucene912.ForUtil.*;
* else if bitsPerValue <= 11 we pack 4 ints per long
* else we pack 2 ints per long
*/
public final class ForDeltaUtil {
final class ForDeltaUtil {

private static final int ONE_BLOCK_SIZE_FOURTH = BLOCK_SIZE / 4;
private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2;
@ -299,12 +298,12 @@ public final class ForDeltaUtil {
}

/** Decode deltas, compute the prefix sum and add {@code base} to all decoded longs. */
void decodeAndPrefixSum(PostingDecodingUtil pdu, long base, long[] longs) throws IOException {
  final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte());
void decodeAndPrefixSum(IndexInput in, long base, long[] longs) throws IOException {
  final int bitsPerValue = Byte.toUnsignedInt(in.readByte());
  if (bitsPerValue == 0) {
    prefixSumOfOnes(longs, base);
  } else {
    decodeAndPrefixSum(bitsPerValue, pdu, base, longs);
    decodeAndPrefixSum(bitsPerValue, in, base, longs);
  }
}
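What decodeAndPrefixSum produces can be pictured with a scalar sketch: once the 128 deltas are unpacked, each slot becomes the running sum of the gaps so far, offset by the last doc ID of the previous block (the base argument). The real code vectorizes this step:

// Scalar sketch of the prefix-sum step, assuming `deltas` already holds the decoded gaps.
static void prefixSumSketch(long[] deltas, long base) {
  deltas[0] += base;
  for (int i = 1; i < deltas.length; ++i) {
    deltas[i] += deltas[i - 1];
  }
}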
@ -362,20 +361,20 @@ def writeRemainder(bpv, next_primitive, remaining_bits_per_long, o, num_values,
def writeDecode(bpv, f):
  next_primitive = primitive_size_for_bpv(bpv)
  if next_primitive % bpv == 0:
    f.write(' private static void decode%dTo%d(PostingDecodingUtil pdu, long[] longs) throws IOException {\n' %(bpv, next_primitive))
    f.write(' private static void decode%dTo%d(IndexInput in, long[] longs) throws IOException {\n' %(bpv, next_primitive))
  else:
    f.write(' private static void decode%dTo%d(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {\n' %(bpv, next_primitive))
    f.write(' private static void decode%dTo%d(IndexInput in, long[] tmp, long[] longs) throws IOException {\n' %(bpv, next_primitive))
  if bpv == next_primitive:
    f.write(' pdu.in.readLongs(longs, 0, %d);\n' %(bpv*2))
    f.write(' in.readLongs(longs, 0, %d);\n' %(bpv*2))
  else:
    num_values_per_long = 64 / next_primitive
    remaining_bits = next_primitive % bpv
    num_iters = (next_primitive - 1) // bpv
    o = 2 * bpv * num_iters
    if remaining_bits == 0:
      f.write(' pdu.splitLongs(%d, longs, %d, %d, MASK%d_%d, longs, %d, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
      f.write(' splitLongs(in, %d, longs, %d, %d, MASK%d_%d, longs, %d, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
    else:
      f.write(' pdu.splitLongs(%d, longs, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
      f.write(' splitLongs(in, %d, longs, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
    writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_long - o, f)
  f.write(' }\n')
@ -386,7 +385,7 @@ if __name__ == '__main__':
/**
* Delta-decode 128 integers into {@code longs}.
*/
void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, long base, long[] longs) throws IOException {
void decodeAndPrefixSum(int bitsPerValue, IndexInput in, long base, long[] longs) throws IOException {
switch (bitsPerValue) {
""")
for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
@ -394,18 +393,18 @@ if __name__ == '__main__':
  f.write(' case %d:\n' %bpv)
  if next_primitive(bpv) == primitive_size:
    if primitive_size % bpv == 0:
      f.write(' decode%d(pdu, longs);\n' %bpv)
      f.write(' decode%d(in, longs);\n' %bpv)
    else:
      f.write(' decode%d(pdu, tmp, longs);\n' %bpv)
      f.write(' decode%d(in, tmp, longs);\n' %bpv)
  else:
    if primitive_size % bpv == 0:
      f.write(' decode%dTo%d(pdu, longs);\n' %(bpv, primitive_size))
      f.write(' decode%dTo%d(in, longs);\n' %(bpv, primitive_size))
    else:
      f.write(' decode%dTo%d(pdu, tmp, longs);\n' %(bpv, primitive_size))
      f.write(' decode%dTo%d(in, tmp, longs);\n' %(bpv, primitive_size))
  f.write(' prefixSum%d(longs, base);\n' %primitive_size)
  f.write(' break;\n')
f.write(' default:\n')
f.write(' decodeSlow(bitsPerValue, pdu, tmp, longs);\n')
f.write(' decodeSlow(bitsPerValue, in, tmp, longs);\n')
f.write(' prefixSum32(longs, base);\n')
f.write(' break;\n')
f.write(' }\n')
@ -40,10 +40,9 @@ HEADER = """// This file has been automatically generated, DO NOT EDIT
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;

import java.io.IOException;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
@ -54,7 +53,7 @@ import org.apache.lucene.store.IndexInput;
* else if bitsPerValue <= 16 we pack 4 ints per long
* else we pack 2 ints per long
*/
public final class ForUtil {
final class ForUtil {

public static final int BLOCK_SIZE = 128;
static final int BLOCK_SIZE_LOG2 = 7;
@ -222,11 +221,11 @@ public final class ForUtil {
  return bitsPerValue << (BLOCK_SIZE_LOG2 - 3);
}

static void decodeSlow(int bitsPerValue, PostingDecodingUtil pdu, long[] tmp, long[] longs)
static void decodeSlow(int bitsPerValue, IndexInput in, long[] tmp, long[] longs)
    throws IOException {
  final int numLongs = bitsPerValue << 1;
  final long mask = MASKS32[bitsPerValue];
  pdu.splitLongs(numLongs, longs, 32 - bitsPerValue, 32, mask, tmp, 0, -1L);
  splitLongs(in, numLongs, longs, 32 - bitsPerValue, 32, mask, tmp, 0, -1L);
  final int remainingBitsPerLong = 32 - bitsPerValue;
  final long mask32RemainingBitsPerLong = MASKS32[remainingBitsPerLong];
  int tmpIdx = 0;
@ -248,6 +247,20 @@ public final class ForUtil {
  }
}

static void splitLongs(
    IndexInput in, int count, long[] b, int bShift, int dec, long bMask, long[] c, int cIndex, long cMask)
    throws IOException {
  // takes advantage of the C2 compiler's loop unrolling and auto-vectorization.
  in.readLongs(c, cIndex, count);
  int maxIter = (bShift - 1) / dec;
  for (int i = 0; i < count; ++i) {
    for (int j = 0; j <= maxIter; ++j) {
      b[count * j + i] = (c[cIndex + i] >>> (bShift - j * dec)) & bMask;
    }
    c[cIndex + i] &= cMask;
  }
}

"""

def writeRemainder(bpv, next_primitive, remaining_bits_per_long, o, num_values, f):
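To see what the splitLongs loop computes, take the simplest configuration: count = 1, bShift = 32, dec = 32, so maxIter = 0 and each input long is split into a high 32-bit lane (written to b) and a low lane (left in c):

// Worked example of one splitLongs iteration with bShift = 32, dec = 32,
// bMask = cMask = 0xFFFFFFFFL:
long word = 0x0000000A_0000000BL;      // c[0] after in.readLongs(...)
long hi = (word >>> 32) & 0xFFFFFFFFL; // b[0] = 0xA
long lo = word & 0xFFFFFFFFL;          // c[0] = 0xB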
@ -288,19 +301,19 @@ def writeDecode(bpv, f):
elif bpv <= 16:
  next_primitive = 16
if bpv == next_primitive:
  f.write(' static void decode%d(PostingDecodingUtil pdu, long[] longs) throws IOException {\n' %bpv)
  f.write(' pdu.in.readLongs(longs, 0, %d);\n' %(bpv*2))
  f.write(' static void decode%d(IndexInput in, long[] longs) throws IOException {\n' %bpv)
  f.write(' in.readLongs(longs, 0, %d);\n' %(bpv*2))
else:
  num_values_per_long = 64 / next_primitive
  remaining_bits = next_primitive % bpv
  num_iters = (next_primitive - 1) // bpv
  o = 2 * bpv * num_iters
  if remaining_bits == 0:
    f.write(' static void decode%d(PostingDecodingUtil pdu, long[] longs) throws IOException {\n' %bpv)
    f.write(' pdu.splitLongs(%d, longs, %d, %d, MASK%d_%d, longs, %d, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
    f.write(' static void decode%d(IndexInput in, long[] longs) throws IOException {\n' %bpv)
    f.write(' splitLongs(in, %d, longs, %d, %d, MASK%d_%d, longs, %d, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
  else:
    f.write(' static void decode%d(PostingDecodingUtil pdu, long[] tmp, long[] longs) throws IOException {\n' %bpv)
    f.write(' pdu.splitLongs(%d, longs, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
    f.write(' static void decode%d(IndexInput in, long[] tmp, long[] longs) throws IOException {\n' %bpv)
    f.write(' splitLongs(in, %d, longs, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*2, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
  writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_long - o, f)
f.write(' }\n')
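For a concrete picture of the script's output, the bpv == next_primitive branch above renders to something like the following (an illustrative expansion of the format strings for bpv = 16, not copied from the generated file):

static void decode16(IndexInput in, long[] longs) throws IOException {
  in.readLongs(longs, 0, 32); // bpv * 2 = 32 longs carry 128 16-bit values
}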
@ -326,7 +339,7 @@ if __name__ == '__main__':

f.write("""
/** Decode 128 integers into {@code longs}. */
void decode(int bitsPerValue, PostingDecodingUtil pdu, long[] longs) throws IOException {
void decode(int bitsPerValue, IndexInput in, long[] longs) throws IOException {
switch (bitsPerValue) {
""")
for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
@ -337,13 +350,13 @@ if __name__ == '__main__':
    next_primitive = 16
  f.write(' case %d:\n' %bpv)
  if next_primitive % bpv == 0:
    f.write(' decode%d(pdu, longs);\n' %bpv)
    f.write(' decode%d(in, longs);\n' %bpv)
  else:
    f.write(' decode%d(pdu, tmp, longs);\n' %bpv)
    f.write(' decode%d(in, tmp, longs);\n' %bpv)
  f.write(' expand%d(longs);\n' %next_primitive)
  f.write(' break;\n')
f.write(' default:\n')
f.write(' decodeSlow(bitsPerValue, pdu, tmp, longs);\n')
f.write(' decodeSlow(bitsPerValue, in, tmp, longs);\n')
f.write(' expand32(longs);\n')
f.write(' break;\n')
f.write(' }\n')
@ -15,419 +15,5 @@
* limitations under the License.
*/

/**
* Lucene 9.12 file format.
*
* <h2>Apache Lucene - Index File Formats</h2>
*
* <div>
*
* <ul>
* <li><a href="#Introduction">Introduction</a>
* <li><a href="#Definitions">Definitions</a>
* <ul>
* <li><a href="#Inverted_Indexing">Inverted Indexing</a>
* <li><a href="#Types_of_Fields">Types of Fields</a>
* <li><a href="#Segments">Segments</a>
* <li><a href="#Document_Numbers">Document Numbers</a>
* </ul>
* <li><a href="#Overview">Index Structure Overview</a>
* <li><a href="#File_Naming">File Naming</a>
* <li><a href="#file-names">Summary of File Extensions</a>
* <ul>
* <li><a href="#Lock_File">Lock File</a>
* <li><a href="#History">History</a>
* <li><a href="#Limitations">Limitations</a>
* </ul>
* </ul>
*
* </div> <a id="Introduction"></a>
*
* <h3>Introduction</h3>
*
* <div>
*
* <p>This document defines the index file formats used in this version of Lucene. If you are using
* a different version of Lucene, please consult the copy of <code>docs/</code> that was distributed
* with the version you are using.
*
* <p>This document attempts to provide a high-level definition of the Apache Lucene file formats.
* </div> <a id="Definitions"></a>
*
* <h3>Definitions</h3>
*
* <div>
*
* <p>The fundamental concepts in Lucene are index, document, field and term.
*
* <p>An index contains a sequence of documents.
*
* <ul>
* <li>A document is a sequence of fields.
* <li>A field is a named sequence of terms.
* <li>A term is a sequence of bytes.
* </ul>
*
* <p>The same sequence of bytes in two different fields is considered a different term. Thus terms
* are represented as a pair: the string naming the field, and the bytes within the field. <a
* id="Inverted_Indexing"></a>
*
* <h4>Inverted Indexing</h4>
*
* <p>Lucene's index stores terms and statistics about those terms in order to make term-based
* search more efficient. Lucene's terms index falls into the family of indexes known as an
* <i>inverted index.</i> This is because it can list, for a term, the documents that contain it.
* This is the inverse of the natural relationship, in which documents list terms. <a
* id="Types_of_Fields"></a>
*
* <h4>Types of Fields</h4>
*
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored in the index
* literally, in a non-inverted manner. Fields that are inverted are called <i>indexed</i>. A field
* may be both stored and indexed.
*
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the text of a field
* may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
* useful for certain identifier fields to be indexed literally.
*
* <p>See the {@link org.apache.lucene.document.Field Field} java docs for more information on
* Fields. <a id="Segments"></a>
*
* <h4>Segments</h4>
*
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>. Each segment is a
* fully independent index, which could be searched separately. Indexes evolve by:
*
* <ol>
* <li>Creating new segments for newly added documents.
* <li>Merging existing segments.
* </ol>
*
* <p>Searches may involve multiple segments and/or multiple indexes, each index potentially
* composed of a set of segments. <a id="Document_Numbers"></a>
*
* <h4>Document Numbers</h4>
*
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>. The first
* document added to an index is numbered zero, and each subsequent document added gets a number one
* greater than the previous.
*
* <p>Note that a document's number may change, so caution should be taken when storing these
* numbers outside of Lucene. In particular, numbers may change in the following situations:
*
* <ul>
* <li>
* <p>The numbers stored in each segment are unique only within the segment, and must be
* converted before they can be used in a larger context. The standard technique is to
* allocate each segment a range of values, based on the range of numbers used in that
* segment. To convert a document number from a segment to an external value, the segment's
* <i>base</i> document number is added. To convert an external value back to a
* segment-specific value, the segment is identified by the range that the external value is
* in, and the segment's base value is subtracted. For example two five document segments
* might be combined, so that the first segment has a base value of zero, and the second of
* five. Document three from the second segment would have an external value of eight.
* <li>
* <p>When documents are deleted, gaps are created in the numbering. These are eventually
* removed as the index evolves through merging. Deleted documents are dropped when segments
* are merged. A freshly-merged segment thus has no gaps in its numbering.
* </ul>
*
* </div> <a id="Overview"></a>
*
* <h3>Index Structure Overview</h3>
*
* <div>
*
* <p>Each segment index maintains the following:
*
* <ul>
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This
* contains metadata about a segment, such as the number of documents, what files it uses, and
* information about how the segment is sorted
* <li>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
* contains metadata about the set of named fields used in the index.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
* This contains, for each document, a list of attribute-value pairs, where the attributes are
* field names. These are used to store auxiliary information about the document, such as its
* title, url, or an identifier to access a database. The set of stored fields are what is
* returned for each hit when searching. This is keyed by document number.
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term dictionary}. A
* dictionary containing all of the terms used in all of the indexed fields of all of the
* documents. The dictionary also contains the number of documents which contain the term, and
* pointers to the term's frequency and proximity data.
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Frequency data}. For
* each term in the dictionary, the numbers of all the documents that contain that term, and
* the frequency of the term in that document, unless frequencies are omitted ({@link
* org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Proximity data}. For
* each term in the dictionary, the positions that the term occurs in each document. Note that
* this will not exist if all fields in all documents omit position data.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
* each field in each document, a value is stored that is multiplied into the score for hits
* on that field.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
* field in each document, the term vector (sometimes called document vector) may be stored. A
* term vector consists of term text and term frequency. To add Term Vectors to your index see
* the {@link org.apache.lucene.document.Field Field} constructors
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
* stored values, these are also keyed by document number, but are generally intended to be
* loaded into main memory for fast access. Whereas stored values are generally intended for
* summary results from searches, per-document values are useful for things like scoring
* factors.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
* optional file indicating which documents are live.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
* of files, recording dimensionally indexed fields, to enable fast numeric range filtering
* and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
* intersection (2D, 3D).
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The
* vector format stores numeric vectors in a format optimized for random access and
* computation, supporting high-dimensional nearest-neighbor search.
* </ul>
*
* <p>Details on each of these are provided in their linked pages. </div> <a id="File_Naming"></a>
*
* <h3>File Naming</h3>
*
* <div>
*
* <p>All files belonging to a segment have the same name with varying extensions. The extensions
* correspond to the different file formats described below. When using the Compound File format
* (default for small segments) these files (except for the Segment info file, the Lock file, and
* Deleted documents file) are collapsed into a single .cfs file (see below for details)
*
* <p>Typically, all segments in an index are stored in a single directory, although this is not
* required.
*
* <p>File names are never re-used. That is, when any file is saved to the Directory it is given a
* never before used filename. This is achieved using a simple generations approach. For example,
* the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
* integer represented in alpha-numeric (base 36) form. </div> <a id="file-names"></a>
*
* <h3>Summary of File Extensions</h3>
*
* <div>
*
* <p>The following table summarizes the names and extensions of the files in Lucene:
*
* <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
* <caption>lucene filenames by extension</caption>
* <tr>
* <th>Name</th>
* <th>Extension</th>
* <th>Brief Description</th>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
* <td>segments_N</td>
* <td>Stores information about a commit point</td>
* </tr>
* <tr>
* <td><a href="#Lock_File">Lock File</a></td>
* <td>write.lock</td>
* <td>The Write lock prevents multiple IndexWriters from writing to the same
* file.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info}</td>
* <td>.si</td>
* <td>Stores metadata about a segment</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}</td>
* <td>.cfs, .cfe</td>
* <td>An optional "virtual" file consisting of all the other index files for
* systems that frequently run out of file handles.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}</td>
* <td>.fnm</td>
* <td>Stores information about the fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}</td>
* <td>.fdx</td>
* <td>Contains pointers to field data</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}</td>
* <td>.fdt</td>
* <td>The stored fields for documents</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Dictionary}</td>
* <td>.tim</td>
* <td>The term dictionary, stores term info</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Index}</td>
* <td>.tip</td>
* <td>The index into the Term Dictionary</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Frequencies}</td>
* <td>.doc</td>
* <td>Contains the list of docs which contain each term along with frequency</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Positions}</td>
* <td>.pos</td>
* <td>Stores position information about where a term occurs in the index</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Payloads}</td>
* <td>.pay</td>
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}</td>
* <td>.nvd, .nvm</td>
* <td>Encodes length and boost factors for docs and fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}</td>
* <td>.dvd, .dvm</td>
* <td>Encodes additional scoring factors or other per-document information.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}</td>
* <td>.tvx</td>
* <td>Stores offset into the document data file</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}</td>
* <td>.tvd</td>
* <td>Contains term vector data.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}</td>
* <td>.liv</td>
* <td>Info about what documents are live</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}</td>
* <td>.kdd, .kdi, .kdm</td>
* <td>Holds indexed points</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}</td>
* <td>.vec, .vem, .veq, .vex</td>
* <td>Holds indexed vectors; <code>.vec</code> files contain the raw vector data,
* <code>.vem</code> the vector metadata, <code>.veq</code> the quantized vector data, and <code>.vex</code> the
* hnsw graph data.</td>
* </tr>
* </table>
*
* </div> <a id="Lock_File"></a>
*
* <h3>Lock File</h3>
*
* The write lock, which is stored in the index directory by default, is named "write.lock". If the
* lock directory is different from the index directory then the write lock will be named
* "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
* directory. When this file is present, a writer is currently modifying the index (adding or
* removing documents). This lock file ensures that only one writer is modifying the index at a
* time. <a id="History"></a>
*
* <h3>History</h3>
*
* <p>Compatibility notes are provided in this document, describing how file formats have changed
* from prior versions:
*
* <ul>
* <li>In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit
* lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
* or adding/deleting of docs. When the new segments file is saved (committed), it will be
* written in the new file format (meaning no specific "upgrade" process is needed). But note
* that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
* <li>In version 2.3, the file format was changed to allow segments to share a single set of doc
* store (vectors & stored fields) files. This allows for faster indexing in certain
* cases. The change is fully backwards compatible (in the same way as the lock-less commits
* change in 2.1).
* <li>In version 2.4, Strings are now written as true UTF-8 byte sequence, not Java's modified
* UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">LUCENE-510</a> for
* details.
* <li>In version 2.9, an optional opaque Map<String,String> CommitUserData may be passed to
* IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
* file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">LUCENE-1382</a> for
* details. Also, diagnostics were added to each segment written recording details about why
* it was written (due to flush, merge; which OS/JRE was used; etc.). See issue <a
* href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.
* <li>In version 3.0, compressed fields are no longer written to the index (they can still be
* read, but on merge the new segment will write them, uncompressed). See issue <a
* href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a> for details.
* <li>In version 3.1, segments records the code version that created them. See <a
* href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
* Additionally segments track explicitly whether or not they have term vectors. See <a
* href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a> for details.
* <li>In version 3.2, numeric fields are written natively to the stored fields file; previously
* they were stored in text format only.
* <li>In version 3.4, fields can omit position data while still indexing term frequencies.
* <li>In version 4.0, the format of the inverted index became extensible via the {@link
* org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
* was introduced. Normalization factors need no longer be a single byte, they can be any
* {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
* unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
* the postings lists. Payloads can be stored in the term vectors.
* <li>In version 4.1, the format of the postings list changed to use either of FOR compression or
* variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
* were changed to inline directly into the term dictionary. Stored fields are compressed by
* default.
* <li>In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
* type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
* allow updating NumericDocValues fields.
* <li>In version 4.8, checksum footers were added to the end of each index file for improved data
* integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
* checksum of the file.
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
* suitable for faceting/sorting/analytics.
* <li>In version 5.4, DocValues have been improved to store more information on disk: addresses
* for binary fields and ord indexes for multi-valued fields.
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
* <li>In version 6.2, new Segment info format that reads/writes the index sort, to support index
* sorting.
* <li>In version 7.0, DocValues have been improved to better support sparse doc values thanks to
* an iterator API.
* <li>In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
* freq, normalization factor) pairs that may trigger the maximum score of the block. This
* information is recorded alongside skip data in order to be able to skip blocks of doc ids
* if they may not produce high enough scores. Additionally doc values and norms have been
* extended with jump-tables to make access O(1) instead of O(n), where n is the number of
* elements to skip when advancing in the data.
* <li>In version 8.4, postings, positions, offsets and payload lengths have moved to a more
* performant encoding that is vectorized.
* <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
* user-defined sorts to be used
* <li>In version 8.6, points fields split the index tree and leaf data into separate files, to
* allow for different access patterns to the different data structures
* <li>In version 8.7, stored fields compression became adaptive to better handle documents with
* smaller stored fields.
* <li>In version 9.0, vector-valued fields were added.
* <li>In version 9.1, vector-valued fields were modified to add a graph hierarchy.
* <li>In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by
* IndexDISI. ordToDoc mappings were added to .vem.
* <li>In version 9.5, HNSW graph connections were changed to be delta-encoded with vints.
* Additionally, metadata file size improvements were made by delta-encoding nodes by graph
* layer and not writing the node ids for the zeroth layer.
* <li>In version 9.9, Vector scalar quantization support was added, allowing the HNSW vector
* format to utilize int8 quantized vectors for float32 vector search.
* <li>In version 9.12, skip data was refactored to have only two levels: every 128 docs and every
* 4,096 docs, and to be inlined in postings lists. This resulted in a speedup for queries that
* need skipping, especially conjunctions.
* </ul>
*
* <a id="Limitations"></a>
*
* <h3>Limitations</h3>
*
* <div>
*
* <p>Lucene uses a Java <code>int</code> to refer to document numbers, and the index file format
* uses an <code>Int32</code> on-disk to store document numbers. This is a limitation of both the
* index file format and the current implementation. Eventually these should be replaced with either
* <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
* VInt} values which have no limit. </div>
*/
/** Lucene 9.12 file format. */
package org.apache.lucene.backward_codecs.lucene912;
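The segment-base arithmetic described in the deleted javadoc (two five-document segments with bases zero and five; document three of the second segment has external value eight) reduces to two one-liners. A sketch with hypothetical names:

// Hypothetical helpers illustrating the base-offset conversion from the javadoc above.
static int toExternal(int segmentDoc, int segmentBase) {
  return segmentBase + segmentDoc; // e.g. 5 + 3 = 8
}

static int toSegmentLocal(int externalDoc, int segmentBase) {
  return externalDoc - segmentBase; // e.g. 8 - 5 = 3
}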
@ -17,6 +17,7 @@
package org.apache.lucene.backward_codecs.lucene99;

import java.util.Objects;
import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
@ -37,7 +38,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
@ -24,3 +24,4 @@ org.apache.lucene.backward_codecs.lucene94.Lucene94Codec
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec
org.apache.lucene.backward_codecs.lucene912.Lucene912Codec
org.apache.lucene.backward_codecs.lucene100.Lucene100Codec
@ -17,3 +17,4 @@ org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat
org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat
org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat
org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat
@ -14,25 +14,25 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;

import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.DOC_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.LEVEL1_MASK;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.META_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.PAY_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.POS_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.VERSION_CURRENT;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.DOC_CODEC;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.LEVEL1_MASK;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.META_CODEC;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.PAY_CODEC;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.POS_CODEC;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.VERSION_CURRENT;

import java.io.IOException;
import java.util.Collection;
import java.util.List;
import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.PushPostingsWriterBase;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Impact;
@ -0,0 +1,69 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene912;

import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;

/** Read-write impersonation of {@link Lucene912PostingsFormat}. */
public final class Lucene912RWPostingsFormat extends Lucene912PostingsFormat {

  private final int minTermBlockSize;
  private final int maxTermBlockSize;

  /** Creates {@code Lucene912PostingsFormat} with default settings. */
  public Lucene912RWPostingsFormat() {
    this(
        Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
        Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
  }

  /**
   * Creates {@code Lucene912PostingsFormat} with custom values for {@code minBlockSize} and {@code
   * maxBlockSize} passed to block terms dictionary.
   *
   * @see
   *     Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
   */
  public Lucene912RWPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
    super();
    Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
    this.minTermBlockSize = minTermBlockSize;
    this.maxTermBlockSize = maxTermBlockSize;
  }

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
    boolean success = false;
    try {
      FieldsConsumer ret =
          new Lucene90BlockTreeTermsWriter(
              state, postingsWriter, minTermBlockSize, maxTermBlockSize);
      success = true;
      return ret;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(postingsWriter);
      }
    }
  }
}
@ -14,12 +14,11 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;

import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@ -65,13 +64,11 @@ public class TestForDeltaUtil extends LuceneTestCase {
{
  // decode
  IndexInput in = d.openInput("test.bin", IOContext.READONCE);
  PostingDecodingUtil pdu =
      Lucene912PostingsReader.VECTORIZATION_PROVIDER.newPostingDecodingUtil(in);
  ForDeltaUtil forDeltaUtil = new ForDeltaUtil();
  for (int i = 0; i < iterations; ++i) {
    long base = 0;
    final long[] restored = new long[ForUtil.BLOCK_SIZE];
    forDeltaUtil.decodeAndPrefixSum(pdu, base, restored);
    forDeltaUtil.decodeAndPrefixSum(in, base, restored);
    final long[] expected = new long[ForUtil.BLOCK_SIZE];
    for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
      expected[j] = values[i * ForUtil.BLOCK_SIZE + j];
@ -0,0 +1,94 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene912;

import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.packed.PackedInts;

public class TestForUtil extends LuceneTestCase {

  public void testEncodeDecode() throws IOException {
    final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000);
    final int[] values = new int[iterations * ForUtil.BLOCK_SIZE];

    for (int i = 0; i < iterations; ++i) {
      final int bpv = TestUtil.nextInt(random(), 1, 31);
      for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
        values[i * ForUtil.BLOCK_SIZE + j] =
            RandomNumbers.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv));
      }
    }

    final Directory d = new ByteBuffersDirectory();
    final long endPointer;

    {
      // encode
      IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT);
      final ForUtil forUtil = new ForUtil();

      for (int i = 0; i < iterations; ++i) {
        long[] source = new long[ForUtil.BLOCK_SIZE];
        long or = 0;
        for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
          source[j] = values[i * ForUtil.BLOCK_SIZE + j];
          or |= source[j];
        }
        final int bpv = PackedInts.bitsRequired(or);
        out.writeByte((byte) bpv);
        forUtil.encode(source, bpv, out);
      }
      endPointer = out.getFilePointer();
      out.close();
    }

    {
      // decode
      IndexInput in = d.openInput("test.bin", IOContext.READONCE);
      ForUtil forUtil = new ForUtil();
      for (int i = 0; i < iterations; ++i) {
        final int bitsPerValue = in.readByte();
        final long currentFilePointer = in.getFilePointer();
        final long[] restored = new long[ForUtil.BLOCK_SIZE];
        forUtil.decode(bitsPerValue, in, restored);
        int[] ints = new int[ForUtil.BLOCK_SIZE];
        for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
          ints[j] = Math.toIntExact(restored[j]);
        }
        assertArrayEquals(
            Arrays.toString(ints),
            ArrayUtil.copyOfSubArray(values, i * ForUtil.BLOCK_SIZE, (i + 1) * ForUtil.BLOCK_SIZE),
            ints);
        assertEquals(ForUtil.numBytes(bitsPerValue), in.getFilePointer() - currentFilePointer);
      }
      assertEquals(endPointer, in.getFilePointer());
      in.close();
    }

    d.close();
  }
}
@ -14,17 +14,17 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsReader.MutableImpactList;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
import org.apache.lucene.codecs.lucene90.blocktree.Stats;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader.MutableImpactList;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
@ -45,7 +45,7 @@ public class TestLucene912PostingsFormat extends BasePostingsFormatTestCase {

@Override
protected Codec getCodec() {
  return TestUtil.alwaysPostingsFormat(new Lucene912PostingsFormat());
  return TestUtil.alwaysPostingsFormat(new Lucene912RWPostingsFormat());
}

public void testVInt15() throws IOException {
@ -0,0 +1,104 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene912;

import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.packed.PackedInts;

public class TestPForUtil extends LuceneTestCase {

  public void testEncodeDecode() throws IOException {
    final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000);
    final int[] values = createTestData(iterations, 31);

    final Directory d = new ByteBuffersDirectory();
    final long endPointer = encodeTestData(iterations, values, d);

    IndexInput in = d.openInput("test.bin", IOContext.READONCE);
    final PForUtil pforUtil = new PForUtil();
    for (int i = 0; i < iterations; ++i) {
      if (random().nextInt(5) == 0) {
        PForUtil.skip(in);
        continue;
      }
      final long[] restored = new long[ForUtil.BLOCK_SIZE];
      pforUtil.decode(in, restored);
      int[] ints = new int[ForUtil.BLOCK_SIZE];
      for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
        ints[j] = Math.toIntExact(restored[j]);
      }
      assertArrayEquals(
          Arrays.toString(ints),
          ArrayUtil.copyOfSubArray(values, i * ForUtil.BLOCK_SIZE, (i + 1) * ForUtil.BLOCK_SIZE),
          ints);
    }
    assertEquals(endPointer, in.getFilePointer());
    in.close();

    d.close();
  }

  private int[] createTestData(int iterations, int maxBpv) {
    final int[] values = new int[iterations * ForUtil.BLOCK_SIZE];

    for (int i = 0; i < iterations; ++i) {
      final int bpv = TestUtil.nextInt(random(), 0, maxBpv);
      for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
        values[i * ForUtil.BLOCK_SIZE + j] =
            RandomNumbers.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv));
        if (random().nextInt(100) == 0) {
          final int exceptionBpv;
          if (random().nextInt(10) == 0) {
            exceptionBpv = Math.min(bpv + TestUtil.nextInt(random(), 9, 16), maxBpv);
          } else {
            exceptionBpv = Math.min(bpv + TestUtil.nextInt(random(), 1, 8), maxBpv);
          }
          values[i * ForUtil.BLOCK_SIZE + j] |= random().nextInt(1 << (exceptionBpv - bpv)) << bpv;
        }
      }
    }

    return values;
  }

  private long encodeTestData(int iterations, int[] values, Directory d) throws IOException {
    IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT);
    final PForUtil pforUtil = new PForUtil();

    for (int i = 0; i < iterations; ++i) {
      long[] source = new long[ForUtil.BLOCK_SIZE];
      for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
        source[j] = values[i * ForUtil.BLOCK_SIZE + j];
      }
      pforUtil.encode(source, out);
    }
    final long endPointer = out.getFilePointer();
    out.close();

    return endPointer;
  }
}
@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;

import java.io.IOException;
import org.apache.lucene.store.Directory;
@ -19,12 +19,13 @@ package org.apache.lucene.backward_codecs.lucene99;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene101.Lucene101Codec;
import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase;

public class TestLucene99HnswScalarQuantizedVectorsFormat extends BaseKnnVectorsFormatTestCase {
  @Override
  protected Codec getCodec() {
    return new Lucene99Codec() {
    return new Lucene101Codec() {
      @Override
      public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
        return new Lucene99RWHnswScalarQuantizationVectorsFormat();
@ -20,10 +20,10 @@ import static org.apache.lucene.backward_index.TestBasicBackwardsCompatibility.a

import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import java.io.IOException;
import org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.lucene101.Lucene101Codec;
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader;
@ -69,7 +69,7 @@ public class TestInt7HnswBackwardsCompatibility extends BackwardsCompatibilityTe
}

protected Codec getCodec() {
  return new Lucene99Codec() {
  return new Lucene101Codec() {
    @Override
    public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
      return new Lucene99HnswScalarQuantizedVectorsFormat(
@@ -50,9 +50,9 @@ import org.openjdk.jmh.annotations.Warmup;
})
public class AdvanceBenchmark {

-  private final long[] values = new long[129];
+  private final int[] values = new int[129];
  private final int[] startIndexes = new int[1_000];
-  private final long[] targets = new long[startIndexes.length];
+  private final int[] targets = new int[startIndexes.length];

  @Setup(Level.Trial)
  public void setup() throws Exception {
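The long[]-to-int[] switch in these benchmark fields tracks the core motivation: at a fixed SIMD register width, int lanes are twice as plentiful as long lanes. A small hypothetical demo (assumes the incubating Panama Vector API; run with --add-modules jdk.incubator.vector):

    import jdk.incubator.vector.IntVector;
    import jdk.incubator.vector.LongVector;

    public class LaneCountDemo {
      public static void main(String[] args) {
        // 256-bit registers: 8 int lanes vs. 4 long lanes.
        System.out.println(IntVector.SPECIES_256.length());  // 8
        System.out.println(LongVector.SPECIES_256.length()); // 4
      }
    }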
@@ -75,7 +75,7 @@ public class AdvanceBenchmark {
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
-  private static int binarySearch(long[] values, long target, int startIndex) {
+  private static int binarySearch(int[] values, int target, int startIndex) {
    // Standard binary search
    int i = Arrays.binarySearch(values, startIndex, values.length, target);
    if (i < 0) {

@@ -92,7 +92,7 @@ public class AdvanceBenchmark {
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
-  private static int inlinedBranchlessBinarySearch(long[] values, long target) {
+  private static int inlinedBranchlessBinarySearch(int[] values, int target) {
    // This compiles to cmov instructions.
    int start = 0;
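The hunk above cuts off before the body of inlinedBranchlessBinarySearch; as a rough sketch of the technique its comment names (conditional moves instead of branches), a branch-free search over a sorted 128-entry prefix could look like this — an illustration under assumptions, not the commit's actual body:

    // Sketch only: returns the first index in the sorted 128-entry prefix whose
    // value is >= target; the conditional add can compile to a cmov.
    static int branchlessSearch(int[] values, int target) {
      int start = 0;
      for (int halfWindow = 64; halfWindow >= 1; halfWindow >>>= 1) {
        start += values[start + halfWindow - 1] < target ? halfWindow : 0;
      }
      return start;
    }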
@@ -129,7 +129,7 @@ public class AdvanceBenchmark {
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
-  private static int linearSearch(long[] values, long target, int startIndex) {
+  private static int linearSearch(int[] values, long target, int startIndex) {
    // Naive linear search.
    for (int i = startIndex; i < values.length; ++i) {
      if (values[i] >= target) {

@@ -147,7 +147,7 @@ public class AdvanceBenchmark {
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
-  private static int vectorUtilSearch(long[] values, long target, int startIndex) {
+  private static int vectorUtilSearch(int[] values, int target, int startIndex) {
    return VectorUtil.findNextGEQ(values, 128, target, startIndex);
  }

@@ -159,7 +159,7 @@ public class AdvanceBenchmark {

  public static void main(String[] args) {
    // For testing purposes
-    long[] values = new long[129];
+    int[] values = new int[129];
    for (int i = 0; i < 128; ++i) {
      values[i] = i;
    }
@@ -21,9 +21,9 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Random;
import java.util.concurrent.TimeUnit;
-import org.apache.lucene.codecs.lucene912.ForDeltaUtil;
-import org.apache.lucene.codecs.lucene912.ForUtil;
-import org.apache.lucene.codecs.lucene912.PostingIndexInput;
+import org.apache.lucene.codecs.lucene101.ForDeltaUtil;
+import org.apache.lucene.codecs.lucene101.ForUtil;
+import org.apache.lucene.codecs.lucene101.PostingIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;

@@ -61,7 +61,7 @@ public class PostingIndexInputBenchmark {
  private PostingIndexInput postingIn;
  private final ForUtil forUtil = new ForUtil();
  private final ForDeltaUtil forDeltaUtil = new ForDeltaUtil();
-  private final long[] values = new long[128];
+  private final int[] values = new int[ForUtil.BLOCK_SIZE];

  @Param({"2", "3", "4", "5", "6", "7", "8", "9", "10"})
  public int bpv;
@@ -22,14 +22,14 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
+import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
-import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
-import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;

-/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene912PostingsWriter}. */
+/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene101PostingsWriter}. */
public class BlockTreeOrdsPostingsFormat extends PostingsFormat {

  private final int minTermBlockSize;

@@ -67,7 +67,7 @@ public class BlockTreeOrdsPostingsFormat extends PostingsFormat {

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
+    PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state);

    boolean success = false;
    try {

@@ -84,7 +84,7 @@ public class BlockTreeOrdsPostingsFormat extends PostingsFormat {

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-    PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
+    PostingsReaderBase postingsReader = new Lucene101PostingsReader(state);
    boolean success = false;
    try {
      FieldsProducer ret = new OrdsBlockTreeTermsReader(postingsReader, state);
@@ -24,7 +24,7 @@ import java.util.TreeMap;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
+import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;

@@ -54,7 +54,7 @@ import org.apache.lucene.util.automaton.TransitionAccessor;
// - or: longer dense skip lists than just next byte?

/**
- * Wraps {@link Lucene912PostingsFormat} format for on-disk storage, but then at read time loads and
+ * Wraps {@link Lucene101PostingsFormat} format for on-disk storage, but then at read time loads and
 * stores all terms and postings directly in RAM as byte[], int[].
 *
 * <p><b>WARNING</b>: This is exceptionally RAM intensive: it makes no effort to compress the

@@ -97,12 +97,12 @@ public final class DirectPostingsFormat extends PostingsFormat {

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    return PostingsFormat.forName("Lucene912").fieldsConsumer(state);
+    return PostingsFormat.forName("Lucene101").fieldsConsumer(state);
  }

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-    FieldsProducer postings = PostingsFormat.forName("Lucene912").fieldsProducer(state);
+    FieldsProducer postings = PostingsFormat.forName("Lucene101").fieldsProducer(state);
    if (state.context.context() != IOContext.Context.MERGE) {
      FieldsProducer loadedPostings;
      try {
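DirectPostingsFormat resolves its on-disk delegate through Lucene's SPI registry rather than instantiating it directly, which is why only the name string changes here. A brief usage sketch of that lookup (standard PostingsFormat API):

    import org.apache.lucene.codecs.PostingsFormat;

    public class ForNameDemo {
      public static void main(String[] args) {
        // Same SPI lookup DirectPostingsFormat performs above.
        PostingsFormat pf = PostingsFormat.forName("Lucene101");
        System.out.println(pf.getName()); // Lucene101
      }
    }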
@@ -22,8 +22,8 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
-import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
+import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
+import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;

@@ -41,7 +41,7 @@ public final class FSTPostingsFormat extends PostingsFormat {

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
+    PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state);

    boolean success = false;
    try {

@@ -57,7 +57,7 @@ public final class FSTPostingsFormat extends PostingsFormat {

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-    PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
+    PostingsReaderBase postingsReader = new Lucene101PostingsReader(state);
    boolean success = false;
    try {
      FieldsProducer ret = new FSTTermsReader(state, postingsReader);
@@ -17,13 +17,13 @@

package org.apache.lucene.codecs.uniformsplit;

-import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.BLOCK_SIZE;
+import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.BLOCK_SIZE;

import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
-import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
-import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
-import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
+import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState;
+import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
+import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.TermState;

@@ -34,7 +34,7 @@ import org.apache.lucene.util.RamUsageEstimator;

/**
 * {@link TermState} serializer which encodes each file pointer as a delta relative to a base file
- * pointer. It differs from {@link Lucene912PostingsWriter#encodeTerm} which encodes each file
+ * pointer. It differs from {@link Lucene101PostingsWriter#encodeTerm} which encodes each file
 * pointer as a delta relative to the previous file pointer.
 *
 * <p>It automatically sets the base file pointer to the first valid file pointer for doc start FP,

@@ -95,7 +95,7 @@ public class DeltaBaseTermStateSerializer implements Accountable {
  /**
   * Writes a {@link BlockTermState} to the provided {@link DataOutput}.
   *
-   * <p>Simpler variant of {@link Lucene912PostingsWriter#encodeTerm(DataOutput, FieldInfo,
+   * <p>Simpler variant of {@link Lucene101PostingsWriter#encodeTerm(DataOutput, FieldInfo,
   * BlockTermState, boolean)}.
   */
  public void writeTermState(

@@ -145,7 +145,7 @@ public class DeltaBaseTermStateSerializer implements Accountable {
  /**
   * Reads a {@link BlockTermState} from the provided {@link DataInput}.
   *
-   * <p>Simpler variant of {@link Lucene912PostingsReader#decodeTerm(DataInput, FieldInfo,
+   * <p>Simpler variant of {@link Lucene101PostingsReader#decodeTerm(DataInput, FieldInfo,
   * BlockTermState, boolean)}.
   *
   * @param reuse {@link BlockTermState} to reuse; or null to create a new one.
@@ -23,8 +23,8 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
-import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
+import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
+import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;

@@ -113,7 +113,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat {

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
+    PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state);
    boolean success = false;
    try {
      FieldsConsumer termsWriter =

@@ -130,7 +130,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat {

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-    PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
+    PostingsReaderBase postingsReader = new Lucene101PostingsReader(state);
    boolean success = false;
    try {
      FieldsProducer termsReader =
@@ -28,7 +28,7 @@
 * org.apache.lucene.search.PhraseQuery})
 * <li>Quite efficient for {@link org.apache.lucene.search.PrefixQuery}
 * <li>Not efficient for spell-check and {@link org.apache.lucene.search.FuzzyQuery}, in this case
- *     prefer {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat}
+ *     prefer {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat}
 * </ul>
 */
package org.apache.lucene.codecs.uniformsplit;
@@ -22,7 +22,7 @@ import java.io.IOException;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
-import org.apache.lucene.codecs.lucene100.Lucene100Codec;
+import org.apache.lucene.codecs.lucene101.Lucene101Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.KnnByteVectorField;

@@ -42,7 +42,7 @@ import org.apache.lucene.tests.index.BaseIndexFileFormatTestCase;
public class TestHnswBitVectorsFormat extends BaseIndexFileFormatTestCase {
  @Override
  protected Codec getCodec() {
-    return new Lucene100Codec() {
+    return new Lucene101Codec() {
      @Override
      public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
        return new HnswBitVectorsFormat();
@@ -17,7 +17,7 @@

package org.apache.lucene.codecs.lucene90.tests;

-import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
+import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState;

/** Test utility class to create mock {@link IntBlockTermState}. */
public class MockTermStateFactory {
@@ -1,4 +1,4 @@
{
-  "lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java": "b662da5848b0decc8bceb4225f433875ae9e3c11",
-  "lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForDeltaUtil.py": "01787b97bbe79edb7703498cef8ddb85901a6b1e"
+  "lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java": "0ff7fb9159693055d9e4b9468b004166156f6550",
+  "lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py": "8c55b7aaced028388408c5eb968b1f1197e11142"
}

@@ -1,4 +1,4 @@
{
-  "lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForUtil.java": "02e0c8c290e65d0314664fde24c9331bdec44925",
-  "lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForUtil.py": "d7850f37e52a16c6592322950d0f6219cad23a33"
+  "lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForUtil.java": "10ceb79f031232bc1e4564db7e3ebb16eedd2e0a",
+  "lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForUtil.py": "d69e734bce30375952046a3776bbb7a5c1edbd51"
}
@@ -15,8 +15,6 @@
 * limitations under the License.
 */

-import org.apache.lucene.codecs.lucene100.Lucene100Codec;
-
/** Lucene Core. */
@SuppressWarnings("module") // the test framework is compiled after the core...
module org.apache.lucene.core {

@@ -33,8 +31,7 @@ module org.apache.lucene.core {
  exports org.apache.lucene.codecs.lucene94;
  exports org.apache.lucene.codecs.lucene95;
  exports org.apache.lucene.codecs.lucene99;
-  exports org.apache.lucene.codecs.lucene912;
-  exports org.apache.lucene.codecs.lucene100;
+  exports org.apache.lucene.codecs.lucene101;
  exports org.apache.lucene.codecs.perfield;
  exports org.apache.lucene.codecs;
  exports org.apache.lucene.document;

@@ -73,7 +70,7 @@ module org.apache.lucene.core {
  provides org.apache.lucene.analysis.TokenizerFactory with
      org.apache.lucene.analysis.standard.StandardTokenizerFactory;
  provides org.apache.lucene.codecs.Codec with
-      Lucene100Codec;
+      org.apache.lucene.codecs.lucene101.Lucene101Codec;
  provides org.apache.lucene.codecs.DocValuesFormat with
      org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
  provides org.apache.lucene.codecs.KnnVectorsFormat with

@@ -81,7 +78,7 @@ module org.apache.lucene.core {
      org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat,
      org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat;
  provides org.apache.lucene.codecs.PostingsFormat with
-      org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
+      org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
  provides org.apache.lucene.index.SortFieldProvider with
      org.apache.lucene.search.SortField.Provider,
      org.apache.lucene.search.SortedNumericSortField.Provider,
@@ -55,7 +55,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {
      return LOADER;
    }

-    static Codec defaultCodec = LOADER.lookup("Lucene100");
+    static Codec defaultCodec = LOADER.lookup("Lucene101");
  }

  private final String name;
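The defaultCodec field above is what IndexWriter falls back to when no codec is configured explicitly. A hedged usage sketch (Codec.getDefault() is the standard accessor; verify against your Lucene version):

    import org.apache.lucene.codecs.Codec;

    public class DefaultCodecDemo {
      public static void main(String[] args) {
        // Resolved through the NamedSPILoader lookup shown above.
        System.out.println(Codec.getDefault().getName()); // Lucene101 after this change
      }
    }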
@@ -0,0 +1,525 @@
// This file has been automatically generated, DO NOT EDIT

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene101;

import static org.apache.lucene.codecs.lucene101.ForUtil.*;

import java.io.IOException;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.packed.PackedInts;

/**
 * Inspired from https://fulmicoton.com/posts/bitpacking/ Encodes multiple integers in a Java int to
 * get SIMD-like speedups. If bitsPerValue <= 4 then we pack 4 ints per Java int else if
 * bitsPerValue <= 11 we pack 2 ints per Java int else we use scalar operations.
 */
public final class ForDeltaUtil {

  private static final int HALF_BLOCK_SIZE = BLOCK_SIZE / 2;
  private static final int ONE_BLOCK_SIZE_FOURTH = BLOCK_SIZE / 4;
  private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2;
  private static final int THREE_BLOCK_SIZE_FOURTHS = 3 * BLOCK_SIZE / 4;

  // IDENTITY_PLUS_ONE[i] == i + 1
  private static final int[] IDENTITY_PLUS_ONE = new int[ForUtil.BLOCK_SIZE];

  static {
    for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
      IDENTITY_PLUS_ONE[i] = i + 1;
    }
  }

  private static void prefixSumOfOnes(int[] arr, int base) {
    System.arraycopy(IDENTITY_PLUS_ONE, 0, arr, 0, ForUtil.BLOCK_SIZE);
    // This loop gets auto-vectorized
    for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
      arr[i] += base;
    }
  }

  private static void prefixSum8(int[] arr, int base) {
    // When the number of bits per value is 4 or less, we can sum up all values in a block without
    // risking overflowing an 8-bit integer. This allows computing the prefix sum by summing up 4
    // values at once.
    innerPrefixSum8(arr);
    expand8(arr);
    final int l0 = base;
    final int l1 = l0 + arr[ONE_BLOCK_SIZE_FOURTH - 1];
    final int l2 = l1 + arr[TWO_BLOCK_SIZE_FOURTHS - 1];
    final int l3 = l2 + arr[THREE_BLOCK_SIZE_FOURTHS - 1];

    for (int i = 0; i < ONE_BLOCK_SIZE_FOURTH; ++i) {
      arr[i] += l0;
      arr[ONE_BLOCK_SIZE_FOURTH + i] += l1;
      arr[TWO_BLOCK_SIZE_FOURTHS + i] += l2;
      arr[THREE_BLOCK_SIZE_FOURTHS + i] += l3;
    }
  }

  private static void prefixSum16(int[] arr, int base) {
    // When the number of bits per value is 11 or less, we can sum up all values in a block without
    // risking overflowing a 16-bit integer. This allows computing the prefix sum by summing up 2
    // values at once.
    innerPrefixSum16(arr);
    expand16(arr);
    final int l0 = base;
    final int l1 = base + arr[HALF_BLOCK_SIZE - 1];
    for (int i = 0; i < HALF_BLOCK_SIZE; ++i) {
      arr[i] += l0;
      arr[HALF_BLOCK_SIZE + i] += l1;
    }
  }

  private static void prefixSum32(int[] arr, int base) {
    arr[0] += base;
    for (int i = 1; i < BLOCK_SIZE; ++i) {
      arr[i] += arr[i - 1];
    }
  }

  // For some reason unrolling seems to help
  private static void innerPrefixSum8(int[] arr) {
    arr[1] += arr[0];
    arr[2] += arr[1];
    arr[3] += arr[2];
    arr[4] += arr[3];
    arr[5] += arr[4];
    arr[6] += arr[5];
    arr[7] += arr[6];
    arr[8] += arr[7];
    arr[9] += arr[8];
    arr[10] += arr[9];
    arr[11] += arr[10];
    arr[12] += arr[11];
    arr[13] += arr[12];
    arr[14] += arr[13];
    arr[15] += arr[14];
    arr[16] += arr[15];
    arr[17] += arr[16];
    arr[18] += arr[17];
    arr[19] += arr[18];
    arr[20] += arr[19];
    arr[21] += arr[20];
    arr[22] += arr[21];
    arr[23] += arr[22];
    arr[24] += arr[23];
    arr[25] += arr[24];
    arr[26] += arr[25];
    arr[27] += arr[26];
    arr[28] += arr[27];
    arr[29] += arr[28];
    arr[30] += arr[29];
    arr[31] += arr[30];
  }

  // For some reason unrolling seems to help
  private static void innerPrefixSum16(int[] arr) {
    arr[1] += arr[0];
    arr[2] += arr[1];
    arr[3] += arr[2];
    arr[4] += arr[3];
    arr[5] += arr[4];
    arr[6] += arr[5];
    arr[7] += arr[6];
    arr[8] += arr[7];
    arr[9] += arr[8];
    arr[10] += arr[9];
    arr[11] += arr[10];
    arr[12] += arr[11];
    arr[13] += arr[12];
    arr[14] += arr[13];
    arr[15] += arr[14];
    arr[16] += arr[15];
    arr[17] += arr[16];
    arr[18] += arr[17];
    arr[19] += arr[18];
    arr[20] += arr[19];
    arr[21] += arr[20];
    arr[22] += arr[21];
    arr[23] += arr[22];
    arr[24] += arr[23];
    arr[25] += arr[24];
    arr[26] += arr[25];
    arr[27] += arr[26];
    arr[28] += arr[27];
    arr[29] += arr[28];
    arr[30] += arr[29];
    arr[31] += arr[30];
    arr[32] += arr[31];
    arr[33] += arr[32];
    arr[34] += arr[33];
    arr[35] += arr[34];
    arr[36] += arr[35];
    arr[37] += arr[36];
    arr[38] += arr[37];
    arr[39] += arr[38];
    arr[40] += arr[39];
    arr[41] += arr[40];
    arr[42] += arr[41];
    arr[43] += arr[42];
    arr[44] += arr[43];
    arr[45] += arr[44];
    arr[46] += arr[45];
    arr[47] += arr[46];
    arr[48] += arr[47];
    arr[49] += arr[48];
    arr[50] += arr[49];
    arr[51] += arr[50];
    arr[52] += arr[51];
    arr[53] += arr[52];
    arr[54] += arr[53];
    arr[55] += arr[54];
    arr[56] += arr[55];
    arr[57] += arr[56];
    arr[58] += arr[57];
    arr[59] += arr[58];
    arr[60] += arr[59];
    arr[61] += arr[60];
    arr[62] += arr[61];
    arr[63] += arr[62];
  }

  private final int[] tmp = new int[BLOCK_SIZE];

  /**
   * Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code
   * ints} are expected to be deltas between consecutive values.
   */
  void encodeDeltas(int[] ints, DataOutput out) throws IOException {
    if (ints[0] == 1 && PForUtil.allEqual(ints)) { // happens with very dense postings
      out.writeByte((byte) 0);
    } else {
      int or = 0;
      for (int l : ints) {
        or |= l;
      }
      assert or != 0;
      final int bitsPerValue = PackedInts.bitsRequired(or);
      out.writeByte((byte) bitsPerValue);

      final int primitiveSize;
      if (bitsPerValue <= 3) {
        primitiveSize = 8;
        collapse8(ints);
      } else if (bitsPerValue <= 10) {
        primitiveSize = 16;
        collapse16(ints);
      } else {
        primitiveSize = 32;
      }
      encode(ints, bitsPerValue, primitiveSize, out, tmp);
    }
  }

  /** Decode deltas, compute the prefix sum and add {@code base} to all decoded ints. */
  void decodeAndPrefixSum(PostingDecodingUtil pdu, int base, int[] ints) throws IOException {
    final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte());
    if (bitsPerValue == 0) {
      prefixSumOfOnes(ints, base);
    } else {
      decodeAndPrefixSum(bitsPerValue, pdu, base, ints);
    }
  }

  /** Delta-decode 128 integers into {@code ints}. */
  void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, int base, int[] ints)
      throws IOException {
    switch (bitsPerValue) {
      case 1:
        decode1(pdu, ints);
        prefixSum8(ints, base);
        break;
      case 2:
        decode2(pdu, ints);
        prefixSum8(ints, base);
        break;
      case 3:
        decode3(pdu, tmp, ints);
        prefixSum8(ints, base);
        break;
      case 4:
        decode4To16(pdu, ints);
        prefixSum16(ints, base);
        break;
      case 5:
        decode5To16(pdu, tmp, ints);
        prefixSum16(ints, base);
        break;
      case 6:
        decode6To16(pdu, tmp, ints);
        prefixSum16(ints, base);
        break;
      case 7:
        decode7To16(pdu, tmp, ints);
        prefixSum16(ints, base);
        break;
      case 8:
        decode8To16(pdu, ints);
        prefixSum16(ints, base);
        break;
      case 9:
        decode9(pdu, tmp, ints);
        prefixSum16(ints, base);
        break;
      case 10:
        decode10(pdu, tmp, ints);
        prefixSum16(ints, base);
        break;
      case 11:
        decode11To32(pdu, tmp, ints);
        prefixSum32(ints, base);
        break;
      case 12:
        decode12To32(pdu, tmp, ints);
        prefixSum32(ints, base);
        break;
      case 13:
        decode13To32(pdu, tmp, ints);
        prefixSum32(ints, base);
        break;
      case 14:
        decode14To32(pdu, tmp, ints);
        prefixSum32(ints, base);
        break;
      case 15:
        decode15To32(pdu, tmp, ints);
        prefixSum32(ints, base);
        break;
      case 16:
        decode16To32(pdu, ints);
        prefixSum32(ints, base);
        break;
      case 17:
        decode17(pdu, tmp, ints);
        prefixSum32(ints, base);
        break;
      case 18:
        decode18(pdu, tmp, ints);
        prefixSum32(ints, base);
        break;
      case 19:
        decode19(pdu, tmp, ints);
        prefixSum32(ints, base);
        break;
      case 20:
        decode20(pdu, tmp, ints);
        prefixSum32(ints, base);
        break;
      case 21:
        decode21(pdu, tmp, ints);
        prefixSum32(ints, base);
        break;
      case 22:
        decode22(pdu, tmp, ints);
        prefixSum32(ints, base);
        break;
      case 23:
        decode23(pdu, tmp, ints);
        prefixSum32(ints, base);
        break;
      case 24:
        decode24(pdu, tmp, ints);
        prefixSum32(ints, base);
        break;
      default:
        decodeSlow(bitsPerValue, pdu, tmp, ints);
        prefixSum32(ints, base);
        break;
    }
  }

  private static void decode4To16(PostingDecodingUtil pdu, int[] ints) throws IOException {
    pdu.splitInts(16, ints, 12, 4, MASK16_4, ints, 48, MASK16_4);
  }

  private static void decode5To16(PostingDecodingUtil pdu, int[] tmp, int[] ints)
      throws IOException {
    pdu.splitInts(20, ints, 11, 5, MASK16_5, tmp, 0, MASK16_1);
    for (int iter = 0, tmpIdx = 0, intsIdx = 60; iter < 4; ++iter, tmpIdx += 5, intsIdx += 1) {
      int l0 = tmp[tmpIdx + 0] << 4;
      l0 |= tmp[tmpIdx + 1] << 3;
      l0 |= tmp[tmpIdx + 2] << 2;
      l0 |= tmp[tmpIdx + 3] << 1;
      l0 |= tmp[tmpIdx + 4] << 0;
      ints[intsIdx + 0] = l0;
    }
  }

  private static void decode6To16(PostingDecodingUtil pdu, int[] tmp, int[] ints)
      throws IOException {
    pdu.splitInts(24, ints, 10, 6, MASK16_6, tmp, 0, MASK16_4);
    for (int iter = 0, tmpIdx = 0, intsIdx = 48; iter < 8; ++iter, tmpIdx += 3, intsIdx += 2) {
      int l0 = tmp[tmpIdx + 0] << 2;
      l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_2;
      ints[intsIdx + 0] = l0;
      int l1 = (tmp[tmpIdx + 1] & MASK16_2) << 4;
      l1 |= tmp[tmpIdx + 2] << 0;
      ints[intsIdx + 1] = l1;
    }
  }

  private static void decode7To16(PostingDecodingUtil pdu, int[] tmp, int[] ints)
      throws IOException {
    pdu.splitInts(28, ints, 9, 7, MASK16_7, tmp, 0, MASK16_2);
    for (int iter = 0, tmpIdx = 0, intsIdx = 56; iter < 4; ++iter, tmpIdx += 7, intsIdx += 2) {
      int l0 = tmp[tmpIdx + 0] << 5;
      l0 |= tmp[tmpIdx + 1] << 3;
      l0 |= tmp[tmpIdx + 2] << 1;
      l0 |= (tmp[tmpIdx + 3] >>> 1) & MASK16_1;
      ints[intsIdx + 0] = l0;
      int l1 = (tmp[tmpIdx + 3] & MASK16_1) << 6;
      l1 |= tmp[tmpIdx + 4] << 4;
      l1 |= tmp[tmpIdx + 5] << 2;
      l1 |= tmp[tmpIdx + 6] << 0;
      ints[intsIdx + 1] = l1;
    }
  }

  private static void decode8To16(PostingDecodingUtil pdu, int[] ints) throws IOException {
    pdu.splitInts(32, ints, 8, 8, MASK16_8, ints, 32, MASK16_8);
  }

  private static void decode11To32(PostingDecodingUtil pdu, int[] tmp, int[] ints)
      throws IOException {
    pdu.splitInts(44, ints, 21, 11, MASK32_11, tmp, 0, MASK32_10);
    for (int iter = 0, tmpIdx = 0, intsIdx = 88; iter < 4; ++iter, tmpIdx += 11, intsIdx += 10) {
      int l0 = tmp[tmpIdx + 0] << 1;
      l0 |= (tmp[tmpIdx + 1] >>> 9) & MASK32_1;
      ints[intsIdx + 0] = l0;
      int l1 = (tmp[tmpIdx + 1] & MASK32_9) << 2;
      l1 |= (tmp[tmpIdx + 2] >>> 8) & MASK32_2;
      ints[intsIdx + 1] = l1;
      int l2 = (tmp[tmpIdx + 2] & MASK32_8) << 3;
      l2 |= (tmp[tmpIdx + 3] >>> 7) & MASK32_3;
      ints[intsIdx + 2] = l2;
      int l3 = (tmp[tmpIdx + 3] & MASK32_7) << 4;
      l3 |= (tmp[tmpIdx + 4] >>> 6) & MASK32_4;
      ints[intsIdx + 3] = l3;
      int l4 = (tmp[tmpIdx + 4] & MASK32_6) << 5;
      l4 |= (tmp[tmpIdx + 5] >>> 5) & MASK32_5;
      ints[intsIdx + 4] = l4;
      int l5 = (tmp[tmpIdx + 5] & MASK32_5) << 6;
      l5 |= (tmp[tmpIdx + 6] >>> 4) & MASK32_6;
      ints[intsIdx + 5] = l5;
      int l6 = (tmp[tmpIdx + 6] & MASK32_4) << 7;
      l6 |= (tmp[tmpIdx + 7] >>> 3) & MASK32_7;
      ints[intsIdx + 6] = l6;
      int l7 = (tmp[tmpIdx + 7] & MASK32_3) << 8;
      l7 |= (tmp[tmpIdx + 8] >>> 2) & MASK32_8;
      ints[intsIdx + 7] = l7;
      int l8 = (tmp[tmpIdx + 8] & MASK32_2) << 9;
      l8 |= (tmp[tmpIdx + 9] >>> 1) & MASK32_9;
      ints[intsIdx + 8] = l8;
      int l9 = (tmp[tmpIdx + 9] & MASK32_1) << 10;
      l9 |= tmp[tmpIdx + 10] << 0;
      ints[intsIdx + 9] = l9;
    }
  }

  private static void decode12To32(PostingDecodingUtil pdu, int[] tmp, int[] ints)
      throws IOException {
    pdu.splitInts(48, ints, 20, 12, MASK32_12, tmp, 0, MASK32_8);
    for (int iter = 0, tmpIdx = 0, intsIdx = 96; iter < 16; ++iter, tmpIdx += 3, intsIdx += 2) {
      int l0 = tmp[tmpIdx + 0] << 4;
      l0 |= (tmp[tmpIdx + 1] >>> 4) & MASK32_4;
      ints[intsIdx + 0] = l0;
      int l1 = (tmp[tmpIdx + 1] & MASK32_4) << 8;
      l1 |= tmp[tmpIdx + 2] << 0;
      ints[intsIdx + 1] = l1;
    }
  }

  private static void decode13To32(PostingDecodingUtil pdu, int[] tmp, int[] ints)
      throws IOException {
    pdu.splitInts(52, ints, 19, 13, MASK32_13, tmp, 0, MASK32_6);
    for (int iter = 0, tmpIdx = 0, intsIdx = 104; iter < 4; ++iter, tmpIdx += 13, intsIdx += 6) {
      int l0 = tmp[tmpIdx + 0] << 7;
      l0 |= tmp[tmpIdx + 1] << 1;
      l0 |= (tmp[tmpIdx + 2] >>> 5) & MASK32_1;
      ints[intsIdx + 0] = l0;
      int l1 = (tmp[tmpIdx + 2] & MASK32_5) << 8;
      l1 |= tmp[tmpIdx + 3] << 2;
      l1 |= (tmp[tmpIdx + 4] >>> 4) & MASK32_2;
      ints[intsIdx + 1] = l1;
      int l2 = (tmp[tmpIdx + 4] & MASK32_4) << 9;
      l2 |= tmp[tmpIdx + 5] << 3;
      l2 |= (tmp[tmpIdx + 6] >>> 3) & MASK32_3;
      ints[intsIdx + 2] = l2;
      int l3 = (tmp[tmpIdx + 6] & MASK32_3) << 10;
      l3 |= tmp[tmpIdx + 7] << 4;
      l3 |= (tmp[tmpIdx + 8] >>> 2) & MASK32_4;
      ints[intsIdx + 3] = l3;
      int l4 = (tmp[tmpIdx + 8] & MASK32_2) << 11;
      l4 |= tmp[tmpIdx + 9] << 5;
      l4 |= (tmp[tmpIdx + 10] >>> 1) & MASK32_5;
      ints[intsIdx + 4] = l4;
      int l5 = (tmp[tmpIdx + 10] & MASK32_1) << 12;
      l5 |= tmp[tmpIdx + 11] << 6;
      l5 |= tmp[tmpIdx + 12] << 0;
      ints[intsIdx + 5] = l5;
    }
  }

  private static void decode14To32(PostingDecodingUtil pdu, int[] tmp, int[] ints)
      throws IOException {
    pdu.splitInts(56, ints, 18, 14, MASK32_14, tmp, 0, MASK32_4);
    for (int iter = 0, tmpIdx = 0, intsIdx = 112; iter < 8; ++iter, tmpIdx += 7, intsIdx += 2) {
      int l0 = tmp[tmpIdx + 0] << 10;
      l0 |= tmp[tmpIdx + 1] << 6;
      l0 |= tmp[tmpIdx + 2] << 2;
      l0 |= (tmp[tmpIdx + 3] >>> 2) & MASK32_2;
      ints[intsIdx + 0] = l0;
      int l1 = (tmp[tmpIdx + 3] & MASK32_2) << 12;
      l1 |= tmp[tmpIdx + 4] << 8;
      l1 |= tmp[tmpIdx + 5] << 4;
      l1 |= tmp[tmpIdx + 6] << 0;
      ints[intsIdx + 1] = l1;
    }
  }

  private static void decode15To32(PostingDecodingUtil pdu, int[] tmp, int[] ints)
      throws IOException {
    pdu.splitInts(60, ints, 17, 15, MASK32_15, tmp, 0, MASK32_2);
    for (int iter = 0, tmpIdx = 0, intsIdx = 120; iter < 4; ++iter, tmpIdx += 15, intsIdx += 2) {
      int l0 = tmp[tmpIdx + 0] << 13;
      l0 |= tmp[tmpIdx + 1] << 11;
      l0 |= tmp[tmpIdx + 2] << 9;
      l0 |= tmp[tmpIdx + 3] << 7;
      l0 |= tmp[tmpIdx + 4] << 5;
      l0 |= tmp[tmpIdx + 5] << 3;
      l0 |= tmp[tmpIdx + 6] << 1;
      l0 |= (tmp[tmpIdx + 7] >>> 1) & MASK32_1;
      ints[intsIdx + 0] = l0;
      int l1 = (tmp[tmpIdx + 7] & MASK32_1) << 14;
      l1 |= tmp[tmpIdx + 8] << 12;
      l1 |= tmp[tmpIdx + 9] << 10;
      l1 |= tmp[tmpIdx + 10] << 8;
      l1 |= tmp[tmpIdx + 11] << 6;
      l1 |= tmp[tmpIdx + 12] << 4;
      l1 |= tmp[tmpIdx + 13] << 2;
      l1 |= tmp[tmpIdx + 14] << 0;
      ints[intsIdx + 1] = l1;
    }
  }

  private static void decode16To32(PostingDecodingUtil pdu, int[] ints) throws IOException {
    pdu.splitInts(64, ints, 16, 16, MASK32_16, ints, 64, MASK32_16);
  }
}
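The prefix sums in ForDeltaUtil work on packed lanes: innerPrefixSum8 accumulates 32 packed ints at once, and after expand8 each quarter of the block holds an independent prefix sum that only needs its running base added. The same structure written out in plain Java, without the packing (illustrative only, not Lucene code):

    // Illustrative equivalent of prefixSum8's two phases, minus the lane packing.
    static void prefixSumByQuarters(int[] arr, int base) {
      // Phase 1: independent prefix sum inside each 32-value quarter.
      for (int q = 0; q < 4; ++q) {
        for (int i = q * 32 + 1; i < q * 32 + 32; ++i) {
          arr[i] += arr[i - 1];
        }
      }
      // Phase 2: add the running base of all preceding quarters (l0..l3 above).
      final int l0 = base;
      final int l1 = l0 + arr[31];
      final int l2 = l1 + arr[63];
      final int l3 = l2 + arr[95];
      for (int i = 0; i < 32; ++i) {
        arr[i] += l0;
        arr[32 + i] += l1;
        arr[64 + i] += l2;
        arr[96 + i] += l3;
      }
    }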
@ -0,0 +1,841 @@
|
|||
// This file has been automatically generated, DO NOT EDIT
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene101;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
|
||||
/**
|
||||
* Inspired from https://fulmicoton.com/posts/bitpacking/ Encodes multiple integers in one to get
|
||||
* SIMD-like speedups. If bitsPerValue <= 8 then we pack 4 ints per Java int else if bitsPerValue
|
||||
* <= 16 we pack 2 ints per Java int else we do scalar operations.
|
||||
*/
|
||||
public final class ForUtil {
|
||||
|
||||
public static final int BLOCK_SIZE = 128;
|
||||
static final int BLOCK_SIZE_LOG2 = 7;
|
||||
|
||||
static int expandMask16(int mask16) {
|
||||
return mask16 | (mask16 << 16);
|
||||
}
|
||||
|
||||
static int expandMask8(int mask8) {
|
||||
return expandMask16(mask8 | (mask8 << 8));
|
||||
}
|
||||
|
||||
static int mask32(int bitsPerValue) {
|
||||
return (1 << bitsPerValue) - 1;
|
||||
}
|
||||
|
||||
static int mask16(int bitsPerValue) {
|
||||
return expandMask16((1 << bitsPerValue) - 1);
|
||||
}
|
||||
|
||||
static int mask8(int bitsPerValue) {
|
||||
return expandMask8((1 << bitsPerValue) - 1);
|
||||
}
|
||||
|
||||
static void expand8(int[] arr) {
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
int l = arr[i];
|
||||
arr[i] = (l >>> 24) & 0xFF;
|
||||
arr[32 + i] = (l >>> 16) & 0xFF;
|
||||
arr[64 + i] = (l >>> 8) & 0xFF;
|
||||
arr[96 + i] = l & 0xFF;
|
||||
}
|
||||
}
|
||||
|
||||
static void collapse8(int[] arr) {
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
arr[i] = (arr[i] << 24) | (arr[32 + i] << 16) | (arr[64 + i] << 8) | arr[96 + i];
|
||||
}
|
||||
}
|
||||
|
||||
static void expand16(int[] arr) {
|
||||
for (int i = 0; i < 64; ++i) {
|
||||
int l = arr[i];
|
||||
arr[i] = (l >>> 16) & 0xFFFF;
|
||||
arr[64 + i] = l & 0xFFFF;
|
||||
}
|
||||
}
|
||||
|
||||
static void collapse16(int[] arr) {
|
||||
for (int i = 0; i < 64; ++i) {
|
||||
arr[i] = (arr[i] << 16) | arr[64 + i];
|
||||
}
|
||||
}
|
||||
|
||||
private final int[] tmp = new int[BLOCK_SIZE];
|
||||
|
||||
/** Encode 128 integers from {@code ints} into {@code out}. */
|
||||
void encode(int[] ints, int bitsPerValue, DataOutput out) throws IOException {
|
||||
final int nextPrimitive;
|
||||
if (bitsPerValue <= 8) {
|
||||
nextPrimitive = 8;
|
||||
collapse8(ints);
|
||||
} else if (bitsPerValue <= 16) {
|
||||
nextPrimitive = 16;
|
||||
collapse16(ints);
|
||||
} else {
|
||||
nextPrimitive = 32;
|
||||
}
|
||||
encode(ints, bitsPerValue, nextPrimitive, out, tmp);
|
||||
}
|
||||
|
||||
static void encode(int[] ints, int bitsPerValue, int primitiveSize, DataOutput out, int[] tmp)
|
||||
throws IOException {
|
||||
final int numInts = BLOCK_SIZE * primitiveSize / Integer.SIZE;
|
||||
|
||||
final int numIntsPerShift = bitsPerValue * 4;
|
||||
int idx = 0;
|
||||
int shift = primitiveSize - bitsPerValue;
|
||||
for (int i = 0; i < numIntsPerShift; ++i) {
|
||||
tmp[i] = ints[idx++] << shift;
|
||||
}
|
||||
for (shift = shift - bitsPerValue; shift >= 0; shift -= bitsPerValue) {
|
||||
for (int i = 0; i < numIntsPerShift; ++i) {
|
||||
tmp[i] |= ints[idx++] << shift;
|
||||
}
|
||||
}
|
||||
|
||||
final int remainingBitsPerInt = shift + bitsPerValue;
|
||||
final int maskRemainingBitsPerInt;
|
||||
if (primitiveSize == 8) {
|
||||
maskRemainingBitsPerInt = MASKS8[remainingBitsPerInt];
|
||||
} else if (primitiveSize == 16) {
|
||||
maskRemainingBitsPerInt = MASKS16[remainingBitsPerInt];
|
||||
} else {
|
||||
maskRemainingBitsPerInt = MASKS32[remainingBitsPerInt];
|
||||
}
|
||||
|
||||
int tmpIdx = 0;
|
||||
int remainingBitsPerValue = bitsPerValue;
|
||||
while (idx < numInts) {
|
||||
if (remainingBitsPerValue >= remainingBitsPerInt) {
|
||||
remainingBitsPerValue -= remainingBitsPerInt;
|
||||
tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & maskRemainingBitsPerInt;
|
||||
if (remainingBitsPerValue == 0) {
|
||||
idx++;
|
||||
remainingBitsPerValue = bitsPerValue;
|
||||
}
|
||||
} else {
|
||||
final int mask1, mask2;
|
||||
if (primitiveSize == 8) {
|
||||
mask1 = MASKS8[remainingBitsPerValue];
|
||||
mask2 = MASKS8[remainingBitsPerInt - remainingBitsPerValue];
|
||||
} else if (primitiveSize == 16) {
|
||||
mask1 = MASKS16[remainingBitsPerValue];
|
||||
mask2 = MASKS16[remainingBitsPerInt - remainingBitsPerValue];
|
||||
} else {
|
||||
mask1 = MASKS32[remainingBitsPerValue];
|
||||
mask2 = MASKS32[remainingBitsPerInt - remainingBitsPerValue];
|
||||
}
|
||||
tmp[tmpIdx] |= (ints[idx++] & mask1) << (remainingBitsPerInt - remainingBitsPerValue);
|
||||
remainingBitsPerValue = bitsPerValue - remainingBitsPerInt + remainingBitsPerValue;
|
||||
tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & mask2;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < numIntsPerShift; ++i) {
|
||||
out.writeInt(tmp[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/** Number of bytes required to encode 128 integers of {@code bitsPerValue} bits per value. */
|
||||
static int numBytes(int bitsPerValue) {
|
||||
return bitsPerValue << (BLOCK_SIZE_LOG2 - 3);
|
||||
}
|
||||
|
||||
static void decodeSlow(int bitsPerValue, PostingDecodingUtil pdu, int[] tmp, int[] ints)
|
||||
throws IOException {
|
||||
final int numInts = bitsPerValue << 2;
|
||||
final int mask = MASKS32[bitsPerValue];
|
||||
pdu.splitInts(numInts, ints, 32 - bitsPerValue, 32, mask, tmp, 0, -1);
|
||||
final int remainingBitsPerInt = 32 - bitsPerValue;
|
||||
final int mask32RemainingBitsPerInt = MASKS32[remainingBitsPerInt];
|
||||
int tmpIdx = 0;
|
||||
int remainingBits = remainingBitsPerInt;
|
||||
for (int intsIdx = numInts; intsIdx < BLOCK_SIZE; ++intsIdx) {
|
||||
int b = bitsPerValue - remainingBits;
|
||||
int l = (tmp[tmpIdx++] & MASKS32[remainingBits]) << b;
|
||||
while (b >= remainingBitsPerInt) {
|
||||
b -= remainingBitsPerInt;
|
||||
l |= (tmp[tmpIdx++] & mask32RemainingBitsPerInt) << b;
|
||||
}
|
||||
if (b > 0) {
|
||||
l |= (tmp[tmpIdx] >>> (remainingBitsPerInt - b)) & MASKS32[b];
|
||||
remainingBits = remainingBitsPerInt - b;
|
||||
} else {
|
||||
remainingBits = remainingBitsPerInt;
|
||||
}
|
||||
ints[intsIdx] = l;
|
||||
}
|
||||
}
|
||||
|
||||
static final int[] MASKS8 = new int[8];
|
||||
static final int[] MASKS16 = new int[16];
|
||||
static final int[] MASKS32 = new int[32];
|
||||
|
||||
static {
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
MASKS8[i] = mask8(i);
|
||||
}
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
MASKS16[i] = mask16(i);
|
||||
}
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
MASKS32[i] = mask32(i);
|
||||
}
|
||||
}
|
||||
|
||||
// mark values in array as final ints to avoid the cost of reading array, arrays should only be
|
||||
// used when the idx is a variable
|
||||
static final int MASK8_1 = MASKS8[1];
|
||||
static final int MASK8_2 = MASKS8[2];
|
||||
static final int MASK8_3 = MASKS8[3];
|
||||
static final int MASK8_4 = MASKS8[4];
|
||||
static final int MASK8_5 = MASKS8[5];
|
||||
static final int MASK8_6 = MASKS8[6];
|
||||
static final int MASK8_7 = MASKS8[7];
|
||||
static final int MASK16_1 = MASKS16[1];
|
||||
static final int MASK16_2 = MASKS16[2];
|
||||
static final int MASK16_3 = MASKS16[3];
|
||||
static final int MASK16_4 = MASKS16[4];
|
||||
static final int MASK16_5 = MASKS16[5];
|
||||
static final int MASK16_6 = MASKS16[6];
|
||||
static final int MASK16_7 = MASKS16[7];
|
||||
static final int MASK16_8 = MASKS16[8];
|
||||
static final int MASK16_9 = MASKS16[9];
|
||||
static final int MASK16_10 = MASKS16[10];
|
||||
static final int MASK16_11 = MASKS16[11];
|
||||
static final int MASK16_12 = MASKS16[12];
|
||||
static final int MASK16_13 = MASKS16[13];
|
||||
static final int MASK16_14 = MASKS16[14];
|
||||
static final int MASK16_15 = MASKS16[15];
|
||||
static final int MASK32_1 = MASKS32[1];
|
||||
static final int MASK32_2 = MASKS32[2];
|
||||
static final int MASK32_3 = MASKS32[3];
|
||||
static final int MASK32_4 = MASKS32[4];
|
||||
static final int MASK32_5 = MASKS32[5];
|
||||
static final int MASK32_6 = MASKS32[6];
|
||||
static final int MASK32_7 = MASKS32[7];
|
||||
static final int MASK32_8 = MASKS32[8];
|
||||
static final int MASK32_9 = MASKS32[9];
|
||||
static final int MASK32_10 = MASKS32[10];
|
||||
static final int MASK32_11 = MASKS32[11];
|
||||
static final int MASK32_12 = MASKS32[12];
|
||||
static final int MASK32_13 = MASKS32[13];
|
||||
static final int MASK32_14 = MASKS32[14];
|
||||
static final int MASK32_15 = MASKS32[15];
|
||||
static final int MASK32_16 = MASKS32[16];
|
||||
static final int MASK32_17 = MASKS32[17];
|
||||
static final int MASK32_18 = MASKS32[18];
|
||||
static final int MASK32_19 = MASKS32[19];
|
||||
static final int MASK32_20 = MASKS32[20];
|
||||
static final int MASK32_21 = MASKS32[21];
|
||||
static final int MASK32_22 = MASKS32[22];
|
||||
static final int MASK32_23 = MASKS32[23];
|
||||
static final int MASK32_24 = MASKS32[24];
|
||||
|
||||
/** Decode 128 integers into {@code ints}. */
|
||||
void decode(int bitsPerValue, PostingDecodingUtil pdu, int[] ints) throws IOException {
|
||||
switch (bitsPerValue) {
|
||||
case 1:
|
||||
decode1(pdu, ints);
|
||||
expand8(ints);
|
||||
break;
|
||||
case 2:
|
||||
decode2(pdu, ints);
|
||||
expand8(ints);
|
||||
break;
|
||||
case 3:
|
||||
decode3(pdu, tmp, ints);
|
||||
expand8(ints);
|
||||
break;
|
||||
case 4:
|
||||
decode4(pdu, ints);
|
||||
expand8(ints);
|
||||
break;
|
||||
case 5:
|
||||
decode5(pdu, tmp, ints);
|
||||
expand8(ints);
|
||||
break;
|
||||
case 6:
|
||||
decode6(pdu, tmp, ints);
|
||||
expand8(ints);
|
||||
break;
|
||||
case 7:
|
||||
decode7(pdu, tmp, ints);
|
||||
expand8(ints);
|
||||
break;
|
||||
case 8:
|
||||
decode8(pdu, ints);
|
||||
expand8(ints);
|
||||
break;
|
||||
case 9:
|
||||
decode9(pdu, tmp, ints);
|
||||
expand16(ints);
|
||||
break;
|
||||
case 10:
|
||||
decode10(pdu, tmp, ints);
|
||||
expand16(ints);
|
||||
break;
|
||||
case 11:
|
||||
decode11(pdu, tmp, ints);
|
||||
expand16(ints);
|
||||
break;
|
||||
case 12:
|
||||
decode12(pdu, tmp, ints);
|
||||
expand16(ints);
|
||||
break;
|
||||
case 13:
|
||||
decode13(pdu, tmp, ints);
|
||||
expand16(ints);
|
||||
break;
|
||||
case 14:
|
||||
decode14(pdu, tmp, ints);
|
||||
expand16(ints);
|
||||
break;
|
||||
case 15:
|
||||
decode15(pdu, tmp, ints);
|
||||
expand16(ints);
|
||||
break;
|
||||
case 16:
|
||||
decode16(pdu, ints);
|
||||
expand16(ints);
|
||||
break;
|
||||
case 17:
|
||||
decode17(pdu, tmp, ints);
|
||||
break;
|
||||
case 18:
|
||||
decode18(pdu, tmp, ints);
|
||||
break;
|
||||
case 19:
|
||||
decode19(pdu, tmp, ints);
|
||||
break;
|
||||
case 20:
|
||||
decode20(pdu, tmp, ints);
|
||||
break;
|
||||
case 21:
|
||||
decode21(pdu, tmp, ints);
|
||||
break;
|
||||
case 22:
|
||||
decode22(pdu, tmp, ints);
|
||||
break;
|
||||
case 23:
|
||||
decode23(pdu, tmp, ints);
|
||||
break;
|
||||
case 24:
|
||||
decode24(pdu, tmp, ints);
|
||||
break;
|
||||
default:
|
||||
decodeSlow(bitsPerValue, pdu, tmp, ints);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode1(PostingDecodingUtil pdu, int[] ints) throws IOException {
|
||||
pdu.splitInts(4, ints, 7, 1, MASK8_1, ints, 28, MASK8_1);
|
||||
}
|
||||
|
||||
static void decode2(PostingDecodingUtil pdu, int[] ints) throws IOException {
|
||||
pdu.splitInts(8, ints, 6, 2, MASK8_2, ints, 24, MASK8_2);
|
||||
}
|
||||
|
||||
static void decode3(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(12, ints, 5, 3, MASK8_3, tmp, 0, MASK8_2);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 24; iter < 4; ++iter, tmpIdx += 3, intsIdx += 2) {
|
||||
int l0 = tmp[tmpIdx + 0] << 1;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_1;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 1] & MASK8_1) << 2;
|
||||
l1 |= tmp[tmpIdx + 2] << 0;
|
||||
ints[intsIdx + 1] = l1;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode4(PostingDecodingUtil pdu, int[] ints) throws IOException {
|
||||
pdu.splitInts(16, ints, 4, 4, MASK8_4, ints, 16, MASK8_4);
|
||||
}
|
||||
|
||||
static void decode5(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(20, ints, 3, 5, MASK8_5, tmp, 0, MASK8_3);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 20; iter < 4; ++iter, tmpIdx += 5, intsIdx += 3) {
|
||||
int l0 = tmp[tmpIdx + 0] << 2;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_2;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 1] & MASK8_1) << 4;
|
||||
l1 |= tmp[tmpIdx + 2] << 1;
|
||||
l1 |= (tmp[tmpIdx + 3] >>> 2) & MASK8_1;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 3] & MASK8_2) << 3;
|
||||
l2 |= tmp[tmpIdx + 4] << 0;
|
||||
ints[intsIdx + 2] = l2;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode6(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(24, ints, 2, 6, MASK8_6, tmp, 0, MASK8_2);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 24; iter < 8; ++iter, tmpIdx += 3, intsIdx += 1) {
|
||||
int l0 = tmp[tmpIdx + 0] << 4;
|
||||
l0 |= tmp[tmpIdx + 1] << 2;
|
||||
l0 |= tmp[tmpIdx + 2] << 0;
|
||||
ints[intsIdx + 0] = l0;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode7(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(28, ints, 1, 7, MASK8_7, tmp, 0, MASK8_1);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 28; iter < 4; ++iter, tmpIdx += 7, intsIdx += 1) {
|
||||
int l0 = tmp[tmpIdx + 0] << 6;
|
||||
l0 |= tmp[tmpIdx + 1] << 5;
|
||||
l0 |= tmp[tmpIdx + 2] << 4;
|
||||
l0 |= tmp[tmpIdx + 3] << 3;
|
||||
l0 |= tmp[tmpIdx + 4] << 2;
|
||||
l0 |= tmp[tmpIdx + 5] << 1;
|
||||
l0 |= tmp[tmpIdx + 6] << 0;
|
||||
ints[intsIdx + 0] = l0;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode8(PostingDecodingUtil pdu, int[] ints) throws IOException {
|
||||
pdu.in.readInts(ints, 0, 32);
|
||||
}
|
||||
|
||||
static void decode9(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(36, ints, 7, 9, MASK16_9, tmp, 0, MASK16_7);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 36; iter < 4; ++iter, tmpIdx += 9, intsIdx += 7) {
|
||||
int l0 = tmp[tmpIdx + 0] << 2;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 5) & MASK16_2;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 1] & MASK16_5) << 4;
|
||||
l1 |= (tmp[tmpIdx + 2] >>> 3) & MASK16_4;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 2] & MASK16_3) << 6;
|
||||
l2 |= (tmp[tmpIdx + 3] >>> 1) & MASK16_6;
|
||||
ints[intsIdx + 2] = l2;
|
||||
int l3 = (tmp[tmpIdx + 3] & MASK16_1) << 8;
|
||||
l3 |= tmp[tmpIdx + 4] << 1;
|
||||
l3 |= (tmp[tmpIdx + 5] >>> 6) & MASK16_1;
|
||||
ints[intsIdx + 3] = l3;
|
||||
int l4 = (tmp[tmpIdx + 5] & MASK16_6) << 3;
|
||||
l4 |= (tmp[tmpIdx + 6] >>> 4) & MASK16_3;
|
||||
ints[intsIdx + 4] = l4;
|
||||
int l5 = (tmp[tmpIdx + 6] & MASK16_4) << 5;
|
||||
l5 |= (tmp[tmpIdx + 7] >>> 2) & MASK16_5;
|
||||
ints[intsIdx + 5] = l5;
|
||||
int l6 = (tmp[tmpIdx + 7] & MASK16_2) << 7;
|
||||
l6 |= tmp[tmpIdx + 8] << 0;
|
||||
ints[intsIdx + 6] = l6;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode10(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(40, ints, 6, 10, MASK16_10, tmp, 0, MASK16_6);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 40; iter < 8; ++iter, tmpIdx += 5, intsIdx += 3) {
|
||||
int l0 = tmp[tmpIdx + 0] << 4;
|
||||
l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_4;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 1] & MASK16_2) << 8;
|
||||
l1 |= tmp[tmpIdx + 2] << 2;
|
||||
l1 |= (tmp[tmpIdx + 3] >>> 4) & MASK16_2;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 3] & MASK16_4) << 6;
|
||||
l2 |= tmp[tmpIdx + 4] << 0;
|
||||
ints[intsIdx + 2] = l2;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode11(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(44, ints, 5, 11, MASK16_11, tmp, 0, MASK16_5);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 44; iter < 4; ++iter, tmpIdx += 11, intsIdx += 5) {
|
||||
int l0 = tmp[tmpIdx + 0] << 6;
|
||||
l0 |= tmp[tmpIdx + 1] << 1;
|
||||
l0 |= (tmp[tmpIdx + 2] >>> 4) & MASK16_1;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 2] & MASK16_4) << 7;
|
||||
l1 |= tmp[tmpIdx + 3] << 2;
|
||||
l1 |= (tmp[tmpIdx + 4] >>> 3) & MASK16_2;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 4] & MASK16_3) << 8;
|
||||
l2 |= tmp[tmpIdx + 5] << 3;
|
||||
l2 |= (tmp[tmpIdx + 6] >>> 2) & MASK16_3;
|
||||
ints[intsIdx + 2] = l2;
|
||||
int l3 = (tmp[tmpIdx + 6] & MASK16_2) << 9;
|
||||
l3 |= tmp[tmpIdx + 7] << 4;
|
||||
l3 |= (tmp[tmpIdx + 8] >>> 1) & MASK16_4;
|
||||
ints[intsIdx + 3] = l3;
|
||||
int l4 = (tmp[tmpIdx + 8] & MASK16_1) << 10;
|
||||
l4 |= tmp[tmpIdx + 9] << 5;
|
||||
l4 |= tmp[tmpIdx + 10] << 0;
|
||||
ints[intsIdx + 4] = l4;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode12(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(48, ints, 4, 12, MASK16_12, tmp, 0, MASK16_4);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 48; iter < 16; ++iter, tmpIdx += 3, intsIdx += 1) {
|
||||
int l0 = tmp[tmpIdx + 0] << 8;
|
||||
l0 |= tmp[tmpIdx + 1] << 4;
|
||||
l0 |= tmp[tmpIdx + 2] << 0;
|
||||
ints[intsIdx + 0] = l0;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode13(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(52, ints, 3, 13, MASK16_13, tmp, 0, MASK16_3);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 52; iter < 4; ++iter, tmpIdx += 13, intsIdx += 3) {
|
||||
int l0 = tmp[tmpIdx + 0] << 10;
|
||||
l0 |= tmp[tmpIdx + 1] << 7;
|
||||
l0 |= tmp[tmpIdx + 2] << 4;
|
||||
l0 |= tmp[tmpIdx + 3] << 1;
|
||||
l0 |= (tmp[tmpIdx + 4] >>> 2) & MASK16_1;
|
||||
ints[intsIdx + 0] = l0;
|
||||
int l1 = (tmp[tmpIdx + 4] & MASK16_2) << 11;
|
||||
l1 |= tmp[tmpIdx + 5] << 8;
|
||||
l1 |= tmp[tmpIdx + 6] << 5;
|
||||
l1 |= tmp[tmpIdx + 7] << 2;
|
||||
l1 |= (tmp[tmpIdx + 8] >>> 1) & MASK16_2;
|
||||
ints[intsIdx + 1] = l1;
|
||||
int l2 = (tmp[tmpIdx + 8] & MASK16_1) << 12;
|
||||
l2 |= tmp[tmpIdx + 9] << 9;
|
||||
l2 |= tmp[tmpIdx + 10] << 6;
|
||||
l2 |= tmp[tmpIdx + 11] << 3;
|
||||
l2 |= tmp[tmpIdx + 12] << 0;
|
||||
ints[intsIdx + 2] = l2;
|
||||
}
|
||||
}
|
||||
|
||||
static void decode14(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
|
||||
pdu.splitInts(56, ints, 2, 14, MASK16_14, tmp, 0, MASK16_2);
|
||||
for (int iter = 0, tmpIdx = 0, intsIdx = 56; iter < 8; ++iter, tmpIdx += 7, intsIdx += 1) {
|
||||
int l0 = tmp[tmpIdx + 0] << 12;
|
||||
l0 |= tmp[tmpIdx + 1] << 10;
|
||||
l0 |= tmp[tmpIdx + 2] << 8;
|
||||
l0 |= tmp[tmpIdx + 3] << 6;
|
||||
l0 |= tmp[tmpIdx + 4] << 4;
|
||||
l0 |= tmp[tmpIdx + 5] << 2;
|
||||
l0 |= tmp[tmpIdx + 6] << 0;
|
||||
ints[intsIdx + 0] = l0;
|
||||
}
|
||||
}
|
||||
|
||||
  static void decode15(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
    pdu.splitInts(60, ints, 1, 15, MASK16_15, tmp, 0, MASK16_1);
    for (int iter = 0, tmpIdx = 0, intsIdx = 60; iter < 4; ++iter, tmpIdx += 15, intsIdx += 1) {
      int l0 = tmp[tmpIdx + 0] << 14;
      l0 |= tmp[tmpIdx + 1] << 13;
      l0 |= tmp[tmpIdx + 2] << 12;
      l0 |= tmp[tmpIdx + 3] << 11;
      l0 |= tmp[tmpIdx + 4] << 10;
      l0 |= tmp[tmpIdx + 5] << 9;
      l0 |= tmp[tmpIdx + 6] << 8;
      l0 |= tmp[tmpIdx + 7] << 7;
      l0 |= tmp[tmpIdx + 8] << 6;
      l0 |= tmp[tmpIdx + 9] << 5;
      l0 |= tmp[tmpIdx + 10] << 4;
      l0 |= tmp[tmpIdx + 11] << 3;
      l0 |= tmp[tmpIdx + 12] << 2;
      l0 |= tmp[tmpIdx + 13] << 1;
      l0 |= tmp[tmpIdx + 14] << 0;
      ints[intsIdx + 0] = l0;
    }
  }

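  // bpv=16 is the natural lane width here: each 32-bit integer holds exactly two
  // 16-bit values, so the block is read directly with no shift/mask work.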
  static void decode16(PostingDecodingUtil pdu, int[] ints) throws IOException {
    pdu.in.readInts(ints, 0, 64);
  }

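  // From bpv=17 on, a value no longer fits in a 16-bit lane, so the decoders below
  // switch from the lane-replicated MASK16_* constants to MASK32_* and hold one value
  // per 32-bit integer.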
  static void decode17(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
    pdu.splitInts(68, ints, 15, 17, MASK32_17, tmp, 0, MASK32_15);
    for (int iter = 0, tmpIdx = 0, intsIdx = 68; iter < 4; ++iter, tmpIdx += 17, intsIdx += 15) {
      int l0 = tmp[tmpIdx + 0] << 2;
      l0 |= (tmp[tmpIdx + 1] >>> 13) & MASK32_2;
      ints[intsIdx + 0] = l0;
      int l1 = (tmp[tmpIdx + 1] & MASK32_13) << 4;
      l1 |= (tmp[tmpIdx + 2] >>> 11) & MASK32_4;
      ints[intsIdx + 1] = l1;
      int l2 = (tmp[tmpIdx + 2] & MASK32_11) << 6;
      l2 |= (tmp[tmpIdx + 3] >>> 9) & MASK32_6;
      ints[intsIdx + 2] = l2;
      int l3 = (tmp[tmpIdx + 3] & MASK32_9) << 8;
      l3 |= (tmp[tmpIdx + 4] >>> 7) & MASK32_8;
      ints[intsIdx + 3] = l3;
      int l4 = (tmp[tmpIdx + 4] & MASK32_7) << 10;
      l4 |= (tmp[tmpIdx + 5] >>> 5) & MASK32_10;
      ints[intsIdx + 4] = l4;
      int l5 = (tmp[tmpIdx + 5] & MASK32_5) << 12;
      l5 |= (tmp[tmpIdx + 6] >>> 3) & MASK32_12;
      ints[intsIdx + 5] = l5;
      int l6 = (tmp[tmpIdx + 6] & MASK32_3) << 14;
      l6 |= (tmp[tmpIdx + 7] >>> 1) & MASK32_14;
      ints[intsIdx + 6] = l6;
      int l7 = (tmp[tmpIdx + 7] & MASK32_1) << 16;
      l7 |= tmp[tmpIdx + 8] << 1;
      l7 |= (tmp[tmpIdx + 9] >>> 14) & MASK32_1;
      ints[intsIdx + 7] = l7;
      int l8 = (tmp[tmpIdx + 9] & MASK32_14) << 3;
      l8 |= (tmp[tmpIdx + 10] >>> 12) & MASK32_3;
      ints[intsIdx + 8] = l8;
      int l9 = (tmp[tmpIdx + 10] & MASK32_12) << 5;
      l9 |= (tmp[tmpIdx + 11] >>> 10) & MASK32_5;
      ints[intsIdx + 9] = l9;
      int l10 = (tmp[tmpIdx + 11] & MASK32_10) << 7;
      l10 |= (tmp[tmpIdx + 12] >>> 8) & MASK32_7;
      ints[intsIdx + 10] = l10;
      int l11 = (tmp[tmpIdx + 12] & MASK32_8) << 9;
      l11 |= (tmp[tmpIdx + 13] >>> 6) & MASK32_9;
      ints[intsIdx + 11] = l11;
      int l12 = (tmp[tmpIdx + 13] & MASK32_6) << 11;
      l12 |= (tmp[tmpIdx + 14] >>> 4) & MASK32_11;
      ints[intsIdx + 12] = l12;
      int l13 = (tmp[tmpIdx + 14] & MASK32_4) << 13;
      l13 |= (tmp[tmpIdx + 15] >>> 2) & MASK32_13;
      ints[intsIdx + 13] = l13;
      int l14 = (tmp[tmpIdx + 15] & MASK32_2) << 15;
      l14 |= tmp[tmpIdx + 16] << 0;
      ints[intsIdx + 14] = l14;
    }
  }

  static void decode18(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
    pdu.splitInts(72, ints, 14, 18, MASK32_18, tmp, 0, MASK32_14);
    for (int iter = 0, tmpIdx = 0, intsIdx = 72; iter < 8; ++iter, tmpIdx += 9, intsIdx += 7) {
      int l0 = tmp[tmpIdx + 0] << 4;
      l0 |= (tmp[tmpIdx + 1] >>> 10) & MASK32_4;
      ints[intsIdx + 0] = l0;
      int l1 = (tmp[tmpIdx + 1] & MASK32_10) << 8;
      l1 |= (tmp[tmpIdx + 2] >>> 6) & MASK32_8;
      ints[intsIdx + 1] = l1;
      int l2 = (tmp[tmpIdx + 2] & MASK32_6) << 12;
      l2 |= (tmp[tmpIdx + 3] >>> 2) & MASK32_12;
      ints[intsIdx + 2] = l2;
      int l3 = (tmp[tmpIdx + 3] & MASK32_2) << 16;
      l3 |= tmp[tmpIdx + 4] << 2;
      l3 |= (tmp[tmpIdx + 5] >>> 12) & MASK32_2;
      ints[intsIdx + 3] = l3;
      int l4 = (tmp[tmpIdx + 5] & MASK32_12) << 6;
      l4 |= (tmp[tmpIdx + 6] >>> 8) & MASK32_6;
      ints[intsIdx + 4] = l4;
      int l5 = (tmp[tmpIdx + 6] & MASK32_8) << 10;
      l5 |= (tmp[tmpIdx + 7] >>> 4) & MASK32_10;
      ints[intsIdx + 5] = l5;
      int l6 = (tmp[tmpIdx + 7] & MASK32_4) << 14;
      l6 |= tmp[tmpIdx + 8] << 0;
      ints[intsIdx + 6] = l6;
    }
  }

  static void decode19(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
    pdu.splitInts(76, ints, 13, 19, MASK32_19, tmp, 0, MASK32_13);
    for (int iter = 0, tmpIdx = 0, intsIdx = 76; iter < 4; ++iter, tmpIdx += 19, intsIdx += 13) {
      int l0 = tmp[tmpIdx + 0] << 6;
      l0 |= (tmp[tmpIdx + 1] >>> 7) & MASK32_6;
      ints[intsIdx + 0] = l0;
      int l1 = (tmp[tmpIdx + 1] & MASK32_7) << 12;
      l1 |= (tmp[tmpIdx + 2] >>> 1) & MASK32_12;
      ints[intsIdx + 1] = l1;
      int l2 = (tmp[tmpIdx + 2] & MASK32_1) << 18;
      l2 |= tmp[tmpIdx + 3] << 5;
      l2 |= (tmp[tmpIdx + 4] >>> 8) & MASK32_5;
      ints[intsIdx + 2] = l2;
      int l3 = (tmp[tmpIdx + 4] & MASK32_8) << 11;
      l3 |= (tmp[tmpIdx + 5] >>> 2) & MASK32_11;
      ints[intsIdx + 3] = l3;
      int l4 = (tmp[tmpIdx + 5] & MASK32_2) << 17;
      l4 |= tmp[tmpIdx + 6] << 4;
      l4 |= (tmp[tmpIdx + 7] >>> 9) & MASK32_4;
      ints[intsIdx + 4] = l4;
      int l5 = (tmp[tmpIdx + 7] & MASK32_9) << 10;
      l5 |= (tmp[tmpIdx + 8] >>> 3) & MASK32_10;
      ints[intsIdx + 5] = l5;
      int l6 = (tmp[tmpIdx + 8] & MASK32_3) << 16;
      l6 |= tmp[tmpIdx + 9] << 3;
      l6 |= (tmp[tmpIdx + 10] >>> 10) & MASK32_3;
      ints[intsIdx + 6] = l6;
      int l7 = (tmp[tmpIdx + 10] & MASK32_10) << 9;
      l7 |= (tmp[tmpIdx + 11] >>> 4) & MASK32_9;
      ints[intsIdx + 7] = l7;
      int l8 = (tmp[tmpIdx + 11] & MASK32_4) << 15;
      l8 |= tmp[tmpIdx + 12] << 2;
      l8 |= (tmp[tmpIdx + 13] >>> 11) & MASK32_2;
      ints[intsIdx + 8] = l8;
      int l9 = (tmp[tmpIdx + 13] & MASK32_11) << 8;
      l9 |= (tmp[tmpIdx + 14] >>> 5) & MASK32_8;
      ints[intsIdx + 9] = l9;
      int l10 = (tmp[tmpIdx + 14] & MASK32_5) << 14;
      l10 |= tmp[tmpIdx + 15] << 1;
      l10 |= (tmp[tmpIdx + 16] >>> 12) & MASK32_1;
      ints[intsIdx + 10] = l10;
      int l11 = (tmp[tmpIdx + 16] & MASK32_12) << 7;
      l11 |= (tmp[tmpIdx + 17] >>> 6) & MASK32_7;
      ints[intsIdx + 11] = l11;
      int l12 = (tmp[tmpIdx + 17] & MASK32_6) << 13;
      l12 |= tmp[tmpIdx + 18] << 0;
      ints[intsIdx + 12] = l12;
    }
  }

  static void decode20(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
    pdu.splitInts(80, ints, 12, 20, MASK32_20, tmp, 0, MASK32_12);
    for (int iter = 0, tmpIdx = 0, intsIdx = 80; iter < 16; ++iter, tmpIdx += 5, intsIdx += 3) {
      int l0 = tmp[tmpIdx + 0] << 8;
      l0 |= (tmp[tmpIdx + 1] >>> 4) & MASK32_8;
      ints[intsIdx + 0] = l0;
      int l1 = (tmp[tmpIdx + 1] & MASK32_4) << 16;
      l1 |= tmp[tmpIdx + 2] << 4;
      l1 |= (tmp[tmpIdx + 3] >>> 8) & MASK32_4;
      ints[intsIdx + 1] = l1;
      int l2 = (tmp[tmpIdx + 3] & MASK32_8) << 12;
      l2 |= tmp[tmpIdx + 4] << 0;
      ints[intsIdx + 2] = l2;
    }
  }

  static void decode21(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
    pdu.splitInts(84, ints, 11, 21, MASK32_21, tmp, 0, MASK32_11);
    for (int iter = 0, tmpIdx = 0, intsIdx = 84; iter < 4; ++iter, tmpIdx += 21, intsIdx += 11) {
      int l0 = tmp[tmpIdx + 0] << 10;
      l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK32_10;
      ints[intsIdx + 0] = l0;
      int l1 = (tmp[tmpIdx + 1] & MASK32_1) << 20;
      l1 |= tmp[tmpIdx + 2] << 9;
      l1 |= (tmp[tmpIdx + 3] >>> 2) & MASK32_9;
      ints[intsIdx + 1] = l1;
      int l2 = (tmp[tmpIdx + 3] & MASK32_2) << 19;
      l2 |= tmp[tmpIdx + 4] << 8;
      l2 |= (tmp[tmpIdx + 5] >>> 3) & MASK32_8;
      ints[intsIdx + 2] = l2;
      int l3 = (tmp[tmpIdx + 5] & MASK32_3) << 18;
      l3 |= tmp[tmpIdx + 6] << 7;
      l3 |= (tmp[tmpIdx + 7] >>> 4) & MASK32_7;
      ints[intsIdx + 3] = l3;
      int l4 = (tmp[tmpIdx + 7] & MASK32_4) << 17;
      l4 |= tmp[tmpIdx + 8] << 6;
      l4 |= (tmp[tmpIdx + 9] >>> 5) & MASK32_6;
      ints[intsIdx + 4] = l4;
      int l5 = (tmp[tmpIdx + 9] & MASK32_5) << 16;
      l5 |= tmp[tmpIdx + 10] << 5;
      l5 |= (tmp[tmpIdx + 11] >>> 6) & MASK32_5;
      ints[intsIdx + 5] = l5;
      int l6 = (tmp[tmpIdx + 11] & MASK32_6) << 15;
      l6 |= tmp[tmpIdx + 12] << 4;
      l6 |= (tmp[tmpIdx + 13] >>> 7) & MASK32_4;
      ints[intsIdx + 6] = l6;
      int l7 = (tmp[tmpIdx + 13] & MASK32_7) << 14;
      l7 |= tmp[tmpIdx + 14] << 3;
      l7 |= (tmp[tmpIdx + 15] >>> 8) & MASK32_3;
      ints[intsIdx + 7] = l7;
      int l8 = (tmp[tmpIdx + 15] & MASK32_8) << 13;
      l8 |= tmp[tmpIdx + 16] << 2;
      l8 |= (tmp[tmpIdx + 17] >>> 9) & MASK32_2;
      ints[intsIdx + 8] = l8;
      int l9 = (tmp[tmpIdx + 17] & MASK32_9) << 12;
      l9 |= tmp[tmpIdx + 18] << 1;
      l9 |= (tmp[tmpIdx + 19] >>> 10) & MASK32_1;
      ints[intsIdx + 9] = l9;
      int l10 = (tmp[tmpIdx + 19] & MASK32_10) << 11;
      l10 |= tmp[tmpIdx + 20] << 0;
      ints[intsIdx + 10] = l10;
    }
  }

  static void decode22(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
    pdu.splitInts(88, ints, 10, 22, MASK32_22, tmp, 0, MASK32_10);
    for (int iter = 0, tmpIdx = 0, intsIdx = 88; iter < 8; ++iter, tmpIdx += 11, intsIdx += 5) {
      int l0 = tmp[tmpIdx + 0] << 12;
      l0 |= tmp[tmpIdx + 1] << 2;
      l0 |= (tmp[tmpIdx + 2] >>> 8) & MASK32_2;
      ints[intsIdx + 0] = l0;
      int l1 = (tmp[tmpIdx + 2] & MASK32_8) << 14;
      l1 |= tmp[tmpIdx + 3] << 4;
      l1 |= (tmp[tmpIdx + 4] >>> 6) & MASK32_4;
      ints[intsIdx + 1] = l1;
      int l2 = (tmp[tmpIdx + 4] & MASK32_6) << 16;
      l2 |= tmp[tmpIdx + 5] << 6;
      l2 |= (tmp[tmpIdx + 6] >>> 4) & MASK32_6;
      ints[intsIdx + 2] = l2;
      int l3 = (tmp[tmpIdx + 6] & MASK32_4) << 18;
      l3 |= tmp[tmpIdx + 7] << 8;
      l3 |= (tmp[tmpIdx + 8] >>> 2) & MASK32_8;
      ints[intsIdx + 3] = l3;
      int l4 = (tmp[tmpIdx + 8] & MASK32_2) << 20;
      l4 |= tmp[tmpIdx + 9] << 10;
      l4 |= tmp[tmpIdx + 10] << 0;
      ints[intsIdx + 4] = l4;
    }
  }

  static void decode23(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
    pdu.splitInts(92, ints, 9, 23, MASK32_23, tmp, 0, MASK32_9);
    for (int iter = 0, tmpIdx = 0, intsIdx = 92; iter < 4; ++iter, tmpIdx += 23, intsIdx += 9) {
      int l0 = tmp[tmpIdx + 0] << 14;
      l0 |= tmp[tmpIdx + 1] << 5;
      l0 |= (tmp[tmpIdx + 2] >>> 4) & MASK32_5;
      ints[intsIdx + 0] = l0;
      int l1 = (tmp[tmpIdx + 2] & MASK32_4) << 19;
      l1 |= tmp[tmpIdx + 3] << 10;
      l1 |= tmp[tmpIdx + 4] << 1;
      l1 |= (tmp[tmpIdx + 5] >>> 8) & MASK32_1;
      ints[intsIdx + 1] = l1;
      int l2 = (tmp[tmpIdx + 5] & MASK32_8) << 15;
      l2 |= tmp[tmpIdx + 6] << 6;
      l2 |= (tmp[tmpIdx + 7] >>> 3) & MASK32_6;
      ints[intsIdx + 2] = l2;
      int l3 = (tmp[tmpIdx + 7] & MASK32_3) << 20;
      l3 |= tmp[tmpIdx + 8] << 11;
      l3 |= tmp[tmpIdx + 9] << 2;
      l3 |= (tmp[tmpIdx + 10] >>> 7) & MASK32_2;
      ints[intsIdx + 3] = l3;
      int l4 = (tmp[tmpIdx + 10] & MASK32_7) << 16;
      l4 |= tmp[tmpIdx + 11] << 7;
      l4 |= (tmp[tmpIdx + 12] >>> 2) & MASK32_7;
      ints[intsIdx + 4] = l4;
      int l5 = (tmp[tmpIdx + 12] & MASK32_2) << 21;
      l5 |= tmp[tmpIdx + 13] << 12;
      l5 |= tmp[tmpIdx + 14] << 3;
      l5 |= (tmp[tmpIdx + 15] >>> 6) & MASK32_3;
      ints[intsIdx + 5] = l5;
      int l6 = (tmp[tmpIdx + 15] & MASK32_6) << 17;
      l6 |= tmp[tmpIdx + 16] << 8;
      l6 |= (tmp[tmpIdx + 17] >>> 1) & MASK32_8;
      ints[intsIdx + 6] = l6;
      int l7 = (tmp[tmpIdx + 17] & MASK32_1) << 22;
      l7 |= tmp[tmpIdx + 18] << 13;
      l7 |= tmp[tmpIdx + 19] << 4;
      l7 |= (tmp[tmpIdx + 20] >>> 5) & MASK32_4;
      ints[intsIdx + 7] = l7;
      int l8 = (tmp[tmpIdx + 20] & MASK32_5) << 18;
      l8 |= tmp[tmpIdx + 21] << 9;
      l8 |= tmp[tmpIdx + 22] << 0;
      ints[intsIdx + 8] = l8;
    }
  }

  static void decode24(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {
    pdu.splitInts(96, ints, 8, 24, MASK32_24, tmp, 0, MASK32_8);
    for (int iter = 0, tmpIdx = 0, intsIdx = 96; iter < 32; ++iter, tmpIdx += 3, intsIdx += 1) {
      int l0 = tmp[tmpIdx + 0] << 16;
      l0 |= tmp[tmpIdx + 1] << 8;
      l0 |= tmp[tmpIdx + 2] << 0;
      ints[intsIdx + 0] = l0;
    }
  }
}
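Every decodeN method above has the same two-phase shape: a single bulk splitInts call peels the top bits of each packed value directly into ints and leaves the low-order remainder bits in tmp, after which a short unrolled loop stitches those remainders into the remaining values. Below is a scalar sketch of the splitInts contract, inferred from the call sites above; the class and the standalone form are illustrative, and the shipped PostingDecodingUtil may implement the same loop with explicit vector instructions.

import java.io.IOException;
import org.apache.lucene.store.IndexInput;

final class SplitIntsSketch {
  // Scalar form of the splitInts contract as the generated decoders use it: read
  // `count` ints into c, copy bMask-wide slices of each int (starting at bit bShift,
  // stepping down by dec) into b, then keep only the low cMask bits in c.
  static void splitInts(
      IndexInput in,
      int count,
      int[] b,
      int bShift,
      int dec,
      int bMask,
      int[] c,
      int cIndex,
      int cMask)
      throws IOException {
    in.readInts(c, cIndex, count);
    int maxIter = (bShift - 1) / dec; // how many slices sit above the cMask bits
    for (int i = 0; i < count; ++i) {
      for (int j = 0; j <= maxIter; ++j) {
        b[count * j + i] = (c[cIndex + i] >>> (bShift - j * dec)) & bMask;
      }
      c[cIndex + i] &= cMask; // low-order remainder, recombined by the caller's loop
    }
  }
}

For decode11, for instance, splitInts(44, ints, 5, 11, MASK16_11, tmp, 0, MASK16_5) makes maxIter 0: each of the 44 ints contributes one 11-bit value per 16-bit lane (the MASK16_* constants are replicated into both lanes), and the two 5-bit remainders per int are reassembled by the unrolled loop that follows the call.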
@@ -0,0 +1,217 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene101;

import java.util.Objects;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat;
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;

/**
 * Implements the Lucene 10.1 index format.
 *
 * <p>If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}.
 *
 * @see org.apache.lucene.codecs.lucene101 package documentation for file format details.
 * @lucene.experimental
 */
public class Lucene101Codec extends Codec {

  /** Configuration option for the codec. */
  public enum Mode {
    /** Trade compression ratio for retrieval speed. */
    BEST_SPEED(Lucene90StoredFieldsFormat.Mode.BEST_SPEED),
    /** Trade retrieval speed for compression ratio. */
    BEST_COMPRESSION(Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION);

    private final Lucene90StoredFieldsFormat.Mode storedMode;

    private Mode(Lucene90StoredFieldsFormat.Mode storedMode) {
      this.storedMode = Objects.requireNonNull(storedMode);
    }
  }

  private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat();
  private final FieldInfosFormat fieldInfosFormat = new Lucene94FieldInfosFormat();
  private final SegmentInfoFormat segmentInfosFormat = new Lucene99SegmentInfoFormat();
  private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat();
  private final CompoundFormat compoundFormat = new Lucene90CompoundFormat();
  private final NormsFormat normsFormat = new Lucene90NormsFormat();

  private final PostingsFormat defaultPostingsFormat;
  private final PostingsFormat postingsFormat =
      new PerFieldPostingsFormat() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
          return Lucene101Codec.this.getPostingsFormatForField(field);
        }
      };

  private final DocValuesFormat defaultDVFormat;
  private final DocValuesFormat docValuesFormat =
      new PerFieldDocValuesFormat() {
        @Override
        public DocValuesFormat getDocValuesFormatForField(String field) {
          return Lucene101Codec.this.getDocValuesFormatForField(field);
        }
      };

  private final KnnVectorsFormat defaultKnnVectorsFormat;
  private final KnnVectorsFormat knnVectorsFormat =
      new PerFieldKnnVectorsFormat() {
        @Override
        public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
          return Lucene101Codec.this.getKnnVectorsFormatForField(field);
        }
      };

  private final StoredFieldsFormat storedFieldsFormat;

  /** Instantiates a new codec. */
  public Lucene101Codec() {
    this(Mode.BEST_SPEED);
  }

  /**
   * Instantiates a new codec, specifying the stored fields compression mode to use.
   *
   * @param mode stored fields compression mode to use for newly flushed/merged segments.
   */
  public Lucene101Codec(Mode mode) {
    super("Lucene101");
    this.storedFieldsFormat =
        new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
    this.defaultPostingsFormat = new Lucene101PostingsFormat();
    this.defaultDVFormat = new Lucene90DocValuesFormat();
    this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat();
  }

  @Override
  public final StoredFieldsFormat storedFieldsFormat() {
    return storedFieldsFormat;
  }

  @Override
  public final TermVectorsFormat termVectorsFormat() {
    return vectorsFormat;
  }

  @Override
  public final PostingsFormat postingsFormat() {
    return postingsFormat;
  }

  @Override
  public final FieldInfosFormat fieldInfosFormat() {
    return fieldInfosFormat;
  }

  @Override
  public final SegmentInfoFormat segmentInfoFormat() {
    return segmentInfosFormat;
  }

  @Override
  public final LiveDocsFormat liveDocsFormat() {
    return liveDocsFormat;
  }

  @Override
  public final CompoundFormat compoundFormat() {
    return compoundFormat;
  }

  @Override
  public final PointsFormat pointsFormat() {
    return new Lucene90PointsFormat();
  }

  @Override
  public final KnnVectorsFormat knnVectorsFormat() {
    return knnVectorsFormat;
  }

  /**
   * Returns the postings format that should be used for writing new segments of <code>field</code>.
   *
   * <p>The default implementation always returns "Lucene101".
   *
   * <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
   * future versions of Lucene are only guaranteed to be able to read the default implementation.
   */
  public PostingsFormat getPostingsFormatForField(String field) {
    return defaultPostingsFormat;
  }

  /**
   * Returns the docvalues format that should be used for writing new segments of <code>field</code>.
   *
   * <p>The default implementation always returns "Lucene90".
   *
   * <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
   * future versions of Lucene are only guaranteed to be able to read the default implementation.
   */
  public DocValuesFormat getDocValuesFormatForField(String field) {
    return defaultDVFormat;
  }

  /**
   * Returns the vectors format that should be used for writing new segments of <code>field</code>.
   *
   * <p>The default implementation always returns "Lucene99HnswVectorsFormat".
   *
   * <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
   * future versions of Lucene are only guaranteed to be able to read the default implementation.
   */
  public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
    return defaultKnnVectorsFormat;
  }

  @Override
  public final DocValuesFormat docValuesFormat() {
    return docValuesFormat;
  }

  @Override
  public final NormsFormat normsFormat() {
    return normsFormat;
  }
}
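The per-field hooks above are the supported way to customize this codec when writing; a minimal, hedged sketch of wiring it into an index follows. The field name "id" and the surrounding helper class are hypothetical, the back-compat warnings above apply to any non-default choice, and extending FilterCodec (as the class javadoc suggests) is the recommended route for heavier reuse.

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene101.Lucene101Codec;
import org.apache.lucene.index.IndexWriterConfig;

public class PerFieldCodecExample {
  // Builds a writer config that stores fields with BEST_COMPRESSION and routes one
  // hypothetical field ("id") to a caller-supplied postings format, keeping the
  // default Lucene101 postings format for everything else.
  public static IndexWriterConfig newConfig(PostingsFormat idFieldFormat) {
    Codec codec =
        new Lucene101Codec(Lucene101Codec.Mode.BEST_COMPRESSION) {
          @Override
          public PostingsFormat getPostingsFormatForField(String field) {
            return "id".equals(field) ? idFieldFormat : super.getPostingsFormatForField(field);
          }
        };
    return new IndexWriterConfig().setCodec(codec);
  }
}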
@@ -0,0 +1,492 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene101;

import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.TermState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;

/**
 * Lucene 10.1 postings format, which encodes postings in packed integer blocks for fast decode.
 *
 * <p>Basic idea:
 *
 * <ul>
 *   <li><b>Packed Blocks and VInt Blocks</b>:
 *       <p>In packed blocks, integers are encoded with the same bit width ({@link PackedInts packed
 *       format}): the block size (i.e. number of integers inside block) is fixed (currently 128).
 *       Additionally, blocks that are all the same value are encoded in an optimized way.
 *       <p>In VInt blocks, integers are encoded as {@link DataOutput#writeVInt VInt}: the block
 *       size is variable.
 *   <li><b>Block structure</b>:
 *       <p>When the postings are long enough, Lucene101PostingsFormat will try to encode most
 *       integer data as a packed block.
 *       <p>Take a term with 259 documents as an example: the first 256 document ids are encoded as
 *       two packed blocks, while the remaining 3 are encoded as one VInt block.
 *       <p>Different kinds of data are always encoded separately into different packed blocks, but
 *       may possibly be interleaved into the same VInt block.
 *       <p>This strategy is applied to pairs: <document number, frequency>, <position,
 *       payload length>, <position, offset start, offset length>, and <position,
 *       payload length, offset start, offset length>.
 *   <li><b>Skipdata</b>:
 *       <p>Skipdata is interleaved with blocks on 2 levels. Level 0 skip data is interleaved
 *       between every packed block. Level 1 skip data is interleaved between every 32 packed
 *       blocks.
 *   <li><b>Positions, Payloads, and Offsets</b>:
 *       <p>A position is an integer indicating where the term occurs within one document. A payload
 *       is a blob of metadata associated with the current position. An offset is a pair of integers
 *       indicating the tokenized start/end offsets for the given term in the current position: it is
 *       essentially a specialized payload.
 *       <p>When payloads and offsets are not omitted, numPositions==numPayloads==numOffsets
 *       (assuming a null payload contributes one count). As mentioned in block structure, it is
 *       possible to encode these three either combined or separately.
 *       <p>In all cases, payloads and offsets are stored together. When encoded as a packed block,
 *       position data is separated out as .pos, while payloads and offsets are encoded in .pay
 *       (payload metadata will also be stored directly in .pay). When encoded as VInt blocks, all
 *       these three are stored interleaved into the .pos (so is payload metadata).
 *       <p>With this strategy, the majority of payload and offset data will be outside the .pos
 *       file. So for queries that require only position data, running on a full index with payloads
 *       and offsets, this reduces disk pre-fetches.
 * </ul>
 *
 * <p>Files and detailed format:
 *
 * <ul>
 *   <li><code>.tim</code>: <a href="#Termdictionary">Term Dictionary</a>
 *   <li><code>.tip</code>: <a href="#Termindex">Term Index</a>
 *   <li><code>.doc</code>: <a href="#Frequencies">Frequencies and Skip Data</a>
 *   <li><code>.pos</code>: <a href="#Positions">Positions</a>
 *   <li><code>.pay</code>: <a href="#Payloads">Payloads and Offsets</a>
 * </ul>
 *
 * <a id="Termdictionary"></a>
 *
 * <dl>
 *   <dd><b>Term Dictionary</b>
 *       <p>The .tim file contains the list of terms in each field along with per-term statistics
 *       (such as docfreq) and pointers to the frequencies, positions, payload and skip data in the
 *       .doc, .pos, and .pay files. See {@link Lucene90BlockTreeTermsWriter} for more details on
 *       the format.
 *       <p>NOTE: The term dictionary can plug into different postings implementations: the postings
 *       writer/reader are actually responsible for encoding and decoding the PostingsHeader and
 *       TermMetadata sections described here:
 *       <ul>
 *         <li>PostingsHeader --> Header, PackedBlockSize
 *         <li>TermMetadata --> (DocFPDelta|SingletonDocID), PosFPDelta?, PosVIntBlockFPDelta?,
 *             PayFPDelta?
 *         <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
 *         <li>PackedBlockSize, SingletonDocID --> {@link DataOutput#writeVInt VInt}
 *         <li>DocFPDelta, PosFPDelta, PayFPDelta, PosVIntBlockFPDelta --> {@link
 *             DataOutput#writeVLong VLong}
 *         <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}
 *       </ul>
 *       <p>Notes:
 *       <ul>
 *         <li>Header is a {@link CodecUtil#writeIndexHeader IndexHeader} storing the version
 *             information for the postings.
 *         <li>PackedBlockSize is the fixed block size for packed blocks. In a packed block, the bit
 *             width is determined by the largest integer. Smaller block sizes result in smaller
 *             variance among widths of integers, hence smaller indexes. Larger block sizes result
 *             in more efficient bulk i/o, hence better acceleration. This value should always be a
 *             multiple of 64, currently fixed as 128 as a tradeoff. It is also the skip interval
 *             used to accelerate {@link org.apache.lucene.index.PostingsEnum#advance(int)}.
 *         <li>DocFPDelta determines the position of this term's TermFreqs within the .doc file. In
 *             particular, it is the difference of file offset between this term's data and the
 *             previous term's data (or zero, for the first term in the block). On disk it is stored
 *             as the difference from the previous value in the sequence.
 *         <li>PosFPDelta determines the position of this term's TermPositions within the .pos file,
 *             while PayFPDelta determines the position of this term's <TermPayloads,
 *             TermOffsets?> within the .pay file. Similar to DocFPDelta, it is the difference
 *             between two file positions (or neglected, for fields that omit payloads and offsets).
 *         <li>PosVIntBlockFPDelta determines the position of this term's last TermPosition in the
 *             last pos packed block within the .pos file. It is a synonym for PayVIntBlockFPDelta
 *             or OffsetVIntBlockFPDelta. This is actually used to indicate whether it is necessary
 *             to load following payloads and offsets from .pos instead of .pay. Every time a new
 *             block of positions is to be loaded, the PostingsReader will use this value to check
 *             whether the current block is in packed or VInt format. If packed, payloads and
 *             offsets are fetched from .pay, otherwise from .pos. (This value is neglected when the
 *             total number of positions, i.e. totalTermFreq, is less than or equal to
 *             PackedBlockSize.)
 *         <li>SingletonDocID is an optimization when a term only appears in one document. In this
 *             case, instead of writing a file pointer to the .doc file (DocFPDelta), and then a
 *             VIntBlock at that location, the single document ID is written to the term dictionary.
 *       </ul>
 * </dl>
 *
 * <a id="Termindex"></a>
 *
 * <dl>
 *   <dd><b>Term Index</b>
 *       <p>The .tip file contains an index into the term dictionary, so that it can be accessed
 *       randomly. See {@link Lucene90BlockTreeTermsWriter} for more details on the format.
 * </dl>
 *
 * <a id="Frequencies"></a>
 *
 * <dl>
 *   <dd><b>Frequencies and Skip Data</b>
 *       <p>The .doc file contains the lists of documents which contain each term, along with the
 *       frequency of the term in that document (except when frequencies are omitted: {@link
 *       IndexOptions#DOCS}). Skip data is saved at the end of each term's postings. The skip data
 *       is saved once for the entire postings list.
 *       <ul>
 *         <li>docFile(.doc) --> Header, <TermFreqs><sup>TermCount</sup>, Footer
 *         <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
 *         <li>TermFreqs --> <PackedBlock32> <sup>PackedDocBlockNum/32</sup>, VIntBlock?
 *         <li>PackedBlock32 --> Level1SkipData, <PackedBlock> <sup>32</sup>
 *         <li>PackedBlock --> Level0SkipData, PackedDocDeltaBlock, PackedFreqBlock?
 *         <li>VIntBlock -->
 *             <DocDelta[,Freq?]><sup>DocFreq-PackedBlockSize*PackedDocBlockNum</sup>
 *         <li>Level1SkipData --> DocDelta, DocFPDelta, Skip1NumBytes?, ImpactLength?, Impacts?,
 *             PosFPDelta?, NextPosUpto?, PayFPDelta?, NextPayByteUpto?
 *         <li>Level0SkipData --> Skip0NumBytes, DocDelta, DocFPDelta, PackedBlockLength,
 *             ImpactLength?, Impacts?, PosFPDelta?, NextPosUpto?, PayFPDelta?, NextPayByteUpto?
 *         <li>PackedFreqBlock --> {@link PackedInts PackedInts}, uses patching
 *         <li>PackedDocDeltaBlock --> {@link PackedInts PackedInts}, does not use patching
 *         <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}
 *       </ul>
 *       <p>Notes:
 *       <ul>
 *         <li>PackedDocDeltaBlock is theoretically generated from two steps:
 *             <ol>
 *               <li>Calculate the difference between each document number and the previous one, and
 *                   get a d-gaps list (for the first document, use absolute value);
 *               <li>For those d-gaps from the first one to
 *                   PackedDocBlockNum*PackedBlockSize<sup>th</sup>, separately encode as packed
 *                   blocks.
 *             </ol>
 *             If frequencies are not omitted, PackedFreqBlock will be generated without the d-gap
 *             step.
 *         <li>VIntBlock stores remaining d-gaps (along with frequencies when possible) with a
 *             format that encodes DocDelta and Freq:
 *             <p>DocDelta: if frequencies are indexed, this determines both the document number and
 *             the frequency. In particular, DocDelta/2 is the difference between this document
 *             number and the previous document number (or zero when this is the first document in a
 *             TermFreqs). When DocDelta is odd, the frequency is one. When DocDelta is even, the
 *             frequency is read as another VInt. If frequencies are omitted, DocDelta contains the
 *             gap (not multiplied by 2) between document numbers and no frequency information is
 *             stored.
 *             <p>For example, the TermFreqs for a term which occurs once in document seven and
 *             three times in document eleven, with frequencies indexed, would be the following
 *             sequence of VInts:
 *             <p>15, 8, 3
 *             <p>If frequencies were omitted ({@link IndexOptions#DOCS}) it would be this sequence
 *             of VInts instead:
 *             <p>7, 4
 *         <li>PackedDocBlockNum is the number of packed blocks for the current term's docids or
 *             frequencies. In particular, PackedDocBlockNum = floor(DocFreq/PackedBlockSize)
 *         <li>On skip data, DocDelta is the delta between the last doc of the previous block - or
 *             -1 if there is no previous block - and the last doc of this block. This helps know by
 *             how much the doc ID should be incremented in case the block gets skipped.
 *         <li>Skip0Length is the length of skip data at level 0. Encoding it is useful when skip
 *             data is never needed, to quickly skip over skip data, e.g. if only using nextDoc().
 *             It is also used when only the first fields of skip data are needed, in order to skip
 *             over remaining fields without reading them.
 *         <li>ImpactLength and Impacts are only stored if frequencies are indexed.
 *         <li>Since positions and payloads are also block encoded, the skip should skip to the
 *             related block first, then fetch the values according to the in-block offset. PosFPSkip
 *             and PayFPSkip record the file offsets of the related block in .pos and .pay,
 *             respectively. PosBlockOffset indicates which value to fetch inside the related block
 *             (PayBlockOffset is unnecessary since it is always equal to PosBlockOffset). Same as
 *             DocFPSkip, the file offsets are relative to the start of the current term's TermFreqs,
 *             and stored as a difference sequence.
 *         <li>PayByteUpto indicates the start offset of the current payload. It is equivalent to
 *             the sum of the payload lengths in the current block up to PosBlockOffset.
 *         <li>ImpactLength is the total length of CompetitiveFreqDelta and CompetitiveNormDelta
 *             pairs. CompetitiveFreqDelta and CompetitiveNormDelta are used to safely skip score
 *             calculation for uncompetitive documents; see {@link
 *             org.apache.lucene.codecs.CompetitiveImpactAccumulator} for more details.
 *       </ul>
 * </dl>
 *
 * <a id="Positions"></a>
 *
 * <dl>
 *   <dd><b>Positions</b>
 *       <p>The .pos file contains the lists of positions that each term occurs at within documents.
 *       It also sometimes stores part of payloads and offsets for speedup.
 *       <ul>
 *         <li>PosFile(.pos) --> Header, <TermPositions> <sup>TermCount</sup>, Footer
 *         <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
 *         <li>TermPositions --> <PackedPosDeltaBlock> <sup>PackedPosBlockNum</sup>,
 *             VIntBlock?
 *         <li>VIntBlock --> <PositionDelta[, PayloadLength?], PayloadData?, OffsetDelta?,
 *             OffsetLength?><sup>PosVIntCount</sup>
 *         <li>PackedPosDeltaBlock --> {@link PackedInts PackedInts}
 *         <li>PositionDelta, OffsetDelta, OffsetLength --> {@link DataOutput#writeVInt VInt}
 *         <li>PayloadData --> {@link DataOutput#writeByte byte}<sup>PayLength</sup>
 *         <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}
 *       </ul>
 *       <p>Notes:
 *       <ul>
 *         <li>TermPositions are ordered by term (terms are implicit, from the term dictionary), and
 *             position values for each term-document pair are incremental, and ordered by document
 *             number.
 *         <li>PackedPosBlockNum is the number of packed blocks for the current term's positions,
 *             payloads or offsets. In particular, PackedPosBlockNum =
 *             floor(totalTermFreq/PackedBlockSize)
 *         <li>PosVIntCount is the number of positions encoded in VInt format. In particular,
 *             PosVIntCount = totalTermFreq - PackedPosBlockNum*PackedBlockSize
 *         <li>The procedure by which PackedPosDeltaBlock is generated is the same as for
 *             PackedDocDeltaBlock in chapter <a href="#Frequencies">Frequencies and Skip Data</a>.
 *         <li>PositionDelta is, if payloads are disabled for the term's field, the difference
 *             between the position of the current occurrence in the document and the previous
 *             occurrence (or zero, if this is the first occurrence in this document). If payloads
 *             are enabled for the term's field, then PositionDelta/2 is the difference between the
 *             current and the previous position. If payloads are enabled and PositionDelta is odd,
 *             then PayloadLength is stored, indicating the length of the payload at the current
 *             term position.
 *         <li>For example, the TermPositions for a term which occurs as the fourth term in one
 *             document, and as the fifth and ninth term in a subsequent document, would be the
 *             following sequence of VInts (payloads disabled):
 *             <p>4, 5, 4
 *         <li>PayloadData is metadata associated with the current term position. If PayloadLength
 *             is stored at the current position, then it indicates the length of this payload. If
 *             PayloadLength is not stored, then this payload has the same length as the payload at
 *             the previous position.
 *         <li>OffsetDelta/2 is the difference between this position's startOffset and the previous
 *             occurrence's (or zero, if this is the first occurrence in this document). If
 *             OffsetDelta is odd, then the length (endOffset-startOffset) differs from the previous
 *             occurrence and an OffsetLength follows. Offset data is only written for {@link
 *             IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
 *       </ul>
 * </dl>
 *
 * <a id="Payloads"></a>
 *
 * <dl>
 *   <dd><b>Payloads and Offsets</b>
 *       <p>The .pay file will store payloads and offsets associated with certain term-document
 *       positions. Some payloads and offsets will be separated out into the .pos file, for
 *       performance reasons.
 *       <ul>
 *         <li>PayFile(.pay) --> Header, <TermPayloads?, TermOffsets?>
 *             <sup>TermCount</sup>, Footer
 *         <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
 *         <li>TermPayloads --> <PackedPayLengthBlock, SumPayLength, PayData>
 *             <sup>PackedPayBlockNum</sup>
 *         <li>TermOffsets --> <PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock>
 *             <sup>PackedPayBlockNum</sup>
 *         <li>PackedPayLengthBlock, PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock -->
 *             {@link PackedInts PackedInts}
 *         <li>SumPayLength --> {@link DataOutput#writeVInt VInt}
 *         <li>PayData --> {@link DataOutput#writeByte byte}<sup>SumPayLength</sup>
 *         <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}
 *       </ul>
 *       <p>Notes:
 *       <ul>
 *         <li>The order of TermPayloads/TermOffsets will be the same as TermPositions; note that
 *             part of the payloads/offsets are stored in .pos.
 *         <li>The procedure by which PackedPayLengthBlock and PackedOffsetLengthBlock are generated
 *             is the same as for PackedFreqBlock in chapter <a href="#Frequencies">Frequencies and
 *             Skip Data</a>, while PackedOffsetStartDeltaBlock follows the same procedure as
 *             PackedDocDeltaBlock.
 *         <li>PackedPayBlockNum is always equal to PackedPosBlockNum, for the same term. It is also
 *             a synonym for PackedOffsetBlockNum.
 *         <li>SumPayLength is the total length of payloads written within one block, and should be
 *             the sum of PayLengths in one packed block.
 *         <li>PayLength in PackedPayLengthBlock is the length of each payload associated with the
 *             current position.
 *       </ul>
 * </dl>
 *
 * @lucene.experimental
 */
public final class Lucene101PostingsFormat extends PostingsFormat {

  /** Filename extension for some small metadata about how postings are encoded. */
  public static final String META_EXTENSION = "psm";

  /**
   * Filename extension for document number, frequencies, and skip data. See chapter: <a
   * href="#Frequencies">Frequencies and Skip Data</a>
   */
  public static final String DOC_EXTENSION = "doc";

  /** Filename extension for positions. See chapter: <a href="#Positions">Positions</a> */
  public static final String POS_EXTENSION = "pos";

  /**
   * Filename extension for payloads and offsets. See chapter: <a href="#Payloads">Payloads and
   * Offsets</a>
   */
  public static final String PAY_EXTENSION = "pay";

  /** Size of blocks. */
  public static final int BLOCK_SIZE = ForUtil.BLOCK_SIZE;

  public static final int BLOCK_MASK = BLOCK_SIZE - 1;

  /** We insert skip data on every block and every SKIP_FACTOR=32 blocks. */
  public static final int LEVEL1_FACTOR = 32;

  /** Total number of docs covered by level 1 skip data: 32 * 128 = 4,096 */
  public static final int LEVEL1_NUM_DOCS = LEVEL1_FACTOR * BLOCK_SIZE;

  public static final int LEVEL1_MASK = LEVEL1_NUM_DOCS - 1;
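
  // Since BLOCK_SIZE (128) and LEVEL1_NUM_DOCS (32 * 128 = 4,096) are both powers of
  // two, (n & BLOCK_MASK) == 0 and (n & LEVEL1_MASK) == 0 are cheap tests for "n is a
  // multiple of the block size / level-1 interval".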

  static final String TERMS_CODEC = "Lucene90PostingsWriterTerms";
  static final String META_CODEC = "Lucene101PostingsWriterMeta";
  static final String DOC_CODEC = "Lucene101PostingsWriterDoc";
  static final String POS_CODEC = "Lucene101PostingsWriterPos";
  static final String PAY_CODEC = "Lucene101PostingsWriterPay";

  static final int VERSION_START = 0;
  static final int VERSION_CURRENT = VERSION_START;

  private final int minTermBlockSize;
  private final int maxTermBlockSize;

  /** Creates {@code Lucene101PostingsFormat} with default settings. */
  public Lucene101PostingsFormat() {
    this(
        Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
        Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
  }

  /**
   * Creates {@code Lucene101PostingsFormat} with custom values for {@code minBlockSize} and {@code
   * maxBlockSize} passed to block terms dictionary.
   *
   * @see
   *     Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
   */
  public Lucene101PostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
    super("Lucene101");
    Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
    this.minTermBlockSize = minTermBlockSize;
    this.maxTermBlockSize = maxTermBlockSize;
  }

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state);
    boolean success = false;
    try {
      FieldsConsumer ret =
          new Lucene90BlockTreeTermsWriter(
              state, postingsWriter, minTermBlockSize, maxTermBlockSize);
      success = true;
      return ret;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(postingsWriter);
      }
    }
  }
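
  // Note: fieldsConsumer above and fieldsProducer below share the success-flag idiom:
  // if wiring up the block-tree terms layer throws, the already-opened postings
  // writer/reader is closed via closeWhileHandlingException so that the original
  // exception is not masked.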

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase postingsReader = new Lucene101PostingsReader(state);
    boolean success = false;
    try {
      FieldsProducer ret = new Lucene90BlockTreeTermsReader(postingsReader, state);
      success = true;
      return ret;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(postingsReader);
      }
    }
  }

  /**
   * Holds all state required for {@link Lucene101PostingsReader} to produce a {@link
   * org.apache.lucene.index.PostingsEnum} without re-seeking the terms dict.
   *
   * @lucene.internal
   */
  public static final class IntBlockTermState extends BlockTermState {
    /** file pointer to the start of the doc ids enumeration, in {@link #DOC_EXTENSION} file */
    public long docStartFP;

    /** file pointer to the start of the positions enumeration, in {@link #POS_EXTENSION} file */
    public long posStartFP;

    /** file pointer to the start of the payloads enumeration, in {@link #PAY_EXTENSION} file */
    public long payStartFP;

    /**
     * file offset for the last position in the last block, if there are more than {@link
     * ForUtil#BLOCK_SIZE} positions; otherwise -1
     *
     * <p>One might think to use total term frequency to track how many positions are left to read
     * as we decode the blocks, and decode the last block differently when num_left_positions <
     * BLOCK_SIZE. Unfortunately this won't work since the tracking will be messed up when we skip
     * blocks as the skipper will only tell us new position offset (start of block) and number of
     * positions to skip for that block, without telling us how many positions it has skipped.
     */
    public long lastPosBlockOffset;

    /**
     * docid when there is a single pulsed posting, otherwise -1. freq is always implicitly
     * totalTermFreq in this case.
     */
    public int singletonDocID;

    /** Sole constructor. */
    public IntBlockTermState() {
      lastPosBlockOffset = -1;
      singletonDocID = -1;
    }

    @Override
    public IntBlockTermState clone() {
      IntBlockTermState other = new IntBlockTermState();
      other.copyFrom(this);
      return other;
    }

    @Override
    public void copyFrom(TermState _other) {
      super.copyFrom(_other);
      IntBlockTermState other = (IntBlockTermState) _other;
      docStartFP = other.docStartFP;
      posStartFP = other.posStartFP;
      payStartFP = other.payStartFP;
      lastPosBlockOffset = other.lastPosBlockOffset;
      singletonDocID = other.singletonDocID;
    }

    @Override
    public String toString() {
      return super.toString()
          + " docStartFP="
          + docStartFP
          + " posStartFP="
          + posStartFP
          + " payStartFP="
          + payStartFP
          + " lastPosBlockOffset="
          + lastPosBlockOffset
          + " singletonDocID="
          + singletonDocID;
    }
  }
}
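The two worked examples in the format javadoc above are easy to sanity-check. The sketch below (illustrative only, not the writer's actual code path) reproduces the doc-delta encoding: doc 7 with frequency 1 followed by doc 11 with frequency 3 yields 15, 8, 3, and with frequencies omitted 7, 4. Position deltas behave the same way, with the base reset to zero at each new document, which is what produces the documented 4, 5, 4.

import java.util.ArrayList;
import java.util.List;

public class DocDeltaExample {
  // Frequency-folded doc deltas, as documented above: odd means freq == 1, even means
  // an explicit freq follows as its own VInt.
  static List<Integer> encode(int[] docs, int[] freqs, boolean writeFreqs) {
    List<Integer> out = new ArrayList<>();
    int lastDoc = 0;
    for (int i = 0; i < docs.length; i++) {
      int delta = docs[i] - lastDoc; // d-gap; the first doc uses its absolute value
      if (!writeFreqs) {
        out.add(delta); // plain gap, not multiplied by 2
      } else if (freqs[i] == 1) {
        out.add((delta << 1) | 1); // odd => frequency is one
      } else {
        out.add(delta << 1); // even => frequency follows
        out.add(freqs[i]);
      }
      lastDoc = docs[i];
    }
    return out;
  }

  public static void main(String[] args) {
    int[] docs = {7, 11};
    int[] freqs = {1, 3};
    System.out.println(encode(docs, freqs, true)); // [15, 8, 3]
    System.out.println(encode(docs, freqs, false)); // [7, 4]
  }
}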
File diff suppressed because it is too large
@@ -0,0 +1,681 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene101;

import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.DOC_CODEC;
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.LEVEL1_MASK;
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.META_CODEC;
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.PAY_CODEC;
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.POS_CODEC;
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.VERSION_CURRENT;

import java.io.IOException;
import java.util.Collection;
import java.util.List;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.PushPostingsWriterBase;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

/** Writer for {@link Lucene101PostingsFormat}. */
public class Lucene101PostingsWriter extends PushPostingsWriterBase {

  static final IntBlockTermState EMPTY_STATE = new IntBlockTermState();

  IndexOutput metaOut;
  IndexOutput docOut;
  IndexOutput posOut;
  IndexOutput payOut;

  IntBlockTermState lastState;

  // Holds starting file pointers for current term:
  private long docStartFP;
  private long posStartFP;
  private long payStartFP;

  final int[] docDeltaBuffer;
  final int[] freqBuffer;
  private int docBufferUpto;

  final int[] posDeltaBuffer;
  final int[] payloadLengthBuffer;
  final int[] offsetStartDeltaBuffer;
  final int[] offsetLengthBuffer;
  private int posBufferUpto;

  private byte[] payloadBytes;
  private int payloadByteUpto;

  private int level0LastDocID;
  private long level0LastPosFP;
  private long level0LastPayFP;

  private int level1LastDocID;
  private long level1LastPosFP;
  private long level1LastPayFP;

  private int docID;
  private int lastDocID;
  private int lastPosition;
  private int lastStartOffset;
  private int docCount;

  private final PForUtil pforUtil;
  private final ForDeltaUtil forDeltaUtil;

  private boolean fieldHasNorms;
  private NumericDocValues norms;
  private final CompetitiveImpactAccumulator level0FreqNormAccumulator =
      new CompetitiveImpactAccumulator();
  private final CompetitiveImpactAccumulator level1CompetitiveFreqNormAccumulator =
      new CompetitiveImpactAccumulator();

  private int maxNumImpactsAtLevel0;
  private int maxImpactNumBytesAtLevel0;
  private int maxNumImpactsAtLevel1;
  private int maxImpactNumBytesAtLevel1;

  /** Scratch output that we use to be able to prepend the encoded length, e.g. impacts. */
  private final ByteBuffersDataOutput scratchOutput = ByteBuffersDataOutput.newResettableInstance();

  /**
   * Output for a single block. This is useful to be able to prepend skip data before each block,
   * which can only be computed once the block is encoded. The content is then typically copied to
   * {@link #level1Output}.
   */
  private final ByteBuffersDataOutput level0Output = ByteBuffersDataOutput.newResettableInstance();

  /**
   * Output for groups of 32 blocks. This is useful to prepend skip data for these 32 blocks, which
   * can only be done once we have encoded these 32 blocks. The content is then typically copied to
   * {@link #docOut}.
   */
  private final ByteBuffersDataOutput level1Output = ByteBuffersDataOutput.newResettableInstance();
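
  // Taken together, the two block outputs form an inside-out staging chain: a block is
  // encoded into level0Output so its level-0 skip entry can be prepended, level0Output
  // is appended to level1Output, and after 32 blocks level1Output is flushed with the
  // level-1 skip entry prepended (to the .doc file, per the format description above).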
|
||||
|
||||
/** Sole constructor. */
|
||||
public Lucene101PostingsWriter(SegmentWriteState state) throws IOException {
|
||||
String metaFileName =
|
||||
IndexFileNames.segmentFileName(
|
||||
state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.META_EXTENSION);
|
||||
String docFileName =
|
||||
IndexFileNames.segmentFileName(
|
||||
state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.DOC_EXTENSION);
|
||||
metaOut = state.directory.createOutput(metaFileName, state.context);
|
||||
IndexOutput posOut = null;
|
||||
IndexOutput payOut = null;
|
||||
boolean success = false;
|
||||
try {
|
||||
docOut = state.directory.createOutput(docFileName, state.context);
|
||||
CodecUtil.writeIndexHeader(
|
||||
metaOut, META_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
CodecUtil.writeIndexHeader(
|
||||
docOut, DOC_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
forDeltaUtil = new ForDeltaUtil();
|
||||
pforUtil = new PForUtil();
|
||||
if (state.fieldInfos.hasProx()) {
|
||||
posDeltaBuffer = new int[BLOCK_SIZE];
|
||||
String posFileName =
|
||||
IndexFileNames.segmentFileName(
|
||||
state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.POS_EXTENSION);
|
||||
posOut = state.directory.createOutput(posFileName, state.context);
|
||||
CodecUtil.writeIndexHeader(
|
||||
posOut, POS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
|
||||
if (state.fieldInfos.hasPayloads()) {
|
||||
payloadBytes = new byte[128];
|
||||
payloadLengthBuffer = new int[BLOCK_SIZE];
|
||||
} else {
|
||||
payloadBytes = null;
|
||||
payloadLengthBuffer = null;
|
||||
}
|
||||
|
||||
if (state.fieldInfos.hasOffsets()) {
|
||||
offsetStartDeltaBuffer = new int[BLOCK_SIZE];
|
||||
offsetLengthBuffer = new int[BLOCK_SIZE];
|
||||
} else {
|
||||
offsetStartDeltaBuffer = null;
|
||||
offsetLengthBuffer = null;
|
||||
}
|
||||
|
||||
if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
|
||||
String payFileName =
|
||||
IndexFileNames.segmentFileName(
|
||||
state.segmentInfo.name,
|
||||
state.segmentSuffix,
|
||||
Lucene101PostingsFormat.PAY_EXTENSION);
|
||||
payOut = state.directory.createOutput(payFileName, state.context);
|
||||
CodecUtil.writeIndexHeader(
|
||||
payOut, PAY_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
}
|
||||
} else {
|
||||
posDeltaBuffer = null;
|
||||
payloadLengthBuffer = null;
|
||||
offsetStartDeltaBuffer = null;
|
||||
offsetLengthBuffer = null;
|
||||
payloadBytes = null;
|
||||
}
|
||||
this.payOut = payOut;
|
||||
this.posOut = posOut;
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(metaOut, docOut, posOut, payOut);
|
||||
}
|
||||
}
|
||||
|
||||
docDeltaBuffer = new int[BLOCK_SIZE];
|
||||
freqBuffer = new int[BLOCK_SIZE];
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntBlockTermState newTermState() {
|
||||
return new IntBlockTermState();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void init(IndexOutput termsOut, SegmentWriteState state) throws IOException {
|
||||
CodecUtil.writeIndexHeader(
|
||||
termsOut, TERMS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
termsOut.writeVInt(BLOCK_SIZE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setField(FieldInfo fieldInfo) {
|
||||
super.setField(fieldInfo);
|
||||
lastState = EMPTY_STATE;
|
||||
fieldHasNorms = fieldInfo.hasNorms();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startTerm(NumericDocValues norms) {
|
||||
docStartFP = docOut.getFilePointer();
|
||||
if (writePositions) {
|
||||
posStartFP = posOut.getFilePointer();
|
||||
level1LastPosFP = level0LastPosFP = posStartFP;
|
||||
if (writePayloads || writeOffsets) {
|
||||
payStartFP = payOut.getFilePointer();
|
||||
level1LastPayFP = level0LastPayFP = payStartFP;
|
||||
}
|
||||
}
|
||||
lastDocID = -1;
|
||||
level0LastDocID = -1;
|
||||
level1LastDocID = -1;
|
||||
this.norms = norms;
|
||||
if (writeFreqs) {
|
||||
level0FreqNormAccumulator.clear();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startDoc(int docID, int termDocFreq) throws IOException {
|
||||
if (docBufferUpto == BLOCK_SIZE) {
|
||||
flushDocBlock(false);
|
||||
docBufferUpto = 0;
|
||||
}
|
||||
|
||||
final int docDelta = docID - lastDocID;
|
||||
|
||||
if (docID < 0 || docDelta <= 0) {
|
||||
throw new CorruptIndexException(
|
||||
"docs out of order (" + docID + " <= " + lastDocID + " )", docOut);
|
||||
}
|
||||
|
||||
docDeltaBuffer[docBufferUpto] = docDelta;
|
||||
if (writeFreqs) {
|
||||
freqBuffer[docBufferUpto] = termDocFreq;
|
||||
}
|
||||
|
||||
this.docID = docID;
|
||||
lastPosition = 0;
|
||||
lastStartOffset = 0;
|
||||
|
||||
if (writeFreqs) {
|
||||
long norm;
|
||||
if (fieldHasNorms) {
|
||||
boolean found = norms.advanceExact(docID);
|
||||
if (found == false) {
|
||||
// This can happen if indexing hits a problem after adding a doc to the
|
||||
// postings but before buffering the norm. Such documents are written as
|
||||
// deleted and will go away on the first merge.
|
||||
norm = 1L;
|
||||
} else {
|
||||
norm = norms.longValue();
|
||||
assert norm != 0 : docID;
|
||||
}
|
||||
} else {
|
||||
norm = 1L;
|
||||
}
|
||||
|
||||
level0FreqNormAccumulator.add(termDocFreq, norm);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset)
|
||||
throws IOException {
|
||||
if (position > IndexWriter.MAX_POSITION) {
|
||||
throw new CorruptIndexException(
|
||||
"position="
|
||||
+ position
|
||||
+ " is too large (> IndexWriter.MAX_POSITION="
|
||||
+ IndexWriter.MAX_POSITION
|
||||
+ ")",
|
||||
docOut);
|
||||
}
|
||||
if (position < 0) {
|
||||
throw new CorruptIndexException("position=" + position + " is < 0", docOut);
|
||||
}
|
||||
posDeltaBuffer[posBufferUpto] = position - lastPosition;
|
||||
if (writePayloads) {
|
||||
if (payload == null || payload.length == 0) {
|
||||
// no payload
|
||||
payloadLengthBuffer[posBufferUpto] = 0;
|
||||
} else {
|
||||
payloadLengthBuffer[posBufferUpto] = payload.length;
|
||||
if (payloadByteUpto + payload.length > payloadBytes.length) {
|
||||
payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length);
|
||||
}
|
||||
System.arraycopy(
|
||||
payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length);
|
||||
payloadByteUpto += payload.length;
|
||||
}
|
||||
}
|
||||
|
||||
if (writeOffsets) {
|
||||
assert startOffset >= lastStartOffset;
|
||||
assert endOffset >= startOffset;
|
||||
offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset;
|
||||
offsetLengthBuffer[posBufferUpto] = endOffset - startOffset;
|
||||
lastStartOffset = startOffset;
|
||||
}
|
||||
|
||||
posBufferUpto++;
|
||||
lastPosition = position;
|
||||
if (posBufferUpto == BLOCK_SIZE) {
|
||||
pforUtil.encode(posDeltaBuffer, posOut);
|
||||
|
||||
if (writePayloads) {
|
||||
pforUtil.encode(payloadLengthBuffer, payOut);
|
||||
payOut.writeVInt(payloadByteUpto);
|
||||
payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
|
||||
payloadByteUpto = 0;
|
||||
}
|
||||
if (writeOffsets) {
|
||||
pforUtil.encode(offsetStartDeltaBuffer, payOut);
|
||||
pforUtil.encode(offsetLengthBuffer, payOut);
|
||||
}
|
||||
posBufferUpto = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void finishDoc() {
|
||||
docBufferUpto++;
|
||||
docCount++;
|
||||
|
||||
lastDocID = docID;
|
||||
}
|
||||
|
||||
/**
|
||||
* Special vints that are encoded on 2 bytes if they require 15 bits or less. VInt becomes
|
||||
* especially slow when the number of bytes is variable, so this special layout helps in the case
|
||||
* when the number likely requires 15 bits or less.
|
||||
*/
|
||||
static void writeVInt15(DataOutput out, int v) throws IOException {
|
||||
assert v >= 0;
|
||||
writeVLong15(out, v);
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #writeVInt15(DataOutput, int)
|
||||
*/
|
||||
static void writeVLong15(DataOutput out, long v) throws IOException {
|
||||
assert v >= 0;
|
||||
if ((v & ~0x7FFFL) == 0) {
|
||||
out.writeShort((short) v);
|
||||
} else {
|
||||
out.writeShort((short) (0x8000 | (v & 0x7FFF)));
|
||||
out.writeVLong(v >> 15);
|
||||
}
|
||||
}
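// Editor's sketch (not part of this commit, name and imports hypothetical): the matching read
// side of the 2-byte layout above. The high bit of the fixed short flags a vLong continuation.
static long readVLong15(DataInput in) throws IOException {
short s = in.readShort();
if ((s & 0x8000) == 0) {
return s; // 15 bits or less: the value is the two bytes themselves
}
return (s & 0x7FFFL) | (in.readVLong() << 15); // low 15 bits, then the vLong remainder
}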
|
||||
|
||||
private void flushDocBlock(boolean finishTerm) throws IOException {
|
||||
assert docBufferUpto != 0;
|
||||
|
||||
if (docBufferUpto < BLOCK_SIZE) {
|
||||
assert finishTerm;
|
||||
PostingsUtil.writeVIntBlock(
|
||||
level0Output, docDeltaBuffer, freqBuffer, docBufferUpto, writeFreqs);
|
||||
} else {
|
||||
if (writeFreqs) {
|
||||
List<Impact> impacts = level0FreqNormAccumulator.getCompetitiveFreqNormPairs();
|
||||
if (impacts.size() > maxNumImpactsAtLevel0) {
|
||||
maxNumImpactsAtLevel0 = impacts.size();
|
||||
}
|
||||
writeImpacts(impacts, scratchOutput);
|
||||
assert level0Output.size() == 0;
|
||||
if (scratchOutput.size() > maxImpactNumBytesAtLevel0) {
|
||||
maxImpactNumBytesAtLevel0 = Math.toIntExact(scratchOutput.size());
|
||||
}
|
||||
level0Output.writeVLong(scratchOutput.size());
|
||||
scratchOutput.copyTo(level0Output);
|
||||
scratchOutput.reset();
|
||||
if (writePositions) {
|
||||
level0Output.writeVLong(posOut.getFilePointer() - level0LastPosFP);
|
||||
level0Output.writeByte((byte) posBufferUpto);
|
||||
level0LastPosFP = posOut.getFilePointer();
|
||||
|
||||
if (writeOffsets || writePayloads) {
|
||||
level0Output.writeVLong(payOut.getFilePointer() - level0LastPayFP);
|
||||
level0Output.writeVInt(payloadByteUpto);
|
||||
level0LastPayFP = payOut.getFilePointer();
|
||||
}
|
||||
}
|
||||
}
|
||||
long numSkipBytes = level0Output.size();
|
||||
forDeltaUtil.encodeDeltas(docDeltaBuffer, level0Output);
|
||||
if (writeFreqs) {
|
||||
pforUtil.encode(freqBuffer, level0Output);
|
||||
}
|
||||
|
||||
// docID - lastBlockDocID is at least 128, so it can never fit in a single byte with a vInt.
|
||||
// Even if we subtracted 128, only extremely dense blocks would be eligible for a single byte,
|
||||
// so let's go with 2 bytes right away.
|
||||
writeVInt15(scratchOutput, docID - level0LastDocID);
|
||||
writeVLong15(scratchOutput, level0Output.size());
|
||||
numSkipBytes += scratchOutput.size();
|
||||
level1Output.writeVLong(numSkipBytes);
|
||||
scratchOutput.copyTo(level1Output);
|
||||
scratchOutput.reset();
|
||||
}
|
||||
|
||||
level0Output.copyTo(level1Output);
|
||||
level0Output.reset();
|
||||
level0LastDocID = docID;
|
||||
if (writeFreqs) {
|
||||
level1CompetitiveFreqNormAccumulator.addAll(level0FreqNormAccumulator);
|
||||
level0FreqNormAccumulator.clear();
|
||||
}
|
||||
|
||||
if ((docCount & LEVEL1_MASK) == 0) { // true every 32 blocks (4,096 docs)
|
||||
writeLevel1SkipData();
|
||||
level1LastDocID = docID;
|
||||
level1CompetitiveFreqNormAccumulator.clear();
|
||||
} else if (finishTerm) {
|
||||
level1Output.copyTo(docOut);
|
||||
level1Output.reset();
|
||||
level1CompetitiveFreqNormAccumulator.clear();
|
||||
}
|
||||
}
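// Editor's summary (illustrative, not part of this commit): each full block therefore lands in
// level1Output as [vLong numSkipBytes][vInt15 docDelta][vLong15 blockBytes][impacts and pos/pay
// pointers, when applicable][bit-packed doc deltas][PFoR freqs]. numSkipBytes covers everything
// between itself and the packed deltas, and blockBytes covers skip data plus packed data, so a
// reader can jump either straight to the block's postings or past the whole block.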
|
||||
|
||||
private void writeLevel1SkipData() throws IOException {
|
||||
docOut.writeVInt(docID - level1LastDocID);
|
||||
final long level1End;
|
||||
if (writeFreqs) {
|
||||
List<Impact> impacts = level1CompetitiveFreqNormAccumulator.getCompetitiveFreqNormPairs();
|
||||
if (impacts.size() > maxNumImpactsAtLevel1) {
|
||||
maxNumImpactsAtLevel1 = impacts.size();
|
||||
}
|
||||
writeImpacts(impacts, scratchOutput);
|
||||
long numImpactBytes = scratchOutput.size();
|
||||
if (numImpactBytes > maxImpactNumBytesAtLevel1) {
|
||||
maxImpactNumBytesAtLevel1 = Math.toIntExact(numImpactBytes);
|
||||
}
|
||||
if (writePositions) {
|
||||
scratchOutput.writeVLong(posOut.getFilePointer() - level1LastPosFP);
|
||||
scratchOutput.writeByte((byte) posBufferUpto);
|
||||
level1LastPosFP = posOut.getFilePointer();
|
||||
if (writeOffsets || writePayloads) {
|
||||
scratchOutput.writeVLong(payOut.getFilePointer() - level1LastPayFP);
|
||||
scratchOutput.writeVInt(payloadByteUpto);
|
||||
level1LastPayFP = payOut.getFilePointer();
|
||||
}
|
||||
}
|
||||
final long level1Len = 2 * Short.BYTES + scratchOutput.size() + level1Output.size();
|
||||
docOut.writeVLong(level1Len);
|
||||
level1End = docOut.getFilePointer() + level1Len;
|
||||
// There are at most 128 impacts, which require at most 2 bytes each
|
||||
assert numImpactBytes <= Short.MAX_VALUE;
|
||||
// Like impacts plus a few vlongs, still way under the max short value
|
||||
assert scratchOutput.size() + Short.BYTES <= Short.MAX_VALUE;
|
||||
docOut.writeShort((short) (scratchOutput.size() + Short.BYTES));
|
||||
docOut.writeShort((short) numImpactBytes);
|
||||
scratchOutput.copyTo(docOut);
|
||||
scratchOutput.reset();
|
||||
} else {
|
||||
docOut.writeVLong(level1Output.size());
|
||||
level1End = docOut.getFilePointer() + level1Output.size();
|
||||
}
|
||||
level1Output.copyTo(docOut);
|
||||
level1Output.reset();
|
||||
assert docOut.getFilePointer() == level1End : docOut.getFilePointer() + " " + level1End;
|
||||
}
|
||||
|
||||
static void writeImpacts(Collection<Impact> impacts, DataOutput out) throws IOException {
|
||||
Impact previous = new Impact(0, 0);
|
||||
for (Impact impact : impacts) {
|
||||
assert impact.freq > previous.freq;
|
||||
assert Long.compareUnsigned(impact.norm, previous.norm) > 0;
|
||||
int freqDelta = impact.freq - previous.freq - 1;
|
||||
long normDelta = impact.norm - previous.norm - 1;
|
||||
if (normDelta == 0) {
|
||||
// most of time, norm only increases by 1, so we can fold everything in a single byte
|
||||
out.writeVInt(freqDelta << 1);
|
||||
} else {
|
||||
out.writeVInt((freqDelta << 1) | 1);
|
||||
out.writeZLong(normDelta);
|
||||
}
|
||||
previous = impact;
|
||||
}
|
||||
}
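// Editor's sketch (not part of this commit): inverting writeImpacts. The count parameter is
// hypothetical; the real readers decode a length-prefixed slice instead.
static void readImpactsSketch(DataInput in, int count, List<Impact> out) throws IOException {
int freq = 0;
long norm = 0;
for (int i = 0; i < count; ++i) {
int code = in.readVInt();
freq += 1 + (code >>> 1); // deltas are written minus one, so add it back
norm += 1 + (((code & 1) != 0) ? in.readZLong() : 0); // low bit flags an explicit norm delta
out.add(new Impact(freq, norm));
}
}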
|
||||
|
||||
/** Called when we are done adding docs to this term */
|
||||
@Override
|
||||
public void finishTerm(BlockTermState _state) throws IOException {
|
||||
IntBlockTermState state = (IntBlockTermState) _state;
|
||||
assert state.docFreq > 0;
|
||||
|
||||
// TODO: wasteful we are counting this (counting # docs
|
||||
// for this term) in two places?
|
||||
assert state.docFreq == docCount : state.docFreq + " vs " + docCount;
|
||||
|
||||
// docFreq == 1, don't write the single docid/freq to a separate file along with a pointer to
|
||||
// it.
|
||||
final int singletonDocID;
|
||||
if (state.docFreq == 1) {
|
||||
// pulse the singleton docid into the term dictionary, freq is implicitly totalTermFreq
|
||||
singletonDocID = docDeltaBuffer[0] - 1;
|
||||
} else {
|
||||
singletonDocID = -1;
|
||||
flushDocBlock(true);
|
||||
}
|
||||
|
||||
final long lastPosBlockOffset;
|
||||
|
||||
if (writePositions) {
|
||||
// totalTermFreq is just the total number of positions (or payloads, or offsets)
|
||||
// associated with current term.
|
||||
assert state.totalTermFreq != -1;
|
||||
if (state.totalTermFreq > BLOCK_SIZE) {
|
||||
// record file offset for last pos in last block
|
||||
lastPosBlockOffset = posOut.getFilePointer() - posStartFP;
|
||||
} else {
|
||||
lastPosBlockOffset = -1;
|
||||
}
|
||||
if (posBufferUpto > 0) {
|
||||
assert posBufferUpto < BLOCK_SIZE;
|
||||
// TODO: should we send offsets/payloads to
|
||||
// .pay...? seems wasteful (have to store extra
|
||||
// vLong for low (< BLOCK_SIZE) DF terms = vast vast
|
||||
// majority)
|
||||
|
||||
// vInt encode the remaining positions/payloads/offsets:
|
||||
int lastPayloadLength = -1; // force first payload length to be written
|
||||
int lastOffsetLength = -1; // force first offset length to be written
|
||||
int payloadBytesReadUpto = 0;
|
||||
for (int i = 0; i < posBufferUpto; i++) {
|
||||
final int posDelta = posDeltaBuffer[i];
|
||||
if (writePayloads) {
|
||||
final int payloadLength = payloadLengthBuffer[i];
|
||||
if (payloadLength != lastPayloadLength) {
|
||||
lastPayloadLength = payloadLength;
|
||||
posOut.writeVInt((posDelta << 1) | 1);
|
||||
posOut.writeVInt(payloadLength);
|
||||
} else {
|
||||
posOut.writeVInt(posDelta << 1);
|
||||
}
|
||||
|
||||
if (payloadLength != 0) {
|
||||
posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength);
|
||||
payloadBytesReadUpto += payloadLength;
|
||||
}
|
||||
} else {
|
||||
posOut.writeVInt(posDelta);
|
||||
}
|
||||
|
||||
if (writeOffsets) {
|
||||
int delta = offsetStartDeltaBuffer[i];
|
||||
int length = offsetLengthBuffer[i];
|
||||
if (length == lastOffsetLength) {
|
||||
posOut.writeVInt(delta << 1);
|
||||
} else {
|
||||
posOut.writeVInt(delta << 1 | 1);
|
||||
posOut.writeVInt(length);
|
||||
lastOffsetLength = length;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (writePayloads) {
|
||||
assert payloadBytesReadUpto == payloadByteUpto;
|
||||
payloadByteUpto = 0;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
lastPosBlockOffset = -1;
|
||||
}
|
||||
|
||||
state.docStartFP = docStartFP;
|
||||
state.posStartFP = posStartFP;
|
||||
state.payStartFP = payStartFP;
|
||||
state.singletonDocID = singletonDocID;
|
||||
|
||||
state.lastPosBlockOffset = lastPosBlockOffset;
|
||||
docBufferUpto = 0;
|
||||
posBufferUpto = 0;
|
||||
lastDocID = -1;
|
||||
docCount = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void encodeTerm(
|
||||
DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute)
|
||||
throws IOException {
|
||||
IntBlockTermState state = (IntBlockTermState) _state;
|
||||
if (absolute) {
|
||||
lastState = EMPTY_STATE;
|
||||
assert lastState.docStartFP == 0;
|
||||
}
|
||||
|
||||
if (lastState.singletonDocID != -1
|
||||
&& state.singletonDocID != -1
|
||||
&& state.docStartFP == lastState.docStartFP) {
|
||||
// With runs of rare values such as ID fields, the increment of pointers in the docs file is
|
||||
// often 0.
|
||||
// Furthermore some ID schemes like auto-increment IDs or Flake IDs are monotonic, so we
|
||||
// encode the delta
|
||||
// between consecutive doc IDs to save space.
|
||||
final long delta = (long) state.singletonDocID - lastState.singletonDocID;
|
||||
out.writeVLong((BitUtil.zigZagEncode(delta) << 1) | 0x01);
|
||||
} else {
|
||||
out.writeVLong((state.docStartFP - lastState.docStartFP) << 1);
|
||||
if (state.singletonDocID != -1) {
|
||||
out.writeVInt(state.singletonDocID);
|
||||
}
|
||||
}
|
||||
|
||||
if (writePositions) {
|
||||
out.writeVLong(state.posStartFP - lastState.posStartFP);
|
||||
if (writePayloads || writeOffsets) {
|
||||
out.writeVLong(state.payStartFP - lastState.payStartFP);
|
||||
}
|
||||
}
|
||||
if (writePositions) {
|
||||
if (state.lastPosBlockOffset != -1) {
|
||||
out.writeVLong(state.lastPosBlockOffset);
|
||||
}
|
||||
}
|
||||
lastState = state;
|
||||
}
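// Editor's note (illustrative, not part of this commit): the low bit of the first vLong
// disambiguates the two branches above: 1 means "same doc file pointer, zig-zag delta between
// singleton doc IDs", 0 means "file-pointer delta, with any singleton doc ID written separately".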
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
// TODO: add a finish() at least to PushBase? DV too...?
|
||||
boolean success = false;
|
||||
try {
|
||||
if (docOut != null) {
|
||||
CodecUtil.writeFooter(docOut);
|
||||
}
|
||||
if (posOut != null) {
|
||||
CodecUtil.writeFooter(posOut);
|
||||
}
|
||||
if (payOut != null) {
|
||||
CodecUtil.writeFooter(payOut);
|
||||
}
|
||||
if (metaOut != null) {
|
||||
metaOut.writeInt(maxNumImpactsAtLevel0);
|
||||
metaOut.writeInt(maxImpactNumBytesAtLevel0);
|
||||
metaOut.writeInt(maxNumImpactsAtLevel1);
|
||||
metaOut.writeInt(maxImpactNumBytesAtLevel1);
|
||||
metaOut.writeLong(docOut.getFilePointer());
|
||||
if (posOut != null) {
|
||||
metaOut.writeLong(posOut.getFilePointer());
|
||||
if (payOut != null) {
|
||||
metaOut.writeLong(payOut.getFilePointer());
|
||||
}
|
||||
}
|
||||
CodecUtil.writeFooter(metaOut);
|
||||
}
|
||||
success = true;
|
||||
} finally {
|
||||
if (success) {
|
||||
IOUtils.close(metaOut, docOut, posOut, payOut);
|
||||
} else {
|
||||
IOUtils.closeWhileHandlingException(metaOut, docOut, posOut, payOut);
|
||||
}
|
||||
metaOut = docOut = posOut = payOut = null;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,134 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene101;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.LongHeap;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/** Utility class to encode sequences of 128 small positive integers. */
|
||||
final class PForUtil {
|
||||
|
||||
private static final int MAX_EXCEPTIONS = 7;
|
||||
|
||||
static boolean allEqual(int[] l) {
|
||||
for (int i = 1; i < ForUtil.BLOCK_SIZE; ++i) {
|
||||
if (l[i] != l[0]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private final ForUtil forUtil = new ForUtil();
|
||||
|
||||
static {
|
||||
assert ForUtil.BLOCK_SIZE <= 256 : "blocksize must fit in one byte. got " + ForUtil.BLOCK_SIZE;
|
||||
}
|
||||
|
||||
/** Encode 128 integers from {@code ints} into {@code out}. */
|
||||
void encode(int[] ints, DataOutput out) throws IOException {
|
||||
// Determine the top MAX_EXCEPTIONS + 1 values
|
||||
final LongHeap top = new LongHeap(MAX_EXCEPTIONS + 1);
|
||||
for (int i = 0; i <= MAX_EXCEPTIONS; ++i) {
|
||||
top.push(ints[i]);
|
||||
}
|
||||
long topValue = top.top();
|
||||
for (int i = MAX_EXCEPTIONS + 1; i < ForUtil.BLOCK_SIZE; ++i) {
|
||||
if (ints[i] > topValue) {
|
||||
topValue = top.updateTop(ints[i]);
|
||||
}
|
||||
}
|
||||
|
||||
long max = 0L;
|
||||
for (int i = 1; i <= top.size(); ++i) {
|
||||
max = Math.max(max, top.get(i));
|
||||
}
|
||||
|
||||
final int maxBitsRequired = PackedInts.bitsRequired(max);
|
||||
// We store the patch on a byte, so we can't decrease the number of bits required by more than 8
|
||||
final int patchedBitsRequired =
|
||||
Math.max(PackedInts.bitsRequired(topValue), maxBitsRequired - 8);
|
||||
int numExceptions = 0;
|
||||
final long maxUnpatchedValue = (1L << patchedBitsRequired) - 1;
|
||||
for (int i = 2; i <= top.size(); ++i) {
|
||||
if (top.get(i) > maxUnpatchedValue) {
|
||||
numExceptions++;
|
||||
}
|
||||
}
|
||||
final byte[] exceptions = new byte[numExceptions * 2];
|
||||
if (numExceptions > 0) {
|
||||
int exceptionCount = 0;
|
||||
for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
|
||||
if (ints[i] > maxUnpatchedValue) {
|
||||
exceptions[exceptionCount * 2] = (byte) i;
|
||||
exceptions[exceptionCount * 2 + 1] = (byte) (ints[i] >>> patchedBitsRequired);
|
||||
ints[i] &= maxUnpatchedValue;
|
||||
exceptionCount++;
|
||||
}
|
||||
}
|
||||
assert exceptionCount == numExceptions : exceptionCount + " " + numExceptions;
|
||||
}
|
||||
|
||||
if (allEqual(ints) && maxBitsRequired <= 8) {
|
||||
for (int i = 0; i < numExceptions; ++i) {
|
||||
exceptions[2 * i + 1] =
|
||||
(byte) (Byte.toUnsignedLong(exceptions[2 * i + 1]) << patchedBitsRequired);
|
||||
}
|
||||
out.writeByte((byte) (numExceptions << 5));
|
||||
out.writeVInt(ints[0]);
|
||||
} else {
|
||||
final int token = (numExceptions << 5) | patchedBitsRequired;
|
||||
out.writeByte((byte) token);
|
||||
forUtil.encode(ints, patchedBitsRequired, out);
|
||||
}
|
||||
out.writeBytes(exceptions, exceptions.length);
|
||||
}
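// Editor's illustration (not part of this commit): suppose patchedBitsRequired = 5 and
// ints[i] = 300. The writer bit-packs 300 & 0x1F = 12 with the block and stores the pair
// (i, 300 >>> 5 = 9) as a two-byte exception; decode restores 12 | (9 << 5) = 300. The token
// byte carries numExceptions in its 3 high bits and bitsPerValue in its 5 low bits.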
|
||||
|
||||
/** Decode 128 integers into {@code ints}. */
|
||||
void decode(PostingDecodingUtil pdu, int[] ints) throws IOException {
|
||||
var in = pdu.in;
|
||||
final int token = Byte.toUnsignedInt(in.readByte());
|
||||
final int bitsPerValue = token & 0x1f;
|
||||
if (bitsPerValue == 0) {
|
||||
Arrays.fill(ints, 0, ForUtil.BLOCK_SIZE, in.readVInt());
|
||||
} else {
|
||||
forUtil.decode(bitsPerValue, pdu, ints);
|
||||
}
|
||||
final int numExceptions = token >>> 5;
|
||||
for (int i = 0; i < numExceptions; ++i) {
|
||||
ints[Byte.toUnsignedInt(in.readByte())] |= Byte.toUnsignedLong(in.readByte()) << bitsPerValue;
|
||||
}
|
||||
}
|
||||
|
||||
/** Skip 128 integers. */
|
||||
static void skip(DataInput in) throws IOException {
|
||||
final int token = Byte.toUnsignedInt(in.readByte());
|
||||
final int bitsPerValue = token & 0x1f;
|
||||
final int numExceptions = token >>> 5;
|
||||
if (bitsPerValue == 0) {
|
||||
in.readVLong();
|
||||
in.skipBytes((numExceptions << 1));
|
||||
} else {
|
||||
in.skipBytes(ForUtil.numBytes(bitsPerValue) + (numExceptions << 1));
|
||||
}
|
||||
}
|
||||
}
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene912;
|
||||
package org.apache.lucene.codecs.lucene101;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
|
||||
|
@ -42,16 +42,16 @@ public final class PostingIndexInput {
|
|||
this.postingDecodingUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(in);
|
||||
}
|
||||
|
||||
/** Decode 128 integers stored on {@code bitsPerValues} bits per value into {@code longs}. */
|
||||
public void decode(int bitsPerValue, long[] longs) throws IOException {
|
||||
forUtil.decode(bitsPerValue, postingDecodingUtil, longs);
|
||||
/** Decode 128 integers stored on {@code bitsPerValues} bits per value into {@code ints}. */
|
||||
public void decode(int bitsPerValue, int[] ints) throws IOException {
|
||||
forUtil.decode(bitsPerValue, postingDecodingUtil, ints);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode 128 integers stored on {@code bitsPerValues} bits per value, compute their prefix sum,
|
||||
* and store results into {@code longs}.
|
||||
* and store results into {@code ints}.
|
||||
*/
|
||||
public void decodeAndPrefixSum(int bitsPerValue, long base, long[] longs) throws IOException {
|
||||
forDeltaUtil.decodeAndPrefixSum(bitsPerValue, postingDecodingUtil, base, longs);
|
||||
public void decodeAndPrefixSum(int bitsPerValue, int base, int[] ints) throws IOException {
|
||||
forDeltaUtil.decodeAndPrefixSum(bitsPerValue, postingDecodingUtil, base, ints);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,74 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene101;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.GroupVIntUtil;
|
||||
|
||||
/** Utility class to encode/decode postings block. */
|
||||
final class PostingsUtil {
|
||||
|
||||
/**
|
||||
* Read values that have been written using variable-length encoding and group-varint encoding
|
||||
* instead of bit-packing.
|
||||
*/
|
||||
static void readVIntBlock(
|
||||
IndexInput docIn,
|
||||
int[] docBuffer,
|
||||
int[] freqBuffer,
|
||||
int num,
|
||||
boolean indexHasFreq,
|
||||
boolean decodeFreq)
|
||||
throws IOException {
|
||||
GroupVIntUtil.readGroupVInts(docIn, docBuffer, num);
|
||||
if (indexHasFreq && decodeFreq) {
|
||||
for (int i = 0; i < num; ++i) {
|
||||
freqBuffer[i] = docBuffer[i] & 0x01;
|
||||
docBuffer[i] >>>= 1;
|
||||
if (freqBuffer[i] == 0) {
|
||||
freqBuffer[i] = docIn.readVInt();
|
||||
}
|
||||
}
|
||||
} else if (indexHasFreq) {
|
||||
for (int i = 0; i < num; ++i) {
|
||||
docBuffer[i] >>>= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Write freq buffer with variable-length encoding and doc buffer with group-varint encoding. */
|
||||
static void writeVIntBlock(
|
||||
DataOutput docOut, int[] docBuffer, int[] freqBuffer, int num, boolean writeFreqs)
|
||||
throws IOException {
|
||||
if (writeFreqs) {
|
||||
for (int i = 0; i < num; i++) {
|
||||
docBuffer[i] = (docBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0);
|
||||
}
|
||||
}
|
||||
docOut.writeGroupVInts(docBuffer, num);
|
||||
if (writeFreqs) {
|
||||
for (int i = 0; i < num; i++) {
|
||||
final int freq = freqBuffer[i];
|
||||
if (freq != 1) {
|
||||
docOut.writeVInt(freq);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
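// Editor's illustration (not part of this commit): with writeFreqs = true, a (docDelta = 5,
// freq = 1) pair becomes the single group-varint value 11 (5 << 1 | 1) and no trailing vInt,
// while (docDelta = 5, freq = 3) becomes 10 (5 << 1 | 0) followed by writeVInt(3);
// readVIntBlock above inverts this by testing the low bit.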
|
||||
}
|
|
@ -0,0 +1,377 @@
|
|||
#! /usr/bin/env python
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from math import gcd
|
||||
|
||||
"""Code generation for ForDeltaUtil.java"""
|
||||
|
||||
MAX_SPECIALIZED_BITS_PER_VALUE = 24
|
||||
OUTPUT_FILE = "ForDeltaUtil.java"
|
||||
PRIMITIVE_SIZE = [8, 16, 32]
|
||||
HEADER = """// This file has been automatically generated, DO NOT EDIT
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene101;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene101.ForUtil.*;
|
||||
|
||||
/**
|
||||
* Inspired by https://fulmicoton.com/posts/bitpacking/
|
||||
* Encodes multiple integers in a Java int to get SIMD-like speedups.
|
||||
* If bitsPerValue <= 3 then we pack 4 ints per Java int
|
||||
* else if bitsPerValue <= 10 we pack 2 ints per Java int
|
||||
* else we use scalar operations.
|
||||
*/
|
||||
public final class ForDeltaUtil {
|
||||
|
||||
private static final int HALF_BLOCK_SIZE = BLOCK_SIZE / 2;
|
||||
private static final int ONE_BLOCK_SIZE_FOURTH = BLOCK_SIZE / 4;
|
||||
private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2;
|
||||
private static final int THREE_BLOCK_SIZE_FOURTHS = 3 * BLOCK_SIZE / 4;
|
||||
|
||||
// IDENTITY_PLUS_ONE[i] == i+1
|
||||
private static final int[] IDENTITY_PLUS_ONE = new int[ForUtil.BLOCK_SIZE];
|
||||
|
||||
static {
|
||||
for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
|
||||
IDENTITY_PLUS_ONE[i] = i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
private static void prefixSumOfOnes(int[] arr, int base) {
|
||||
System.arraycopy(IDENTITY_PLUS_ONE, 0, arr, 0, ForUtil.BLOCK_SIZE);
|
||||
// This loop gets auto-vectorized
|
||||
for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
|
||||
arr[i] += base;
|
||||
}
|
||||
}
|
||||
|
||||
private static void prefixSum8(int[] arr, int base) {
|
||||
// When the number of bits per value is 3 or less, we can sum up all values in a block without
|
||||
// risking overflowing an 8-bit integer. This allows computing the prefix sum by summing up 4
|
||||
// values at once.
|
||||
innerPrefixSum8(arr);
|
||||
expand8(arr);
|
||||
final int l0 = base;
|
||||
final int l1 = l0 + arr[ONE_BLOCK_SIZE_FOURTH - 1];
|
||||
final int l2 = l1 + arr[TWO_BLOCK_SIZE_FOURTHS - 1];
|
||||
final int l3 = l2 + arr[THREE_BLOCK_SIZE_FOURTHS - 1];
|
||||
|
||||
for (int i = 0; i < ONE_BLOCK_SIZE_FOURTH; ++i) {
|
||||
arr[i] += l0;
|
||||
arr[ONE_BLOCK_SIZE_FOURTH + i] += l1;
|
||||
arr[TWO_BLOCK_SIZE_FOURTHS + i] += l2;
|
||||
arr[THREE_BLOCK_SIZE_FOURTHS + i] += l3;
|
||||
}
|
||||
}
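// Editor's note (illustrative, not part of this commit): each int holds 4 lanes, so a lane
// accumulates 32 deltas; with 3 bits per value that is at most 32 * 7 = 224, which still fits
// the 8-bit lanes before expand8 widens them back to ints.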
|
||||
|
||||
private static void prefixSum16(int[] arr, int base) {
|
||||
// When the number of bits per value is 10 or less, we can sum up all values in a block without
|
||||
// risking overflowing a 16-bit integer. This allows computing the prefix sum by summing up 2
|
||||
// values at once.
|
||||
innerPrefixSum16(arr);
|
||||
expand16(arr);
|
||||
final int l0 = base;
|
||||
final int l1 = base + arr[HALF_BLOCK_SIZE - 1];
|
||||
for (int i = 0; i < HALF_BLOCK_SIZE; ++i) {
|
||||
arr[i] += l0;
|
||||
arr[HALF_BLOCK_SIZE + i] += l1;
|
||||
}
|
||||
}
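// Editor's note (illustrative, not part of this commit): here each int holds 2 lanes, so a lane
// accumulates 64 deltas; with 10 bits per value that is at most 64 * 1023 = 65472, just under
// the 16-bit lane limit of 65535.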
|
||||
|
||||
private static void prefixSum32(int[] arr, int base) {
|
||||
arr[0] += base;
|
||||
for (int i = 1; i < BLOCK_SIZE; ++i) {
|
||||
arr[i] += arr[i-1];
|
||||
}
|
||||
}
|
||||
|
||||
// For some reason unrolling seems to help
|
||||
private static void innerPrefixSum8(int[] arr) {
|
||||
arr[1] += arr[0];
|
||||
arr[2] += arr[1];
|
||||
arr[3] += arr[2];
|
||||
arr[4] += arr[3];
|
||||
arr[5] += arr[4];
|
||||
arr[6] += arr[5];
|
||||
arr[7] += arr[6];
|
||||
arr[8] += arr[7];
|
||||
arr[9] += arr[8];
|
||||
arr[10] += arr[9];
|
||||
arr[11] += arr[10];
|
||||
arr[12] += arr[11];
|
||||
arr[13] += arr[12];
|
||||
arr[14] += arr[13];
|
||||
arr[15] += arr[14];
|
||||
arr[16] += arr[15];
|
||||
arr[17] += arr[16];
|
||||
arr[18] += arr[17];
|
||||
arr[19] += arr[18];
|
||||
arr[20] += arr[19];
|
||||
arr[21] += arr[20];
|
||||
arr[22] += arr[21];
|
||||
arr[23] += arr[22];
|
||||
arr[24] += arr[23];
|
||||
arr[25] += arr[24];
|
||||
arr[26] += arr[25];
|
||||
arr[27] += arr[26];
|
||||
arr[28] += arr[27];
|
||||
arr[29] += arr[28];
|
||||
arr[30] += arr[29];
|
||||
arr[31] += arr[30];
|
||||
}
|
||||
|
||||
// For some reason unrolling seems to help
|
||||
private static void innerPrefixSum16(int[] arr) {
|
||||
arr[1] += arr[0];
|
||||
arr[2] += arr[1];
|
||||
arr[3] += arr[2];
|
||||
arr[4] += arr[3];
|
||||
arr[5] += arr[4];
|
||||
arr[6] += arr[5];
|
||||
arr[7] += arr[6];
|
||||
arr[8] += arr[7];
|
||||
arr[9] += arr[8];
|
||||
arr[10] += arr[9];
|
||||
arr[11] += arr[10];
|
||||
arr[12] += arr[11];
|
||||
arr[13] += arr[12];
|
||||
arr[14] += arr[13];
|
||||
arr[15] += arr[14];
|
||||
arr[16] += arr[15];
|
||||
arr[17] += arr[16];
|
||||
arr[18] += arr[17];
|
||||
arr[19] += arr[18];
|
||||
arr[20] += arr[19];
|
||||
arr[21] += arr[20];
|
||||
arr[22] += arr[21];
|
||||
arr[23] += arr[22];
|
||||
arr[24] += arr[23];
|
||||
arr[25] += arr[24];
|
||||
arr[26] += arr[25];
|
||||
arr[27] += arr[26];
|
||||
arr[28] += arr[27];
|
||||
arr[29] += arr[28];
|
||||
arr[30] += arr[29];
|
||||
arr[31] += arr[30];
|
||||
arr[32] += arr[31];
|
||||
arr[33] += arr[32];
|
||||
arr[34] += arr[33];
|
||||
arr[35] += arr[34];
|
||||
arr[36] += arr[35];
|
||||
arr[37] += arr[36];
|
||||
arr[38] += arr[37];
|
||||
arr[39] += arr[38];
|
||||
arr[40] += arr[39];
|
||||
arr[41] += arr[40];
|
||||
arr[42] += arr[41];
|
||||
arr[43] += arr[42];
|
||||
arr[44] += arr[43];
|
||||
arr[45] += arr[44];
|
||||
arr[46] += arr[45];
|
||||
arr[47] += arr[46];
|
||||
arr[48] += arr[47];
|
||||
arr[49] += arr[48];
|
||||
arr[50] += arr[49];
|
||||
arr[51] += arr[50];
|
||||
arr[52] += arr[51];
|
||||
arr[53] += arr[52];
|
||||
arr[54] += arr[53];
|
||||
arr[55] += arr[54];
|
||||
arr[56] += arr[55];
|
||||
arr[57] += arr[56];
|
||||
arr[58] += arr[57];
|
||||
arr[59] += arr[58];
|
||||
arr[60] += arr[59];
|
||||
arr[61] += arr[60];
|
||||
arr[62] += arr[61];
|
||||
arr[63] += arr[62];
|
||||
}
|
||||
|
||||
private final int[] tmp = new int[BLOCK_SIZE];
|
||||
|
||||
/**
|
||||
* Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code
|
||||
* ints} are expected to be deltas between consecutive values.
|
||||
*/
|
||||
void encodeDeltas(int[] ints, DataOutput out) throws IOException {
|
||||
if (ints[0] == 1 && PForUtil.allEqual(ints)) { // happens with very dense postings
|
||||
out.writeByte((byte) 0);
|
||||
} else {
|
||||
int or = 0;
|
||||
for (int l : ints) {
|
||||
or |= l;
|
||||
}
|
||||
assert or != 0;
|
||||
final int bitsPerValue = PackedInts.bitsRequired(or);
|
||||
out.writeByte((byte) bitsPerValue);
|
||||
|
||||
final int primitiveSize;
|
||||
if (bitsPerValue <= 3) {
|
||||
primitiveSize = 8;
|
||||
collapse8(ints);
|
||||
} else if (bitsPerValue <= 10) {
|
||||
primitiveSize = 16;
|
||||
collapse16(ints);
|
||||
} else {
|
||||
primitiveSize = 32;
|
||||
}
|
||||
encode(ints, bitsPerValue, primitiveSize, out, tmp);
|
||||
}
|
||||
}
|
||||
|
||||
/** Decode deltas, compute the prefix sum and add {@code base} to all decoded ints. */
|
||||
void decodeAndPrefixSum(PostingDecodingUtil pdu, int base, int[] ints) throws IOException {
|
||||
final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte());
|
||||
if (bitsPerValue == 0) {
|
||||
prefixSumOfOnes(ints, base);
|
||||
} else {
|
||||
decodeAndPrefixSum(bitsPerValue, pdu, base, ints);
|
||||
}
|
||||
}
|
||||
|
||||
"""
|
||||
|
||||
def primitive_size_for_bpv(bpv):
|
||||
if bpv <= 3:
|
||||
# If we have 3 bits per value or less then we can compute the prefix sum of 32 ints that store 4 8-bit values each without overflowing.
|
||||
return 8
|
||||
elif bpv <= 10:
|
||||
# If we have 10 bits per value or less then we can compute the prefix sum of 64 ints that store 2 16-bit values each without overflowing.
|
||||
return 16
|
||||
else:
|
||||
# No risk of overflow with 32 bits per value
|
||||
return 32
|
||||
|
||||
def next_primitive(bpv):
|
||||
if bpv <= 8:
|
||||
return 8
|
||||
elif bpv <= 16:
|
||||
return 16
|
||||
else:
|
||||
return 32
|
||||
|
||||
def writeRemainder(bpv, next_primitive, remaining_bits_per_int, o, num_values, f):
|
||||
iteration = 1
|
||||
num_ints = bpv * num_values / remaining_bits_per_int
|
||||
while num_ints % 2 == 0 and num_values % 2 == 0:
|
||||
num_ints /= 2
|
||||
num_values /= 2
|
||||
iteration *= 2
|
||||
f.write(' for (int iter = 0, tmpIdx = 0, intsIdx = %d; iter < %d; ++iter, tmpIdx += %d, intsIdx += %d) {\n' %(o, iteration, num_ints, num_values))
|
||||
i = 0
|
||||
remaining_bits = 0
|
||||
tmp_idx = 0
|
||||
for i in range(int(num_values)):
|
||||
b = bpv
|
||||
if remaining_bits == 0:
|
||||
b -= remaining_bits_per_int
|
||||
f.write(' int l%d = tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b))
|
||||
else:
|
||||
b -= remaining_bits
|
||||
f.write(' int l%d = (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits, b))
|
||||
tmp_idx += 1
|
||||
while b >= remaining_bits_per_int:
|
||||
b -= remaining_bits_per_int
|
||||
f.write(' l%d |= tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b))
|
||||
tmp_idx += 1
|
||||
if b > 0:
|
||||
f.write(' l%d |= (tmp[tmpIdx + %d] >>> %d) & MASK%d_%d;\n' %(i, tmp_idx, remaining_bits_per_int-b, next_primitive, b))
|
||||
remaining_bits = remaining_bits_per_int-b
|
||||
f.write(' ints[intsIdx + %d] = l%d;\n' %(i, i))
|
||||
f.write(' }\n')
|
||||
|
||||
def writeDecode(bpv, f):
|
||||
next_primitive = primitive_size_for_bpv(bpv)
|
||||
if next_primitive % bpv == 0:
|
||||
f.write(' private static void decode%dTo%d(PostingDecodingUtil pdu, int[] ints) throws IOException {\n' %(bpv, next_primitive))
|
||||
else:
|
||||
f.write(' private static void decode%dTo%d(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {\n' %(bpv, next_primitive))
|
||||
if bpv == next_primitive:
|
||||
f.write(' pdu.in.readInts(ints, 0, %d);\n' %(bpv*4))
|
||||
else:
|
||||
num_values_per_int = 32 / next_primitive
|
||||
remaining_bits = next_primitive % bpv
|
||||
num_iters = (next_primitive - 1) // bpv
|
||||
o = 4 * bpv * num_iters
|
||||
if remaining_bits == 0:
|
||||
f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, ints, %d, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
|
||||
else:
|
||||
f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
|
||||
writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_int - o, f)
|
||||
f.write(' }\n')
|
||||
|
||||
if __name__ == '__main__':
|
||||
f = open(OUTPUT_FILE, 'w')
|
||||
f.write(HEADER)
|
||||
f.write("""
|
||||
/**
|
||||
* Delta-decode 128 integers into {@code ints}.
|
||||
*/
|
||||
void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, int base, int[] ints) throws IOException {
|
||||
switch (bitsPerValue) {
|
||||
""")
|
||||
for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
|
||||
primitive_size = primitive_size_for_bpv(bpv)
|
||||
f.write(' case %d:\n' %bpv)
|
||||
if next_primitive(bpv) == primitive_size:
|
||||
if primitive_size % bpv == 0:
|
||||
f.write(' decode%d(pdu, ints);\n' %bpv)
|
||||
else:
|
||||
f.write(' decode%d(pdu, tmp, ints);\n' %bpv)
|
||||
else:
|
||||
if primitive_size % bpv == 0:
|
||||
f.write(' decode%dTo%d(pdu, ints);\n' %(bpv, primitive_size))
|
||||
else:
|
||||
f.write(' decode%dTo%d(pdu, tmp, ints);\n' %(bpv, primitive_size))
|
||||
f.write(' prefixSum%d(ints, base);\n' %primitive_size)
|
||||
f.write(' break;\n')
|
||||
f.write(' default:\n')
|
||||
f.write(' decodeSlow(bitsPerValue, pdu, tmp, ints);\n')
|
||||
f.write(' prefixSum32(ints, base);\n')
|
||||
f.write(' break;\n')
|
||||
f.write(' }\n')
|
||||
f.write(' }\n')
|
||||
|
||||
f.write('\n')
|
||||
for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
|
||||
if next_primitive(bpv) != primitive_size_for_bpv(bpv):
|
||||
writeDecode(bpv, f)
|
||||
if bpv < MAX_SPECIALIZED_BITS_PER_VALUE:
|
||||
f.write('\n')
|
||||
|
||||
f.write('}\n')
|
|
@ -0,0 +1,327 @@
|
|||
#! /usr/bin/env python
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from math import gcd
|
||||
|
||||
"""Code generation for ForUtil.java"""
|
||||
|
||||
MAX_SPECIALIZED_BITS_PER_VALUE = 24
|
||||
OUTPUT_FILE = "ForUtil.java"
|
||||
PRIMITIVE_SIZE = [8, 16, 32]
|
||||
HEADER = """// This file has been automatically generated, DO NOT EDIT
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene101;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
|
||||
/**
|
||||
* Inspired by https://fulmicoton.com/posts/bitpacking/
|
||||
* Encodes multiple integers in a Java int to get SIMD-like speedups.
|
||||
* If bitsPerValue <= 8 then we pack 4 ints per Java int
|
||||
* else if bitsPerValue <= 16 we pack 2 ints per Java int
|
||||
* else we do scalar operations.
|
||||
*/
|
||||
public final class ForUtil {
|
||||
|
||||
public static final int BLOCK_SIZE = 128;
|
||||
static final int BLOCK_SIZE_LOG2 = 7;
|
||||
|
||||
static int expandMask16(int mask16) {
|
||||
return mask16 | (mask16 << 16);
|
||||
}
|
||||
|
||||
static int expandMask8(int mask8) {
|
||||
return expandMask16(mask8 | (mask8 << 8));
|
||||
}
|
||||
|
||||
static int mask32(int bitsPerValue) {
|
||||
return (1 << bitsPerValue) - 1;
|
||||
}
|
||||
|
||||
static int mask16(int bitsPerValue) {
|
||||
return expandMask16((1 << bitsPerValue) - 1);
|
||||
}
|
||||
|
||||
static int mask8(int bitsPerValue) {
|
||||
return expandMask8((1 << bitsPerValue) - 1);
|
||||
}
|
||||
|
||||
static void expand8(int[] arr) {
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
int l = arr[i];
|
||||
arr[i] = (l >>> 24) & 0xFF;
|
||||
arr[32 + i] = (l >>> 16) & 0xFF;
|
||||
arr[64 + i] = (l >>> 8) & 0xFF;
|
||||
arr[96 + i] = l & 0xFF;
|
||||
}
|
||||
}
|
||||
|
||||
static void collapse8(int[] arr) {
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
arr[i] =
|
||||
(arr[i] << 24)
|
||||
| (arr[32 + i] << 16)
|
||||
| (arr[64 + i] << 8)
|
||||
| arr[96 + i];
|
||||
}
|
||||
}
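// Editor's note (illustrative, not part of this commit): collapse8 folds slots i, 32 + i,
// 64 + i and 96 + i into the four bytes of one int, turning a 128-value block into 32 ints;
// expand8 above is its exact inverse. collapse16/expand16 below do the same with two 16-bit
// halves.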
|
||||
|
||||
static void expand16(int[] arr) {
|
||||
for (int i = 0; i < 64; ++i) {
|
||||
int l = arr[i];
|
||||
arr[i] = (l >>> 16) & 0xFFFF;
|
||||
arr[64 + i] = l & 0xFFFF;
|
||||
}
|
||||
}
|
||||
|
||||
static void collapse16(int[] arr) {
|
||||
for (int i = 0; i < 64; ++i) {
|
||||
arr[i] = (arr[i] << 16) | arr[64 + i];
|
||||
}
|
||||
}
|
||||
|
||||
private final int[] tmp = new int[BLOCK_SIZE];
|
||||
|
||||
/** Encode 128 integers from {@code ints} into {@code out}. */
|
||||
void encode(int[] ints, int bitsPerValue, DataOutput out) throws IOException {
|
||||
final int nextPrimitive;
|
||||
if (bitsPerValue <= 8) {
|
||||
nextPrimitive = 8;
|
||||
collapse8(ints);
|
||||
} else if (bitsPerValue <= 16) {
|
||||
nextPrimitive = 16;
|
||||
collapse16(ints);
|
||||
} else {
|
||||
nextPrimitive = 32;
|
||||
}
|
||||
encode(ints, bitsPerValue, nextPrimitive, out, tmp);
|
||||
}
|
||||
|
||||
static void encode(int[] ints, int bitsPerValue, int primitiveSize, DataOutput out, int[] tmp) throws IOException {
|
||||
final int numInts = BLOCK_SIZE * primitiveSize / Integer.SIZE;
|
||||
|
||||
final int numIntsPerShift = bitsPerValue * 4;
|
||||
int idx = 0;
|
||||
int shift = primitiveSize - bitsPerValue;
|
||||
for (int i = 0; i < numIntsPerShift; ++i) {
|
||||
tmp[i] = ints[idx++] << shift;
|
||||
}
|
||||
for (shift = shift - bitsPerValue; shift >= 0; shift -= bitsPerValue) {
|
||||
for (int i = 0; i < numIntsPerShift; ++i) {
|
||||
tmp[i] |= ints[idx++] << shift;
|
||||
}
|
||||
}
|
||||
|
||||
final int remainingBitsPerInt = shift + bitsPerValue;
|
||||
final int maskRemainingBitsPerInt;
|
||||
if (primitiveSize == 8) {
|
||||
maskRemainingBitsPerInt = MASKS8[remainingBitsPerInt];
|
||||
} else if (primitiveSize == 16) {
|
||||
maskRemainingBitsPerInt = MASKS16[remainingBitsPerInt];
|
||||
} else {
|
||||
maskRemainingBitsPerInt = MASKS32[remainingBitsPerInt];
|
||||
}
|
||||
|
||||
int tmpIdx = 0;
|
||||
int remainingBitsPerValue = bitsPerValue;
|
||||
while (idx < numInts) {
|
||||
if (remainingBitsPerValue >= remainingBitsPerInt) {
|
||||
remainingBitsPerValue -= remainingBitsPerInt;
|
||||
tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & maskRemainingBitsPerInt;
|
||||
if (remainingBitsPerValue == 0) {
|
||||
idx++;
|
||||
remainingBitsPerValue = bitsPerValue;
|
||||
}
|
||||
} else {
|
||||
final int mask1, mask2;
|
||||
if (primitiveSize == 8) {
|
||||
mask1 = MASKS8[remainingBitsPerValue];
|
||||
mask2 = MASKS8[remainingBitsPerInt - remainingBitsPerValue];
|
||||
} else if (primitiveSize == 16) {
|
||||
mask1 = MASKS16[remainingBitsPerValue];
|
||||
mask2 = MASKS16[remainingBitsPerInt - remainingBitsPerValue];
|
||||
} else {
|
||||
mask1 = MASKS32[remainingBitsPerValue];
|
||||
mask2 = MASKS32[remainingBitsPerInt - remainingBitsPerValue];
|
||||
}
|
||||
tmp[tmpIdx] |= (ints[idx++] & mask1) << (remainingBitsPerInt - remainingBitsPerValue);
|
||||
remainingBitsPerValue = bitsPerValue - remainingBitsPerInt + remainingBitsPerValue;
|
||||
tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & mask2;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < numIntsPerShift; ++i) {
|
||||
out.writeInt(tmp[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/** Number of bytes required to encode 128 integers of {@code bitsPerValue} bits per value. */
|
||||
static int numBytes(int bitsPerValue) {
|
||||
return bitsPerValue << (BLOCK_SIZE_LOG2 - 3);
|
||||
}
|
||||
|
||||
static void decodeSlow(int bitsPerValue, PostingDecodingUtil pdu, int[] tmp, int[] ints)
|
||||
throws IOException {
|
||||
final int numInts = bitsPerValue << 2;
|
||||
final int mask = MASKS32[bitsPerValue];
|
||||
pdu.splitInts(numInts, ints, 32 - bitsPerValue, 32, mask, tmp, 0, -1);
|
||||
final int remainingBitsPerInt = 32 - bitsPerValue;
|
||||
final int mask32RemainingBitsPerInt = MASKS32[remainingBitsPerInt];
|
||||
int tmpIdx = 0;
|
||||
int remainingBits = remainingBitsPerInt;
|
||||
for (int intsIdx = numInts; intsIdx < BLOCK_SIZE; ++intsIdx) {
|
||||
int b = bitsPerValue - remainingBits;
|
||||
int l = (tmp[tmpIdx++] & MASKS32[remainingBits]) << b;
|
||||
while (b >= remainingBitsPerInt) {
|
||||
b -= remainingBitsPerInt;
|
||||
l |= (tmp[tmpIdx++] & mask32RemainingBitsPerInt) << b;
|
||||
}
|
||||
if (b > 0) {
|
||||
l |= (tmp[tmpIdx] >>> (remainingBitsPerInt - b)) & MASKS32[b];
|
||||
remainingBits = remainingBitsPerInt - b;
|
||||
} else {
|
||||
remainingBits = remainingBitsPerInt;
|
||||
}
|
||||
ints[intsIdx] = l;
|
||||
}
|
||||
}
|
||||
|
||||
"""
|
||||
|
||||
def writeRemainder(bpv, next_primitive, remaining_bits_per_int, o, num_values, f):
|
||||
iteration = 1
|
||||
num_ints = bpv * num_values / remaining_bits_per_int
|
||||
while num_ints % 2 == 0 and num_values % 2 == 0:
|
||||
num_ints /= 2
|
||||
num_values /= 2
|
||||
iteration *= 2
|
||||
f.write(' for (int iter = 0, tmpIdx = 0, intsIdx = %d; iter < %d; ++iter, tmpIdx += %d, intsIdx += %d) {\n' %(o, iteration, num_ints, num_values))
|
||||
i = 0
|
||||
remaining_bits = 0
|
||||
tmp_idx = 0
|
||||
for i in range(int(num_values)):
|
||||
b = bpv
|
||||
if remaining_bits == 0:
|
||||
b -= remaining_bits_per_int
|
||||
f.write(' int l%d = tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b))
|
||||
else:
|
||||
b -= remaining_bits
|
||||
f.write(' int l%d = (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits, b))
|
||||
tmp_idx += 1
|
||||
while b >= remaining_bits_per_int:
|
||||
b -= remaining_bits_per_int
|
||||
f.write(' l%d |= tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b))
|
||||
tmp_idx += 1
|
||||
if b > 0:
|
||||
f.write(' l%d |= (tmp[tmpIdx + %d] >>> %d) & MASK%d_%d;\n' %(i, tmp_idx, remaining_bits_per_int-b, next_primitive, b))
|
||||
remaining_bits = remaining_bits_per_int-b
|
||||
f.write(' ints[intsIdx + %d] = l%d;\n' %(i, i))
|
||||
f.write(' }\n')
|
||||
|
||||
|
||||
def writeDecode(bpv, f):
|
||||
next_primitive = 32
|
||||
if bpv <= 8:
|
||||
next_primitive = 8
|
||||
elif bpv <= 16:
|
||||
next_primitive = 16
|
||||
if bpv == next_primitive:
|
||||
f.write(' static void decode%d(PostingDecodingUtil pdu, int[] ints) throws IOException {\n' %bpv)
|
||||
f.write(' pdu.in.readInts(ints, 0, %d);\n' %(bpv*4))
|
||||
else:
|
||||
num_values_per_int = 32 / next_primitive
|
||||
remaining_bits = next_primitive % bpv
|
||||
num_iters = (next_primitive - 1) // bpv
|
||||
o = 4 * bpv * num_iters
|
||||
if remaining_bits == 0:
|
||||
f.write(' static void decode%d(PostingDecodingUtil pdu, int[] ints) throws IOException {\n' %bpv)
|
||||
f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, ints, %d, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv))
|
||||
else:
|
||||
f.write(' static void decode%d(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {\n' %bpv)
|
||||
f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv))
|
||||
writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_int - o, f)
|
||||
f.write(' }\n')
|
||||
|
||||
if __name__ == '__main__':
|
||||
f = open(OUTPUT_FILE, 'w')
|
||||
f.write(HEADER)
|
||||
for primitive_size in PRIMITIVE_SIZE:
|
||||
f.write(' static final int[] MASKS%d = new int[%d];\n' %(primitive_size, primitive_size))
|
||||
f.write('\n')
|
||||
f.write(' static {\n')
|
||||
for primitive_size in PRIMITIVE_SIZE:
|
||||
f.write(' for (int i = 0; i < %d; ++i) {\n' %primitive_size)
|
||||
f.write(' MASKS%d[i] = mask%d(i);\n' %(primitive_size, primitive_size))
|
||||
f.write(' }\n')
|
||||
f.write(' }')
|
||||
f.write("""
|
||||
// mark values in the array as final ints to avoid the cost of reading the array; arrays should
|
||||
// only be used when the idx is a variable
|
||||
""")
|
||||
for primitive_size in PRIMITIVE_SIZE:
|
||||
for bpv in range(1, min(MAX_SPECIALIZED_BITS_PER_VALUE + 1, primitive_size)):
|
||||
f.write(' static final int MASK%d_%d = MASKS%d[%d];\n' %(primitive_size, bpv, primitive_size, bpv))
|
||||
|
||||
f.write("""
|
||||
/** Decode 128 integers into {@code ints}. */
|
||||
void decode(int bitsPerValue, PostingDecodingUtil pdu, int[] ints) throws IOException {
|
||||
switch (bitsPerValue) {
|
||||
""")
|
||||
for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
|
||||
next_primitive = 32
|
||||
if bpv <= 8:
|
||||
next_primitive = 8
|
||||
elif bpv <= 16:
|
||||
next_primitive = 16
|
||||
f.write(' case %d:\n' %bpv)
|
||||
if next_primitive % bpv == 0:
|
||||
f.write(' decode%d(pdu, ints);\n' %bpv)
|
||||
else:
|
||||
f.write(' decode%d(pdu, tmp, ints);\n' %bpv)
|
||||
if next_primitive != 32:
|
||||
f.write(' expand%d(ints);\n' %next_primitive)
|
||||
f.write(' break;\n')
|
||||
f.write(' default:\n')
|
||||
f.write(' decodeSlow(bitsPerValue, pdu, tmp, ints);\n')
|
||||
f.write(' break;\n')
|
||||
f.write(' }\n')
|
||||
f.write(' }\n')
|
||||
|
||||
for i in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1):
|
||||
writeDecode(i, f)
|
||||
if i < MAX_SPECIALIZED_BITS_PER_VALUE:
|
||||
f.write('\n')
|
||||
|
||||
f.write('}\n')
|
|
@@ -16,7 +16,7 @@
  */

 /**
- * Lucene 10.0 file format.
+ * Lucene 10.1 file format.
  *
  * <h2>Apache Lucene - Index File Formats</h2>
  *

@@ -151,15 +151,15 @@
 *   field names. These are used to store auxiliary information about the document, such as its
 *   title, url, or an identifier to access a database. The set of stored fields are what is
 *   returned for each hit when searching. This is keyed by document number.
- * <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term dictionary}. A
+ * <li>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term dictionary}. A
 *   dictionary containing all of the terms used in all of the indexed fields of all of the
 *   documents. The dictionary also contains the number of documents which contain the term, and
 *   pointers to the term's frequency and proximity data.
- * <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Frequency data}. For
+ * <li>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Frequency data}. For
 *   each term in the dictionary, the numbers of all the documents that contain that term, and
 *   the frequency of the term in that document, unless frequencies are omitted ({@link
 *   org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
- * <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Proximity data}. For
+ * <li>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Proximity data}. For
 *   each term in the dictionary, the positions that the term occurs in each document. Note that
 *   this will not exist if all fields in all documents omit position data.
 * <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For

@@ -255,27 +255,27 @@
 * <td>The stored fields for documents</td>
 * </tr>
 * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Dictionary}</td>
+ * <td>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Dictionary}</td>
 * <td>.tim</td>
 * <td>The term dictionary, stores term info</td>
 * </tr>
 * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Index}</td>
+ * <td>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Index}</td>
 * <td>.tip</td>
 * <td>The index into the Term Dictionary</td>
 * </tr>
 * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Frequencies}</td>
+ * <td>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Frequencies}</td>
 * <td>.doc</td>
 * <td>Contains the list of docs which contain each term along with frequency</td>
 * </tr>
 * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Positions}</td>
+ * <td>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Positions}</td>
 * <td>.pos</td>
 * <td>Stores position information about where a term occurs in the index</td>
 * </tr>
 * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Payloads}</td>
+ * <td>{@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Payloads}</td>
 * <td>.pay</td>
 * <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
 * </tr>
@@ -416,6 +416,8 @@
 * <li>In version 9.12, skip data was refactored to have only two levels: every 128 docs and every
 *     4,096 docs, and to be inlined in postings lists. This resulted in a speedup for queries that
 *     need skipping, especially conjunctions.
+ * <li>In version 10.1, block encoding changed to be optimized for int[] storage instead of
+ *     long[].
 * </ul>
 *
 * <a id="Limitations"></a>
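A back-of-the-envelope check on the 10.1 note above, as a hedged sketch (256-bit registers assumed; the class name is illustrative; run with --add-modules jdk.incubator.vector):

import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.LongVector;

// A 256-bit register holds 8 ints but only 4 longs, which is why the int[]
// block encoding lets comparisons process twice as many postings at once.
public class LaneCount {
  public static void main(String[] args) {
    System.out.println(IntVector.SPECIES_256.length());  // 8 int lanes
    System.out.println(LongVector.SPECIES_256.length()); // 4 long lanes
  }
}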
@@ -430,4 +432,4 @@
 * <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
 * VInt} values which have no limit. </div>
 */
-package org.apache.lucene.codecs.lucene100;
+package org.apache.lucene.codecs.lucene101;
@@ -199,7 +199,7 @@ final class DefaultVectorUtilSupport implements VectorUtilSupport {
   }

   @Override
-  public int findNextGEQ(long[] buffer, int length, long target, int from) {
+  public int findNextGEQ(int[] buffer, int length, int target, int from) {
     for (int i = from; i < length; ++i) {
       if (buffer[i] >= target) {
         return i;
@@ -34,19 +34,19 @@ public class PostingDecodingUtil {
    * Core methods for decoding blocks of docs / freqs / positions / offsets.
    *
    * <ul>
-   *   <li>Read {@code count} longs.
+   *   <li>Read {@code count} ints.
    *   <li>For all {@code i} >= 0 so that {@code bShift - i * dec} > 0, apply shift {@code
    *       bShift - i * dec} and store the result in {@code b} at offset {@code count * i}.
    *   <li>Apply mask {@code cMask} and store the result in {@code c} starting at offset {@code
    *       cIndex}.
    * </ul>
    */
-  public void splitLongs(
-      int count, long[] b, int bShift, int dec, long bMask, long[] c, int cIndex, long cMask)
+  public void splitInts(
+      int count, int[] b, int bShift, int dec, int bMask, int[] c, int cIndex, int cMask)
       throws IOException {
     // Default implementation, which takes advantage of the C2 compiler's loop unrolling and
     // auto-vectorization.
-    in.readLongs(c, cIndex, count);
+    in.readInts(c, cIndex, count);
     int maxIter = (bShift - 1) / dec;
     for (int i = 0; i < count; ++i) {
       for (int j = 0; j <= maxIter; ++j) {
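A minimal scalar sketch of the splitInts contract spelled out in the javadoc above, assuming the {@code count} ints were already read into src (the method name and the src parameter are illustrative, not part of the API):

// Scalar reference for splitInts; src stands in for the ints that the real
// method reads from its IndexInput.
static void splitIntsReference(
    int count, int[] src, int[] b, int bShift, int dec, int bMask, int[] c, int cIndex, int cMask) {
  int maxIter = (bShift - 1) / dec;
  for (int i = 0; i < count; ++i) {
    for (int j = 0; j <= maxIter; ++j) {
      // Apply shift bShift - j * dec and store in b at offset count * j.
      b[count * j + i] = (src[i] >>> (bShift - j * dec)) & bMask;
    }
    // Apply cMask and store in c starting at offset cIndex.
    c[cIndex + i] = src[i] & cMask;
  }
}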
@@ -51,5 +51,5 @@ public interface VectorUtilSupport {
    * target}. This index is guaranteed to be at least {@code from}. If there is no such array index,
    * {@code length} is returned.
    */
-  int findNextGEQ(long[] buffer, int length, long target, int from);
+  int findNextGEQ(int[] buffer, int length, int target, int from);
 }

@@ -190,8 +190,8 @@ public abstract class VectorizationProvider {
       Set.of(
           "org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil",
           "org.apache.lucene.util.VectorUtil",
-          "org.apache.lucene.codecs.lucene912.Lucene912PostingsReader",
-          "org.apache.lucene.codecs.lucene912.PostingIndexInput");
+          "org.apache.lucene.codecs.lucene101.Lucene101PostingsReader",
+          "org.apache.lucene.codecs.lucene101.PostingIndexInput");

   private static void ensureCaller() {
     final boolean validCaller =

@@ -21,8 +21,8 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Objects;
-import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
-import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
+import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
+import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
 import org.apache.lucene.index.ImpactsEnum;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
@@ -399,10 +399,10 @@ public class PhraseQuery extends Query {
   /**
    * A guess of the average number of simple operations for the initial seek and buffer refill per
    * document for the positions of a term. See also {@link
-   * Lucene912PostingsReader.BlockImpactsPostingsEnum#nextPosition()}.
+   * Lucene101PostingsReader.BlockImpactsPostingsEnum#nextPosition()}.
    *
    * <p>Aside: Instead of being constant this could depend among others on {@link
-   * Lucene912PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
+   * Lucene101PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
    * TermsEnum#totalTermFreq()}, {@link DocIdSetIterator#cost()} (expected number of matching docs),
    * {@link LeafReader#maxDoc()} (total number of docs in the segment), and the seek time and block
    * size of the device storing the index.

@@ -411,7 +411,7 @@ public class PhraseQuery extends Query {

   /**
    * Number of simple operations in {@link
-   * Lucene912PostingsReader.BlockImpactsPostingsEnum#nextPosition()} when no seek or buffer refill
+   * Lucene101PostingsReader.BlockImpactsPostingsEnum#nextPosition()} when no seek or buffer refill
    * is done.
    */
   private static final int TERM_OPS_PER_POS = 7;
@@ -151,7 +151,7 @@ public abstract class BufferedIndexInput extends IndexInput implements RandomAcc
   }

   @Override
-  public void readGroupVInt(long[] dst, int offset) throws IOException {
+  public void readGroupVInt(int[] dst, int offset) throws IOException {
     final int len =
         GroupVIntUtil.readGroupVInt(
             this, buffer.remaining(), p -> buffer.getInt((int) p), buffer.position(), dst, offset);

@@ -204,7 +204,7 @@ public final class ByteBuffersDataInput extends DataInput
   }

   @Override
-  public void readGroupVInt(long[] dst, int offset) throws IOException {
+  public void readGroupVInt(int[] dst, int offset) throws IOException {
     final ByteBuffer block = blocks[blockIndex(pos)];
     final int blockOffset = blockOffset(pos);
     // We MUST save the return value to local variable, could not use pos += readGroupVInt(...).

@@ -206,7 +206,7 @@ public final class ByteBuffersIndexInput extends IndexInput implements RandomAcc
   }

   @Override
-  public void readGroupVInt(long[] dst, int offset) throws IOException {
+  public void readGroupVInt(int[] dst, int offset) throws IOException {
     ensureOpen();
     in.readGroupVInt(dst, offset);
   }

@@ -102,8 +102,10 @@ public abstract class DataInput implements Cloneable {
   /**
    * Override if you have an efficient implementation. In general this is when the input supports
    * random access.
+   *
+   * @lucene.experimental
    */
-  public void readGroupVInt(long[] dst, int offset) throws IOException {
+  public void readGroupVInt(int[] dst, int offset) throws IOException {
     GroupVIntUtil.readGroupVInt(this, dst, offset);
   }
@@ -340,4 +340,19 @@ public abstract class DataOutput {
     }
     GroupVIntUtil.writeGroupVInts(this, groupVIntBytes, values, limit);
   }
+
+  /**
+   * Encode integers using group-varint. It uses {@link DataOutput#writeVInt VInt} to encode tail
+   * values that are not enough for a group.
+   *
+   * @param values the values to write
+   * @param limit the number of values to write.
+   * @lucene.experimental
+   */
+  public void writeGroupVInts(int[] values, int limit) throws IOException {
+    if (groupVIntBytes == null) {
+      groupVIntBytes = new byte[GroupVIntUtil.MAX_LENGTH_PER_GROUP];
+    }
+    GroupVIntUtil.writeGroupVInts(this, groupVIntBytes, values, limit);
+  }
 }
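A round-trip through the int[] overload added above, as a sketch (the file and class names are illustrative; the read side uses the matching GroupVIntUtil helper introduced later in this commit):

import java.io.IOException;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.GroupVIntUtil;

public class GroupVIntRoundTrip {
  public static void main(String[] args) throws IOException {
    int[] values = {3, 1_000, 70_000, 20, 5}; // one full group of 4 plus a tail vint
    try (Directory dir = new ByteBuffersDirectory()) {
      try (IndexOutput out = dir.createOutput("vints.bin", IOContext.DEFAULT)) {
        out.writeGroupVInts(values, values.length);
      }
      int[] restored = new int[values.length];
      try (IndexInput in = dir.openInput("vints.bin", IOContext.READONCE)) {
        GroupVIntUtil.readGroupVInts(in, restored, restored.length);
      }
      // restored now holds {3, 1000, 70000, 20, 5}
    }
  }
}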
@@ -30,7 +30,8 @@ public final class GroupVIntUtil {
   public static final int MAX_LENGTH_PER_GROUP = 17;

   // we use long array instead of int array to make negative integer to be read as positive long.
-  private static final long[] MASKS = new long[] {0xFFL, 0xFFFFL, 0xFFFFFFL, 0xFFFFFFFFL};
+  private static final long[] LONG_MASKS = new long[] {0xFFL, 0xFFFFL, 0xFFFFFFL, 0xFFFFFFFFL};
+  private static final int[] INT_MASKS = new int[] {0xFF, 0xFFFF, 0xFFFFFF, ~0};

   /**
    * Read all the group varints, including the tail vints. we need a long[] because this is what

@@ -43,13 +44,30 @@
   public static void readGroupVInts(DataInput in, long[] dst, int limit) throws IOException {
     int i;
     for (i = 0; i <= limit - 4; i += 4) {
-      in.readGroupVInt(dst, i);
+      readGroupVInt(in, dst, i);
     }
     for (; i < limit; ++i) {
       dst[i] = in.readVInt() & 0xFFFFFFFFL;
     }
   }

+  /**
+   * Read all the group varints, including the tail vints.
+   *
+   * @param dst the array to read ints into.
+   * @param limit the number of int values to read.
+   * @lucene.experimental
+   */
+  public static void readGroupVInts(DataInput in, int[] dst, int limit) throws IOException {
+    int i;
+    for (i = 0; i <= limit - 4; i += 4) {
+      in.readGroupVInt(dst, i);
+    }
+    for (; i < limit; ++i) {
+      dst[i] = in.readVInt();
+    }
+  }
+
   /**
    * Default implementation of read single group, for optimal performance, you should use {@link
    * GroupVIntUtil#readGroupVInts(DataInput, long[], int)} instead.
@@ -66,22 +84,44 @@
     final int n3Minus1 = (flag >> 2) & 0x03;
     final int n4Minus1 = flag & 0x03;

-    dst[offset] = readLongInGroup(in, n1Minus1);
-    dst[offset + 1] = readLongInGroup(in, n2Minus1);
-    dst[offset + 2] = readLongInGroup(in, n3Minus1);
-    dst[offset + 3] = readLongInGroup(in, n4Minus1);
+    dst[offset] = readIntInGroup(in, n1Minus1) & 0xFFFFFFFFL;
+    dst[offset + 1] = readIntInGroup(in, n2Minus1) & 0xFFFFFFFFL;
+    dst[offset + 2] = readIntInGroup(in, n3Minus1) & 0xFFFFFFFFL;
+    dst[offset + 3] = readIntInGroup(in, n4Minus1) & 0xFFFFFFFFL;
   }

-  private static long readLongInGroup(DataInput in, int numBytesMinus1) throws IOException {
+  /**
+   * Default implementation of read single group, for optimal performance, you should use {@link
+   * GroupVIntUtil#readGroupVInts(DataInput, int[], int)} instead.
+   *
+   * @param in the input to use to read data.
+   * @param dst the array to read ints into.
+   * @param offset the offset in the array to start storing ints.
+   */
+  public static void readGroupVInt(DataInput in, int[] dst, int offset) throws IOException {
+    final int flag = in.readByte() & 0xFF;
+
+    final int n1Minus1 = flag >> 6;
+    final int n2Minus1 = (flag >> 4) & 0x03;
+    final int n3Minus1 = (flag >> 2) & 0x03;
+    final int n4Minus1 = flag & 0x03;
+
+    dst[offset] = readIntInGroup(in, n1Minus1);
+    dst[offset + 1] = readIntInGroup(in, n2Minus1);
+    dst[offset + 2] = readIntInGroup(in, n3Minus1);
+    dst[offset + 3] = readIntInGroup(in, n4Minus1);
+  }
+
+  private static int readIntInGroup(DataInput in, int numBytesMinus1) throws IOException {
     switch (numBytesMinus1) {
       case 0:
-        return in.readByte() & 0xFFL;
+        return in.readByte() & 0xFF;
       case 1:
-        return in.readShort() & 0xFFFFL;
+        return in.readShort() & 0xFFFF;
       case 2:
-        return (in.readShort() & 0xFFFFL) | ((in.readByte() & 0xFFL) << 16);
+        return (in.readShort() & 0xFFFF) | ((in.readByte() & 0xFF) << 16);
       default:
-        return in.readInt() & 0xFFFFFFFFL;
+        return in.readInt();
     }
   }
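To make the flag byte concrete: each group opens with one byte holding four 2-bit fields, each the byte length of a value minus one. A worked example with illustrative values:

// flag = 0b10_00_01_11 encodes byte lengths 3, 1, 2 and 4 for the group.
int flag = 0b10_00_01_11;
assert (flag >> 6) + 1 == 3;          // first value stored on 3 bytes
assert ((flag >> 4) & 0x03) + 1 == 1; // second value stored on 1 byte
assert ((flag >> 2) & 0x03) + 1 == 2; // third value stored on 2 bytes
assert (flag & 0x03) + 1 == 4;        // fourth value stored on 4 bytes
// Worst case per group: 1 flag byte + 4 * 4 value bytes = 17 bytes,
// which is exactly MAX_LENGTH_PER_GROUP.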
@@ -123,13 +163,53 @@
     final int n4Minus1 = flag & 0x03;

     // This code path has fewer conditionals and tends to be significantly faster in benchmarks
-    dst[offset] = reader.read(pos) & MASKS[n1Minus1];
+    dst[offset] = reader.read(pos) & LONG_MASKS[n1Minus1];
     pos += 1 + n1Minus1;
-    dst[offset + 1] = reader.read(pos) & MASKS[n2Minus1];
+    dst[offset + 1] = reader.read(pos) & LONG_MASKS[n2Minus1];
     pos += 1 + n2Minus1;
-    dst[offset + 2] = reader.read(pos) & MASKS[n3Minus1];
+    dst[offset + 2] = reader.read(pos) & LONG_MASKS[n3Minus1];
     pos += 1 + n3Minus1;
-    dst[offset + 3] = reader.read(pos) & MASKS[n4Minus1];
+    dst[offset + 3] = reader.read(pos) & LONG_MASKS[n4Minus1];
     pos += 1 + n4Minus1;
     return (int) (pos - posStart);
   }

+  /**
+   * Faster implementation of read single group, It read values from the buffer that would not cross
+   * boundaries.
+   *
+   * @param in the input to use to read data.
+   * @param remaining the number of remaining bytes allowed to read for current block/segment.
+   * @param reader the supplier of read int.
+   * @param pos the start pos to read from the reader.
+   * @param dst the array to read ints into.
+   * @param offset the offset in the array to start storing ints.
+   * @return the number of bytes read excluding the flag. this indicates the number of positions
+   *     should to be increased for caller, it is 0 or positive number and less than {@link
+   *     #MAX_LENGTH_PER_GROUP}
+   */
+  public static int readGroupVInt(
+      DataInput in, long remaining, IntReader reader, long pos, int[] dst, int offset)
+      throws IOException {
+    if (remaining < MAX_LENGTH_PER_GROUP) {
+      readGroupVInt(in, dst, offset);
+      return 0;
+    }
+    final int flag = in.readByte() & 0xFF;
+    final long posStart = ++pos; // exclude the flag bytes, the position has updated via readByte().
+    final int n1Minus1 = flag >> 6;
+    final int n2Minus1 = (flag >> 4) & 0x03;
+    final int n3Minus1 = (flag >> 2) & 0x03;
+    final int n4Minus1 = flag & 0x03;
+
+    // This code path has fewer conditionals and tends to be significantly faster in benchmarks
+    dst[offset] = reader.read(pos) & INT_MASKS[n1Minus1];
+    pos += 1 + n1Minus1;
+    dst[offset + 1] = reader.read(pos) & INT_MASKS[n2Minus1];
+    pos += 1 + n2Minus1;
+    dst[offset + 2] = reader.read(pos) & INT_MASKS[n3Minus1];
+    pos += 1 + n3Minus1;
+    dst[offset + 3] = reader.read(pos) & INT_MASKS[n4Minus1];
+    pos += 1 + n4Minus1;
+    return (int) (pos - posStart);
+  }
@@ -180,4 +260,39 @@ public final class GroupVIntUtil {
       out.writeVInt(toInt(values[readPos]));
     }
   }
+
+  /**
+   * The implementation for group-varint encoding, It uses a maximum of {@link
+   * #MAX_LENGTH_PER_GROUP} bytes scratch buffer.
+   */
+  public static void writeGroupVInts(DataOutput out, byte[] scratch, int[] values, int limit)
+      throws IOException {
+    int readPos = 0;
+
+    // encode each group
+    while ((limit - readPos) >= 4) {
+      int writePos = 0;
+      final int n1Minus1 = numBytes(values[readPos]) - 1;
+      final int n2Minus1 = numBytes(values[readPos + 1]) - 1;
+      final int n3Minus1 = numBytes(values[readPos + 2]) - 1;
+      final int n4Minus1 = numBytes(values[readPos + 3]) - 1;
+      int flag = (n1Minus1 << 6) | (n2Minus1 << 4) | (n3Minus1 << 2) | (n4Minus1);
+      scratch[writePos++] = (byte) flag;
+      BitUtil.VH_LE_INT.set(scratch, writePos, values[readPos++]);
+      writePos += n1Minus1 + 1;
+      BitUtil.VH_LE_INT.set(scratch, writePos, values[readPos++]);
+      writePos += n2Minus1 + 1;
+      BitUtil.VH_LE_INT.set(scratch, writePos, values[readPos++]);
+      writePos += n3Minus1 + 1;
+      BitUtil.VH_LE_INT.set(scratch, writePos, values[readPos++]);
+      writePos += n4Minus1 + 1;
+
+      out.writeBytes(scratch, writePos);
+    }
+
+    // tail vints
+    for (; readPos < limit; readPos++) {
+      out.writeVInt(toInt(values[readPos]));
+    }
+  }
 }
@@ -314,7 +314,7 @@ public final class VectorUtil {
    * target}. This index is guaranteed to be at least {@code from}. If there is no such array index,
    * {@code length} is returned.
    */
-  public static int findNextGEQ(long[] buffer, int length, long target, int from) {
+  public static int findNextGEQ(int[] buffer, int length, int target, int from) {
     return IMPL.findNextGEQ(buffer, length, target, from);
   }
 }
@@ -19,15 +19,15 @@ package org.apache.lucene.internal.vectorization;
 import java.io.IOException;
 import java.lang.foreign.MemorySegment;
 import java.nio.ByteOrder;
-import jdk.incubator.vector.LongVector;
+import jdk.incubator.vector.IntVector;
 import jdk.incubator.vector.VectorOperators;
 import jdk.incubator.vector.VectorSpecies;
 import org.apache.lucene.store.IndexInput;

 final class MemorySegmentPostingDecodingUtil extends PostingDecodingUtil {

-  private static final VectorSpecies<Long> LONG_SPECIES =
-      PanamaVectorConstants.PRERERRED_LONG_SPECIES;
+  private static final VectorSpecies<Integer> INT_SPECIES =
+      PanamaVectorConstants.PRERERRED_INT_SPECIES;

   private final MemorySegment memorySegment;
@@ -37,7 +37,7 @@ final class MemorySegmentPostingDecodingUtil extends PostingDecodingUtil {
   }

   private static void shift(
-      LongVector vector, int bShift, int dec, int maxIter, long bMask, long[] b, int count, int i) {
+      IntVector vector, int bShift, int dec, int maxIter, int bMask, int[] b, int count, int i) {
     for (int j = 0; j <= maxIter; ++j) {
       vector
           .lanewise(VectorOperators.LSHR, bShift - j * dec)
@@ -47,36 +47,35 @@ final class MemorySegmentPostingDecodingUtil extends PostingDecodingUtil {
   }

   @Override
-  public void splitLongs(
-      int count, long[] b, int bShift, int dec, long bMask, long[] c, int cIndex, long cMask)
+  public void splitInts(
+      int count, int[] b, int bShift, int dec, int bMask, int[] c, int cIndex, int cMask)
       throws IOException {
-    if (count < LONG_SPECIES.length()) {
+    if (count < INT_SPECIES.length()) {
       // Not enough data to vectorize without going out-of-bounds. In practice, this branch is never
       // used if the bit width is 256, and is used for 2 and 3 bits per value if the bit width is
       // 512.
-      super.splitLongs(count, b, bShift, dec, bMask, c, cIndex, cMask);
+      super.splitInts(count, b, bShift, dec, bMask, c, cIndex, cMask);
       return;
     }

     int maxIter = (bShift - 1) / dec;
     long offset = in.getFilePointer();
-    long endOffset = offset + count * Long.BYTES;
-    int loopBound = LONG_SPECIES.loopBound(count - 1);
+    long endOffset = offset + count * Integer.BYTES;
+    int loopBound = INT_SPECIES.loopBound(count - 1);
     for (int i = 0;
         i < loopBound;
-        i += LONG_SPECIES.length(), offset += LONG_SPECIES.length() * Long.BYTES) {
-      LongVector vector =
-          LongVector.fromMemorySegment(
-              LONG_SPECIES, memorySegment, offset, ByteOrder.LITTLE_ENDIAN);
+        i += INT_SPECIES.length(), offset += INT_SPECIES.length() * Integer.BYTES) {
+      IntVector vector =
+          IntVector.fromMemorySegment(INT_SPECIES, memorySegment, offset, ByteOrder.LITTLE_ENDIAN);
       shift(vector, bShift, dec, maxIter, bMask, b, count, i);
       vector.lanewise(VectorOperators.AND, cMask).intoArray(c, cIndex + i);
     }

     // Handle the tail by reading a vector that is aligned with `count` on the right side.
-    int i = count - LONG_SPECIES.length();
-    offset = endOffset - LONG_SPECIES.length() * Long.BYTES;
-    LongVector vector =
-        LongVector.fromMemorySegment(LONG_SPECIES, memorySegment, offset, ByteOrder.LITTLE_ENDIAN);
+    int i = count - INT_SPECIES.length();
+    offset = endOffset - INT_SPECIES.length() * Integer.BYTES;
+    IntVector vector =
+        IntVector.fromMemorySegment(INT_SPECIES, memorySegment, offset, ByteOrder.LITTLE_ENDIAN);
     shift(vector, bShift, dec, maxIter, bMask, b, count, i);
     vector.lanewise(VectorOperators.AND, cMask).intoArray(c, cIndex + i);
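The right-aligned tail read above trades a scalar remainder loop for a little duplicated work; a sketch of the index arithmetic, with illustrative numbers (8-lane species, 13-int block):

int count = 13; // illustrative block size
int lanes = 8;  // illustrative species length
// INT_SPECIES.loopBound(count - 1) rounds 12 down to a multiple of 8:
int loopBound = ((count - 1) / lanes) * lanes; // = 8, so the main loop covers lanes 0..7
int tailStart = count - lanes;                 // = 5, so the tail vector covers lanes 5..12
// Lanes 5..7 are processed twice; this is safe because the shift-and-mask
// computation writes the same values both times.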
@@ -29,7 +29,6 @@ import java.lang.foreign.MemorySegment;
 import jdk.incubator.vector.ByteVector;
 import jdk.incubator.vector.FloatVector;
 import jdk.incubator.vector.IntVector;
-import jdk.incubator.vector.LongVector;
 import jdk.incubator.vector.ShortVector;
 import jdk.incubator.vector.Vector;
 import jdk.incubator.vector.VectorMask;
@@ -59,7 +58,6 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport {
       PanamaVectorConstants.PRERERRED_INT_SPECIES;
   private static final VectorSpecies<Byte> BYTE_SPECIES;
   private static final VectorSpecies<Short> SHORT_SPECIES;
-  private static final VectorSpecies<Long> LONG_SPECIES;

   static final int VECTOR_BITSIZE;
@@ -75,7 +73,6 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport {
       BYTE_SPECIES = null;
       SHORT_SPECIES = null;
     }
-    LONG_SPECIES = PanamaVectorConstants.PRERERRED_LONG_SPECIES;
   }

   // the way FMA should work! if available use it, otherwise fall back to mul/add
@@ -767,17 +764,17 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport {
     return acc1.add(acc2).reduceLanes(ADD);
   }

-  // Experiments suggest that we need at least 4 lanes so that the overhead of going with the vector
+  // Experiments suggest that we need at least 8 lanes so that the overhead of going with the vector
   // approach and counting trues on vector masks pays off.
-  private static final boolean ENABLE_FIND_NEXT_GEQ_VECTOR_OPTO = LONG_SPECIES.length() >= 4;
+  private static final boolean ENABLE_FIND_NEXT_GEQ_VECTOR_OPTO = INT_SPECIES.length() >= 8;

   @Override
-  public int findNextGEQ(long[] buffer, int length, long target, int from) {
+  public int findNextGEQ(int[] buffer, int length, int target, int from) {
     if (ENABLE_FIND_NEXT_GEQ_VECTOR_OPTO) {
-      for (; from + LONG_SPECIES.length() < length; from += LONG_SPECIES.length() + 1) {
-        if (buffer[from + LONG_SPECIES.length()] >= target) {
-          LongVector vector = LongVector.fromArray(LONG_SPECIES, buffer, from);
-          VectorMask<Long> mask = vector.compare(VectorOperators.LT, target);
+      for (; from + INT_SPECIES.length() < length; from += INT_SPECIES.length() + 1) {
+        if (buffer[from + INT_SPECIES.length()] >= target) {
+          IntVector vector = IntVector.fromArray(INT_SPECIES, buffer, from);
+          VectorMask<Integer> mask = vector.compare(VectorOperators.LT, target);
           return from + mask.trueCount();
         }
       }
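Why from + mask.trueCount() is the first index >= target: postings buffers are sorted, so within a window whose one-past-the-end element is known to be >= target, the values below target form a prefix. A scalar equivalent with illustrative data:

int[] window = {3, 7, 9, 14, 20, 25, 31, 40}; // sorted, like doc IDs
int target = 14;
int lt = 0;
for (int v : window) {
  if (v < target) {
    lt++; // the vector mask's trueCount() computes this in one step
  }
}
assert lt == 3 && window[lt] >= target; // window[3] == 14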
@@ -419,7 +419,7 @@ abstract class MemorySegmentIndexInput extends IndexInput
   }

   @Override
-  public void readGroupVInt(long[] dst, int offset) throws IOException {
+  public void readGroupVInt(int[] dst, int offset) throws IOException {
     try {
       final int len =
           GroupVIntUtil.readGroupVInt(
@@ -13,4 +13,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-org.apache.lucene.codecs.lucene100.Lucene100Codec
+org.apache.lucene.codecs.lucene101.Lucene101Codec

@@ -13,4 +13,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat
+org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat
@@ -0,0 +1,92 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene101;

import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.packed.PackedInts;

public class TestForDeltaUtil extends LuceneTestCase {

  public void testEncodeDecode() throws IOException {
    final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000);
    final int[] values = new int[iterations * ForUtil.BLOCK_SIZE];

    for (int i = 0; i < iterations; ++i) {
      final int bpv = TestUtil.nextInt(random(), 1, 31 - 7);
      for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
        values[i * ForUtil.BLOCK_SIZE + j] =
            RandomNumbers.randomIntBetween(random(), 1, (int) PackedInts.maxValue(bpv));
      }
    }

    final Directory d = new ByteBuffersDirectory();
    final long endPointer;

    {
      // encode
      IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT);
      final ForDeltaUtil forDeltaUtil = new ForDeltaUtil();

      for (int i = 0; i < iterations; ++i) {
        int[] source = new int[ForUtil.BLOCK_SIZE];
        for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
          source[j] = values[i * ForUtil.BLOCK_SIZE + j];
        }
        forDeltaUtil.encodeDeltas(source, out);
      }
      endPointer = out.getFilePointer();
      out.close();
    }

    {
      // decode
      IndexInput in = d.openInput("test.bin", IOContext.READONCE);
      PostingDecodingUtil pdu =
          Lucene101PostingsReader.VECTORIZATION_PROVIDER.newPostingDecodingUtil(in);
      ForDeltaUtil forDeltaUtil = new ForDeltaUtil();
      for (int i = 0; i < iterations; ++i) {
        int base = 0;
        final int[] restored = new int[ForUtil.BLOCK_SIZE];
        forDeltaUtil.decodeAndPrefixSum(pdu, base, restored);
        final int[] expected = new int[ForUtil.BLOCK_SIZE];
        for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
          expected[j] = values[i * ForUtil.BLOCK_SIZE + j];
          if (j > 0) {
            expected[j] += expected[j - 1];
          } else {
            expected[j] += base;
          }
        }
        assertArrayEquals(Arrays.toString(restored), expected, restored);
      }
      assertEquals(endPointer, in.getFilePointer());
      in.close();
    }

    d.close();
  }
}
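The expected-value loop in the test mirrors what decodeAndPrefixSum does; in miniature, with illustrative numbers:

// Deltas {3, 1, 4, 2} with base 0 prefix-sum to doc IDs {3, 4, 8, 10}.
int base = 0;
int[] deltas = {3, 1, 4, 2};
int[] docs = new int[deltas.length];
int acc = base;
for (int i = 0; i < deltas.length; i++) {
  acc += deltas[i];
  docs[i] = acc;
}
assert java.util.Arrays.equals(docs, new int[] {3, 4, 8, 10});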
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene912;
+package org.apache.lucene.codecs.lucene101;

 import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
 import java.io.IOException;

@@ -53,7 +53,7 @@ public class TestForUtil extends LuceneTestCase {
       final ForUtil forUtil = new ForUtil();

       for (int i = 0; i < iterations; ++i) {
-        long[] source = new long[ForUtil.BLOCK_SIZE];
+        int[] source = new int[ForUtil.BLOCK_SIZE];
         long or = 0;
         for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
           source[j] = values[i * ForUtil.BLOCK_SIZE + j];

@@ -71,12 +71,12 @@ public class TestForUtil extends LuceneTestCase {
       // decode
       IndexInput in = d.openInput("test.bin", IOContext.READONCE);
       PostingDecodingUtil pdu =
-          Lucene912PostingsReader.VECTORIZATION_PROVIDER.newPostingDecodingUtil(in);
+          Lucene101PostingsReader.VECTORIZATION_PROVIDER.newPostingDecodingUtil(in);
       ForUtil forUtil = new ForUtil();
       for (int i = 0; i < iterations; ++i) {
         final int bitsPerValue = in.readByte();
         final long currentFilePointer = in.getFilePointer();
-        final long[] restored = new long[ForUtil.BLOCK_SIZE];
+        final int[] restored = new int[ForUtil.BLOCK_SIZE];
         forUtil.decode(bitsPerValue, pdu, restored);
         int[] ints = new int[ForUtil.BLOCK_SIZE];
         for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
@@ -0,0 +1,157 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene101;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader.MutableImpactList;
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
import org.apache.lucene.codecs.lucene90.blocktree.Stats;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.index.BasePostingsFormatTestCase;
import org.apache.lucene.tests.util.TestUtil;

public class TestLucene101PostingsFormat extends BasePostingsFormatTestCase {

  @Override
  protected Codec getCodec() {
    return TestUtil.alwaysPostingsFormat(new Lucene101PostingsFormat());
  }

  public void testVInt15() throws IOException {
    byte[] bytes = new byte[5];
    ByteArrayDataOutput out = new ByteArrayDataOutput(bytes);
    ByteArrayDataInput in = new ByteArrayDataInput();
    for (int i : new int[] {0, 1, 127, 128, 32767, 32768, Integer.MAX_VALUE}) {
      out.reset(bytes);
      Lucene101PostingsWriter.writeVInt15(out, i);
      in.reset(bytes, 0, out.getPosition());
      assertEquals(i, Lucene101PostingsReader.readVInt15(in));
      assertEquals(out.getPosition(), in.getPosition());
    }
  }

  public void testVLong15() throws IOException {
    byte[] bytes = new byte[9];
    ByteArrayDataOutput out = new ByteArrayDataOutput(bytes);
    ByteArrayDataInput in = new ByteArrayDataInput();
    for (long i : new long[] {0, 1, 127, 128, 32767, 32768, Integer.MAX_VALUE, Long.MAX_VALUE}) {
      out.reset(bytes);
      Lucene101PostingsWriter.writeVLong15(out, i);
      in.reset(bytes, 0, out.getPosition());
      assertEquals(i, Lucene101PostingsReader.readVLong15(in));
      assertEquals(out.getPosition(), in.getPosition());
    }
  }

  /** Make sure the final sub-block(s) are not skipped. */
  public void testFinalBlock() throws Exception {
    Directory d = newDirectory();
    IndexWriter w = new IndexWriter(d, new IndexWriterConfig(new MockAnalyzer(random())));
    for (int i = 0; i < 25; i++) {
      Document doc = new Document();
      doc.add(newStringField("field", Character.toString((char) (97 + i)), Field.Store.NO));
      doc.add(newStringField("field", "z" + Character.toString((char) (97 + i)), Field.Store.NO));
      w.addDocument(doc);
    }
    w.forceMerge(1);

    DirectoryReader r = DirectoryReader.open(w);
    assertEquals(1, r.leaves().size());
    FieldReader field = (FieldReader) r.leaves().get(0).reader().terms("field");
    // We should see exactly two blocks: one root block (prefix empty string) and one block for z*
    // terms (prefix z):
    Stats stats = field.getStats();
    assertEquals(0, stats.floorBlockCount);
    assertEquals(2, stats.nonFloorBlockCount);
    r.close();
    w.close();
    d.close();
  }

  public void testImpactSerialization() throws IOException {
    // omit norms and omit freqs
    doTestImpactSerialization(Collections.singletonList(new Impact(1, 1L)));

    // omit freqs
    doTestImpactSerialization(Collections.singletonList(new Impact(1, 42L)));
    // omit freqs with very large norms
    doTestImpactSerialization(Collections.singletonList(new Impact(1, -100L)));

    // omit norms
    doTestImpactSerialization(Collections.singletonList(new Impact(30, 1L)));
    // omit norms with large freq
    doTestImpactSerialization(Collections.singletonList(new Impact(500, 1L)));

    // freqs and norms, basic
    doTestImpactSerialization(
        Arrays.asList(
            new Impact(1, 7L),
            new Impact(3, 9L),
            new Impact(7, 10L),
            new Impact(15, 11L),
            new Impact(20, 13L),
            new Impact(28, 14L)));

    // freqs and norms, high values
    doTestImpactSerialization(
        Arrays.asList(
            new Impact(2, 2L),
            new Impact(10, 10L),
            new Impact(12, 50L),
            new Impact(50, -100L),
            new Impact(1000, -80L),
            new Impact(1005, -3L)));
  }

  private void doTestImpactSerialization(List<Impact> impacts) throws IOException {
    CompetitiveImpactAccumulator acc = new CompetitiveImpactAccumulator();
    for (Impact impact : impacts) {
      acc.add(impact.freq, impact.norm);
    }
    try (Directory dir = newDirectory()) {
      try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
        Lucene101PostingsWriter.writeImpacts(acc.getCompetitiveFreqNormPairs(), out);
      }
      try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
        byte[] b = new byte[Math.toIntExact(in.length())];
        in.readBytes(b, 0, b.length);
        List<Impact> impacts2 =
            Lucene101PostingsReader.readImpacts(
                new ByteArrayDataInput(b),
                new MutableImpactList(impacts.size() + random().nextInt(3)));
        assertEquals(impacts, impacts2);
      }
    }
  }
}
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene912;
+package org.apache.lucene.codecs.lucene101;

 import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
 import java.io.IOException;

@@ -41,14 +41,14 @@ public class TestPForUtil extends LuceneTestCase {

     IndexInput in = d.openInput("test.bin", IOContext.READONCE);
     PostingDecodingUtil pdu =
-        Lucene912PostingsReader.VECTORIZATION_PROVIDER.newPostingDecodingUtil(in);
+        Lucene101PostingsReader.VECTORIZATION_PROVIDER.newPostingDecodingUtil(in);
     final PForUtil pforUtil = new PForUtil();
     for (int i = 0; i < iterations; ++i) {
       if (random().nextInt(5) == 0) {
         PForUtil.skip(in);
         continue;
       }
-      final long[] restored = new long[ForUtil.BLOCK_SIZE];
+      final int[] restored = new int[ForUtil.BLOCK_SIZE];
       pforUtil.decode(pdu, restored);
       int[] ints = new int[ForUtil.BLOCK_SIZE];
       for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {

@@ -93,7 +93,7 @@ public class TestPForUtil extends LuceneTestCase {
     final PForUtil pforUtil = new PForUtil();

     for (int i = 0; i < iterations; ++i) {
-      long[] source = new long[ForUtil.BLOCK_SIZE];
+      int[] source = new int[ForUtil.BLOCK_SIZE];
       for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) {
         source[j] = values[i * ForUtil.BLOCK_SIZE + j];
       }
@@ -0,0 +1,49 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene101;

import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.tests.util.LuceneTestCase;

public class TestPostingsUtil extends LuceneTestCase {

  // checks for bug described in https://github.com/apache/lucene/issues/13373
  public void testIntegerOverflow() throws IOException {
    final int size = random().nextInt(1, ForUtil.BLOCK_SIZE);
    final int[] docDeltaBuffer = new int[size];
    final int[] freqBuffer = new int[size];

    final int delta = 1 << 30;
    docDeltaBuffer[0] = delta;
    try (Directory dir = newDirectory()) {
      try (IndexOutput out = dir.createOutput("test", IOContext.DEFAULT)) {
        // In old implementation, this would cause integer overflow exception.
        PostingsUtil.writeVIntBlock(out, docDeltaBuffer, freqBuffer, size, true);
      }
      int[] restoredDocs = new int[size];
      int[] restoredFreqs = new int[size];
      try (IndexInput in = dir.openInput("test", IOContext.DEFAULT)) {
        PostingsUtil.readVIntBlock(in, restoredDocs, restoredFreqs, size, true, true);
      }
      assertEquals(delta, restoredDocs[0]);
    }
  }
}
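The overflow this test guards against, in miniature; a sketch of the failure mode, not the actual writeVIntBlock body:

// Packing a doc delta with a 1-bit flag overflows a 32-bit int once the
// delta reaches 1 << 30, but survives when widened to long first.
int delta = 1 << 30;
int packedBad = (delta << 1) | 1;          // delta << 1 wraps to Integer.MIN_VALUE
long packedOk = ((long) delta << 1) | 1L;  // 0x80000001L, value preserved
assert packedBad == Integer.MIN_VALUE + 1;
assert packedOk == 0x80000001L;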
@@ -18,7 +18,7 @@ package org.apache.lucene.codecs.lucene90;

 import com.carrotsearch.randomizedtesting.generators.RandomPicks;
 import org.apache.lucene.codecs.Codec;
-import org.apache.lucene.codecs.lucene100.Lucene100Codec;
+import org.apache.lucene.codecs.lucene101.Lucene101Codec;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.StoredField;
 import org.apache.lucene.index.DirectoryReader;

@@ -31,7 +31,7 @@ import org.apache.lucene.tests.index.BaseStoredFieldsFormatTestCase;
 public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase {
   @Override
   protected Codec getCodec() {
-    return new Lucene100Codec(Lucene100Codec.Mode.BEST_COMPRESSION);
+    return new Lucene101Codec(Lucene101Codec.Mode.BEST_COMPRESSION);
   }

   /**

@@ -42,7 +42,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFie
     for (int i = 0; i < 10; i++) {
       IndexWriterConfig iwc = newIndexWriterConfig();
       iwc.setCodec(
-          new Lucene100Codec(RandomPicks.randomFrom(random(), Lucene100Codec.Mode.values())));
+          new Lucene101Codec(RandomPicks.randomFrom(random(), Lucene101Codec.Mode.values())));
       IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig());
       Document doc = new Document();
       doc.add(new StoredField("field1", "value1"));

@@ -72,7 +72,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFie
     expectThrows(
         NullPointerException.class,
         () -> {
-          new Lucene100Codec(null);
+          new Lucene101Codec(null);
         });

     expectThrows(
@@ -28,7 +28,7 @@ import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.FilterCodec;
 import org.apache.lucene.codecs.KnnVectorsFormat;
 import org.apache.lucene.codecs.KnnVectorsReader;
-import org.apache.lucene.codecs.lucene100.Lucene100Codec;
+import org.apache.lucene.codecs.lucene101.Lucene101Codec;
 import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.KnnFloatVectorField;

@@ -74,7 +74,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat

   @Override
   protected Codec getCodec() {
-    return new Lucene100Codec() {
+    return new Lucene101Codec() {
       @Override
       public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
         return format;

@@ -106,7 +106,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat
             dir,
             newIndexWriterConfig()
                 .setCodec(
-                    new Lucene100Codec() {
+                    new Lucene101Codec() {
                       @Override
                       public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
                         return getKnnFormat(4);

@@ -126,7 +126,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat
             dir,
             newIndexWriterConfig()
                 .setCodec(
-                    new Lucene100Codec() {
+                    new Lucene101Codec() {
                       @Override
                       public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
                         return getKnnFormat(7);

@@ -163,7 +163,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat
             dir,
             newIndexWriterConfig()
                 .setCodec(
-                    new Lucene100Codec() {
+                    new Lucene101Codec() {
                       @Override
                       public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
                         return new Lucene99HnswVectorsFormat();

@@ -183,7 +183,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat
             dir,
             newIndexWriterConfig()
                 .setCodec(
-                    new Lucene100Codec() {
+                    new Lucene101Codec() {
                       @Override
                       public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
                         return getKnnFormat(7);

@@ -216,7 +216,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat
             dir,
             newIndexWriterConfig()
                 .setCodec(
-                    new Lucene100Codec() {
+                    new Lucene101Codec() {
                       @Override
                       public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
                         return new Lucene99HnswScalarQuantizedVectorsFormat(
@@ -27,7 +27,7 @@ import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.KnnVectorsFormat;
 import org.apache.lucene.codecs.KnnVectorsReader;
 import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
-import org.apache.lucene.codecs.lucene100.Lucene100Codec;
+import org.apache.lucene.codecs.lucene101.Lucene101Codec;
 import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;

@@ -52,7 +52,7 @@ import org.apache.lucene.util.quantization.ScalarQuantizer;
 public class TestLucene99ScalarQuantizedVectorScorer extends LuceneTestCase {

   private static Codec getCodec(int bits, boolean compress) {
-    return new Lucene100Codec() {
+    return new Lucene101Codec() {
       @Override
       public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
         return new Lucene99HnswScalarQuantizedVectorsFormat(

@@ -28,7 +28,7 @@ import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.FilterCodec;
 import org.apache.lucene.codecs.KnnVectorsFormat;
 import org.apache.lucene.codecs.KnnVectorsReader;
-import org.apache.lucene.codecs.lucene100.Lucene100Codec;
+import org.apache.lucene.codecs.lucene101.Lucene101Codec;
 import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.KnnFloatVectorField;

@@ -70,7 +70,7 @@ public class TestLucene99ScalarQuantizedVectorsFormat extends BaseKnnVectorsForm

   @Override
   protected Codec getCodec() {
-    return new Lucene100Codec() {
+    return new Lucene101Codec() {
       @Override
       public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
         return format;
@@ -16,7 +16,7 @@
  */
 package org.apache.lucene.internal.vectorization;

-import org.apache.lucene.codecs.lucene912.ForUtil;
+import org.apache.lucene.codecs.lucene101.ForUtil;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;

@@ -27,7 +27,7 @@ import org.apache.lucene.tests.util.TestUtil;

 public class TestPostingDecodingUtil extends LuceneTestCase {

-  public void testDuelSplitLongs() throws Exception {
+  public void testDuelSplitInts() throws Exception {
     final int iterations = atLeast(100);

     try (Directory dir = new MMapDirectory(createTempDir())) {

@@ -39,25 +39,25 @@ public class TestPostingDecodingUtil extends LuceneTestCase {
       }
       VectorizationProvider vectorizationProvider = VectorizationProvider.lookup(true);
       try (IndexInput in = dir.openInput("tests.bin", IOContext.DEFAULT)) {
-        long[] expectedB = new long[ForUtil.BLOCK_SIZE];
-        long[] expectedC = new long[ForUtil.BLOCK_SIZE];
-        long[] actualB = new long[ForUtil.BLOCK_SIZE];
-        long[] actualC = new long[ForUtil.BLOCK_SIZE];
+        int[] expectedB = new int[ForUtil.BLOCK_SIZE];
+        int[] expectedC = new int[ForUtil.BLOCK_SIZE];
+        int[] actualB = new int[ForUtil.BLOCK_SIZE];
+        int[] actualC = new int[ForUtil.BLOCK_SIZE];
         for (int iter = 0; iter < iterations; ++iter) {
           // Initialize arrays with random content.
           for (int i = 0; i < expectedB.length; ++i) {
-            expectedB[i] = random().nextLong();
+            expectedB[i] = random().nextInt();
             actualB[i] = expectedB[i];
-            expectedC[i] = random().nextLong();
+            expectedC[i] = random().nextInt();
             actualC[i] = expectedC[i];
           }
           int bShift = TestUtil.nextInt(random(), 1, 31);
           int dec = TestUtil.nextInt(random(), 1, bShift);
           int numIters = (bShift + dec - 1) / dec;
           int count = TestUtil.nextInt(random(), 1, 64 / numIters);
-          long bMask = random().nextLong();
+          int bMask = random().nextInt();
           int cIndex = random().nextInt(64);
-          long cMask = random().nextLong();
+          int cMask = random().nextInt();
           long startFP = random().nextInt(4);

           // Work on a slice that has just the right number of bytes to make the test fail with an

@@ -69,10 +69,10 @@ public class TestPostingDecodingUtil extends LuceneTestCase {
           PostingDecodingUtil optimizedUtil = vectorizationProvider.newPostingDecodingUtil(slice);

           slice.seek(startFP);
-          defaultUtil.splitLongs(count, expectedB, bShift, dec, bMask, expectedC, cIndex, cMask);
+          defaultUtil.splitInts(count, expectedB, bShift, dec, bMask, expectedC, cIndex, cMask);
           long expectedEndFP = slice.getFilePointer();
           slice.seek(startFP);
-          optimizedUtil.splitLongs(count, actualB, bShift, dec, bMask, actualC, cIndex, cMask);
+          optimizedUtil.splitInts(count, actualB, bShift, dec, bMask, actualC, cIndex, cMask);
           assertEquals(expectedEndFP, slice.getFilePointer());
           assertArrayEquals(expectedB, actualB);
           assertArrayEquals(expectedC, actualC);
@@ -356,8 +356,8 @@ public class TestVectorUtil extends LuceneTestCase {

   public void testFindNextGEQ() {
     int padding = TestUtil.nextInt(random(), 0, 5);
-    long[] values = new long[128 + padding];
-    long v = 0;
+    int[] values = new int[128 + padding];
+    int v = 0;
     for (int i = 0; i < 128; ++i) {
       v += TestUtil.nextInt(random(), 1, 1000);
       values[i] = v;

@@ -366,8 +366,8 @@ public class TestVectorUtil extends LuceneTestCase {
     // Now duel with slowFindFirstGreater
     for (int iter = 0; iter < 1_000; ++iter) {
       int from = TestUtil.nextInt(random(), 0, 127);
-      long target =
-          TestUtil.nextLong(random(), values[from], Math.max(values[from], values[127]))
+      int target =
+          TestUtil.nextInt(random(), values[from], Math.max(values[from], values[127]))
               + random().nextInt(10)
               - 5;
       assertEquals(

@@ -376,7 +376,7 @@ public class TestVectorUtil extends LuceneTestCase {
     }
   }

-  private static int slowFindNextGEQ(long[] buffer, int length, long target, int from) {
+  private static int slowFindNextGEQ(int[] buffer, int length, int target, int from) {
     for (int i = from; i < length; ++i) {
       if (buffer[i] >= target) {
         return i;
@@ -21,8 +21,8 @@ import java.io.IOException;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Objects;
-import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
-import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
+import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
+import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PostingsEnum;

@@ -261,10 +261,10 @@ class TermIntervalsSource extends IntervalsSource {
   /**
    * A guess of the average number of simple operations for the initial seek and buffer refill per
    * document for the positions of a term. See also {@link
-   * Lucene912PostingsReader.EverythingEnum#nextPosition()}.
+   * Lucene101PostingsReader.EverythingEnum#nextPosition()}.
    *
    * <p>Aside: Instead of being constant this could depend among others on {@link
-   * Lucene912PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
+   * Lucene101PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
    * TermsEnum#totalTermFreq()}, {@link DocIdSetIterator#cost()} (expected number of matching docs),
    * {@link LeafReader#maxDoc()} (total number of docs in the segment), and the seek time and block
    * size of the device storing the index.

@@ -272,7 +272,7 @@ class TermIntervalsSource extends IntervalsSource {
   private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;

   /**
-   * Number of simple operations in {@link Lucene912PostingsReader.EverythingEnum#nextPosition()}
+   * Number of simple operations in {@link Lucene101PostingsReader.EverythingEnum#nextPosition()}
    * when no seek or buffer refill is done.
    */
   private static final int TERM_OPS_PER_POS = 7;
@@ -32,7 +32,8 @@ module org.apache.lucene.suggest {
       org.apache.lucene.search.suggest.document.Completion84PostingsFormat,
       org.apache.lucene.search.suggest.document.Completion90PostingsFormat,
       org.apache.lucene.search.suggest.document.Completion99PostingsFormat,
-      org.apache.lucene.search.suggest.document.Completion912PostingsFormat;
+      org.apache.lucene.search.suggest.document.Completion912PostingsFormat,
+      org.apache.lucene.search.suggest.document.Completion101PostingsFormat;
   provides org.apache.lucene.analysis.TokenFilterFactory with
       org.apache.lucene.search.suggest.analyzing.SuggestStopFilterFactory;
 }
@@ -0,0 +1,45 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.document;

import org.apache.lucene.codecs.PostingsFormat;

/**
 * {@link CompletionPostingsFormat} for {@link
 * org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat}
 *
 * @lucene.experimental
 */
public class Completion101PostingsFormat extends CompletionPostingsFormat {
  /** Creates a {@link Completion101PostingsFormat} that will load the completion FST on-heap. */
  public Completion101PostingsFormat() {
    this(FSTLoadMode.ON_HEAP);
  }

  /**
   * Creates a {@link Completion101PostingsFormat} that will use the provided <code>fstLoadMode
   * </code> to determine if the completion FST should be loaded on or off heap.
   */
  public Completion101PostingsFormat(FSTLoadMode fstLoadMode) {
    super("Completion101", fstLoadMode);
  }

  @Override
  protected PostingsFormat delegatePostingsFormat() {
    return PostingsFormat.forName("Lucene101");
  }
}
@ -19,8 +19,10 @@ package org.apache.lucene.search.suggest.document;
|
|||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
|
||||
/**
|
||||
* {@link CompletionPostingsFormat} for {@link
|
||||
* org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat}
|
||||
* {@link org.apache.lucene.search.suggest.document.CompletionPostingsFormat} for {@code
|
||||
* org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat}. This format is only used
|
||||
* for backward-compatibility of the index format and cannot be used to write data, use {@link
|
||||
* Completion101PostingsFormat} on new indices.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
|
|
@ -35,3 +35,4 @@ org.apache.lucene.search.suggest.document.Completion84PostingsFormat
org.apache.lucene.search.suggest.document.Completion90PostingsFormat
org.apache.lucene.search.suggest.document.Completion99PostingsFormat
org.apache.lucene.search.suggest.document.Completion912PostingsFormat
org.apache.lucene.search.suggest.document.Completion101PostingsFormat
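This services entry is what lets SPI resolve the format by name; a minimal sketch of such a lookup (illustrative, not from this diff):

import org.apache.lucene.codecs.PostingsFormat;

// Resolved through META-INF/services at runtime; the lookup key matches the
// name passed to super("Completion101", ...) in the format's constructor.
PostingsFormat completion = PostingsFormat.forName("Completion101");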
@ -951,7 +951,7 @@ public class TestSuggestField extends LuceneTestCase {
    new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) {
      final CompletionPostingsFormat.FSTLoadMode fstLoadMode =
          RandomPicks.randomFrom(random(), CompletionPostingsFormat.FSTLoadMode.values());
      final PostingsFormat postingsFormat = new Completion912PostingsFormat(fstLoadMode);
      final PostingsFormat postingsFormat = new Completion101PostingsFormat(fstLoadMode);

      @Override
      public PostingsFormat postingsFormat() {
@ -28,9 +28,9 @@ import org.apache.lucene.codecs.blockterms.FixedGapTermsIndexReader;
import org.apache.lucene.codecs.blockterms.FixedGapTermsIndexWriter;
import org.apache.lucene.codecs.blockterms.TermsIndexReaderBase;
import org.apache.lucene.codecs.blockterms.TermsIndexWriterBase;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;

@ -38,7 +38,7 @@ import org.apache.lucene.index.SegmentWriteState;
// any PostingsFormat and make it ord-able...

/**
 * Customized version of {@link Lucene912PostingsFormat} that uses {@link FixedGapTermsIndexWriter}.
 * Customized version of {@link Lucene101PostingsFormat} that uses {@link FixedGapTermsIndexWriter}.
 */
public final class LuceneFixedGap extends PostingsFormat {
  final int termIndexInterval;
@ -54,7 +54,7 @@ public final class LuceneFixedGap extends PostingsFormat {

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsWriterBase docs = new Lucene912PostingsWriter(state);
    PostingsWriterBase docs = new Lucene101PostingsWriter(state);

    // TODO: should we make the terms index more easily
    // pluggable? Ie so that this codec would record which

@ -91,7 +91,7 @@ public final class LuceneFixedGap extends PostingsFormat {

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase postings = new Lucene912PostingsReader(state);
    PostingsReaderBase postings = new Lucene101PostingsReader(state);
    TermsIndexReaderBase indexReader;

    boolean success = false;
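For context, the boolean success = false; lines above introduce Lucene's usual resource-safety idiom; a hedged sketch of its shape (illustrative, not this file's exact code; wrapWithTermsIndex is a hypothetical helper):

import java.io.IOException;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.util.IOUtils;

public FieldsProducer open(SegmentReadState state) throws IOException {
  PostingsReaderBase postings = new Lucene101PostingsReader(state);
  boolean success = false;
  try {
    FieldsProducer producer = wrapWithTermsIndex(postings); // hypothetical helper
    success = true;
    return producer;
  } finally {
    if (!success) {
      // On failure, close the reader without masking the original exception.
      IOUtils.closeWhileHandlingException(postings);
    }
  }
}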
@ -29,9 +29,9 @@ import org.apache.lucene.codecs.blockterms.TermsIndexReaderBase;
import org.apache.lucene.codecs.blockterms.TermsIndexWriterBase;
import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexReader;
import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;

@ -39,7 +39,7 @@ import org.apache.lucene.index.SegmentWriteState;
// any PostingsFormat and make it ord-able...

/**
 * Customized version of {@link Lucene912PostingsFormat} that uses {@link
 * Customized version of {@link Lucene101PostingsFormat} that uses {@link
 * VariableGapTermsIndexWriter} with a fixed interval, but forcing high docfreq terms to be indexed
 * terms.
 */
@ -59,7 +59,7 @@ public final class LuceneVarGapDocFreqInterval extends PostingsFormat {

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsWriterBase docs = new Lucene912PostingsWriter(state);
    PostingsWriterBase docs = new Lucene101PostingsWriter(state);

    // TODO: should we make the terms index more easily
    // pluggable? Ie so that this codec would record which

@ -100,7 +100,7 @@ public final class LuceneVarGapDocFreqInterval extends PostingsFormat {

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase postings = new Lucene912PostingsReader(state);
    PostingsReaderBase postings = new Lucene101PostingsReader(state);
    TermsIndexReaderBase indexReader;

    boolean success = false;
@ -29,9 +29,9 @@ import org.apache.lucene.codecs.blockterms.TermsIndexReaderBase;
import org.apache.lucene.codecs.blockterms.TermsIndexWriterBase;
import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexReader;
import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;

@ -39,7 +39,7 @@ import org.apache.lucene.index.SegmentWriteState;
// any PostingsFormat and make it ord-able...

/**
 * Customized version of {@link Lucene912PostingsFormat} that uses {@link
 * Customized version of {@link Lucene101PostingsFormat} that uses {@link
 * VariableGapTermsIndexWriter} with a fixed interval.
 */
public final class LuceneVarGapFixedInterval extends PostingsFormat {
@ -56,7 +56,7 @@ public final class LuceneVarGapFixedInterval extends PostingsFormat {

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsWriterBase docs = new Lucene912PostingsWriter(state);
    PostingsWriterBase docs = new Lucene101PostingsWriter(state);

    // TODO: should we make the terms index more easily
    // pluggable? Ie so that this codec would record which

@ -95,7 +95,7 @@ public final class LuceneVarGapFixedInterval extends PostingsFormat {

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase postings = new Lucene912PostingsReader(state);
    PostingsReaderBase postings = new Lucene101PostingsReader(state);
    TermsIndexReaderBase indexReader;

    boolean success = false;
@ -35,10 +35,10 @@ import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexReader;
import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter;
import org.apache.lucene.codecs.blocktreeords.OrdsBlockTreeTermsReader;
import org.apache.lucene.codecs.blocktreeords.OrdsBlockTreeTermsWriter;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.codecs.memory.FSTTermsReader;
import org.apache.lucene.codecs.memory.FSTTermsWriter;
import org.apache.lucene.index.FieldInfo;

@ -121,7 +121,7 @@ public final class MockRandomPostingsFormat extends PostingsFormat {

    random.nextInt(); // consume a random for buffersize

    PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
    PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state);

    final FieldsConsumer fields;
    final int t1 = random.nextInt(4);

@ -289,7 +289,7 @@ public final class MockRandomPostingsFormat extends PostingsFormat {
      System.out.println("MockRandomCodec: readBufferSize=" + readBufferSize);
    }

    PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
    PostingsReaderBase postingsReader = new Lucene101PostingsReader(state);

    final FieldsProducer fields;
    final int t1 = random.nextInt(4);
@ -23,8 +23,8 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
import org.apache.lucene.codecs.uniformsplit.BlockDecoder;
import org.apache.lucene.codecs.uniformsplit.BlockEncoder;
import org.apache.lucene.codecs.uniformsplit.IndexDictionary;

@ -67,7 +67,7 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState segmentWriteState) throws IOException {
    PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(segmentWriteState);
    PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(segmentWriteState);
    boolean success = false;
    try {
      FieldsConsumer fieldsConsumer = createFieldsConsumer(segmentWriteState, postingsWriter);

@ -145,7 +145,7 @@ public class UniformSplitRot13PostingsFormat extends PostingsFormat {

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState segmentReadState) throws IOException {
    PostingsReaderBase postingsReader = new Lucene912PostingsReader(segmentReadState);
    PostingsReaderBase postingsReader = new Lucene101PostingsReader(segmentReadState);
    boolean success = false;
    try {
      FieldsProducer fieldsProducer = createFieldsProducer(segmentReadState, postingsReader);
@ -38,7 +38,7 @@ import java.util.TimeZone;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene100.Lucene100Codec;
import org.apache.lucene.codecs.lucene101.Lucene101Codec;
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.tests.codecs.asserting.AssertingCodec;

@ -190,7 +190,7 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
      codec = CompressingCodec.randomInstance(random);
    } else if ("Lucene100".equals(TEST_CODEC)
        || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene100"))) {
      codec = new Lucene100Codec(RandomPicks.randomFrom(random, Lucene100Codec.Mode.values()));
      codec = new Lucene101Codec(RandomPicks.randomFrom(random, Lucene101Codec.Mode.values()));
    } else if (!"random".equals(TEST_CODEC)) {
      codec = Codec.forName(TEST_CODEC);
    } else if ("random".equals(TEST_POSTINGSFORMAT)) {
@ -55,9 +55,9 @@ import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat;
import org.apache.lucene.codecs.lucene100.Lucene100Codec;
import org.apache.lucene.codecs.lucene101.Lucene101Codec;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@ -1315,7 +1315,7 @@ public final class TestUtil {
   * different from {@link Codec#getDefault()} because that is randomized.
   */
  public static Codec getDefaultCodec() {
    return new Lucene100Codec();
    return new Lucene101Codec();
  }

  /**

@ -1323,7 +1323,7 @@
   * Lucene.
   */
  public static PostingsFormat getDefaultPostingsFormat() {
    return new Lucene912PostingsFormat();
    return new Lucene101PostingsFormat();
  }

  /**

@ -1334,7 +1334,7 @@
   */
  public static PostingsFormat getDefaultPostingsFormat(
      int minItemsPerBlock, int maxItemsPerBlock) {
    return new Lucene912PostingsFormat(minItemsPerBlock, maxItemsPerBlock);
    return new Lucene101PostingsFormat(minItemsPerBlock, maxItemsPerBlock);
  }

  /** Returns a random postings format that supports term ordinals */
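For reference, a hedged sketch of calling the two-argument format directly, outside the test framework; the 25/48 values are illustrative, not defaults asserted by this diff:

import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;

// Same two-argument constructor as used by getDefaultPostingsFormat above;
// min/max items per block trade terms-dictionary size against lookup granularity.
PostingsFormat custom = new Lucene101PostingsFormat(25, 48);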