mirror of https://github.com/apache/lucene.git
LUCENE-8895: switch all FST usage to enable array-with-gaps encoding
also, deprecate unused Util.getByOutput
This commit is contained in:
parent
b4a602f6b2
commit
81e63e8fec
|
@ -97,7 +97,7 @@ class TokenInfoDictionaryBuilder {
|
|||
lines.sort(Comparator.comparing(entry -> entry[0]));
|
||||
|
||||
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
||||
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15, false);
|
||||
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
|
||||
IntsRefBuilder scratch = new IntsRefBuilder();
|
||||
long ord = -1; // first ord will be 0
|
||||
String lastValue = null;
|
||||
|
|
|
@ -109,7 +109,7 @@ public class TokenInfoDictionaryBuilder {
|
|||
System.out.println(" encode...");
|
||||
|
||||
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
||||
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15, false);
|
||||
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
|
||||
IntsRefBuilder scratch = new IntsRefBuilder();
|
||||
long ord = -1; // first ord will be 0
|
||||
String lastValue = null;
|
||||
|
|
|
@ -363,7 +363,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
|
|||
|
||||
final Builder<Output> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
|
||||
0, 0, true, false, Integer.MAX_VALUE,
|
||||
FST_OUTPUTS, true, 15, false);
|
||||
FST_OUTPUTS, true, 15);
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" compile index for prefix=" + prefix);
|
||||
//}
|
||||
|
|
|
@ -1084,7 +1084,7 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum {
|
|||
result.grow(1+upto);
|
||||
fr.index.readFirstRealTargetArc(arc.target, arc, fstReader);
|
||||
|
||||
if (arc.bytesPerArc != 0) {
|
||||
if (arc.bytesPerArc != 0 && arc.arcIdx > Integer.MIN_VALUE) {
|
||||
// System.out.println(" array arcs");
|
||||
int low = 0;
|
||||
int high = arc.numArcs-1;
|
||||
|
|
|
@ -456,7 +456,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
|
|||
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||
final Builder<BytesRef> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
|
||||
0, 0, true, false, Integer.MAX_VALUE,
|
||||
outputs, true, 15, true);
|
||||
outputs, true, 15);
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" compile index for prefix=" + prefix);
|
||||
//}
|
||||
|
|
|
@ -73,8 +73,6 @@ public class Builder<T> {
|
|||
|
||||
private final IntsRefBuilder lastInput = new IntsRefBuilder();
|
||||
|
||||
final boolean useDirectArcAddressing;
|
||||
|
||||
// NOTE: cutting this over to ArrayList instead loses ~6%
|
||||
// in build performance on 9.8M Wikipedia terms; so we
|
||||
// left this as an array:
|
||||
|
@ -99,11 +97,11 @@ public class Builder<T> {
|
|||
|
||||
/**
|
||||
* Instantiates an FST/FSA builder without any pruning. A shortcut to {@link
|
||||
* #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs, boolean, int, boolean)} with
|
||||
* #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs, boolean, int)} with
|
||||
* pruning options turned off.
|
||||
*/
|
||||
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
|
||||
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, false);
|
||||
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -154,13 +152,12 @@ public class Builder<T> {
|
|||
*/
|
||||
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
|
||||
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
|
||||
boolean allowArrayArcs, int bytesPageBits, boolean useDirectArcAddressing) {
|
||||
boolean allowArrayArcs, int bytesPageBits) {
|
||||
this.minSuffixCount1 = minSuffixCount1;
|
||||
this.minSuffixCount2 = minSuffixCount2;
|
||||
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
|
||||
this.shareMaxTailLength = shareMaxTailLength;
|
||||
this.allowArrayArcs = allowArrayArcs;
|
||||
this.useDirectArcAddressing = useDirectArcAddressing;
|
||||
fst = new FST<>(inputType, outputs, bytesPageBits);
|
||||
bytes = fst.bytes;
|
||||
assert bytes != null;
|
||||
|
|
|
@ -647,8 +647,7 @@ public final class FST<T> implements Accountable {
|
|||
// array that may have holes in it so that we can address the arcs directly by label without
|
||||
// binary search
|
||||
int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
|
||||
boolean writeDirectly = builder.useDirectArcAddressing && labelRange > 0
|
||||
&& labelRange < Builder.DIRECT_ARC_LOAD_FACTOR * nodeIn.numArcs;
|
||||
boolean writeDirectly = labelRange > 0 && labelRange < Builder.DIRECT_ARC_LOAD_FACTOR * nodeIn.numArcs;
|
||||
|
||||
//System.out.println("write int @pos=" + (fixedArrayStart-4) + " numArcs=" + nodeIn.numArcs);
|
||||
// create the header
|
||||
|
|
|
@ -105,6 +105,7 @@ public final class Util {
|
|||
* For example, simple ordinals (0, 1,
|
||||
* 2, ...), or file offsets (when appending to a file)
|
||||
* fit this. */
|
||||
@Deprecated
|
||||
public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException {
|
||||
|
||||
final BytesReader in = fst.getBytesReader();
|
||||
|
|
|
@ -55,7 +55,7 @@ public class Test2BFST extends LuceneTestCase {
|
|||
Outputs<Object> outputs = NoOutputs.getSingleton();
|
||||
Object NO_OUTPUT = outputs.getNoOutput();
|
||||
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
|
||||
true, 15, true);
|
||||
true, 15);
|
||||
|
||||
int count = 0;
|
||||
Random r = new Random(seed);
|
||||
|
@ -137,7 +137,7 @@ public class Test2BFST extends LuceneTestCase {
|
|||
System.out.println("\nTEST: 3 GB size; outputs=bytes");
|
||||
Outputs<BytesRef> outputs = ByteSequenceOutputs.getSingleton();
|
||||
final Builder<BytesRef> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
|
||||
true, 15, true);
|
||||
true, 15);
|
||||
|
||||
byte[] outputBytes = new byte[20];
|
||||
BytesRef output = new BytesRef(outputBytes);
|
||||
|
@ -217,7 +217,7 @@ public class Test2BFST extends LuceneTestCase {
|
|||
System.out.println("\nTEST: 3 GB size; outputs=long");
|
||||
Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
|
||||
final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
|
||||
true, 15, true);
|
||||
true, 15);
|
||||
|
||||
long output = 1;
|
||||
|
||||
|
|
|
@ -327,7 +327,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
writer.close();
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||
|
||||
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, true);
|
||||
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
||||
|
||||
boolean storeOrd = random().nextBoolean();
|
||||
if (VERBOSE) {
|
||||
|
@ -468,7 +468,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
this.inputMode = inputMode;
|
||||
this.outputs = outputs;
|
||||
|
||||
builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, !noArcArrays, 15, true);
|
||||
builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, !noArcArrays, 15);
|
||||
}
|
||||
|
||||
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
|
||||
|
@ -1110,7 +1110,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
public void testFinalOutputOnEndState() throws Exception {
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||
|
||||
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, true);
|
||||
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
||||
builder.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L);
|
||||
builder.add(Util.toUTF32("station", new IntsRefBuilder()), 10L);
|
||||
final FST<Long> fst = builder.finish();
|
||||
|
@ -1124,7 +1124,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
|
||||
public void testInternalFinalState() throws Exception {
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, true);
|
||||
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
||||
builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput());
|
||||
builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput());
|
||||
final FST<Long> fst = builder.finish();
|
||||
|
|
|
@ -16,18 +16,12 @@
|
|||
*/
|
||||
package org.apache.lucene.util.fst;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Random;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
|
@ -36,147 +30,30 @@ import org.apache.lucene.util.IntsRefBuilder;
|
|||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import org.junit.Before;
|
||||
import org.junit.Ignore;
|
||||
|
||||
public class TestFstDirect extends LuceneTestCase {
|
||||
|
||||
private static final int COUNT = 10_000_000;
|
||||
private List<String> words;
|
||||
private Set<String> dict;
|
||||
private Random random;
|
||||
|
||||
@Before
|
||||
public void before() {
|
||||
words = new ArrayList<>();
|
||||
random = new Random(random().nextLong());
|
||||
}
|
||||
|
||||
public void testDenseWithGap() throws Exception {
|
||||
//words.addAll(Arrays.asList("apple", "berry", "cherry", "damson", "fig", "grape"));
|
||||
words.addAll(Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm"));
|
||||
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(buildFST(words, true));
|
||||
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(buildFST(words));
|
||||
for (String word : words) {
|
||||
assertNotNull(word + " not found", fstEnum.seekExact(new BytesRef(word)));
|
||||
}
|
||||
}
|
||||
|
||||
@Ignore("for performance testing")
|
||||
public void testLookupIDs() throws Exception {
|
||||
for (int i = 0; i < 10000000; i++) {
|
||||
words.add(String.format(Locale.ROOT, "%09d", i));
|
||||
}
|
||||
FST<Object> baselineFST = buildFST(words, false);
|
||||
FST<Object> optoFST = buildFST(words,true);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
long seed = random().nextLong();
|
||||
random.setSeed(seed);
|
||||
long timeOpto = timeLookups(optoFST);
|
||||
random.setSeed(seed);
|
||||
long timeBase = timeLookups(baselineFST);
|
||||
printf("Sought %d present terms in %d ms (baseline) vs %d ms (opto), a %d%% difference", COUNT, nsToMs(timeBase), nsToMs(timeOpto),
|
||||
-100 * (timeBase - timeOpto) / timeBase);
|
||||
}
|
||||
}
|
||||
|
||||
@Ignore("for performance testing")
|
||||
public void testRandomTerms() throws Exception {
|
||||
for (int i = 0; i < 100000; i++) {
|
||||
words.add(randomString());
|
||||
}
|
||||
Collections.sort(words);
|
||||
FST<Object> baselineFST = buildFST(words, false);
|
||||
FST<Object> optoFST = buildFST(words,true);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
long seed = random().nextLong();
|
||||
random.setSeed(seed);
|
||||
long timeOpto = timeLookups(optoFST);
|
||||
random.setSeed(seed);
|
||||
long timeBase = timeLookups(baselineFST);
|
||||
printf("Sought %d present terms in %d ms (baseline) vs %d ms (opto), a %d%% difference", COUNT, nsToMs(timeBase), nsToMs(timeOpto),
|
||||
-100 * (timeBase - timeOpto) / timeBase);
|
||||
}
|
||||
}
|
||||
|
||||
@Ignore("requires english dictionary")
|
||||
public void testLookupEnglishTerms() throws Exception {
|
||||
FST<Object> baselineFST = buildEnglishFST(false);
|
||||
FST<Object> optoFST = buildFST(words,true);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
long seed = random().nextLong();
|
||||
random.setSeed(seed);
|
||||
long timeOpto = timeLookups(optoFST);
|
||||
random.setSeed(seed);
|
||||
long timeBase = timeLookups(baselineFST);
|
||||
printf("Sought %d present terms in %d ms (baseline) vs %d ms (opto), a %d%% difference", COUNT, nsToMs(timeBase), nsToMs(timeOpto),
|
||||
-100 * (timeBase - timeOpto) / timeBase);
|
||||
}
|
||||
}
|
||||
|
||||
private long timeLookups(FST<Object> fst) throws Exception {
|
||||
final BytesRefFSTEnum<Object> fstEnumOpto = new BytesRefFSTEnum<>(fst);
|
||||
long start = System.nanoTime();
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
assertNotNull(fstEnumOpto.seekExact(new BytesRef(words.get(random.nextInt(words.size())))));
|
||||
}
|
||||
return System.nanoTime() - start;
|
||||
}
|
||||
|
||||
@Ignore("requires english dictionary")
|
||||
public void testLookupRandomStrings() throws Exception {
|
||||
dict = new HashSet<>(words);
|
||||
List<String> tokens = new ArrayList<>();
|
||||
for (int i = 0; i < 1_000_000; i++) {
|
||||
String s;
|
||||
do {
|
||||
s = randomString();
|
||||
} while (dict.contains(s));
|
||||
tokens.add(s);
|
||||
}
|
||||
final FST<Object> fstBase = buildEnglishFST(false);
|
||||
final FST<Object> fstOpto = buildFST(words, true);
|
||||
long seed = random().nextLong();
|
||||
for (int i = 0; i < 10; i++) {
|
||||
random.setSeed(seed);
|
||||
long timeBase = timeLookupRandomStrings(fstBase, tokens);
|
||||
random.setSeed(seed);
|
||||
long timeOpto = timeLookupRandomStrings(fstOpto, tokens);
|
||||
printf("Sought %d absent terms in %d ms (base) / %d ms (opto), a %d%% change", COUNT, nsToMs(timeBase), nsToMs(timeOpto),
|
||||
-100 * (timeBase - timeOpto) / timeBase);
|
||||
}
|
||||
}
|
||||
|
||||
private long timeLookupRandomStrings(FST<Object> fst, List<String> tokens) throws Exception {
|
||||
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(fst);
|
||||
long start = System.nanoTime();
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
fstEnum.seekExact(new BytesRef(tokens.get(random.nextInt(tokens.size()))));
|
||||
}
|
||||
return System.nanoTime() - start;
|
||||
}
|
||||
|
||||
private String randomString() {
|
||||
int len = random().nextInt(7) + 3;
|
||||
StringBuilder buf = new StringBuilder();
|
||||
for (int i = 0; i < len; i++) {
|
||||
buf.append(random().nextInt(26) + 'a');
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
private FST<Object> buildEnglishFST(boolean useDirectAddressing) throws Exception {
|
||||
try (BufferedReader reader = new BufferedReader(new InputStreamReader(getClass().getResourceAsStream("WORDS"), "ASCII"))) {
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
words.add(line);
|
||||
}
|
||||
}
|
||||
return buildFST(words, useDirectAddressing);
|
||||
}
|
||||
|
||||
private FST<Object> buildFST(List<String> words, boolean useDirectAddressing) throws Exception {
|
||||
private FST<Object> buildFST(List<String> words) throws Exception {
|
||||
long start = System.nanoTime();
|
||||
final Outputs<Object> outputs = NoOutputs.getSingleton();
|
||||
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, useDirectAddressing);
|
||||
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
||||
|
||||
for (String word : words) {
|
||||
b.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput());
|
||||
|
@ -187,11 +64,11 @@ public class TestFstDirect extends LuceneTestCase {
|
|||
return fst;
|
||||
}
|
||||
|
||||
static void printf(String format, Object ... values) {
|
||||
private static void printf(String format, Object ... values) {
|
||||
System.out.println(String.format(Locale.ROOT, format, values));
|
||||
}
|
||||
|
||||
static long nsToMs(long ns) {
|
||||
private static long nsToMs(long ns) {
|
||||
return ns / 1_000_000;
|
||||
}
|
||||
|
||||
|
|
|
@ -352,7 +352,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
|
|||
|
||||
final Builder<Pair<BytesRef,Long>> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
|
||||
0, 0, true, false, Integer.MAX_VALUE,
|
||||
FST_OUTPUTS, true, 15, false);
|
||||
FST_OUTPUTS, true, 15);
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" compile index for prefix=" + prefix);
|
||||
//}
|
||||
|
|
|
@ -236,7 +236,7 @@ public class FSTCompletionBuilder {
|
|||
final Object empty = outputs.getNoOutput();
|
||||
final Builder<Object> builder = new Builder<>(
|
||||
FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
|
||||
shareMaxTailLength, outputs, true, 15, true);
|
||||
shareMaxTailLength, outputs, true, 15);
|
||||
|
||||
BytesRefBuilder scratch = new BytesRefBuilder();
|
||||
BytesRef entry;
|
||||
|
|
|
@ -279,7 +279,7 @@ public class FSTTester<T> {
|
|||
allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
|
||||
outputs,
|
||||
true,
|
||||
15, true);
|
||||
15);
|
||||
|
||||
for(InputOutput<T> pair : pairs) {
|
||||
if (pair.output instanceof List) {
|
||||
|
|
Loading…
Reference in New Issue