LUCENE-8895: switch all FST usage to enable array-with-gaps encoding

also, deprecate unused Util.getByOutput
This commit is contained in:
Michael Sokolov 2019-06-30 11:13:43 -04:00
parent b4a602f6b2
commit 81e63e8fec
14 changed files with 25 additions and 151 deletions

View File

@ -97,7 +97,7 @@ class TokenInfoDictionaryBuilder {
lines.sort(Comparator.comparing(entry -> entry[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15, false);
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
IntsRefBuilder scratch = new IntsRefBuilder();
long ord = -1; // first ord will be 0
String lastValue = null;

View File

@ -109,7 +109,7 @@ public class TokenInfoDictionaryBuilder {
System.out.println(" encode...");
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15, false);
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
IntsRefBuilder scratch = new IntsRefBuilder();
long ord = -1; // first ord will be 0
String lastValue = null;

View File

@ -363,7 +363,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
final Builder<Output> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
0, 0, true, false, Integer.MAX_VALUE,
FST_OUTPUTS, true, 15, false);
FST_OUTPUTS, true, 15);
//if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
//}

View File

@ -1084,7 +1084,7 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum {
result.grow(1+upto);
fr.index.readFirstRealTargetArc(arc.target, arc, fstReader);
if (arc.bytesPerArc != 0) {
if (arc.bytesPerArc != 0 && arc.arcIdx > Integer.MIN_VALUE) {
// System.out.println(" array arcs");
int low = 0;
int high = arc.numArcs-1;

View File

@ -456,7 +456,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
0, 0, true, false, Integer.MAX_VALUE,
outputs, true, 15, true);
outputs, true, 15);
//if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
//}

View File

@ -73,8 +73,6 @@ public class Builder<T> {
private final IntsRefBuilder lastInput = new IntsRefBuilder();
final boolean useDirectArcAddressing;
// NOTE: cutting this over to ArrayList instead loses ~6%
// in build performance on 9.8M Wikipedia terms; so we
// left this as an array:
@ -99,11 +97,11 @@ public class Builder<T> {
/**
* Instantiates an FST/FSA builder without any pruning. A shortcut to {@link
* #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs, boolean, int, boolean)} with
* #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs, boolean, int)} with
* pruning options turned off.
*/
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, false);
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
}
/**
@ -154,13 +152,12 @@ public class Builder<T> {
*/
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
boolean allowArrayArcs, int bytesPageBits, boolean useDirectArcAddressing) {
boolean allowArrayArcs, int bytesPageBits) {
this.minSuffixCount1 = minSuffixCount1;
this.minSuffixCount2 = minSuffixCount2;
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
this.shareMaxTailLength = shareMaxTailLength;
this.allowArrayArcs = allowArrayArcs;
this.useDirectArcAddressing = useDirectArcAddressing;
fst = new FST<>(inputType, outputs, bytesPageBits);
bytes = fst.bytes;
assert bytes != null;

View File

@ -647,8 +647,7 @@ public final class FST<T> implements Accountable {
// array that may have holes in it so that we can address the arcs directly by label without
// binary search
int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
boolean writeDirectly = builder.useDirectArcAddressing && labelRange > 0
&& labelRange < Builder.DIRECT_ARC_LOAD_FACTOR * nodeIn.numArcs;
boolean writeDirectly = labelRange > 0 && labelRange < Builder.DIRECT_ARC_LOAD_FACTOR * nodeIn.numArcs;
//System.out.println("write int @pos=" + (fixedArrayStart-4) + " numArcs=" + nodeIn.numArcs);
// create the header

View File

@ -105,6 +105,7 @@ public final class Util {
* For example, simple ordinals (0, 1,
* 2, ...), or file offsets (when appending to a file)
* fit this. */
@Deprecated
public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException {
final BytesReader in = fst.getBytesReader();

View File

@ -55,7 +55,7 @@ public class Test2BFST extends LuceneTestCase {
Outputs<Object> outputs = NoOutputs.getSingleton();
Object NO_OUTPUT = outputs.getNoOutput();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
true, 15, true);
true, 15);
int count = 0;
Random r = new Random(seed);
@ -137,7 +137,7 @@ public class Test2BFST extends LuceneTestCase {
System.out.println("\nTEST: 3 GB size; outputs=bytes");
Outputs<BytesRef> outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
true, 15, true);
true, 15);
byte[] outputBytes = new byte[20];
BytesRef output = new BytesRef(outputBytes);
@ -217,7 +217,7 @@ public class Test2BFST extends LuceneTestCase {
System.out.println("\nTEST: 3 GB size; outputs=long");
Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
true, 15, true);
true, 15);
long output = 1;

View File

@ -327,7 +327,7 @@ public class TestFSTs extends LuceneTestCase {
writer.close();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, true);
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
boolean storeOrd = random().nextBoolean();
if (VERBOSE) {
@ -468,7 +468,7 @@ public class TestFSTs extends LuceneTestCase {
this.inputMode = inputMode;
this.outputs = outputs;
builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, !noArcArrays, 15, true);
builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, !noArcArrays, 15);
}
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
@ -1110,7 +1110,7 @@ public class TestFSTs extends LuceneTestCase {
public void testFinalOutputOnEndState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, true);
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
builder.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L);
builder.add(Util.toUTF32("station", new IntsRefBuilder()), 10L);
final FST<Long> fst = builder.finish();
@ -1124,7 +1124,7 @@ public class TestFSTs extends LuceneTestCase {
public void testInternalFinalState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, true);
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput());
builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput());
final FST<Long> fst = builder.finish();

View File

@ -16,18 +16,12 @@
*/
package org.apache.lucene.util.fst;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.Set;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
@ -36,147 +30,30 @@ import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Before;
import org.junit.Ignore;
public class TestFstDirect extends LuceneTestCase {
private static final int COUNT = 10_000_000;
private List<String> words;
private Set<String> dict;
private Random random;
@Before
public void before() {
  // Derive a private Random from the test framework's random() so lookups can be re-seeded per round.
  final long seed = random().nextLong();
  random = new Random(seed);
  // Every test starts from an empty word list.
  words = new ArrayList<>();
}
public void testDenseWithGap() throws Exception {
//words.addAll(Arrays.asList("apple", "berry", "cherry", "damson", "fig", "grape"));
words.addAll(Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm"));
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(buildFST(words, true));
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(buildFST(words));
for (String word : words) {
assertNotNull(word + " not found", fstEnum.seekExact(new BytesRef(word)));
}
}
/**
 * Performance comparison: looks up zero-padded numeric ids in an FST built with direct
 * addressing disabled (baseline) and enabled (opto), and prints the timings for ten rounds.
 */
@Ignore("for performance testing")
public void testLookupIDs() throws Exception {
  for (int id = 0; id < 10_000_000; id++) {
    final String padded = String.format(Locale.ROOT, "%09d", id);
    words.add(padded);
  }
  final FST<Object> baseline = buildFST(words, false);
  final FST<Object> direct = buildFST(words, true);
  for (int round = 0; round < 10; round++) {
    // Both variants replay the same pseudo-random sequence of lookups.
    final long roundSeed = random().nextLong();
    random.setSeed(roundSeed);
    final long timeOpto = timeLookups(direct);
    random.setSeed(roundSeed);
    final long timeBase = timeLookups(baseline);
    final long percentChange = -100 * (timeBase - timeOpto) / timeBase;
    printf("Sought %d present terms in %d ms (baseline) vs %d ms (opto), a %d%% difference",
        COUNT, nsToMs(timeBase), nsToMs(timeOpto), percentChange);
  }
}
/**
 * Performance comparison over 100000 generated terms: times present-term lookups against an
 * FST built without direct addressing (baseline) and with it (opto), for ten rounds.
 */
@Ignore("for performance testing")
public void testRandomTerms() throws Exception {
  for (int generated = 0; generated < 100000; generated++) {
    words.add(randomString());
  }
  // Sort the generated terms before handing them to the builder.
  Collections.sort(words);
  final FST<Object> baseline = buildFST(words, false);
  final FST<Object> direct = buildFST(words, true);
  for (int round = 0; round < 10; round++) {
    // Re-seed before each measurement so both FSTs see the same lookup sequence.
    final long roundSeed = random().nextLong();
    random.setSeed(roundSeed);
    final long timeOpto = timeLookups(direct);
    random.setSeed(roundSeed);
    final long timeBase = timeLookups(baseline);
    final long percentChange = -100 * (timeBase - timeOpto) / timeBase;
    printf("Sought %d present terms in %d ms (baseline) vs %d ms (opto), a %d%% difference",
        COUNT, nsToMs(timeBase), nsToMs(timeOpto), percentChange);
  }
}
/**
 * Performance comparison over the English dictionary resource: times present-term lookups
 * against the baseline FST and the direct-addressing FST for ten rounds.
 */
@Ignore("requires english dictionary")
public void testLookupEnglishTerms() throws Exception {
  // buildEnglishFST also fills the shared word list, which the second build then reuses.
  final FST<Object> baseline = buildEnglishFST(false);
  final FST<Object> direct = buildFST(words, true);
  for (int round = 0; round < 10; round++) {
    final long roundSeed = random().nextLong();
    random.setSeed(roundSeed);
    final long timeOpto = timeLookups(direct);
    random.setSeed(roundSeed);
    final long timeBase = timeLookups(baseline);
    final long percentChange = -100 * (timeBase - timeOpto) / timeBase;
    printf("Sought %d present terms in %d ms (baseline) vs %d ms (opto), a %d%% difference",
        COUNT, nsToMs(timeBase), nsToMs(timeOpto), percentChange);
  }
}
/**
 * Performs COUNT exact seeks for words picked at random from the shared word list, asserting
 * that each one is found.
 *
 * @param fst the FST to query
 * @return elapsed wall-clock time in nanoseconds
 */
private long timeLookups(FST<Object> fst) throws Exception {
  final BytesRefFSTEnum<Object> lookupEnum = new BytesRefFSTEnum<>(fst);
  final long begin = System.nanoTime();
  for (int done = 0; done < COUNT; done++) {
    final String word = words.get(random.nextInt(words.size()));
    assertNotNull(lookupEnum.seekExact(new BytesRef(word)));
  }
  final long end = System.nanoTime();
  return end - begin;
}
/**
 * Performance comparison for terms that are NOT in the dictionary: generates one million random
 * strings absent from the English word list, then times exact seeks for them against the
 * baseline FST and the direct-addressing FST for ten rounds.
 */
@Ignore("requires english dictionary")
public void testLookupRandomStrings() throws Exception {
  // Load the dictionary before building the exclusion set: buildEnglishFST is what fills
  // `words`, so snapshotting it earlier would yield an empty set and filter nothing.
  final FST<Object> fstBase = buildEnglishFST(false);
  final FST<Object> fstOpto = buildFST(words, true);
  dict = new HashSet<>(words);

  List<String> tokens = new ArrayList<>();
  for (int i = 0; i < 1_000_000; i++) {
    String s;
    do {
      s = randomString();
    } while (dict.contains(s));
    tokens.add(s);
  }

  // A single seed is reused for every round and for both FSTs so the lookup sequences match.
  long seed = random().nextLong();
  for (int i = 0; i < 10; i++) {
    random.setSeed(seed);
    long timeBase = timeLookupRandomStrings(fstBase, tokens);
    random.setSeed(seed);
    long timeOpto = timeLookupRandomStrings(fstOpto, tokens);
    printf("Sought %d absent terms in %d ms (base) / %d ms (opto), a %d%% change", COUNT, nsToMs(timeBase), nsToMs(timeOpto),
        -100 * (timeBase - timeOpto) / timeBase);
  }
}
/**
 * Performs COUNT exact seeks for tokens picked at random from {@code tokens}; the seek result
 * is ignored.
 *
 * @param fst the FST to query
 * @param tokens candidate terms to look up
 * @return elapsed wall-clock time in nanoseconds
 */
private long timeLookupRandomStrings(FST<Object> fst, List<String> tokens) throws Exception {
  final BytesRefFSTEnum<Object> lookupEnum = new BytesRefFSTEnum<>(fst);
  final long begin = System.nanoTime();
  for (int done = 0; done < COUNT; done++) {
    final String token = tokens.get(random.nextInt(tokens.size()));
    lookupEnum.seekExact(new BytesRef(token));
  }
  final long end = System.nanoTime();
  return end - begin;
}
/**
 * Generates a random string of 3 to 9 lowercase ASCII letters using the test framework's
 * random().
 *
 * @return the generated string
 */
private String randomString() {
  int len = random().nextInt(7) + 3;
  StringBuilder buf = new StringBuilder(len);
  for (int i = 0; i < len; i++) {
    // The sum of an int and a char is an int; without the cast, append(int) would add the
    // decimal digits of the code point (e.g. "104") instead of the letter.
    buf.append((char) ('a' + random().nextInt(26)));
  }
  return buf.toString();
}
/**
 * Reads the {@code WORDS} classpath resource (one term per line, decoded as ASCII), appends
 * every line to the shared word list, and builds an FST from that list.
 *
 * @param useDirectAddressing passed through to {@code buildFST}
 * @return the FST built from the accumulated word list
 * @throws IllegalStateException if the {@code WORDS} resource cannot be found
 */
private FST<Object> buildEnglishFST(boolean useDirectAddressing) throws Exception {
  // getResourceAsStream returns null for a missing resource; fail with a clear message instead
  // of an opaque NullPointerException from the reader.
  java.io.InputStream in = getClass().getResourceAsStream("WORDS");
  if (in == null) {
    throw new IllegalStateException("classpath resource 'WORDS' not found for " + getClass().getName());
  }
  try (BufferedReader reader =
      new BufferedReader(new InputStreamReader(in, java.nio.charset.StandardCharsets.US_ASCII))) {
    String line;
    while ((line = reader.readLine()) != null) {
      words.add(line);
    }
  }
  return buildFST(words, useDirectAddressing);
}
private FST<Object> buildFST(List<String> words, boolean useDirectAddressing) throws Exception {
private FST<Object> buildFST(List<String> words) throws Exception {
long start = System.nanoTime();
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, useDirectAddressing);
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
for (String word : words) {
b.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput());
@ -187,11 +64,11 @@ public class TestFstDirect extends LuceneTestCase {
return fst;
}
static void printf(String format, Object ... values) {
private static void printf(String format, Object ... values) {
System.out.println(String.format(Locale.ROOT, format, values));
}
static long nsToMs(long ns) {
private static long nsToMs(long ns) {
return ns / 1_000_000;
}

View File

@ -352,7 +352,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
final Builder<Pair<BytesRef,Long>> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
0, 0, true, false, Integer.MAX_VALUE,
FST_OUTPUTS, true, 15, false);
FST_OUTPUTS, true, 15);
//if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
//}

View File

@ -236,7 +236,7 @@ public class FSTCompletionBuilder {
final Object empty = outputs.getNoOutput();
final Builder<Object> builder = new Builder<>(
FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
shareMaxTailLength, outputs, true, 15, true);
shareMaxTailLength, outputs, true, 15);
BytesRefBuilder scratch = new BytesRefBuilder();
BytesRef entry;

View File

@ -279,7 +279,7 @@ public class FSTTester<T> {
allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
outputs,
true,
15, true);
15);
for(InputOutput<T> pair : pairs) {
if (pair.output instanceof List) {