Merge with master.

This commit is contained in:
Dawid Weiss 2019-12-13 15:35:14 +01:00
commit abd5102312
37 changed files with 549 additions and 481 deletions

View File

@ -38,6 +38,9 @@ API Changes
* LUCENE-8905: Better defence against malformed arguments in TopDocsCollector * LUCENE-8905: Better defence against malformed arguments in TopDocsCollector
(Atri Sharma) (Atri Sharma)
* LUCENE-9089: FST Builder renamed to FSTCompiler, which is configured through a fluent-style FSTCompiler.Builder.
(Bruno Roustant)
Improvements Improvements
* LUCENE-8757: When provided with an ExecutorService to run queries across * LUCENE-8757: When provided with an ExecutorService to run queries across
@ -64,6 +67,8 @@ Other
* LUCENE-8768: Fix Javadocs build in Java 11. (Namgyu Kim) * LUCENE-8768: Fix Javadocs build in Java 11. (Namgyu Kim)
* LUCENE-9092: upgrade randomizedtesting to 2.7.5 (Dawid Weiss)
======================= Lucene 8.5.0 ======================= ======================= Lucene 8.5.0 =======================
API Changes API Changes

View File

@ -1,5 +1,10 @@
# Apache Lucene Migration Guide # Apache Lucene Migration Guide
## o.a.l.util.fst.Builder is renamed to FSTCompiler, with a fluent-style FSTCompiler.Builder (LUCENE-9089) ##
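Simply use FSTCompiler instead of the previous Builder, and call compile() where you previously called finish().
Use either the simple constructor (input type and outputs) with default settings, or the FSTCompiler.Builder to tune
and tweak any parameter that was previously passed to the long Builder constructor.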
## Kuromoji user dictionary now forbids illegal segmentation (LUCENE-8933) ## ## Kuromoji user dictionary now forbids illegal segmentation (LUCENE-8933) ##
User dictionary now strictly validates if the (concatenated) segment is the same as the surface form. This change avoids User dictionary now strictly validates if the (concatenated) segment is the same as the surface form. This change avoids

View File

@ -25,6 +25,7 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.CharSequenceOutputs; import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Outputs; import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util; import org.apache.lucene.util.fst.Util;
@ -106,13 +107,13 @@ public class NormalizeCharMap {
final FST<CharsRef> map; final FST<CharsRef> map;
try { try {
final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE2, outputs); final FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder(); final IntsRefBuilder scratch = new IntsRefBuilder();
for(Map.Entry<String,String> ent : pendingPairs.entrySet()) { for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
builder.add(Util.toUTF16(ent.getKey(), scratch), fstCompiler.add(Util.toUTF16(ent.getKey(), scratch),
new CharsRef(ent.getValue())); new CharsRef(ent.getValue()));
} }
map = builder.finish(); map = fstCompiler.compile();
pendingPairs.clear(); pendingPairs.clear();
} catch (IOException ioe) { } catch (IOException ioe) {
// Bogus FST IOExceptions!! (will never happen) // Bogus FST IOExceptions!! (will never happen)

View File

@ -64,7 +64,7 @@ import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.OfflineSorter; import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp; import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.CharSequenceOutputs; import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntSequenceOutputs; import org.apache.lucene.util.fst.IntSequenceOutputs;
@ -231,9 +231,9 @@ public class Dictionary {
// read dictionary entries // read dictionary entries
IntSequenceOutputs o = IntSequenceOutputs.getSingleton(); IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o); FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, o);
readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, b); readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, fstCompiler);
words = b.finish(); words = fstCompiler.compile();
aliases = null; // no longer needed aliases = null; // no longer needed
morphAliases = null; // no longer needed morphAliases = null; // no longer needed
success = true; success = true;
@ -414,7 +414,7 @@ public class Dictionary {
private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException { private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException {
IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs); FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
IntsRefBuilder scratch = new IntsRefBuilder(); IntsRefBuilder scratch = new IntsRefBuilder();
for (Map.Entry<String,List<Integer>> entry : affixes.entrySet()) { for (Map.Entry<String,List<Integer>> entry : affixes.entrySet()) {
Util.toUTF32(entry.getKey(), scratch); Util.toUTF32(entry.getKey(), scratch);
@ -423,9 +423,9 @@ public class Dictionary {
for (Integer c : entries) { for (Integer c : entries) {
output.ints[output.length++] = c; output.ints[output.length++] = c;
} }
builder.add(scratch.get(), output); fstCompiler.add(scratch.get(), output);
} }
return builder.finish(); return fstCompiler.compile();
} }
static String escapeDash(String re) { static String escapeDash(String re) {
@ -608,14 +608,14 @@ public class Dictionary {
} }
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs); FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
IntsRefBuilder scratchInts = new IntsRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder();
for (Map.Entry<String,String> entry : mappings.entrySet()) { for (Map.Entry<String,String> entry : mappings.entrySet()) {
Util.toUTF16(entry.getKey(), scratchInts); Util.toUTF16(entry.getKey(), scratchInts);
builder.add(scratchInts.get(), new CharsRef(entry.getValue())); fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
} }
return builder.finish(); return fstCompiler.compile();
} }
/** pattern accepts optional BOM + SET + any whitespace */ /** pattern accepts optional BOM + SET + any whitespace */
@ -776,7 +776,7 @@ public class Dictionary {
* @param decoder CharsetDecoder used to decode the contents of the file * @param decoder CharsetDecoder used to decode the contents of the file
* @throws IOException Can be thrown while reading from the file * @throws IOException Can be thrown while reading from the file
*/ */
private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException { private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, FSTCompiler<IntsRef> words) throws IOException {
BytesRefBuilder flagsScratch = new BytesRefBuilder(); BytesRefBuilder flagsScratch = new BytesRefBuilder();
IntsRefBuilder scratchInts = new IntsRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder();

View File

@ -35,6 +35,7 @@ import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc; import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader; import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.FSTCompiler;
/** /**
* Provides the ability to override any {@link KeywordAttribute} aware stemmer * Provides the ability to override any {@link KeywordAttribute} aware stemmer
@ -203,7 +204,7 @@ public final class StemmerOverrideFilter extends TokenFilter {
*/ */
public StemmerOverrideMap build() throws IOException { public StemmerOverrideMap build() throws IOException {
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>( FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(
FST.INPUT_TYPE.BYTE4, outputs); FST.INPUT_TYPE.BYTE4, outputs);
final int[] sort = hash.sort(); final int[] sort = hash.sort();
IntsRefBuilder intsSpare = new IntsRefBuilder(); IntsRefBuilder intsSpare = new IntsRefBuilder();
@ -213,9 +214,9 @@ public final class StemmerOverrideFilter extends TokenFilter {
int id = sort[i]; int id = sort[i];
BytesRef bytesRef = hash.get(id, spare); BytesRef bytesRef = hash.get(id, spare);
intsSpare.copyUTF8Bytes(bytesRef); intsSpare.copyUTF8Bytes(bytesRef);
builder.add(intsSpare.get(), new BytesRef(outputValues.get(id))); fstCompiler.add(intsSpare.get(), new BytesRef(outputValues.get(id)));
} }
return new StemmerOverrideMap(builder.finish(), ignoreCase); return new StemmerOverrideMap(fstCompiler.compile(), ignoreCase);
} }
} }

View File

@ -39,6 +39,7 @@ import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Util; import org.apache.lucene.util.fst.Util;
/** /**
@ -213,8 +214,8 @@ public class SynonymMap {
public SynonymMap build() throws IOException { public SynonymMap build() throws IOException {
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
// TODO: are we using the best sharing options? // TODO: are we using the best sharing options?
org.apache.lucene.util.fst.Builder<BytesRef> builder = FSTCompiler<BytesRef> fstCompiler =
new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, outputs); new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
BytesRefBuilder scratch = new BytesRefBuilder(); BytesRefBuilder scratch = new BytesRefBuilder();
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
@ -278,10 +279,10 @@ public class SynonymMap {
scratch.setLength(scratchOutput.getPosition()); scratch.setLength(scratchOutput.getPosition());
//System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count); //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
builder.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef()); fstCompiler.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
} }
FST<BytesRef> fst = builder.finish(); FST<BytesRef> fst = fstCompiler.compile();
return new SynonymMap(fst, words, maxHorizontalContext); return new SynonymMap(fst, words, maxHorizontalContext);
} }
} }

View File

@ -30,7 +30,7 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.CharSequenceOutputs; import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs; import org.apache.lucene.util.fst.Outputs;
@ -196,26 +196,26 @@ public class TestDictionary extends LuceneTestCase {
public void testReplacements() throws Exception { public void testReplacements() throws Exception {
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs); FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
IntsRefBuilder scratchInts = new IntsRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder();
// a -> b // a -> b
Util.toUTF16("a", scratchInts); Util.toUTF16("a", scratchInts);
builder.add(scratchInts.get(), new CharsRef("b")); fstCompiler.add(scratchInts.get(), new CharsRef("b"));
// ab -> c // ab -> c
Util.toUTF16("ab", scratchInts); Util.toUTF16("ab", scratchInts);
builder.add(scratchInts.get(), new CharsRef("c")); fstCompiler.add(scratchInts.get(), new CharsRef("c"));
// c -> de // c -> de
Util.toUTF16("c", scratchInts); Util.toUTF16("c", scratchInts);
builder.add(scratchInts.get(), new CharsRef("de")); fstCompiler.add(scratchInts.get(), new CharsRef("de"));
// def -> gh // def -> gh
Util.toUTF16("def", scratchInts); Util.toUTF16("def", scratchInts);
builder.add(scratchInts.get(), new CharsRef("gh")); fstCompiler.add(scratchInts.get(), new CharsRef("gh"));
FST<CharsRef> fst = builder.finish(); FST<CharsRef> fst = fstCompiler.compile();
StringBuilder sb = new StringBuilder("atestanother"); StringBuilder sb = new StringBuilder("atestanother");
Dictionary.applyMappings(fst, sb); Dictionary.applyMappings(fst, sb);

View File

@ -29,7 +29,7 @@ import java.util.TreeMap;
import org.apache.lucene.analysis.ja.util.CSVUtil; import org.apache.lucene.analysis.ja.util.CSVUtil;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs;
@ -99,7 +99,7 @@ public final class UserDictionary implements Dictionary {
List<int[]> segmentations = new ArrayList<>(featureEntries.size()); List<int[]> segmentations = new ArrayList<>(featureEntries.size());
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput); FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
IntsRefBuilder scratch = new IntsRefBuilder(); IntsRefBuilder scratch = new IntsRefBuilder();
long ord = 0; long ord = 0;
@ -136,11 +136,11 @@ public final class UserDictionary implements Dictionary {
for (int i = 0; i < token.length(); i++) { for (int i = 0; i < token.length(); i++) {
scratch.setIntAt(i, (int) token.charAt(i)); scratch.setIntAt(i, (int) token.charAt(i));
} }
fstBuilder.add(scratch.get(), ord); fstCompiler.add(scratch.get(), ord);
segmentations.add(wordIdAndLength); segmentations.add(wordIdAndLength);
ord++; ord++;
} }
this.fst = new TokenInfoFST(fstBuilder.finish(), false); this.fst = new TokenInfoFST(fstCompiler.compile(), false);
this.data = data.toArray(new String[data.size()]); this.data = data.toArray(new String[data.size()]);
this.segmentations = segmentations.toArray(new int[segmentations.size()][]); this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
} }

View File

@ -31,7 +31,7 @@ import java.util.stream.Stream;
import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat; import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs;
@ -97,7 +97,7 @@ class TokenInfoDictionaryBuilder {
lines.sort(Comparator.comparing(entry -> entry[0])); lines.sort(Comparator.comparing(entry -> entry[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15); FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
IntsRefBuilder scratch = new IntsRefBuilder(); IntsRefBuilder scratch = new IntsRefBuilder();
long ord = -1; // first ord will be 0 long ord = -1; // first ord will be 0
String lastValue = null; String lastValue = null;
@ -120,12 +120,12 @@ class TokenInfoDictionaryBuilder {
for (int i = 0; i < token.length(); i++) { for (int i = 0; i < token.length(); i++) {
scratch.setIntAt(i, (int) token.charAt(i)); scratch.setIntAt(i, (int) token.charAt(i));
} }
fstBuilder.add(scratch.get(), ord); fstCompiler.add(scratch.get(), ord);
} }
dictionary.addMapping((int) ord, offset); dictionary.addMapping((int) ord, offset);
offset = next; offset = next;
} }
dictionary.setFST(fstBuilder.finish()); dictionary.setFST(fstCompiler.compile());
return dictionary; return dictionary;
} }

View File

@ -25,7 +25,7 @@ import java.util.List;
import org.apache.lucene.analysis.ko.POS; import org.apache.lucene.analysis.ko.POS;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs;
@ -83,7 +83,7 @@ public final class UserDictionary implements Dictionary {
entries.sort(Comparator.comparing(e -> e.split("\\s+")[0])); entries.sort(Comparator.comparing(e -> e.split("\\s+")[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput); FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
IntsRefBuilder scratch = new IntsRefBuilder(); IntsRefBuilder scratch = new IntsRefBuilder();
String lastToken = null; String lastToken = null;
@ -129,11 +129,11 @@ public final class UserDictionary implements Dictionary {
for (int i = 0; i < token.length(); i++) { for (int i = 0; i < token.length(); i++) {
scratch.setIntAt(i, token.charAt(i)); scratch.setIntAt(i, token.charAt(i));
} }
fstBuilder.add(scratch.get(), ord); fstCompiler.add(scratch.get(), ord);
lastToken = token; lastToken = token;
ord ++; ord ++;
} }
this.fst = new TokenInfoFST(fstBuilder.finish()); this.fst = new TokenInfoFST(fstCompiler.compile());
this.segmentations = segmentations.toArray(new int[segmentations.size()][]); this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
this.rightIds = new short[rightIds.size()]; this.rightIds = new short[rightIds.size()];
for (int i = 0; i < rightIds.size(); i++) { for (int i = 0; i < rightIds.size(); i++) {

View File

@ -30,7 +30,7 @@ import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs;
@ -90,7 +90,7 @@ class TokenInfoDictionaryBuilder {
lines.sort(Comparator.comparing(left -> left[0])); lines.sort(Comparator.comparing(left -> left[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15); FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
IntsRefBuilder scratch = new IntsRefBuilder(); IntsRefBuilder scratch = new IntsRefBuilder();
long ord = -1; // first ord will be 0 long ord = -1; // first ord will be 0
String lastValue = null; String lastValue = null;
@ -116,12 +116,12 @@ class TokenInfoDictionaryBuilder {
for (int i = 0; i < surfaceForm.length(); i++) { for (int i = 0; i < surfaceForm.length(); i++) {
scratch.setIntAt(i, surfaceForm.charAt(i)); scratch.setIntAt(i, surfaceForm.charAt(i));
} }
fstBuilder.add(scratch.get(), ord); fstCompiler.add(scratch.get(), ord);
} }
dictionary.addMapping((int) ord, offset); dictionary.addMapping((int) ord, offset);
offset = next; offset = next;
} }
dictionary.setFST(fstBuilder.finish()); dictionary.setFST(fstCompiler.compile());
return dictionary; return dictionary;
} }
} }

View File

@ -41,7 +41,7 @@ import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util; import org.apache.lucene.util.fst.Util;
@ -183,15 +183,15 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
private void updateFST(SortedMap<String, Double> weights) throws IOException { private void updateFST(SortedMap<String, Double> weights) throws IOException {
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
BytesRefBuilder scratchBytes = new BytesRefBuilder(); BytesRefBuilder scratchBytes = new BytesRefBuilder();
IntsRefBuilder scratchInts = new IntsRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder();
for (Map.Entry<String, Double> entry : weights.entrySet()) { for (Map.Entry<String, Double> entry : weights.entrySet()) {
scratchBytes.copyChars(entry.getKey()); scratchBytes.copyChars(entry.getKey());
fstBuilder.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry fstCompiler.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry
.getValue().longValue()); .getValue().longValue());
} }
fst = fstBuilder.finish(); fst = fstCompiler.compile();
} }

View File

@ -33,7 +33,7 @@ import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util; import org.apache.lucene.util.fst.Util;
@ -219,7 +219,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
} }
private class FSTFieldWriter extends FieldWriter { private class FSTFieldWriter extends FieldWriter {
private final Builder<Long> fstBuilder; private final FSTCompiler<Long> fstCompiler;
private final PositiveIntOutputs fstOutputs; private final PositiveIntOutputs fstOutputs;
private final long startTermsFilePointer; private final long startTermsFilePointer;
@ -233,12 +233,12 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException { public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
this.fieldInfo = fieldInfo; this.fieldInfo = fieldInfo;
fstOutputs = PositiveIntOutputs.getSingleton(); fstOutputs = PositiveIntOutputs.getSingleton();
fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs); fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
indexStart = out.getFilePointer(); indexStart = out.getFilePointer();
////System.out.println("VGW: field=" + fieldInfo.name); ////System.out.println("VGW: field=" + fieldInfo.name);
// Always put empty string in // Always put empty string in
fstBuilder.add(new IntsRef(), termsFilePointer); fstCompiler.add(new IntsRef(), termsFilePointer);
startTermsFilePointer = termsFilePointer; startTermsFilePointer = termsFilePointer;
} }
@ -269,7 +269,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
final int lengthSave = text.length; final int lengthSave = text.length;
text.length = indexedTermPrefixLength(lastTerm.get(), text); text.length = indexedTermPrefixLength(lastTerm.get(), text);
try { try {
fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer); fstCompiler.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
} finally { } finally {
text.length = lengthSave; text.length = lengthSave;
} }
@ -278,7 +278,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
@Override @Override
public void finish(long termsFilePointer) throws IOException { public void finish(long termsFilePointer) throws IOException {
fst = fstBuilder.finish(); fst = fstCompiler.compile();
if (fst != null) { if (fst != null) {
fst.save(out); fst.save(out);
} }

View File

@ -45,7 +45,7 @@ import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util; import org.apache.lucene.util.fst.Util;
@ -361,16 +361,14 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
} }
} }
final Builder<Output> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, final FSTCompiler<Output> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS).shouldShareNonSingletonNodes(false).build();
0, 0, true, false, Integer.MAX_VALUE,
FST_OUTPUTS, true, 15);
//if (DEBUG) { //if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix); // System.out.println(" compile index for prefix=" + prefix);
//} //}
//indexBuilder.DEBUG = false; //indexBuilder.DEBUG = false;
final byte[] bytes = scratchBytes.toArrayCopy(); final byte[] bytes = scratchBytes.toArrayCopy();
assert bytes.length > 0; assert bytes.length > 0;
indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef),
FST_OUTPUTS.newOutput(new BytesRef(bytes, 0, bytes.length), FST_OUTPUTS.newOutput(new BytesRef(bytes, 0, bytes.length),
0, Long.MAX_VALUE-(sumTotalTermCount-1))); 0, Long.MAX_VALUE-(sumTotalTermCount-1)));
scratchBytes.reset(); scratchBytes.reset();
@ -381,7 +379,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
for(PendingBlock block : blocks) { for(PendingBlock block : blocks) {
if (block.subIndices != null) { if (block.subIndices != null) {
for(SubIndex subIndex : block.subIndices) { for(SubIndex subIndex : block.subIndices) {
append(indexBuilder, subIndex.index, termOrdOffset + subIndex.termOrdStart, scratchIntsRef); append(fstCompiler, subIndex.index, termOrdOffset + subIndex.termOrdStart, scratchIntsRef);
} }
block.subIndices = null; block.subIndices = null;
} }
@ -391,7 +389,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
assert sumTotalTermCount == totFloorTermCount; assert sumTotalTermCount == totFloorTermCount;
index = indexBuilder.finish(); index = fstCompiler.compile();
assert subIndices == null; assert subIndices == null;
/* /*
@ -405,7 +403,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
// TODO: maybe we could add bulk-add method to // TODO: maybe we could add bulk-add method to
// Builder? Takes FST and unions it w/ current // Builder? Takes FST and unions it w/ current
// FST. // FST.
private void append(Builder<Output> builder, FST<Output> subIndex, long termOrdOffset, IntsRefBuilder scratchIntsRef) throws IOException { private void append(FSTCompiler<Output> fstCompiler, FST<Output> subIndex, long termOrdOffset, IntsRefBuilder scratchIntsRef) throws IOException {
final BytesRefFSTEnum<Output> subIndexEnum = new BytesRefFSTEnum<>(subIndex); final BytesRefFSTEnum<Output> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
BytesRefFSTEnum.InputOutput<Output> indexEnt; BytesRefFSTEnum.InputOutput<Output> indexEnt;
while ((indexEnt = subIndexEnum.next()) != null) { while ((indexEnt = subIndexEnum.next()) != null) {
@ -416,7 +414,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
//long blockTermCount = output.endOrd - output.startOrd + 1; //long blockTermCount = output.endOrd - output.startOrd + 1;
Output newOutput = FST_OUTPUTS.newOutput(output.bytes, termOrdOffset+output.startOrd, output.endOrd-termOrdOffset); Output newOutput = FST_OUTPUTS.newOutput(output.bytes, termOrdOffset+output.startOrd, output.endOrd-termOrdOffset);
//System.out.println(" append sub=" + indexEnt.input + " output=" + indexEnt.output + " termOrdOffset=" + termOrdOffset + " blockTermCount=" + blockTermCount + " newOutput=" + newOutput + " endOrd=" + (termOrdOffset+Long.MAX_VALUE-output.endOrd)); //System.out.println(" append sub=" + indexEnt.input + " output=" + indexEnt.output + " termOrdOffset=" + termOrdOffset + " blockTermCount=" + blockTermCount + " newOutput=" + newOutput + " endOrd=" + (termOrdOffset+Long.MAX_VALUE-output.endOrd));
builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), newOutput); fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), newOutput);
} }
} }
} }

View File

@ -41,7 +41,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util; import org.apache.lucene.util.fst.Util;
@ -287,7 +287,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
} }
final class TermsWriter { final class TermsWriter {
private final Builder<Long> builder; private final FSTCompiler<Long> fstCompiler;
private final PositiveIntOutputs outputs; private final PositiveIntOutputs outputs;
private final FieldInfo fieldInfo; private final FieldInfo fieldInfo;
private final int longsSize; private final int longsSize;
@ -311,7 +311,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
this.fieldInfo = fieldInfo; this.fieldInfo = fieldInfo;
this.longsSize = postingsWriter.setField(fieldInfo); this.longsSize = postingsWriter.setField(fieldInfo);
this.outputs = PositiveIntOutputs.getSingleton(); this.outputs = PositiveIntOutputs.getSingleton();
this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
this.lastBlockStatsFP = 0; this.lastBlockStatsFP = 0;
this.lastBlockMetaLongsFP = 0; this.lastBlockMetaLongsFP = 0;
@ -346,7 +346,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
} }
metaLongsOut.writeVLong(metaBytesOut.size() - lastMetaBytesFP); metaLongsOut.writeVLong(metaBytesOut.size() - lastMetaBytesFP);
builder.add(Util.toIntsRef(text, scratchTerm), numTerms); fstCompiler.add(Util.toIntsRef(text, scratchTerm), numTerms);
numTerms++; numTerms++;
lastMetaBytesFP = metaBytesOut.size(); lastMetaBytesFP = metaBytesOut.size();
@ -365,7 +365,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
metadata.statsOut = statsOut; metadata.statsOut = statsOut;
metadata.metaLongsOut = metaLongsOut; metadata.metaLongsOut = metaLongsOut;
metadata.metaBytesOut = metaBytesOut; metadata.metaBytesOut = metaBytesOut;
metadata.dict = builder.finish(); metadata.dict = fstCompiler.compile();
fields.add(metadata); fields.add(metadata);
} }
} }

View File

@ -41,7 +41,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util; import org.apache.lucene.util.fst.Util;
@ -247,7 +247,7 @@ public class FSTTermsWriter extends FieldsConsumer {
} }
final class TermsWriter { final class TermsWriter {
private final Builder<FSTTermOutputs.TermData> builder; private final FSTCompiler<FSTTermOutputs.TermData> fstCompiler;
private final FSTTermOutputs outputs; private final FSTTermOutputs outputs;
private final FieldInfo fieldInfo; private final FieldInfo fieldInfo;
private final int longsSize; private final int longsSize;
@ -261,7 +261,7 @@ public class FSTTermsWriter extends FieldsConsumer {
this.fieldInfo = fieldInfo; this.fieldInfo = fieldInfo;
this.longsSize = postingsWriter.setField(fieldInfo); this.longsSize = postingsWriter.setField(fieldInfo);
this.outputs = new FSTTermOutputs(fieldInfo, longsSize); this.outputs = new FSTTermOutputs(fieldInfo, longsSize);
this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
} }
public void finishTerm(BytesRef text, BlockTermState state) throws IOException { public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
@ -276,14 +276,14 @@ public class FSTTermsWriter extends FieldsConsumer {
meta.bytes = metaWriter.toArrayCopy(); meta.bytes = metaWriter.toArrayCopy();
metaWriter.reset(); metaWriter.reset();
} }
builder.add(Util.toIntsRef(text, scratchTerm), meta); fstCompiler.add(Util.toIntsRef(text, scratchTerm), meta);
numTerms++; numTerms++;
} }
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
// save FST dict // save FST dict
if (numTerms > 0) { if (numTerms > 0) {
final FST<FSTTermOutputs.TermData> fst = builder.finish(); final FST<FSTTermOutputs.TermData> fst = fstCompiler.compile();
fields.add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, fst)); fields.add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, fst));
} }
} }

View File

@ -52,7 +52,7 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs; import org.apache.lucene.util.fst.PairOutputs;
@ -539,11 +539,11 @@ class SimpleTextFieldsReader extends FieldsProducer {
private void loadTerms() throws IOException { private void loadTerms() throws IOException {
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(); PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton();
final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b; final FSTCompiler<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstCompiler;
final PairOutputs<Long,Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs); final PairOutputs<Long,Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<>(posIntOutputs, final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<>(posIntOutputs,
outputsInner); outputsInner);
b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
IndexInput in = SimpleTextFieldsReader.this.in.clone(); IndexInput in = SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart); in.seek(termsStart);
final BytesRefBuilder lastTerm = new BytesRefBuilder(); final BytesRefBuilder lastTerm = new BytesRefBuilder();
@ -556,7 +556,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
SimpleTextUtil.readLine(in, scratch); SimpleTextUtil.readLine(in, scratch);
if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) { if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) {
if (lastDocsStart != -1) { if (lastDocsStart != -1) {
b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
outputs.newPair(lastDocsStart, outputs.newPair(lastDocsStart,
outputsInner.newPair((long) docFreq, totalTermFreq))); outputsInner.newPair((long) docFreq, totalTermFreq)));
sumTotalTermFreq += totalTermFreq; sumTotalTermFreq += totalTermFreq;
@ -574,7 +574,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length()) - 1; totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length()) - 1;
} else if (StringHelper.startsWith(scratch.get(), TERM)) { } else if (StringHelper.startsWith(scratch.get(), TERM)) {
if (lastDocsStart != -1) { if (lastDocsStart != -1) {
b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart, fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
outputsInner.newPair((long) docFreq, totalTermFreq))); outputsInner.newPair((long) docFreq, totalTermFreq)));
} }
lastDocsStart = in.getFilePointer(); lastDocsStart = in.getFilePointer();
@ -589,7 +589,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
} }
} }
docCount = visitedDocs.cardinality(); docCount = visitedDocs.cardinality();
fst = b.finish(); fst = fstCompiler.compile();
/* /*
PrintStream ps = new PrintStream("out.dot"); PrintStream ps = new PrintStream("out.dot");
fst.toDot(ps); fst.toDot(ps);

View File

@ -30,6 +30,7 @@ import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util; import org.apache.lucene.util.fst.Util;
@ -202,19 +203,19 @@ public class FSTDictionary implements IndexDictionary {
*/ */
public static class Builder implements IndexDictionary.Builder { public static class Builder implements IndexDictionary.Builder {
protected final org.apache.lucene.util.fst.Builder<Long> fstBuilder; protected final FSTCompiler<Long> fstCompiler;
protected final IntsRefBuilder scratchInts; protected final IntsRefBuilder scratchInts;
public Builder() { public Builder() {
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
fstBuilder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE1, outputs); fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
scratchInts = new IntsRefBuilder(); scratchInts = new IntsRefBuilder();
} }
@Override @Override
public void add(BytesRef blockKey, long blockFilePointer) { public void add(BytesRef blockKey, long blockFilePointer) {
try { try {
fstBuilder.add(Util.toIntsRef(blockKey, scratchInts), blockFilePointer); fstCompiler.add(Util.toIntsRef(blockKey, scratchInts), blockFilePointer);
} catch (IOException e) { } catch (IOException e) {
// Should never happen. // Should never happen.
throw new RuntimeException(e); throw new RuntimeException(e);
@ -224,7 +225,7 @@ public class FSTDictionary implements IndexDictionary {
@Override @Override
public FSTDictionary build() { public FSTDictionary build() {
try { try {
return new FSTDictionary(fstBuilder.finish()); return new FSTDictionary(fstCompiler.compile());
} catch (IOException e) { } catch (IOException e) {
// Should never happen. // Should never happen.
throw new RuntimeException(e); throw new RuntimeException(e);

View File

@ -44,7 +44,7 @@ import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
@ -454,29 +454,27 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
} }
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).shouldShareNonSingletonNodes(false).build();
0, 0, true, false, Integer.MAX_VALUE,
outputs, true, 15);
//if (DEBUG) { //if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix); // System.out.println(" compile index for prefix=" + prefix);
//} //}
//indexBuilder.DEBUG = false; //indexBuilder.DEBUG = false;
final byte[] bytes = scratchBytes.toArrayCopy(); final byte[] bytes = scratchBytes.toArrayCopy();
assert bytes.length > 0; assert bytes.length > 0;
indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length)); fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
scratchBytes.reset(); scratchBytes.reset();
// Copy over index for all sub-blocks // Copy over index for all sub-blocks
for(PendingBlock block : blocks) { for(PendingBlock block : blocks) {
if (block.subIndices != null) { if (block.subIndices != null) {
for(FST<BytesRef> subIndex : block.subIndices) { for(FST<BytesRef> subIndex : block.subIndices) {
append(indexBuilder, subIndex, scratchIntsRef); append(fstCompiler, subIndex, scratchIntsRef);
} }
block.subIndices = null; block.subIndices = null;
} }
} }
index = indexBuilder.finish(); index = fstCompiler.compile();
assert subIndices == null; assert subIndices == null;
@ -491,14 +489,14 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// TODO: maybe we could add bulk-add method to // TODO: maybe we could add bulk-add method to
// Builder? Takes FST and unions it w/ current // Builder? Takes FST and unions it w/ current
// FST. // FST.
private void append(Builder<BytesRef> builder, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException { private void append(FSTCompiler<BytesRef> fstCompiler, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
final BytesRefFSTEnum<BytesRef> subIndexEnum = new BytesRefFSTEnum<>(subIndex); final BytesRefFSTEnum<BytesRef> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
BytesRefFSTEnum.InputOutput<BytesRef> indexEnt; BytesRefFSTEnum.InputOutput<BytesRef> indexEnt;
while((indexEnt = subIndexEnum.next()) != null) { while((indexEnt = subIndexEnum.next()) != null) {
//if (DEBUG) { //if (DEBUG) {
// System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output); // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
//} //}
builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
} }
} }
} }

View File

@ -605,7 +605,7 @@ public final class FST<T> implements Accountable {
// serializes new node by appending its bytes to the end // serializes new node by appending its bytes to the end
// of the current byte[] // of the current byte[]
long addNode(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn) throws IOException { long addNode(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
T NO_OUTPUT = outputs.getNoOutput(); T NO_OUTPUT = outputs.getNoOutput();
//System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs); //System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs);
@ -616,28 +616,28 @@ public final class FST<T> implements Accountable {
return NON_FINAL_END_NODE; return NON_FINAL_END_NODE;
} }
} }
final long startAddress = builder.bytes.getPosition(); final long startAddress = fstCompiler.bytes.getPosition();
//System.out.println(" startAddr=" + startAddress); //System.out.println(" startAddr=" + startAddress);
final boolean doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(builder, nodeIn); final boolean doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(fstCompiler, nodeIn);
if (doFixedLengthArcs) { if (doFixedLengthArcs) {
//System.out.println(" fixed length arcs"); //System.out.println(" fixed length arcs");
if (builder.numBytesPerArc.length < nodeIn.numArcs) { if (fstCompiler.numBytesPerArc.length < nodeIn.numArcs) {
builder.numBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, Integer.BYTES)]; fstCompiler.numBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, Integer.BYTES)];
builder.numLabelBytesPerArc = new int[builder.numBytesPerArc.length]; fstCompiler.numLabelBytesPerArc = new int[fstCompiler.numBytesPerArc.length];
} }
} }
builder.arcCount += nodeIn.numArcs; fstCompiler.arcCount += nodeIn.numArcs;
final int lastArc = nodeIn.numArcs-1; final int lastArc = nodeIn.numArcs-1;
long lastArcStart = builder.bytes.getPosition(); long lastArcStart = fstCompiler.bytes.getPosition();
int maxBytesPerArc = 0; int maxBytesPerArc = 0;
int maxBytesPerArcWithoutLabel = 0; int maxBytesPerArcWithoutLabel = 0;
for(int arcIdx=0; arcIdx < nodeIn.numArcs; arcIdx++) { for(int arcIdx=0; arcIdx < nodeIn.numArcs; arcIdx++) {
final Builder.Arc<T> arc = nodeIn.arcs[arcIdx]; final FSTCompiler.Arc<T> arc = nodeIn.arcs[arcIdx];
final Builder.CompiledNode target = (Builder.CompiledNode) arc.target; final FSTCompiler.CompiledNode target = (FSTCompiler.CompiledNode) arc.target;
int flags = 0; int flags = 0;
//System.out.println(" arc " + arcIdx + " label=" + arc.label + " -> target=" + target.node); //System.out.println(" arc " + arcIdx + " label=" + arc.label + " -> target=" + target.node);
@ -645,7 +645,7 @@ public final class FST<T> implements Accountable {
flags += BIT_LAST_ARC; flags += BIT_LAST_ARC;
} }
if (builder.lastFrozenNode == target.node && !doFixedLengthArcs) { if (fstCompiler.lastFrozenNode == target.node && !doFixedLengthArcs) {
// TODO: for better perf (but more RAM used) we // TODO: for better perf (but more RAM used) we
// could avoid this except when arc is "near" the // could avoid this except when arc is "near" the
// last arc: // last arc:
@ -671,36 +671,36 @@ public final class FST<T> implements Accountable {
flags += BIT_ARC_HAS_OUTPUT; flags += BIT_ARC_HAS_OUTPUT;
} }
builder.bytes.writeByte((byte) flags); fstCompiler.bytes.writeByte((byte) flags);
long labelStart = builder.bytes.getPosition(); long labelStart = fstCompiler.bytes.getPosition();
writeLabel(builder.bytes, arc.label); writeLabel(fstCompiler.bytes, arc.label);
int numLabelBytes = (int) (builder.bytes.getPosition() - labelStart); int numLabelBytes = (int) (fstCompiler.bytes.getPosition() - labelStart);
// System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + bytes.getPosition() + " output=" + outputs.outputToString(arc.output)); // System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + bytes.getPosition() + " output=" + outputs.outputToString(arc.output));
if (arc.output != NO_OUTPUT) { if (arc.output != NO_OUTPUT) {
outputs.write(arc.output, builder.bytes); outputs.write(arc.output, fstCompiler.bytes);
//System.out.println(" write output"); //System.out.println(" write output");
} }
if (arc.nextFinalOutput != NO_OUTPUT) { if (arc.nextFinalOutput != NO_OUTPUT) {
//System.out.println(" write final output"); //System.out.println(" write final output");
outputs.writeFinalOutput(arc.nextFinalOutput, builder.bytes); outputs.writeFinalOutput(arc.nextFinalOutput, fstCompiler.bytes);
} }
if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) { if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) {
assert target.node > 0; assert target.node > 0;
//System.out.println(" write target"); //System.out.println(" write target");
builder.bytes.writeVLong(target.node); fstCompiler.bytes.writeVLong(target.node);
} }
// just write the arcs "like normal" on first pass, but record how many bytes each one took // just write the arcs "like normal" on first pass, but record how many bytes each one took
// and max byte size: // and max byte size:
if (doFixedLengthArcs) { if (doFixedLengthArcs) {
int numArcBytes = (int) (builder.bytes.getPosition() - lastArcStart); int numArcBytes = (int) (fstCompiler.bytes.getPosition() - lastArcStart);
builder.numBytesPerArc[arcIdx] = numArcBytes; fstCompiler.numBytesPerArc[arcIdx] = numArcBytes;
builder.numLabelBytesPerArc[arcIdx] = numLabelBytes; fstCompiler.numLabelBytesPerArc[arcIdx] = numLabelBytes;
lastArcStart = builder.bytes.getPosition(); lastArcStart = fstCompiler.bytes.getPosition();
maxBytesPerArc = Math.max(maxBytesPerArc, numArcBytes); maxBytesPerArc = Math.max(maxBytesPerArc, numArcBytes);
maxBytesPerArcWithoutLabel = Math.max(maxBytesPerArcWithoutLabel, numArcBytes - numLabelBytes); maxBytesPerArcWithoutLabel = Math.max(maxBytesPerArcWithoutLabel, numArcBytes - numLabelBytes);
//System.out.println(" arcBytes=" + numArcBytes + " labelBytes=" + numLabelBytes); //System.out.println(" arcBytes=" + numArcBytes + " labelBytes=" + numLabelBytes);
@ -733,18 +733,18 @@ public final class FST<T> implements Accountable {
int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1; int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
assert labelRange > 0; assert labelRange > 0;
if (shouldExpandNodeWithDirectAddressing(builder, nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) { if (shouldExpandNodeWithDirectAddressing(fstCompiler, nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) {
writeNodeForDirectAddressing(builder, nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange); writeNodeForDirectAddressing(fstCompiler, nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange);
builder.directAddressingNodeCount++; fstCompiler.directAddressingNodeCount++;
} else { } else {
writeNodeForBinarySearch(builder, nodeIn, startAddress, maxBytesPerArc); writeNodeForBinarySearch(fstCompiler, nodeIn, startAddress, maxBytesPerArc);
builder.binarySearchNodeCount++; fstCompiler.binarySearchNodeCount++;
} }
} }
final long thisNodeAddress = builder.bytes.getPosition()-1; final long thisNodeAddress = fstCompiler.bytes.getPosition()-1;
builder.bytes.reverse(startAddress, thisNodeAddress); fstCompiler.bytes.reverse(startAddress, thisNodeAddress);
builder.nodeCount++; fstCompiler.nodeCount++;
return thisNodeAddress; return thisNodeAddress;
} }
@ -757,8 +757,8 @@ public final class FST<T> implements Accountable {
* of bytes, but they allow either binary search or direct addressing on the arcs (instead of linear * of bytes, but they allow either binary search or direct addressing on the arcs (instead of linear
* scan) on lookup by arc label. * scan) on lookup by arc label.
*/ */
private boolean shouldExpandNodeWithFixedLengthArcs(Builder<T> builder, Builder.UnCompiledNode<T> node) { private boolean shouldExpandNodeWithFixedLengthArcs(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> node) {
return builder.allowFixedLengthArcs && return fstCompiler.allowFixedLengthArcs &&
((node.depth <= FIXED_LENGTH_ARC_SHALLOW_DEPTH && node.numArcs >= FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS) || ((node.depth <= FIXED_LENGTH_ARC_SHALLOW_DEPTH && node.numArcs >= FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS) ||
node.numArcs >= FIXED_LENGTH_ARC_DEEP_NUM_ARCS); node.numArcs >= FIXED_LENGTH_ARC_DEEP_NUM_ARCS);
} }
@ -769,18 +769,18 @@ public final class FST<T> implements Accountable {
* Prefer direct addressing for performance if it does not oversize binary search byte size too much, * Prefer direct addressing for performance if it does not oversize binary search byte size too much,
* so that the arcs can be directly addressed by label. * so that the arcs can be directly addressed by label.
* *
* @see Builder#getDirectAddressingMaxOversizingFactor() * @see FSTCompiler#getDirectAddressingMaxOversizingFactor()
*/ */
private boolean shouldExpandNodeWithDirectAddressing(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, private boolean shouldExpandNodeWithDirectAddressing(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn,
int numBytesPerArc, int maxBytesPerArcWithoutLabel, int labelRange) { int numBytesPerArc, int maxBytesPerArcWithoutLabel, int labelRange) {
// Anticipate precisely the size of the encodings. // Anticipate precisely the size of the encodings.
int sizeForBinarySearch = numBytesPerArc * nodeIn.numArcs; int sizeForBinarySearch = numBytesPerArc * nodeIn.numArcs;
int sizeForDirectAddressing = getNumPresenceBytes(labelRange) + builder.numLabelBytesPerArc[0] int sizeForDirectAddressing = getNumPresenceBytes(labelRange) + fstCompiler.numLabelBytesPerArc[0]
+ maxBytesPerArcWithoutLabel * nodeIn.numArcs; + maxBytesPerArcWithoutLabel * nodeIn.numArcs;
// Determine the allowed oversize compared to binary search. // Determine the allowed oversize compared to binary search.
// This is defined by a parameter of FST Builder (default 1: no oversize). // This is defined by a parameter of FST Builder (default 1: no oversize).
int allowedOversize = (int) (sizeForBinarySearch * builder.getDirectAddressingMaxOversizingFactor()); int allowedOversize = (int) (sizeForBinarySearch * fstCompiler.getDirectAddressingMaxOversizingFactor());
int expansionCost = sizeForDirectAddressing - allowedOversize; int expansionCost = sizeForDirectAddressing - allowedOversize;
// Select direct addressing if either: // Select direct addressing if either:
@ -790,46 +790,46 @@ public final class FST<T> implements Accountable {
// In this case, decrement the credit by the oversize. // In this case, decrement the credit by the oversize.
// In addition, do not try to oversize to a clearly too large node size // In addition, do not try to oversize to a clearly too large node size
// (this is the DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR parameter). // (this is the DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR parameter).
if (expansionCost <= 0 || (builder.directAddressingExpansionCredit >= expansionCost if (expansionCost <= 0 || (fstCompiler.directAddressingExpansionCredit >= expansionCost
&& sizeForDirectAddressing <= allowedOversize * DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR)) { && sizeForDirectAddressing <= allowedOversize * DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR)) {
builder.directAddressingExpansionCredit -= expansionCost; fstCompiler.directAddressingExpansionCredit -= expansionCost;
return true; return true;
} }
return false; return false;
} }
private void writeNodeForBinarySearch(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArc) { private void writeNodeForBinarySearch(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArc) {
// Build the header in a buffer. // Build the header in a buffer.
// It is a false/special arc which is in fact a node header with node flags followed by node metadata. // It is a false/special arc which is in fact a node header with node flags followed by node metadata.
builder.fixedLengthArcsBuffer fstCompiler.fixedLengthArcsBuffer
.resetPosition() .resetPosition()
.writeByte(ARCS_FOR_BINARY_SEARCH) .writeByte(ARCS_FOR_BINARY_SEARCH)
.writeVInt(nodeIn.numArcs) .writeVInt(nodeIn.numArcs)
.writeVInt(maxBytesPerArc); .writeVInt(maxBytesPerArc);
int headerLen = builder.fixedLengthArcsBuffer.getPosition(); int headerLen = fstCompiler.fixedLengthArcsBuffer.getPosition();
// Expand the arcs in place, backwards. // Expand the arcs in place, backwards.
long srcPos = builder.bytes.getPosition(); long srcPos = fstCompiler.bytes.getPosition();
long destPos = startAddress + headerLen + nodeIn.numArcs * maxBytesPerArc; long destPos = startAddress + headerLen + nodeIn.numArcs * maxBytesPerArc;
assert destPos >= srcPos; assert destPos >= srcPos;
if (destPos > srcPos) { if (destPos > srcPos) {
builder.bytes.skipBytes((int) (destPos - srcPos)); fstCompiler.bytes.skipBytes((int) (destPos - srcPos));
for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) { for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) {
destPos -= maxBytesPerArc; destPos -= maxBytesPerArc;
int arcLen = builder.numBytesPerArc[arcIdx]; int arcLen = fstCompiler.numBytesPerArc[arcIdx];
srcPos -= arcLen; srcPos -= arcLen;
if (srcPos != destPos) { if (srcPos != destPos) {
assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " arcLen=" + arcLen + " nodeIn.numArcs=" + nodeIn.numArcs; assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " arcLen=" + arcLen + " nodeIn.numArcs=" + nodeIn.numArcs;
builder.bytes.copyBytes(srcPos, destPos, arcLen); fstCompiler.bytes.copyBytes(srcPos, destPos, arcLen);
} }
} }
} }
// Write the header. // Write the header.
builder.bytes.writeBytes(startAddress, builder.fixedLengthArcsBuffer.getBytes(), 0, headerLen); fstCompiler.bytes.writeBytes(startAddress, fstCompiler.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
} }
private void writeNodeForDirectAddressing(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArcWithoutLabel, int labelRange) { private void writeNodeForDirectAddressing(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArcWithoutLabel, int labelRange) {
// Expand the arcs backwards in a buffer because we remove the labels. // Expand the arcs backwards in a buffer because we remove the labels.
// So the obtained arcs might occupy less space. This is the reason why this // So the obtained arcs might occupy less space. This is the reason why this
// whole method is more complex. // whole method is more complex.
@ -837,64 +837,64 @@ public final class FST<T> implements Accountable {
// the presence bits, and the first label. Keep the first label. // the presence bits, and the first label. Keep the first label.
int headerMaxLen = 11; int headerMaxLen = 11;
int numPresenceBytes = getNumPresenceBytes(labelRange); int numPresenceBytes = getNumPresenceBytes(labelRange);
long srcPos = builder.bytes.getPosition(); long srcPos = fstCompiler.bytes.getPosition();
int totalArcBytes = builder.numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel; int totalArcBytes = fstCompiler.numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel;
int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes; int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes;
byte[] buffer = builder.fixedLengthArcsBuffer.ensureCapacity(bufferOffset).getBytes(); byte[] buffer = fstCompiler.fixedLengthArcsBuffer.ensureCapacity(bufferOffset).getBytes();
// Copy the arcs to the buffer, dropping all labels except first one. // Copy the arcs to the buffer, dropping all labels except first one.
for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) { for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) {
bufferOffset -= maxBytesPerArcWithoutLabel; bufferOffset -= maxBytesPerArcWithoutLabel;
int srcArcLen = builder.numBytesPerArc[arcIdx]; int srcArcLen = fstCompiler.numBytesPerArc[arcIdx];
srcPos -= srcArcLen; srcPos -= srcArcLen;
int labelLen = builder.numLabelBytesPerArc[arcIdx]; int labelLen = fstCompiler.numLabelBytesPerArc[arcIdx];
// Copy the flags. // Copy the flags.
builder.bytes.copyBytes(srcPos, buffer, bufferOffset, 1); fstCompiler.bytes.copyBytes(srcPos, buffer, bufferOffset, 1);
// Skip the label, copy the remaining. // Skip the label, copy the remaining.
int remainingArcLen = srcArcLen - 1 - labelLen; int remainingArcLen = srcArcLen - 1 - labelLen;
if (remainingArcLen != 0) { if (remainingArcLen != 0) {
builder.bytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen); fstCompiler.bytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen);
} }
if (arcIdx == 0) { if (arcIdx == 0) {
// Copy the label of the first arc only. // Copy the label of the first arc only.
bufferOffset -= labelLen; bufferOffset -= labelLen;
builder.bytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen); fstCompiler.bytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen);
} }
} }
assert bufferOffset == headerMaxLen + numPresenceBytes; assert bufferOffset == headerMaxLen + numPresenceBytes;
// Build the header in the buffer. // Build the header in the buffer.
// It is a false/special arc which is in fact a node header with node flags followed by node metadata. // It is a false/special arc which is in fact a node header with node flags followed by node metadata.
builder.fixedLengthArcsBuffer fstCompiler.fixedLengthArcsBuffer
.resetPosition() .resetPosition()
.writeByte(ARCS_FOR_DIRECT_ADDRESSING) .writeByte(ARCS_FOR_DIRECT_ADDRESSING)
.writeVInt(labelRange) // labelRange instead of numArcs. .writeVInt(labelRange) // labelRange instead of numArcs.
.writeVInt(maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc. .writeVInt(maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc.
int headerLen = builder.fixedLengthArcsBuffer.getPosition(); int headerLen = fstCompiler.fixedLengthArcsBuffer.getPosition();
// Prepare the builder byte store. Enlarge or truncate if needed. // Prepare the builder byte store. Enlarge or truncate if needed.
long nodeEnd = startAddress + headerLen + numPresenceBytes + totalArcBytes; long nodeEnd = startAddress + headerLen + numPresenceBytes + totalArcBytes;
long currentPosition = builder.bytes.getPosition(); long currentPosition = fstCompiler.bytes.getPosition();
if (nodeEnd >= currentPosition) { if (nodeEnd >= currentPosition) {
builder.bytes.skipBytes((int) (nodeEnd - currentPosition)); fstCompiler.bytes.skipBytes((int) (nodeEnd - currentPosition));
} else { } else {
builder.bytes.truncate(nodeEnd); fstCompiler.bytes.truncate(nodeEnd);
} }
assert builder.bytes.getPosition() == nodeEnd; assert fstCompiler.bytes.getPosition() == nodeEnd;
// Write the header. // Write the header.
long writeOffset = startAddress; long writeOffset = startAddress;
builder.bytes.writeBytes(writeOffset, builder.fixedLengthArcsBuffer.getBytes(), 0, headerLen); fstCompiler.bytes.writeBytes(writeOffset, fstCompiler.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
writeOffset += headerLen; writeOffset += headerLen;
// Write the presence bits // Write the presence bits
writePresenceBits(builder, nodeIn, writeOffset, numPresenceBytes); writePresenceBits(fstCompiler, nodeIn, writeOffset, numPresenceBytes);
writeOffset += numPresenceBytes; writeOffset += numPresenceBytes;
// Write the first label and the arcs. // Write the first label and the arcs.
builder.bytes.writeBytes(writeOffset, builder.fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes); fstCompiler.bytes.writeBytes(writeOffset, fstCompiler.fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes);
} }
private void writePresenceBits(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long dest, int numPresenceBytes) { private void writePresenceBits(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn, long dest, int numPresenceBytes) {
long bytePos = dest; long bytePos = dest;
byte presenceBits = 1; // The first arc is always present. byte presenceBits = 1; // The first arc is always present.
int presenceIndex = 0; int presenceIndex = 0;
@ -904,7 +904,7 @@ public final class FST<T> implements Accountable {
assert label > previousLabel; assert label > previousLabel;
presenceIndex += label - previousLabel; presenceIndex += label - previousLabel;
while (presenceIndex >= Byte.SIZE) { while (presenceIndex >= Byte.SIZE) {
builder.bytes.writeByte(bytePos++, presenceBits); fstCompiler.bytes.writeByte(bytePos++, presenceBits);
presenceBits = 0; presenceBits = 0;
presenceIndex -= Byte.SIZE; presenceIndex -= Byte.SIZE;
} }
@ -915,7 +915,7 @@ public final class FST<T> implements Accountable {
assert presenceIndex == (nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label) % 8; assert presenceIndex == (nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label) % 8;
assert presenceBits != 0; // The last byte is not 0. assert presenceBits != 0; // The last byte is not 0.
assert (presenceBits & (1 << presenceIndex)) != 0; // The last arc is always present. assert (presenceBits & (1 << presenceIndex)) != 0; // The last arc is always present.
builder.bytes.writeByte(bytePos++, presenceBits); fstCompiler.bytes.writeByte(bytePos++, presenceBits);
assert bytePos - dest == numPresenceBytes; assert bytePos - dest == numPresenceBytes;
} }

View File

@ -49,31 +49,9 @@ import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
* @lucene.experimental * @lucene.experimental
*/ */
public class Builder<T> { public class FSTCompiler<T> {
/** static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1f;
* Default oversizing factor used to decide whether to encode a node with direct addressing or binary search.
* Default is 1: ensure no oversizing on average.
* <p>
* This factor does not determine whether to encode a node with a list of variable length arcs or with
* fixed length arcs. It only determines the effective encoding of a node that is already known to be
* encoded with fixed length arcs.
* See {@code FST.shouldExpandNodeWithFixedLengthArcs()}
* and {@code FST.shouldExpandNodeWithDirectAddressing()}.
* <p>
* For English words we measured 217K nodes, only 3.27% nodes are encoded with fixed length arcs,
* and 99.99% of them with direct addressing. Overall FST memory reduced by 1.67%.
* <p>
* For worst case we measured 168K nodes, 50% of them are encoded with fixed length arcs,
* and 14% of them with direct encoding. Overall FST memory reduced by 0.8%.
* <p>
* Use {@code TestFstDirectAddressing.main()}
* and {@code TestFstDirectAddressing.testWorstCaseForDirectAddressing()}
* to evaluate a change.
*
* @see #setDirectAddressingMaxOversizingFactor
*/
static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1.0f;
private final NodeHash<T> dedupHash; private final NodeHash<T> dedupHash;
final FST<T> fst; final FST<T> fst;
@ -117,75 +95,29 @@ public class Builder<T> {
long binarySearchNodeCount; long binarySearchNodeCount;
long directAddressingNodeCount; long directAddressingNodeCount;
boolean allowFixedLengthArcs; final boolean allowFixedLengthArcs;
float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR; final float directAddressingMaxOversizingFactor;
long directAddressingExpansionCredit; long directAddressingExpansionCredit;
BytesStore bytes; final BytesStore bytes;
/** /**
* Instantiates an FST/FSA builder without any pruning. A shortcut to {@link * Instantiates an FST/FSA builder with default settings and pruning options turned off.
* #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs, boolean, int)} with * For more tuning and tweaking, see {@link Builder}.
* pruning options turned off.
*/ */
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) { public FSTCompiler(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, 1f);
} }
/** private FSTCompiler(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
* Instantiates an FST/FSA builder with all the possible tuning and construction boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
* tweaks. Read parameter documentation carefully. boolean allowFixedLengthArcs, int bytesPageBits, float directAddressingMaxOversizingFactor) {
*
* @param inputType
* The input type (transition labels). Can be anything from {@link INPUT_TYPE}
* enumeration. Shorter types will consume less memory. Strings (character sequences) are
* represented as {@link INPUT_TYPE#BYTE4} (full unicode codepoints).
*
* @param minSuffixCount1
* If pruning the input graph during construction, this threshold is used for telling
* if a node is kept or pruned. If transition_count(node) &gt;= minSuffixCount1, the node
* is kept.
*
* @param minSuffixCount2
* (Note: only Mike McCandless knows what this one is really doing...)
*
* @param doShareSuffix
* If <code>true</code>, the shared suffixes will be compacted into unique paths.
* This requires an additional RAM-intensive hash map for lookups in memory. Setting this parameter to
* <code>false</code> creates a single suffix path for all input sequences. This will result in a larger
* FST, but requires substantially less memory and CPU during building.
*
* @param doShareNonSingletonNodes
* Only used if doShareSuffix is true. Set this to
* true to ensure FST is fully minimal, at cost of more
* CPU and more RAM during building.
*
* @param shareMaxTailLength
* Only used if doShareSuffix is true. Set this to
* Integer.MAX_VALUE to ensure FST is fully minimal, at cost of more
* CPU and more RAM during building.
*
* @param outputs The output type for each input sequence. Applies only if building an FST. For
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
* singleton output object.
*
* @param allowFixedLengthArcs Pass false to disable the fixed length arc optimization (binary search or
* direct addressing) while building the FST; this will make the resulting FST smaller but slower to
* traverse.
*
* @param bytesPageBits How many bits wide to make each
* byte[] block in the BytesStore; if you know the FST
* will be large then make this larger. For example 15
* bits = 32768 byte pages.
*/
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
boolean allowFixedLengthArcs, int bytesPageBits) {
this.minSuffixCount1 = minSuffixCount1; this.minSuffixCount1 = minSuffixCount1;
this.minSuffixCount2 = minSuffixCount2; this.minSuffixCount2 = minSuffixCount2;
this.doShareNonSingletonNodes = doShareNonSingletonNodes; this.doShareNonSingletonNodes = doShareNonSingletonNodes;
this.shareMaxTailLength = shareMaxTailLength; this.shareMaxTailLength = shareMaxTailLength;
this.allowFixedLengthArcs = allowFixedLengthArcs; this.allowFixedLengthArcs = allowFixedLengthArcs;
this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor;
fst = new FST<>(inputType, outputs, bytesPageBits); fst = new FST<>(inputType, outputs, bytesPageBits);
bytes = fst.bytes; bytes = fst.bytes;
assert bytes != null; assert bytes != null;
@ -205,22 +137,145 @@ public class Builder<T> {
} }
/** /**
* Overrides the default the maximum oversizing of fixed array allowed to enable direct addressing * Fluent-style constructor for FST {@link FSTCompiler}.
* of arcs instead of binary search.
* <p> * <p>
* Setting this factor to a negative value (e.g. -1) effectively disables direct addressing, * Creates an FST/FSA builder with all the possible tuning and construction tweaks.
* only binary search nodes will be created. * Read parameter documentation carefully.
*
* @see #DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR
*/ */
public Builder<T> setDirectAddressingMaxOversizingFactor(float factor) { public static class Builder<T> {
directAddressingMaxOversizingFactor = factor;
return this; private final INPUT_TYPE inputType;
private final Outputs<T> outputs;
private int minSuffixCount1;
private int minSuffixCount2;
private boolean shouldShareSuffix = true;
private boolean shouldShareNonSingletonNodes = true;
private int shareMaxTailLength = Integer.MAX_VALUE;
private boolean allowFixedLengthArcs = true;
private int bytesPageBits = 15;
private float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR;
/**
* Creates a builder for the given input type and outputs. Every other setting keeps its
* default value until it is changed through one of the fluent setters of this class.
*
* @param inputType The input type (transition labels). Can be anything from {@link INPUT_TYPE}
* enumeration. Shorter types will consume less memory. Strings (character sequences) are
* represented as {@link INPUT_TYPE#BYTE4} (full unicode codepoints).
* @param outputs The output type for each input sequence. Applies only if building an FST. For
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
* singleton output object.
*/
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
this.inputType = inputType;
this.outputs = outputs;
}
/**
* If pruning the input graph during construction, this threshold is used for telling if a node is kept
* or pruned. If transition_count(node) &gt;= minSuffixCount1, the node is kept.
* <p>
* Default = 0.
*
* @param minSuffixCount1 the pruning threshold to store in this builder.
* @return this builder, so that calls can be chained.
*/
public Builder<T> minSuffixCount1(int minSuffixCount1) {
this.minSuffixCount1 = minSuffixCount1;
return this;
}
/**
* Better pruning: a node (and all following nodes) is pruned if fewer than this number of terms
* go through the prior node.
* <p>
* Default = 0.
*
* @param minSuffixCount2 the pruning threshold to store in this builder.
* @return this builder, so that calls can be chained.
*/
public Builder<T> minSuffixCount2(int minSuffixCount2) {
this.minSuffixCount2 = minSuffixCount2;
return this;
}
/**
* If {@code true}, the shared suffixes will be compacted into unique paths.
* This requires an additional RAM-intensive hash map for lookups in memory. Setting this parameter to
* {@code false} creates a single suffix path for all input sequences. This will result in a larger
* FST, but requires substantially less memory and CPU during building.
* <p>
* Default = {@code true}.
*
* @param shouldShareSuffix whether shared suffixes should be compacted.
* @return this builder, so that calls can be chained.
*/
public Builder<T> shouldShareSuffix(boolean shouldShareSuffix) {
this.shouldShareSuffix = shouldShareSuffix;
return this;
}
/**
* Only used if {@code shouldShareSuffix} is true. Set this to true to ensure FST is fully minimal,
* at cost of more CPU and more RAM during building.
* <p>
* Default = {@code true}.
*
* @param shouldShareNonSingletonNodes the value to store in this builder.
* @return this builder, so that calls can be chained.
*/
public Builder<T> shouldShareNonSingletonNodes(boolean shouldShareNonSingletonNodes) {
this.shouldShareNonSingletonNodes = shouldShareNonSingletonNodes;
return this;
}
/**
* Only used if {@code shouldShareSuffix} is true. Set this to Integer.MAX_VALUE to ensure FST is
* fully minimal, at cost of more CPU and more RAM during building.
* <p>
* Default = {@link Integer#MAX_VALUE}.
*
* @param shareMaxTailLength the value to store in this builder.
* @return this builder, so that calls can be chained.
*/
public Builder<T> shareMaxTailLength(int shareMaxTailLength) {
this.shareMaxTailLength = shareMaxTailLength;
return this;
}
/**
* Pass {@code false} to disable the fixed length arc optimization (binary search or direct addressing)
* while building the FST; this will make the resulting FST smaller but slower to traverse.
* <p>
* Default = {@code true}.
*
* @param allowFixedLengthArcs whether the fixed length arc optimization is allowed.
* @return this builder, so that calls can be chained.
*/
public Builder<T> allowFixedLengthArcs(boolean allowFixedLengthArcs) {
this.allowFixedLengthArcs = allowFixedLengthArcs;
return this;
}
/**
* How many bits wide to make each byte[] block in the BytesStore; if you know the FST
* will be large then make this larger. For example 15 bits = 32768 byte pages.
* <p>
* Default = 15.
*
* @param bytesPageBits the number of bits per byte[] block to store in this builder.
* @return this builder, so that calls can be chained.
*/
public Builder<T> bytesPageBits(int bytesPageBits) {
this.bytesPageBits = bytesPageBits;
return this;
}
/**
* Overrides the default maximum oversizing of the fixed array that is allowed in order to enable
* direct addressing of arcs instead of binary search.
* <p>
* Setting this factor to a negative value (e.g. -1) effectively disables direct addressing,
* only binary search nodes will be created.
* <p>
* This factor does not determine whether to encode a node with a list of variable length arcs or with
* fixed length arcs. It only determines the effective encoding of a node that is already known to be
* encoded with fixed length arcs.
* <p>
* Default = 1.
*
* @param factor the maximum oversizing factor to store in this builder.
* @return this builder, so that calls can be chained.
*/
public Builder<T> directAddressingMaxOversizingFactor(float factor) {
this.directAddressingMaxOversizingFactor = factor;
return this;
}
/**
 * Builds a new {@link FSTCompiler} from the settings currently held by this builder.
 *
 * @return a freshly constructed {@link FSTCompiler}.
 */
public FSTCompiler<T> build() {
  // Argument order follows the private FSTCompiler constructor.
  return new FSTCompiler<>(
      inputType,
      minSuffixCount1,
      minSuffixCount2,
      shouldShareSuffix,
      shouldShareNonSingletonNodes,
      shareMaxTailLength,
      outputs,
      allowFixedLengthArcs,
      bytesPageBits,
      directAddressingMaxOversizingFactor);
}
} }
/**
* @see #setDirectAddressingMaxOversizingFactor(float)
*/
public float getDirectAddressingMaxOversizingFactor() { public float getDirectAddressingMaxOversizingFactor() {
return directAddressingMaxOversizingFactor; return directAddressingMaxOversizingFactor;
} }
@ -514,7 +569,7 @@ public class Builder<T> {
/** Returns final FST. NOTE: this will return null if /** Returns final FST. NOTE: this will return null if
* nothing is accepted by the FST. */ * nothing is accepted by the FST. */
public FST<T> finish() throws IOException { public FST<T> compile() throws IOException {
final UnCompiledNode<T> root = frontier[0]; final UnCompiledNode<T> root = frontier[0];
@ -554,19 +609,19 @@ public class Builder<T> {
} }
/** Expert: holds a pending (seen but not yet serialized) arc. */ /** Expert: holds a pending (seen but not yet serialized) arc. */
public static class Arc<T> { static class Arc<T> {
public int label; // really an "unsigned" byte int label; // really an "unsigned" byte
public Node target; Node target;
public boolean isFinal; boolean isFinal;
public T output; T output;
public T nextFinalOutput; T nextFinalOutput;
} }
// NOTE: not many instances of Node or CompiledNode are in // NOTE: not many instances of Node or CompiledNode are in
// memory while the FST is being built; it's only the // memory while the FST is being built; it's only the
// current "frontier": // current "frontier":
static interface Node { interface Node {
boolean isCompiled(); boolean isCompiled();
} }
@ -583,20 +638,20 @@ public class Builder<T> {
} }
/** Expert: holds a pending (seen but not yet serialized) Node. */ /** Expert: holds a pending (seen but not yet serialized) Node. */
public static final class UnCompiledNode<T> implements Node { static final class UnCompiledNode<T> implements Node {
final Builder<T> owner; final FSTCompiler<T> owner;
public int numArcs; int numArcs;
public Arc<T>[] arcs; Arc<T>[] arcs;
// TODO: instead of recording isFinal/output on the // TODO: instead of recording isFinal/output on the
// node, maybe we should use -1 arc to mean "end" (like // node, maybe we should use -1 arc to mean "end" (like
// we do when reading the FST). Would simplify much // we do when reading the FST). Would simplify much
// code here... // code here...
public T output; T output;
public boolean isFinal; boolean isFinal;
public long inputCount; long inputCount;
/** This node's depth, starting from the automaton root. */ /** This node's depth, starting from the automaton root. */
public final int depth; final int depth;
/** /**
* @param depth * @param depth
@ -605,7 +660,7 @@ public class Builder<T> {
* fanout size). * fanout size).
*/ */
@SuppressWarnings({"rawtypes","unchecked"}) @SuppressWarnings({"rawtypes","unchecked"})
public UnCompiledNode(Builder<T> owner, int depth) { UnCompiledNode(FSTCompiler<T> owner, int depth) {
this.owner = owner; this.owner = owner;
arcs = (Arc<T>[]) new Arc[1]; arcs = (Arc<T>[]) new Arc[1];
arcs[0] = new Arc<>(); arcs[0] = new Arc<>();
@ -618,7 +673,7 @@ public class Builder<T> {
return false; return false;
} }
public void clear() { void clear() {
numArcs = 0; numArcs = 0;
isFinal = false; isFinal = false;
output = owner.NO_OUTPUT; output = owner.NO_OUTPUT;
@ -628,13 +683,13 @@ public class Builder<T> {
// for nodes on the frontier (even when reused). // for nodes on the frontier (even when reused).
} }
public T getLastOutput(int labelToMatch) { T getLastOutput(int labelToMatch) {
assert numArcs > 0; assert numArcs > 0;
assert arcs[numArcs-1].label == labelToMatch; assert arcs[numArcs-1].label == labelToMatch;
return arcs[numArcs-1].output; return arcs[numArcs-1].output;
} }
public void addArc(int label, Node target) { void addArc(int label, Node target) {
assert label >= 0; assert label >= 0;
assert numArcs == 0 || label > arcs[numArcs-1].label: "arc[numArcs-1].label=" + arcs[numArcs-1].label + " new label=" + label + " numArcs=" + numArcs; assert numArcs == 0 || label > arcs[numArcs-1].label: "arc[numArcs-1].label=" + arcs[numArcs-1].label + " new label=" + label + " numArcs=" + numArcs;
if (numArcs == arcs.length) { if (numArcs == arcs.length) {
@ -651,7 +706,7 @@ public class Builder<T> {
arc.isFinal = false; arc.isFinal = false;
} }
public void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) { void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) {
assert numArcs > 0; assert numArcs > 0;
final Arc<T> arc = arcs[numArcs-1]; final Arc<T> arc = arcs[numArcs-1];
assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch; assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch;
@ -661,14 +716,14 @@ public class Builder<T> {
arc.isFinal = isFinal; arc.isFinal = isFinal;
} }
public void deleteLast(int label, Node target) { void deleteLast(int label, Node target) {
assert numArcs > 0; assert numArcs > 0;
assert label == arcs[numArcs-1].label; assert label == arcs[numArcs-1].label;
assert target == arcs[numArcs-1].target; assert target == arcs[numArcs-1].target;
numArcs--; numArcs--;
} }
public void setLastOutput(int labelToMatch, T newOutput) { void setLastOutput(int labelToMatch, T newOutput) {
assert owner.validOutput(newOutput); assert owner.validOutput(newOutput);
assert numArcs > 0; assert numArcs > 0;
final Arc<T> arc = arcs[numArcs-1]; final Arc<T> arc = arcs[numArcs-1];
@ -677,7 +732,7 @@ public class Builder<T> {
} }
// pushes an output prefix forward onto all arcs // pushes an output prefix forward onto all arcs
public void prependOutput(T outputPrefix) { void prependOutput(T outputPrefix) {
assert owner.validOutput(outputPrefix); assert owner.validOutput(outputPrefix);
for(int arcIdx=0;arcIdx<numArcs;arcIdx++) { for(int arcIdx=0;arcIdx<numArcs;arcIdx++) {

View File

@ -39,7 +39,7 @@ final class NodeHash<T> {
this.in = in; this.in = in;
} }
private boolean nodesEqual(Builder.UnCompiledNode<T> node, long address) throws IOException { private boolean nodesEqual(FSTCompiler.UnCompiledNode<T> node, long address) throws IOException {
fst.readFirstRealTargetArc(address, scratchArc, in); fst.readFirstRealTargetArc(address, scratchArc, in);
// Fail fast for a node with fixed length arcs. // Fail fast for a node with fixed length arcs.
@ -58,10 +58,10 @@ final class NodeHash<T> {
} }
for(int arcUpto=0; arcUpto < node.numArcs; arcUpto++) { for(int arcUpto=0; arcUpto < node.numArcs; arcUpto++) {
final Builder.Arc<T> arc = node.arcs[arcUpto]; final FSTCompiler.Arc<T> arc = node.arcs[arcUpto];
if (arc.label != scratchArc.label() || if (arc.label != scratchArc.label() ||
!arc.output.equals(scratchArc.output()) || !arc.output.equals(scratchArc.output()) ||
((Builder.CompiledNode) arc.target).node != scratchArc.target() || ((FSTCompiler.CompiledNode) arc.target).node != scratchArc.target() ||
!arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) || !arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) ||
arc.isFinal != scratchArc.isFinal()) { arc.isFinal != scratchArc.isFinal()) {
return false; return false;
@ -82,16 +82,16 @@ final class NodeHash<T> {
// hash code for an unfrozen node. This must be identical // hash code for an unfrozen node. This must be identical
// to the frozen case (below)!! // to the frozen case (below)!!
private long hash(Builder.UnCompiledNode<T> node) { private long hash(FSTCompiler.UnCompiledNode<T> node) {
final int PRIME = 31; final int PRIME = 31;
//System.out.println("hash unfrozen"); //System.out.println("hash unfrozen");
long h = 0; long h = 0;
// TODO: maybe if number of arcs is high we can safely subsample? // TODO: maybe if number of arcs is high we can safely subsample?
for (int arcIdx=0; arcIdx < node.numArcs; arcIdx++) { for (int arcIdx=0; arcIdx < node.numArcs; arcIdx++) {
final Builder.Arc<T> arc = node.arcs[arcIdx]; final FSTCompiler.Arc<T> arc = node.arcs[arcIdx];
//System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal); //System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal);
h = PRIME * h + arc.label; h = PRIME * h + arc.label;
long n = ((Builder.CompiledNode) arc.target).node; long n = ((FSTCompiler.CompiledNode) arc.target).node;
h = PRIME * h + (int) (n^(n>>32)); h = PRIME * h + (int) (n^(n>>32));
h = PRIME * h + arc.output.hashCode(); h = PRIME * h + arc.output.hashCode();
h = PRIME * h + arc.nextFinalOutput.hashCode(); h = PRIME * h + arc.nextFinalOutput.hashCode();
@ -127,7 +127,7 @@ final class NodeHash<T> {
return h & Long.MAX_VALUE; return h & Long.MAX_VALUE;
} }
public long add(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn) throws IOException { public long add(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
//System.out.println("hash: add count=" + count + " vs " + table.size() + " mask=" + mask); //System.out.println("hash: add count=" + count + " vs " + table.size() + " mask=" + mask);
final long h = hash(nodeIn); final long h = hash(nodeIn);
long pos = h & mask; long pos = h & mask;
@ -136,7 +136,7 @@ final class NodeHash<T> {
final long v = table.get(pos); final long v = table.get(pos);
if (v == 0) { if (v == 0) {
// freeze & add // freeze & add
final long node = fst.addNode(builder, nodeIn); final long node = fst.addNode(fstCompiler, nodeIn);
//System.out.println(" now freeze node=" + node); //System.out.println(" now freeze node=" + node);
assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h; assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h;
count++; count++;

View File

@ -54,8 +54,7 @@ public class Test2BFST extends LuceneTestCase {
System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS"); System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
Outputs<Object> outputs = NoOutputs.getSingleton(); Outputs<Object> outputs = NoOutputs.getSingleton();
Object NO_OUTPUT = outputs.getNoOutput(); Object NO_OUTPUT = outputs.getNoOutput();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
true, 15);
int count = 0; int count = 0;
Random r = new Random(seed); Random r = new Random(seed);
@ -66,21 +65,21 @@ public class Test2BFST extends LuceneTestCase {
for(int i=10;i<ints2.length;i++) { for(int i=10;i<ints2.length;i++) {
ints2[i] = r.nextInt(256); ints2[i] = r.nextInt(256);
} }
b.add(input2, NO_OUTPUT); fstCompiler.add(input2, NO_OUTPUT);
count++; count++;
if (count % 100000 == 0) { if (count % 100000 == 0) {
System.out.println(count + ": " + b.fstRamBytesUsed() + " bytes; " + b.getNodeCount() + " nodes"); System.out.println(count + ": " + fstCompiler.fstRamBytesUsed() + " bytes; " + fstCompiler.getNodeCount() + " nodes");
} }
if (b.getNodeCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) { if (fstCompiler.getNodeCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) {
break; break;
} }
nextInput(r, ints2); nextInput(r, ints2);
} }
FST<Object> fst = b.finish(); FST<Object> fst = fstCompiler.compile();
for(int verify=0;verify<2;verify++) { for(int verify=0;verify<2;verify++) {
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]"); System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + fstCompiler.getNodeCount() + "; arcCount=" + fstCompiler.getArcCount() + "]");
Arrays.fill(ints2, 0); Arrays.fill(ints2, 0);
r = new Random(seed); r = new Random(seed);
@ -136,8 +135,7 @@ public class Test2BFST extends LuceneTestCase {
{ {
System.out.println("\nTEST: 3 GB size; outputs=bytes"); System.out.println("\nTEST: 3 GB size; outputs=bytes");
Outputs<BytesRef> outputs = ByteSequenceOutputs.getSingleton(); Outputs<BytesRef> outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
true, 15);
byte[] outputBytes = new byte[20]; byte[] outputBytes = new byte[20];
BytesRef output = new BytesRef(outputBytes); BytesRef output = new BytesRef(outputBytes);
@ -147,10 +145,10 @@ public class Test2BFST extends LuceneTestCase {
while(true) { while(true) {
r.nextBytes(outputBytes); r.nextBytes(outputBytes);
//System.out.println("add: " + input + " -> " + output); //System.out.println("add: " + input + " -> " + output);
b.add(input, BytesRef.deepCopyOf(output)); fstCompiler.add(input, BytesRef.deepCopyOf(output));
count++; count++;
if (count % 10000 == 0) { if (count % 10000 == 0) {
long size = b.fstRamBytesUsed(); long size = fstCompiler.fstRamBytesUsed();
if (count % 1000000 == 0) { if (count % 1000000 == 0) {
System.out.println(count + "...: " + size + " bytes"); System.out.println(count + "...: " + size + " bytes");
} }
@ -161,10 +159,10 @@ public class Test2BFST extends LuceneTestCase {
nextInput(r, ints); nextInput(r, ints);
} }
FST<BytesRef> fst = b.finish(); FST<BytesRef> fst = fstCompiler.compile();
for(int verify=0;verify<2;verify++) { for(int verify=0;verify<2;verify++) {
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]"); System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + fstCompiler.getNodeCount() + "; arcCount=" + fstCompiler.getArcCount() + "]");
r = new Random(seed); r = new Random(seed);
Arrays.fill(ints, 0); Arrays.fill(ints, 0);
@ -216,8 +214,7 @@ public class Test2BFST extends LuceneTestCase {
{ {
System.out.println("\nTEST: 3 GB size; outputs=long"); System.out.println("\nTEST: 3 GB size; outputs=long");
Outputs<Long> outputs = PositiveIntOutputs.getSingleton(); Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
true, 15);
long output = 1; long output = 1;
@ -226,11 +223,11 @@ public class Test2BFST extends LuceneTestCase {
Random r = new Random(seed); Random r = new Random(seed);
while(true) { while(true) {
//System.out.println("add: " + input + " -> " + output); //System.out.println("add: " + input + " -> " + output);
b.add(input, output); fstCompiler.add(input, output);
output += 1+r.nextInt(10); output += 1+r.nextInt(10);
count++; count++;
if (count % 10000 == 0) { if (count % 10000 == 0) {
long size = b.fstRamBytesUsed(); long size = fstCompiler.fstRamBytesUsed();
if (count % 1000000 == 0) { if (count % 1000000 == 0) {
System.out.println(count + "...: " + size + " bytes"); System.out.println(count + "...: " + size + " bytes");
} }
@ -241,11 +238,11 @@ public class Test2BFST extends LuceneTestCase {
nextInput(r, ints); nextInput(r, ints);
} }
FST<Long> fst = b.finish(); FST<Long> fst = fstCompiler.compile();
for(int verify=0;verify<2;verify++) { for(int verify=0;verify<2;verify++) {
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]"); System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + fstCompiler.getNodeCount() + "; arcCount=" + fstCompiler.getArcCount() + "]");
Arrays.fill(ints, 0); Arrays.fill(ints, 0);

View File

@ -35,7 +35,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
public class TestFstDirectAddressing extends LuceneTestCase { public class TestFSTDirectAddressing extends LuceneTestCase {
public void testDenseWithGap() throws Exception { public void testDenseWithGap() throws Exception {
List<String> words = Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm"); List<String> words = Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm");
@ -86,13 +86,13 @@ public class TestFstDirectAddressing extends LuceneTestCase {
Collections.sort(wordList); Collections.sort(wordList);
// Disable direct addressing and measure the FST size. // Disable direct addressing and measure the FST size.
Builder<Object> builder = createBuilder(-1f); FSTCompiler<Object> fstCompiler = createFSTCompiler(-1f);
FST<Object> fst = buildFST(wordList, builder); FST<Object> fst = buildFST(wordList, fstCompiler);
long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed(); long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed();
// Enable direct addressing and measure the FST size. // Enable direct addressing and measure the FST size.
builder = createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR); fstCompiler = createFSTCompiler(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
fst = buildFST(wordList, builder); fst = buildFST(wordList, fstCompiler);
long ramBytesUsed = fst.ramBytesUsed(); long ramBytesUsed = fst.ramBytesUsed();
// Compute the size increase in percents. // Compute the size increase in percents.
@ -107,42 +107,43 @@ public class TestFstDirectAddressing extends LuceneTestCase {
directAddressingMemoryIncreasePercent < MEMORY_INCREASE_LIMIT_PERCENT); directAddressingMemoryIncreasePercent < MEMORY_INCREASE_LIMIT_PERCENT);
} }
private static void printStats(Builder<Object> builder, long ramBytesUsed, double directAddressingMemoryIncreasePercent) { private static void printStats(FSTCompiler<Object> fstCompiler, long ramBytesUsed, double directAddressingMemoryIncreasePercent) {
System.out.println("directAddressingMaxOversizingFactor = " + builder.getDirectAddressingMaxOversizingFactor()); System.out.println("directAddressingMaxOversizingFactor = " + fstCompiler.getDirectAddressingMaxOversizingFactor());
System.out.println("ramBytesUsed = " System.out.println("ramBytesUsed = "
+ String.format(Locale.ENGLISH, "%.2f MB", ramBytesUsed / 1024d / 1024d) + String.format(Locale.ENGLISH, "%.2f MB", ramBytesUsed / 1024d / 1024d)
+ String.format(Locale.ENGLISH, " (%.2f %% increase with direct addressing)", directAddressingMemoryIncreasePercent)); + String.format(Locale.ENGLISH, " (%.2f %% increase with direct addressing)", directAddressingMemoryIncreasePercent));
System.out.println("num nodes = " + builder.nodeCount); System.out.println("num nodes = " + fstCompiler.nodeCount);
long fixedLengthArcNodeCount = builder.directAddressingNodeCount + builder.binarySearchNodeCount; long fixedLengthArcNodeCount = fstCompiler.directAddressingNodeCount + fstCompiler.binarySearchNodeCount;
System.out.println("num fixed-length-arc nodes = " + fixedLengthArcNodeCount System.out.println("num fixed-length-arc nodes = " + fixedLengthArcNodeCount
+ String.format(Locale.ENGLISH, " (%.2f %% of all nodes)", + String.format(Locale.ENGLISH, " (%.2f %% of all nodes)",
((double) fixedLengthArcNodeCount / builder.nodeCount * 100))); ((double) fixedLengthArcNodeCount / fstCompiler.nodeCount * 100)));
System.out.println("num binary-search nodes = " + (builder.binarySearchNodeCount) System.out.println("num binary-search nodes = " + (fstCompiler.binarySearchNodeCount)
+ String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)", + String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)",
((double) (builder.binarySearchNodeCount) / fixedLengthArcNodeCount * 100))); ((double) (fstCompiler.binarySearchNodeCount) / fixedLengthArcNodeCount * 100)));
System.out.println("num direct-addressing nodes = " + (builder.directAddressingNodeCount) System.out.println("num direct-addressing nodes = " + (fstCompiler.directAddressingNodeCount)
+ String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)", + String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)",
((double) (builder.directAddressingNodeCount) / fixedLengthArcNodeCount * 100))); ((double) (fstCompiler.directAddressingNodeCount) / fixedLengthArcNodeCount * 100)));
} }
private static Builder<Object> createBuilder(float directAddressingMaxOversizingFactor) { private static FSTCompiler<Object> createFSTCompiler(float directAddressingMaxOversizingFactor) {
return new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, NoOutputs.getSingleton(), true, 15) return new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, NoOutputs.getSingleton())
.setDirectAddressingMaxOversizingFactor(directAddressingMaxOversizingFactor); .directAddressingMaxOversizingFactor(directAddressingMaxOversizingFactor)
.build();
} }
private FST<Object> buildFST(List<BytesRef> entries) throws Exception { private FST<Object> buildFST(List<BytesRef> entries) throws Exception {
return buildFST(entries, createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR)); return buildFST(entries, createFSTCompiler(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR));
} }
private static FST<Object> buildFST(List<BytesRef> entries, Builder<Object> builder) throws Exception { private static FST<Object> buildFST(List<BytesRef> entries, FSTCompiler<Object> fstCompiler) throws Exception {
BytesRef last = null; BytesRef last = null;
for (BytesRef entry : entries) { for (BytesRef entry : entries) {
if (entry.equals(last) == false) { if (entry.equals(last) == false) {
builder.add(Util.toIntsRef(entry, new IntsRefBuilder()), NoOutputs.getSingleton().getNoOutput()); fstCompiler.add(Util.toIntsRef(entry, new IntsRefBuilder()), NoOutputs.getSingleton().getNoOutput());
} }
last = entry; last = entry;
} }
return builder.finish(); return fstCompiler.compile();
} }
public static void main(String... args) throws Exception { public static void main(String... args) throws Exception {
@ -195,18 +196,18 @@ public class TestFstDirectAddressing extends LuceneTestCase {
Collections.sort(wordList); Collections.sort(wordList);
// Disable direct addressing and measure the FST size. // Disable direct addressing and measure the FST size.
Builder<Object> builder = createBuilder(-1f); FSTCompiler<Object> fstCompiler = createFSTCompiler(-1f);
FST<Object> fst = buildFST(wordList, builder); FST<Object> fst = buildFST(wordList, fstCompiler);
long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed(); long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed();
// Enable direct addressing and measure the FST size. // Enable direct addressing and measure the FST size.
builder = createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR); fstCompiler = createFSTCompiler(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
fst = buildFST(wordList, builder); fst = buildFST(wordList, fstCompiler);
long ramBytesUsed = fst.ramBytesUsed(); long ramBytesUsed = fst.ramBytesUsed();
// Compute the size increase in percents. // Compute the size increase in percents.
double directAddressingMemoryIncreasePercent = ((double) ramBytesUsed / ramBytesUsedNoDirectAddressing - 1) * 100; double directAddressingMemoryIncreasePercent = ((double) ramBytesUsed / ramBytesUsedNoDirectAddressing - 1) * 100;
printStats(builder, ramBytesUsed, directAddressingMemoryIncreasePercent); printStats(fstCompiler, ramBytesUsed, directAddressingMemoryIncreasePercent);
} }
} }

View File

@ -327,7 +327,7 @@ public class TestFSTs extends LuceneTestCase {
writer.close(); writer.close();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
boolean storeOrd = random().nextBoolean(); boolean storeOrd = random().nextBoolean();
if (VERBOSE) { if (VERBOSE) {
@ -373,15 +373,15 @@ public class TestFSTs extends LuceneTestCase {
} else { } else {
output = termsEnum.docFreq(); output = termsEnum.docFreq();
} }
builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output); fstCompiler.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
ord++; ord++;
if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) { if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
System.out.println(ord + " terms..."); System.out.println(ord + " terms...");
} }
} }
FST<Long> fst = builder.finish(); FST<Long> fst = fstCompiler.compile();
if (VERBOSE) { if (VERBOSE) {
System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes"); System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fstCompiler.getNodeCount() + " nodes; " + fstCompiler.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes");
} }
if (ord > 0) { if (ord > 0) {
@ -460,7 +460,7 @@ public class TestFSTs extends LuceneTestCase {
private final Path wordsFileIn; private final Path wordsFileIn;
private int inputMode; private int inputMode;
private final Outputs<T> outputs; private final Outputs<T> outputs;
private final Builder<T> builder; private final FSTCompiler<T> fstCompiler;
public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean noArcArrays) { public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean noArcArrays) {
this.dirOut = dirOut; this.dirOut = dirOut;
@ -468,7 +468,11 @@ public class TestFSTs extends LuceneTestCase {
this.inputMode = inputMode; this.inputMode = inputMode;
this.outputs = outputs; this.outputs = outputs;
builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, !noArcArrays, 15); fstCompiler = new FSTCompiler.Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs)
.minSuffixCount2(prune)
.shouldShareSuffix(prune == 0)
.allowFixedLengthArcs(!noArcArrays)
.build();
} }
protected abstract T getOutput(IntsRef input, int ord) throws IOException; protected abstract T getOutput(IntsRef input, int ord) throws IOException;
@ -486,7 +490,7 @@ public class TestFSTs extends LuceneTestCase {
break; break;
} }
toIntsRef(w, inputMode, intsRef); toIntsRef(w, inputMode, intsRef);
builder.add(intsRef.get(), fstCompiler.add(intsRef.get(),
getOutput(intsRef.get(), ord)); getOutput(intsRef.get(), ord));
ord++; ord++;
@ -503,8 +507,8 @@ public class TestFSTs extends LuceneTestCase {
long tMid = System.currentTimeMillis(); long tMid = System.currentTimeMillis();
System.out.println(((tMid-tStart) / 1000.0) + " sec to add all terms"); System.out.println(((tMid-tStart) / 1000.0) + " sec to add all terms");
assert builder.getTermCount() == ord; assert fstCompiler.getTermCount() == ord;
FST<T> fst = builder.finish(); FST<T> fst = fstCompiler.compile();
long tEnd = System.currentTimeMillis(); long tEnd = System.currentTimeMillis();
System.out.println(((tEnd-tMid) / 1000.0) + " sec to finish/pack"); System.out.println(((tEnd-tMid) / 1000.0) + " sec to finish/pack");
if (fst == null) { if (fst == null) {
@ -516,8 +520,8 @@ public class TestFSTs extends LuceneTestCase {
return; return;
} }
System.out.println(ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs; tot size " + fst.ramBytesUsed()); System.out.println(ord + " terms; " + fstCompiler.getNodeCount() + " nodes; " + fstCompiler.getArcCount() + " arcs; tot size " + fst.ramBytesUsed());
if (builder.getNodeCount() < 100) { if (fstCompiler.getNodeCount() < 100) {
Writer w = Files.newBufferedWriter(Paths.get("out.dot"), StandardCharsets.UTF_8); Writer w = Files.newBufferedWriter(Paths.get("out.dot"), StandardCharsets.UTF_8);
Util.toDot(fst, w, false, false); Util.toDot(fst, w, false, false);
w.close(); w.close();
@ -717,9 +721,9 @@ public class TestFSTs extends LuceneTestCase {
public void testSingleString() throws Exception { public void testSingleString() throws Exception {
final Outputs<Object> outputs = NoOutputs.getSingleton(); final Outputs<Object> outputs = NoOutputs.getSingleton();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
b.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRefBuilder()), outputs.getNoOutput()); fstCompiler.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRefBuilder()), outputs.getNoOutput());
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(b.finish()); final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(fstCompiler.compile());
assertNull(fstEnum.seekFloor(new BytesRef("foo"))); assertNull(fstEnum.seekFloor(new BytesRef("foo")));
assertNull(fstEnum.seekCeil(new BytesRef("foobaz"))); assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
} }
@ -728,12 +732,12 @@ public class TestFSTs extends LuceneTestCase {
public void testDuplicateFSAString() throws Exception { public void testDuplicateFSAString() throws Exception {
String str = "foobar"; String str = "foobar";
final Outputs<Object> outputs = NoOutputs.getSingleton(); final Outputs<Object> outputs = NoOutputs.getSingleton();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRefBuilder ints = new IntsRefBuilder(); IntsRefBuilder ints = new IntsRefBuilder();
for(int i=0; i<10; i++) { for(int i=0; i<10; i++) {
b.add(Util.toIntsRef(new BytesRef(str), ints), outputs.getNoOutput()); fstCompiler.add(Util.toIntsRef(new BytesRef(str), ints), outputs.getNoOutput());
} }
FST<Object> fst = b.finish(); FST<Object> fst = fstCompiler.compile();
// count the input paths // count the input paths
int count = 0; int count = 0;
@ -797,17 +801,17 @@ public class TestFSTs extends LuceneTestCase {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
// Build an FST mapping BytesRef -> Long // Build an FST mapping BytesRef -> Long
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final BytesRef a = new BytesRef("a"); final BytesRef a = new BytesRef("a");
final BytesRef b = new BytesRef("b"); final BytesRef b = new BytesRef("b");
final BytesRef c = new BytesRef("c"); final BytesRef c = new BytesRef("c");
builder.add(Util.toIntsRef(a, new IntsRefBuilder()), 17L); fstCompiler.add(Util.toIntsRef(a, new IntsRefBuilder()), 17L);
builder.add(Util.toIntsRef(b, new IntsRefBuilder()), 42L); fstCompiler.add(Util.toIntsRef(b, new IntsRefBuilder()), 42L);
builder.add(Util.toIntsRef(c, new IntsRefBuilder()), 13824324872317238L); fstCompiler.add(Util.toIntsRef(c, new IntsRefBuilder()), 13824324872317238L);
final FST<Long> fst = builder.finish(); final FST<Long> fst = fstCompiler.compile();
assertEquals(13824324872317238L, (long) Util.get(fst, c)); assertEquals(13824324872317238L, (long) Util.get(fst, c));
assertEquals(42, (long) Util.get(fst, b)); assertEquals(42, (long) Util.get(fst, b));
@ -1035,7 +1039,7 @@ public class TestFSTs extends LuceneTestCase {
FST<Object> compile(String[] lines) throws IOException { FST<Object> compile(String[] lines) throws IOException {
final NoOutputs outputs = NoOutputs.getSingleton(); final NoOutputs outputs = NoOutputs.getSingleton();
final Object nothing = outputs.getNoOutput(); final Object nothing = outputs.getNoOutput();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
int line = 0; int line = 0;
final BytesRefBuilder term = new BytesRefBuilder(); final BytesRefBuilder term = new BytesRefBuilder();
@ -1046,10 +1050,10 @@ public class TestFSTs extends LuceneTestCase {
break; break;
} }
term.copyChars(w); term.copyChars(w);
b.add(Util.toIntsRef(term.get(), scratchIntsRef), nothing); fstCompiler.add(Util.toIntsRef(term.get(), scratchIntsRef), nothing);
} }
return b.finish(); return fstCompiler.compile();
} }
void generate(ArrayList<String> out, StringBuilder b, char from, char to, void generate(ArrayList<String> out, StringBuilder b, char from, char to,
@ -1110,10 +1114,10 @@ public class TestFSTs extends LuceneTestCase {
public void testFinalOutputOnEndState() throws Exception { public void testFinalOutputOnEndState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); final FSTCompiler<Long> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).minSuffixCount1(2).build();
builder.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L); fstCompiler.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L);
builder.add(Util.toUTF32("station", new IntsRefBuilder()), 10L); fstCompiler.add(Util.toUTF32("station", new IntsRefBuilder()), 10L);
final FST<Long> fst = builder.finish(); final FST<Long> fst = fstCompiler.compile();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot")); //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
StringWriter w = new StringWriter(); StringWriter w = new StringWriter();
Util.toDot(fst, w, false, false); Util.toDot(fst, w, false, false);
@ -1124,10 +1128,10 @@ public class TestFSTs extends LuceneTestCase {
public void testInternalFinalState() throws Exception { public void testInternalFinalState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput()); fstCompiler.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput());
builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput()); fstCompiler.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput());
final FST<Long> fst = builder.finish(); final FST<Long> fst = fstCompiler.compile();
StringWriter w = new StringWriter(); StringWriter w = new StringWriter();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
Util.toDot(fst, w, false, false); Util.toDot(fst, w, false, false);
@ -1145,20 +1149,20 @@ public class TestFSTs extends LuceneTestCase {
public void testNonFinalStopNode() throws Exception { public void testNonFinalStopNode() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Long nothing = outputs.getNoOutput(); final Long nothing = outputs.getNoOutput();
final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
//final FST<Long> fst = new FST<>(FST.INPUT_TYPE.BYTE1, outputs, false, PackedInts.COMPACT, 15); //final FST<Long> fst = new FST<>(FST.INPUT_TYPE.BYTE1, outputs, false, PackedInts.COMPACT, 15);
final FST<Long> fst = b.fst; final FST<Long> fst = fstCompiler.fst;
final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<>(b, 0); final FSTCompiler.UnCompiledNode<Long> rootNode = new FSTCompiler.UnCompiledNode<>(fstCompiler, 0);
// Add final stop node // Add final stop node
{ {
final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<>(b, 0); final FSTCompiler.UnCompiledNode<Long> node = new FSTCompiler.UnCompiledNode<>(fstCompiler, 0);
node.isFinal = true; node.isFinal = true;
rootNode.addArc('a', node); rootNode.addArc('a', node);
final Builder.CompiledNode frozen = new Builder.CompiledNode(); final FSTCompiler.CompiledNode frozen = new FSTCompiler.CompiledNode();
frozen.node = fst.addNode(b, node); frozen.node = fst.addNode(fstCompiler, node);
rootNode.arcs[0].nextFinalOutput = 17L; rootNode.arcs[0].nextFinalOutput = 17L;
rootNode.arcs[0].isFinal = true; rootNode.arcs[0].isFinal = true;
rootNode.arcs[0].output = nothing; rootNode.arcs[0].output = nothing;
@ -1167,16 +1171,16 @@ public class TestFSTs extends LuceneTestCase {
// Add non-final stop node // Add non-final stop node
{ {
final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<>(b, 0); final FSTCompiler.UnCompiledNode<Long> node = new FSTCompiler.UnCompiledNode<>(fstCompiler, 0);
rootNode.addArc('b', node); rootNode.addArc('b', node);
final Builder.CompiledNode frozen = new Builder.CompiledNode(); final FSTCompiler.CompiledNode frozen = new FSTCompiler.CompiledNode();
frozen.node = fst.addNode(b, node); frozen.node = fst.addNode(fstCompiler, node);
rootNode.arcs[1].nextFinalOutput = nothing; rootNode.arcs[1].nextFinalOutput = nothing;
rootNode.arcs[1].output = 42L; rootNode.arcs[1].output = 42L;
rootNode.arcs[1].target = frozen; rootNode.arcs[1].target = frozen;
} }
fst.finish(fst.addNode(b, rootNode)); fst.finish(fst.addNode(fstCompiler, rootNode));
StringWriter w = new StringWriter(); StringWriter w = new StringWriter();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot")); //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
@ -1225,13 +1229,13 @@ public class TestFSTs extends LuceneTestCase {
public void testShortestPaths() throws Exception { public void testShortestPaths() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder(); final IntsRefBuilder scratch = new IntsRefBuilder();
builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L); fstCompiler.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L); fstCompiler.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L); fstCompiler.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
final FST<Long> fst = builder.finish(); final FST<Long> fst = fstCompiler.compile();
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false); //Util.toDot(fst, w, false, false);
//w.close(); //w.close();
@ -1256,16 +1260,16 @@ public class TestFSTs extends LuceneTestCase {
public void testRejectNoLimits() throws IOException { public void testRejectNoLimits() throws IOException {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs); final FSTCompiler<Long> fstCompiler = new FSTCompiler<Long>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder(); final IntsRefBuilder scratch = new IntsRefBuilder();
builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L); fstCompiler.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L); fstCompiler.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
builder.add(Util.toIntsRef(new BytesRef("adcd"), scratch), 17L); fstCompiler.add(Util.toIntsRef(new BytesRef("adcd"), scratch), 17L);
builder.add(Util.toIntsRef(new BytesRef("adcde"), scratch), 17L); fstCompiler.add(Util.toIntsRef(new BytesRef("adcde"), scratch), 17L);
builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L); fstCompiler.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
final FST<Long> fst = builder.finish(); final FST<Long> fst = fstCompiler.compile();
final AtomicInteger rejectCount = new AtomicInteger(); final AtomicInteger rejectCount = new AtomicInteger();
Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, 2, 6, minLongComparator) { Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, 2, 6, minLongComparator) {
@Override @Override
@ -1320,13 +1324,13 @@ public class TestFSTs extends LuceneTestCase {
PositiveIntOutputs.getSingleton() // output PositiveIntOutputs.getSingleton() // output
); );
final Builder<Pair<Long,Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); final FSTCompiler<Pair<Long,Long>> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder(); final IntsRefBuilder scratch = new IntsRefBuilder();
builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), outputs.newPair(22L, 57L)); fstCompiler.add(Util.toIntsRef(new BytesRef("aab"), scratch), outputs.newPair(22L, 57L));
builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), outputs.newPair(7L, 36L)); fstCompiler.add(Util.toIntsRef(new BytesRef("aac"), scratch), outputs.newPair(7L, 36L));
builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), outputs.newPair(17L, 85L)); fstCompiler.add(Util.toIntsRef(new BytesRef("ax"), scratch), outputs.newPair(17L, 85L));
final FST<Pair<Long,Long>> fst = builder.finish(); final FST<Pair<Long,Long>> fst = fstCompiler.compile();
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false); //Util.toDot(fst, w, false, false);
//w.close(); //w.close();
@ -1361,7 +1365,7 @@ public class TestFSTs extends LuceneTestCase {
final TreeSet<String> allPrefixes = new TreeSet<>(); final TreeSet<String> allPrefixes = new TreeSet<>();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder(); final IntsRefBuilder scratch = new IntsRefBuilder();
for (int i = 0; i < numWords; i++) { for (int i = 0; i < numWords; i++) {
@ -1382,10 +1386,10 @@ public class TestFSTs extends LuceneTestCase {
for (Map.Entry<String,Long> e : slowCompletor.entrySet()) { for (Map.Entry<String,Long> e : slowCompletor.entrySet()) {
//System.out.println("add: " + e); //System.out.println("add: " + e);
builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), e.getValue()); fstCompiler.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), e.getValue());
} }
final FST<Long> fst = builder.finish(); final FST<Long> fst = fstCompiler.compile();
//System.out.println("SAVE out.dot"); //System.out.println("SAVE out.dot");
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false); //Util.toDot(fst, w, false, false);
@ -1479,7 +1483,7 @@ public class TestFSTs extends LuceneTestCase {
PositiveIntOutputs.getSingleton(), // weight PositiveIntOutputs.getSingleton(), // weight
PositiveIntOutputs.getSingleton() // output PositiveIntOutputs.getSingleton() // output
); );
final Builder<Pair<Long,Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); final FSTCompiler<Pair<Long,Long>> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder(); final IntsRefBuilder scratch = new IntsRefBuilder();
Random random = random(); Random random = random();
@ -1504,10 +1508,10 @@ public class TestFSTs extends LuceneTestCase {
//System.out.println("add: " + e); //System.out.println("add: " + e);
long weight = e.getValue().a; long weight = e.getValue().a;
long output = e.getValue().b; long output = e.getValue().b;
builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), outputs.newPair(weight, output)); fstCompiler.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), outputs.newPair(weight, output));
} }
final FST<Pair<Long,Long>> fst = builder.finish(); final FST<Pair<Long,Long>> fst = fstCompiler.compile();
//System.out.println("SAVE out.dot"); //System.out.println("SAVE out.dot");
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false); //Util.toDot(fst, w, false, false);
@ -1563,7 +1567,7 @@ public class TestFSTs extends LuceneTestCase {
public void testLargeOutputsOnArrayArcs() throws Exception { public void testLargeOutputsOnArrayArcs() throws Exception {
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final byte[] bytes = new byte[300]; final byte[] bytes = new byte[300];
final IntsRefBuilder input = new IntsRefBuilder(); final IntsRefBuilder input = new IntsRefBuilder();
@ -1572,10 +1576,10 @@ public class TestFSTs extends LuceneTestCase {
for(int arc=0;arc<6;arc++) { for(int arc=0;arc<6;arc++) {
input.setIntAt(0, arc); input.setIntAt(0, arc);
output.bytes[0] = (byte) arc; output.bytes[0] = (byte) arc;
builder.add(input.get(), BytesRef.deepCopyOf(output)); fstCompiler.add(input.get(), BytesRef.deepCopyOf(output));
} }
final FST<BytesRef> fst = builder.finish(); final FST<BytesRef> fst = fstCompiler.compile();
for(int arc=0;arc<6;arc++) { for(int arc=0;arc<6;arc++) {
input.setIntAt(0, arc); input.setIntAt(0, arc);
final BytesRef result = Util.get(fst, input.get()); final BytesRef result = Util.get(fst, input.get());
@ -1608,15 +1612,15 @@ public class TestFSTs extends LuceneTestCase {
Collections.sort(termsList); Collections.sort(termsList);
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRefBuilder input = new IntsRefBuilder(); IntsRefBuilder input = new IntsRefBuilder();
for(BytesRef term : termsList) { for(BytesRef term : termsList) {
Util.toIntsRef(term, input); Util.toIntsRef(term, input);
builder.add(input.get(), term); fstCompiler.add(input.get(), term);
} }
FST<BytesRef> fst = builder.finish(); FST<BytesRef> fst = fstCompiler.compile();
Arc<BytesRef> arc = new FST.Arc<>(); Arc<BytesRef> arc = new FST.Arc<>();
fst.getFirstArc(arc); fst.getFirstArc(arc);
@ -1638,17 +1642,17 @@ public class TestFSTs extends LuceneTestCase {
public void testSimpleDepth() throws Exception { public void testSimpleDepth() throws Exception {
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
BytesRef ab = new BytesRef("ab"); BytesRef ab = new BytesRef("ab");
BytesRef ac = new BytesRef("ac"); BytesRef ac = new BytesRef("ac");
BytesRef bd = new BytesRef("bd"); BytesRef bd = new BytesRef("bd");
builder.add(Util.toIntsRef(ab, new IntsRefBuilder()), 3L); fstCompiler.add(Util.toIntsRef(ab, new IntsRefBuilder()), 3L);
builder.add(Util.toIntsRef(ac, new IntsRefBuilder()), 5L); fstCompiler.add(Util.toIntsRef(ac, new IntsRefBuilder()), 5L);
builder.add(Util.toIntsRef(bd, new IntsRefBuilder()), 7L); fstCompiler.add(Util.toIntsRef(bd, new IntsRefBuilder()), 7L);
FST<Long> fst = builder.finish(); FST<Long> fst = fstCompiler.compile();
assertEquals(3, (long) Util.get(fst, ab)); assertEquals(3, (long) Util.get(fst, ab));
assertEquals(5, (long) Util.get(fst, ac)); assertEquals(5, (long) Util.get(fst, ac));

View File

@ -83,15 +83,17 @@ public class TestUtil extends LuceneTestCase {
private FST<Object> buildFST(List<String> words, boolean allowArrayArcs, boolean allowDirectAddressing) throws Exception { private FST<Object> buildFST(List<String> words, boolean allowArrayArcs, boolean allowDirectAddressing) throws Exception {
final Outputs<Object> outputs = NoOutputs.getSingleton(); final Outputs<Object> outputs = NoOutputs.getSingleton();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, allowArrayArcs, 15); final FSTCompiler.Builder<Object> builder = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
.allowFixedLengthArcs(allowArrayArcs);
if (!allowDirectAddressing) { if (!allowDirectAddressing) {
b.setDirectAddressingMaxOversizingFactor(-1f); builder.directAddressingMaxOversizingFactor(-1f);
} }
final FSTCompiler<Object> fstCompiler = builder.build();
for (String word : words) { for (String word : words) {
b.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput()); fstCompiler.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput());
} }
return b.finish(); return fstCompiler.compile();
} }
private List<String> createRandomDictionary(int width, int depth) { private List<String> createRandomDictionary(int width, int depth) {

View File

@ -30,7 +30,7 @@ import org.apache.lucene.util.RamUsageEstimator;
* more of its output values. You can use this when a single * more of its output values. You can use this when a single
* input may need to map to more than one output, * input may need to map to more than one output,
* maintaining order: pass the same input with a different * maintaining order: pass the same input with a different
* output by calling {@link Builder#add(IntsRef,Object)} multiple * output by calling {@link FSTCompiler#add(IntsRef,Object)} multiple
* times. The builder will then combine the outputs using * times. The builder will then combine the outputs using
* the {@link Outputs#merge(Object,Object)} method. * the {@link Outputs#merge(Object,Object)} method.
* *
@ -41,7 +41,7 @@ import org.apache.lucene.util.RamUsageEstimator;
* <p>NOTE: the only way to create multiple outputs is to * <p>NOTE: the only way to create multiple outputs is to
* add the same input to the FST multiple times in a row. This is * add the same input to the FST multiple times in a row. This is
* how the FST maps a single input to multiple outputs (e.g. you * how the FST maps a single input to multiple outputs (e.g. you
* cannot pass a List&lt;Object&gt; to {@link Builder#add}). If * cannot pass a List&lt;Object&gt; to {@link FSTCompiler#add}). If
* your outputs are longs, and you need at most 2, then use * your outputs are longs, and you need at most 2, then use
* {@link UpToTwoPositiveIntOutputs} instead since it stores * {@link UpToTwoPositiveIntOutputs} instead since it stores
* the outputs more compactly (by stealing a bit from each * the outputs more compactly (by stealing a bit from each

View File

@ -35,14 +35,14 @@ import org.apache.lucene.util.SuppressForbidden;
* <p>NOTE: the only way to create a TwoLongs output is to * <p>NOTE: the only way to create a TwoLongs output is to
* add the same input to the FST twice in a row. This is * add the same input to the FST twice in a row. This is
* how the FST maps a single input to two outputs (e.g. you * how the FST maps a single input to two outputs (e.g. you
* cannot pass a TwoLongs to {@link Builder#add}. If you * cannot pass a TwoLongs to {@link FSTCompiler#add}. If you
* need more than two then use {@link ListOfOutputs}, but if * need more than two then use {@link ListOfOutputs}, but if
* you only have at most 2 then this implementation will * you only have at most 2 then this implementation will
* require fewer bytes as it steals one bit from each long * require fewer bytes as it steals one bit from each long
* value. * value.
* *
* <p>NOTE: the resulting FST is not guaranteed to be minimal! * <p>NOTE: the resulting FST is not guaranteed to be minimal!
* See {@link Builder}. * See {@link FSTCompiler}.
* *
* @lucene.experimental * @lucene.experimental
*/ */

View File

@ -164,16 +164,16 @@ public class TestFSTsMisc extends LuceneTestCase {
public void testListOfOutputs() throws Exception { public void testListOfOutputs() throws Exception {
PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton(); PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
ListOfOutputs<Long> outputs = new ListOfOutputs<>(_outputs); ListOfOutputs<Long> outputs = new ListOfOutputs<>(_outputs);
final Builder<Object> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder(); final IntsRefBuilder scratch = new IntsRefBuilder();
// Add the same input more than once and the outputs // Add the same input more than once and the outputs
// are merged: // are merged:
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L); fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L); fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L); fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
builder.add(Util.toIntsRef(new BytesRef("b"), scratch), 17L); fstCompiler.add(Util.toIntsRef(new BytesRef("b"), scratch), 17L);
final FST<Object> fst = builder.finish(); final FST<Object> fst = fstCompiler.compile();
Object output = Util.get(fst, new BytesRef("a")); Object output = Util.get(fst, new BytesRef("a"));
assertNotNull(output); assertNotNull(output);
@ -193,20 +193,20 @@ public class TestFSTsMisc extends LuceneTestCase {
public void testListOfOutputsEmptyString() throws Exception { public void testListOfOutputsEmptyString() throws Exception {
PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton(); PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
ListOfOutputs<Long> outputs = new ListOfOutputs<>(_outputs); ListOfOutputs<Long> outputs = new ListOfOutputs<>(_outputs);
final Builder<Object> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder(); final IntsRefBuilder scratch = new IntsRefBuilder();
builder.add(scratch.get(), 0L); fstCompiler.add(scratch.get(), 0L);
builder.add(scratch.get(), 1L); fstCompiler.add(scratch.get(), 1L);
builder.add(scratch.get(), 17L); fstCompiler.add(scratch.get(), 17L);
builder.add(scratch.get(), 1L); fstCompiler.add(scratch.get(), 1L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L); fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L); fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L); fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
builder.add(Util.toIntsRef(new BytesRef("b"), scratch), 0L); fstCompiler.add(Util.toIntsRef(new BytesRef("b"), scratch), 0L);
final FST<Object> fst = builder.finish(); final FST<Object> fst = fstCompiler.compile();
Object output = Util.get(fst, new BytesRef("")); Object output = Util.get(fst, new BytesRef(""));
assertNotNull(output); assertNotNull(output);

View File

@ -43,7 +43,7 @@ import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
@ -350,29 +350,28 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
} }
} }
final Builder<Pair<BytesRef,Long>> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, final FSTCompiler<Pair<BytesRef,Long>> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS)
0, 0, true, false, Integer.MAX_VALUE, .shouldShareNonSingletonNodes(false).build();
FST_OUTPUTS, true, 15);
//if (DEBUG) { //if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix); // System.out.println(" compile index for prefix=" + prefix);
//} //}
//indexBuilder.DEBUG = false; //indexBuilder.DEBUG = false;
final byte[] bytes = scratchBytes.toArrayCopy(); final byte[] bytes = scratchBytes.toArrayCopy();
assert bytes.length > 0; assert bytes.length > 0;
indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), FST_OUTPUTS.newPair(new BytesRef(bytes, 0, bytes.length), Long.MAX_VALUE - maxVersionIndex)); fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef), FST_OUTPUTS.newPair(new BytesRef(bytes, 0, bytes.length), Long.MAX_VALUE - maxVersionIndex));
scratchBytes.reset(); scratchBytes.reset();
// Copy over index for all sub-blocks // Copy over index for all sub-blocks
for(PendingBlock block : blocks) { for(PendingBlock block : blocks) {
if (block.subIndices != null) { if (block.subIndices != null) {
for(FST<Pair<BytesRef,Long>> subIndex : block.subIndices) { for(FST<Pair<BytesRef,Long>> subIndex : block.subIndices) {
append(indexBuilder, subIndex, scratchIntsRef); append(fstCompiler, subIndex, scratchIntsRef);
} }
block.subIndices = null; block.subIndices = null;
} }
} }
index = indexBuilder.finish(); index = fstCompiler.compile();
assert subIndices == null; assert subIndices == null;
@ -387,14 +386,14 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
// TODO: maybe we could add bulk-add method to // TODO: maybe we could add bulk-add method to
// Builder? Takes FST and unions it w/ current // Builder? Takes FST and unions it w/ current
// FST. // FST.
private void append(Builder<Pair<BytesRef,Long>> builder, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException { private void append(FSTCompiler<Pair<BytesRef,Long>> fstCompiler, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
final BytesRefFSTEnum<Pair<BytesRef,Long>> subIndexEnum = new BytesRefFSTEnum<>(subIndex); final BytesRefFSTEnum<Pair<BytesRef,Long>> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
BytesRefFSTEnum.InputOutput<Pair<BytesRef,Long>> indexEnt; BytesRefFSTEnum.InputOutput<Pair<BytesRef,Long>> indexEnt;
while((indexEnt = subIndexEnum.next()) != null) { while((indexEnt = subIndexEnum.next()) != null) {
//if (DEBUG) { //if (DEBUG) {
// System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output); // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
//} //}
builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
} }
} }
} }

View File

@ -52,7 +52,7 @@ import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator; import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST.BytesReader; import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
@ -496,7 +496,7 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName); reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
PairOutputs<Long,BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()); PairOutputs<Long,BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
Builder<Pair<Long,BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); FSTCompiler<Pair<Long,BytesRef>> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
// Build FST: // Build FST:
BytesRefBuilder previousAnalyzed = null; BytesRefBuilder previousAnalyzed = null;
@ -570,7 +570,7 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
Util.toIntsRef(analyzed.get(), scratchInts); Util.toIntsRef(analyzed.get(), scratchInts);
//System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString()); //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
if (!hasPayloads) { if (!hasPayloads) {
builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface))); fstCompiler.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
} else { } else {
int payloadOffset = input.getPosition() + surface.length; int payloadOffset = input.getPosition() + surface.length;
int payloadLength = bytes.length - payloadOffset; int payloadLength = bytes.length - payloadOffset;
@ -579,10 +579,10 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
br.bytes[surface.length] = PAYLOAD_SEP; br.bytes[surface.length] = PAYLOAD_SEP;
System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength); System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);
br.length = br.bytes.length; br.length = br.bytes.length;
builder.add(scratchInts.get(), outputs.newPair(cost, br)); fstCompiler.add(scratchInts.get(), outputs.newPair(cost, br));
} }
} }
fst = builder.finish(); fst = fstCompiler.compile();
//Util.dotToFile(fst, "/tmp/suggest.dot"); //Util.dotToFile(fst, "/tmp/suggest.dot");
} finally { } finally {

View File

@ -66,7 +66,7 @@ import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc; import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader; import org.apache.lucene.util.fst.FST.BytesReader;
@ -304,7 +304,7 @@ public class FreeTextSuggester extends Lookup implements Accountable {
TermsEnum termsEnum = terms.iterator(); TermsEnum termsEnum = terms.iterator();
Outputs<Long> outputs = PositiveIntOutputs.getSingleton(); Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRefBuilder scratchInts = new IntsRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder();
while (true) { while (true) {
@ -320,10 +320,10 @@ public class FreeTextSuggester extends Lookup implements Accountable {
totTokens += termsEnum.totalTermFreq(); totTokens += termsEnum.totalTermFreq();
} }
builder.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq())); fstCompiler.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
} }
fst = builder.finish(); fst = fstCompiler.compile();
if (fst == null) { if (fst == null) {
throw new IllegalArgumentException("need at least one suggestion"); throw new IllegalArgumentException("need at least one suggestion");
} }

View File

@ -25,7 +25,7 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs; import org.apache.lucene.util.fst.PairOutputs;
@ -53,7 +53,7 @@ final class NRTSuggesterBuilder {
public static final int END_BYTE = 0x0; public static final int END_BYTE = 0x0;
private final PairOutputs<Long, BytesRef> outputs; private final PairOutputs<Long, BytesRef> outputs;
private final Builder<PairOutputs.Pair<Long, BytesRef>> builder; private final FSTCompiler<PairOutputs.Pair<Long, BytesRef>> fstCompiler;
private final IntsRefBuilder scratchInts = new IntsRefBuilder(); private final IntsRefBuilder scratchInts = new IntsRefBuilder();
private final BytesRefBuilder analyzed = new BytesRefBuilder(); private final BytesRefBuilder analyzed = new BytesRefBuilder();
private final PriorityQueue<Entry> entries; private final PriorityQueue<Entry> entries;
@ -70,7 +70,7 @@ final class NRTSuggesterBuilder {
this.endByte = END_BYTE; this.endByte = END_BYTE;
this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()); this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
this.entries = new PriorityQueue<>(); this.entries = new PriorityQueue<>();
this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
} }
/** /**
@ -108,7 +108,7 @@ final class NRTSuggesterBuilder {
} }
analyzed.setByteAt(analyzed.length() - 1, (byte) numArcs++); analyzed.setByteAt(analyzed.length() - 1, (byte) numArcs++);
Util.toIntsRef(analyzed.get(), scratchInts); Util.toIntsRef(analyzed.get(), scratchInts);
builder.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload)); fstCompiler.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload));
} }
maxAnalyzedPathsPerOutput = Math.max(maxAnalyzedPathsPerOutput, entries.size()); maxAnalyzedPathsPerOutput = Math.max(maxAnalyzedPathsPerOutput, entries.size());
entries.clear(); entries.clear();
@ -119,11 +119,11 @@ final class NRTSuggesterBuilder {
* {@link NRTSuggester#load(IndexInput, CompletionPostingsFormat.FSTLoadMode)})} * {@link NRTSuggester#load(IndexInput, CompletionPostingsFormat.FSTLoadMode)})}
*/ */
public boolean store(DataOutput output) throws IOException { public boolean store(DataOutput output) throws IOException {
final FST<PairOutputs.Pair<Long, BytesRef>> build = builder.finish(); final FST<PairOutputs.Pair<Long, BytesRef>> fst = fstCompiler.compile();
if (build == null) { if (fst == null) {
return false; return false;
} }
build.save(output); fst.save(output);
/* write some more meta-info */ /* write some more meta-info */
assert maxAnalyzedPathsPerOutput > 0; assert maxAnalyzedPathsPerOutput > 0;

View File

@ -169,7 +169,7 @@ public class FSTCompletionBuilder {
* @param shareMaxTailLength * @param shareMaxTailLength
* Max shared suffix sharing length. * Max shared suffix sharing length.
* *
* See the description of this parameter in {@link Builder}'s constructor. * See the description of this parameter in {@link org.apache.lucene.util.fst.FSTCompiler.Builder}.
* In general, for very large inputs you'll want to construct a non-minimal * In general, for very large inputs you'll want to construct a non-minimal
* automaton which will be larger, but the construction will take far less ram. * automaton which will be larger, but the construction will take far less ram.
* For minimal automata, set it to {@link Integer#MAX_VALUE}. * For minimal automata, set it to {@link Integer#MAX_VALUE}.
@ -234,9 +234,8 @@ public class FSTCompletionBuilder {
// Build the automaton. // Build the automaton.
final Outputs<Object> outputs = NoOutputs.getSingleton(); final Outputs<Object> outputs = NoOutputs.getSingleton();
final Object empty = outputs.getNoOutput(); final Object empty = outputs.getNoOutput();
final Builder<Object> builder = new Builder<>( final FSTCompiler<Object> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
FST.INPUT_TYPE.BYTE1, 0, 0, true, true, .shareMaxTailLength(shareMaxTailLength).build();
shareMaxTailLength, outputs, true, 15);
BytesRefBuilder scratch = new BytesRefBuilder(); BytesRefBuilder scratch = new BytesRefBuilder();
BytesRef entry; BytesRef entry;
@ -246,11 +245,11 @@ public class FSTCompletionBuilder {
while((entry = iter.next()) != null) { while((entry = iter.next()) != null) {
count++; count++;
if (scratch.get().compareTo(entry) != 0) { if (scratch.get().compareTo(entry) != 0) {
builder.add(Util.toIntsRef(entry, scratchIntsRef), empty); fstCompiler.add(Util.toIntsRef(entry, scratchIntsRef), empty);
scratch.copyBytes(entry); scratch.copyBytes(entry);
} }
} }
return count == 0 ? null : builder.finish(); return count == 0 ? null : fstCompiler.compile();
} }
} }

View File

@ -40,7 +40,7 @@ import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder; import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter; import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc; import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader; import org.apache.lucene.util.fst.FST.BytesReader;
@ -116,7 +116,7 @@ public class WFSTCompletionLookup extends Lookup implements Accountable {
IntsRefBuilder scratchInts = new IntsRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder();
BytesRefBuilder previous = null; BytesRefBuilder previous = null;
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
while ((scratch = iter.next()) != null) { while ((scratch = iter.next()) != null) {
long cost = iter.weight(); long cost = iter.weight();
@ -127,11 +127,11 @@ public class WFSTCompletionLookup extends Lookup implements Accountable {
// added // added
} }
Util.toIntsRef(scratch, scratchInts); Util.toIntsRef(scratch, scratchInts);
builder.add(scratchInts.get(), cost); fstCompiler.add(scratchInts.get(), cost);
previous.copyBytes(scratch); previous.copyBytes(scratch);
count++; count++;
} }
fst = builder.finish(); fst = fstCompiler.compile();
} }

View File

@ -272,27 +272,26 @@ public class FSTTester<T> {
System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2); System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
} }
final Builder<T> builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, final FSTCompiler<T> fstCompiler = new FSTCompiler.Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs)
prune1, prune2, .minSuffixCount1(prune1)
prune1==0 && prune2==0, .minSuffixCount2(prune2)
allowRandomSuffixSharing ? random.nextBoolean() : true, .shouldShareSuffix(prune1==0 && prune2==0)
allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE, .shouldShareNonSingletonNodes(allowRandomSuffixSharing ? random.nextBoolean() : true)
outputs, .shareMaxTailLength(allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE)
true, .build();
15);
for(InputOutput<T> pair : pairs) { for(InputOutput<T> pair : pairs) {
if (pair.output instanceof List) { if (pair.output instanceof List) {
@SuppressWarnings("unchecked") List<Long> longValues = (List<Long>) pair.output; @SuppressWarnings("unchecked") List<Long> longValues = (List<Long>) pair.output;
@SuppressWarnings("unchecked") final Builder<Object> builderObject = (Builder<Object>) builder; @SuppressWarnings("unchecked") final FSTCompiler<Object> fstCompilerObject = (FSTCompiler<Object>) fstCompiler;
for(Long value : longValues) { for(Long value : longValues) {
builderObject.add(pair.input, value); fstCompilerObject.add(pair.input, value);
} }
} else { } else {
builder.add(pair.input, pair.output); fstCompiler.add(pair.input, pair.output);
} }
} }
FST<T> fst = builder.finish(); FST<T> fst = fstCompiler.compile();
if (random.nextBoolean() && fst != null) { if (random.nextBoolean() && fst != null) {
IOContext context = LuceneTestCase.newIOContext(random); IOContext context = LuceneTestCase.newIOContext(random);
@ -320,7 +319,7 @@ public class FSTTester<T> {
if (fst == null) { if (fst == null) {
System.out.println(" fst has 0 nodes (fully pruned)"); System.out.println(" fst has 0 nodes (fully pruned)");
} else { } else {
System.out.println(" fst has " + builder.getNodeCount() + " nodes and " + builder.getArcCount() + " arcs"); System.out.println(" fst has " + fstCompiler.getNodeCount() + " nodes and " + fstCompiler.getArcCount() + " arcs");
} }
} }
@ -330,8 +329,8 @@ public class FSTTester<T> {
verifyPruned(inputMode, fst, prune1, prune2); verifyPruned(inputMode, fst, prune1, prune2);
} }
nodeCount = builder.getNodeCount(); nodeCount = fstCompiler.getNodeCount();
arcCount = builder.getArcCount(); arcCount = fstCompiler.getArcCount();
return fst; return fst;
} }

View File

@ -89,6 +89,8 @@ Other Changes
* SOLR-13797: SolrResourceLoader no longer caches bad results when asked for wrong type (Mike Drob) * SOLR-13797: SolrResourceLoader no longer caches bad results when asked for wrong type (Mike Drob)
* LUCENE-9092: Upgrade Carrot2 to 3.16.2 (Dawid Weiss).
================== 8.5.0 ================== ================== 8.5.0 ==================
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.