mirror of https://github.com/apache/lucene.git

LUCENE-9089: FST Builder renamed FSTCompiler with fluent-style Builder.

Closes #1070

parent f083f40b28
commit 1812b367ab
@@ -38,6 +38,9 @@ API Changes
 * LUCENE-8905: Better defence against malformed arguments in TopDocsCollector
   (Atri Sharma)
 
+* LUCENE-9089: FST Builder renamed FSTCompiler with fluent-style Builder.
+  (Bruno Roustant)
+
 Improvements
 
 * LUCENE-8757: When provided with an ExecutorService to run queries across
@@ -1,5 +1,10 @@
 # Apache Lucene Migration Guide
 
+## o.a.l.util.fst.Builder is renamed FSTCompiler with fluent-style Builder (LUCENE-9089) ##
+
+Simply use FSTCompiler instead of the previous Builder. Use either the simple constructor with default settings, or
+the FSTCompiler.Builder to tune and tweak any parameter.
+
 ## Kuromoji user dictionary now forbids illegal segmentation (LUCENE-8933) ##
 
 User dictionary now strictly validates if the (concatenated) segment is the same as the surface form. This change avoids
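In practice the migration shown throughout this commit is mechanical: FSTCompiler is constructed exactly like the old Builder, add() is unchanged, and finish() becomes compile(). A minimal before/after sketch assembled from the calls visible in the hunks that follow (the input/output values are illustrative):

    import org.apache.lucene.util.CharsRef;
    import org.apache.lucene.util.IntsRefBuilder;
    import org.apache.lucene.util.fst.CharSequenceOutputs;
    import org.apache.lucene.util.fst.FST;
    import org.apache.lucene.util.fst.FSTCompiler;
    import org.apache.lucene.util.fst.Outputs;
    import org.apache.lucene.util.fst.Util;

    // Before: Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
    //         builder.add(...); FST<CharsRef> fst = builder.finish();
    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
    FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
    IntsRefBuilder scratch = new IntsRefBuilder();
    fstCompiler.add(Util.toUTF16("a", scratch), new CharsRef("b")); // add() is unchanged
    FST<CharsRef> fst = fstCompiler.compile();                      // was builder.finish()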
@@ -25,6 +25,7 @@ import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.fst.CharSequenceOutputs;
 import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.Outputs;
 import org.apache.lucene.util.fst.Util;
 
@@ -106,13 +107,13 @@ public class NormalizeCharMap {
     final FST<CharsRef> map;
     try {
       final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
-      final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
+      final FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
       final IntsRefBuilder scratch = new IntsRefBuilder();
       for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
-        builder.add(Util.toUTF16(ent.getKey(), scratch),
+        fstCompiler.add(Util.toUTF16(ent.getKey(), scratch),
                     new CharsRef(ent.getValue()));
       }
-      map = builder.finish();
+      map = fstCompiler.compile();
       pendingPairs.clear();
     } catch (IOException ioe) {
       // Bogus FST IOExceptions!! (will never happen)
@@ -64,7 +64,7 @@ import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
 import org.apache.lucene.util.OfflineSorter;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.automaton.RegExp;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.CharSequenceOutputs;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.IntSequenceOutputs;
@@ -231,9 +231,9 @@ public class Dictionary {
 
         // read dictionary entries
         IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
-        Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
-        readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, b);
-        words = b.finish();
+        FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, o);
+        readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, fstCompiler);
+        words = fstCompiler.compile();
         aliases = null; // no longer needed
         morphAliases = null; // no longer needed
         success = true;
@@ -414,7 +414,7 @@ public class Dictionary {
 
   private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException {
     IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
-    Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
+    FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
     IntsRefBuilder scratch = new IntsRefBuilder();
     for (Map.Entry<String,List<Integer>> entry : affixes.entrySet()) {
       Util.toUTF32(entry.getKey(), scratch);
@@ -423,9 +423,9 @@ public class Dictionary {
       for (Integer c : entries) {
        output.ints[output.length++] = c;
       }
-      builder.add(scratch.get(), output);
+      fstCompiler.add(scratch.get(), output);
     }
-    return builder.finish();
+    return fstCompiler.compile();
   }
 
   static String escapeDash(String re) {
@@ -608,14 +608,14 @@ public class Dictionary {
     }
 
     Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
-    Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
+    FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
     IntsRefBuilder scratchInts = new IntsRefBuilder();
     for (Map.Entry<String,String> entry : mappings.entrySet()) {
       Util.toUTF16(entry.getKey(), scratchInts);
-      builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
+      fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
     }
 
-    return builder.finish();
+    return fstCompiler.compile();
   }
 
   /** pattern accepts optional BOM + SET + any whitespace */
@@ -776,7 +776,7 @@ public class Dictionary {
    * @param decoder CharsetDecoder used to decode the contents of the file
    * @throws IOException Can be thrown while reading from the file
    */
-  private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException {
+  private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, FSTCompiler<IntsRef> words) throws IOException {
     BytesRefBuilder flagsScratch = new BytesRefBuilder();
     IntsRefBuilder scratchInts = new IntsRefBuilder();
 
@@ -35,6 +35,7 @@ import org.apache.lucene.util.fst.ByteSequenceOutputs;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.FST.Arc;
 import org.apache.lucene.util.fst.FST.BytesReader;
+import org.apache.lucene.util.fst.FSTCompiler;
 
 /**
  * Provides the ability to override any {@link KeywordAttribute} aware stemmer
@@ -203,7 +204,7 @@ public final class StemmerOverrideFilter extends TokenFilter {
      */
     public StemmerOverrideMap build() throws IOException {
       ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
-      org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>(
+      FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(
           FST.INPUT_TYPE.BYTE4, outputs);
       final int[] sort = hash.sort();
       IntsRefBuilder intsSpare = new IntsRefBuilder();
@@ -213,9 +214,9 @@ public final class StemmerOverrideFilter extends TokenFilter {
         int id = sort[i];
         BytesRef bytesRef = hash.get(id, spare);
         intsSpare.copyUTF8Bytes(bytesRef);
-        builder.add(intsSpare.get(), new BytesRef(outputValues.get(id)));
+        fstCompiler.add(intsSpare.get(), new BytesRef(outputValues.get(id)));
       }
-      return new StemmerOverrideMap(builder.finish(), ignoreCase);
+      return new StemmerOverrideMap(fstCompiler.compile(), ignoreCase);
     }
 
 }
@@ -39,6 +39,7 @@ import org.apache.lucene.util.CharsRefBuilder;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.fst.ByteSequenceOutputs;
 import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.Util;
 
 /**
@@ -213,8 +214,8 @@ public class SynonymMap {
     public SynonymMap build() throws IOException {
       ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
       // TODO: are we using the best sharing options?
-      org.apache.lucene.util.fst.Builder<BytesRef> builder =
-        new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
+      FSTCompiler<BytesRef> fstCompiler =
+        new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
 
       BytesRefBuilder scratch = new BytesRefBuilder();
       ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
@@ -278,10 +279,10 @@ public class SynonymMap {
 
         scratch.setLength(scratchOutput.getPosition());
         //System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
-        builder.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
+        fstCompiler.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
       }
 
-      FST<BytesRef> fst = builder.finish();
+      FST<BytesRef> fst = fstCompiler.compile();
       return new SynonymMap(fst, words, maxHorizontalContext);
     }
   }
@@ -30,7 +30,7 @@ import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.CharSequenceOutputs;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.Outputs;
@@ -196,26 +196,26 @@ public class TestDictionary extends LuceneTestCase {
 
   public void testReplacements() throws Exception {
     Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
-    Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
+    FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
     IntsRefBuilder scratchInts = new IntsRefBuilder();
 
     // a -> b
     Util.toUTF16("a", scratchInts);
-    builder.add(scratchInts.get(), new CharsRef("b"));
+    fstCompiler.add(scratchInts.get(), new CharsRef("b"));
 
     // ab -> c
     Util.toUTF16("ab", scratchInts);
-    builder.add(scratchInts.get(), new CharsRef("c"));
+    fstCompiler.add(scratchInts.get(), new CharsRef("c"));
 
     // c -> de
     Util.toUTF16("c", scratchInts);
-    builder.add(scratchInts.get(), new CharsRef("de"));
+    fstCompiler.add(scratchInts.get(), new CharsRef("de"));
 
     // def -> gh
     Util.toUTF16("def", scratchInts);
-    builder.add(scratchInts.get(), new CharsRef("gh"));
+    fstCompiler.add(scratchInts.get(), new CharsRef("gh"));
 
-    FST<CharsRef> fst = builder.finish();
+    FST<CharsRef> fst = fstCompiler.compile();
 
     StringBuilder sb = new StringBuilder("atestanother");
     Dictionary.applyMappings(fst, sb);
@@ -29,7 +29,7 @@ import java.util.TreeMap;
 
 import org.apache.lucene.analysis.ja.util.CSVUtil;
 import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
 
@@ -99,7 +99,7 @@ public final class UserDictionary implements Dictionary {
     List<int[]> segmentations = new ArrayList<>(featureEntries.size());
 
     PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
-    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
+    FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
     IntsRefBuilder scratch = new IntsRefBuilder();
     long ord = 0;
 
@@ -136,11 +136,11 @@ public final class UserDictionary implements Dictionary {
       for (int i = 0; i < token.length(); i++) {
         scratch.setIntAt(i, (int) token.charAt(i));
       }
-      fstBuilder.add(scratch.get(), ord);
+      fstCompiler.add(scratch.get(), ord);
       segmentations.add(wordIdAndLength);
       ord++;
     }
-    this.fst = new TokenInfoFST(fstBuilder.finish(), false);
+    this.fst = new TokenInfoFST(fstCompiler.compile(), false);
     this.data = data.toArray(new String[data.size()]);
     this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
   }
@@ -31,7 +31,7 @@ import java.util.stream.Stream;
 
 import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
 import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
 
@@ -97,7 +97,7 @@ class TokenInfoDictionaryBuilder {
     lines.sort(Comparator.comparing(entry -> entry[0]));
 
     PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
-    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
+    FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
     IntsRefBuilder scratch = new IntsRefBuilder();
     long ord = -1; // first ord will be 0
     String lastValue = null;
@@ -120,12 +120,12 @@ class TokenInfoDictionaryBuilder {
         for (int i = 0; i < token.length(); i++) {
           scratch.setIntAt(i, (int) token.charAt(i));
         }
-        fstBuilder.add(scratch.get(), ord);
+        fstCompiler.add(scratch.get(), ord);
       }
       dictionary.addMapping((int) ord, offset);
       offset = next;
     }
-    dictionary.setFST(fstBuilder.finish());
+    dictionary.setFST(fstCompiler.compile());
     return dictionary;
   }
 
@@ -25,7 +25,7 @@ import java.util.List;
 
 import org.apache.lucene.analysis.ko.POS;
 import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
 
@@ -83,7 +83,7 @@ public final class UserDictionary implements Dictionary {
     entries.sort(Comparator.comparing(e -> e.split("\\s+")[0]));
 
     PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
-    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
+    FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
     IntsRefBuilder scratch = new IntsRefBuilder();
 
     String lastToken = null;
@@ -129,11 +129,11 @@ public final class UserDictionary implements Dictionary {
       for (int i = 0; i < token.length(); i++) {
         scratch.setIntAt(i, token.charAt(i));
       }
-      fstBuilder.add(scratch.get(), ord);
+      fstCompiler.add(scratch.get(), ord);
       lastToken = token;
       ord ++;
     }
-    this.fst = new TokenInfoFST(fstBuilder.finish());
+    this.fst = new TokenInfoFST(fstCompiler.compile());
     this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
     this.rightIds = new short[rightIds.size()];
     for (int i = 0; i < rightIds.size(); i++) {
@@ -30,7 +30,7 @@ import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
 import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.FST;
 
 import org.apache.lucene.util.fst.PositiveIntOutputs;
@@ -90,7 +90,7 @@ class TokenInfoDictionaryBuilder {
     lines.sort(Comparator.comparing(left -> left[0]));
 
     PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
-    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
+    FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
     IntsRefBuilder scratch = new IntsRefBuilder();
     long ord = -1; // first ord will be 0
     String lastValue = null;
@@ -116,12 +116,12 @@ class TokenInfoDictionaryBuilder {
         for (int i = 0; i < surfaceForm.length(); i++) {
           scratch.setIntAt(i, surfaceForm.charAt(i));
         }
-        fstBuilder.add(scratch.get(), ord);
+        fstCompiler.add(scratch.get(), ord);
       }
       dictionary.addMapping((int) ord, offset);
       offset = next;
     }
-    dictionary.setFST(fstBuilder.finish());
+    dictionary.setFST(fstCompiler.compile());
     return dictionary;
   }
 }
@@ -41,7 +41,7 @@ import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
 import org.apache.lucene.util.fst.Util;
@@ -183,15 +183,15 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
 
   private void updateFST(SortedMap<String, Double> weights) throws IOException {
     PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
-    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+    FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
     BytesRefBuilder scratchBytes = new BytesRefBuilder();
     IntsRefBuilder scratchInts = new IntsRefBuilder();
     for (Map.Entry<String, Double> entry : weights.entrySet()) {
       scratchBytes.copyChars(entry.getKey());
-      fstBuilder.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry
+      fstCompiler.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry
           .getValue().longValue());
     }
-    fst = fstBuilder.finish();
+    fst = fstCompiler.compile();
   }
 
 
@@ -33,7 +33,7 @@ import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
 import org.apache.lucene.util.fst.Util;
@@ -219,7 +219,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
   }
 
   private class FSTFieldWriter extends FieldWriter {
-    private final Builder<Long> fstBuilder;
+    private final FSTCompiler<Long> fstCompiler;
     private final PositiveIntOutputs fstOutputs;
     private final long startTermsFilePointer;
 
@@ -233,12 +233,12 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
     public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
       this.fieldInfo = fieldInfo;
       fstOutputs = PositiveIntOutputs.getSingleton();
-      fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
+      fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
       indexStart = out.getFilePointer();
       ////System.out.println("VGW: field=" + fieldInfo.name);
 
       // Always put empty string in
-      fstBuilder.add(new IntsRef(), termsFilePointer);
+      fstCompiler.add(new IntsRef(), termsFilePointer);
       startTermsFilePointer = termsFilePointer;
     }
 
@@ -269,7 +269,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
       final int lengthSave = text.length;
       text.length = indexedTermPrefixLength(lastTerm.get(), text);
       try {
-        fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
+        fstCompiler.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
       } finally {
         text.length = lengthSave;
       }
@@ -278,7 +278,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
 
     @Override
     public void finish(long termsFilePointer) throws IOException {
-      fst = fstBuilder.finish();
+      fst = fstCompiler.compile();
       if (fst != null) {
         fst.save(out);
       }
@@ -45,7 +45,7 @@ import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.BytesRefFSTEnum;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.Util;
@@ -361,16 +361,14 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
         }
       }
 
-      final Builder<Output> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
-                                                         0, 0, true, false, Integer.MAX_VALUE,
-                                                         FST_OUTPUTS, true, 15);
+      final FSTCompiler<Output> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS).shouldShareNonSingletonNodes(false).build();
       //if (DEBUG) {
       //  System.out.println("  compile index for prefix=" + prefix);
       //}
       //indexBuilder.DEBUG = false;
       final byte[] bytes = scratchBytes.toArrayCopy();
       assert bytes.length > 0;
-      indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef),
+      fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef),
                       FST_OUTPUTS.newOutput(new BytesRef(bytes, 0, bytes.length),
                                             0, Long.MAX_VALUE-(sumTotalTermCount-1)));
       scratchBytes.reset();
@@ -381,7 +379,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
       for(PendingBlock block : blocks) {
         if (block.subIndices != null) {
           for(SubIndex subIndex : block.subIndices) {
-            append(indexBuilder, subIndex.index, termOrdOffset + subIndex.termOrdStart, scratchIntsRef);
+            append(fstCompiler, subIndex.index, termOrdOffset + subIndex.termOrdStart, scratchIntsRef);
           }
           block.subIndices = null;
         }
@@ -391,7 +389,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
 
       assert sumTotalTermCount == totFloorTermCount;
 
-      index = indexBuilder.finish();
+      index = fstCompiler.compile();
       assert subIndices == null;
 
       /*
@@ -405,7 +403,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
   // TODO: maybe we could add bulk-add method to
   // Builder?  Takes FST and unions it w/ current
   // FST.
-  private void append(Builder<Output> builder, FST<Output> subIndex, long termOrdOffset, IntsRefBuilder scratchIntsRef) throws IOException {
+  private void append(FSTCompiler<Output> fstCompiler, FST<Output> subIndex, long termOrdOffset, IntsRefBuilder scratchIntsRef) throws IOException {
     final BytesRefFSTEnum<Output> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
     BytesRefFSTEnum.InputOutput<Output> indexEnt;
     while ((indexEnt = subIndexEnum.next()) != null) {
@@ -416,7 +414,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
       //long blockTermCount = output.endOrd - output.startOrd + 1;
       Output newOutput = FST_OUTPUTS.newOutput(output.bytes, termOrdOffset+output.startOrd, output.endOrd-termOrdOffset);
       //System.out.println("  append sub=" + indexEnt.input + " output=" + indexEnt.output + " termOrdOffset=" + termOrdOffset + " blockTermCount=" + blockTermCount + " newOutput=" + newOutput + " endOrd=" + (termOrdOffset+Long.MAX_VALUE-output.endOrd));
-      builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), newOutput);
+      fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), newOutput);
     }
   }
 }
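The hunk above also shows the second half of the new API: call sites that used the old multi-argument Builder constructor now go through the fluent FSTCompiler.Builder, setting only the options that differ from the defaults. A sketch of the pattern, with FST_OUTPUTS standing in for the outputs instance used in the code above:

    // Before: positional arguments, hard to read at the call site.
    // new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, false, Integer.MAX_VALUE, FST_OUTPUTS, true, 15);

    // After: only the non-default option is named.
    FSTCompiler<Output> fstCompiler =
        new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS)
            .shouldShareNonSingletonNodes(false)
            .build();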
@@ -41,7 +41,7 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
 import org.apache.lucene.util.fst.Util;
@@ -287,7 +287,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
   }
 
   final class TermsWriter {
-    private final Builder<Long> builder;
+    private final FSTCompiler<Long> fstCompiler;
     private final PositiveIntOutputs outputs;
     private final FieldInfo fieldInfo;
     private final int longsSize;
@@ -311,7 +311,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
       this.fieldInfo = fieldInfo;
       this.longsSize = postingsWriter.setField(fieldInfo);
       this.outputs = PositiveIntOutputs.getSingleton();
-      this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+      this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
 
       this.lastBlockStatsFP = 0;
       this.lastBlockMetaLongsFP = 0;
@@ -346,7 +346,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
       }
       metaLongsOut.writeVLong(metaBytesOut.size() - lastMetaBytesFP);
 
-      builder.add(Util.toIntsRef(text, scratchTerm), numTerms);
+      fstCompiler.add(Util.toIntsRef(text, scratchTerm), numTerms);
       numTerms++;
 
       lastMetaBytesFP = metaBytesOut.size();
@@ -365,7 +365,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
         metadata.statsOut = statsOut;
         metadata.metaLongsOut = metaLongsOut;
         metadata.metaBytesOut = metaBytesOut;
-        metadata.dict = builder.finish();
+        metadata.dict = fstCompiler.compile();
         fields.add(metadata);
       }
     }
@@ -41,7 +41,7 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.Util;
 
@@ -247,7 +247,7 @@ public class FSTTermsWriter extends FieldsConsumer {
   }
 
   final class TermsWriter {
-    private final Builder<FSTTermOutputs.TermData> builder;
+    private final FSTCompiler<FSTTermOutputs.TermData> fstCompiler;
     private final FSTTermOutputs outputs;
     private final FieldInfo fieldInfo;
     private final int longsSize;
@@ -261,7 +261,7 @@ public class FSTTermsWriter extends FieldsConsumer {
       this.fieldInfo = fieldInfo;
       this.longsSize = postingsWriter.setField(fieldInfo);
       this.outputs = new FSTTermOutputs(fieldInfo, longsSize);
-      this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+      this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
     }
 
     public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
@@ -276,14 +276,14 @@ public class FSTTermsWriter extends FieldsConsumer {
         meta.bytes = metaWriter.toArrayCopy();
         metaWriter.reset();
       }
-      builder.add(Util.toIntsRef(text, scratchTerm), meta);
+      fstCompiler.add(Util.toIntsRef(text, scratchTerm), meta);
       numTerms++;
     }
 
     public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
       // save FST dict
       if (numTerms > 0) {
-        final FST<FSTTermOutputs.TermData> fst = builder.finish();
+        final FST<FSTTermOutputs.TermData> fst = fstCompiler.compile();
         fields.add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, fst));
       }
     }
@@ -52,7 +52,7 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.BytesRefFSTEnum;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PairOutputs;
@@ -539,11 +539,11 @@ class SimpleTextFieldsReader extends FieldsProducer {
 
   private void loadTerms() throws IOException {
     PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton();
-    final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
+    final FSTCompiler<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstCompiler;
     final PairOutputs<Long,Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
     final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<>(posIntOutputs,
                                                                                     outputsInner);
-    b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+    fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
     IndexInput in = SimpleTextFieldsReader.this.in.clone();
     in.seek(termsStart);
     final BytesRefBuilder lastTerm = new BytesRefBuilder();
@@ -556,7 +556,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
       SimpleTextUtil.readLine(in, scratch);
       if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) {
         if (lastDocsStart != -1) {
-          b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
+          fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
                 outputs.newPair(lastDocsStart,
                                 outputsInner.newPair((long) docFreq, totalTermFreq)));
           sumTotalTermFreq += totalTermFreq;
@@ -574,7 +574,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
           totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length()) - 1;
       } else if (StringHelper.startsWith(scratch.get(), TERM)) {
         if (lastDocsStart != -1) {
-          b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
+          fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
               outputsInner.newPair((long) docFreq, totalTermFreq)));
         }
         lastDocsStart = in.getFilePointer();
@@ -589,7 +589,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
       }
     }
     docCount = visitedDocs.cardinality();
-    fst = b.finish();
+    fst = fstCompiler.compile();
     /*
     PrintStream ps = new PrintStream("out.dot");
     fst.toDot(ps);
@@ -30,6 +30,7 @@ import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.fst.BytesRefFSTEnum;
 import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
 import org.apache.lucene.util.fst.Util;
 
@@ -202,19 +203,19 @@ public class FSTDictionary implements IndexDictionary {
    */
   public static class Builder implements IndexDictionary.Builder {
 
-    protected final org.apache.lucene.util.fst.Builder<Long> fstBuilder;
+    protected final FSTCompiler<Long> fstCompiler;
     protected final IntsRefBuilder scratchInts;
 
     public Builder() {
       PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
-      fstBuilder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+      fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
       scratchInts = new IntsRefBuilder();
     }
 
     @Override
     public void add(BytesRef blockKey, long blockFilePointer) {
       try {
-        fstBuilder.add(Util.toIntsRef(blockKey, scratchInts), blockFilePointer);
+        fstCompiler.add(Util.toIntsRef(blockKey, scratchInts), blockFilePointer);
       } catch (IOException e) {
         // Should never happen.
         throw new RuntimeException(e);
@@ -224,7 +225,7 @@ public class FSTDictionary implements IndexDictionary {
     @Override
     public FSTDictionary build() {
       try {
-        return new FSTDictionary(fstBuilder.finish());
+        return new FSTDictionary(fstCompiler.compile());
       } catch (IOException e) {
         // Should never happen.
         throw new RuntimeException(e);
@@ -44,7 +44,7 @@ import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.ByteSequenceOutputs;
 import org.apache.lucene.util.fst.BytesRefFSTEnum;
 import org.apache.lucene.util.fst.FST;
@@ -454,29 +454,27 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
       }
 
       final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
-      final Builder<BytesRef> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
-                                                           0, 0, true, false, Integer.MAX_VALUE,
-                                                           outputs, true, 15);
+      final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).shouldShareNonSingletonNodes(false).build();
       //if (DEBUG) {
       //  System.out.println("  compile index for prefix=" + prefix);
       //}
       //indexBuilder.DEBUG = false;
       final byte[] bytes = scratchBytes.toArrayCopy();
       assert bytes.length > 0;
-      indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
+      fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
       scratchBytes.reset();
 
       // Copy over index for all sub-blocks
       for(PendingBlock block : blocks) {
         if (block.subIndices != null) {
           for(FST<BytesRef> subIndex : block.subIndices) {
-            append(indexBuilder, subIndex, scratchIntsRef);
+            append(fstCompiler, subIndex, scratchIntsRef);
           }
           block.subIndices = null;
         }
       }
 
-      index = indexBuilder.finish();
+      index = fstCompiler.compile();
 
       assert subIndices == null;
 
@@ -491,14 +489,14 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
   // TODO: maybe we could add bulk-add method to
   // Builder?  Takes FST and unions it w/ current
   // FST.
-  private void append(Builder<BytesRef> builder, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
+  private void append(FSTCompiler<BytesRef> fstCompiler, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
    final BytesRefFSTEnum<BytesRef> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
    BytesRefFSTEnum.InputOutput<BytesRef> indexEnt;
    while((indexEnt = subIndexEnum.next()) != null) {
      //if (DEBUG) {
      //  System.out.println("      add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
      //}
-     builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
+     fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
    }
  }
 }
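Why a single setter suffices here: comparing the removed positional call against the defaults used by the simple constructor (which delegates to (inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, 1f), per the FSTCompiler hunk at the end of this commit) shows that only doShareNonSingletonNodes=false differs:

    // Removed: new Builder<>(BYTE1, 0, 0, true, false, Integer.MAX_VALUE, outputs, true, 15)
    // Defaults: minSuffixCount1=0, minSuffixCount2=0, doShareSuffix=true,
    //           doShareNonSingletonNodes=true, shareMaxTailLength=Integer.MAX_VALUE,
    //           allowFixedLengthArcs=true, bytesPageBits=15
    final FSTCompiler<BytesRef> fstCompiler =
        new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
            .shouldShareNonSingletonNodes(false)
            .build();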
@@ -605,7 +605,7 @@ public final class FST<T> implements Accountable {
 
   // serializes new node by appending its bytes to the end
   // of the current byte[]
-  long addNode(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn) throws IOException {
+  long addNode(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
     T NO_OUTPUT = outputs.getNoOutput();
 
     //System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs);
@@ -616,28 +616,28 @@ public final class FST<T> implements Accountable {
         return NON_FINAL_END_NODE;
       }
     }
-    final long startAddress = builder.bytes.getPosition();
+    final long startAddress = fstCompiler.bytes.getPosition();
     //System.out.println("  startAddr=" + startAddress);
 
-    final boolean doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(builder, nodeIn);
+    final boolean doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(fstCompiler, nodeIn);
     if (doFixedLengthArcs) {
       //System.out.println("  fixed length arcs");
-      if (builder.numBytesPerArc.length < nodeIn.numArcs) {
-        builder.numBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, Integer.BYTES)];
-        builder.numLabelBytesPerArc = new int[builder.numBytesPerArc.length];
+      if (fstCompiler.numBytesPerArc.length < nodeIn.numArcs) {
+        fstCompiler.numBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, Integer.BYTES)];
+        fstCompiler.numLabelBytesPerArc = new int[fstCompiler.numBytesPerArc.length];
       }
     }
 
-    builder.arcCount += nodeIn.numArcs;
+    fstCompiler.arcCount += nodeIn.numArcs;
 
     final int lastArc = nodeIn.numArcs-1;
 
-    long lastArcStart = builder.bytes.getPosition();
+    long lastArcStart = fstCompiler.bytes.getPosition();
     int maxBytesPerArc = 0;
     int maxBytesPerArcWithoutLabel = 0;
     for(int arcIdx=0; arcIdx < nodeIn.numArcs; arcIdx++) {
-      final Builder.Arc<T> arc = nodeIn.arcs[arcIdx];
-      final Builder.CompiledNode target = (Builder.CompiledNode) arc.target;
+      final FSTCompiler.Arc<T> arc = nodeIn.arcs[arcIdx];
+      final FSTCompiler.CompiledNode target = (FSTCompiler.CompiledNode) arc.target;
       int flags = 0;
       //System.out.println("  arc " + arcIdx + " label=" + arc.label + " -> target=" + target.node);
 
@@ -645,7 +645,7 @@ public final class FST<T> implements Accountable {
         flags += BIT_LAST_ARC;
       }
 
-      if (builder.lastFrozenNode == target.node && !doFixedLengthArcs) {
+      if (fstCompiler.lastFrozenNode == target.node && !doFixedLengthArcs) {
        // TODO: for better perf (but more RAM used) we
        // could avoid this except when arc is "near" the
        // last arc:
@@ -671,36 +671,36 @@ public final class FST<T> implements Accountable {
         flags += BIT_ARC_HAS_OUTPUT;
       }
 
-      builder.bytes.writeByte((byte) flags);
-      long labelStart = builder.bytes.getPosition();
-      writeLabel(builder.bytes, arc.label);
-      int numLabelBytes = (int) (builder.bytes.getPosition() - labelStart);
+      fstCompiler.bytes.writeByte((byte) flags);
+      long labelStart = fstCompiler.bytes.getPosition();
+      writeLabel(fstCompiler.bytes, arc.label);
+      int numLabelBytes = (int) (fstCompiler.bytes.getPosition() - labelStart);
 
       // System.out.println("  write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + bytes.getPosition() + " output=" + outputs.outputToString(arc.output));
 
       if (arc.output != NO_OUTPUT) {
-        outputs.write(arc.output, builder.bytes);
+        outputs.write(arc.output, fstCompiler.bytes);
         //System.out.println("    write output");
       }
 
       if (arc.nextFinalOutput != NO_OUTPUT) {
         //System.out.println("    write final output");
-        outputs.writeFinalOutput(arc.nextFinalOutput, builder.bytes);
+        outputs.writeFinalOutput(arc.nextFinalOutput, fstCompiler.bytes);
       }
 
       if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) {
         assert target.node > 0;
         //System.out.println("    write target");
-        builder.bytes.writeVLong(target.node);
+        fstCompiler.bytes.writeVLong(target.node);
       }
 
       // just write the arcs "like normal" on first pass, but record how many bytes each one took
       // and max byte size:
       if (doFixedLengthArcs) {
-        int numArcBytes = (int) (builder.bytes.getPosition() - lastArcStart);
-        builder.numBytesPerArc[arcIdx] = numArcBytes;
-        builder.numLabelBytesPerArc[arcIdx] = numLabelBytes;
-        lastArcStart = builder.bytes.getPosition();
+        int numArcBytes = (int) (fstCompiler.bytes.getPosition() - lastArcStart);
+        fstCompiler.numBytesPerArc[arcIdx] = numArcBytes;
+        fstCompiler.numLabelBytesPerArc[arcIdx] = numLabelBytes;
+        lastArcStart = fstCompiler.bytes.getPosition();
         maxBytesPerArc = Math.max(maxBytesPerArc, numArcBytes);
         maxBytesPerArcWithoutLabel = Math.max(maxBytesPerArcWithoutLabel, numArcBytes - numLabelBytes);
         //System.out.println("    arcBytes=" + numArcBytes + " labelBytes=" + numLabelBytes);
@@ -733,18 +733,18 @@ public final class FST<T> implements Accountable {
 
       int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
       assert labelRange > 0;
-      if (shouldExpandNodeWithDirectAddressing(builder, nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) {
-        writeNodeForDirectAddressing(builder, nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange);
-        builder.directAddressingNodeCount++;
+      if (shouldExpandNodeWithDirectAddressing(fstCompiler, nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) {
+        writeNodeForDirectAddressing(fstCompiler, nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange);
+        fstCompiler.directAddressingNodeCount++;
       } else {
-        writeNodeForBinarySearch(builder, nodeIn, startAddress, maxBytesPerArc);
-        builder.binarySearchNodeCount++;
+        writeNodeForBinarySearch(fstCompiler, nodeIn, startAddress, maxBytesPerArc);
+        fstCompiler.binarySearchNodeCount++;
       }
     }
 
-    final long thisNodeAddress = builder.bytes.getPosition()-1;
-    builder.bytes.reverse(startAddress, thisNodeAddress);
-    builder.nodeCount++;
+    final long thisNodeAddress = fstCompiler.bytes.getPosition()-1;
+    fstCompiler.bytes.reverse(startAddress, thisNodeAddress);
+    fstCompiler.nodeCount++;
     return thisNodeAddress;
   }
 
@@ -757,8 +757,8 @@ public final class FST<T> implements Accountable {
    * of bytes, but they allow either binary search or direct addressing on the arcs (instead of linear
    * scan) on lookup by arc label.
    */
-  private boolean shouldExpandNodeWithFixedLengthArcs(Builder<T> builder, Builder.UnCompiledNode<T> node) {
-    return builder.allowFixedLengthArcs &&
+  private boolean shouldExpandNodeWithFixedLengthArcs(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> node) {
+    return fstCompiler.allowFixedLengthArcs &&
         ((node.depth <= FIXED_LENGTH_ARC_SHALLOW_DEPTH && node.numArcs >= FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS) ||
             node.numArcs >= FIXED_LENGTH_ARC_DEEP_NUM_ARCS);
   }
@@ -769,18 +769,18 @@ public final class FST<T> implements Accountable {
    * Prefer direct addressing for performance if it does not oversize binary search byte size too much,
    * so that the arcs can be directly addressed by label.
    *
-   * @see Builder#getDirectAddressingMaxOversizingFactor()
+   * @see FSTCompiler#getDirectAddressingMaxOversizingFactor()
    */
-  private boolean shouldExpandNodeWithDirectAddressing(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn,
+  private boolean shouldExpandNodeWithDirectAddressing(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn,
                                                        int numBytesPerArc, int maxBytesPerArcWithoutLabel, int labelRange) {
     // Anticipate precisely the size of the encodings.
     int sizeForBinarySearch = numBytesPerArc * nodeIn.numArcs;
-    int sizeForDirectAddressing = getNumPresenceBytes(labelRange) + builder.numLabelBytesPerArc[0]
+    int sizeForDirectAddressing = getNumPresenceBytes(labelRange) + fstCompiler.numLabelBytesPerArc[0]
         + maxBytesPerArcWithoutLabel * nodeIn.numArcs;
 
     // Determine the allowed oversize compared to binary search.
     // This is defined by a parameter of FST Builder (default 1: no oversize).
-    int allowedOversize = (int) (sizeForBinarySearch * builder.getDirectAddressingMaxOversizingFactor());
+    int allowedOversize = (int) (sizeForBinarySearch * fstCompiler.getDirectAddressingMaxOversizingFactor());
     int expansionCost = sizeForDirectAddressing - allowedOversize;
 
     // Select direct addressing if either:
@@ -790,46 +790,46 @@ public final class FST<T> implements Accountable {
     // In this case, decrement the credit by the oversize.
     // In addition, do not try to oversize to a clearly too large node size
     // (this is the DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR parameter).
-    if (expansionCost <= 0 || (builder.directAddressingExpansionCredit >= expansionCost
+    if (expansionCost <= 0 || (fstCompiler.directAddressingExpansionCredit >= expansionCost
         && sizeForDirectAddressing <= allowedOversize * DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR)) {
-      builder.directAddressingExpansionCredit -= expansionCost;
+      fstCompiler.directAddressingExpansionCredit -= expansionCost;
       return true;
     }
     return false;
   }
 
-  private void writeNodeForBinarySearch(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArc) {
+  private void writeNodeForBinarySearch(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArc) {
     // Build the header in a buffer.
     // It is a false/special arc which is in fact a node header with node flags followed by node metadata.
-    builder.fixedLengthArcsBuffer
+    fstCompiler.fixedLengthArcsBuffer
         .resetPosition()
         .writeByte(ARCS_FOR_BINARY_SEARCH)
         .writeVInt(nodeIn.numArcs)
         .writeVInt(maxBytesPerArc);
-    int headerLen = builder.fixedLengthArcsBuffer.getPosition();
+    int headerLen = fstCompiler.fixedLengthArcsBuffer.getPosition();
 
     // Expand the arcs in place, backwards.
-    long srcPos = builder.bytes.getPosition();
+    long srcPos = fstCompiler.bytes.getPosition();
     long destPos = startAddress + headerLen + nodeIn.numArcs * maxBytesPerArc;
     assert destPos >= srcPos;
     if (destPos > srcPos) {
-      builder.bytes.skipBytes((int) (destPos - srcPos));
+      fstCompiler.bytes.skipBytes((int) (destPos - srcPos));
       for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) {
         destPos -= maxBytesPerArc;
-        int arcLen = builder.numBytesPerArc[arcIdx];
+        int arcLen = fstCompiler.numBytesPerArc[arcIdx];
         srcPos -= arcLen;
         if (srcPos != destPos) {
           assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " arcLen=" + arcLen + " nodeIn.numArcs=" + nodeIn.numArcs;
-          builder.bytes.copyBytes(srcPos, destPos, arcLen);
+          fstCompiler.bytes.copyBytes(srcPos, destPos, arcLen);
         }
       }
     }
 
     // Write the header.
-    builder.bytes.writeBytes(startAddress, builder.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
+    fstCompiler.bytes.writeBytes(startAddress, fstCompiler.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
   }
 
-  private void writeNodeForDirectAddressing(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArcWithoutLabel, int labelRange) {
+  private void writeNodeForDirectAddressing(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArcWithoutLabel, int labelRange) {
     // Expand the arcs backwards in a buffer because we remove the labels.
     // So the obtained arcs might occupy less space. This is the reason why this
     // whole method is more complex.
@@ -837,64 +837,64 @@ public final class FST<T> implements Accountable {
     // the presence bits, and the first label. Keep the first label.
     int headerMaxLen = 11;
     int numPresenceBytes = getNumPresenceBytes(labelRange);
-    long srcPos = builder.bytes.getPosition();
-    int totalArcBytes = builder.numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel;
+    long srcPos = fstCompiler.bytes.getPosition();
+    int totalArcBytes = fstCompiler.numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel;
     int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes;
-    byte[] buffer = builder.fixedLengthArcsBuffer.ensureCapacity(bufferOffset).getBytes();
+    byte[] buffer = fstCompiler.fixedLengthArcsBuffer.ensureCapacity(bufferOffset).getBytes();
     // Copy the arcs to the buffer, dropping all labels except first one.
     for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) {
       bufferOffset -= maxBytesPerArcWithoutLabel;
-      int srcArcLen = builder.numBytesPerArc[arcIdx];
+      int srcArcLen = fstCompiler.numBytesPerArc[arcIdx];
       srcPos -= srcArcLen;
-      int labelLen = builder.numLabelBytesPerArc[arcIdx];
+      int labelLen = fstCompiler.numLabelBytesPerArc[arcIdx];
       // Copy the flags.
-      builder.bytes.copyBytes(srcPos, buffer, bufferOffset, 1);
+      fstCompiler.bytes.copyBytes(srcPos, buffer, bufferOffset, 1);
       // Skip the label, copy the remaining.
       int remainingArcLen = srcArcLen - 1 - labelLen;
       if (remainingArcLen != 0) {
-        builder.bytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen);
+        fstCompiler.bytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen);
       }
       if (arcIdx == 0) {
         // Copy the label of the first arc only.
         bufferOffset -= labelLen;
-        builder.bytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen);
+        fstCompiler.bytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen);
       }
     }
     assert bufferOffset == headerMaxLen + numPresenceBytes;
 
     // Build the header in the buffer.
     // It is a false/special arc which is in fact a node header with node flags followed by node metadata.
-    builder.fixedLengthArcsBuffer
+    fstCompiler.fixedLengthArcsBuffer
        .resetPosition()
        .writeByte(ARCS_FOR_DIRECT_ADDRESSING)
        .writeVInt(labelRange) // labelRange instead of numArcs.
        .writeVInt(maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc.
-    int headerLen = builder.fixedLengthArcsBuffer.getPosition();
+    int headerLen = fstCompiler.fixedLengthArcsBuffer.getPosition();
 
     // Prepare the builder byte store. Enlarge or truncate if needed.
     long nodeEnd = startAddress + headerLen + numPresenceBytes + totalArcBytes;
-    long currentPosition = builder.bytes.getPosition();
+    long currentPosition = fstCompiler.bytes.getPosition();
     if (nodeEnd >= currentPosition) {
-      builder.bytes.skipBytes((int) (nodeEnd - currentPosition));
+      fstCompiler.bytes.skipBytes((int) (nodeEnd - currentPosition));
     } else {
-      builder.bytes.truncate(nodeEnd);
+      fstCompiler.bytes.truncate(nodeEnd);
    }
-    assert builder.bytes.getPosition() == nodeEnd;
+    assert fstCompiler.bytes.getPosition() == nodeEnd;
 
     // Write the header.
     long writeOffset = startAddress;
-    builder.bytes.writeBytes(writeOffset, builder.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
+    fstCompiler.bytes.writeBytes(writeOffset, fstCompiler.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
     writeOffset += headerLen;
 
     // Write the presence bits
-    writePresenceBits(builder, nodeIn, writeOffset, numPresenceBytes);
+    writePresenceBits(fstCompiler, nodeIn, writeOffset, numPresenceBytes);
     writeOffset += numPresenceBytes;
 
     // Write the first label and the arcs.
-    builder.bytes.writeBytes(writeOffset, builder.fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes);
+    fstCompiler.bytes.writeBytes(writeOffset, fstCompiler.fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes);
   }
 
-  private void writePresenceBits(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long dest, int numPresenceBytes) {
+  private void writePresenceBits(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn, long dest, int numPresenceBytes) {
     long bytePos = dest;
     byte presenceBits = 1; // The first arc is always present.
     int presenceIndex = 0;
@@ -904,7 +904,7 @@ public final class FST<T> implements Accountable {
       assert label > previousLabel;
       presenceIndex += label - previousLabel;
       while (presenceIndex >= Byte.SIZE) {
-        builder.bytes.writeByte(bytePos++, presenceBits);
+        fstCompiler.bytes.writeByte(bytePos++, presenceBits);
        presenceBits = 0;
        presenceIndex -= Byte.SIZE;
      }
@@ -915,7 +915,7 @@ public final class FST<T> implements Accountable {
     assert presenceIndex == (nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label) % 8;
     assert presenceBits != 0; // The last byte is not 0.
     assert (presenceBits & (1 << presenceIndex)) != 0; // The last arc is always present.
-    builder.bytes.writeByte(bytePos++, presenceBits);
+    fstCompiler.bytes.writeByte(bytePos++, presenceBits);
     assert bytePos - dest == numPresenceBytes;
   }
 
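Threading fstCompiler through shouldExpandNodeWithDirectAddressing makes the size trade-off explicit. A worked example with assumed numbers (not taken from the diff), where getNumPresenceBytes is assumed to round labelRange up to whole bytes:

    // Node: numArcs = 10, labelRange = 32, numBytesPerArc = 4, of which 1 byte is the label.
    // sizeForBinarySearch     = numBytesPerArc * numArcs = 4 * 10 = 40
    // sizeForDirectAddressing = getNumPresenceBytes(32)               // assumed ceil(32/8) = 4
    //                         + numLabelBytesPerArc[0]                // 1
    //                         + maxBytesPerArcWithoutLabel * numArcs  // 3 * 10 = 30
    //                         = 35
    // allowedOversize = (int) (40 * directAddressingMaxOversizingFactor) = 40 with the 1f default
    // expansionCost   = 35 - 40 = -5 <= 0, so direct addressing is chosen outright.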
@ -49,31 +49,9 @@ import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
|
|||
* @lucene.experimental
|
||||
*/
|
||||
|
||||
public class Builder<T> {
|
||||
public class FSTCompiler<T> {
|
||||
|
||||
/**
|
||||
* Default oversizing factor used to decide whether to encode a node with direct addressing or binary search.
|
||||
* Default is 1: ensure no oversizing on average.
|
||||
* <p>
|
||||
* This factor does not determine whether to encode a node with a list of variable length arcs or with
|
||||
* fixed length arcs. It only determines the effective encoding of a node that is already known to be
|
||||
* encoded with fixed length arcs.
|
||||
* See {@code FST.shouldExpandNodeWithFixedLengthArcs()}
|
||||
* and {@code FST.shouldExpandNodeWithDirectAddressing()}.
|
||||
* <p>
|
||||
* For English words we measured 217K nodes, only 3.27% nodes are encoded with fixed length arcs,
|
||||
* and 99.99% of them with direct addressing. Overall FST memory reduced by 1.67%.
|
||||
* <p>
|
||||
* For worst case we measured 168K nodes, 50% of them are encoded with fixed length arcs,
|
||||
* and 14% of them with direct encoding. Overall FST memory reduced by 0.8%.
|
||||
* <p>
|
||||
* Use {@code TestFstDirectAddressing.main()}
|
||||
* and {@code TestFstDirectAddressing.testWorstCaseForDirectAddressing()}
|
||||
* to evaluate a change.
|
||||
*
|
||||
* @see #setDirectAddressingMaxOversizingFactor
|
||||
*/
|
||||
static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1.0f;
|
||||
static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1f;
|
||||
|
||||
private final NodeHash<T> dedupHash;
|
||||
final FST<T> fst;
|
||||
|
@@ -117,75 +95,29 @@ public class Builder<T> {
long binarySearchNodeCount;
long directAddressingNodeCount;

boolean allowFixedLengthArcs;
float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR;
final boolean allowFixedLengthArcs;
final float directAddressingMaxOversizingFactor;
long directAddressingExpansionCredit;

BytesStore bytes;
final BytesStore bytes;

/**
* Instantiates an FST/FSA builder without any pruning. A shortcut to {@link
* #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs, boolean, int)} with
* pruning options turned off.
* Instantiates an FST/FSA builder with default settings and pruning options turned off.
* For more tuning and tweaking, see {@link Builder}.
*/
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
public FSTCompiler(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, 1f);
}

/**
* Instantiates an FST/FSA builder with all the possible tuning and construction
* tweaks. Read parameter documentation carefully.
*
* @param inputType
*    The input type (transition labels). Can be anything from {@link INPUT_TYPE}
*    enumeration. Shorter types will consume less memory. Strings (character sequences) are
*    represented as {@link INPUT_TYPE#BYTE4} (full unicode codepoints).
*
* @param minSuffixCount1
*    If pruning the input graph during construction, this threshold is used for telling
*    if a node is kept or pruned. If transition_count(node) >= minSuffixCount1, the node
*    is kept.
*
* @param minSuffixCount2
*    (Note: only Mike McCandless knows what this one is really doing...)
*
* @param doShareSuffix
*    If <code>true</code>, the shared suffixes will be compacted into unique paths.
*    This requires an additional RAM-intensive hash map for lookups in memory. Setting this parameter to
*    <code>false</code> creates a single suffix path for all input sequences. This will result in a larger
*    FST, but requires substantially less memory and CPU during building.
*
* @param doShareNonSingletonNodes
*    Only used if doShareSuffix is true. Set this to
*    true to ensure FST is fully minimal, at the cost of more
*    CPU and more RAM during building.
*
* @param shareMaxTailLength
*    Only used if doShareSuffix is true. Set this to
*    Integer.MAX_VALUE to ensure FST is fully minimal, at the cost of more
*    CPU and more RAM during building.
*
* @param outputs The output type for each input sequence. Applies only if building an FST. For
*    FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
*    singleton output object.
*
* @param allowFixedLengthArcs Pass false to disable the fixed length arc optimization (binary search or
*    direct addressing) while building the FST; this will make the resulting FST smaller but slower to
*    traverse.
*
* @param bytesPageBits How many bits wide to make each
*    byte[] block in the BytesStore; if you know the FST
*    will be large then make this larger. For example 15
*    bits = 32768 byte pages.
*/
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
private FSTCompiler(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
boolean allowFixedLengthArcs, int bytesPageBits) {
boolean allowFixedLengthArcs, int bytesPageBits, float directAddressingMaxOversizingFactor) {
this.minSuffixCount1 = minSuffixCount1;
this.minSuffixCount2 = minSuffixCount2;
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
this.shareMaxTailLength = shareMaxTailLength;
this.allowFixedLengthArcs = allowFixedLengthArcs;
this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor;
fst = new FST<>(inputType, outputs, bytesPageBits);
bytes = fst.bytes;
assert bytes != null;
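Since the full constructor above becomes private, the two-argument shortcut is the only direct constructor left; it applies the defaults (no pruning, suffix sharing on, fixed length arcs allowed, 15-bit byte pages, oversizing factor 1). A sketch, with the output type as an illustrative choice:

    // Defaults only:
    FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());
    // Anything non-default must go through the fluent FSTCompiler.Builder defined below.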
@@ -204,23 +136,146 @@ public class Builder<T> {
}
}

/**
* Fluent-style builder for {@link FSTCompiler}.
* <p>
* Creates an FST/FSA builder with all the possible tuning and construction tweaks.
* Read parameter documentation carefully.
*/
public static class Builder<T> {

private final INPUT_TYPE inputType;
private final Outputs<T> outputs;
private int minSuffixCount1;
private int minSuffixCount2;
private boolean shouldShareSuffix = true;
private boolean shouldShareNonSingletonNodes = true;
private int shareMaxTailLength = Integer.MAX_VALUE;
private boolean allowFixedLengthArcs = true;
private int bytesPageBits = 15;
private float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR;

/**
* @param inputType The input type (transition labels). Can be anything from {@link INPUT_TYPE}
*    enumeration. Shorter types will consume less memory. Strings (character sequences) are
*    represented as {@link INPUT_TYPE#BYTE4} (full unicode codepoints).
* @param outputs The output type for each input sequence. Applies only if building an FST. For
*    FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
*    singleton output object.
*/
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
this.inputType = inputType;
this.outputs = outputs;
}

/**
* If pruning the input graph during construction, this threshold is used for telling if a node is kept
* or pruned. If transition_count(node) >= minSuffixCount1, the node is kept.
* <p>
* Default = 0.
*/
public Builder<T> minSuffixCount1(int minSuffixCount1) {
this.minSuffixCount1 = minSuffixCount1;
return this;
}

/**
* Better pruning: we prune a node (and all following nodes) if fewer than this number
* of terms go through the prior node.
* <p>
* Default = 0.
*/
public Builder<T> minSuffixCount2(int minSuffixCount2) {
this.minSuffixCount2 = minSuffixCount2;
return this;
}

/**
* If {@code true}, the shared suffixes will be compacted into unique paths.
* This requires an additional RAM-intensive hash map for lookups in memory. Setting this parameter to
* {@code false} creates a single suffix path for all input sequences. This will result in a larger
* FST, but requires substantially less memory and CPU during building.
* <p>
* Default = {@code true}.
*/
public Builder<T> shouldShareSuffix(boolean shouldShareSuffix) {
this.shouldShareSuffix = shouldShareSuffix;
return this;
}

/**
* Only used if {@code shouldShareSuffix} is true. Set this to true to ensure FST is fully minimal,
* at the cost of more CPU and more RAM during building.
* <p>
* Default = {@code true}.
*/
public Builder<T> shouldShareNonSingletonNodes(boolean shouldShareNonSingletonNodes) {
this.shouldShareNonSingletonNodes = shouldShareNonSingletonNodes;
return this;
}

/**
* Only used if {@code shouldShareSuffix} is true. Set this to Integer.MAX_VALUE to ensure FST is
* fully minimal, at the cost of more CPU and more RAM during building.
* <p>
* Default = {@link Integer#MAX_VALUE}.
*/
public Builder<T> shareMaxTailLength(int shareMaxTailLength) {
this.shareMaxTailLength = shareMaxTailLength;
return this;
}

/**
* Pass {@code false} to disable the fixed length arc optimization (binary search or direct addressing)
* while building the FST; this will make the resulting FST smaller but slower to traverse.
* <p>
* Default = {@code true}.
*/
public Builder<T> allowFixedLengthArcs(boolean allowFixedLengthArcs) {
this.allowFixedLengthArcs = allowFixedLengthArcs;
return this;
}

/**
* How many bits wide to make each byte[] block in the BytesStore; if you know the FST
* will be large then make this larger. For example 15 bits = 32768 byte pages.
* <p>
* Default = 15.
*/
public Builder<T> bytesPageBits(int bytesPageBits) {
this.bytesPageBits = bytesPageBits;
return this;
}

/**
* Overrides the default maximum oversizing of fixed arrays allowed to enable direct addressing
* of arcs instead of binary search.
* <p>
* Setting this factor to a negative value (e.g. -1) effectively disables direct addressing;
* only binary search nodes will be created.
* <p>
* This factor does not determine whether to encode a node with a list of variable length arcs or with
* fixed length arcs. It only determines the effective encoding of a node that is already known to be
* encoded with fixed length arcs.
* <p>
* Default = 1.
*
* @see #DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR
*/
public Builder<T> setDirectAddressingMaxOversizingFactor(float factor) {
directAddressingMaxOversizingFactor = factor;
public Builder<T> directAddressingMaxOversizingFactor(float factor) {
this.directAddressingMaxOversizingFactor = factor;
return this;
}

/**
* @see #setDirectAddressingMaxOversizingFactor(float)
* Creates a new {@link FSTCompiler}.
*/
public FSTCompiler<T> build() {
FSTCompiler<T> fstCompiler = new FSTCompiler<>(inputType, minSuffixCount1, minSuffixCount2, shouldShareSuffix,
shouldShareNonSingletonNodes, shareMaxTailLength, outputs, allowFixedLengthArcs, bytesPageBits,
directAddressingMaxOversizingFactor);
return fstCompiler;
}
}

public float getDirectAddressingMaxOversizingFactor() {
return directAddressingMaxOversizingFactor;
}
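Putting the fluent Builder together: every setter above returns this, so calls chain in any order before build(). A sketch using only the setters defined in this class (the concrete values are illustrative):

    FSTCompiler<Long> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton())
        .minSuffixCount1(2)
        .shouldShareSuffix(true)
        .shareMaxTailLength(Integer.MAX_VALUE)
        .allowFixedLengthArcs(true)
        .bytesPageBits(15)
        .directAddressingMaxOversizingFactor(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR)
        .build();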
@@ -514,7 +569,7 @@ public class Builder<T> {

/** Returns final FST. NOTE: this will return null if
* nothing is accepted by the FST. */
public FST<T> finish() throws IOException {
public FST<T> compile() throws IOException {

final UnCompiledNode<T> root = frontier[0];
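The compile() rename completes the lifecycle: inputs are added in sorted order, then compile() (formerly finish()) freezes the automaton and returns null if nothing was accepted. A sketch mirroring the tests further down (assumes an FSTCompiler<Long> as above):

    IntsRefBuilder scratch = new IntsRefBuilder();
    fstCompiler.add(Util.toIntsRef(new BytesRef("stat"), scratch), 17L);
    fstCompiler.add(Util.toIntsRef(new BytesRef("station"), scratch), 10L);
    FST<Long> fst = fstCompiler.compile(); // null if no input was accepted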
@@ -554,19 +609,19 @@ public class Builder<T> {
}

/** Expert: holds a pending (seen but not yet serialized) arc. */
public static class Arc<T> {
public int label; // really an "unsigned" byte
public Node target;
public boolean isFinal;
public T output;
public T nextFinalOutput;
static class Arc<T> {
int label; // really an "unsigned" byte
Node target;
boolean isFinal;
T output;
T nextFinalOutput;
}

// NOTE: not many instances of Node or CompiledNode are in
// memory while the FST is being built; it's only the
// current "frontier":

static interface Node {
interface Node {
boolean isCompiled();
}

@@ -583,20 +638,20 @@ public class Builder<T> {
}

/** Expert: holds a pending (seen but not yet serialized) Node. */
public static final class UnCompiledNode<T> implements Node {
final Builder<T> owner;
public int numArcs;
public Arc<T>[] arcs;
static final class UnCompiledNode<T> implements Node {
final FSTCompiler<T> owner;
int numArcs;
Arc<T>[] arcs;
// TODO: instead of recording isFinal/output on the
// node, maybe we should use -1 arc to mean "end" (like
// we do when reading the FST). Would simplify much
// code here...
public T output;
public boolean isFinal;
public long inputCount;
T output;
boolean isFinal;
long inputCount;

/** This node's depth, starting from the automaton root. */
public final int depth;
final int depth;

/**
* @param depth

@@ -605,7 +660,7 @@ public class Builder<T> {
* fanout size).
*/
@SuppressWarnings({"rawtypes","unchecked"})
public UnCompiledNode(Builder<T> owner, int depth) {
UnCompiledNode(FSTCompiler<T> owner, int depth) {
this.owner = owner;
arcs = (Arc<T>[]) new Arc[1];
arcs[0] = new Arc<>();

@@ -618,7 +673,7 @@ public class Builder<T> {
return false;
}

public void clear() {
void clear() {
numArcs = 0;
isFinal = false;
output = owner.NO_OUTPUT;

@@ -628,13 +683,13 @@ public class Builder<T> {
// for nodes on the frontier (even when reused).
}

public T getLastOutput(int labelToMatch) {
T getLastOutput(int labelToMatch) {
assert numArcs > 0;
assert arcs[numArcs-1].label == labelToMatch;
return arcs[numArcs-1].output;
}

public void addArc(int label, Node target) {
void addArc(int label, Node target) {
assert label >= 0;
assert numArcs == 0 || label > arcs[numArcs-1].label: "arc[numArcs-1].label=" + arcs[numArcs-1].label + " new label=" + label + " numArcs=" + numArcs;
if (numArcs == arcs.length) {

@@ -651,7 +706,7 @@ public class Builder<T> {
arc.isFinal = false;
}

public void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) {
void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) {
assert numArcs > 0;
final Arc<T> arc = arcs[numArcs-1];
assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch;

@@ -661,14 +716,14 @@ public class Builder<T> {
arc.isFinal = isFinal;
}

public void deleteLast(int label, Node target) {
void deleteLast(int label, Node target) {
assert numArcs > 0;
assert label == arcs[numArcs-1].label;
assert target == arcs[numArcs-1].target;
numArcs--;
}

public void setLastOutput(int labelToMatch, T newOutput) {
void setLastOutput(int labelToMatch, T newOutput) {
assert owner.validOutput(newOutput);
assert numArcs > 0;
final Arc<T> arc = arcs[numArcs-1];

@@ -677,7 +732,7 @@ public class Builder<T> {
}

// pushes an output prefix forward onto all arcs
public void prependOutput(T outputPrefix) {
void prependOutput(T outputPrefix) {
assert owner.validOutput(outputPrefix);

for(int arcIdx=0;arcIdx<numArcs;arcIdx++) {
@@ -39,7 +39,7 @@ final class NodeHash<T> {
this.in = in;
}

private boolean nodesEqual(Builder.UnCompiledNode<T> node, long address) throws IOException {
private boolean nodesEqual(FSTCompiler.UnCompiledNode<T> node, long address) throws IOException {
fst.readFirstRealTargetArc(address, scratchArc, in);

// Fail fast for a node with fixed length arcs.

@@ -58,10 +58,10 @@ final class NodeHash<T> {
}

for(int arcUpto=0; arcUpto < node.numArcs; arcUpto++) {
final Builder.Arc<T> arc = node.arcs[arcUpto];
final FSTCompiler.Arc<T> arc = node.arcs[arcUpto];
if (arc.label != scratchArc.label() ||
!arc.output.equals(scratchArc.output()) ||
((Builder.CompiledNode) arc.target).node != scratchArc.target() ||
((FSTCompiler.CompiledNode) arc.target).node != scratchArc.target() ||
!arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) ||
arc.isFinal != scratchArc.isFinal()) {
return false;

@@ -82,16 +82,16 @@ final class NodeHash<T> {

// hash code for an unfrozen node. This must be identical
// to the frozen case (below)!!
private long hash(Builder.UnCompiledNode<T> node) {
private long hash(FSTCompiler.UnCompiledNode<T> node) {
final int PRIME = 31;
//System.out.println("hash unfrozen");
long h = 0;
// TODO: maybe if number of arcs is high we can safely subsample?
for (int arcIdx=0; arcIdx < node.numArcs; arcIdx++) {
final Builder.Arc<T> arc = node.arcs[arcIdx];
final FSTCompiler.Arc<T> arc = node.arcs[arcIdx];
//System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal);
h = PRIME * h + arc.label;
long n = ((Builder.CompiledNode) arc.target).node;
long n = ((FSTCompiler.CompiledNode) arc.target).node;
h = PRIME * h + (int) (n^(n>>32));
h = PRIME * h + arc.output.hashCode();
h = PRIME * h + arc.nextFinalOutput.hashCode();

@@ -127,7 +127,7 @@ final class NodeHash<T> {
return h & Long.MAX_VALUE;
}

public long add(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn) throws IOException {
public long add(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
//System.out.println("hash: add count=" + count + " vs " + table.size() + " mask=" + mask);
final long h = hash(nodeIn);
long pos = h & mask;

@@ -136,7 +136,7 @@ final class NodeHash<T> {
final long v = table.get(pos);
if (v == 0) {
// freeze & add
final long node = fst.addNode(builder, nodeIn);
final long node = fst.addNode(fstCompiler, nodeIn);
//System.out.println(" now freeze node=" + node);
assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h;
count++;
@@ -54,8 +54,7 @@ public class Test2BFST extends LuceneTestCase {
System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
Outputs<Object> outputs = NoOutputs.getSingleton();
Object NO_OUTPUT = outputs.getNoOutput();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
true, 15);
final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

int count = 0;
Random r = new Random(seed);

@@ -66,21 +65,21 @@ public class Test2BFST extends LuceneTestCase {
for(int i=10;i<ints2.length;i++) {
ints2[i] = r.nextInt(256);
}
b.add(input2, NO_OUTPUT);
fstCompiler.add(input2, NO_OUTPUT);
count++;
if (count % 100000 == 0) {
System.out.println(count + ": " + b.fstRamBytesUsed() + " bytes; " + b.getNodeCount() + " nodes");
System.out.println(count + ": " + fstCompiler.fstRamBytesUsed() + " bytes; " + fstCompiler.getNodeCount() + " nodes");
}
if (b.getNodeCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) {
if (fstCompiler.getNodeCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) {
break;
}
nextInput(r, ints2);
}

FST<Object> fst = b.finish();
FST<Object> fst = fstCompiler.compile();

for(int verify=0;verify<2;verify++) {
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + fstCompiler.getNodeCount() + "; arcCount=" + fstCompiler.getArcCount() + "]");

Arrays.fill(ints2, 0);
r = new Random(seed);

@@ -136,8 +135,7 @@ public class Test2BFST extends LuceneTestCase {
{
System.out.println("\nTEST: 3 GB size; outputs=bytes");
Outputs<BytesRef> outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
true, 15);
final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

byte[] outputBytes = new byte[20];
BytesRef output = new BytesRef(outputBytes);

@@ -147,10 +145,10 @@ public class Test2BFST extends LuceneTestCase {
while(true) {
r.nextBytes(outputBytes);
//System.out.println("add: " + input + " -> " + output);
b.add(input, BytesRef.deepCopyOf(output));
fstCompiler.add(input, BytesRef.deepCopyOf(output));
count++;
if (count % 10000 == 0) {
long size = b.fstRamBytesUsed();
long size = fstCompiler.fstRamBytesUsed();
if (count % 1000000 == 0) {
System.out.println(count + "...: " + size + " bytes");
}

@@ -161,10 +159,10 @@ public class Test2BFST extends LuceneTestCase {
nextInput(r, ints);
}

FST<BytesRef> fst = b.finish();
FST<BytesRef> fst = fstCompiler.compile();
for(int verify=0;verify<2;verify++) {

System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + fstCompiler.getNodeCount() + "; arcCount=" + fstCompiler.getArcCount() + "]");

r = new Random(seed);
Arrays.fill(ints, 0);

@@ -216,8 +214,7 @@ public class Test2BFST extends LuceneTestCase {
{
System.out.println("\nTEST: 3 GB size; outputs=long");
Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
true, 15);
final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

long output = 1;

@@ -226,11 +223,11 @@ public class Test2BFST extends LuceneTestCase {
Random r = new Random(seed);
while(true) {
//System.out.println("add: " + input + " -> " + output);
b.add(input, output);
fstCompiler.add(input, output);
output += 1+r.nextInt(10);
count++;
if (count % 10000 == 0) {
long size = b.fstRamBytesUsed();
long size = fstCompiler.fstRamBytesUsed();
if (count % 1000000 == 0) {
System.out.println(count + "...: " + size + " bytes");
}

@@ -241,11 +238,11 @@ public class Test2BFST extends LuceneTestCase {
nextInput(r, ints);
}

FST<Long> fst = b.finish();
FST<Long> fst = fstCompiler.compile();

for(int verify=0;verify<2;verify++) {

System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + fstCompiler.getNodeCount() + "; arcCount=" + fstCompiler.getArcCount() + "]");

Arrays.fill(ints, 0);
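As the first test block above shows, an FSA (no outputs) uses the NoOutputs singleton and passes its NO_OUTPUT sentinel for every input. Condensed from the tests in this patch:

    Outputs<Object> outputs = NoOutputs.getSingleton();
    FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
    fstCompiler.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRefBuilder()), outputs.getNoOutput());
    FST<Object> fsa = fstCompiler.compile();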
@@ -35,7 +35,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase;

public class TestFstDirectAddressing extends LuceneTestCase {
public class TestFSTDirectAddressing extends LuceneTestCase {

public void testDenseWithGap() throws Exception {
List<String> words = Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm");

@@ -86,13 +86,13 @@ public class TestFstDirectAddressing extends LuceneTestCase {
Collections.sort(wordList);

// Disable direct addressing and measure the FST size.
Builder<Object> builder = createBuilder(-1f);
FST<Object> fst = buildFST(wordList, builder);
FSTCompiler<Object> fstCompiler = createFSTCompiler(-1f);
FST<Object> fst = buildFST(wordList, fstCompiler);
long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed();

// Enable direct addressing and measure the FST size.
builder = createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
fst = buildFST(wordList, builder);
fstCompiler = createFSTCompiler(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
fst = buildFST(wordList, fstCompiler);
long ramBytesUsed = fst.ramBytesUsed();

// Compute the size increase in percent.

@@ -107,42 +107,43 @@ public class TestFstDirectAddressing extends LuceneTestCase {
directAddressingMemoryIncreasePercent < MEMORY_INCREASE_LIMIT_PERCENT);
}

private static void printStats(Builder<Object> builder, long ramBytesUsed, double directAddressingMemoryIncreasePercent) {
System.out.println("directAddressingMaxOversizingFactor = " + builder.getDirectAddressingMaxOversizingFactor());
private static void printStats(FSTCompiler<Object> fstCompiler, long ramBytesUsed, double directAddressingMemoryIncreasePercent) {
System.out.println("directAddressingMaxOversizingFactor = " + fstCompiler.getDirectAddressingMaxOversizingFactor());
System.out.println("ramBytesUsed = "
+ String.format(Locale.ENGLISH, "%.2f MB", ramBytesUsed / 1024d / 1024d)
+ String.format(Locale.ENGLISH, " (%.2f %% increase with direct addressing)", directAddressingMemoryIncreasePercent));
System.out.println("num nodes = " + builder.nodeCount);
long fixedLengthArcNodeCount = builder.directAddressingNodeCount + builder.binarySearchNodeCount;
System.out.println("num nodes = " + fstCompiler.nodeCount);
long fixedLengthArcNodeCount = fstCompiler.directAddressingNodeCount + fstCompiler.binarySearchNodeCount;
System.out.println("num fixed-length-arc nodes = " + fixedLengthArcNodeCount
+ String.format(Locale.ENGLISH, " (%.2f %% of all nodes)",
((double) fixedLengthArcNodeCount / builder.nodeCount * 100)));
System.out.println("num binary-search nodes = " + (builder.binarySearchNodeCount)
((double) fixedLengthArcNodeCount / fstCompiler.nodeCount * 100)));
System.out.println("num binary-search nodes = " + (fstCompiler.binarySearchNodeCount)
+ String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)",
((double) (builder.binarySearchNodeCount) / fixedLengthArcNodeCount * 100)));
System.out.println("num direct-addressing nodes = " + (builder.directAddressingNodeCount)
((double) (fstCompiler.binarySearchNodeCount) / fixedLengthArcNodeCount * 100)));
System.out.println("num direct-addressing nodes = " + (fstCompiler.directAddressingNodeCount)
+ String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)",
((double) (builder.directAddressingNodeCount) / fixedLengthArcNodeCount * 100)));
((double) (fstCompiler.directAddressingNodeCount) / fixedLengthArcNodeCount * 100)));
}

private static Builder<Object> createBuilder(float directAddressingMaxOversizingFactor) {
return new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, NoOutputs.getSingleton(), true, 15)
.setDirectAddressingMaxOversizingFactor(directAddressingMaxOversizingFactor);
private static FSTCompiler<Object> createFSTCompiler(float directAddressingMaxOversizingFactor) {
return new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, NoOutputs.getSingleton())
.directAddressingMaxOversizingFactor(directAddressingMaxOversizingFactor)
.build();
}

private FST<Object> buildFST(List<BytesRef> entries) throws Exception {
return buildFST(entries, createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR));
return buildFST(entries, createFSTCompiler(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR));
}

private static FST<Object> buildFST(List<BytesRef> entries, Builder<Object> builder) throws Exception {
private static FST<Object> buildFST(List<BytesRef> entries, FSTCompiler<Object> fstCompiler) throws Exception {
BytesRef last = null;
for (BytesRef entry : entries) {
if (entry.equals(last) == false) {
builder.add(Util.toIntsRef(entry, new IntsRefBuilder()), NoOutputs.getSingleton().getNoOutput());
fstCompiler.add(Util.toIntsRef(entry, new IntsRefBuilder()), NoOutputs.getSingleton().getNoOutput());
}
last = entry;
}
return builder.finish();
return fstCompiler.compile();
}

public static void main(String... args) throws Exception {

@@ -195,18 +196,18 @@ public class TestFstDirectAddressing extends LuceneTestCase {
Collections.sort(wordList);

// Disable direct addressing and measure the FST size.
Builder<Object> builder = createBuilder(-1f);
FST<Object> fst = buildFST(wordList, builder);
FSTCompiler<Object> fstCompiler = createFSTCompiler(-1f);
FST<Object> fst = buildFST(wordList, fstCompiler);
long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed();

// Enable direct addressing and measure the FST size.
builder = createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
fst = buildFST(wordList, builder);
fstCompiler = createFSTCompiler(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
fst = buildFST(wordList, fstCompiler);
long ramBytesUsed = fst.ramBytesUsed();

// Compute the size increase in percent.
double directAddressingMemoryIncreasePercent = ((double) ramBytesUsed / ramBytesUsedNoDirectAddressing - 1) * 100;

printStats(builder, ramBytesUsed, directAddressingMemoryIncreasePercent);
printStats(fstCompiler, ramBytesUsed, directAddressingMemoryIncreasePercent);
}
}
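The -1f trick used by createFSTCompiler above generalizes: any negative oversizing factor disables direct addressing so only binary-search nodes are created, which is how this test isolates the memory cost. As a standalone sketch:

    FSTCompiler<Object> noDirectAddressing = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, NoOutputs.getSingleton())
        .directAddressingMaxOversizingFactor(-1f)
        .build();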
@@ -327,7 +327,7 @@ public class TestFSTs extends LuceneTestCase {
writer.close();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();

Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

boolean storeOrd = random().nextBoolean();
if (VERBOSE) {

@@ -373,15 +373,15 @@ public class TestFSTs extends LuceneTestCase {
} else {
output = termsEnum.docFreq();
}
builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
fstCompiler.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
ord++;
if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
System.out.println(ord + " terms...");
}
}
FST<Long> fst = builder.finish();
FST<Long> fst = fstCompiler.compile();
if (VERBOSE) {
System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes");
System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fstCompiler.getNodeCount() + " nodes; " + fstCompiler.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes");
}

if (ord > 0) {

@@ -460,7 +460,7 @@ public class TestFSTs extends LuceneTestCase {
private final Path wordsFileIn;
private int inputMode;
private final Outputs<T> outputs;
private final Builder<T> builder;
private final FSTCompiler<T> fstCompiler;

public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean noArcArrays) {
this.dirOut = dirOut;

@@ -468,7 +468,11 @@ public class TestFSTs extends LuceneTestCase {
this.inputMode = inputMode;
this.outputs = outputs;

builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, !noArcArrays, 15);
fstCompiler = new FSTCompiler.Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs)
.minSuffixCount2(prune)
.shouldShareSuffix(prune == 0)
.allowFixedLengthArcs(!noArcArrays)
.build();
}

protected abstract T getOutput(IntsRef input, int ord) throws IOException;
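The VisitTerms conversion above is the general migration recipe: each positional argument of the removed nine-argument constructor maps to a named setter, and arguments equal to the documented defaults are simply dropped. Schematically (parameter names taken from the old signature):

    // old: new Builder<>(inputType, minSuffixCount1, minSuffixCount2, doShareSuffix,
    //                    doShareNonSingletonNodes, shareMaxTailLength, outputs,
    //                    allowFixedLengthArcs, bytesPageBits)
    // new: new FSTCompiler.Builder<>(inputType, outputs)
    //          .minSuffixCount1(...).minSuffixCount2(...).shouldShareSuffix(...)
    //          .shouldShareNonSingletonNodes(...).shareMaxTailLength(...)
    //          .allowFixedLengthArcs(...).bytesPageBits(...).build();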
@@ -486,7 +490,7 @@ public class TestFSTs extends LuceneTestCase {
break;
}
toIntsRef(w, inputMode, intsRef);
builder.add(intsRef.get(),
fstCompiler.add(intsRef.get(),
getOutput(intsRef.get(), ord));

ord++;

@@ -503,8 +507,8 @@ public class TestFSTs extends LuceneTestCase {
long tMid = System.currentTimeMillis();
System.out.println(((tMid-tStart) / 1000.0) + " sec to add all terms");

assert builder.getTermCount() == ord;
FST<T> fst = builder.finish();
assert fstCompiler.getTermCount() == ord;
FST<T> fst = fstCompiler.compile();
long tEnd = System.currentTimeMillis();
System.out.println(((tEnd-tMid) / 1000.0) + " sec to finish/pack");
if (fst == null) {

@@ -516,8 +520,8 @@ public class TestFSTs extends LuceneTestCase {
return;
}

System.out.println(ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs; tot size " + fst.ramBytesUsed());
if (builder.getNodeCount() < 100) {
System.out.println(ord + " terms; " + fstCompiler.getNodeCount() + " nodes; " + fstCompiler.getArcCount() + " arcs; tot size " + fst.ramBytesUsed());
if (fstCompiler.getNodeCount() < 100) {
Writer w = Files.newBufferedWriter(Paths.get("out.dot"), StandardCharsets.UTF_8);
Util.toDot(fst, w, false, false);
w.close();

@@ -717,9 +721,9 @@ public class TestFSTs extends LuceneTestCase {

public void testSingleString() throws Exception {
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
b.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRefBuilder()), outputs.getNoOutput());
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(b.finish());
final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
fstCompiler.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRefBuilder()), outputs.getNoOutput());
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(fstCompiler.compile());
assertNull(fstEnum.seekFloor(new BytesRef("foo")));
assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
}

@@ -728,12 +732,12 @@ public class TestFSTs extends LuceneTestCase {
public void testDuplicateFSAString() throws Exception {
String str = "foobar";
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRefBuilder ints = new IntsRefBuilder();
for(int i=0; i<10; i++) {
b.add(Util.toIntsRef(new BytesRef(str), ints), outputs.getNoOutput());
fstCompiler.add(Util.toIntsRef(new BytesRef(str), ints), outputs.getNoOutput());
}
FST<Object> fst = b.finish();
FST<Object> fst = fstCompiler.compile();

// count the input paths
int count = 0;

@@ -797,17 +801,17 @@ public class TestFSTs extends LuceneTestCase {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();

// Build an FST mapping BytesRef -> Long
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

final BytesRef a = new BytesRef("a");
final BytesRef b = new BytesRef("b");
final BytesRef c = new BytesRef("c");

builder.add(Util.toIntsRef(a, new IntsRefBuilder()), 17L);
builder.add(Util.toIntsRef(b, new IntsRefBuilder()), 42L);
builder.add(Util.toIntsRef(c, new IntsRefBuilder()), 13824324872317238L);
fstCompiler.add(Util.toIntsRef(a, new IntsRefBuilder()), 17L);
fstCompiler.add(Util.toIntsRef(b, new IntsRefBuilder()), 42L);
fstCompiler.add(Util.toIntsRef(c, new IntsRefBuilder()), 13824324872317238L);

final FST<Long> fst = builder.finish();
final FST<Long> fst = fstCompiler.compile();

assertEquals(13824324872317238L, (long) Util.get(fst, c));
assertEquals(42, (long) Util.get(fst, b));

@@ -1035,7 +1039,7 @@ public class TestFSTs extends LuceneTestCase {
FST<Object> compile(String[] lines) throws IOException {
final NoOutputs outputs = NoOutputs.getSingleton();
final Object nothing = outputs.getNoOutput();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

int line = 0;
final BytesRefBuilder term = new BytesRefBuilder();

@@ -1046,10 +1050,10 @@ public class TestFSTs extends LuceneTestCase {
break;
}
term.copyChars(w);
b.add(Util.toIntsRef(term.get(), scratchIntsRef), nothing);
fstCompiler.add(Util.toIntsRef(term.get(), scratchIntsRef), nothing);
}

return b.finish();
return fstCompiler.compile();
}

void generate(ArrayList<String> out, StringBuilder b, char from, char to,

@@ -1110,10 +1114,10 @@ public class TestFSTs extends LuceneTestCase {
public void testFinalOutputOnEndState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();

final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
builder.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L);
builder.add(Util.toUTF32("station", new IntsRefBuilder()), 10L);
final FST<Long> fst = builder.finish();
final FSTCompiler<Long> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).minSuffixCount1(2).build();
fstCompiler.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L);
fstCompiler.add(Util.toUTF32("station", new IntsRefBuilder()), 10L);
final FST<Long> fst = fstCompiler.compile();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
StringWriter w = new StringWriter();
Util.toDot(fst, w, false, false);

@@ -1124,10 +1128,10 @@ public class TestFSTs extends LuceneTestCase {

public void testInternalFinalState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput());
builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput());
final FST<Long> fst = builder.finish();
final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
fstCompiler.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput());
fstCompiler.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput());
final FST<Long> fst = fstCompiler.compile();
StringWriter w = new StringWriter();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
Util.toDot(fst, w, false, false);

@@ -1145,20 +1149,20 @@ public class TestFSTs extends LuceneTestCase {
public void testNonFinalStopNode() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Long nothing = outputs.getNoOutput();
final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

//final FST<Long> fst = new FST<>(FST.INPUT_TYPE.BYTE1, outputs, false, PackedInts.COMPACT, 15);
final FST<Long> fst = b.fst;
final FST<Long> fst = fstCompiler.fst;

final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<>(b, 0);
final FSTCompiler.UnCompiledNode<Long> rootNode = new FSTCompiler.UnCompiledNode<>(fstCompiler, 0);

// Add final stop node
{
final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<>(b, 0);
final FSTCompiler.UnCompiledNode<Long> node = new FSTCompiler.UnCompiledNode<>(fstCompiler, 0);
node.isFinal = true;
rootNode.addArc('a', node);
final Builder.CompiledNode frozen = new Builder.CompiledNode();
frozen.node = fst.addNode(b, node);
final FSTCompiler.CompiledNode frozen = new FSTCompiler.CompiledNode();
frozen.node = fst.addNode(fstCompiler, node);
rootNode.arcs[0].nextFinalOutput = 17L;
rootNode.arcs[0].isFinal = true;
rootNode.arcs[0].output = nothing;

@@ -1167,16 +1171,16 @@ public class TestFSTs extends LuceneTestCase {

// Add non-final stop node
{
final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<>(b, 0);
final FSTCompiler.UnCompiledNode<Long> node = new FSTCompiler.UnCompiledNode<>(fstCompiler, 0);
rootNode.addArc('b', node);
final Builder.CompiledNode frozen = new Builder.CompiledNode();
frozen.node = fst.addNode(b, node);
final FSTCompiler.CompiledNode frozen = new FSTCompiler.CompiledNode();
frozen.node = fst.addNode(fstCompiler, node);
rootNode.arcs[1].nextFinalOutput = nothing;
rootNode.arcs[1].output = 42L;
rootNode.arcs[1].target = frozen;
}

fst.finish(fst.addNode(b, rootNode));
fst.finish(fst.addNode(fstCompiler, rootNode));

StringWriter w = new StringWriter();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));

@@ -1225,13 +1229,13 @@ public class TestFSTs extends LuceneTestCase {

public void testShortestPaths() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

final IntsRefBuilder scratch = new IntsRefBuilder();
builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
final FST<Long> fst = builder.finish();
fstCompiler.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
fstCompiler.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
fstCompiler.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
final FST<Long> fst = fstCompiler.compile();
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);
//w.close();

@@ -1256,16 +1260,16 @@ public class TestFSTs extends LuceneTestCase {

public void testRejectNoLimits() throws IOException {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Long> fstCompiler = new FSTCompiler<Long>(FST.INPUT_TYPE.BYTE1, outputs);

final IntsRefBuilder scratch = new IntsRefBuilder();
builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
builder.add(Util.toIntsRef(new BytesRef("adcd"), scratch), 17L);
builder.add(Util.toIntsRef(new BytesRef("adcde"), scratch), 17L);
fstCompiler.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
fstCompiler.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
fstCompiler.add(Util.toIntsRef(new BytesRef("adcd"), scratch), 17L);
fstCompiler.add(Util.toIntsRef(new BytesRef("adcde"), scratch), 17L);

builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
final FST<Long> fst = builder.finish();
fstCompiler.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
final FST<Long> fst = fstCompiler.compile();
final AtomicInteger rejectCount = new AtomicInteger();
Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, 2, 6, minLongComparator) {
@Override

@@ -1320,13 +1324,13 @@ public class TestFSTs extends LuceneTestCase {
PositiveIntOutputs.getSingleton() // output
);

final Builder<Pair<Long,Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Pair<Long,Long>> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

final IntsRefBuilder scratch = new IntsRefBuilder();
builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), outputs.newPair(22L, 57L));
builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), outputs.newPair(7L, 36L));
builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), outputs.newPair(17L, 85L));
final FST<Pair<Long,Long>> fst = builder.finish();
fstCompiler.add(Util.toIntsRef(new BytesRef("aab"), scratch), outputs.newPair(22L, 57L));
fstCompiler.add(Util.toIntsRef(new BytesRef("aac"), scratch), outputs.newPair(7L, 36L));
fstCompiler.add(Util.toIntsRef(new BytesRef("ax"), scratch), outputs.newPair(17L, 85L));
final FST<Pair<Long,Long>> fst = fstCompiler.compile();
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);
//w.close();

@@ -1361,7 +1365,7 @@ public class TestFSTs extends LuceneTestCase {
final TreeSet<String> allPrefixes = new TreeSet<>();

final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();

for (int i = 0; i < numWords; i++) {

@@ -1382,10 +1386,10 @@ public class TestFSTs extends LuceneTestCase {

for (Map.Entry<String,Long> e : slowCompletor.entrySet()) {
//System.out.println("add: " + e);
builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), e.getValue());
fstCompiler.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), e.getValue());
}

final FST<Long> fst = builder.finish();
final FST<Long> fst = fstCompiler.compile();
//System.out.println("SAVE out.dot");
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);

@@ -1479,7 +1483,7 @@ public class TestFSTs extends LuceneTestCase {
PositiveIntOutputs.getSingleton(), // weight
PositiveIntOutputs.getSingleton() // output
);
final Builder<Pair<Long,Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Pair<Long,Long>> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();

Random random = random();

@@ -1504,10 +1508,10 @@ public class TestFSTs extends LuceneTestCase {
//System.out.println("add: " + e);
long weight = e.getValue().a;
long output = e.getValue().b;
builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), outputs.newPair(weight, output));
fstCompiler.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), outputs.newPair(weight, output));
}

final FST<Pair<Long,Long>> fst = builder.finish();
final FST<Pair<Long,Long>> fst = fstCompiler.compile();
//System.out.println("SAVE out.dot");
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);

@@ -1563,7 +1567,7 @@ public class TestFSTs extends LuceneTestCase {

public void testLargeOutputsOnArrayArcs() throws Exception {
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

final byte[] bytes = new byte[300];
final IntsRefBuilder input = new IntsRefBuilder();

@@ -1572,10 +1576,10 @@ public class TestFSTs extends LuceneTestCase {
for(int arc=0;arc<6;arc++) {
input.setIntAt(0, arc);
output.bytes[0] = (byte) arc;
builder.add(input.get(), BytesRef.deepCopyOf(output));
fstCompiler.add(input.get(), BytesRef.deepCopyOf(output));
}

final FST<BytesRef> fst = builder.finish();
final FST<BytesRef> fst = fstCompiler.compile();
for(int arc=0;arc<6;arc++) {
input.setIntAt(0, arc);
final BytesRef result = Util.get(fst, input.get());

@@ -1608,15 +1612,15 @@ public class TestFSTs extends LuceneTestCase {
Collections.sort(termsList);

ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

IntsRefBuilder input = new IntsRefBuilder();
for(BytesRef term : termsList) {
Util.toIntsRef(term, input);
builder.add(input.get(), term);
fstCompiler.add(input.get(), term);
}

FST<BytesRef> fst = builder.finish();
FST<BytesRef> fst = fstCompiler.compile();

Arc<BytesRef> arc = new FST.Arc<>();
fst.getFirstArc(arc);

@@ -1638,17 +1642,17 @@ public class TestFSTs extends LuceneTestCase {

public void testSimpleDepth() throws Exception {
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

BytesRef ab = new BytesRef("ab");
BytesRef ac = new BytesRef("ac");
BytesRef bd = new BytesRef("bd");

builder.add(Util.toIntsRef(ab, new IntsRefBuilder()), 3L);
builder.add(Util.toIntsRef(ac, new IntsRefBuilder()), 5L);
builder.add(Util.toIntsRef(bd, new IntsRefBuilder()), 7L);
fstCompiler.add(Util.toIntsRef(ab, new IntsRefBuilder()), 3L);
fstCompiler.add(Util.toIntsRef(ac, new IntsRefBuilder()), 5L);
fstCompiler.add(Util.toIntsRef(bd, new IntsRefBuilder()), 7L);

FST<Long> fst = builder.finish();
FST<Long> fst = fstCompiler.compile();

assertEquals(3, (long) Util.get(fst, ab));
assertEquals(5, (long) Util.get(fst, ac));

@@ -83,15 +83,17 @@ public class TestUtil extends LuceneTestCase {

private FST<Object> buildFST(List<String> words, boolean allowArrayArcs, boolean allowDirectAddressing) throws Exception {
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, allowArrayArcs, 15);
final FSTCompiler.Builder<Object> builder = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
.allowFixedLengthArcs(allowArrayArcs);
if (!allowDirectAddressing) {
b.setDirectAddressingMaxOversizingFactor(-1f);
builder.directAddressingMaxOversizingFactor(-1f);
}
final FSTCompiler<Object> fstCompiler = builder.build();

for (String word : words) {
b.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput());
fstCompiler.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput());
}
return b.finish();
return fstCompiler.compile();
}

private List<String> createRandomDictionary(int width, int depth) {
@@ -30,7 +30,7 @@ import org.apache.lucene.util.RamUsageEstimator;
* more of its output values. You can use this when a single
* input may need to map to more than one output,
* maintaining order: pass the same input with a different
* output by calling {@link Builder#add(IntsRef,Object)} multiple
* output by calling {@link FSTCompiler#add(IntsRef,Object)} multiple
* times. The builder will then combine the outputs using
* the {@link Outputs#merge(Object,Object)} method.
*

@@ -41,7 +41,7 @@ import org.apache.lucene.util.RamUsageEstimator;
* <p>NOTE: the only way to create multiple outputs is to
* add the same input to the FST multiple times in a row. This is
* how the FST maps a single input to multiple outputs (e.g. you
* cannot pass a List<Object> to {@link Builder#add}). If
* cannot pass a List<Object> to {@link FSTCompiler#add}). If
* your outputs are longs, and you need at most 2, then use
* {@link UpToTwoPositiveIntOutputs} instead since it stores
* the outputs more compactly (by stealing a bit from each
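Concretely, the merge behavior this javadoc describes looks like the following with the renamed API (condensed from testListOfOutputs further down in this patch):

    ListOfOutputs<Long> outputs = new ListOfOutputs<>(PositiveIntOutputs.getSingleton());
    FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
    IntsRefBuilder scratch = new IntsRefBuilder();
    fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L); // same input twice in a row...
    fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L); // ...outputs merged via Outputs.merge()
    Object merged = Util.get(fstCompiler.compile(), new BytesRef("a"));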
@@ -35,14 +35,14 @@ import org.apache.lucene.util.SuppressForbidden;
* <p>NOTE: the only way to create a TwoLongs output is to
* add the same input to the FST twice in a row. This is
* how the FST maps a single input to two outputs (e.g. you
* cannot pass a TwoLongs to {@link Builder#add}). If you
* cannot pass a TwoLongs to {@link FSTCompiler#add}). If you
* need more than two then use {@link ListOfOutputs}, but if
* you only have at most 2 then this implementation will
* require fewer bytes as it steals one bit from each long
* value.
*
* <p>NOTE: the resulting FST is not guaranteed to be minimal!
* See {@link Builder}.
* See {@link FSTCompiler}.
*
* @lucene.experimental
*/
@@ -164,16 +164,16 @@ public class TestFSTsMisc extends LuceneTestCase {
public void testListOfOutputs() throws Exception {
PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
ListOfOutputs<Long> outputs = new ListOfOutputs<>(_outputs);
final Builder<Object> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

final IntsRefBuilder scratch = new IntsRefBuilder();
// Add the same input more than once and the outputs
// are merged:
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
builder.add(Util.toIntsRef(new BytesRef("b"), scratch), 17L);
final FST<Object> fst = builder.finish();
fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
fstCompiler.add(Util.toIntsRef(new BytesRef("b"), scratch), 17L);
final FST<Object> fst = fstCompiler.compile();

Object output = Util.get(fst, new BytesRef("a"));
assertNotNull(output);

@@ -193,20 +193,20 @@ public class TestFSTsMisc extends LuceneTestCase {
public void testListOfOutputsEmptyString() throws Exception {
PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
ListOfOutputs<Long> outputs = new ListOfOutputs<>(_outputs);
final Builder<Object> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

final IntsRefBuilder scratch = new IntsRefBuilder();
builder.add(scratch.get(), 0L);
builder.add(scratch.get(), 1L);
builder.add(scratch.get(), 17L);
builder.add(scratch.get(), 1L);
fstCompiler.add(scratch.get(), 0L);
fstCompiler.add(scratch.get(), 1L);
fstCompiler.add(scratch.get(), 17L);
fstCompiler.add(scratch.get(), 1L);

builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
builder.add(Util.toIntsRef(new BytesRef("b"), scratch), 0L);
fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
fstCompiler.add(Util.toIntsRef(new BytesRef("b"), scratch), 0L);

final FST<Object> fst = builder.finish();
final FST<Object> fst = fstCompiler.compile();

Object output = Util.get(fst, new BytesRef(""));
assertNotNull(output);
@@ -43,7 +43,7 @@ import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
@@ -350,29 +350,28 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
}
}

final Builder<Pair<BytesRef,Long>> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
0, 0, true, false, Integer.MAX_VALUE,
FST_OUTPUTS, true, 15);
final FSTCompiler<Pair<BytesRef,Long>> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS)
.shouldShareNonSingletonNodes(false).build();
//if (DEBUG) {
// System.out.println("  compile index for prefix=" + prefix);
//}
//indexBuilder.DEBUG = false;
final byte[] bytes = scratchBytes.toArrayCopy();
assert bytes.length > 0;
indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), FST_OUTPUTS.newPair(new BytesRef(bytes, 0, bytes.length), Long.MAX_VALUE - maxVersionIndex));
fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef), FST_OUTPUTS.newPair(new BytesRef(bytes, 0, bytes.length), Long.MAX_VALUE - maxVersionIndex));
scratchBytes.reset();

// Copy over index for all sub-blocks
for(PendingBlock block : blocks) {
if (block.subIndices != null) {
for(FST<Pair<BytesRef,Long>> subIndex : block.subIndices) {
append(indexBuilder, subIndex, scratchIntsRef);
append(fstCompiler, subIndex, scratchIntsRef);
}
block.subIndices = null;
}
}

index = indexBuilder.finish();
index = fstCompiler.compile();

assert subIndices == null;
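For readers tracking the conversion pattern: the old positional arguments map onto named fluent setters. A sketch of the correspondence as applied in this hunk (the trailing outputs, true, 15 arguments become the builder's outputs argument plus defaults whose setter names are not shown in this hunk):

    // Old: new Builder<>(FST.INPUT_TYPE.BYTE1,
    //                    0, 0,                // minSuffixCount1, minSuffixCount2
    //                    true, false,         // shareSuffix, shareNonSingletonNodes
    //                    Integer.MAX_VALUE,   // shareMaxTailLength
    //                    FST_OUTPUTS, true, 15)
    // New: only the one non-default value needs spelling out:
    FSTCompiler<Pair<BytesRef,Long>> fstCompiler =
        new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS)
            .shouldShareNonSingletonNodes(false)
            .build();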
@@ -387,14 +386,14 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
// TODO: maybe we could add bulk-add method to
// Builder? Takes FST and unions it w/ current
// FST.
private void append(Builder<Pair<BytesRef,Long>> builder, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
private void append(FSTCompiler<Pair<BytesRef,Long>> fstCompiler, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
final BytesRefFSTEnum<Pair<BytesRef,Long>> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
BytesRefFSTEnum.InputOutput<Pair<BytesRef,Long>> indexEnt;
while((indexEnt = subIndexEnum.next()) != null) {
//if (DEBUG) {
// System.out.println("  add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
//}
builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
}
}
}
@@ -52,7 +52,7 @@ import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.FST;
@@ -496,7 +496,7 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);

PairOutputs<Long,BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
Builder<Pair<Long,BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
FSTCompiler<Pair<Long,BytesRef>> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

// Build FST:
BytesRefBuilder previousAnalyzed = null;
@@ -570,7 +570,7 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
Util.toIntsRef(analyzed.get(), scratchInts);
//System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
if (!hasPayloads) {
builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
fstCompiler.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
} else {
int payloadOffset = input.getPosition() + surface.length;
int payloadLength = bytes.length - payloadOffset;
@@ -579,10 +579,10 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
br.bytes[surface.length] = PAYLOAD_SEP;
System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);
br.length = br.bytes.length;
builder.add(scratchInts.get(), outputs.newPair(cost, br));
fstCompiler.add(scratchInts.get(), outputs.newPair(cost, br));
}
}
fst = builder.finish();
fst = fstCompiler.compile();

//Util.dotToFile(fst, "/tmp/suggest.dot");
} finally {
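The payload branch above packs the surface form and the payload into a single output BytesRef; a sketch of the layout it writes (PAYLOAD_SEP is the separator constant defined in this class):

    // br.bytes layout produced above:
    //   [0 .. surface.length-1]        surface form bytes
    //   [surface.length]               PAYLOAD_SEP marker
    //   [surface.length+1 .. end]      payload bytes (payloadLength of them)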
@@ -66,7 +66,7 @@ import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
@@ -304,7 +304,7 @@ public class FreeTextSuggester extends Lookup implements Accountable {
TermsEnum termsEnum = terms.iterator();

Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

IntsRefBuilder scratchInts = new IntsRefBuilder();
while (true) {
@@ -320,10 +320,10 @@ public class FreeTextSuggester extends Lookup implements Accountable {
totTokens += termsEnum.totalTermFreq();
}

builder.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
fstCompiler.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
}

fst = builder.finish();
fst = fstCompiler.compile();
if (fst == null) {
throw new IllegalArgumentException("need at least one suggestion");
}
@@ -25,7 +25,7 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;
@@ -53,7 +53,7 @@ final class NRTSuggesterBuilder {
public static final int END_BYTE = 0x0;

private final PairOutputs<Long, BytesRef> outputs;
private final Builder<PairOutputs.Pair<Long, BytesRef>> builder;
private final FSTCompiler<PairOutputs.Pair<Long, BytesRef>> fstCompiler;
private final IntsRefBuilder scratchInts = new IntsRefBuilder();
private final BytesRefBuilder analyzed = new BytesRefBuilder();
private final PriorityQueue<Entry> entries;
@@ -70,7 +70,7 @@ final class NRTSuggesterBuilder {
this.endByte = END_BYTE;
this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
this.entries = new PriorityQueue<>();
this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
}

/**
@@ -108,7 +108,7 @@ final class NRTSuggesterBuilder {
}
analyzed.setByteAt(analyzed.length() - 1, (byte) numArcs++);
Util.toIntsRef(analyzed.get(), scratchInts);
builder.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload));
fstCompiler.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload));
}
maxAnalyzedPathsPerOutput = Math.max(maxAnalyzedPathsPerOutput, entries.size());
entries.clear();
@@ -119,11 +119,11 @@ final class NRTSuggesterBuilder {
* {@link NRTSuggester#load(IndexInput, CompletionPostingsFormat.FSTLoadMode)})}
*/
public boolean store(DataOutput output) throws IOException {
final FST<PairOutputs.Pair<Long, BytesRef>> build = builder.finish();
if (build == null) {
final FST<PairOutputs.Pair<Long, BytesRef>> fst = fstCompiler.compile();
if (fst == null) {
return false;
}
build.save(output);
fst.save(output);

/* write some more meta-info */
assert maxAnalyzedPathsPerOutput > 0;
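As the retained null checks here and in the suggesters above suggest, compile() keeps finish()'s contract of returning null when nothing was added; the caller-side pattern, sketched:

    FST<PairOutputs.Pair<Long, BytesRef>> fst = fstCompiler.compile();
    if (fst == null) {
      return false;  // nothing was added, so there is no FST to save
    }
    fst.save(output);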
@@ -169,7 +169,7 @@ public class FSTCompletionBuilder {
* @param shareMaxTailLength
*     Maximum shared suffix length.
*
* See the description of this parameter in {@link Builder}'s constructor.
* See the description of this parameter in {@link org.apache.lucene.util.fst.FSTCompiler.Builder}.
* In general, for very large inputs you'll want to construct a non-minimal
* automaton which will be larger, but the construction will take far less RAM.
* For minimal automata, set it to {@link Integer#MAX_VALUE}.
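A sketch of the tradeoff described in that javadoc, using the fluent builder from this commit (outputs stands for whatever Outputs instance the caller uses; the cap value 3 is illustrative only):

    // Minimal automaton: unbounded suffix sharing, highest construction RAM
    FSTCompiler<Object> minimal = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
        .shareMaxTailLength(Integer.MAX_VALUE)
        .build();

    // Non-minimal: cap tail sharing; the FST comes out larger but builds in far less RAM
    FSTCompiler<Object> frugal = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
        .shareMaxTailLength(3)
        .build();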
@@ -234,9 +234,8 @@ public class FSTCompletionBuilder {
// Build the automaton.
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Object empty = outputs.getNoOutput();
final Builder<Object> builder = new Builder<>(
FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
shareMaxTailLength, outputs, true, 15);
final FSTCompiler<Object> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
.shareMaxTailLength(shareMaxTailLength).build();

BytesRefBuilder scratch = new BytesRefBuilder();
BytesRef entry;
@@ -246,11 +245,11 @@ public class FSTCompletionBuilder {
while((entry = iter.next()) != null) {
count++;
if (scratch.get().compareTo(entry) != 0) {
builder.add(Util.toIntsRef(entry, scratchIntsRef), empty);
fstCompiler.add(Util.toIntsRef(entry, scratchIntsRef), empty);
scratch.copyBytes(entry);
}
}

return count == 0 ? null : builder.finish();
return count == 0 ? null : fstCompiler.compile();
}
}
@@ -40,7 +40,7 @@ import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
@@ -116,7 +116,7 @@ public class WFSTCompletionLookup extends Lookup implements Accountable {
IntsRefBuilder scratchInts = new IntsRefBuilder();
BytesRefBuilder previous = null;
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
while ((scratch = iter.next()) != null) {
long cost = iter.weight();
@@ -127,11 +127,11 @@ public class WFSTCompletionLookup extends Lookup implements Accountable {
// added
}
Util.toIntsRef(scratch, scratchInts);
builder.add(scratchInts.get(), cost);
fstCompiler.add(scratchInts.get(), cost);
previous.copyBytes(scratch);
count++;
}
fst = builder.finish();
fst = fstCompiler.compile();
}
@@ -272,27 +272,26 @@ public class FSTTester<T> {
System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
}

final Builder<T> builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
prune1, prune2,
prune1==0 && prune2==0,
allowRandomSuffixSharing ? random.nextBoolean() : true,
allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
outputs,
true,
15);
final FSTCompiler<T> fstCompiler = new FSTCompiler.Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs)
.minSuffixCount1(prune1)
.minSuffixCount2(prune2)
.shouldShareSuffix(prune1==0 && prune2==0)
.shouldShareNonSingletonNodes(allowRandomSuffixSharing ? random.nextBoolean() : true)
.shareMaxTailLength(allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE)
.build();

for(InputOutput<T> pair : pairs) {
if (pair.output instanceof List) {
@SuppressWarnings("unchecked") List<Long> longValues = (List<Long>) pair.output;
@SuppressWarnings("unchecked") final Builder<Object> builderObject = (Builder<Object>) builder;
@SuppressWarnings("unchecked") final FSTCompiler<Object> fstCompilerObject = (FSTCompiler<Object>) fstCompiler;
for(Long value : longValues) {
builderObject.add(pair.input, value);
fstCompilerObject.add(pair.input, value);
}
} else {
builder.add(pair.input, pair.output);
fstCompiler.add(pair.input, pair.output);
}
}
FST<T> fst = builder.finish();
FST<T> fst = fstCompiler.compile();

if (random.nextBoolean() && fst != null) {
IOContext context = LuceneTestCase.newIOContext(random);
@@ -320,7 +319,7 @@ public class FSTTester<T> {
if (fst == null) {
System.out.println("  fst has 0 nodes (fully pruned)");
} else {
System.out.println("  fst has " + builder.getNodeCount() + " nodes and " + builder.getArcCount() + " arcs");
System.out.println("  fst has " + fstCompiler.getNodeCount() + " nodes and " + fstCompiler.getArcCount() + " arcs");
}
}
@@ -330,8 +329,8 @@ public class FSTTester<T> {
verifyPruned(inputMode, fst, prune1, prune2);
}

nodeCount = builder.getNodeCount();
arcCount = builder.getArcCount();
nodeCount = fstCompiler.getNodeCount();
arcCount = fstCompiler.getArcCount();

return fst;
}