LUCENE-9089: FST Builder renamed FSTCompiler with fluent-style Builder.

Closes #1070
This commit is contained in:
Bruno Roustant 2019-12-10 16:41:43 +01:00
parent f083f40b28
commit 1812b367ab
No known key found for this signature in database
GPG Key ID: CD28DABB95360525
36 changed files with 545 additions and 481 deletions

View File

@ -38,6 +38,9 @@ API Changes
* LUCENE-8905: Better defence against malformed arguments in TopDocsCollector
(Atri Sharma)
* LUCENE-9089: FST Builder renamed FSTCompiler with fluent-style Builder.
(Bruno Roustant)
Improvements
* LUCENE-8757: When provided with an ExecutorService to run queries across

View File

@ -1,5 +1,10 @@
# Apache Lucene Migration Guide
## o.a.l.util.fst.Builder is renamed FSTCompiler with fluent-style Builder (LUCENE-9089) ##
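Simply use FSTCompiler instead of the previous Builder, and call FSTCompiler.compile() where you previously called
Builder.finish(). Use either the simple constructor with default settings, or the FSTCompiler.Builder to tune and
tweak any parameter, for example
`new FSTCompiler.Builder<>(inputType, outputs).shouldShareNonSingletonNodes(false).build()`.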
## Kuromoji user dictionary now forbids illegal segmentation (LUCENE-8933) ##
User dictionary now strictly validates if the (concatenated) segment is the same as the surface form. This change avoids

View File

@ -25,6 +25,7 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;
@ -106,13 +107,13 @@ public class NormalizeCharMap {
final FST<CharsRef> map;
try {
final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
final FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
builder.add(Util.toUTF16(ent.getKey(), scratch),
fstCompiler.add(Util.toUTF16(ent.getKey(), scratch),
new CharsRef(ent.getValue()));
}
map = builder.finish();
map = fstCompiler.compile();
pendingPairs.clear();
} catch (IOException ioe) {
// Bogus FST IOExceptions!! (will never happen)

View File

@ -64,7 +64,7 @@ import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntSequenceOutputs;
@ -231,9 +231,9 @@ public class Dictionary {
// read dictionary entries
IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, b);
words = b.finish();
FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, o);
readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, fstCompiler);
words = fstCompiler.compile();
aliases = null; // no longer needed
morphAliases = null; // no longer needed
success = true;
@ -414,7 +414,7 @@ public class Dictionary {
private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException {
IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
IntsRefBuilder scratch = new IntsRefBuilder();
for (Map.Entry<String,List<Integer>> entry : affixes.entrySet()) {
Util.toUTF32(entry.getKey(), scratch);
@ -423,9 +423,9 @@ public class Dictionary {
for (Integer c : entries) {
output.ints[output.length++] = c;
}
builder.add(scratch.get(), output);
fstCompiler.add(scratch.get(), output);
}
return builder.finish();
return fstCompiler.compile();
}
static String escapeDash(String re) {
@ -608,14 +608,14 @@ public class Dictionary {
}
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
IntsRefBuilder scratchInts = new IntsRefBuilder();
for (Map.Entry<String,String> entry : mappings.entrySet()) {
Util.toUTF16(entry.getKey(), scratchInts);
builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
}
return builder.finish();
return fstCompiler.compile();
}
/** pattern accepts optional BOM + SET + any whitespace */
@ -776,7 +776,7 @@ public class Dictionary {
* @param decoder CharsetDecoder used to decode the contents of the file
* @throws IOException Can be thrown while reading from the file
*/
private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException {
private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, FSTCompiler<IntsRef> words) throws IOException {
BytesRefBuilder flagsScratch = new BytesRefBuilder();
IntsRefBuilder scratchInts = new IntsRefBuilder();

View File

@ -35,6 +35,7 @@ import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.FSTCompiler;
/**
* Provides the ability to override any {@link KeywordAttribute} aware stemmer
@ -203,7 +204,7 @@ public final class StemmerOverrideFilter extends TokenFilter {
*/
public StemmerOverrideMap build() throws IOException {
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>(
FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(
FST.INPUT_TYPE.BYTE4, outputs);
final int[] sort = hash.sort();
IntsRefBuilder intsSpare = new IntsRefBuilder();
@ -213,9 +214,9 @@ public final class StemmerOverrideFilter extends TokenFilter {
int id = sort[i];
BytesRef bytesRef = hash.get(id, spare);
intsSpare.copyUTF8Bytes(bytesRef);
builder.add(intsSpare.get(), new BytesRef(outputValues.get(id)));
fstCompiler.add(intsSpare.get(), new BytesRef(outputValues.get(id)));
}
return new StemmerOverrideMap(builder.finish(), ignoreCase);
return new StemmerOverrideMap(fstCompiler.compile(), ignoreCase);
}
}

View File

@ -39,6 +39,7 @@ import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Util;
/**
@ -213,8 +214,8 @@ public class SynonymMap {
public SynonymMap build() throws IOException {
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
// TODO: are we using the best sharing options?
org.apache.lucene.util.fst.Builder<BytesRef> builder =
new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
FSTCompiler<BytesRef> fstCompiler =
new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
BytesRefBuilder scratch = new BytesRefBuilder();
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
@ -278,10 +279,10 @@ public class SynonymMap {
scratch.setLength(scratchOutput.getPosition());
//System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
builder.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
fstCompiler.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
}
FST<BytesRef> fst = builder.finish();
FST<BytesRef> fst = fstCompiler.compile();
return new SynonymMap(fst, words, maxHorizontalContext);
}
}

View File

@ -30,7 +30,7 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
@ -196,26 +196,26 @@ public class TestDictionary extends LuceneTestCase {
public void testReplacements() throws Exception {
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
IntsRefBuilder scratchInts = new IntsRefBuilder();
// a -> b
Util.toUTF16("a", scratchInts);
builder.add(scratchInts.get(), new CharsRef("b"));
fstCompiler.add(scratchInts.get(), new CharsRef("b"));
// ab -> c
Util.toUTF16("ab", scratchInts);
builder.add(scratchInts.get(), new CharsRef("c"));
fstCompiler.add(scratchInts.get(), new CharsRef("c"));
// c -> de
Util.toUTF16("c", scratchInts);
builder.add(scratchInts.get(), new CharsRef("de"));
fstCompiler.add(scratchInts.get(), new CharsRef("de"));
// def -> gh
Util.toUTF16("def", scratchInts);
builder.add(scratchInts.get(), new CharsRef("gh"));
fstCompiler.add(scratchInts.get(), new CharsRef("gh"));
FST<CharsRef> fst = builder.finish();
FST<CharsRef> fst = fstCompiler.compile();
StringBuilder sb = new StringBuilder("atestanother");
Dictionary.applyMappings(fst, sb);

View File

@ -29,7 +29,7 @@ import java.util.TreeMap;
import org.apache.lucene.analysis.ja.util.CSVUtil;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
@ -99,7 +99,7 @@ public final class UserDictionary implements Dictionary {
List<int[]> segmentations = new ArrayList<>(featureEntries.size());
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
IntsRefBuilder scratch = new IntsRefBuilder();
long ord = 0;
@ -136,11 +136,11 @@ public final class UserDictionary implements Dictionary {
for (int i = 0; i < token.length(); i++) {
scratch.setIntAt(i, (int) token.charAt(i));
}
fstBuilder.add(scratch.get(), ord);
fstCompiler.add(scratch.get(), ord);
segmentations.add(wordIdAndLength);
ord++;
}
this.fst = new TokenInfoFST(fstBuilder.finish(), false);
this.fst = new TokenInfoFST(fstCompiler.compile(), false);
this.data = data.toArray(new String[data.size()]);
this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
}

View File

@ -31,7 +31,7 @@ import java.util.stream.Stream;
import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
@ -97,7 +97,7 @@ class TokenInfoDictionaryBuilder {
lines.sort(Comparator.comparing(entry -> entry[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
IntsRefBuilder scratch = new IntsRefBuilder();
long ord = -1; // first ord will be 0
String lastValue = null;
@ -120,12 +120,12 @@ class TokenInfoDictionaryBuilder {
for (int i = 0; i < token.length(); i++) {
scratch.setIntAt(i, (int) token.charAt(i));
}
fstBuilder.add(scratch.get(), ord);
fstCompiler.add(scratch.get(), ord);
}
dictionary.addMapping((int) ord, offset);
offset = next;
}
dictionary.setFST(fstBuilder.finish());
dictionary.setFST(fstCompiler.compile());
return dictionary;
}

View File

@ -25,7 +25,7 @@ import java.util.List;
import org.apache.lucene.analysis.ko.POS;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
@ -83,7 +83,7 @@ public final class UserDictionary implements Dictionary {
entries.sort(Comparator.comparing(e -> e.split("\\s+")[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
IntsRefBuilder scratch = new IntsRefBuilder();
String lastToken = null;
@ -129,11 +129,11 @@ public final class UserDictionary implements Dictionary {
for (int i = 0; i < token.length(); i++) {
scratch.setIntAt(i, token.charAt(i));
}
fstBuilder.add(scratch.get(), ord);
fstCompiler.add(scratch.get(), ord);
lastToken = token;
ord ++;
}
this.fst = new TokenInfoFST(fstBuilder.finish());
this.fst = new TokenInfoFST(fstCompiler.compile());
this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
this.rightIds = new short[rightIds.size()];
for (int i = 0; i < rightIds.size(); i++) {

View File

@ -30,7 +30,7 @@ import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
@ -90,7 +90,7 @@ class TokenInfoDictionaryBuilder {
lines.sort(Comparator.comparing(left -> left[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
IntsRefBuilder scratch = new IntsRefBuilder();
long ord = -1; // first ord will be 0
String lastValue = null;
@ -116,12 +116,12 @@ class TokenInfoDictionaryBuilder {
for (int i = 0; i < surfaceForm.length(); i++) {
scratch.setIntAt(i, surfaceForm.charAt(i));
}
fstBuilder.add(scratch.get(), ord);
fstCompiler.add(scratch.get(), ord);
}
dictionary.addMapping((int) ord, offset);
offset = next;
}
dictionary.setFST(fstBuilder.finish());
dictionary.setFST(fstCompiler.compile());
return dictionary;
}
}

View File

@ -41,7 +41,7 @@ import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
@ -183,15 +183,15 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
private void updateFST(SortedMap<String, Double> weights) throws IOException {
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
BytesRefBuilder scratchBytes = new BytesRefBuilder();
IntsRefBuilder scratchInts = new IntsRefBuilder();
for (Map.Entry<String, Double> entry : weights.entrySet()) {
scratchBytes.copyChars(entry.getKey());
fstBuilder.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry
fstCompiler.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry
.getValue().longValue());
}
fst = fstBuilder.finish();
fst = fstCompiler.compile();
}

View File

@ -33,7 +33,7 @@ import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
@ -219,7 +219,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
}
private class FSTFieldWriter extends FieldWriter {
private final Builder<Long> fstBuilder;
private final FSTCompiler<Long> fstCompiler;
private final PositiveIntOutputs fstOutputs;
private final long startTermsFilePointer;
@ -233,12 +233,12 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
this.fieldInfo = fieldInfo;
fstOutputs = PositiveIntOutputs.getSingleton();
fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
indexStart = out.getFilePointer();
////System.out.println("VGW: field=" + fieldInfo.name);
// Always put empty string in
fstBuilder.add(new IntsRef(), termsFilePointer);
fstCompiler.add(new IntsRef(), termsFilePointer);
startTermsFilePointer = termsFilePointer;
}
@ -269,7 +269,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
final int lengthSave = text.length;
text.length = indexedTermPrefixLength(lastTerm.get(), text);
try {
fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
fstCompiler.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
} finally {
text.length = lengthSave;
}
@ -278,7 +278,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
@Override
public void finish(long termsFilePointer) throws IOException {
fst = fstBuilder.finish();
fst = fstCompiler.compile();
if (fst != null) {
fst.save(out);
}

View File

@ -45,7 +45,7 @@ import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;
@ -361,16 +361,14 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
}
}
final Builder<Output> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
0, 0, true, false, Integer.MAX_VALUE,
FST_OUTPUTS, true, 15);
final FSTCompiler<Output> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS).shouldShareNonSingletonNodes(false).build();
//if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
//}
//indexBuilder.DEBUG = false;
final byte[] bytes = scratchBytes.toArrayCopy();
assert bytes.length > 0;
indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef),
fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef),
FST_OUTPUTS.newOutput(new BytesRef(bytes, 0, bytes.length),
0, Long.MAX_VALUE-(sumTotalTermCount-1)));
scratchBytes.reset();
@ -381,7 +379,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
for(PendingBlock block : blocks) {
if (block.subIndices != null) {
for(SubIndex subIndex : block.subIndices) {
append(indexBuilder, subIndex.index, termOrdOffset + subIndex.termOrdStart, scratchIntsRef);
append(fstCompiler, subIndex.index, termOrdOffset + subIndex.termOrdStart, scratchIntsRef);
}
block.subIndices = null;
}
@ -391,7 +389,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
assert sumTotalTermCount == totFloorTermCount;
index = indexBuilder.finish();
index = fstCompiler.compile();
assert subIndices == null;
/*
@ -405,7 +403,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
// TODO: maybe we could add bulk-add method to
// FSTCompiler? Takes FST and unions it w/ current
// FST.
private void append(Builder<Output> builder, FST<Output> subIndex, long termOrdOffset, IntsRefBuilder scratchIntsRef) throws IOException {
private void append(FSTCompiler<Output> fstCompiler, FST<Output> subIndex, long termOrdOffset, IntsRefBuilder scratchIntsRef) throws IOException {
final BytesRefFSTEnum<Output> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
BytesRefFSTEnum.InputOutput<Output> indexEnt;
while ((indexEnt = subIndexEnum.next()) != null) {
@ -416,7 +414,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
//long blockTermCount = output.endOrd - output.startOrd + 1;
Output newOutput = FST_OUTPUTS.newOutput(output.bytes, termOrdOffset+output.startOrd, output.endOrd-termOrdOffset);
//System.out.println(" append sub=" + indexEnt.input + " output=" + indexEnt.output + " termOrdOffset=" + termOrdOffset + " blockTermCount=" + blockTermCount + " newOutput=" + newOutput + " endOrd=" + (termOrdOffset+Long.MAX_VALUE-output.endOrd));
builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), newOutput);
fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), newOutput);
}
}
}

View File

@ -41,7 +41,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
@ -287,7 +287,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
}
final class TermsWriter {
private final Builder<Long> builder;
private final FSTCompiler<Long> fstCompiler;
private final PositiveIntOutputs outputs;
private final FieldInfo fieldInfo;
private final int longsSize;
@ -311,7 +311,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
this.fieldInfo = fieldInfo;
this.longsSize = postingsWriter.setField(fieldInfo);
this.outputs = PositiveIntOutputs.getSingleton();
this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
this.lastBlockStatsFP = 0;
this.lastBlockMetaLongsFP = 0;
@ -346,7 +346,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
}
metaLongsOut.writeVLong(metaBytesOut.size() - lastMetaBytesFP);
builder.add(Util.toIntsRef(text, scratchTerm), numTerms);
fstCompiler.add(Util.toIntsRef(text, scratchTerm), numTerms);
numTerms++;
lastMetaBytesFP = metaBytesOut.size();
@ -365,7 +365,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
metadata.statsOut = statsOut;
metadata.metaLongsOut = metaLongsOut;
metadata.metaBytesOut = metaBytesOut;
metadata.dict = builder.finish();
metadata.dict = fstCompiler.compile();
fields.add(metadata);
}
}

View File

@ -41,7 +41,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;
@ -247,7 +247,7 @@ public class FSTTermsWriter extends FieldsConsumer {
}
final class TermsWriter {
private final Builder<FSTTermOutputs.TermData> builder;
private final FSTCompiler<FSTTermOutputs.TermData> fstCompiler;
private final FSTTermOutputs outputs;
private final FieldInfo fieldInfo;
private final int longsSize;
@ -261,7 +261,7 @@ public class FSTTermsWriter extends FieldsConsumer {
this.fieldInfo = fieldInfo;
this.longsSize = postingsWriter.setField(fieldInfo);
this.outputs = new FSTTermOutputs(fieldInfo, longsSize);
this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
}
public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
@ -276,14 +276,14 @@ public class FSTTermsWriter extends FieldsConsumer {
meta.bytes = metaWriter.toArrayCopy();
metaWriter.reset();
}
builder.add(Util.toIntsRef(text, scratchTerm), meta);
fstCompiler.add(Util.toIntsRef(text, scratchTerm), meta);
numTerms++;
}
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
// save FST dict
if (numTerms > 0) {
final FST<FSTTermOutputs.TermData> fst = builder.finish();
final FST<FSTTermOutputs.TermData> fst = fstCompiler.compile();
fields.add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, fst));
}
}

View File

@ -52,7 +52,7 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;
@ -539,11 +539,11 @@ class SimpleTextFieldsReader extends FieldsProducer {
private void loadTerms() throws IOException {
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton();
final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
final FSTCompiler<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstCompiler;
final PairOutputs<Long,Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<>(posIntOutputs,
outputsInner);
b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
IndexInput in = SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart);
final BytesRefBuilder lastTerm = new BytesRefBuilder();
@ -556,7 +556,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
SimpleTextUtil.readLine(in, scratch);
if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) {
if (lastDocsStart != -1) {
b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
outputs.newPair(lastDocsStart,
outputsInner.newPair((long) docFreq, totalTermFreq)));
sumTotalTermFreq += totalTermFreq;
@ -574,7 +574,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length()) - 1;
} else if (StringHelper.startsWith(scratch.get(), TERM)) {
if (lastDocsStart != -1) {
b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
outputsInner.newPair((long) docFreq, totalTermFreq)));
}
lastDocsStart = in.getFilePointer();
@ -589,7 +589,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
}
}
docCount = visitedDocs.cardinality();
fst = b.finish();
fst = fstCompiler.compile();
/*
PrintStream ps = new PrintStream("out.dot");
fst.toDot(ps);

View File

@ -30,6 +30,7 @@ import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
@ -202,19 +203,19 @@ public class FSTDictionary implements IndexDictionary {
*/
public static class Builder implements IndexDictionary.Builder {
protected final org.apache.lucene.util.fst.Builder<Long> fstBuilder;
protected final FSTCompiler<Long> fstCompiler;
protected final IntsRefBuilder scratchInts;
public Builder() {
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
fstBuilder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
scratchInts = new IntsRefBuilder();
}
@Override
public void add(BytesRef blockKey, long blockFilePointer) {
try {
fstBuilder.add(Util.toIntsRef(blockKey, scratchInts), blockFilePointer);
fstCompiler.add(Util.toIntsRef(blockKey, scratchInts), blockFilePointer);
} catch (IOException e) {
// Should never happen.
throw new RuntimeException(e);
@ -224,7 +225,7 @@ public class FSTDictionary implements IndexDictionary {
@Override
public FSTDictionary build() {
try {
return new FSTDictionary(fstBuilder.finish());
return new FSTDictionary(fstCompiler.compile());
} catch (IOException e) {
// Should never happen.
throw new RuntimeException(e);

View File

@ -44,7 +44,7 @@ import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
@ -454,29 +454,27 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
}
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
0, 0, true, false, Integer.MAX_VALUE,
outputs, true, 15);
final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).shouldShareNonSingletonNodes(false).build();
//if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
//}
//indexBuilder.DEBUG = false;
final byte[] bytes = scratchBytes.toArrayCopy();
assert bytes.length > 0;
indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
scratchBytes.reset();
// Copy over index for all sub-blocks
for(PendingBlock block : blocks) {
if (block.subIndices != null) {
for(FST<BytesRef> subIndex : block.subIndices) {
append(indexBuilder, subIndex, scratchIntsRef);
append(fstCompiler, subIndex, scratchIntsRef);
}
block.subIndices = null;
}
}
index = indexBuilder.finish();
index = fstCompiler.compile();
assert subIndices == null;
@ -491,14 +489,14 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// TODO: maybe we could add bulk-add method to
// FSTCompiler? Takes FST and unions it w/ current
// FST.
private void append(Builder<BytesRef> builder, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
private void append(FSTCompiler<BytesRef> fstCompiler, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
final BytesRefFSTEnum<BytesRef> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
BytesRefFSTEnum.InputOutput<BytesRef> indexEnt;
while((indexEnt = subIndexEnum.next()) != null) {
//if (DEBUG) {
// System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
//}
builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
}
}
}

View File

@ -605,7 +605,7 @@ public final class FST<T> implements Accountable {
// serializes new node by appending its bytes to the end
// of the current byte[]
long addNode(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn) throws IOException {
long addNode(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
T NO_OUTPUT = outputs.getNoOutput();
//System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs);
@ -616,28 +616,28 @@ public final class FST<T> implements Accountable {
return NON_FINAL_END_NODE;
}
}
final long startAddress = builder.bytes.getPosition();
final long startAddress = fstCompiler.bytes.getPosition();
//System.out.println(" startAddr=" + startAddress);
final boolean doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(builder, nodeIn);
final boolean doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(fstCompiler, nodeIn);
if (doFixedLengthArcs) {
//System.out.println(" fixed length arcs");
if (builder.numBytesPerArc.length < nodeIn.numArcs) {
builder.numBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, Integer.BYTES)];
builder.numLabelBytesPerArc = new int[builder.numBytesPerArc.length];
if (fstCompiler.numBytesPerArc.length < nodeIn.numArcs) {
fstCompiler.numBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, Integer.BYTES)];
fstCompiler.numLabelBytesPerArc = new int[fstCompiler.numBytesPerArc.length];
}
}
builder.arcCount += nodeIn.numArcs;
fstCompiler.arcCount += nodeIn.numArcs;
final int lastArc = nodeIn.numArcs-1;
long lastArcStart = builder.bytes.getPosition();
long lastArcStart = fstCompiler.bytes.getPosition();
int maxBytesPerArc = 0;
int maxBytesPerArcWithoutLabel = 0;
for(int arcIdx=0; arcIdx < nodeIn.numArcs; arcIdx++) {
final Builder.Arc<T> arc = nodeIn.arcs[arcIdx];
final Builder.CompiledNode target = (Builder.CompiledNode) arc.target;
final FSTCompiler.Arc<T> arc = nodeIn.arcs[arcIdx];
final FSTCompiler.CompiledNode target = (FSTCompiler.CompiledNode) arc.target;
int flags = 0;
//System.out.println(" arc " + arcIdx + " label=" + arc.label + " -> target=" + target.node);
@ -645,7 +645,7 @@ public final class FST<T> implements Accountable {
flags += BIT_LAST_ARC;
}
if (builder.lastFrozenNode == target.node && !doFixedLengthArcs) {
if (fstCompiler.lastFrozenNode == target.node && !doFixedLengthArcs) {
// TODO: for better perf (but more RAM used) we
// could avoid this except when arc is "near" the
// last arc:
@ -671,36 +671,36 @@ public final class FST<T> implements Accountable {
flags += BIT_ARC_HAS_OUTPUT;
}
builder.bytes.writeByte((byte) flags);
long labelStart = builder.bytes.getPosition();
writeLabel(builder.bytes, arc.label);
int numLabelBytes = (int) (builder.bytes.getPosition() - labelStart);
fstCompiler.bytes.writeByte((byte) flags);
long labelStart = fstCompiler.bytes.getPosition();
writeLabel(fstCompiler.bytes, arc.label);
int numLabelBytes = (int) (fstCompiler.bytes.getPosition() - labelStart);
// System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + bytes.getPosition() + " output=" + outputs.outputToString(arc.output));
if (arc.output != NO_OUTPUT) {
outputs.write(arc.output, builder.bytes);
outputs.write(arc.output, fstCompiler.bytes);
//System.out.println(" write output");
}
if (arc.nextFinalOutput != NO_OUTPUT) {
//System.out.println(" write final output");
outputs.writeFinalOutput(arc.nextFinalOutput, builder.bytes);
outputs.writeFinalOutput(arc.nextFinalOutput, fstCompiler.bytes);
}
if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) {
assert target.node > 0;
//System.out.println(" write target");
builder.bytes.writeVLong(target.node);
fstCompiler.bytes.writeVLong(target.node);
}
// just write the arcs "like normal" on first pass, but record how many bytes each one took
// and max byte size:
if (doFixedLengthArcs) {
int numArcBytes = (int) (builder.bytes.getPosition() - lastArcStart);
builder.numBytesPerArc[arcIdx] = numArcBytes;
builder.numLabelBytesPerArc[arcIdx] = numLabelBytes;
lastArcStart = builder.bytes.getPosition();
int numArcBytes = (int) (fstCompiler.bytes.getPosition() - lastArcStart);
fstCompiler.numBytesPerArc[arcIdx] = numArcBytes;
fstCompiler.numLabelBytesPerArc[arcIdx] = numLabelBytes;
lastArcStart = fstCompiler.bytes.getPosition();
maxBytesPerArc = Math.max(maxBytesPerArc, numArcBytes);
maxBytesPerArcWithoutLabel = Math.max(maxBytesPerArcWithoutLabel, numArcBytes - numLabelBytes);
//System.out.println(" arcBytes=" + numArcBytes + " labelBytes=" + numLabelBytes);
@ -733,18 +733,18 @@ public final class FST<T> implements Accountable {
int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
assert labelRange > 0;
if (shouldExpandNodeWithDirectAddressing(builder, nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) {
writeNodeForDirectAddressing(builder, nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange);
builder.directAddressingNodeCount++;
if (shouldExpandNodeWithDirectAddressing(fstCompiler, nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) {
writeNodeForDirectAddressing(fstCompiler, nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange);
fstCompiler.directAddressingNodeCount++;
} else {
writeNodeForBinarySearch(builder, nodeIn, startAddress, maxBytesPerArc);
builder.binarySearchNodeCount++;
writeNodeForBinarySearch(fstCompiler, nodeIn, startAddress, maxBytesPerArc);
fstCompiler.binarySearchNodeCount++;
}
}
final long thisNodeAddress = builder.bytes.getPosition()-1;
builder.bytes.reverse(startAddress, thisNodeAddress);
builder.nodeCount++;
final long thisNodeAddress = fstCompiler.bytes.getPosition()-1;
fstCompiler.bytes.reverse(startAddress, thisNodeAddress);
fstCompiler.nodeCount++;
return thisNodeAddress;
}
@ -757,8 +757,8 @@ public final class FST<T> implements Accountable {
* of bytes, but they allow either binary search or direct addressing on the arcs (instead of linear
* scan) on lookup by arc label.
*/
private boolean shouldExpandNodeWithFixedLengthArcs(Builder<T> builder, Builder.UnCompiledNode<T> node) {
return builder.allowFixedLengthArcs &&
private boolean shouldExpandNodeWithFixedLengthArcs(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> node) {
return fstCompiler.allowFixedLengthArcs &&
((node.depth <= FIXED_LENGTH_ARC_SHALLOW_DEPTH && node.numArcs >= FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS) ||
node.numArcs >= FIXED_LENGTH_ARC_DEEP_NUM_ARCS);
}
@ -769,18 +769,18 @@ public final class FST<T> implements Accountable {
* Prefer direct addressing for performance if it does not oversize binary search byte size too much,
* so that the arcs can be directly addressed by label.
*
* @see Builder#getDirectAddressingMaxOversizingFactor()
* @see FSTCompiler#getDirectAddressingMaxOversizingFactor()
*/
private boolean shouldExpandNodeWithDirectAddressing(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn,
private boolean shouldExpandNodeWithDirectAddressing(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn,
int numBytesPerArc, int maxBytesPerArcWithoutLabel, int labelRange) {
// Anticipate precisely the size of the encodings.
int sizeForBinarySearch = numBytesPerArc * nodeIn.numArcs;
int sizeForDirectAddressing = getNumPresenceBytes(labelRange) + builder.numLabelBytesPerArc[0]
int sizeForDirectAddressing = getNumPresenceBytes(labelRange) + fstCompiler.numLabelBytesPerArc[0]
+ maxBytesPerArcWithoutLabel * nodeIn.numArcs;
// Determine the allowed oversize compared to binary search.
// This is defined by a parameter of FSTCompiler.Builder (default 1: no oversize).
int allowedOversize = (int) (sizeForBinarySearch * builder.getDirectAddressingMaxOversizingFactor());
int allowedOversize = (int) (sizeForBinarySearch * fstCompiler.getDirectAddressingMaxOversizingFactor());
int expansionCost = sizeForDirectAddressing - allowedOversize;
// Select direct addressing if either:
@ -790,46 +790,46 @@ public final class FST<T> implements Accountable {
// In this case, decrement the credit by the oversize.
// In addition, do not try to oversize to a clearly too large node size
// (this is the DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR parameter).
if (expansionCost <= 0 || (builder.directAddressingExpansionCredit >= expansionCost
if (expansionCost <= 0 || (fstCompiler.directAddressingExpansionCredit >= expansionCost
&& sizeForDirectAddressing <= allowedOversize * DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR)) {
builder.directAddressingExpansionCredit -= expansionCost;
fstCompiler.directAddressingExpansionCredit -= expansionCost;
return true;
}
return false;
}
private void writeNodeForBinarySearch(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArc) {
private void writeNodeForBinarySearch(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArc) {
// Build the header in a buffer.
// It is a false/special arc which is in fact a node header with node flags followed by node metadata.
builder.fixedLengthArcsBuffer
fstCompiler.fixedLengthArcsBuffer
.resetPosition()
.writeByte(ARCS_FOR_BINARY_SEARCH)
.writeVInt(nodeIn.numArcs)
.writeVInt(maxBytesPerArc);
int headerLen = builder.fixedLengthArcsBuffer.getPosition();
int headerLen = fstCompiler.fixedLengthArcsBuffer.getPosition();
// Expand the arcs in place, backwards.
long srcPos = builder.bytes.getPosition();
long srcPos = fstCompiler.bytes.getPosition();
long destPos = startAddress + headerLen + nodeIn.numArcs * maxBytesPerArc;
assert destPos >= srcPos;
if (destPos > srcPos) {
builder.bytes.skipBytes((int) (destPos - srcPos));
fstCompiler.bytes.skipBytes((int) (destPos - srcPos));
for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) {
destPos -= maxBytesPerArc;
int arcLen = builder.numBytesPerArc[arcIdx];
int arcLen = fstCompiler.numBytesPerArc[arcIdx];
srcPos -= arcLen;
if (srcPos != destPos) {
assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " arcLen=" + arcLen + " nodeIn.numArcs=" + nodeIn.numArcs;
builder.bytes.copyBytes(srcPos, destPos, arcLen);
fstCompiler.bytes.copyBytes(srcPos, destPos, arcLen);
}
}
}
// Write the header.
builder.bytes.writeBytes(startAddress, builder.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
fstCompiler.bytes.writeBytes(startAddress, fstCompiler.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
}
private void writeNodeForDirectAddressing(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArcWithoutLabel, int labelRange) {
private void writeNodeForDirectAddressing(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArcWithoutLabel, int labelRange) {
// Expand the arcs backwards in a buffer because we remove the labels.
// So the obtained arcs might occupy less space. This is the reason why this
// whole method is more complex.
@ -837,64 +837,64 @@ public final class FST<T> implements Accountable {
// the presence bits, and the first label. Keep the first label.
int headerMaxLen = 11;
int numPresenceBytes = getNumPresenceBytes(labelRange);
long srcPos = builder.bytes.getPosition();
int totalArcBytes = builder.numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel;
long srcPos = fstCompiler.bytes.getPosition();
int totalArcBytes = fstCompiler.numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel;
int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes;
byte[] buffer = builder.fixedLengthArcsBuffer.ensureCapacity(bufferOffset).getBytes();
byte[] buffer = fstCompiler.fixedLengthArcsBuffer.ensureCapacity(bufferOffset).getBytes();
// Copy the arcs to the buffer, dropping all labels except first one.
for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) {
bufferOffset -= maxBytesPerArcWithoutLabel;
int srcArcLen = builder.numBytesPerArc[arcIdx];
int srcArcLen = fstCompiler.numBytesPerArc[arcIdx];
srcPos -= srcArcLen;
int labelLen = builder.numLabelBytesPerArc[arcIdx];
int labelLen = fstCompiler.numLabelBytesPerArc[arcIdx];
// Copy the flags.
builder.bytes.copyBytes(srcPos, buffer, bufferOffset, 1);
fstCompiler.bytes.copyBytes(srcPos, buffer, bufferOffset, 1);
// Skip the label, copy the remaining.
int remainingArcLen = srcArcLen - 1 - labelLen;
if (remainingArcLen != 0) {
builder.bytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen);
fstCompiler.bytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen);
}
if (arcIdx == 0) {
// Copy the label of the first arc only.
bufferOffset -= labelLen;
builder.bytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen);
fstCompiler.bytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen);
}
}
assert bufferOffset == headerMaxLen + numPresenceBytes;
// Build the header in the buffer.
// It is a false/special arc which is in fact a node header with node flags followed by node metadata.
builder.fixedLengthArcsBuffer
fstCompiler.fixedLengthArcsBuffer
.resetPosition()
.writeByte(ARCS_FOR_DIRECT_ADDRESSING)
.writeVInt(labelRange) // labelRange instead of numArcs.
.writeVInt(maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc.
int headerLen = builder.fixedLengthArcsBuffer.getPosition();
int headerLen = fstCompiler.fixedLengthArcsBuffer.getPosition();
// Prepare the compiler byte store. Enlarge or truncate if needed.
long nodeEnd = startAddress + headerLen + numPresenceBytes + totalArcBytes;
long currentPosition = builder.bytes.getPosition();
long currentPosition = fstCompiler.bytes.getPosition();
if (nodeEnd >= currentPosition) {
builder.bytes.skipBytes((int) (nodeEnd - currentPosition));
fstCompiler.bytes.skipBytes((int) (nodeEnd - currentPosition));
} else {
builder.bytes.truncate(nodeEnd);
fstCompiler.bytes.truncate(nodeEnd);
}
assert builder.bytes.getPosition() == nodeEnd;
assert fstCompiler.bytes.getPosition() == nodeEnd;
// Write the header.
long writeOffset = startAddress;
builder.bytes.writeBytes(writeOffset, builder.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
fstCompiler.bytes.writeBytes(writeOffset, fstCompiler.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
writeOffset += headerLen;
// Write the presence bits
writePresenceBits(builder, nodeIn, writeOffset, numPresenceBytes);
writePresenceBits(fstCompiler, nodeIn, writeOffset, numPresenceBytes);
writeOffset += numPresenceBytes;
// Write the first label and the arcs.
builder.bytes.writeBytes(writeOffset, builder.fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes);
fstCompiler.bytes.writeBytes(writeOffset, fstCompiler.fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes);
}
private void writePresenceBits(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long dest, int numPresenceBytes) {
private void writePresenceBits(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn, long dest, int numPresenceBytes) {
long bytePos = dest;
byte presenceBits = 1; // The first arc is always present.
int presenceIndex = 0;
@ -904,7 +904,7 @@ public final class FST<T> implements Accountable {
assert label > previousLabel;
presenceIndex += label - previousLabel;
while (presenceIndex >= Byte.SIZE) {
builder.bytes.writeByte(bytePos++, presenceBits);
fstCompiler.bytes.writeByte(bytePos++, presenceBits);
presenceBits = 0;
presenceIndex -= Byte.SIZE;
}
@ -915,7 +915,7 @@ public final class FST<T> implements Accountable {
assert presenceIndex == (nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label) % 8;
assert presenceBits != 0; // The last byte is not 0.
assert (presenceBits & (1 << presenceIndex)) != 0; // The last arc is always present.
builder.bytes.writeByte(bytePos++, presenceBits);
fstCompiler.bytes.writeByte(bytePos++, presenceBits);
assert bytePos - dest == numPresenceBytes;
}

View File

@ -49,31 +49,9 @@ import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
* @lucene.experimental
*/
public class Builder<T> {
public class FSTCompiler<T> {
/**
* Default oversizing factor used to decide whether to encode a node with direct addressing or binary search.
* Default is 1: ensure no oversizing on average.
* <p>
* This factor does not determine whether to encode a node with a list of variable length arcs or with
* fixed length arcs. It only determines the effective encoding of a node that is already known to be
* encoded with fixed length arcs.
* See {@code FST.shouldExpandNodeWithFixedLengthArcs()}
* and {@code FST.shouldExpandNodeWithDirectAddressing()}.
* <p>
* For English words we measured 217K nodes, only 3.27% nodes are encoded with fixed length arcs,
* and 99.99% of them with direct addressing. Overall FST memory reduced by 1.67%.
* <p>
* For worst case we measured 168K nodes, 50% of them are encoded with fixed length arcs,
* and 14% of them with direct encoding. Overall FST memory reduced by 0.8%.
* <p>
* Use {@code TestFstDirectAddressing.main()}
* and {@code TestFstDirectAddressing.testWorstCaseForDirectAddressing()}
* to evaluate a change.
*
* @see #setDirectAddressingMaxOversizingFactor
*/
static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1.0f;
static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1f;
private final NodeHash<T> dedupHash;
final FST<T> fst;
@ -117,75 +95,29 @@ public class Builder<T> {
long binarySearchNodeCount;
long directAddressingNodeCount;
boolean allowFixedLengthArcs;
float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR;
final boolean allowFixedLengthArcs;
final float directAddressingMaxOversizingFactor;
long directAddressingExpansionCredit;
BytesStore bytes;
final BytesStore bytes;
/**
* Instantiates an FST/FSA builder without any pruning. A shortcut to {@link
* #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs, boolean, int)} with
* pruning options turned off.
* Instantiates an FST/FSA builder with default settings and pruning options turned off.
* For more tuning and tweaking, see {@link Builder}.
*/
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
public FSTCompiler(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, 1f);
}
/**
* Instantiates an FST/FSA builder with all the possible tuning and construction
* tweaks. Read parameter documentation carefully.
*
* @param inputType
* The input type (transition labels). Can be anything from {@link INPUT_TYPE}
* enumeration. Shorter types will consume less memory. Strings (character sequences) are
* represented as {@link INPUT_TYPE#BYTE4} (full unicode codepoints).
*
* @param minSuffixCount1
* If pruning the input graph during construction, this threshold is used for telling
* if a node is kept or pruned. If transition_count(node) &gt;= minSuffixCount1, the node
* is kept.
*
* @param minSuffixCount2
* (Note: only Mike McCandless knows what this one is really doing...)
*
* @param doShareSuffix
* If <code>true</code>, the shared suffixes will be compacted into unique paths.
* This requires an additional RAM-intensive hash map for lookups in memory. Setting this parameter to
* <code>false</code> creates a single suffix path for all input sequences. This will result in a larger
* FST, but requires substantially less memory and CPU during building.
*
* @param doShareNonSingletonNodes
* Only used if doShareSuffix is true. Set this to
* true to ensure FST is fully minimal, at cost of more
* CPU and more RAM during building.
*
* @param shareMaxTailLength
* Only used if doShareSuffix is true. Set this to
* Integer.MAX_VALUE to ensure FST is fully minimal, at cost of more
* CPU and more RAM during building.
*
* @param outputs The output type for each input sequence. Applies only if building an FST. For
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
* singleton output object.
*
* @param allowFixedLengthArcs Pass false to disable the fixed length arc optimization (binary search or
* direct addressing) while building the FST; this will make the resulting FST smaller but slower to
* traverse.
*
* @param bytesPageBits How many bits wide to make each
* byte[] block in the BytesStore; if you know the FST
* will be large then make this larger. For example 15
* bits = 32768 byte pages.
*/
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
boolean allowFixedLengthArcs, int bytesPageBits) {
private FSTCompiler(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
boolean allowFixedLengthArcs, int bytesPageBits, float directAddressingMaxOversizingFactor) {
this.minSuffixCount1 = minSuffixCount1;
this.minSuffixCount2 = minSuffixCount2;
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
this.shareMaxTailLength = shareMaxTailLength;
this.allowFixedLengthArcs = allowFixedLengthArcs;
this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor;
fst = new FST<>(inputType, outputs, bytesPageBits);
bytes = fst.bytes;
assert bytes != null;
@ -205,22 +137,145 @@ public class Builder<T> {
}
/**
* Overrides the default the maximum oversizing of fixed array allowed to enable direct addressing
* of arcs instead of binary search.
* Fluent-style constructor for FST {@link FSTCompiler}.
* <p>
* Setting this factor to a negative value (e.g. -1) effectively disables direct addressing,
* only binary search nodes will be created.
*
* @see #DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR
* Creates an FST/FSA builder with all the possible tuning and construction tweaks.
* Read parameter documentation carefully.
*/
public Builder<T> setDirectAddressingMaxOversizingFactor(float factor) {
directAddressingMaxOversizingFactor = factor;
return this;
public static class Builder<T> {

  // Mandatory settings, fixed at construction time.
  private final FST.INPUT_TYPE inputType;
  private final Outputs<T> outputs;

  // Optional settings; each one starts at its documented default.
  private int minSuffixCount1;
  private int minSuffixCount2;
  private boolean shouldShareSuffix = true;
  private boolean shouldShareNonSingletonNodes = true;
  private int shareMaxTailLength = Integer.MAX_VALUE;
  private boolean allowFixedLengthArcs = true;
  private int bytesPageBits = 15;
  private float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR;

  /**
   * @param inputType The input type (transition labels). Can be anything from {@link INPUT_TYPE}
   *                  enumeration. Shorter types will consume less memory. Strings (character sequences) are
   *                  represented as {@link INPUT_TYPE#BYTE4} (full unicode codepoints).
   * @param outputs   The output type for each input sequence. Applies only if building an FST. For
   *                  FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
   *                  singleton output object.
   */
  public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
    this.inputType = inputType;
    this.outputs = outputs;
  }

  /**
   * If pruning the input graph during construction, this threshold is used for telling if a node is kept
   * or pruned. If transition_count(node) &gt;= minSuffixCount1, the node is kept.
   * <p>
   * Default = 0.
   *
   * @return this builder, to allow chaining.
   */
  public Builder<T> minSuffixCount1(int minSuffixCount1) {
    this.minSuffixCount1 = minSuffixCount1;
    return this;
  }

  /**
   * Better pruning: a node (and all following nodes) is pruned if fewer than this number of terms
   * go through the prior node.
   * <p>
   * Default = 0.
   *
   * @return this builder, to allow chaining.
   */
  public Builder<T> minSuffixCount2(int minSuffixCount2) {
    this.minSuffixCount2 = minSuffixCount2;
    return this;
  }

  /**
   * If {@code true}, the shared suffixes will be compacted into unique paths.
   * This requires an additional RAM-intensive hash map for lookups in memory. Setting this parameter to
   * {@code false} creates a single suffix path for all input sequences. This will result in a larger
   * FST, but requires substantially less memory and CPU during building.
   * <p>
   * Default = {@code true}.
   *
   * @return this builder, to allow chaining.
   */
  public Builder<T> shouldShareSuffix(boolean shouldShareSuffix) {
    this.shouldShareSuffix = shouldShareSuffix;
    return this;
  }

  /**
   * Only used if {@code shouldShareSuffix} is true. Set this to true to ensure FST is fully minimal,
   * at cost of more CPU and more RAM during building.
   * <p>
   * Default = {@code true}.
   *
   * @return this builder, to allow chaining.
   */
  public Builder<T> shouldShareNonSingletonNodes(boolean shouldShareNonSingletonNodes) {
    this.shouldShareNonSingletonNodes = shouldShareNonSingletonNodes;
    return this;
  }

  /**
   * Only used if {@code shouldShareSuffix} is true. Set this to Integer.MAX_VALUE to ensure FST is
   * fully minimal, at cost of more CPU and more RAM during building.
   * <p>
   * Default = {@link Integer#MAX_VALUE}.
   *
   * @return this builder, to allow chaining.
   */
  public Builder<T> shareMaxTailLength(int shareMaxTailLength) {
    this.shareMaxTailLength = shareMaxTailLength;
    return this;
  }

  /**
   * Pass {@code false} to disable the fixed length arc optimization (binary search or direct addressing)
   * while building the FST; this will make the resulting FST smaller but slower to traverse.
   * <p>
   * Default = {@code true}.
   *
   * @return this builder, to allow chaining.
   */
  public Builder<T> allowFixedLengthArcs(boolean allowFixedLengthArcs) {
    this.allowFixedLengthArcs = allowFixedLengthArcs;
    return this;
  }

  /**
   * How many bits wide to make each byte[] block in the BytesStore; if you know the FST
   * will be large then make this larger. For example 15 bits = 32768 byte pages.
   * <p>
   * Default = 15.
   *
   * @return this builder, to allow chaining.
   */
  public Builder<T> bytesPageBits(int bytesPageBits) {
    this.bytesPageBits = bytesPageBits;
    return this;
  }

  /**
   * Overrides the default maximum oversizing of fixed array allowed to enable direct addressing
   * of arcs instead of binary search.
   * <p>
   * Setting this factor to a negative value (e.g. -1) effectively disables direct addressing,
   * only binary search nodes will be created.
   * <p>
   * This factor does not determine whether to encode a node with a list of variable length arcs or with
   * fixed length arcs. It only determines the effective encoding of a node that is already known to be
   * encoded with fixed length arcs.
   * <p>
   * Default = 1.
   *
   * @return this builder, to allow chaining.
   */
  public Builder<T> directAddressingMaxOversizingFactor(float factor) {
    this.directAddressingMaxOversizingFactor = factor;
    return this;
  }

  /**
   * Creates a new {@link FSTCompiler} configured with the current settings of this builder.
   *
   * @return a new compiler instance; each call creates a separate one.
   */
  public FSTCompiler<T> build() {
    return new FSTCompiler<>(inputType, minSuffixCount1, minSuffixCount2, shouldShareSuffix,
        shouldShareNonSingletonNodes, shareMaxTailLength, outputs, allowFixedLengthArcs, bytesPageBits,
        directAddressingMaxOversizingFactor);
  }
}
/**
* Returns the oversizing factor this compiler was configured with for choosing direct addressing
* over binary search when encoding a node with fixed length arcs.
*
* @see Builder#directAddressingMaxOversizingFactor(float)
*/
public float getDirectAddressingMaxOversizingFactor() {
return directAddressingMaxOversizingFactor;
}
@ -514,7 +569,7 @@ public class Builder<T> {
/** Returns final FST. NOTE: this will return null if
* nothing is accepted by the FST. */
public FST<T> finish() throws IOException {
public FST<T> compile() throws IOException {
final UnCompiledNode<T> root = frontier[0];
@ -554,19 +609,19 @@ public class Builder<T> {
}
/** Expert: holds a pending (seen but not yet serialized) arc. */
public static class Arc<T> {
public int label; // really an "unsigned" byte
public Node target;
public boolean isFinal;
public T output;
public T nextFinalOutput;
static class Arc<T> {
int label; // really an "unsigned" byte
Node target;
boolean isFinal;
T output;
T nextFinalOutput;
}
// NOTE: not many instances of Node or CompiledNode are in
// memory while the FST is being built; it's only the
// current "frontier":
static interface Node {
interface Node {
boolean isCompiled();
}
@ -583,20 +638,20 @@ public class Builder<T> {
}
/** Expert: holds a pending (seen but not yet serialized) Node. */
public static final class UnCompiledNode<T> implements Node {
final Builder<T> owner;
public int numArcs;
public Arc<T>[] arcs;
static final class UnCompiledNode<T> implements Node {
final FSTCompiler<T> owner;
int numArcs;
Arc<T>[] arcs;
// TODO: instead of recording isFinal/output on the
// node, maybe we should use -1 arc to mean "end" (like
// we do when reading the FST). Would simplify much
// code here...
public T output;
public boolean isFinal;
public long inputCount;
T output;
boolean isFinal;
long inputCount;
/** This node's depth, starting from the automaton root. */
public final int depth;
final int depth;
/**
* @param depth
@ -605,7 +660,7 @@ public class Builder<T> {
* fanout size).
*/
@SuppressWarnings({"rawtypes","unchecked"})
public UnCompiledNode(Builder<T> owner, int depth) {
UnCompiledNode(FSTCompiler<T> owner, int depth) {
this.owner = owner;
arcs = (Arc<T>[]) new Arc[1];
arcs[0] = new Arc<>();
@ -618,7 +673,7 @@ public class Builder<T> {
return false;
}
public void clear() {
void clear() {
numArcs = 0;
isFinal = false;
output = owner.NO_OUTPUT;
@ -628,13 +683,13 @@ public class Builder<T> {
// for nodes on the frontier (even when reused).
}
public T getLastOutput(int labelToMatch) {
T getLastOutput(int labelToMatch) {
assert numArcs > 0;
assert arcs[numArcs-1].label == labelToMatch;
return arcs[numArcs-1].output;
}
public void addArc(int label, Node target) {
void addArc(int label, Node target) {
assert label >= 0;
assert numArcs == 0 || label > arcs[numArcs-1].label: "arc[numArcs-1].label=" + arcs[numArcs-1].label + " new label=" + label + " numArcs=" + numArcs;
if (numArcs == arcs.length) {
@ -651,7 +706,7 @@ public class Builder<T> {
arc.isFinal = false;
}
public void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) {
void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) {
assert numArcs > 0;
final Arc<T> arc = arcs[numArcs-1];
assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch;
@ -661,14 +716,14 @@ public class Builder<T> {
arc.isFinal = isFinal;
}
public void deleteLast(int label, Node target) {
void deleteLast(int label, Node target) {
assert numArcs > 0;
assert label == arcs[numArcs-1].label;
assert target == arcs[numArcs-1].target;
numArcs--;
}
public void setLastOutput(int labelToMatch, T newOutput) {
void setLastOutput(int labelToMatch, T newOutput) {
assert owner.validOutput(newOutput);
assert numArcs > 0;
final Arc<T> arc = arcs[numArcs-1];
@ -677,7 +732,7 @@ public class Builder<T> {
}
// pushes an output prefix forward onto all arcs
public void prependOutput(T outputPrefix) {
void prependOutput(T outputPrefix) {
assert owner.validOutput(outputPrefix);
for(int arcIdx=0;arcIdx<numArcs;arcIdx++) {

View File

@ -39,7 +39,7 @@ final class NodeHash<T> {
this.in = in;
}
private boolean nodesEqual(Builder.UnCompiledNode<T> node, long address) throws IOException {
private boolean nodesEqual(FSTCompiler.UnCompiledNode<T> node, long address) throws IOException {
fst.readFirstRealTargetArc(address, scratchArc, in);
// Fail fast for a node with fixed length arcs.
@ -58,10 +58,10 @@ final class NodeHash<T> {
}
for(int arcUpto=0; arcUpto < node.numArcs; arcUpto++) {
final Builder.Arc<T> arc = node.arcs[arcUpto];
final FSTCompiler.Arc<T> arc = node.arcs[arcUpto];
if (arc.label != scratchArc.label() ||
!arc.output.equals(scratchArc.output()) ||
((Builder.CompiledNode) arc.target).node != scratchArc.target() ||
((FSTCompiler.CompiledNode) arc.target).node != scratchArc.target() ||
!arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) ||
arc.isFinal != scratchArc.isFinal()) {
return false;
@ -82,16 +82,16 @@ final class NodeHash<T> {
// hash code for an unfrozen node. This must be identical
// to the frozen case (below)!!
private long hash(Builder.UnCompiledNode<T> node) {
private long hash(FSTCompiler.UnCompiledNode<T> node) {
final int PRIME = 31;
//System.out.println("hash unfrozen");
long h = 0;
// TODO: maybe if number of arcs is high we can safely subsample?
for (int arcIdx=0; arcIdx < node.numArcs; arcIdx++) {
final Builder.Arc<T> arc = node.arcs[arcIdx];
final FSTCompiler.Arc<T> arc = node.arcs[arcIdx];
//System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal);
h = PRIME * h + arc.label;
long n = ((Builder.CompiledNode) arc.target).node;
long n = ((FSTCompiler.CompiledNode) arc.target).node;
h = PRIME * h + (int) (n^(n>>32));
h = PRIME * h + arc.output.hashCode();
h = PRIME * h + arc.nextFinalOutput.hashCode();
@ -127,7 +127,7 @@ final class NodeHash<T> {
return h & Long.MAX_VALUE;
}
public long add(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn) throws IOException {
public long add(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
//System.out.println("hash: add count=" + count + " vs " + table.size() + " mask=" + mask);
final long h = hash(nodeIn);
long pos = h & mask;
@ -136,7 +136,7 @@ final class NodeHash<T> {
final long v = table.get(pos);
if (v == 0) {
// freeze & add
final long node = fst.addNode(builder, nodeIn);
final long node = fst.addNode(fstCompiler, nodeIn);
//System.out.println(" now freeze node=" + node);
assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h;
count++;

View File

@ -54,8 +54,7 @@ public class Test2BFST extends LuceneTestCase {
System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
Outputs<Object> outputs = NoOutputs.getSingleton();
Object NO_OUTPUT = outputs.getNoOutput();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
true, 15);
final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
int count = 0;
Random r = new Random(seed);
@ -66,21 +65,21 @@ public class Test2BFST extends LuceneTestCase {
for(int i=10;i<ints2.length;i++) {
ints2[i] = r.nextInt(256);
}
b.add(input2, NO_OUTPUT);
fstCompiler.add(input2, NO_OUTPUT);
count++;
if (count % 100000 == 0) {
System.out.println(count + ": " + b.fstRamBytesUsed() + " bytes; " + b.getNodeCount() + " nodes");
System.out.println(count + ": " + fstCompiler.fstRamBytesUsed() + " bytes; " + fstCompiler.getNodeCount() + " nodes");
}
if (b.getNodeCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) {
if (fstCompiler.getNodeCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) {
break;
}
nextInput(r, ints2);
}
FST<Object> fst = b.finish();
FST<Object> fst = fstCompiler.compile();
for(int verify=0;verify<2;verify++) {
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + fstCompiler.getNodeCount() + "; arcCount=" + fstCompiler.getArcCount() + "]");
Arrays.fill(ints2, 0);
r = new Random(seed);
@ -136,8 +135,7 @@ public class Test2BFST extends LuceneTestCase {
{
System.out.println("\nTEST: 3 GB size; outputs=bytes");
Outputs<BytesRef> outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
true, 15);
final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
byte[] outputBytes = new byte[20];
BytesRef output = new BytesRef(outputBytes);
@ -147,10 +145,10 @@ public class Test2BFST extends LuceneTestCase {
while(true) {
r.nextBytes(outputBytes);
//System.out.println("add: " + input + " -> " + output);
b.add(input, BytesRef.deepCopyOf(output));
fstCompiler.add(input, BytesRef.deepCopyOf(output));
count++;
if (count % 10000 == 0) {
long size = b.fstRamBytesUsed();
long size = fstCompiler.fstRamBytesUsed();
if (count % 1000000 == 0) {
System.out.println(count + "...: " + size + " bytes");
}
@ -161,10 +159,10 @@ public class Test2BFST extends LuceneTestCase {
nextInput(r, ints);
}
FST<BytesRef> fst = b.finish();
FST<BytesRef> fst = fstCompiler.compile();
for(int verify=0;verify<2;verify++) {
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + fstCompiler.getNodeCount() + "; arcCount=" + fstCompiler.getArcCount() + "]");
r = new Random(seed);
Arrays.fill(ints, 0);
@ -216,8 +214,7 @@ public class Test2BFST extends LuceneTestCase {
{
System.out.println("\nTEST: 3 GB size; outputs=long");
Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
true, 15);
final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
long output = 1;
@ -226,11 +223,11 @@ public class Test2BFST extends LuceneTestCase {
Random r = new Random(seed);
while(true) {
//System.out.println("add: " + input + " -> " + output);
b.add(input, output);
fstCompiler.add(input, output);
output += 1+r.nextInt(10);
count++;
if (count % 10000 == 0) {
long size = b.fstRamBytesUsed();
long size = fstCompiler.fstRamBytesUsed();
if (count % 1000000 == 0) {
System.out.println(count + "...: " + size + " bytes");
}
@ -241,11 +238,11 @@ public class Test2BFST extends LuceneTestCase {
nextInput(r, ints);
}
FST<Long> fst = b.finish();
FST<Long> fst = fstCompiler.compile();
for(int verify=0;verify<2;verify++) {
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + fstCompiler.getNodeCount() + "; arcCount=" + fstCompiler.getArcCount() + "]");
Arrays.fill(ints, 0);

View File

@ -35,7 +35,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase;
public class TestFstDirectAddressing extends LuceneTestCase {
public class TestFSTDirectAddressing extends LuceneTestCase {
public void testDenseWithGap() throws Exception {
List<String> words = Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm");
@ -86,13 +86,13 @@ public class TestFstDirectAddressing extends LuceneTestCase {
Collections.sort(wordList);
// Disable direct addressing and measure the FST size.
Builder<Object> builder = createBuilder(-1f);
FST<Object> fst = buildFST(wordList, builder);
FSTCompiler<Object> fstCompiler = createFSTCompiler(-1f);
FST<Object> fst = buildFST(wordList, fstCompiler);
long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed();
// Enable direct addressing and measure the FST size.
builder = createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
fst = buildFST(wordList, builder);
fstCompiler = createFSTCompiler(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
fst = buildFST(wordList, fstCompiler);
long ramBytesUsed = fst.ramBytesUsed();
// Compute the size increase in percents.
@ -107,42 +107,43 @@ public class TestFstDirectAddressing extends LuceneTestCase {
directAddressingMemoryIncreasePercent < MEMORY_INCREASE_LIMIT_PERCENT);
}
private static void printStats(Builder<Object> builder, long ramBytesUsed, double directAddressingMemoryIncreasePercent) {
System.out.println("directAddressingMaxOversizingFactor = " + builder.getDirectAddressingMaxOversizingFactor());
private static void printStats(FSTCompiler<Object> fstCompiler, long ramBytesUsed, double directAddressingMemoryIncreasePercent) {
System.out.println("directAddressingMaxOversizingFactor = " + fstCompiler.getDirectAddressingMaxOversizingFactor());
System.out.println("ramBytesUsed = "
+ String.format(Locale.ENGLISH, "%.2f MB", ramBytesUsed / 1024d / 1024d)
+ String.format(Locale.ENGLISH, " (%.2f %% increase with direct addressing)", directAddressingMemoryIncreasePercent));
System.out.println("num nodes = " + builder.nodeCount);
long fixedLengthArcNodeCount = builder.directAddressingNodeCount + builder.binarySearchNodeCount;
System.out.println("num nodes = " + fstCompiler.nodeCount);
long fixedLengthArcNodeCount = fstCompiler.directAddressingNodeCount + fstCompiler.binarySearchNodeCount;
System.out.println("num fixed-length-arc nodes = " + fixedLengthArcNodeCount
+ String.format(Locale.ENGLISH, " (%.2f %% of all nodes)",
((double) fixedLengthArcNodeCount / builder.nodeCount * 100)));
System.out.println("num binary-search nodes = " + (builder.binarySearchNodeCount)
((double) fixedLengthArcNodeCount / fstCompiler.nodeCount * 100)));
System.out.println("num binary-search nodes = " + (fstCompiler.binarySearchNodeCount)
+ String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)",
((double) (builder.binarySearchNodeCount) / fixedLengthArcNodeCount * 100)));
System.out.println("num direct-addressing nodes = " + (builder.directAddressingNodeCount)
((double) (fstCompiler.binarySearchNodeCount) / fixedLengthArcNodeCount * 100)));
System.out.println("num direct-addressing nodes = " + (fstCompiler.directAddressingNodeCount)
+ String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)",
((double) (builder.directAddressingNodeCount) / fixedLengthArcNodeCount * 100)));
((double) (fstCompiler.directAddressingNodeCount) / fixedLengthArcNodeCount * 100)));
}
private static Builder<Object> createBuilder(float directAddressingMaxOversizingFactor) {
return new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, NoOutputs.getSingleton(), true, 15)
.setDirectAddressingMaxOversizingFactor(directAddressingMaxOversizingFactor);
private static FSTCompiler<Object> createFSTCompiler(float directAddressingMaxOversizingFactor) {
return new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, NoOutputs.getSingleton())
.directAddressingMaxOversizingFactor(directAddressingMaxOversizingFactor)
.build();
}
private FST<Object> buildFST(List<BytesRef> entries) throws Exception {
return buildFST(entries, createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR));
return buildFST(entries, createFSTCompiler(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR));
}
private static FST<Object> buildFST(List<BytesRef> entries, Builder<Object> builder) throws Exception {
private static FST<Object> buildFST(List<BytesRef> entries, FSTCompiler<Object> fstCompiler) throws Exception {
BytesRef last = null;
for (BytesRef entry : entries) {
if (entry.equals(last) == false) {
builder.add(Util.toIntsRef(entry, new IntsRefBuilder()), NoOutputs.getSingleton().getNoOutput());
fstCompiler.add(Util.toIntsRef(entry, new IntsRefBuilder()), NoOutputs.getSingleton().getNoOutput());
}
last = entry;
}
return builder.finish();
return fstCompiler.compile();
}
public static void main(String... args) throws Exception {
@ -195,18 +196,18 @@ public class TestFstDirectAddressing extends LuceneTestCase {
Collections.sort(wordList);
// Disable direct addressing and measure the FST size.
Builder<Object> builder = createBuilder(-1f);
FST<Object> fst = buildFST(wordList, builder);
FSTCompiler<Object> fstCompiler = createFSTCompiler(-1f);
FST<Object> fst = buildFST(wordList, fstCompiler);
long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed();
// Enable direct addressing and measure the FST size.
builder = createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
fst = buildFST(wordList, builder);
fstCompiler = createFSTCompiler(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
fst = buildFST(wordList, fstCompiler);
long ramBytesUsed = fst.ramBytesUsed();
// Compute the size increase in percents.
double directAddressingMemoryIncreasePercent = ((double) ramBytesUsed / ramBytesUsedNoDirectAddressing - 1) * 100;
printStats(builder, ramBytesUsed, directAddressingMemoryIncreasePercent);
printStats(fstCompiler, ramBytesUsed, directAddressingMemoryIncreasePercent);
}
}

View File

@ -327,7 +327,7 @@ public class TestFSTs extends LuceneTestCase {
writer.close();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
boolean storeOrd = random().nextBoolean();
if (VERBOSE) {
@ -373,15 +373,15 @@ public class TestFSTs extends LuceneTestCase {
} else {
output = termsEnum.docFreq();
}
builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
fstCompiler.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
ord++;
if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
System.out.println(ord + " terms...");
}
}
FST<Long> fst = builder.finish();
FST<Long> fst = fstCompiler.compile();
if (VERBOSE) {
System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes");
System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fstCompiler.getNodeCount() + " nodes; " + fstCompiler.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes");
}
if (ord > 0) {
@ -460,7 +460,7 @@ public class TestFSTs extends LuceneTestCase {
private final Path wordsFileIn;
private int inputMode;
private final Outputs<T> outputs;
private final Builder<T> builder;
private final FSTCompiler<T> fstCompiler;
public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean noArcArrays) {
this.dirOut = dirOut;
@ -468,7 +468,11 @@ public class TestFSTs extends LuceneTestCase {
this.inputMode = inputMode;
this.outputs = outputs;
builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, !noArcArrays, 15);
fstCompiler = new FSTCompiler.Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs)
.minSuffixCount2(prune)
.shouldShareSuffix(prune == 0)
.allowFixedLengthArcs(!noArcArrays)
.build();
}
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
@ -486,7 +490,7 @@ public class TestFSTs extends LuceneTestCase {
break;
}
toIntsRef(w, inputMode, intsRef);
builder.add(intsRef.get(),
fstCompiler.add(intsRef.get(),
getOutput(intsRef.get(), ord));
ord++;
@ -503,8 +507,8 @@ public class TestFSTs extends LuceneTestCase {
long tMid = System.currentTimeMillis();
System.out.println(((tMid-tStart) / 1000.0) + " sec to add all terms");
assert builder.getTermCount() == ord;
FST<T> fst = builder.finish();
assert fstCompiler.getTermCount() == ord;
FST<T> fst = fstCompiler.compile();
long tEnd = System.currentTimeMillis();
System.out.println(((tEnd-tMid) / 1000.0) + " sec to finish/pack");
if (fst == null) {
@ -516,8 +520,8 @@ public class TestFSTs extends LuceneTestCase {
return;
}
System.out.println(ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs; tot size " + fst.ramBytesUsed());
if (builder.getNodeCount() < 100) {
System.out.println(ord + " terms; " + fstCompiler.getNodeCount() + " nodes; " + fstCompiler.getArcCount() + " arcs; tot size " + fst.ramBytesUsed());
if (fstCompiler.getNodeCount() < 100) {
Writer w = Files.newBufferedWriter(Paths.get("out.dot"), StandardCharsets.UTF_8);
Util.toDot(fst, w, false, false);
w.close();
@ -717,9 +721,9 @@ public class TestFSTs extends LuceneTestCase {
public void testSingleString() throws Exception {
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
b.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRefBuilder()), outputs.getNoOutput());
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(b.finish());
final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
fstCompiler.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRefBuilder()), outputs.getNoOutput());
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(fstCompiler.compile());
assertNull(fstEnum.seekFloor(new BytesRef("foo")));
assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
}
@ -728,12 +732,12 @@ public class TestFSTs extends LuceneTestCase {
public void testDuplicateFSAString() throws Exception {
String str = "foobar";
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRefBuilder ints = new IntsRefBuilder();
for(int i=0; i<10; i++) {
b.add(Util.toIntsRef(new BytesRef(str), ints), outputs.getNoOutput());
fstCompiler.add(Util.toIntsRef(new BytesRef(str), ints), outputs.getNoOutput());
}
FST<Object> fst = b.finish();
FST<Object> fst = fstCompiler.compile();
// count the input paths
int count = 0;
@ -797,17 +801,17 @@ public class TestFSTs extends LuceneTestCase {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
// Build an FST mapping BytesRef -> Long
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final BytesRef a = new BytesRef("a");
final BytesRef b = new BytesRef("b");
final BytesRef c = new BytesRef("c");
builder.add(Util.toIntsRef(a, new IntsRefBuilder()), 17L);
builder.add(Util.toIntsRef(b, new IntsRefBuilder()), 42L);
builder.add(Util.toIntsRef(c, new IntsRefBuilder()), 13824324872317238L);
fstCompiler.add(Util.toIntsRef(a, new IntsRefBuilder()), 17L);
fstCompiler.add(Util.toIntsRef(b, new IntsRefBuilder()), 42L);
fstCompiler.add(Util.toIntsRef(c, new IntsRefBuilder()), 13824324872317238L);
final FST<Long> fst = builder.finish();
final FST<Long> fst = fstCompiler.compile();
assertEquals(13824324872317238L, (long) Util.get(fst, c));
assertEquals(42, (long) Util.get(fst, b));
@ -1035,7 +1039,7 @@ public class TestFSTs extends LuceneTestCase {
FST<Object> compile(String[] lines) throws IOException {
final NoOutputs outputs = NoOutputs.getSingleton();
final Object nothing = outputs.getNoOutput();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
int line = 0;
final BytesRefBuilder term = new BytesRefBuilder();
@ -1046,10 +1050,10 @@ public class TestFSTs extends LuceneTestCase {
break;
}
term.copyChars(w);
b.add(Util.toIntsRef(term.get(), scratchIntsRef), nothing);
fstCompiler.add(Util.toIntsRef(term.get(), scratchIntsRef), nothing);
}
return b.finish();
return fstCompiler.compile();
}
void generate(ArrayList<String> out, StringBuilder b, char from, char to,
@ -1110,10 +1114,10 @@ public class TestFSTs extends LuceneTestCase {
public void testFinalOutputOnEndState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
builder.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L);
builder.add(Util.toUTF32("station", new IntsRefBuilder()), 10L);
final FST<Long> fst = builder.finish();
final FSTCompiler<Long> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).minSuffixCount1(2).build();
fstCompiler.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L);
fstCompiler.add(Util.toUTF32("station", new IntsRefBuilder()), 10L);
final FST<Long> fst = fstCompiler.compile();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
StringWriter w = new StringWriter();
Util.toDot(fst, w, false, false);
@ -1124,10 +1128,10 @@ public class TestFSTs extends LuceneTestCase {
public void testInternalFinalState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput());
builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput());
final FST<Long> fst = builder.finish();
final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
fstCompiler.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput());
fstCompiler.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput());
final FST<Long> fst = fstCompiler.compile();
StringWriter w = new StringWriter();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
Util.toDot(fst, w, false, false);
@ -1145,20 +1149,20 @@ public class TestFSTs extends LuceneTestCase {
public void testNonFinalStopNode() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Long nothing = outputs.getNoOutput();
final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
//final FST<Long> fst = new FST<>(FST.INPUT_TYPE.BYTE1, outputs, false, PackedInts.COMPACT, 15);
final FST<Long> fst = b.fst;
final FST<Long> fst = fstCompiler.fst;
final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<>(b, 0);
final FSTCompiler.UnCompiledNode<Long> rootNode = new FSTCompiler.UnCompiledNode<>(fstCompiler, 0);
// Add final stop node
{
final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<>(b, 0);
final FSTCompiler.UnCompiledNode<Long> node = new FSTCompiler.UnCompiledNode<>(fstCompiler, 0);
node.isFinal = true;
rootNode.addArc('a', node);
final Builder.CompiledNode frozen = new Builder.CompiledNode();
frozen.node = fst.addNode(b, node);
final FSTCompiler.CompiledNode frozen = new FSTCompiler.CompiledNode();
frozen.node = fst.addNode(fstCompiler, node);
rootNode.arcs[0].nextFinalOutput = 17L;
rootNode.arcs[0].isFinal = true;
rootNode.arcs[0].output = nothing;
@ -1167,16 +1171,16 @@ public class TestFSTs extends LuceneTestCase {
// Add non-final stop node
{
final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<>(b, 0);
final FSTCompiler.UnCompiledNode<Long> node = new FSTCompiler.UnCompiledNode<>(fstCompiler, 0);
rootNode.addArc('b', node);
final Builder.CompiledNode frozen = new Builder.CompiledNode();
frozen.node = fst.addNode(b, node);
final FSTCompiler.CompiledNode frozen = new FSTCompiler.CompiledNode();
frozen.node = fst.addNode(fstCompiler, node);
rootNode.arcs[1].nextFinalOutput = nothing;
rootNode.arcs[1].output = 42L;
rootNode.arcs[1].target = frozen;
}
fst.finish(fst.addNode(b, rootNode));
fst.finish(fst.addNode(fstCompiler, rootNode));
StringWriter w = new StringWriter();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
@ -1225,13 +1229,13 @@ public class TestFSTs extends LuceneTestCase {
public void testShortestPaths() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
final FST<Long> fst = builder.finish();
fstCompiler.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
fstCompiler.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
fstCompiler.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
final FST<Long> fst = fstCompiler.compile();
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);
//w.close();
@ -1256,16 +1260,16 @@ public class TestFSTs extends LuceneTestCase {
public void testRejectNoLimits() throws IOException {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Long> fstCompiler = new FSTCompiler<Long>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
builder.add(Util.toIntsRef(new BytesRef("adcd"), scratch), 17L);
builder.add(Util.toIntsRef(new BytesRef("adcde"), scratch), 17L);
fstCompiler.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
fstCompiler.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
fstCompiler.add(Util.toIntsRef(new BytesRef("adcd"), scratch), 17L);
fstCompiler.add(Util.toIntsRef(new BytesRef("adcde"), scratch), 17L);
builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
final FST<Long> fst = builder.finish();
fstCompiler.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
final FST<Long> fst = fstCompiler.compile();
final AtomicInteger rejectCount = new AtomicInteger();
Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, 2, 6, minLongComparator) {
@Override
@ -1320,13 +1324,13 @@ public class TestFSTs extends LuceneTestCase {
PositiveIntOutputs.getSingleton() // output
);
final Builder<Pair<Long,Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Pair<Long,Long>> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), outputs.newPair(22L, 57L));
builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), outputs.newPair(7L, 36L));
builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), outputs.newPair(17L, 85L));
final FST<Pair<Long,Long>> fst = builder.finish();
fstCompiler.add(Util.toIntsRef(new BytesRef("aab"), scratch), outputs.newPair(22L, 57L));
fstCompiler.add(Util.toIntsRef(new BytesRef("aac"), scratch), outputs.newPair(7L, 36L));
fstCompiler.add(Util.toIntsRef(new BytesRef("ax"), scratch), outputs.newPair(17L, 85L));
final FST<Pair<Long,Long>> fst = fstCompiler.compile();
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);
//w.close();
@ -1361,7 +1365,7 @@ public class TestFSTs extends LuceneTestCase {
final TreeSet<String> allPrefixes = new TreeSet<>();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
for (int i = 0; i < numWords; i++) {
@ -1382,10 +1386,10 @@ public class TestFSTs extends LuceneTestCase {
for (Map.Entry<String,Long> e : slowCompletor.entrySet()) {
//System.out.println("add: " + e);
builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), e.getValue());
fstCompiler.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), e.getValue());
}
final FST<Long> fst = builder.finish();
final FST<Long> fst = fstCompiler.compile();
//System.out.println("SAVE out.dot");
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);
@ -1479,7 +1483,7 @@ public class TestFSTs extends LuceneTestCase {
PositiveIntOutputs.getSingleton(), // weight
PositiveIntOutputs.getSingleton() // output
);
final Builder<Pair<Long,Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Pair<Long,Long>> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
Random random = random();
@ -1504,10 +1508,10 @@ public class TestFSTs extends LuceneTestCase {
//System.out.println("add: " + e);
long weight = e.getValue().a;
long output = e.getValue().b;
builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), outputs.newPair(weight, output));
fstCompiler.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), outputs.newPair(weight, output));
}
final FST<Pair<Long,Long>> fst = builder.finish();
final FST<Pair<Long,Long>> fst = fstCompiler.compile();
//System.out.println("SAVE out.dot");
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);
@ -1563,7 +1567,7 @@ public class TestFSTs extends LuceneTestCase {
public void testLargeOutputsOnArrayArcs() throws Exception {
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final byte[] bytes = new byte[300];
final IntsRefBuilder input = new IntsRefBuilder();
@ -1572,10 +1576,10 @@ public class TestFSTs extends LuceneTestCase {
for(int arc=0;arc<6;arc++) {
input.setIntAt(0, arc);
output.bytes[0] = (byte) arc;
builder.add(input.get(), BytesRef.deepCopyOf(output));
fstCompiler.add(input.get(), BytesRef.deepCopyOf(output));
}
final FST<BytesRef> fst = builder.finish();
final FST<BytesRef> fst = fstCompiler.compile();
for(int arc=0;arc<6;arc++) {
input.setIntAt(0, arc);
final BytesRef result = Util.get(fst, input.get());
@ -1608,15 +1612,15 @@ public class TestFSTs extends LuceneTestCase {
Collections.sort(termsList);
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRefBuilder input = new IntsRefBuilder();
for(BytesRef term : termsList) {
Util.toIntsRef(term, input);
builder.add(input.get(), term);
fstCompiler.add(input.get(), term);
}
FST<BytesRef> fst = builder.finish();
FST<BytesRef> fst = fstCompiler.compile();
Arc<BytesRef> arc = new FST.Arc<>();
fst.getFirstArc(arc);
@ -1638,17 +1642,17 @@ public class TestFSTs extends LuceneTestCase {
public void testSimpleDepth() throws Exception {
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
BytesRef ab = new BytesRef("ab");
BytesRef ac = new BytesRef("ac");
BytesRef bd = new BytesRef("bd");
builder.add(Util.toIntsRef(ab, new IntsRefBuilder()), 3L);
builder.add(Util.toIntsRef(ac, new IntsRefBuilder()), 5L);
builder.add(Util.toIntsRef(bd, new IntsRefBuilder()), 7L);
fstCompiler.add(Util.toIntsRef(ab, new IntsRefBuilder()), 3L);
fstCompiler.add(Util.toIntsRef(ac, new IntsRefBuilder()), 5L);
fstCompiler.add(Util.toIntsRef(bd, new IntsRefBuilder()), 7L);
FST<Long> fst = builder.finish();
FST<Long> fst = fstCompiler.compile();
assertEquals(3, (long) Util.get(fst, ab));
assertEquals(5, (long) Util.get(fst, ac));

View File

@ -83,15 +83,17 @@ public class TestUtil extends LuceneTestCase {
private FST<Object> buildFST(List<String> words, boolean allowArrayArcs, boolean allowDirectAddressing) throws Exception {
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, allowArrayArcs, 15);
final FSTCompiler.Builder<Object> builder = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
.allowFixedLengthArcs(allowArrayArcs);
if (!allowDirectAddressing) {
b.setDirectAddressingMaxOversizingFactor(-1f);
builder.directAddressingMaxOversizingFactor(-1f);
}
final FSTCompiler<Object> fstCompiler = builder.build();
for (String word : words) {
b.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput());
fstCompiler.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput());
}
return b.finish();
return fstCompiler.compile();
}
private List<String> createRandomDictionary(int width, int depth) {

View File

@ -30,7 +30,7 @@ import org.apache.lucene.util.RamUsageEstimator;
* more of its output values. You can use this when a single
* input may need to map to more than one output,
* maintaining order: pass the same input with a different
* output by calling {@link Builder#add(IntsRef,Object)} multiple
* output by calling {@link FSTCompiler#add(IntsRef,Object)} multiple
* times. The FSTCompiler will then combine the outputs using
* the {@link Outputs#merge(Object,Object)} method.
*
@ -41,7 +41,7 @@ import org.apache.lucene.util.RamUsageEstimator;
* <p>NOTE: the only way to create multiple outputs is to
* add the same input to the FST multiple times in a row. This is
* how the FST maps a single input to multiple outputs (e.g. you
* cannot pass a List&lt;Object&gt; to {@link Builder#add}). If
* cannot pass a List&lt;Object&gt; to {@link FSTCompiler#add}). If
* your outputs are longs, and you need at most 2, then use
* {@link UpToTwoPositiveIntOutputs} instead since it stores
* the outputs more compactly (by stealing a bit from each

View File

@ -35,14 +35,14 @@ import org.apache.lucene.util.SuppressForbidden;
* <p>NOTE: the only way to create a TwoLongs output is to
* add the same input to the FST twice in a row. This is
* how the FST maps a single input to two outputs (e.g. you
* cannot pass a TwoLongs to {@link Builder#add}. If you
* cannot pass a TwoLongs to {@link FSTCompiler#add}). If you
* need more than two then use {@link ListOfOutputs}, but if
* you only have at most 2 then this implementation will
* require fewer bytes as it steals one bit from each long
* value.
*
* <p>NOTE: the resulting FST is not guaranteed to be minimal!
* See {@link Builder}.
* See {@link FSTCompiler}.
*
* @lucene.experimental
*/

View File

@ -164,16 +164,16 @@ public class TestFSTsMisc extends LuceneTestCase {
public void testListOfOutputs() throws Exception {
PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
ListOfOutputs<Long> outputs = new ListOfOutputs<>(_outputs);
final Builder<Object> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
// Add the same input more than once and the outputs
// are merged:
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
builder.add(Util.toIntsRef(new BytesRef("b"), scratch), 17L);
final FST<Object> fst = builder.finish();
fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
fstCompiler.add(Util.toIntsRef(new BytesRef("b"), scratch), 17L);
final FST<Object> fst = fstCompiler.compile();
Object output = Util.get(fst, new BytesRef("a"));
assertNotNull(output);
@ -193,20 +193,20 @@ public class TestFSTsMisc extends LuceneTestCase {
public void testListOfOutputsEmptyString() throws Exception {
PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
ListOfOutputs<Long> outputs = new ListOfOutputs<>(_outputs);
final Builder<Object> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
builder.add(scratch.get(), 0L);
builder.add(scratch.get(), 1L);
builder.add(scratch.get(), 17L);
builder.add(scratch.get(), 1L);
fstCompiler.add(scratch.get(), 0L);
fstCompiler.add(scratch.get(), 1L);
fstCompiler.add(scratch.get(), 17L);
fstCompiler.add(scratch.get(), 1L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
builder.add(Util.toIntsRef(new BytesRef("b"), scratch), 0L);
fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
fstCompiler.add(Util.toIntsRef(new BytesRef("b"), scratch), 0L);
final FST<Object> fst = builder.finish();
final FST<Object> fst = fstCompiler.compile();
Object output = Util.get(fst, new BytesRef(""));
assertNotNull(output);

View File

@ -43,7 +43,7 @@ import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
@ -350,29 +350,28 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
}
}
final Builder<Pair<BytesRef,Long>> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
0, 0, true, false, Integer.MAX_VALUE,
FST_OUTPUTS, true, 15);
final FSTCompiler<Pair<BytesRef,Long>> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS)
.shouldShareNonSingletonNodes(false).build();
//if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
//}
//indexBuilder.DEBUG = false;
final byte[] bytes = scratchBytes.toArrayCopy();
assert bytes.length > 0;
indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), FST_OUTPUTS.newPair(new BytesRef(bytes, 0, bytes.length), Long.MAX_VALUE - maxVersionIndex));
fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef), FST_OUTPUTS.newPair(new BytesRef(bytes, 0, bytes.length), Long.MAX_VALUE - maxVersionIndex));
scratchBytes.reset();
// Copy over index for all sub-blocks
for(PendingBlock block : blocks) {
if (block.subIndices != null) {
for(FST<Pair<BytesRef,Long>> subIndex : block.subIndices) {
append(indexBuilder, subIndex, scratchIntsRef);
append(fstCompiler, subIndex, scratchIntsRef);
}
block.subIndices = null;
}
}
index = indexBuilder.finish();
index = fstCompiler.compile();
assert subIndices == null;
@ -387,14 +386,14 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
// TODO: maybe we could add bulk-add method to
// FSTCompiler? Takes FST and unions it w/ current
// FST.
private void append(Builder<Pair<BytesRef,Long>> builder, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
private void append(FSTCompiler<Pair<BytesRef,Long>> fstCompiler, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
final BytesRefFSTEnum<Pair<BytesRef,Long>> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
BytesRefFSTEnum.InputOutput<Pair<BytesRef,Long>> indexEnt;
while((indexEnt = subIndexEnum.next()) != null) {
//if (DEBUG) {
// System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
//}
builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
}
}
}

View File

@ -52,7 +52,7 @@ import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.FST;
@ -496,7 +496,7 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
PairOutputs<Long,BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
Builder<Pair<Long,BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
FSTCompiler<Pair<Long,BytesRef>> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
// Build FST:
BytesRefBuilder previousAnalyzed = null;
@ -570,7 +570,7 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
Util.toIntsRef(analyzed.get(), scratchInts);
//System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
if (!hasPayloads) {
builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
fstCompiler.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
} else {
int payloadOffset = input.getPosition() + surface.length;
int payloadLength = bytes.length - payloadOffset;
@ -579,10 +579,10 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
br.bytes[surface.length] = PAYLOAD_SEP;
System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);
br.length = br.bytes.length;
builder.add(scratchInts.get(), outputs.newPair(cost, br));
fstCompiler.add(scratchInts.get(), outputs.newPair(cost, br));
}
}
fst = builder.finish();
fst = fstCompiler.compile();
//Util.dotToFile(fst, "/tmp/suggest.dot");
} finally {

View File

@ -66,7 +66,7 @@ import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
@ -304,7 +304,7 @@ public class FreeTextSuggester extends Lookup implements Accountable {
TermsEnum termsEnum = terms.iterator();
Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRefBuilder scratchInts = new IntsRefBuilder();
while (true) {
@ -320,10 +320,10 @@ public class FreeTextSuggester extends Lookup implements Accountable {
totTokens += termsEnum.totalTermFreq();
}
builder.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
fstCompiler.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
}
fst = builder.finish();
fst = fstCompiler.compile();
if (fst == null) {
throw new IllegalArgumentException("need at least one suggestion");
}

View File

@ -25,7 +25,7 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;
@ -53,7 +53,7 @@ final class NRTSuggesterBuilder {
public static final int END_BYTE = 0x0;
private final PairOutputs<Long, BytesRef> outputs;
private final Builder<PairOutputs.Pair<Long, BytesRef>> builder;
private final FSTCompiler<PairOutputs.Pair<Long, BytesRef>> fstCompiler;
private final IntsRefBuilder scratchInts = new IntsRefBuilder();
private final BytesRefBuilder analyzed = new BytesRefBuilder();
private final PriorityQueue<Entry> entries;
@ -70,7 +70,7 @@ final class NRTSuggesterBuilder {
this.endByte = END_BYTE;
this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
this.entries = new PriorityQueue<>();
this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
}
/**
@ -108,7 +108,7 @@ final class NRTSuggesterBuilder {
}
analyzed.setByteAt(analyzed.length() - 1, (byte) numArcs++);
Util.toIntsRef(analyzed.get(), scratchInts);
builder.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload));
fstCompiler.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload));
}
maxAnalyzedPathsPerOutput = Math.max(maxAnalyzedPathsPerOutput, entries.size());
entries.clear();
@ -119,11 +119,11 @@ final class NRTSuggesterBuilder {
* {@link NRTSuggester#load(IndexInput, CompletionPostingsFormat.FSTLoadMode)})}
*/
public boolean store(DataOutput output) throws IOException {
final FST<PairOutputs.Pair<Long, BytesRef>> build = builder.finish();
if (build == null) {
final FST<PairOutputs.Pair<Long, BytesRef>> fst = fstCompiler.compile();
if (fst == null) {
return false;
}
build.save(output);
fst.save(output);
/* write some more meta-info */
assert maxAnalyzedPathsPerOutput > 0;

View File

@ -169,7 +169,7 @@ public class FSTCompletionBuilder {
* @param shareMaxTailLength
* Max shared suffix sharing length.
*
* See the description of this parameter in {@link Builder}'s constructor.
* See the description of this parameter in {@link org.apache.lucene.util.fst.FSTCompiler.Builder}.
* In general, for very large inputs you'll want to construct a non-minimal
* automaton which will be larger, but the construction will take far less ram.
* For minimal automata, set it to {@link Integer#MAX_VALUE}.
@ -234,10 +234,9 @@ public class FSTCompletionBuilder {
// Build the automaton.
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Object empty = outputs.getNoOutput();
final Builder<Object> builder = new Builder<>(
FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
shareMaxTailLength, outputs, true, 15);
final FSTCompiler<Object> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
.shareMaxTailLength(shareMaxTailLength).build();
BytesRefBuilder scratch = new BytesRefBuilder();
BytesRef entry;
final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
@ -246,11 +245,11 @@ public class FSTCompletionBuilder {
while((entry = iter.next()) != null) {
count++;
if (scratch.get().compareTo(entry) != 0) {
builder.add(Util.toIntsRef(entry, scratchIntsRef), empty);
fstCompiler.add(Util.toIntsRef(entry, scratchIntsRef), empty);
scratch.copyBytes(entry);
}
}
return count == 0 ? null : builder.finish();
return count == 0 ? null : fstCompiler.compile();
}
}

View File

@ -40,7 +40,7 @@ import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
@ -116,7 +116,7 @@ public class WFSTCompletionLookup extends Lookup implements Accountable {
IntsRefBuilder scratchInts = new IntsRefBuilder();
BytesRefBuilder previous = null;
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
while ((scratch = iter.next()) != null) {
long cost = iter.weight();
@ -127,11 +127,11 @@ public class WFSTCompletionLookup extends Lookup implements Accountable {
// added
}
Util.toIntsRef(scratch, scratchInts);
builder.add(scratchInts.get(), cost);
fstCompiler.add(scratchInts.get(), cost);
previous.copyBytes(scratch);
count++;
}
fst = builder.finish();
fst = fstCompiler.compile();
}

View File

@ -272,27 +272,26 @@ public class FSTTester<T> {
System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
}
final Builder<T> builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
prune1, prune2,
prune1==0 && prune2==0,
allowRandomSuffixSharing ? random.nextBoolean() : true,
allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
outputs,
true,
15);
final FSTCompiler<T> fstCompiler = new FSTCompiler.Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs)
.minSuffixCount1(prune1)
.minSuffixCount2(prune2)
.shouldShareSuffix(prune1==0 && prune2==0)
.shouldShareNonSingletonNodes(allowRandomSuffixSharing ? random.nextBoolean() : true)
.shareMaxTailLength(allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE)
.build();
for(InputOutput<T> pair : pairs) {
if (pair.output instanceof List) {
@SuppressWarnings("unchecked") List<Long> longValues = (List<Long>) pair.output;
@SuppressWarnings("unchecked") final Builder<Object> builderObject = (Builder<Object>) builder;
@SuppressWarnings("unchecked") final FSTCompiler<Object> fstCompilerObject = (FSTCompiler<Object>) fstCompiler;
for(Long value : longValues) {
builderObject.add(pair.input, value);
fstCompilerObject.add(pair.input, value);
}
} else {
builder.add(pair.input, pair.output);
fstCompiler.add(pair.input, pair.output);
}
}
FST<T> fst = builder.finish();
FST<T> fst = fstCompiler.compile();
if (random.nextBoolean() && fst != null) {
IOContext context = LuceneTestCase.newIOContext(random);
@ -320,7 +319,7 @@ public class FSTTester<T> {
if (fst == null) {
System.out.println(" fst has 0 nodes (fully pruned)");
} else {
System.out.println(" fst has " + builder.getNodeCount() + " nodes and " + builder.getArcCount() + " arcs");
System.out.println(" fst has " + fstCompiler.getNodeCount() + " nodes and " + fstCompiler.getArcCount() + " arcs");
}
}
@ -330,8 +329,8 @@ public class FSTTester<T> {
verifyPruned(inputMode, fst, prune1, prune2);
}
nodeCount = builder.getNodeCount();
arcCount = builder.getArcCount();
nodeCount = fstCompiler.getNodeCount();
arcCount = fstCompiler.getArcCount();
return fst;
}