mirror of https://github.com/apache/lucene.git
LUCENE-9089: FST Builder renamed FSTCompiler with fluent-style Builder.
Closes #1070
This commit is contained in:
parent
f083f40b28
commit
1812b367ab
|
@ -38,6 +38,9 @@ API Changes
|
||||||
* LUCENE-8905: Better defence against malformed arguments in TopDocsCollector
|
* LUCENE-8905: Better defence against malformed arguments in TopDocsCollector
|
||||||
(Atri Sharma)
|
(Atri Sharma)
|
||||||
|
|
||||||
|
* LUCENE-9089: FST Builder renamed to FSTCompiler, with a fluent-style FSTCompiler.Builder.
|
||||||
|
(Bruno Roustant)
|
||||||
|
|
||||||
Improvements
|
Improvements
|
||||||
|
|
||||||
* LUCENE-8757: When provided with an ExecutorService to run queries across
|
* LUCENE-8757: When provided with an ExecutorService to run queries across
|
||||||
|
|
|
@ -1,5 +1,10 @@
|
||||||
# Apache Lucene Migration Guide
|
# Apache Lucene Migration Guide
|
||||||
|
|
||||||
|
## o.a.l.util.fst.Builder is renamed to FSTCompiler, with a fluent-style FSTCompiler.Builder (LUCENE-9089) ##
|
||||||
|
|
||||||
|
Simply use FSTCompiler instead of the previous Builder. Use either the simple constructor with default settings, or
|
||||||
|
the FSTCompiler.Builder to tune and tweak any parameter.
|
||||||
|
|
||||||
## Kuromoji user dictionary now forbids illegal segmentation (LUCENE-8933) ##
|
## Kuromoji user dictionary now forbids illegal segmentation (LUCENE-8933) ##
|
||||||
|
|
||||||
User dictionary now strictly validates if the (concatenated) segment is the same as the surface form. This change avoids
|
User dictionary now strictly validates if the (concatenated) segment is the same as the surface form. This change avoids
|
||||||
|
|
|
@ -25,6 +25,7 @@ import org.apache.lucene.util.CharsRef;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.fst.CharSequenceOutputs;
|
import org.apache.lucene.util.fst.CharSequenceOutputs;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.Outputs;
|
import org.apache.lucene.util.fst.Outputs;
|
||||||
import org.apache.lucene.util.fst.Util;
|
import org.apache.lucene.util.fst.Util;
|
||||||
|
|
||||||
|
@ -106,13 +107,13 @@ public class NormalizeCharMap {
|
||||||
final FST<CharsRef> map;
|
final FST<CharsRef> map;
|
||||||
try {
|
try {
|
||||||
final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
|
final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
|
||||||
final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
|
final FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
|
||||||
final IntsRefBuilder scratch = new IntsRefBuilder();
|
final IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
|
for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
|
||||||
builder.add(Util.toUTF16(ent.getKey(), scratch),
|
fstCompiler.add(Util.toUTF16(ent.getKey(), scratch),
|
||||||
new CharsRef(ent.getValue()));
|
new CharsRef(ent.getValue()));
|
||||||
}
|
}
|
||||||
map = builder.finish();
|
map = fstCompiler.compile();
|
||||||
pendingPairs.clear();
|
pendingPairs.clear();
|
||||||
} catch (IOException ioe) {
|
} catch (IOException ioe) {
|
||||||
// Bogus FST IOExceptions!! (will never happen)
|
// Bogus FST IOExceptions!! (will never happen)
|
||||||
|
|
|
@ -64,7 +64,7 @@ import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
|
||||||
import org.apache.lucene.util.OfflineSorter;
|
import org.apache.lucene.util.OfflineSorter;
|
||||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||||
import org.apache.lucene.util.automaton.RegExp;
|
import org.apache.lucene.util.automaton.RegExp;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.CharSequenceOutputs;
|
import org.apache.lucene.util.fst.CharSequenceOutputs;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.IntSequenceOutputs;
|
import org.apache.lucene.util.fst.IntSequenceOutputs;
|
||||||
|
@ -231,9 +231,9 @@ public class Dictionary {
|
||||||
|
|
||||||
// read dictionary entries
|
// read dictionary entries
|
||||||
IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
|
IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
|
||||||
Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
|
FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, o);
|
||||||
readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, b);
|
readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, fstCompiler);
|
||||||
words = b.finish();
|
words = fstCompiler.compile();
|
||||||
aliases = null; // no longer needed
|
aliases = null; // no longer needed
|
||||||
morphAliases = null; // no longer needed
|
morphAliases = null; // no longer needed
|
||||||
success = true;
|
success = true;
|
||||||
|
@ -414,7 +414,7 @@ public class Dictionary {
|
||||||
|
|
||||||
private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException {
|
private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException {
|
||||||
IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
|
IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
|
||||||
Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
|
FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
|
||||||
IntsRefBuilder scratch = new IntsRefBuilder();
|
IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
for (Map.Entry<String,List<Integer>> entry : affixes.entrySet()) {
|
for (Map.Entry<String,List<Integer>> entry : affixes.entrySet()) {
|
||||||
Util.toUTF32(entry.getKey(), scratch);
|
Util.toUTF32(entry.getKey(), scratch);
|
||||||
|
@ -423,9 +423,9 @@ public class Dictionary {
|
||||||
for (Integer c : entries) {
|
for (Integer c : entries) {
|
||||||
output.ints[output.length++] = c;
|
output.ints[output.length++] = c;
|
||||||
}
|
}
|
||||||
builder.add(scratch.get(), output);
|
fstCompiler.add(scratch.get(), output);
|
||||||
}
|
}
|
||||||
return builder.finish();
|
return fstCompiler.compile();
|
||||||
}
|
}
|
||||||
|
|
||||||
static String escapeDash(String re) {
|
static String escapeDash(String re) {
|
||||||
|
@ -608,14 +608,14 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
|
|
||||||
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
|
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
|
||||||
Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
|
FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
|
||||||
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
||||||
for (Map.Entry<String,String> entry : mappings.entrySet()) {
|
for (Map.Entry<String,String> entry : mappings.entrySet()) {
|
||||||
Util.toUTF16(entry.getKey(), scratchInts);
|
Util.toUTF16(entry.getKey(), scratchInts);
|
||||||
builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
|
fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
|
||||||
}
|
}
|
||||||
|
|
||||||
return builder.finish();
|
return fstCompiler.compile();
|
||||||
}
|
}
|
||||||
|
|
||||||
/** pattern accepts optional BOM + SET + any whitespace */
|
/** pattern accepts optional BOM + SET + any whitespace */
|
||||||
|
@ -776,7 +776,7 @@ public class Dictionary {
|
||||||
* @param decoder CharsetDecoder used to decode the contents of the file
|
* @param decoder CharsetDecoder used to decode the contents of the file
|
||||||
* @throws IOException Can be thrown while reading from the file
|
* @throws IOException Can be thrown while reading from the file
|
||||||
*/
|
*/
|
||||||
private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException {
|
private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, FSTCompiler<IntsRef> words) throws IOException {
|
||||||
BytesRefBuilder flagsScratch = new BytesRefBuilder();
|
BytesRefBuilder flagsScratch = new BytesRefBuilder();
|
||||||
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
||||||
|
|
||||||
|
|
|
@ -35,6 +35,7 @@ import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.FST.Arc;
|
import org.apache.lucene.util.fst.FST.Arc;
|
||||||
import org.apache.lucene.util.fst.FST.BytesReader;
|
import org.apache.lucene.util.fst.FST.BytesReader;
|
||||||
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides the ability to override any {@link KeywordAttribute} aware stemmer
|
* Provides the ability to override any {@link KeywordAttribute} aware stemmer
|
||||||
|
@ -203,7 +204,7 @@ public final class StemmerOverrideFilter extends TokenFilter {
|
||||||
*/
|
*/
|
||||||
public StemmerOverrideMap build() throws IOException {
|
public StemmerOverrideMap build() throws IOException {
|
||||||
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||||
org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>(
|
FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(
|
||||||
FST.INPUT_TYPE.BYTE4, outputs);
|
FST.INPUT_TYPE.BYTE4, outputs);
|
||||||
final int[] sort = hash.sort();
|
final int[] sort = hash.sort();
|
||||||
IntsRefBuilder intsSpare = new IntsRefBuilder();
|
IntsRefBuilder intsSpare = new IntsRefBuilder();
|
||||||
|
@ -213,9 +214,9 @@ public final class StemmerOverrideFilter extends TokenFilter {
|
||||||
int id = sort[i];
|
int id = sort[i];
|
||||||
BytesRef bytesRef = hash.get(id, spare);
|
BytesRef bytesRef = hash.get(id, spare);
|
||||||
intsSpare.copyUTF8Bytes(bytesRef);
|
intsSpare.copyUTF8Bytes(bytesRef);
|
||||||
builder.add(intsSpare.get(), new BytesRef(outputValues.get(id)));
|
fstCompiler.add(intsSpare.get(), new BytesRef(outputValues.get(id)));
|
||||||
}
|
}
|
||||||
return new StemmerOverrideMap(builder.finish(), ignoreCase);
|
return new StemmerOverrideMap(fstCompiler.compile(), ignoreCase);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,6 +39,7 @@ import org.apache.lucene.util.CharsRefBuilder;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.Util;
|
import org.apache.lucene.util.fst.Util;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -213,8 +214,8 @@ public class SynonymMap {
|
||||||
public SynonymMap build() throws IOException {
|
public SynonymMap build() throws IOException {
|
||||||
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||||
// TODO: are we using the best sharing options?
|
// TODO: are we using the best sharing options?
|
||||||
org.apache.lucene.util.fst.Builder<BytesRef> builder =
|
FSTCompiler<BytesRef> fstCompiler =
|
||||||
new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
|
new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
|
||||||
|
|
||||||
BytesRefBuilder scratch = new BytesRefBuilder();
|
BytesRefBuilder scratch = new BytesRefBuilder();
|
||||||
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
|
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
|
||||||
|
@ -278,10 +279,10 @@ public class SynonymMap {
|
||||||
|
|
||||||
scratch.setLength(scratchOutput.getPosition());
|
scratch.setLength(scratchOutput.getPosition());
|
||||||
//System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
|
//System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
|
||||||
builder.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
|
fstCompiler.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
|
||||||
}
|
}
|
||||||
|
|
||||||
FST<BytesRef> fst = builder.finish();
|
FST<BytesRef> fst = fstCompiler.compile();
|
||||||
return new SynonymMap(fst, words, maxHorizontalContext);
|
return new SynonymMap(fst, words, maxHorizontalContext);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,7 +30,7 @@ import org.apache.lucene.util.CharsRef;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.CharSequenceOutputs;
|
import org.apache.lucene.util.fst.CharSequenceOutputs;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.Outputs;
|
import org.apache.lucene.util.fst.Outputs;
|
||||||
|
@ -196,26 +196,26 @@ public class TestDictionary extends LuceneTestCase {
|
||||||
|
|
||||||
public void testReplacements() throws Exception {
|
public void testReplacements() throws Exception {
|
||||||
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
|
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
|
||||||
Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
|
FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
|
||||||
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
||||||
|
|
||||||
// a -> b
|
// a -> b
|
||||||
Util.toUTF16("a", scratchInts);
|
Util.toUTF16("a", scratchInts);
|
||||||
builder.add(scratchInts.get(), new CharsRef("b"));
|
fstCompiler.add(scratchInts.get(), new CharsRef("b"));
|
||||||
|
|
||||||
// ab -> c
|
// ab -> c
|
||||||
Util.toUTF16("ab", scratchInts);
|
Util.toUTF16("ab", scratchInts);
|
||||||
builder.add(scratchInts.get(), new CharsRef("c"));
|
fstCompiler.add(scratchInts.get(), new CharsRef("c"));
|
||||||
|
|
||||||
// c -> de
|
// c -> de
|
||||||
Util.toUTF16("c", scratchInts);
|
Util.toUTF16("c", scratchInts);
|
||||||
builder.add(scratchInts.get(), new CharsRef("de"));
|
fstCompiler.add(scratchInts.get(), new CharsRef("de"));
|
||||||
|
|
||||||
// def -> gh
|
// def -> gh
|
||||||
Util.toUTF16("def", scratchInts);
|
Util.toUTF16("def", scratchInts);
|
||||||
builder.add(scratchInts.get(), new CharsRef("gh"));
|
fstCompiler.add(scratchInts.get(), new CharsRef("gh"));
|
||||||
|
|
||||||
FST<CharsRef> fst = builder.finish();
|
FST<CharsRef> fst = fstCompiler.compile();
|
||||||
|
|
||||||
StringBuilder sb = new StringBuilder("atestanother");
|
StringBuilder sb = new StringBuilder("atestanother");
|
||||||
Dictionary.applyMappings(fst, sb);
|
Dictionary.applyMappings(fst, sb);
|
||||||
|
|
|
@ -29,7 +29,7 @@ import java.util.TreeMap;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.ja.util.CSVUtil;
|
import org.apache.lucene.analysis.ja.util.CSVUtil;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||||
|
|
||||||
|
@ -99,7 +99,7 @@ public final class UserDictionary implements Dictionary {
|
||||||
List<int[]> segmentations = new ArrayList<>(featureEntries.size());
|
List<int[]> segmentations = new ArrayList<>(featureEntries.size());
|
||||||
|
|
||||||
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
||||||
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
|
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
|
||||||
IntsRefBuilder scratch = new IntsRefBuilder();
|
IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
long ord = 0;
|
long ord = 0;
|
||||||
|
|
||||||
|
@ -136,11 +136,11 @@ public final class UserDictionary implements Dictionary {
|
||||||
for (int i = 0; i < token.length(); i++) {
|
for (int i = 0; i < token.length(); i++) {
|
||||||
scratch.setIntAt(i, (int) token.charAt(i));
|
scratch.setIntAt(i, (int) token.charAt(i));
|
||||||
}
|
}
|
||||||
fstBuilder.add(scratch.get(), ord);
|
fstCompiler.add(scratch.get(), ord);
|
||||||
segmentations.add(wordIdAndLength);
|
segmentations.add(wordIdAndLength);
|
||||||
ord++;
|
ord++;
|
||||||
}
|
}
|
||||||
this.fst = new TokenInfoFST(fstBuilder.finish(), false);
|
this.fst = new TokenInfoFST(fstCompiler.compile(), false);
|
||||||
this.data = data.toArray(new String[data.size()]);
|
this.data = data.toArray(new String[data.size()]);
|
||||||
this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
|
this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,7 +31,7 @@ import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
|
import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||||
|
|
||||||
|
@ -97,7 +97,7 @@ class TokenInfoDictionaryBuilder {
|
||||||
lines.sort(Comparator.comparing(entry -> entry[0]));
|
lines.sort(Comparator.comparing(entry -> entry[0]));
|
||||||
|
|
||||||
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
||||||
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
|
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
|
||||||
IntsRefBuilder scratch = new IntsRefBuilder();
|
IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
long ord = -1; // first ord will be 0
|
long ord = -1; // first ord will be 0
|
||||||
String lastValue = null;
|
String lastValue = null;
|
||||||
|
@ -120,12 +120,12 @@ class TokenInfoDictionaryBuilder {
|
||||||
for (int i = 0; i < token.length(); i++) {
|
for (int i = 0; i < token.length(); i++) {
|
||||||
scratch.setIntAt(i, (int) token.charAt(i));
|
scratch.setIntAt(i, (int) token.charAt(i));
|
||||||
}
|
}
|
||||||
fstBuilder.add(scratch.get(), ord);
|
fstCompiler.add(scratch.get(), ord);
|
||||||
}
|
}
|
||||||
dictionary.addMapping((int) ord, offset);
|
dictionary.addMapping((int) ord, offset);
|
||||||
offset = next;
|
offset = next;
|
||||||
}
|
}
|
||||||
dictionary.setFST(fstBuilder.finish());
|
dictionary.setFST(fstCompiler.compile());
|
||||||
return dictionary;
|
return dictionary;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -25,7 +25,7 @@ import java.util.List;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.ko.POS;
|
import org.apache.lucene.analysis.ko.POS;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||||
|
|
||||||
|
@ -83,7 +83,7 @@ public final class UserDictionary implements Dictionary {
|
||||||
entries.sort(Comparator.comparing(e -> e.split("\\s+")[0]));
|
entries.sort(Comparator.comparing(e -> e.split("\\s+")[0]));
|
||||||
|
|
||||||
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
||||||
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
|
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
|
||||||
IntsRefBuilder scratch = new IntsRefBuilder();
|
IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
|
|
||||||
String lastToken = null;
|
String lastToken = null;
|
||||||
|
@ -129,11 +129,11 @@ public final class UserDictionary implements Dictionary {
|
||||||
for (int i = 0; i < token.length(); i++) {
|
for (int i = 0; i < token.length(); i++) {
|
||||||
scratch.setIntAt(i, token.charAt(i));
|
scratch.setIntAt(i, token.charAt(i));
|
||||||
}
|
}
|
||||||
fstBuilder.add(scratch.get(), ord);
|
fstCompiler.add(scratch.get(), ord);
|
||||||
lastToken = token;
|
lastToken = token;
|
||||||
ord ++;
|
ord ++;
|
||||||
}
|
}
|
||||||
this.fst = new TokenInfoFST(fstBuilder.finish());
|
this.fst = new TokenInfoFST(fstCompiler.compile());
|
||||||
this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
|
this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
|
||||||
this.rightIds = new short[rightIds.size()];
|
this.rightIds = new short[rightIds.size()];
|
||||||
for (int i = 0; i < rightIds.size(); i++) {
|
for (int i = 0; i < rightIds.size(); i++) {
|
||||||
|
|
|
@ -30,7 +30,7 @@ import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
|
||||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||||
|
@ -90,7 +90,7 @@ class TokenInfoDictionaryBuilder {
|
||||||
lines.sort(Comparator.comparing(left -> left[0]));
|
lines.sort(Comparator.comparing(left -> left[0]));
|
||||||
|
|
||||||
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
||||||
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
|
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
|
||||||
IntsRefBuilder scratch = new IntsRefBuilder();
|
IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
long ord = -1; // first ord will be 0
|
long ord = -1; // first ord will be 0
|
||||||
String lastValue = null;
|
String lastValue = null;
|
||||||
|
@ -116,12 +116,12 @@ class TokenInfoDictionaryBuilder {
|
||||||
for (int i = 0; i < surfaceForm.length(); i++) {
|
for (int i = 0; i < surfaceForm.length(); i++) {
|
||||||
scratch.setIntAt(i, surfaceForm.charAt(i));
|
scratch.setIntAt(i, surfaceForm.charAt(i));
|
||||||
}
|
}
|
||||||
fstBuilder.add(scratch.get(), ord);
|
fstCompiler.add(scratch.get(), ord);
|
||||||
}
|
}
|
||||||
dictionary.addMapping((int) ord, offset);
|
dictionary.addMapping((int) ord, offset);
|
||||||
offset = next;
|
offset = next;
|
||||||
}
|
}
|
||||||
dictionary.setFST(fstBuilder.finish());
|
dictionary.setFST(fstCompiler.compile());
|
||||||
return dictionary;
|
return dictionary;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,7 +41,7 @@ import org.apache.lucene.search.WildcardQuery;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.BytesRefBuilder;
|
import org.apache.lucene.util.BytesRefBuilder;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||||
import org.apache.lucene.util.fst.Util;
|
import org.apache.lucene.util.fst.Util;
|
||||||
|
@ -183,15 +183,15 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
|
||||||
|
|
||||||
private void updateFST(SortedMap<String, Double> weights) throws IOException {
|
private void updateFST(SortedMap<String, Double> weights) throws IOException {
|
||||||
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||||
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
BytesRefBuilder scratchBytes = new BytesRefBuilder();
|
BytesRefBuilder scratchBytes = new BytesRefBuilder();
|
||||||
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
||||||
for (Map.Entry<String, Double> entry : weights.entrySet()) {
|
for (Map.Entry<String, Double> entry : weights.entrySet()) {
|
||||||
scratchBytes.copyChars(entry.getKey());
|
scratchBytes.copyChars(entry.getKey());
|
||||||
fstBuilder.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry
|
fstCompiler.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry
|
||||||
.getValue().longValue());
|
.getValue().longValue());
|
||||||
}
|
}
|
||||||
fst = fstBuilder.finish();
|
fst = fstCompiler.compile();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@ import org.apache.lucene.util.BytesRefBuilder;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||||
import org.apache.lucene.util.fst.Util;
|
import org.apache.lucene.util.fst.Util;
|
||||||
|
@ -219,7 +219,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
private class FSTFieldWriter extends FieldWriter {
|
private class FSTFieldWriter extends FieldWriter {
|
||||||
private final Builder<Long> fstBuilder;
|
private final FSTCompiler<Long> fstCompiler;
|
||||||
private final PositiveIntOutputs fstOutputs;
|
private final PositiveIntOutputs fstOutputs;
|
||||||
private final long startTermsFilePointer;
|
private final long startTermsFilePointer;
|
||||||
|
|
||||||
|
@ -233,12 +233,12 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||||
public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
|
public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
|
||||||
this.fieldInfo = fieldInfo;
|
this.fieldInfo = fieldInfo;
|
||||||
fstOutputs = PositiveIntOutputs.getSingleton();
|
fstOutputs = PositiveIntOutputs.getSingleton();
|
||||||
fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
|
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
|
||||||
indexStart = out.getFilePointer();
|
indexStart = out.getFilePointer();
|
||||||
////System.out.println("VGW: field=" + fieldInfo.name);
|
////System.out.println("VGW: field=" + fieldInfo.name);
|
||||||
|
|
||||||
// Always put empty string in
|
// Always put empty string in
|
||||||
fstBuilder.add(new IntsRef(), termsFilePointer);
|
fstCompiler.add(new IntsRef(), termsFilePointer);
|
||||||
startTermsFilePointer = termsFilePointer;
|
startTermsFilePointer = termsFilePointer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -269,7 +269,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||||
final int lengthSave = text.length;
|
final int lengthSave = text.length;
|
||||||
text.length = indexedTermPrefixLength(lastTerm.get(), text);
|
text.length = indexedTermPrefixLength(lastTerm.get(), text);
|
||||||
try {
|
try {
|
||||||
fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
|
fstCompiler.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
|
||||||
} finally {
|
} finally {
|
||||||
text.length = lengthSave;
|
text.length = lengthSave;
|
||||||
}
|
}
|
||||||
|
@ -278,7 +278,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void finish(long termsFilePointer) throws IOException {
|
public void finish(long termsFilePointer) throws IOException {
|
||||||
fst = fstBuilder.finish();
|
fst = fstCompiler.compile();
|
||||||
if (fst != null) {
|
if (fst != null) {
|
||||||
fst.save(out);
|
fst.save(out);
|
||||||
}
|
}
|
||||||
|
|
|
@ -45,7 +45,7 @@ import org.apache.lucene.util.FixedBitSet;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.StringHelper;
|
import org.apache.lucene.util.StringHelper;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.BytesRefFSTEnum;
|
import org.apache.lucene.util.fst.BytesRefFSTEnum;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.Util;
|
import org.apache.lucene.util.fst.Util;
|
||||||
|
@ -361,16 +361,14 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final Builder<Output> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
|
final FSTCompiler<Output> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS).shouldShareNonSingletonNodes(false).build();
|
||||||
0, 0, true, false, Integer.MAX_VALUE,
|
|
||||||
FST_OUTPUTS, true, 15);
|
|
||||||
//if (DEBUG) {
|
//if (DEBUG) {
|
||||||
// System.out.println(" compile index for prefix=" + prefix);
|
// System.out.println(" compile index for prefix=" + prefix);
|
||||||
//}
|
//}
|
||||||
//indexBuilder.DEBUG = false;
|
//indexBuilder.DEBUG = false;
|
||||||
final byte[] bytes = scratchBytes.toArrayCopy();
|
final byte[] bytes = scratchBytes.toArrayCopy();
|
||||||
assert bytes.length > 0;
|
assert bytes.length > 0;
|
||||||
indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef),
|
fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef),
|
||||||
FST_OUTPUTS.newOutput(new BytesRef(bytes, 0, bytes.length),
|
FST_OUTPUTS.newOutput(new BytesRef(bytes, 0, bytes.length),
|
||||||
0, Long.MAX_VALUE-(sumTotalTermCount-1)));
|
0, Long.MAX_VALUE-(sumTotalTermCount-1)));
|
||||||
scratchBytes.reset();
|
scratchBytes.reset();
|
||||||
|
@ -381,7 +379,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
|
||||||
for(PendingBlock block : blocks) {
|
for(PendingBlock block : blocks) {
|
||||||
if (block.subIndices != null) {
|
if (block.subIndices != null) {
|
||||||
for(SubIndex subIndex : block.subIndices) {
|
for(SubIndex subIndex : block.subIndices) {
|
||||||
append(indexBuilder, subIndex.index, termOrdOffset + subIndex.termOrdStart, scratchIntsRef);
|
append(fstCompiler, subIndex.index, termOrdOffset + subIndex.termOrdStart, scratchIntsRef);
|
||||||
}
|
}
|
||||||
block.subIndices = null;
|
block.subIndices = null;
|
||||||
}
|
}
|
||||||
|
@ -391,7 +389,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
|
||||||
|
|
||||||
assert sumTotalTermCount == totFloorTermCount;
|
assert sumTotalTermCount == totFloorTermCount;
|
||||||
|
|
||||||
index = indexBuilder.finish();
|
index = fstCompiler.compile();
|
||||||
assert subIndices == null;
|
assert subIndices == null;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -405,7 +403,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
|
||||||
// TODO: maybe we could add bulk-add method to
|
// TODO: maybe we could add bulk-add method to
|
||||||
// Builder? Takes FST and unions it w/ current
|
// Builder? Takes FST and unions it w/ current
|
||||||
// FST.
|
// FST.
|
||||||
private void append(Builder<Output> builder, FST<Output> subIndex, long termOrdOffset, IntsRefBuilder scratchIntsRef) throws IOException {
|
private void append(FSTCompiler<Output> fstCompiler, FST<Output> subIndex, long termOrdOffset, IntsRefBuilder scratchIntsRef) throws IOException {
|
||||||
final BytesRefFSTEnum<Output> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
|
final BytesRefFSTEnum<Output> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
|
||||||
BytesRefFSTEnum.InputOutput<Output> indexEnt;
|
BytesRefFSTEnum.InputOutput<Output> indexEnt;
|
||||||
while ((indexEnt = subIndexEnum.next()) != null) {
|
while ((indexEnt = subIndexEnum.next()) != null) {
|
||||||
|
@ -416,7 +414,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
|
||||||
//long blockTermCount = output.endOrd - output.startOrd + 1;
|
//long blockTermCount = output.endOrd - output.startOrd + 1;
|
||||||
Output newOutput = FST_OUTPUTS.newOutput(output.bytes, termOrdOffset+output.startOrd, output.endOrd-termOrdOffset);
|
Output newOutput = FST_OUTPUTS.newOutput(output.bytes, termOrdOffset+output.startOrd, output.endOrd-termOrdOffset);
|
||||||
//System.out.println(" append sub=" + indexEnt.input + " output=" + indexEnt.output + " termOrdOffset=" + termOrdOffset + " blockTermCount=" + blockTermCount + " newOutput=" + newOutput + " endOrd=" + (termOrdOffset+Long.MAX_VALUE-output.endOrd));
|
//System.out.println(" append sub=" + indexEnt.input + " output=" + indexEnt.output + " termOrdOffset=" + termOrdOffset + " blockTermCount=" + blockTermCount + " newOutput=" + newOutput + " endOrd=" + (termOrdOffset+Long.MAX_VALUE-output.endOrd));
|
||||||
builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), newOutput);
|
fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), newOutput);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,7 +41,7 @@ import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.FixedBitSet;
|
import org.apache.lucene.util.FixedBitSet;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||||
import org.apache.lucene.util.fst.Util;
|
import org.apache.lucene.util.fst.Util;
|
||||||
|
@ -287,7 +287,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
|
||||||
}
|
}
|
||||||
|
|
||||||
final class TermsWriter {
|
final class TermsWriter {
|
||||||
private final Builder<Long> builder;
|
private final FSTCompiler<Long> fstCompiler;
|
||||||
private final PositiveIntOutputs outputs;
|
private final PositiveIntOutputs outputs;
|
||||||
private final FieldInfo fieldInfo;
|
private final FieldInfo fieldInfo;
|
||||||
private final int longsSize;
|
private final int longsSize;
|
||||||
|
@ -311,7 +311,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
|
||||||
this.fieldInfo = fieldInfo;
|
this.fieldInfo = fieldInfo;
|
||||||
this.longsSize = postingsWriter.setField(fieldInfo);
|
this.longsSize = postingsWriter.setField(fieldInfo);
|
||||||
this.outputs = PositiveIntOutputs.getSingleton();
|
this.outputs = PositiveIntOutputs.getSingleton();
|
||||||
this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
this.lastBlockStatsFP = 0;
|
this.lastBlockStatsFP = 0;
|
||||||
this.lastBlockMetaLongsFP = 0;
|
this.lastBlockMetaLongsFP = 0;
|
||||||
|
@ -346,7 +346,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
|
||||||
}
|
}
|
||||||
metaLongsOut.writeVLong(metaBytesOut.size() - lastMetaBytesFP);
|
metaLongsOut.writeVLong(metaBytesOut.size() - lastMetaBytesFP);
|
||||||
|
|
||||||
builder.add(Util.toIntsRef(text, scratchTerm), numTerms);
|
fstCompiler.add(Util.toIntsRef(text, scratchTerm), numTerms);
|
||||||
numTerms++;
|
numTerms++;
|
||||||
|
|
||||||
lastMetaBytesFP = metaBytesOut.size();
|
lastMetaBytesFP = metaBytesOut.size();
|
||||||
|
@ -365,7 +365,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer {
|
||||||
metadata.statsOut = statsOut;
|
metadata.statsOut = statsOut;
|
||||||
metadata.metaLongsOut = metaLongsOut;
|
metadata.metaLongsOut = metaLongsOut;
|
||||||
metadata.metaBytesOut = metaBytesOut;
|
metadata.metaBytesOut = metaBytesOut;
|
||||||
metadata.dict = builder.finish();
|
metadata.dict = fstCompiler.compile();
|
||||||
fields.add(metadata);
|
fields.add(metadata);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,7 +41,7 @@ import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.FixedBitSet;
|
import org.apache.lucene.util.FixedBitSet;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.Util;
|
import org.apache.lucene.util.fst.Util;
|
||||||
|
|
||||||
|
@ -247,7 +247,7 @@ public class FSTTermsWriter extends FieldsConsumer {
|
||||||
}
|
}
|
||||||
|
|
||||||
final class TermsWriter {
|
final class TermsWriter {
|
||||||
private final Builder<FSTTermOutputs.TermData> builder;
|
private final FSTCompiler<FSTTermOutputs.TermData> fstCompiler;
|
||||||
private final FSTTermOutputs outputs;
|
private final FSTTermOutputs outputs;
|
||||||
private final FieldInfo fieldInfo;
|
private final FieldInfo fieldInfo;
|
||||||
private final int longsSize;
|
private final int longsSize;
|
||||||
|
@ -261,7 +261,7 @@ public class FSTTermsWriter extends FieldsConsumer {
|
||||||
this.fieldInfo = fieldInfo;
|
this.fieldInfo = fieldInfo;
|
||||||
this.longsSize = postingsWriter.setField(fieldInfo);
|
this.longsSize = postingsWriter.setField(fieldInfo);
|
||||||
this.outputs = new FSTTermOutputs(fieldInfo, longsSize);
|
this.outputs = new FSTTermOutputs(fieldInfo, longsSize);
|
||||||
this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
|
public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
|
||||||
|
@ -276,14 +276,14 @@ public class FSTTermsWriter extends FieldsConsumer {
|
||||||
meta.bytes = metaWriter.toArrayCopy();
|
meta.bytes = metaWriter.toArrayCopy();
|
||||||
metaWriter.reset();
|
metaWriter.reset();
|
||||||
}
|
}
|
||||||
builder.add(Util.toIntsRef(text, scratchTerm), meta);
|
fstCompiler.add(Util.toIntsRef(text, scratchTerm), meta);
|
||||||
numTerms++;
|
numTerms++;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
|
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
|
||||||
// save FST dict
|
// save FST dict
|
||||||
if (numTerms > 0) {
|
if (numTerms > 0) {
|
||||||
final FST<FSTTermOutputs.TermData> fst = builder.finish();
|
final FST<FSTTermOutputs.TermData> fst = fstCompiler.compile();
|
||||||
fields.add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, fst));
|
fields.add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, fst));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -52,7 +52,7 @@ import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.RamUsageEstimator;
|
import org.apache.lucene.util.RamUsageEstimator;
|
||||||
import org.apache.lucene.util.StringHelper;
|
import org.apache.lucene.util.StringHelper;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.BytesRefFSTEnum;
|
import org.apache.lucene.util.fst.BytesRefFSTEnum;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.PairOutputs;
|
import org.apache.lucene.util.fst.PairOutputs;
|
||||||
|
@ -539,11 +539,11 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
|
|
||||||
private void loadTerms() throws IOException {
|
private void loadTerms() throws IOException {
|
||||||
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton();
|
||||||
final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
|
final FSTCompiler<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstCompiler;
|
||||||
final PairOutputs<Long,Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
|
final PairOutputs<Long,Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
|
||||||
final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<>(posIntOutputs,
|
final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<>(posIntOutputs,
|
||||||
outputsInner);
|
outputsInner);
|
||||||
b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
IndexInput in = SimpleTextFieldsReader.this.in.clone();
|
IndexInput in = SimpleTextFieldsReader.this.in.clone();
|
||||||
in.seek(termsStart);
|
in.seek(termsStart);
|
||||||
final BytesRefBuilder lastTerm = new BytesRefBuilder();
|
final BytesRefBuilder lastTerm = new BytesRefBuilder();
|
||||||
|
@ -556,7 +556,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
SimpleTextUtil.readLine(in, scratch);
|
SimpleTextUtil.readLine(in, scratch);
|
||||||
if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) {
|
if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) {
|
||||||
if (lastDocsStart != -1) {
|
if (lastDocsStart != -1) {
|
||||||
b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
|
fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
|
||||||
outputs.newPair(lastDocsStart,
|
outputs.newPair(lastDocsStart,
|
||||||
outputsInner.newPair((long) docFreq, totalTermFreq)));
|
outputsInner.newPair((long) docFreq, totalTermFreq)));
|
||||||
sumTotalTermFreq += totalTermFreq;
|
sumTotalTermFreq += totalTermFreq;
|
||||||
|
@ -574,7 +574,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length()) - 1;
|
totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length()) - 1;
|
||||||
} else if (StringHelper.startsWith(scratch.get(), TERM)) {
|
} else if (StringHelper.startsWith(scratch.get(), TERM)) {
|
||||||
if (lastDocsStart != -1) {
|
if (lastDocsStart != -1) {
|
||||||
b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
|
fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
|
||||||
outputsInner.newPair((long) docFreq, totalTermFreq)));
|
outputsInner.newPair((long) docFreq, totalTermFreq)));
|
||||||
}
|
}
|
||||||
lastDocsStart = in.getFilePointer();
|
lastDocsStart = in.getFilePointer();
|
||||||
|
@ -589,7 +589,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
docCount = visitedDocs.cardinality();
|
docCount = visitedDocs.cardinality();
|
||||||
fst = b.finish();
|
fst = fstCompiler.compile();
|
||||||
/*
|
/*
|
||||||
PrintStream ps = new PrintStream("out.dot");
|
PrintStream ps = new PrintStream("out.dot");
|
||||||
fst.toDot(ps);
|
fst.toDot(ps);
|
||||||
|
|
|
@ -30,6 +30,7 @@ import org.apache.lucene.util.RamUsageEstimator;
|
||||||
import org.apache.lucene.util.StringHelper;
|
import org.apache.lucene.util.StringHelper;
|
||||||
import org.apache.lucene.util.fst.BytesRefFSTEnum;
|
import org.apache.lucene.util.fst.BytesRefFSTEnum;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||||
import org.apache.lucene.util.fst.Util;
|
import org.apache.lucene.util.fst.Util;
|
||||||
|
|
||||||
|
@ -202,19 +203,19 @@ public class FSTDictionary implements IndexDictionary {
|
||||||
*/
|
*/
|
||||||
public static class Builder implements IndexDictionary.Builder {
|
public static class Builder implements IndexDictionary.Builder {
|
||||||
|
|
||||||
protected final org.apache.lucene.util.fst.Builder<Long> fstBuilder;
|
protected final FSTCompiler<Long> fstCompiler;
|
||||||
protected final IntsRefBuilder scratchInts;
|
protected final IntsRefBuilder scratchInts;
|
||||||
|
|
||||||
public Builder() {
|
public Builder() {
|
||||||
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||||
fstBuilder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
scratchInts = new IntsRefBuilder();
|
scratchInts = new IntsRefBuilder();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void add(BytesRef blockKey, long blockFilePointer) {
|
public void add(BytesRef blockKey, long blockFilePointer) {
|
||||||
try {
|
try {
|
||||||
fstBuilder.add(Util.toIntsRef(blockKey, scratchInts), blockFilePointer);
|
fstCompiler.add(Util.toIntsRef(blockKey, scratchInts), blockFilePointer);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
// Should never happen.
|
// Should never happen.
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
|
@ -224,7 +225,7 @@ public class FSTDictionary implements IndexDictionary {
|
||||||
@Override
|
@Override
|
||||||
public FSTDictionary build() {
|
public FSTDictionary build() {
|
||||||
try {
|
try {
|
||||||
return new FSTDictionary(fstBuilder.finish());
|
return new FSTDictionary(fstCompiler.compile());
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
// Should never happen.
|
// Should never happen.
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
|
|
|
@ -44,7 +44,7 @@ import org.apache.lucene.util.FixedBitSet;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.StringHelper;
|
import org.apache.lucene.util.StringHelper;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
||||||
import org.apache.lucene.util.fst.BytesRefFSTEnum;
|
import org.apache.lucene.util.fst.BytesRefFSTEnum;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
@ -454,29 +454,27 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
|
||||||
}
|
}
|
||||||
|
|
||||||
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||||
final Builder<BytesRef> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
|
final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).shouldShareNonSingletonNodes(false).build();
|
||||||
0, 0, true, false, Integer.MAX_VALUE,
|
|
||||||
outputs, true, 15);
|
|
||||||
//if (DEBUG) {
|
//if (DEBUG) {
|
||||||
// System.out.println(" compile index for prefix=" + prefix);
|
// System.out.println(" compile index for prefix=" + prefix);
|
||||||
//}
|
//}
|
||||||
//indexBuilder.DEBUG = false;
|
//indexBuilder.DEBUG = false;
|
||||||
final byte[] bytes = scratchBytes.toArrayCopy();
|
final byte[] bytes = scratchBytes.toArrayCopy();
|
||||||
assert bytes.length > 0;
|
assert bytes.length > 0;
|
||||||
indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
|
fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
|
||||||
scratchBytes.reset();
|
scratchBytes.reset();
|
||||||
|
|
||||||
// Copy over index for all sub-blocks
|
// Copy over index for all sub-blocks
|
||||||
for(PendingBlock block : blocks) {
|
for(PendingBlock block : blocks) {
|
||||||
if (block.subIndices != null) {
|
if (block.subIndices != null) {
|
||||||
for(FST<BytesRef> subIndex : block.subIndices) {
|
for(FST<BytesRef> subIndex : block.subIndices) {
|
||||||
append(indexBuilder, subIndex, scratchIntsRef);
|
append(fstCompiler, subIndex, scratchIntsRef);
|
||||||
}
|
}
|
||||||
block.subIndices = null;
|
block.subIndices = null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
index = indexBuilder.finish();
|
index = fstCompiler.compile();
|
||||||
|
|
||||||
assert subIndices == null;
|
assert subIndices == null;
|
||||||
|
|
||||||
|
@ -491,14 +489,14 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
|
||||||
// TODO: maybe we could add bulk-add method to
|
// TODO: maybe we could add bulk-add method to
|
||||||
// Builder? Takes FST and unions it w/ current
|
// Builder? Takes FST and unions it w/ current
|
||||||
// FST.
|
// FST.
|
||||||
private void append(Builder<BytesRef> builder, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
|
private void append(FSTCompiler<BytesRef> fstCompiler, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
|
||||||
final BytesRefFSTEnum<BytesRef> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
|
final BytesRefFSTEnum<BytesRef> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
|
||||||
BytesRefFSTEnum.InputOutput<BytesRef> indexEnt;
|
BytesRefFSTEnum.InputOutput<BytesRef> indexEnt;
|
||||||
while((indexEnt = subIndexEnum.next()) != null) {
|
while((indexEnt = subIndexEnum.next()) != null) {
|
||||||
//if (DEBUG) {
|
//if (DEBUG) {
|
||||||
// System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
|
// System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
|
||||||
//}
|
//}
|
||||||
builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
|
fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -605,7 +605,7 @@ public final class FST<T> implements Accountable {
|
||||||
|
|
||||||
// serializes new node by appending its bytes to the end
|
// serializes new node by appending its bytes to the end
|
||||||
// of the current byte[]
|
// of the current byte[]
|
||||||
long addNode(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn) throws IOException {
|
long addNode(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
|
||||||
T NO_OUTPUT = outputs.getNoOutput();
|
T NO_OUTPUT = outputs.getNoOutput();
|
||||||
|
|
||||||
//System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs);
|
//System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs);
|
||||||
|
@ -616,28 +616,28 @@ public final class FST<T> implements Accountable {
|
||||||
return NON_FINAL_END_NODE;
|
return NON_FINAL_END_NODE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
final long startAddress = builder.bytes.getPosition();
|
final long startAddress = fstCompiler.bytes.getPosition();
|
||||||
//System.out.println(" startAddr=" + startAddress);
|
//System.out.println(" startAddr=" + startAddress);
|
||||||
|
|
||||||
final boolean doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(builder, nodeIn);
|
final boolean doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(fstCompiler, nodeIn);
|
||||||
if (doFixedLengthArcs) {
|
if (doFixedLengthArcs) {
|
||||||
//System.out.println(" fixed length arcs");
|
//System.out.println(" fixed length arcs");
|
||||||
if (builder.numBytesPerArc.length < nodeIn.numArcs) {
|
if (fstCompiler.numBytesPerArc.length < nodeIn.numArcs) {
|
||||||
builder.numBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, Integer.BYTES)];
|
fstCompiler.numBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, Integer.BYTES)];
|
||||||
builder.numLabelBytesPerArc = new int[builder.numBytesPerArc.length];
|
fstCompiler.numLabelBytesPerArc = new int[fstCompiler.numBytesPerArc.length];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
builder.arcCount += nodeIn.numArcs;
|
fstCompiler.arcCount += nodeIn.numArcs;
|
||||||
|
|
||||||
final int lastArc = nodeIn.numArcs-1;
|
final int lastArc = nodeIn.numArcs-1;
|
||||||
|
|
||||||
long lastArcStart = builder.bytes.getPosition();
|
long lastArcStart = fstCompiler.bytes.getPosition();
|
||||||
int maxBytesPerArc = 0;
|
int maxBytesPerArc = 0;
|
||||||
int maxBytesPerArcWithoutLabel = 0;
|
int maxBytesPerArcWithoutLabel = 0;
|
||||||
for(int arcIdx=0; arcIdx < nodeIn.numArcs; arcIdx++) {
|
for(int arcIdx=0; arcIdx < nodeIn.numArcs; arcIdx++) {
|
||||||
final Builder.Arc<T> arc = nodeIn.arcs[arcIdx];
|
final FSTCompiler.Arc<T> arc = nodeIn.arcs[arcIdx];
|
||||||
final Builder.CompiledNode target = (Builder.CompiledNode) arc.target;
|
final FSTCompiler.CompiledNode target = (FSTCompiler.CompiledNode) arc.target;
|
||||||
int flags = 0;
|
int flags = 0;
|
||||||
//System.out.println(" arc " + arcIdx + " label=" + arc.label + " -> target=" + target.node);
|
//System.out.println(" arc " + arcIdx + " label=" + arc.label + " -> target=" + target.node);
|
||||||
|
|
||||||
|
@ -645,7 +645,7 @@ public final class FST<T> implements Accountable {
|
||||||
flags += BIT_LAST_ARC;
|
flags += BIT_LAST_ARC;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (builder.lastFrozenNode == target.node && !doFixedLengthArcs) {
|
if (fstCompiler.lastFrozenNode == target.node && !doFixedLengthArcs) {
|
||||||
// TODO: for better perf (but more RAM used) we
|
// TODO: for better perf (but more RAM used) we
|
||||||
// could avoid this except when arc is "near" the
|
// could avoid this except when arc is "near" the
|
||||||
// last arc:
|
// last arc:
|
||||||
|
@ -671,36 +671,36 @@ public final class FST<T> implements Accountable {
|
||||||
flags += BIT_ARC_HAS_OUTPUT;
|
flags += BIT_ARC_HAS_OUTPUT;
|
||||||
}
|
}
|
||||||
|
|
||||||
builder.bytes.writeByte((byte) flags);
|
fstCompiler.bytes.writeByte((byte) flags);
|
||||||
long labelStart = builder.bytes.getPosition();
|
long labelStart = fstCompiler.bytes.getPosition();
|
||||||
writeLabel(builder.bytes, arc.label);
|
writeLabel(fstCompiler.bytes, arc.label);
|
||||||
int numLabelBytes = (int) (builder.bytes.getPosition() - labelStart);
|
int numLabelBytes = (int) (fstCompiler.bytes.getPosition() - labelStart);
|
||||||
|
|
||||||
// System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + bytes.getPosition() + " output=" + outputs.outputToString(arc.output));
|
// System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + bytes.getPosition() + " output=" + outputs.outputToString(arc.output));
|
||||||
|
|
||||||
if (arc.output != NO_OUTPUT) {
|
if (arc.output != NO_OUTPUT) {
|
||||||
outputs.write(arc.output, builder.bytes);
|
outputs.write(arc.output, fstCompiler.bytes);
|
||||||
//System.out.println(" write output");
|
//System.out.println(" write output");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (arc.nextFinalOutput != NO_OUTPUT) {
|
if (arc.nextFinalOutput != NO_OUTPUT) {
|
||||||
//System.out.println(" write final output");
|
//System.out.println(" write final output");
|
||||||
outputs.writeFinalOutput(arc.nextFinalOutput, builder.bytes);
|
outputs.writeFinalOutput(arc.nextFinalOutput, fstCompiler.bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) {
|
if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) {
|
||||||
assert target.node > 0;
|
assert target.node > 0;
|
||||||
//System.out.println(" write target");
|
//System.out.println(" write target");
|
||||||
builder.bytes.writeVLong(target.node);
|
fstCompiler.bytes.writeVLong(target.node);
|
||||||
}
|
}
|
||||||
|
|
||||||
// just write the arcs "like normal" on first pass, but record how many bytes each one took
|
// just write the arcs "like normal" on first pass, but record how many bytes each one took
|
||||||
// and max byte size:
|
// and max byte size:
|
||||||
if (doFixedLengthArcs) {
|
if (doFixedLengthArcs) {
|
||||||
int numArcBytes = (int) (builder.bytes.getPosition() - lastArcStart);
|
int numArcBytes = (int) (fstCompiler.bytes.getPosition() - lastArcStart);
|
||||||
builder.numBytesPerArc[arcIdx] = numArcBytes;
|
fstCompiler.numBytesPerArc[arcIdx] = numArcBytes;
|
||||||
builder.numLabelBytesPerArc[arcIdx] = numLabelBytes;
|
fstCompiler.numLabelBytesPerArc[arcIdx] = numLabelBytes;
|
||||||
lastArcStart = builder.bytes.getPosition();
|
lastArcStart = fstCompiler.bytes.getPosition();
|
||||||
maxBytesPerArc = Math.max(maxBytesPerArc, numArcBytes);
|
maxBytesPerArc = Math.max(maxBytesPerArc, numArcBytes);
|
||||||
maxBytesPerArcWithoutLabel = Math.max(maxBytesPerArcWithoutLabel, numArcBytes - numLabelBytes);
|
maxBytesPerArcWithoutLabel = Math.max(maxBytesPerArcWithoutLabel, numArcBytes - numLabelBytes);
|
||||||
//System.out.println(" arcBytes=" + numArcBytes + " labelBytes=" + numLabelBytes);
|
//System.out.println(" arcBytes=" + numArcBytes + " labelBytes=" + numLabelBytes);
|
||||||
|
@ -733,18 +733,18 @@ public final class FST<T> implements Accountable {
|
||||||
|
|
||||||
int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
|
int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
|
||||||
assert labelRange > 0;
|
assert labelRange > 0;
|
||||||
if (shouldExpandNodeWithDirectAddressing(builder, nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) {
|
if (shouldExpandNodeWithDirectAddressing(fstCompiler, nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) {
|
||||||
writeNodeForDirectAddressing(builder, nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange);
|
writeNodeForDirectAddressing(fstCompiler, nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange);
|
||||||
builder.directAddressingNodeCount++;
|
fstCompiler.directAddressingNodeCount++;
|
||||||
} else {
|
} else {
|
||||||
writeNodeForBinarySearch(builder, nodeIn, startAddress, maxBytesPerArc);
|
writeNodeForBinarySearch(fstCompiler, nodeIn, startAddress, maxBytesPerArc);
|
||||||
builder.binarySearchNodeCount++;
|
fstCompiler.binarySearchNodeCount++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final long thisNodeAddress = builder.bytes.getPosition()-1;
|
final long thisNodeAddress = fstCompiler.bytes.getPosition()-1;
|
||||||
builder.bytes.reverse(startAddress, thisNodeAddress);
|
fstCompiler.bytes.reverse(startAddress, thisNodeAddress);
|
||||||
builder.nodeCount++;
|
fstCompiler.nodeCount++;
|
||||||
return thisNodeAddress;
|
return thisNodeAddress;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -757,8 +757,8 @@ public final class FST<T> implements Accountable {
|
||||||
* of bytes, but they allow either binary search or direct addressing on the arcs (instead of linear
|
* of bytes, but they allow either binary search or direct addressing on the arcs (instead of linear
|
||||||
* scan) on lookup by arc label.
|
* scan) on lookup by arc label.
|
||||||
*/
|
*/
|
||||||
private boolean shouldExpandNodeWithFixedLengthArcs(Builder<T> builder, Builder.UnCompiledNode<T> node) {
|
private boolean shouldExpandNodeWithFixedLengthArcs(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> node) {
|
||||||
return builder.allowFixedLengthArcs &&
|
return fstCompiler.allowFixedLengthArcs &&
|
||||||
((node.depth <= FIXED_LENGTH_ARC_SHALLOW_DEPTH && node.numArcs >= FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS) ||
|
((node.depth <= FIXED_LENGTH_ARC_SHALLOW_DEPTH && node.numArcs >= FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS) ||
|
||||||
node.numArcs >= FIXED_LENGTH_ARC_DEEP_NUM_ARCS);
|
node.numArcs >= FIXED_LENGTH_ARC_DEEP_NUM_ARCS);
|
||||||
}
|
}
|
||||||
|
@ -769,18 +769,18 @@ public final class FST<T> implements Accountable {
|
||||||
* Prefer direct addressing for performance if it does not oversize binary search byte size too much,
|
* Prefer direct addressing for performance if it does not oversize binary search byte size too much,
|
||||||
* so that the arcs can be directly addressed by label.
|
* so that the arcs can be directly addressed by label.
|
||||||
*
|
*
|
||||||
* @see Builder#getDirectAddressingMaxOversizingFactor()
|
* @see FSTCompiler#getDirectAddressingMaxOversizingFactor()
|
||||||
*/
|
*/
|
||||||
private boolean shouldExpandNodeWithDirectAddressing(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn,
|
private boolean shouldExpandNodeWithDirectAddressing(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn,
|
||||||
int numBytesPerArc, int maxBytesPerArcWithoutLabel, int labelRange) {
|
int numBytesPerArc, int maxBytesPerArcWithoutLabel, int labelRange) {
|
||||||
// Anticipate precisely the size of the encodings.
|
// Anticipate precisely the size of the encodings.
|
||||||
int sizeForBinarySearch = numBytesPerArc * nodeIn.numArcs;
|
int sizeForBinarySearch = numBytesPerArc * nodeIn.numArcs;
|
||||||
int sizeForDirectAddressing = getNumPresenceBytes(labelRange) + builder.numLabelBytesPerArc[0]
|
int sizeForDirectAddressing = getNumPresenceBytes(labelRange) + fstCompiler.numLabelBytesPerArc[0]
|
||||||
+ maxBytesPerArcWithoutLabel * nodeIn.numArcs;
|
+ maxBytesPerArcWithoutLabel * nodeIn.numArcs;
|
||||||
|
|
||||||
// Determine the allowed oversize compared to binary search.
|
// Determine the allowed oversize compared to binary search.
|
||||||
// This is defined by a parameter of FST Builder (default 1: no oversize).
|
// This is defined by a parameter of FST Builder (default 1: no oversize).
|
||||||
int allowedOversize = (int) (sizeForBinarySearch * builder.getDirectAddressingMaxOversizingFactor());
|
int allowedOversize = (int) (sizeForBinarySearch * fstCompiler.getDirectAddressingMaxOversizingFactor());
|
||||||
int expansionCost = sizeForDirectAddressing - allowedOversize;
|
int expansionCost = sizeForDirectAddressing - allowedOversize;
|
||||||
|
|
||||||
// Select direct addressing if either:
|
// Select direct addressing if either:
|
||||||
|
@ -790,46 +790,46 @@ public final class FST<T> implements Accountable {
|
||||||
// In this case, decrement the credit by the oversize.
|
// In this case, decrement the credit by the oversize.
|
||||||
// In addition, do not try to oversize to a clearly too large node size
|
// In addition, do not try to oversize to a clearly too large node size
|
||||||
// (this is the DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR parameter).
|
// (this is the DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR parameter).
|
||||||
if (expansionCost <= 0 || (builder.directAddressingExpansionCredit >= expansionCost
|
if (expansionCost <= 0 || (fstCompiler.directAddressingExpansionCredit >= expansionCost
|
||||||
&& sizeForDirectAddressing <= allowedOversize * DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR)) {
|
&& sizeForDirectAddressing <= allowedOversize * DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR)) {
|
||||||
builder.directAddressingExpansionCredit -= expansionCost;
|
fstCompiler.directAddressingExpansionCredit -= expansionCost;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void writeNodeForBinarySearch(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArc) {
|
private void writeNodeForBinarySearch(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArc) {
|
||||||
// Build the header in a buffer.
|
// Build the header in a buffer.
|
||||||
// It is a false/special arc which is in fact a node header with node flags followed by node metadata.
|
// It is a false/special arc which is in fact a node header with node flags followed by node metadata.
|
||||||
builder.fixedLengthArcsBuffer
|
fstCompiler.fixedLengthArcsBuffer
|
||||||
.resetPosition()
|
.resetPosition()
|
||||||
.writeByte(ARCS_FOR_BINARY_SEARCH)
|
.writeByte(ARCS_FOR_BINARY_SEARCH)
|
||||||
.writeVInt(nodeIn.numArcs)
|
.writeVInt(nodeIn.numArcs)
|
||||||
.writeVInt(maxBytesPerArc);
|
.writeVInt(maxBytesPerArc);
|
||||||
int headerLen = builder.fixedLengthArcsBuffer.getPosition();
|
int headerLen = fstCompiler.fixedLengthArcsBuffer.getPosition();
|
||||||
|
|
||||||
// Expand the arcs in place, backwards.
|
// Expand the arcs in place, backwards.
|
||||||
long srcPos = builder.bytes.getPosition();
|
long srcPos = fstCompiler.bytes.getPosition();
|
||||||
long destPos = startAddress + headerLen + nodeIn.numArcs * maxBytesPerArc;
|
long destPos = startAddress + headerLen + nodeIn.numArcs * maxBytesPerArc;
|
||||||
assert destPos >= srcPos;
|
assert destPos >= srcPos;
|
||||||
if (destPos > srcPos) {
|
if (destPos > srcPos) {
|
||||||
builder.bytes.skipBytes((int) (destPos - srcPos));
|
fstCompiler.bytes.skipBytes((int) (destPos - srcPos));
|
||||||
for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) {
|
for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) {
|
||||||
destPos -= maxBytesPerArc;
|
destPos -= maxBytesPerArc;
|
||||||
int arcLen = builder.numBytesPerArc[arcIdx];
|
int arcLen = fstCompiler.numBytesPerArc[arcIdx];
|
||||||
srcPos -= arcLen;
|
srcPos -= arcLen;
|
||||||
if (srcPos != destPos) {
|
if (srcPos != destPos) {
|
||||||
assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " arcLen=" + arcLen + " nodeIn.numArcs=" + nodeIn.numArcs;
|
assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " arcLen=" + arcLen + " nodeIn.numArcs=" + nodeIn.numArcs;
|
||||||
builder.bytes.copyBytes(srcPos, destPos, arcLen);
|
fstCompiler.bytes.copyBytes(srcPos, destPos, arcLen);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write the header.
|
// Write the header.
|
||||||
builder.bytes.writeBytes(startAddress, builder.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
|
fstCompiler.bytes.writeBytes(startAddress, fstCompiler.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void writeNodeForDirectAddressing(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArcWithoutLabel, int labelRange) {
|
private void writeNodeForDirectAddressing(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArcWithoutLabel, int labelRange) {
|
||||||
// Expand the arcs backwards in a buffer because we remove the labels.
|
// Expand the arcs backwards in a buffer because we remove the labels.
|
||||||
// So the obtained arcs might occupy less space. This is the reason why this
|
// So the obtained arcs might occupy less space. This is the reason why this
|
||||||
// whole method is more complex.
|
// whole method is more complex.
|
||||||
|
@ -837,64 +837,64 @@ public final class FST<T> implements Accountable {
|
||||||
// the presence bits, and the first label. Keep the first label.
|
// the presence bits, and the first label. Keep the first label.
|
||||||
int headerMaxLen = 11;
|
int headerMaxLen = 11;
|
||||||
int numPresenceBytes = getNumPresenceBytes(labelRange);
|
int numPresenceBytes = getNumPresenceBytes(labelRange);
|
||||||
long srcPos = builder.bytes.getPosition();
|
long srcPos = fstCompiler.bytes.getPosition();
|
||||||
int totalArcBytes = builder.numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel;
|
int totalArcBytes = fstCompiler.numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel;
|
||||||
int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes;
|
int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes;
|
||||||
byte[] buffer = builder.fixedLengthArcsBuffer.ensureCapacity(bufferOffset).getBytes();
|
byte[] buffer = fstCompiler.fixedLengthArcsBuffer.ensureCapacity(bufferOffset).getBytes();
|
||||||
// Copy the arcs to the buffer, dropping all labels except first one.
|
// Copy the arcs to the buffer, dropping all labels except first one.
|
||||||
for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) {
|
for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) {
|
||||||
bufferOffset -= maxBytesPerArcWithoutLabel;
|
bufferOffset -= maxBytesPerArcWithoutLabel;
|
||||||
int srcArcLen = builder.numBytesPerArc[arcIdx];
|
int srcArcLen = fstCompiler.numBytesPerArc[arcIdx];
|
||||||
srcPos -= srcArcLen;
|
srcPos -= srcArcLen;
|
||||||
int labelLen = builder.numLabelBytesPerArc[arcIdx];
|
int labelLen = fstCompiler.numLabelBytesPerArc[arcIdx];
|
||||||
// Copy the flags.
|
// Copy the flags.
|
||||||
builder.bytes.copyBytes(srcPos, buffer, bufferOffset, 1);
|
fstCompiler.bytes.copyBytes(srcPos, buffer, bufferOffset, 1);
|
||||||
// Skip the label, copy the remaining.
|
// Skip the label, copy the remaining.
|
||||||
int remainingArcLen = srcArcLen - 1 - labelLen;
|
int remainingArcLen = srcArcLen - 1 - labelLen;
|
||||||
if (remainingArcLen != 0) {
|
if (remainingArcLen != 0) {
|
||||||
builder.bytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen);
|
fstCompiler.bytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen);
|
||||||
}
|
}
|
||||||
if (arcIdx == 0) {
|
if (arcIdx == 0) {
|
||||||
// Copy the label of the first arc only.
|
// Copy the label of the first arc only.
|
||||||
bufferOffset -= labelLen;
|
bufferOffset -= labelLen;
|
||||||
builder.bytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen);
|
fstCompiler.bytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
assert bufferOffset == headerMaxLen + numPresenceBytes;
|
assert bufferOffset == headerMaxLen + numPresenceBytes;
|
||||||
|
|
||||||
// Build the header in the buffer.
|
// Build the header in the buffer.
|
||||||
// It is a false/special arc which is in fact a node header with node flags followed by node metadata.
|
// It is a false/special arc which is in fact a node header with node flags followed by node metadata.
|
||||||
builder.fixedLengthArcsBuffer
|
fstCompiler.fixedLengthArcsBuffer
|
||||||
.resetPosition()
|
.resetPosition()
|
||||||
.writeByte(ARCS_FOR_DIRECT_ADDRESSING)
|
.writeByte(ARCS_FOR_DIRECT_ADDRESSING)
|
||||||
.writeVInt(labelRange) // labelRange instead of numArcs.
|
.writeVInt(labelRange) // labelRange instead of numArcs.
|
||||||
.writeVInt(maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc.
|
.writeVInt(maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc.
|
||||||
int headerLen = builder.fixedLengthArcsBuffer.getPosition();
|
int headerLen = fstCompiler.fixedLengthArcsBuffer.getPosition();
|
||||||
|
|
||||||
// Prepare the builder byte store. Enlarge or truncate if needed.
|
// Prepare the builder byte store. Enlarge or truncate if needed.
|
||||||
long nodeEnd = startAddress + headerLen + numPresenceBytes + totalArcBytes;
|
long nodeEnd = startAddress + headerLen + numPresenceBytes + totalArcBytes;
|
||||||
long currentPosition = builder.bytes.getPosition();
|
long currentPosition = fstCompiler.bytes.getPosition();
|
||||||
if (nodeEnd >= currentPosition) {
|
if (nodeEnd >= currentPosition) {
|
||||||
builder.bytes.skipBytes((int) (nodeEnd - currentPosition));
|
fstCompiler.bytes.skipBytes((int) (nodeEnd - currentPosition));
|
||||||
} else {
|
} else {
|
||||||
builder.bytes.truncate(nodeEnd);
|
fstCompiler.bytes.truncate(nodeEnd);
|
||||||
}
|
}
|
||||||
assert builder.bytes.getPosition() == nodeEnd;
|
assert fstCompiler.bytes.getPosition() == nodeEnd;
|
||||||
|
|
||||||
// Write the header.
|
// Write the header.
|
||||||
long writeOffset = startAddress;
|
long writeOffset = startAddress;
|
||||||
builder.bytes.writeBytes(writeOffset, builder.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
|
fstCompiler.bytes.writeBytes(writeOffset, fstCompiler.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
|
||||||
writeOffset += headerLen;
|
writeOffset += headerLen;
|
||||||
|
|
||||||
// Write the presence bits
|
// Write the presence bits
|
||||||
writePresenceBits(builder, nodeIn, writeOffset, numPresenceBytes);
|
writePresenceBits(fstCompiler, nodeIn, writeOffset, numPresenceBytes);
|
||||||
writeOffset += numPresenceBytes;
|
writeOffset += numPresenceBytes;
|
||||||
|
|
||||||
// Write the first label and the arcs.
|
// Write the first label and the arcs.
|
||||||
builder.bytes.writeBytes(writeOffset, builder.fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes);
|
fstCompiler.bytes.writeBytes(writeOffset, fstCompiler.fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void writePresenceBits(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long dest, int numPresenceBytes) {
|
private void writePresenceBits(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn, long dest, int numPresenceBytes) {
|
||||||
long bytePos = dest;
|
long bytePos = dest;
|
||||||
byte presenceBits = 1; // The first arc is always present.
|
byte presenceBits = 1; // The first arc is always present.
|
||||||
int presenceIndex = 0;
|
int presenceIndex = 0;
|
||||||
|
@ -904,7 +904,7 @@ public final class FST<T> implements Accountable {
|
||||||
assert label > previousLabel;
|
assert label > previousLabel;
|
||||||
presenceIndex += label - previousLabel;
|
presenceIndex += label - previousLabel;
|
||||||
while (presenceIndex >= Byte.SIZE) {
|
while (presenceIndex >= Byte.SIZE) {
|
||||||
builder.bytes.writeByte(bytePos++, presenceBits);
|
fstCompiler.bytes.writeByte(bytePos++, presenceBits);
|
||||||
presenceBits = 0;
|
presenceBits = 0;
|
||||||
presenceIndex -= Byte.SIZE;
|
presenceIndex -= Byte.SIZE;
|
||||||
}
|
}
|
||||||
|
@ -915,7 +915,7 @@ public final class FST<T> implements Accountable {
|
||||||
assert presenceIndex == (nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label) % 8;
|
assert presenceIndex == (nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label) % 8;
|
||||||
assert presenceBits != 0; // The last byte is not 0.
|
assert presenceBits != 0; // The last byte is not 0.
|
||||||
assert (presenceBits & (1 << presenceIndex)) != 0; // The last arc is always present.
|
assert (presenceBits & (1 << presenceIndex)) != 0; // The last arc is always present.
|
||||||
builder.bytes.writeByte(bytePos++, presenceBits);
|
fstCompiler.bytes.writeByte(bytePos++, presenceBits);
|
||||||
assert bytePos - dest == numPresenceBytes;
|
assert bytePos - dest == numPresenceBytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -49,31 +49,9 @@ import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class Builder<T> {
|
public class FSTCompiler<T> {
|
||||||
|
|
||||||
/**
|
static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1f;
|
||||||
* Default oversizing factor used to decide whether to encode a node with direct addressing or binary search.
|
|
||||||
* Default is 1: ensure no oversizing on average.
|
|
||||||
* <p>
|
|
||||||
* This factor does not determine whether to encode a node with a list of variable length arcs or with
|
|
||||||
* fixed length arcs. It only determines the effective encoding of a node that is already known to be
|
|
||||||
* encoded with fixed length arcs.
|
|
||||||
* See {@code FST.shouldExpandNodeWithFixedLengthArcs()}
|
|
||||||
* and {@code FST.shouldExpandNodeWithDirectAddressing()}.
|
|
||||||
* <p>
|
|
||||||
* For English words we measured 217K nodes, only 3.27% nodes are encoded with fixed length arcs,
|
|
||||||
* and 99.99% of them with direct addressing. Overall FST memory reduced by 1.67%.
|
|
||||||
* <p>
|
|
||||||
* For worst case we measured 168K nodes, 50% of them are encoded with fixed length arcs,
|
|
||||||
* and 14% of them with direct encoding. Overall FST memory reduced by 0.8%.
|
|
||||||
* <p>
|
|
||||||
* Use {@code TestFstDirectAddressing.main()}
|
|
||||||
* and {@code TestFstDirectAddressing.testWorstCaseForDirectAddressing()}
|
|
||||||
* to evaluate a change.
|
|
||||||
*
|
|
||||||
* @see #setDirectAddressingMaxOversizingFactor
|
|
||||||
*/
|
|
||||||
static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1.0f;
|
|
||||||
|
|
||||||
private final NodeHash<T> dedupHash;
|
private final NodeHash<T> dedupHash;
|
||||||
final FST<T> fst;
|
final FST<T> fst;
|
||||||
|
@ -117,75 +95,29 @@ public class Builder<T> {
|
||||||
long binarySearchNodeCount;
|
long binarySearchNodeCount;
|
||||||
long directAddressingNodeCount;
|
long directAddressingNodeCount;
|
||||||
|
|
||||||
boolean allowFixedLengthArcs;
|
final boolean allowFixedLengthArcs;
|
||||||
float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR;
|
final float directAddressingMaxOversizingFactor;
|
||||||
long directAddressingExpansionCredit;
|
long directAddressingExpansionCredit;
|
||||||
|
|
||||||
BytesStore bytes;
|
final BytesStore bytes;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Instantiates an FST/FSA builder without any pruning. A shortcut to {@link
|
* Instantiates an FST/FSA compiler with default settings and pruning options turned off.
|
||||||
* #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs, boolean, int)} with
|
* For more tuning and tweaking, see {@link Builder}.
|
||||||
* pruning options turned off.
|
|
||||||
*/
|
*/
|
||||||
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
|
public FSTCompiler(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
|
||||||
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, 1f);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
private FSTCompiler(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
|
||||||
* Instantiates an FST/FSA builder with all the possible tuning and construction
|
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
|
||||||
* tweaks. Read parameter documentation carefully.
|
boolean allowFixedLengthArcs, int bytesPageBits, float directAddressingMaxOversizingFactor) {
|
||||||
*
|
|
||||||
* @param inputType
|
|
||||||
* The input type (transition labels). Can be anything from {@link INPUT_TYPE}
|
|
||||||
* enumeration. Shorter types will consume less memory. Strings (character sequences) are
|
|
||||||
* represented as {@link INPUT_TYPE#BYTE4} (full unicode codepoints).
|
|
||||||
*
|
|
||||||
* @param minSuffixCount1
|
|
||||||
* If pruning the input graph during construction, this threshold is used for telling
|
|
||||||
* if a node is kept or pruned. If transition_count(node) >= minSuffixCount1, the node
|
|
||||||
* is kept.
|
|
||||||
*
|
|
||||||
* @param minSuffixCount2
|
|
||||||
* (Note: only Mike McCandless knows what this one is really doing...)
|
|
||||||
*
|
|
||||||
* @param doShareSuffix
|
|
||||||
* If <code>true</code>, the shared suffixes will be compacted into unique paths.
|
|
||||||
* This requires an additional RAM-intensive hash map for lookups in memory. Setting this parameter to
|
|
||||||
* <code>false</code> creates a single suffix path for all input sequences. This will result in a larger
|
|
||||||
* FST, but requires substantially less memory and CPU during building.
|
|
||||||
*
|
|
||||||
* @param doShareNonSingletonNodes
|
|
||||||
* Only used if doShareSuffix is true. Set this to
|
|
||||||
* true to ensure FST is fully minimal, at cost of more
|
|
||||||
* CPU and more RAM during building.
|
|
||||||
*
|
|
||||||
* @param shareMaxTailLength
|
|
||||||
* Only used if doShareSuffix is true. Set this to
|
|
||||||
* Integer.MAX_VALUE to ensure FST is fully minimal, at cost of more
|
|
||||||
* CPU and more RAM during building.
|
|
||||||
*
|
|
||||||
* @param outputs The output type for each input sequence. Applies only if building an FST. For
|
|
||||||
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
|
|
||||||
* singleton output object.
|
|
||||||
*
|
|
||||||
* @param allowFixedLengthArcs Pass false to disable the fixed length arc optimization (binary search or
|
|
||||||
* direct addressing) while building the FST; this will make the resulting FST smaller but slower to
|
|
||||||
* traverse.
|
|
||||||
*
|
|
||||||
* @param bytesPageBits How many bits wide to make each
|
|
||||||
* byte[] block in the BytesStore; if you know the FST
|
|
||||||
* will be large then make this larger. For example 15
|
|
||||||
* bits = 32768 byte pages.
|
|
||||||
*/
|
|
||||||
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
|
|
||||||
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
|
|
||||||
boolean allowFixedLengthArcs, int bytesPageBits) {
|
|
||||||
this.minSuffixCount1 = minSuffixCount1;
|
this.minSuffixCount1 = minSuffixCount1;
|
||||||
this.minSuffixCount2 = minSuffixCount2;
|
this.minSuffixCount2 = minSuffixCount2;
|
||||||
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
|
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
|
||||||
this.shareMaxTailLength = shareMaxTailLength;
|
this.shareMaxTailLength = shareMaxTailLength;
|
||||||
this.allowFixedLengthArcs = allowFixedLengthArcs;
|
this.allowFixedLengthArcs = allowFixedLengthArcs;
|
||||||
|
this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor;
|
||||||
fst = new FST<>(inputType, outputs, bytesPageBits);
|
fst = new FST<>(inputType, outputs, bytesPageBits);
|
||||||
bytes = fst.bytes;
|
bytes = fst.bytes;
|
||||||
assert bytes != null;
|
assert bytes != null;
|
||||||
|
@ -205,22 +137,145 @@ public class Builder<T> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Overrides the default the maximum oversizing of fixed array allowed to enable direct addressing
|
* Fluent-style builder for {@link FSTCompiler}.
|
||||||
* of arcs instead of binary search.
|
|
||||||
* <p>
|
* <p>
|
||||||
* Setting this factor to a negative value (e.g. -1) effectively disables direct addressing,
|
* Creates an FST/FSA builder with all the possible tuning and construction tweaks.
|
||||||
* only binary search nodes will be created.
|
* Read parameter documentation carefully.
|
||||||
*
|
|
||||||
* @see #DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR
|
|
||||||
*/
|
*/
|
||||||
public Builder<T> setDirectAddressingMaxOversizingFactor(float factor) {
|
public static class Builder<T> {
|
||||||
directAddressingMaxOversizingFactor = factor;
|
|
||||||
return this;
|
private final INPUT_TYPE inputType;
|
||||||
|
private final Outputs<T> outputs;
|
||||||
|
private int minSuffixCount1;
|
||||||
|
private int minSuffixCount2;
|
||||||
|
private boolean shouldShareSuffix = true;
|
||||||
|
private boolean shouldShareNonSingletonNodes = true;
|
||||||
|
private int shareMaxTailLength = Integer.MAX_VALUE;
|
||||||
|
private boolean allowFixedLengthArcs = true;
|
||||||
|
private int bytesPageBits = 15;
|
||||||
|
private float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param inputType The input type (transition labels). Can be anything from {@link INPUT_TYPE}
|
||||||
|
* enumeration. Shorter types will consume less memory. Strings (character sequences) are
|
||||||
|
* represented as {@link INPUT_TYPE#BYTE4} (full unicode codepoints).
|
||||||
|
* @param outputs The output type for each input sequence. Applies only if building an FST. For
|
||||||
|
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
|
||||||
|
* singleton output object.
|
||||||
|
*/
|
||||||
|
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
|
||||||
|
this.inputType = inputType;
|
||||||
|
this.outputs = outputs;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If pruning the input graph during construction, this threshold is used for telling if a node is kept
|
||||||
|
* or pruned. If transition_count(node) >= minSuffixCount1, the node is kept.
|
||||||
|
* <p>
|
||||||
|
* Default = 0.
|
||||||
|
*/
|
||||||
|
public Builder<T> minSuffixCount1(int minSuffixCount1) {
|
||||||
|
this.minSuffixCount1 = minSuffixCount1;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Better pruning: we prune a node (and all following nodes) if the prior node has fewer than this number
|
||||||
|
* of terms going through it.
|
||||||
|
* <p>
|
||||||
|
* Default = 0.
|
||||||
|
*/
|
||||||
|
public Builder<T> minSuffixCount2(int minSuffixCount2) {
|
||||||
|
this.minSuffixCount2 = minSuffixCount2;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If {@code true}, the shared suffixes will be compacted into unique paths.
|
||||||
|
* This requires an additional RAM-intensive hash map for lookups in memory. Setting this parameter to
|
||||||
|
* {@code false} creates a single suffix path for all input sequences. This will result in a larger
|
||||||
|
* FST, but requires substantially less memory and CPU during building.
|
||||||
|
* <p>
|
||||||
|
* Default = {@code true}.
|
||||||
|
*/
|
||||||
|
public Builder<T> shouldShareSuffix(boolean shouldShareSuffix) {
|
||||||
|
this.shouldShareSuffix = shouldShareSuffix;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Only used if {@code shouldShareSuffix} is true. Set this to true to ensure FST is fully minimal,
|
||||||
|
* at cost of more CPU and more RAM during building.
|
||||||
|
* <p>
|
||||||
|
* Default = {@code true}.
|
||||||
|
*/
|
||||||
|
public Builder<T> shouldShareNonSingletonNodes(boolean shouldShareNonSingletonNodes) {
|
||||||
|
this.shouldShareNonSingletonNodes = shouldShareNonSingletonNodes;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Only used if {@code shouldShareSuffix} is true. Set this to Integer.MAX_VALUE to ensure FST is
|
||||||
|
* fully minimal, at cost of more CPU and more RAM during building.
|
||||||
|
* <p>
|
||||||
|
* Default = {@link Integer#MAX_VALUE}.
|
||||||
|
*/
|
||||||
|
public Builder<T> shareMaxTailLength(int shareMaxTailLength) {
|
||||||
|
this.shareMaxTailLength = shareMaxTailLength;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Pass {@code false} to disable the fixed length arc optimization (binary search or direct addressing)
|
||||||
|
* while building the FST; this will make the resulting FST smaller but slower to traverse.
|
||||||
|
* <p>
|
||||||
|
* Default = {@code true}.
|
||||||
|
*/
|
||||||
|
public Builder<T> allowFixedLengthArcs(boolean allowFixedLengthArcs) {
|
||||||
|
this.allowFixedLengthArcs = allowFixedLengthArcs;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How many bits wide to make each byte[] block in the BytesStore; if you know the FST
|
||||||
|
* will be large then make this larger. For example 15 bits = 32768 byte pages.
|
||||||
|
* <p>
|
||||||
|
* Default = 15.
|
||||||
|
*/
|
||||||
|
public Builder<T> bytesPageBits(int bytesPageBits) {
|
||||||
|
this.bytesPageBits = bytesPageBits;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Overrides the default maximum oversizing of fixed array allowed to enable direct addressing
|
||||||
|
* of arcs instead of binary search.
|
||||||
|
* <p>
|
||||||
|
* Setting this factor to a negative value (e.g. -1) effectively disables direct addressing,
|
||||||
|
* only binary search nodes will be created.
|
||||||
|
* <p>
|
||||||
|
* This factor does not determine whether to encode a node with a list of variable length arcs or with
|
||||||
|
* fixed length arcs. It only determines the effective encoding of a node that is already known to be
|
||||||
|
* encoded with fixed length arcs.
|
||||||
|
* <p>
|
||||||
|
* Default = 1.
|
||||||
|
*/
|
||||||
|
public Builder<T> directAddressingMaxOversizingFactor(float factor) {
|
||||||
|
this.directAddressingMaxOversizingFactor = factor;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new {@link FSTCompiler}.
|
||||||
|
*/
|
||||||
|
public FSTCompiler<T> build() {
|
||||||
|
FSTCompiler<T> fstCompiler = new FSTCompiler<>(inputType, minSuffixCount1, minSuffixCount2, shouldShareSuffix,
|
||||||
|
shouldShareNonSingletonNodes, shareMaxTailLength, outputs, allowFixedLengthArcs, bytesPageBits,
|
||||||
|
directAddressingMaxOversizingFactor);
|
||||||
|
return fstCompiler;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* @see #setDirectAddressingMaxOversizingFactor(float)
|
|
||||||
*/
|
|
||||||
public float getDirectAddressingMaxOversizingFactor() {
|
public float getDirectAddressingMaxOversizingFactor() {
|
||||||
return directAddressingMaxOversizingFactor;
|
return directAddressingMaxOversizingFactor;
|
||||||
}
|
}
|
||||||
|
@ -514,7 +569,7 @@ public class Builder<T> {
|
||||||
|
|
||||||
/** Returns final FST. NOTE: this will return null if
|
/** Returns final FST. NOTE: this will return null if
|
||||||
* nothing is accepted by the FST. */
|
* nothing is accepted by the FST. */
|
||||||
public FST<T> finish() throws IOException {
|
public FST<T> compile() throws IOException {
|
||||||
|
|
||||||
final UnCompiledNode<T> root = frontier[0];
|
final UnCompiledNode<T> root = frontier[0];
|
||||||
|
|
||||||
|
@ -554,19 +609,19 @@ public class Builder<T> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Expert: holds a pending (seen but not yet serialized) arc. */
|
/** Expert: holds a pending (seen but not yet serialized) arc. */
|
||||||
public static class Arc<T> {
|
static class Arc<T> {
|
||||||
public int label; // really an "unsigned" byte
|
int label; // really an "unsigned" byte
|
||||||
public Node target;
|
Node target;
|
||||||
public boolean isFinal;
|
boolean isFinal;
|
||||||
public T output;
|
T output;
|
||||||
public T nextFinalOutput;
|
T nextFinalOutput;
|
||||||
}
|
}
|
||||||
|
|
||||||
// NOTE: not many instances of Node or CompiledNode are in
|
// NOTE: not many instances of Node or CompiledNode are in
|
||||||
// memory while the FST is being built; it's only the
|
// memory while the FST is being built; it's only the
|
||||||
// current "frontier":
|
// current "frontier":
|
||||||
|
|
||||||
static interface Node {
|
interface Node {
|
||||||
boolean isCompiled();
|
boolean isCompiled();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -583,20 +638,20 @@ public class Builder<T> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Expert: holds a pending (seen but not yet serialized) Node. */
|
/** Expert: holds a pending (seen but not yet serialized) Node. */
|
||||||
public static final class UnCompiledNode<T> implements Node {
|
static final class UnCompiledNode<T> implements Node {
|
||||||
final Builder<T> owner;
|
final FSTCompiler<T> owner;
|
||||||
public int numArcs;
|
int numArcs;
|
||||||
public Arc<T>[] arcs;
|
Arc<T>[] arcs;
|
||||||
// TODO: instead of recording isFinal/output on the
|
// TODO: instead of recording isFinal/output on the
|
||||||
// node, maybe we should use -1 arc to mean "end" (like
|
// node, maybe we should use -1 arc to mean "end" (like
|
||||||
// we do when reading the FST). Would simplify much
|
// we do when reading the FST). Would simplify much
|
||||||
// code here...
|
// code here...
|
||||||
public T output;
|
T output;
|
||||||
public boolean isFinal;
|
boolean isFinal;
|
||||||
public long inputCount;
|
long inputCount;
|
||||||
|
|
||||||
/** This node's depth, starting from the automaton root. */
|
/** This node's depth, starting from the automaton root. */
|
||||||
public final int depth;
|
final int depth;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param depth
|
* @param depth
|
||||||
|
@ -605,7 +660,7 @@ public class Builder<T> {
|
||||||
* fanout size).
|
* fanout size).
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings({"rawtypes","unchecked"})
|
@SuppressWarnings({"rawtypes","unchecked"})
|
||||||
public UnCompiledNode(Builder<T> owner, int depth) {
|
UnCompiledNode(FSTCompiler<T> owner, int depth) {
|
||||||
this.owner = owner;
|
this.owner = owner;
|
||||||
arcs = (Arc<T>[]) new Arc[1];
|
arcs = (Arc<T>[]) new Arc[1];
|
||||||
arcs[0] = new Arc<>();
|
arcs[0] = new Arc<>();
|
||||||
|
@ -618,7 +673,7 @@ public class Builder<T> {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void clear() {
|
void clear() {
|
||||||
numArcs = 0;
|
numArcs = 0;
|
||||||
isFinal = false;
|
isFinal = false;
|
||||||
output = owner.NO_OUTPUT;
|
output = owner.NO_OUTPUT;
|
||||||
|
@ -628,13 +683,13 @@ public class Builder<T> {
|
||||||
// for nodes on the frontier (even when reused).
|
// for nodes on the frontier (even when reused).
|
||||||
}
|
}
|
||||||
|
|
||||||
public T getLastOutput(int labelToMatch) {
|
T getLastOutput(int labelToMatch) {
|
||||||
assert numArcs > 0;
|
assert numArcs > 0;
|
||||||
assert arcs[numArcs-1].label == labelToMatch;
|
assert arcs[numArcs-1].label == labelToMatch;
|
||||||
return arcs[numArcs-1].output;
|
return arcs[numArcs-1].output;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addArc(int label, Node target) {
|
void addArc(int label, Node target) {
|
||||||
assert label >= 0;
|
assert label >= 0;
|
||||||
assert numArcs == 0 || label > arcs[numArcs-1].label: "arc[numArcs-1].label=" + arcs[numArcs-1].label + " new label=" + label + " numArcs=" + numArcs;
|
assert numArcs == 0 || label > arcs[numArcs-1].label: "arc[numArcs-1].label=" + arcs[numArcs-1].label + " new label=" + label + " numArcs=" + numArcs;
|
||||||
if (numArcs == arcs.length) {
|
if (numArcs == arcs.length) {
|
||||||
|
@ -651,7 +706,7 @@ public class Builder<T> {
|
||||||
arc.isFinal = false;
|
arc.isFinal = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) {
|
void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) {
|
||||||
assert numArcs > 0;
|
assert numArcs > 0;
|
||||||
final Arc<T> arc = arcs[numArcs-1];
|
final Arc<T> arc = arcs[numArcs-1];
|
||||||
assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch;
|
assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch;
|
||||||
|
@ -661,14 +716,14 @@ public class Builder<T> {
|
||||||
arc.isFinal = isFinal;
|
arc.isFinal = isFinal;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void deleteLast(int label, Node target) {
|
void deleteLast(int label, Node target) {
|
||||||
assert numArcs > 0;
|
assert numArcs > 0;
|
||||||
assert label == arcs[numArcs-1].label;
|
assert label == arcs[numArcs-1].label;
|
||||||
assert target == arcs[numArcs-1].target;
|
assert target == arcs[numArcs-1].target;
|
||||||
numArcs--;
|
numArcs--;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setLastOutput(int labelToMatch, T newOutput) {
|
void setLastOutput(int labelToMatch, T newOutput) {
|
||||||
assert owner.validOutput(newOutput);
|
assert owner.validOutput(newOutput);
|
||||||
assert numArcs > 0;
|
assert numArcs > 0;
|
||||||
final Arc<T> arc = arcs[numArcs-1];
|
final Arc<T> arc = arcs[numArcs-1];
|
||||||
|
@ -677,7 +732,7 @@ public class Builder<T> {
|
||||||
}
|
}
|
||||||
|
|
||||||
// pushes an output prefix forward onto all arcs
|
// pushes an output prefix forward onto all arcs
|
||||||
public void prependOutput(T outputPrefix) {
|
void prependOutput(T outputPrefix) {
|
||||||
assert owner.validOutput(outputPrefix);
|
assert owner.validOutput(outputPrefix);
|
||||||
|
|
||||||
for(int arcIdx=0;arcIdx<numArcs;arcIdx++) {
|
for(int arcIdx=0;arcIdx<numArcs;arcIdx++) {
|
|
@ -39,7 +39,7 @@ final class NodeHash<T> {
|
||||||
this.in = in;
|
this.in = in;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean nodesEqual(Builder.UnCompiledNode<T> node, long address) throws IOException {
|
private boolean nodesEqual(FSTCompiler.UnCompiledNode<T> node, long address) throws IOException {
|
||||||
fst.readFirstRealTargetArc(address, scratchArc, in);
|
fst.readFirstRealTargetArc(address, scratchArc, in);
|
||||||
|
|
||||||
// Fail fast for a node with fixed length arcs.
|
// Fail fast for a node with fixed length arcs.
|
||||||
|
@ -58,10 +58,10 @@ final class NodeHash<T> {
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int arcUpto=0; arcUpto < node.numArcs; arcUpto++) {
|
for(int arcUpto=0; arcUpto < node.numArcs; arcUpto++) {
|
||||||
final Builder.Arc<T> arc = node.arcs[arcUpto];
|
final FSTCompiler.Arc<T> arc = node.arcs[arcUpto];
|
||||||
if (arc.label != scratchArc.label() ||
|
if (arc.label != scratchArc.label() ||
|
||||||
!arc.output.equals(scratchArc.output()) ||
|
!arc.output.equals(scratchArc.output()) ||
|
||||||
((Builder.CompiledNode) arc.target).node != scratchArc.target() ||
|
((FSTCompiler.CompiledNode) arc.target).node != scratchArc.target() ||
|
||||||
!arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) ||
|
!arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) ||
|
||||||
arc.isFinal != scratchArc.isFinal()) {
|
arc.isFinal != scratchArc.isFinal()) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -82,16 +82,16 @@ final class NodeHash<T> {
|
||||||
|
|
||||||
// hash code for an unfrozen node. This must be identical
|
// hash code for an unfrozen node. This must be identical
|
||||||
// to the frozen case (below)!!
|
// to the frozen case (below)!!
|
||||||
private long hash(Builder.UnCompiledNode<T> node) {
|
private long hash(FSTCompiler.UnCompiledNode<T> node) {
|
||||||
final int PRIME = 31;
|
final int PRIME = 31;
|
||||||
//System.out.println("hash unfrozen");
|
//System.out.println("hash unfrozen");
|
||||||
long h = 0;
|
long h = 0;
|
||||||
// TODO: maybe if number of arcs is high we can safely subsample?
|
// TODO: maybe if number of arcs is high we can safely subsample?
|
||||||
for (int arcIdx=0; arcIdx < node.numArcs; arcIdx++) {
|
for (int arcIdx=0; arcIdx < node.numArcs; arcIdx++) {
|
||||||
final Builder.Arc<T> arc = node.arcs[arcIdx];
|
final FSTCompiler.Arc<T> arc = node.arcs[arcIdx];
|
||||||
//System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal);
|
//System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal);
|
||||||
h = PRIME * h + arc.label;
|
h = PRIME * h + arc.label;
|
||||||
long n = ((Builder.CompiledNode) arc.target).node;
|
long n = ((FSTCompiler.CompiledNode) arc.target).node;
|
||||||
h = PRIME * h + (int) (n^(n>>32));
|
h = PRIME * h + (int) (n^(n>>32));
|
||||||
h = PRIME * h + arc.output.hashCode();
|
h = PRIME * h + arc.output.hashCode();
|
||||||
h = PRIME * h + arc.nextFinalOutput.hashCode();
|
h = PRIME * h + arc.nextFinalOutput.hashCode();
|
||||||
|
@ -127,7 +127,7 @@ final class NodeHash<T> {
|
||||||
return h & Long.MAX_VALUE;
|
return h & Long.MAX_VALUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
public long add(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn) throws IOException {
|
public long add(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
|
||||||
//System.out.println("hash: add count=" + count + " vs " + table.size() + " mask=" + mask);
|
//System.out.println("hash: add count=" + count + " vs " + table.size() + " mask=" + mask);
|
||||||
final long h = hash(nodeIn);
|
final long h = hash(nodeIn);
|
||||||
long pos = h & mask;
|
long pos = h & mask;
|
||||||
|
@ -136,7 +136,7 @@ final class NodeHash<T> {
|
||||||
final long v = table.get(pos);
|
final long v = table.get(pos);
|
||||||
if (v == 0) {
|
if (v == 0) {
|
||||||
// freeze & add
|
// freeze & add
|
||||||
final long node = fst.addNode(builder, nodeIn);
|
final long node = fst.addNode(fstCompiler, nodeIn);
|
||||||
//System.out.println(" now freeze node=" + node);
|
//System.out.println(" now freeze node=" + node);
|
||||||
assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h;
|
assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h;
|
||||||
count++;
|
count++;
|
||||||
|
|
|
@ -54,8 +54,7 @@ public class Test2BFST extends LuceneTestCase {
|
||||||
System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
|
System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
|
||||||
Outputs<Object> outputs = NoOutputs.getSingleton();
|
Outputs<Object> outputs = NoOutputs.getSingleton();
|
||||||
Object NO_OUTPUT = outputs.getNoOutput();
|
Object NO_OUTPUT = outputs.getNoOutput();
|
||||||
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
|
final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
true, 15);
|
|
||||||
|
|
||||||
int count = 0;
|
int count = 0;
|
||||||
Random r = new Random(seed);
|
Random r = new Random(seed);
|
||||||
|
@ -66,21 +65,21 @@ public class Test2BFST extends LuceneTestCase {
|
||||||
for(int i=10;i<ints2.length;i++) {
|
for(int i=10;i<ints2.length;i++) {
|
||||||
ints2[i] = r.nextInt(256);
|
ints2[i] = r.nextInt(256);
|
||||||
}
|
}
|
||||||
b.add(input2, NO_OUTPUT);
|
fstCompiler.add(input2, NO_OUTPUT);
|
||||||
count++;
|
count++;
|
||||||
if (count % 100000 == 0) {
|
if (count % 100000 == 0) {
|
||||||
System.out.println(count + ": " + b.fstRamBytesUsed() + " bytes; " + b.getNodeCount() + " nodes");
|
System.out.println(count + ": " + fstCompiler.fstRamBytesUsed() + " bytes; " + fstCompiler.getNodeCount() + " nodes");
|
||||||
}
|
}
|
||||||
if (b.getNodeCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) {
|
if (fstCompiler.getNodeCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
nextInput(r, ints2);
|
nextInput(r, ints2);
|
||||||
}
|
}
|
||||||
|
|
||||||
FST<Object> fst = b.finish();
|
FST<Object> fst = fstCompiler.compile();
|
||||||
|
|
||||||
for(int verify=0;verify<2;verify++) {
|
for(int verify=0;verify<2;verify++) {
|
||||||
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
|
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + fstCompiler.getNodeCount() + "; arcCount=" + fstCompiler.getArcCount() + "]");
|
||||||
|
|
||||||
Arrays.fill(ints2, 0);
|
Arrays.fill(ints2, 0);
|
||||||
r = new Random(seed);
|
r = new Random(seed);
|
||||||
|
@ -136,8 +135,7 @@ public class Test2BFST extends LuceneTestCase {
|
||||||
{
|
{
|
||||||
System.out.println("\nTEST: 3 GB size; outputs=bytes");
|
System.out.println("\nTEST: 3 GB size; outputs=bytes");
|
||||||
Outputs<BytesRef> outputs = ByteSequenceOutputs.getSingleton();
|
Outputs<BytesRef> outputs = ByteSequenceOutputs.getSingleton();
|
||||||
final Builder<BytesRef> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
|
final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
true, 15);
|
|
||||||
|
|
||||||
byte[] outputBytes = new byte[20];
|
byte[] outputBytes = new byte[20];
|
||||||
BytesRef output = new BytesRef(outputBytes);
|
BytesRef output = new BytesRef(outputBytes);
|
||||||
|
@ -147,10 +145,10 @@ public class Test2BFST extends LuceneTestCase {
|
||||||
while(true) {
|
while(true) {
|
||||||
r.nextBytes(outputBytes);
|
r.nextBytes(outputBytes);
|
||||||
//System.out.println("add: " + input + " -> " + output);
|
//System.out.println("add: " + input + " -> " + output);
|
||||||
b.add(input, BytesRef.deepCopyOf(output));
|
fstCompiler.add(input, BytesRef.deepCopyOf(output));
|
||||||
count++;
|
count++;
|
||||||
if (count % 10000 == 0) {
|
if (count % 10000 == 0) {
|
||||||
long size = b.fstRamBytesUsed();
|
long size = fstCompiler.fstRamBytesUsed();
|
||||||
if (count % 1000000 == 0) {
|
if (count % 1000000 == 0) {
|
||||||
System.out.println(count + "...: " + size + " bytes");
|
System.out.println(count + "...: " + size + " bytes");
|
||||||
}
|
}
|
||||||
|
@ -161,10 +159,10 @@ public class Test2BFST extends LuceneTestCase {
|
||||||
nextInput(r, ints);
|
nextInput(r, ints);
|
||||||
}
|
}
|
||||||
|
|
||||||
FST<BytesRef> fst = b.finish();
|
FST<BytesRef> fst = fstCompiler.compile();
|
||||||
for(int verify=0;verify<2;verify++) {
|
for(int verify=0;verify<2;verify++) {
|
||||||
|
|
||||||
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
|
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + fstCompiler.getNodeCount() + "; arcCount=" + fstCompiler.getArcCount() + "]");
|
||||||
|
|
||||||
r = new Random(seed);
|
r = new Random(seed);
|
||||||
Arrays.fill(ints, 0);
|
Arrays.fill(ints, 0);
|
||||||
|
@ -216,8 +214,7 @@ public class Test2BFST extends LuceneTestCase {
|
||||||
{
|
{
|
||||||
System.out.println("\nTEST: 3 GB size; outputs=long");
|
System.out.println("\nTEST: 3 GB size; outputs=long");
|
||||||
Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
|
Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
|
||||||
final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
|
final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
true, 15);
|
|
||||||
|
|
||||||
long output = 1;
|
long output = 1;
|
||||||
|
|
||||||
|
@ -226,11 +223,11 @@ public class Test2BFST extends LuceneTestCase {
|
||||||
Random r = new Random(seed);
|
Random r = new Random(seed);
|
||||||
while(true) {
|
while(true) {
|
||||||
//System.out.println("add: " + input + " -> " + output);
|
//System.out.println("add: " + input + " -> " + output);
|
||||||
b.add(input, output);
|
fstCompiler.add(input, output);
|
||||||
output += 1+r.nextInt(10);
|
output += 1+r.nextInt(10);
|
||||||
count++;
|
count++;
|
||||||
if (count % 10000 == 0) {
|
if (count % 10000 == 0) {
|
||||||
long size = b.fstRamBytesUsed();
|
long size = fstCompiler.fstRamBytesUsed();
|
||||||
if (count % 1000000 == 0) {
|
if (count % 1000000 == 0) {
|
||||||
System.out.println(count + "...: " + size + " bytes");
|
System.out.println(count + "...: " + size + " bytes");
|
||||||
}
|
}
|
||||||
|
@ -241,11 +238,11 @@ public class Test2BFST extends LuceneTestCase {
|
||||||
nextInput(r, ints);
|
nextInput(r, ints);
|
||||||
}
|
}
|
||||||
|
|
||||||
FST<Long> fst = b.finish();
|
FST<Long> fst = fstCompiler.compile();
|
||||||
|
|
||||||
for(int verify=0;verify<2;verify++) {
|
for(int verify=0;verify<2;verify++) {
|
||||||
|
|
||||||
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
|
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + fstCompiler.getNodeCount() + "; arcCount=" + fstCompiler.getArcCount() + "]");
|
||||||
|
|
||||||
Arrays.fill(ints, 0);
|
Arrays.fill(ints, 0);
|
||||||
|
|
||||||
|
|
|
@ -35,7 +35,7 @@ import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
public class TestFstDirectAddressing extends LuceneTestCase {
|
public class TestFSTDirectAddressing extends LuceneTestCase {
|
||||||
|
|
||||||
public void testDenseWithGap() throws Exception {
|
public void testDenseWithGap() throws Exception {
|
||||||
List<String> words = Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm");
|
List<String> words = Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm");
|
||||||
|
@ -86,13 +86,13 @@ public class TestFstDirectAddressing extends LuceneTestCase {
|
||||||
Collections.sort(wordList);
|
Collections.sort(wordList);
|
||||||
|
|
||||||
// Disable direct addressing and measure the FST size.
|
// Disable direct addressing and measure the FST size.
|
||||||
Builder<Object> builder = createBuilder(-1f);
|
FSTCompiler<Object> fstCompiler = createFSTCompiler(-1f);
|
||||||
FST<Object> fst = buildFST(wordList, builder);
|
FST<Object> fst = buildFST(wordList, fstCompiler);
|
||||||
long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed();
|
long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed();
|
||||||
|
|
||||||
// Enable direct addressing and measure the FST size.
|
// Enable direct addressing and measure the FST size.
|
||||||
builder = createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
|
fstCompiler = createFSTCompiler(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
|
||||||
fst = buildFST(wordList, builder);
|
fst = buildFST(wordList, fstCompiler);
|
||||||
long ramBytesUsed = fst.ramBytesUsed();
|
long ramBytesUsed = fst.ramBytesUsed();
|
||||||
|
|
||||||
// Compute the size increase in percents.
|
// Compute the size increase in percents.
|
||||||
|
@ -107,42 +107,43 @@ public class TestFstDirectAddressing extends LuceneTestCase {
|
||||||
directAddressingMemoryIncreasePercent < MEMORY_INCREASE_LIMIT_PERCENT);
|
directAddressingMemoryIncreasePercent < MEMORY_INCREASE_LIMIT_PERCENT);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void printStats(Builder<Object> builder, long ramBytesUsed, double directAddressingMemoryIncreasePercent) {
|
private static void printStats(FSTCompiler<Object> fstCompiler, long ramBytesUsed, double directAddressingMemoryIncreasePercent) {
|
||||||
System.out.println("directAddressingMaxOversizingFactor = " + builder.getDirectAddressingMaxOversizingFactor());
|
System.out.println("directAddressingMaxOversizingFactor = " + fstCompiler.getDirectAddressingMaxOversizingFactor());
|
||||||
System.out.println("ramBytesUsed = "
|
System.out.println("ramBytesUsed = "
|
||||||
+ String.format(Locale.ENGLISH, "%.2f MB", ramBytesUsed / 1024d / 1024d)
|
+ String.format(Locale.ENGLISH, "%.2f MB", ramBytesUsed / 1024d / 1024d)
|
||||||
+ String.format(Locale.ENGLISH, " (%.2f %% increase with direct addressing)", directAddressingMemoryIncreasePercent));
|
+ String.format(Locale.ENGLISH, " (%.2f %% increase with direct addressing)", directAddressingMemoryIncreasePercent));
|
||||||
System.out.println("num nodes = " + builder.nodeCount);
|
System.out.println("num nodes = " + fstCompiler.nodeCount);
|
||||||
long fixedLengthArcNodeCount = builder.directAddressingNodeCount + builder.binarySearchNodeCount;
|
long fixedLengthArcNodeCount = fstCompiler.directAddressingNodeCount + fstCompiler.binarySearchNodeCount;
|
||||||
System.out.println("num fixed-length-arc nodes = " + fixedLengthArcNodeCount
|
System.out.println("num fixed-length-arc nodes = " + fixedLengthArcNodeCount
|
||||||
+ String.format(Locale.ENGLISH, " (%.2f %% of all nodes)",
|
+ String.format(Locale.ENGLISH, " (%.2f %% of all nodes)",
|
||||||
((double) fixedLengthArcNodeCount / builder.nodeCount * 100)));
|
((double) fixedLengthArcNodeCount / fstCompiler.nodeCount * 100)));
|
||||||
System.out.println("num binary-search nodes = " + (builder.binarySearchNodeCount)
|
System.out.println("num binary-search nodes = " + (fstCompiler.binarySearchNodeCount)
|
||||||
+ String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)",
|
+ String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)",
|
||||||
((double) (builder.binarySearchNodeCount) / fixedLengthArcNodeCount * 100)));
|
((double) (fstCompiler.binarySearchNodeCount) / fixedLengthArcNodeCount * 100)));
|
||||||
System.out.println("num direct-addressing nodes = " + (builder.directAddressingNodeCount)
|
System.out.println("num direct-addressing nodes = " + (fstCompiler.directAddressingNodeCount)
|
||||||
+ String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)",
|
+ String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)",
|
||||||
((double) (builder.directAddressingNodeCount) / fixedLengthArcNodeCount * 100)));
|
((double) (fstCompiler.directAddressingNodeCount) / fixedLengthArcNodeCount * 100)));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Builder<Object> createBuilder(float directAddressingMaxOversizingFactor) {
|
private static FSTCompiler<Object> createFSTCompiler(float directAddressingMaxOversizingFactor) {
|
||||||
return new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, NoOutputs.getSingleton(), true, 15)
|
return new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, NoOutputs.getSingleton())
|
||||||
.setDirectAddressingMaxOversizingFactor(directAddressingMaxOversizingFactor);
|
.directAddressingMaxOversizingFactor(directAddressingMaxOversizingFactor)
|
||||||
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
private FST<Object> buildFST(List<BytesRef> entries) throws Exception {
|
private FST<Object> buildFST(List<BytesRef> entries) throws Exception {
|
||||||
return buildFST(entries, createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR));
|
return buildFST(entries, createFSTCompiler(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static FST<Object> buildFST(List<BytesRef> entries, Builder<Object> builder) throws Exception {
|
private static FST<Object> buildFST(List<BytesRef> entries, FSTCompiler<Object> fstCompiler) throws Exception {
|
||||||
BytesRef last = null;
|
BytesRef last = null;
|
||||||
for (BytesRef entry : entries) {
|
for (BytesRef entry : entries) {
|
||||||
if (entry.equals(last) == false) {
|
if (entry.equals(last) == false) {
|
||||||
builder.add(Util.toIntsRef(entry, new IntsRefBuilder()), NoOutputs.getSingleton().getNoOutput());
|
fstCompiler.add(Util.toIntsRef(entry, new IntsRefBuilder()), NoOutputs.getSingleton().getNoOutput());
|
||||||
}
|
}
|
||||||
last = entry;
|
last = entry;
|
||||||
}
|
}
|
||||||
return builder.finish();
|
return fstCompiler.compile();
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String... args) throws Exception {
|
public static void main(String... args) throws Exception {
|
||||||
|
@ -195,18 +196,18 @@ public class TestFstDirectAddressing extends LuceneTestCase {
|
||||||
Collections.sort(wordList);
|
Collections.sort(wordList);
|
||||||
|
|
||||||
// Disable direct addressing and measure the FST size.
|
// Disable direct addressing and measure the FST size.
|
||||||
Builder<Object> builder = createBuilder(-1f);
|
FSTCompiler<Object> fstCompiler = createFSTCompiler(-1f);
|
||||||
FST<Object> fst = buildFST(wordList, builder);
|
FST<Object> fst = buildFST(wordList, fstCompiler);
|
||||||
long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed();
|
long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed();
|
||||||
|
|
||||||
// Enable direct addressing and measure the FST size.
|
// Enable direct addressing and measure the FST size.
|
||||||
builder = createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
|
fstCompiler = createFSTCompiler(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
|
||||||
fst = buildFST(wordList, builder);
|
fst = buildFST(wordList, fstCompiler);
|
||||||
long ramBytesUsed = fst.ramBytesUsed();
|
long ramBytesUsed = fst.ramBytesUsed();
|
||||||
|
|
||||||
// Compute the size increase in percents.
|
// Compute the size increase in percents.
|
||||||
double directAddressingMemoryIncreasePercent = ((double) ramBytesUsed / ramBytesUsedNoDirectAddressing - 1) * 100;
|
double directAddressingMemoryIncreasePercent = ((double) ramBytesUsed / ramBytesUsedNoDirectAddressing - 1) * 100;
|
||||||
|
|
||||||
printStats(builder, ramBytesUsed, directAddressingMemoryIncreasePercent);
|
printStats(fstCompiler, ramBytesUsed, directAddressingMemoryIncreasePercent);
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -327,7 +327,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
writer.close();
|
writer.close();
|
||||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||||
|
|
||||||
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
boolean storeOrd = random().nextBoolean();
|
boolean storeOrd = random().nextBoolean();
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
|
@ -373,15 +373,15 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
} else {
|
} else {
|
||||||
output = termsEnum.docFreq();
|
output = termsEnum.docFreq();
|
||||||
}
|
}
|
||||||
builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
|
fstCompiler.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
|
||||||
ord++;
|
ord++;
|
||||||
if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
|
if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
|
||||||
System.out.println(ord + " terms...");
|
System.out.println(ord + " terms...");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
FST<Long> fst = builder.finish();
|
FST<Long> fst = fstCompiler.compile();
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes");
|
System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fstCompiler.getNodeCount() + " nodes; " + fstCompiler.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ord > 0) {
|
if (ord > 0) {
|
||||||
|
@ -460,7 +460,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
private final Path wordsFileIn;
|
private final Path wordsFileIn;
|
||||||
private int inputMode;
|
private int inputMode;
|
||||||
private final Outputs<T> outputs;
|
private final Outputs<T> outputs;
|
||||||
private final Builder<T> builder;
|
private final FSTCompiler<T> fstCompiler;
|
||||||
|
|
||||||
public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean noArcArrays) {
|
public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean noArcArrays) {
|
||||||
this.dirOut = dirOut;
|
this.dirOut = dirOut;
|
||||||
|
@ -468,7 +468,11 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
this.inputMode = inputMode;
|
this.inputMode = inputMode;
|
||||||
this.outputs = outputs;
|
this.outputs = outputs;
|
||||||
|
|
||||||
builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, !noArcArrays, 15);
|
fstCompiler = new FSTCompiler.Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs)
|
||||||
|
.minSuffixCount2(prune)
|
||||||
|
.shouldShareSuffix(prune == 0)
|
||||||
|
.allowFixedLengthArcs(!noArcArrays)
|
||||||
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
|
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
|
||||||
|
@ -486,7 +490,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
toIntsRef(w, inputMode, intsRef);
|
toIntsRef(w, inputMode, intsRef);
|
||||||
builder.add(intsRef.get(),
|
fstCompiler.add(intsRef.get(),
|
||||||
getOutput(intsRef.get(), ord));
|
getOutput(intsRef.get(), ord));
|
||||||
|
|
||||||
ord++;
|
ord++;
|
||||||
|
@ -503,8 +507,8 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
long tMid = System.currentTimeMillis();
|
long tMid = System.currentTimeMillis();
|
||||||
System.out.println(((tMid-tStart) / 1000.0) + " sec to add all terms");
|
System.out.println(((tMid-tStart) / 1000.0) + " sec to add all terms");
|
||||||
|
|
||||||
assert builder.getTermCount() == ord;
|
assert fstCompiler.getTermCount() == ord;
|
||||||
FST<T> fst = builder.finish();
|
FST<T> fst = fstCompiler.compile();
|
||||||
long tEnd = System.currentTimeMillis();
|
long tEnd = System.currentTimeMillis();
|
||||||
System.out.println(((tEnd-tMid) / 1000.0) + " sec to finish/pack");
|
System.out.println(((tEnd-tMid) / 1000.0) + " sec to finish/pack");
|
||||||
if (fst == null) {
|
if (fst == null) {
|
||||||
|
@ -516,8 +520,8 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
System.out.println(ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs; tot size " + fst.ramBytesUsed());
|
System.out.println(ord + " terms; " + fstCompiler.getNodeCount() + " nodes; " + fstCompiler.getArcCount() + " arcs; tot size " + fst.ramBytesUsed());
|
||||||
if (builder.getNodeCount() < 100) {
|
if (fstCompiler.getNodeCount() < 100) {
|
||||||
Writer w = Files.newBufferedWriter(Paths.get("out.dot"), StandardCharsets.UTF_8);
|
Writer w = Files.newBufferedWriter(Paths.get("out.dot"), StandardCharsets.UTF_8);
|
||||||
Util.toDot(fst, w, false, false);
|
Util.toDot(fst, w, false, false);
|
||||||
w.close();
|
w.close();
|
||||||
|
@ -717,9 +721,9 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
|
|
||||||
public void testSingleString() throws Exception {
|
public void testSingleString() throws Exception {
|
||||||
final Outputs<Object> outputs = NoOutputs.getSingleton();
|
final Outputs<Object> outputs = NoOutputs.getSingleton();
|
||||||
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
b.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRefBuilder()), outputs.getNoOutput());
|
fstCompiler.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRefBuilder()), outputs.getNoOutput());
|
||||||
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(b.finish());
|
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(fstCompiler.compile());
|
||||||
assertNull(fstEnum.seekFloor(new BytesRef("foo")));
|
assertNull(fstEnum.seekFloor(new BytesRef("foo")));
|
||||||
assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
|
assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
|
||||||
}
|
}
|
||||||
|
@ -728,12 +732,12 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
public void testDuplicateFSAString() throws Exception {
|
public void testDuplicateFSAString() throws Exception {
|
||||||
String str = "foobar";
|
String str = "foobar";
|
||||||
final Outputs<Object> outputs = NoOutputs.getSingleton();
|
final Outputs<Object> outputs = NoOutputs.getSingleton();
|
||||||
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
IntsRefBuilder ints = new IntsRefBuilder();
|
IntsRefBuilder ints = new IntsRefBuilder();
|
||||||
for(int i=0; i<10; i++) {
|
for(int i=0; i<10; i++) {
|
||||||
b.add(Util.toIntsRef(new BytesRef(str), ints), outputs.getNoOutput());
|
fstCompiler.add(Util.toIntsRef(new BytesRef(str), ints), outputs.getNoOutput());
|
||||||
}
|
}
|
||||||
FST<Object> fst = b.finish();
|
FST<Object> fst = fstCompiler.compile();
|
||||||
|
|
||||||
// count the input paths
|
// count the input paths
|
||||||
int count = 0;
|
int count = 0;
|
||||||
|
@ -797,17 +801,17 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||||
|
|
||||||
// Build an FST mapping BytesRef -> Long
|
// Build an FST mapping BytesRef -> Long
|
||||||
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
final BytesRef a = new BytesRef("a");
|
final BytesRef a = new BytesRef("a");
|
||||||
final BytesRef b = new BytesRef("b");
|
final BytesRef b = new BytesRef("b");
|
||||||
final BytesRef c = new BytesRef("c");
|
final BytesRef c = new BytesRef("c");
|
||||||
|
|
||||||
builder.add(Util.toIntsRef(a, new IntsRefBuilder()), 17L);
|
fstCompiler.add(Util.toIntsRef(a, new IntsRefBuilder()), 17L);
|
||||||
builder.add(Util.toIntsRef(b, new IntsRefBuilder()), 42L);
|
fstCompiler.add(Util.toIntsRef(b, new IntsRefBuilder()), 42L);
|
||||||
builder.add(Util.toIntsRef(c, new IntsRefBuilder()), 13824324872317238L);
|
fstCompiler.add(Util.toIntsRef(c, new IntsRefBuilder()), 13824324872317238L);
|
||||||
|
|
||||||
final FST<Long> fst = builder.finish();
|
final FST<Long> fst = fstCompiler.compile();
|
||||||
|
|
||||||
assertEquals(13824324872317238L, (long) Util.get(fst, c));
|
assertEquals(13824324872317238L, (long) Util.get(fst, c));
|
||||||
assertEquals(42, (long) Util.get(fst, b));
|
assertEquals(42, (long) Util.get(fst, b));
|
||||||
|
@ -1035,7 +1039,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
FST<Object> compile(String[] lines) throws IOException {
|
FST<Object> compile(String[] lines) throws IOException {
|
||||||
final NoOutputs outputs = NoOutputs.getSingleton();
|
final NoOutputs outputs = NoOutputs.getSingleton();
|
||||||
final Object nothing = outputs.getNoOutput();
|
final Object nothing = outputs.getNoOutput();
|
||||||
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
int line = 0;
|
int line = 0;
|
||||||
final BytesRefBuilder term = new BytesRefBuilder();
|
final BytesRefBuilder term = new BytesRefBuilder();
|
||||||
|
@ -1046,10 +1050,10 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
term.copyChars(w);
|
term.copyChars(w);
|
||||||
b.add(Util.toIntsRef(term.get(), scratchIntsRef), nothing);
|
fstCompiler.add(Util.toIntsRef(term.get(), scratchIntsRef), nothing);
|
||||||
}
|
}
|
||||||
|
|
||||||
return b.finish();
|
return fstCompiler.compile();
|
||||||
}
|
}
|
||||||
|
|
||||||
void generate(ArrayList<String> out, StringBuilder b, char from, char to,
|
void generate(ArrayList<String> out, StringBuilder b, char from, char to,
|
||||||
|
@ -1110,10 +1114,10 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
public void testFinalOutputOnEndState() throws Exception {
|
public void testFinalOutputOnEndState() throws Exception {
|
||||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||||
|
|
||||||
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
final FSTCompiler<Long> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).minSuffixCount1(2).build();
|
||||||
builder.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L);
|
fstCompiler.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L);
|
||||||
builder.add(Util.toUTF32("station", new IntsRefBuilder()), 10L);
|
fstCompiler.add(Util.toUTF32("station", new IntsRefBuilder()), 10L);
|
||||||
final FST<Long> fst = builder.finish();
|
final FST<Long> fst = fstCompiler.compile();
|
||||||
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
|
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
|
||||||
StringWriter w = new StringWriter();
|
StringWriter w = new StringWriter();
|
||||||
Util.toDot(fst, w, false, false);
|
Util.toDot(fst, w, false, false);
|
||||||
|
@ -1124,10 +1128,10 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
|
|
||||||
public void testInternalFinalState() throws Exception {
|
public void testInternalFinalState() throws Exception {
|
||||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||||
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput());
|
fstCompiler.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput());
|
||||||
builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput());
|
fstCompiler.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput());
|
||||||
final FST<Long> fst = builder.finish();
|
final FST<Long> fst = fstCompiler.compile();
|
||||||
StringWriter w = new StringWriter();
|
StringWriter w = new StringWriter();
|
||||||
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
|
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
|
||||||
Util.toDot(fst, w, false, false);
|
Util.toDot(fst, w, false, false);
|
||||||
|
@ -1145,20 +1149,20 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
public void testNonFinalStopNode() throws Exception {
|
public void testNonFinalStopNode() throws Exception {
|
||||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||||
final Long nothing = outputs.getNoOutput();
|
final Long nothing = outputs.getNoOutput();
|
||||||
final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
//final FST<Long> fst = new FST<>(FST.INPUT_TYPE.BYTE1, outputs, false, PackedInts.COMPACT, 15);
|
//final FST<Long> fst = new FST<>(FST.INPUT_TYPE.BYTE1, outputs, false, PackedInts.COMPACT, 15);
|
||||||
final FST<Long> fst = b.fst;
|
final FST<Long> fst = fstCompiler.fst;
|
||||||
|
|
||||||
final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<>(b, 0);
|
final FSTCompiler.UnCompiledNode<Long> rootNode = new FSTCompiler.UnCompiledNode<>(fstCompiler, 0);
|
||||||
|
|
||||||
// Add final stop node
|
// Add final stop node
|
||||||
{
|
{
|
||||||
final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<>(b, 0);
|
final FSTCompiler.UnCompiledNode<Long> node = new FSTCompiler.UnCompiledNode<>(fstCompiler, 0);
|
||||||
node.isFinal = true;
|
node.isFinal = true;
|
||||||
rootNode.addArc('a', node);
|
rootNode.addArc('a', node);
|
||||||
final Builder.CompiledNode frozen = new Builder.CompiledNode();
|
final FSTCompiler.CompiledNode frozen = new FSTCompiler.CompiledNode();
|
||||||
frozen.node = fst.addNode(b, node);
|
frozen.node = fst.addNode(fstCompiler, node);
|
||||||
rootNode.arcs[0].nextFinalOutput = 17L;
|
rootNode.arcs[0].nextFinalOutput = 17L;
|
||||||
rootNode.arcs[0].isFinal = true;
|
rootNode.arcs[0].isFinal = true;
|
||||||
rootNode.arcs[0].output = nothing;
|
rootNode.arcs[0].output = nothing;
|
||||||
|
@ -1167,16 +1171,16 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
|
|
||||||
// Add non-final stop node
|
// Add non-final stop node
|
||||||
{
|
{
|
||||||
final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<>(b, 0);
|
final FSTCompiler.UnCompiledNode<Long> node = new FSTCompiler.UnCompiledNode<>(fstCompiler, 0);
|
||||||
rootNode.addArc('b', node);
|
rootNode.addArc('b', node);
|
||||||
final Builder.CompiledNode frozen = new Builder.CompiledNode();
|
final FSTCompiler.CompiledNode frozen = new FSTCompiler.CompiledNode();
|
||||||
frozen.node = fst.addNode(b, node);
|
frozen.node = fst.addNode(fstCompiler, node);
|
||||||
rootNode.arcs[1].nextFinalOutput = nothing;
|
rootNode.arcs[1].nextFinalOutput = nothing;
|
||||||
rootNode.arcs[1].output = 42L;
|
rootNode.arcs[1].output = 42L;
|
||||||
rootNode.arcs[1].target = frozen;
|
rootNode.arcs[1].target = frozen;
|
||||||
}
|
}
|
||||||
|
|
||||||
fst.finish(fst.addNode(b, rootNode));
|
fst.finish(fst.addNode(fstCompiler, rootNode));
|
||||||
|
|
||||||
StringWriter w = new StringWriter();
|
StringWriter w = new StringWriter();
|
||||||
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
|
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
|
||||||
|
@ -1225,13 +1229,13 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
|
|
||||||
public void testShortestPaths() throws Exception {
|
public void testShortestPaths() throws Exception {
|
||||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||||
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
final IntsRefBuilder scratch = new IntsRefBuilder();
|
final IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
|
fstCompiler.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
|
||||||
builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
|
fstCompiler.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
|
||||||
builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
|
fstCompiler.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
|
||||||
final FST<Long> fst = builder.finish();
|
final FST<Long> fst = fstCompiler.compile();
|
||||||
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
|
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
|
||||||
//Util.toDot(fst, w, false, false);
|
//Util.toDot(fst, w, false, false);
|
||||||
//w.close();
|
//w.close();
|
||||||
|
@ -1256,16 +1260,16 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
|
|
||||||
public void testRejectNoLimits() throws IOException {
|
public void testRejectNoLimits() throws IOException {
|
||||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||||
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
|
final FSTCompiler<Long> fstCompiler = new FSTCompiler<Long>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
final IntsRefBuilder scratch = new IntsRefBuilder();
|
final IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
|
fstCompiler.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
|
||||||
builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
|
fstCompiler.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
|
||||||
builder.add(Util.toIntsRef(new BytesRef("adcd"), scratch), 17L);
|
fstCompiler.add(Util.toIntsRef(new BytesRef("adcd"), scratch), 17L);
|
||||||
builder.add(Util.toIntsRef(new BytesRef("adcde"), scratch), 17L);
|
fstCompiler.add(Util.toIntsRef(new BytesRef("adcde"), scratch), 17L);
|
||||||
|
|
||||||
builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
|
fstCompiler.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
|
||||||
final FST<Long> fst = builder.finish();
|
final FST<Long> fst = fstCompiler.compile();
|
||||||
final AtomicInteger rejectCount = new AtomicInteger();
|
final AtomicInteger rejectCount = new AtomicInteger();
|
||||||
Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, 2, 6, minLongComparator) {
|
Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, 2, 6, minLongComparator) {
|
||||||
@Override
|
@Override
|
||||||
|
@ -1320,13 +1324,13 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
PositiveIntOutputs.getSingleton() // output
|
PositiveIntOutputs.getSingleton() // output
|
||||||
);
|
);
|
||||||
|
|
||||||
final Builder<Pair<Long,Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
final FSTCompiler<Pair<Long,Long>> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
final IntsRefBuilder scratch = new IntsRefBuilder();
|
final IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), outputs.newPair(22L, 57L));
|
fstCompiler.add(Util.toIntsRef(new BytesRef("aab"), scratch), outputs.newPair(22L, 57L));
|
||||||
builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), outputs.newPair(7L, 36L));
|
fstCompiler.add(Util.toIntsRef(new BytesRef("aac"), scratch), outputs.newPair(7L, 36L));
|
||||||
builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), outputs.newPair(17L, 85L));
|
fstCompiler.add(Util.toIntsRef(new BytesRef("ax"), scratch), outputs.newPair(17L, 85L));
|
||||||
final FST<Pair<Long,Long>> fst = builder.finish();
|
final FST<Pair<Long,Long>> fst = fstCompiler.compile();
|
||||||
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
|
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
|
||||||
//Util.toDot(fst, w, false, false);
|
//Util.toDot(fst, w, false, false);
|
||||||
//w.close();
|
//w.close();
|
||||||
|
@ -1361,7 +1365,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
final TreeSet<String> allPrefixes = new TreeSet<>();
|
final TreeSet<String> allPrefixes = new TreeSet<>();
|
||||||
|
|
||||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||||
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
final IntsRefBuilder scratch = new IntsRefBuilder();
|
final IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
|
|
||||||
for (int i = 0; i < numWords; i++) {
|
for (int i = 0; i < numWords; i++) {
|
||||||
|
@ -1382,10 +1386,10 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
|
|
||||||
for (Map.Entry<String,Long> e : slowCompletor.entrySet()) {
|
for (Map.Entry<String,Long> e : slowCompletor.entrySet()) {
|
||||||
//System.out.println("add: " + e);
|
//System.out.println("add: " + e);
|
||||||
builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), e.getValue());
|
fstCompiler.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), e.getValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
final FST<Long> fst = builder.finish();
|
final FST<Long> fst = fstCompiler.compile();
|
||||||
//System.out.println("SAVE out.dot");
|
//System.out.println("SAVE out.dot");
|
||||||
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
|
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
|
||||||
//Util.toDot(fst, w, false, false);
|
//Util.toDot(fst, w, false, false);
|
||||||
|
@ -1479,7 +1483,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
PositiveIntOutputs.getSingleton(), // weight
|
PositiveIntOutputs.getSingleton(), // weight
|
||||||
PositiveIntOutputs.getSingleton() // output
|
PositiveIntOutputs.getSingleton() // output
|
||||||
);
|
);
|
||||||
final Builder<Pair<Long,Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
final FSTCompiler<Pair<Long,Long>> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
final IntsRefBuilder scratch = new IntsRefBuilder();
|
final IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
|
|
||||||
Random random = random();
|
Random random = random();
|
||||||
|
@ -1504,10 +1508,10 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
//System.out.println("add: " + e);
|
//System.out.println("add: " + e);
|
||||||
long weight = e.getValue().a;
|
long weight = e.getValue().a;
|
||||||
long output = e.getValue().b;
|
long output = e.getValue().b;
|
||||||
builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), outputs.newPair(weight, output));
|
fstCompiler.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), outputs.newPair(weight, output));
|
||||||
}
|
}
|
||||||
|
|
||||||
final FST<Pair<Long,Long>> fst = builder.finish();
|
final FST<Pair<Long,Long>> fst = fstCompiler.compile();
|
||||||
//System.out.println("SAVE out.dot");
|
//System.out.println("SAVE out.dot");
|
||||||
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
|
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
|
||||||
//Util.toDot(fst, w, false, false);
|
//Util.toDot(fst, w, false, false);
|
||||||
|
@ -1563,7 +1567,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
|
|
||||||
public void testLargeOutputsOnArrayArcs() throws Exception {
|
public void testLargeOutputsOnArrayArcs() throws Exception {
|
||||||
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||||
final Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
final byte[] bytes = new byte[300];
|
final byte[] bytes = new byte[300];
|
||||||
final IntsRefBuilder input = new IntsRefBuilder();
|
final IntsRefBuilder input = new IntsRefBuilder();
|
||||||
|
@ -1572,10 +1576,10 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
for(int arc=0;arc<6;arc++) {
|
for(int arc=0;arc<6;arc++) {
|
||||||
input.setIntAt(0, arc);
|
input.setIntAt(0, arc);
|
||||||
output.bytes[0] = (byte) arc;
|
output.bytes[0] = (byte) arc;
|
||||||
builder.add(input.get(), BytesRef.deepCopyOf(output));
|
fstCompiler.add(input.get(), BytesRef.deepCopyOf(output));
|
||||||
}
|
}
|
||||||
|
|
||||||
final FST<BytesRef> fst = builder.finish();
|
final FST<BytesRef> fst = fstCompiler.compile();
|
||||||
for(int arc=0;arc<6;arc++) {
|
for(int arc=0;arc<6;arc++) {
|
||||||
input.setIntAt(0, arc);
|
input.setIntAt(0, arc);
|
||||||
final BytesRef result = Util.get(fst, input.get());
|
final BytesRef result = Util.get(fst, input.get());
|
||||||
|
@ -1608,15 +1612,15 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
Collections.sort(termsList);
|
Collections.sort(termsList);
|
||||||
|
|
||||||
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||||
Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
IntsRefBuilder input = new IntsRefBuilder();
|
IntsRefBuilder input = new IntsRefBuilder();
|
||||||
for(BytesRef term : termsList) {
|
for(BytesRef term : termsList) {
|
||||||
Util.toIntsRef(term, input);
|
Util.toIntsRef(term, input);
|
||||||
builder.add(input.get(), term);
|
fstCompiler.add(input.get(), term);
|
||||||
}
|
}
|
||||||
|
|
||||||
FST<BytesRef> fst = builder.finish();
|
FST<BytesRef> fst = fstCompiler.compile();
|
||||||
|
|
||||||
Arc<BytesRef> arc = new FST.Arc<>();
|
Arc<BytesRef> arc = new FST.Arc<>();
|
||||||
fst.getFirstArc(arc);
|
fst.getFirstArc(arc);
|
||||||
|
@ -1638,17 +1642,17 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
|
|
||||||
public void testSimpleDepth() throws Exception {
|
public void testSimpleDepth() throws Exception {
|
||||||
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||||
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
BytesRef ab = new BytesRef("ab");
|
BytesRef ab = new BytesRef("ab");
|
||||||
BytesRef ac = new BytesRef("ac");
|
BytesRef ac = new BytesRef("ac");
|
||||||
BytesRef bd = new BytesRef("bd");
|
BytesRef bd = new BytesRef("bd");
|
||||||
|
|
||||||
builder.add(Util.toIntsRef(ab, new IntsRefBuilder()), 3L);
|
fstCompiler.add(Util.toIntsRef(ab, new IntsRefBuilder()), 3L);
|
||||||
builder.add(Util.toIntsRef(ac, new IntsRefBuilder()), 5L);
|
fstCompiler.add(Util.toIntsRef(ac, new IntsRefBuilder()), 5L);
|
||||||
builder.add(Util.toIntsRef(bd, new IntsRefBuilder()), 7L);
|
fstCompiler.add(Util.toIntsRef(bd, new IntsRefBuilder()), 7L);
|
||||||
|
|
||||||
FST<Long> fst = builder.finish();
|
FST<Long> fst = fstCompiler.compile();
|
||||||
|
|
||||||
assertEquals(3, (long) Util.get(fst, ab));
|
assertEquals(3, (long) Util.get(fst, ab));
|
||||||
assertEquals(5, (long) Util.get(fst, ac));
|
assertEquals(5, (long) Util.get(fst, ac));
|
||||||
|
|
|
@ -83,15 +83,17 @@ public class TestUtil extends LuceneTestCase {
|
||||||
|
|
||||||
private FST<Object> buildFST(List<String> words, boolean allowArrayArcs, boolean allowDirectAddressing) throws Exception {
|
private FST<Object> buildFST(List<String> words, boolean allowArrayArcs, boolean allowDirectAddressing) throws Exception {
|
||||||
final Outputs<Object> outputs = NoOutputs.getSingleton();
|
final Outputs<Object> outputs = NoOutputs.getSingleton();
|
||||||
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, allowArrayArcs, 15);
|
final FSTCompiler.Builder<Object> builder = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
|
||||||
|
.allowFixedLengthArcs(allowArrayArcs);
|
||||||
if (!allowDirectAddressing) {
|
if (!allowDirectAddressing) {
|
||||||
b.setDirectAddressingMaxOversizingFactor(-1f);
|
builder.directAddressingMaxOversizingFactor(-1f);
|
||||||
}
|
}
|
||||||
|
final FSTCompiler<Object> fstCompiler = builder.build();
|
||||||
|
|
||||||
for (String word : words) {
|
for (String word : words) {
|
||||||
b.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput());
|
fstCompiler.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput());
|
||||||
}
|
}
|
||||||
return b.finish();
|
return fstCompiler.compile();
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<String> createRandomDictionary(int width, int depth) {
|
private List<String> createRandomDictionary(int width, int depth) {
|
||||||
|
|
|
@ -30,7 +30,7 @@ import org.apache.lucene.util.RamUsageEstimator;
|
||||||
* more of its output values. You can use this when a single
|
* more of its output values. You can use this when a single
|
||||||
* input may need to map to more than one output,
|
* input may need to map to more than one output,
|
||||||
* maintaining order: pass the same input with a different
|
* maintaining order: pass the same input with a different
|
||||||
* output by calling {@link Builder#add(IntsRef,Object)} multiple
|
* output by calling {@link FSTCompiler#add(IntsRef,Object)} multiple
|
||||||
* times. The builder will then combine the outputs using
|
* times. The builder will then combine the outputs using
|
||||||
* the {@link Outputs#merge(Object,Object)} method.
|
* the {@link Outputs#merge(Object,Object)} method.
|
||||||
*
|
*
|
||||||
|
@ -41,7 +41,7 @@ import org.apache.lucene.util.RamUsageEstimator;
|
||||||
* <p>NOTE: the only way to create multiple outputs is to
|
* <p>NOTE: the only way to create multiple outputs is to
|
||||||
* add the same input to the FST multiple times in a row. This is
|
* add the same input to the FST multiple times in a row. This is
|
||||||
* how the FST maps a single input to multiple outputs (e.g. you
|
* how the FST maps a single input to multiple outputs (e.g. you
|
||||||
* cannot pass a List<Object> to {@link Builder#add}). If
|
* cannot pass a List<Object> to {@link FSTCompiler#add}). If
|
||||||
* your outputs are longs, and you need at most 2, then use
|
* your outputs are longs, and you need at most 2, then use
|
||||||
* {@link UpToTwoPositiveIntOutputs} instead since it stores
|
* {@link UpToTwoPositiveIntOutputs} instead since it stores
|
||||||
* the outputs more compactly (by stealing a bit from each
|
* the outputs more compactly (by stealing a bit from each
|
||||||
|
|
|
@ -35,14 +35,14 @@ import org.apache.lucene.util.SuppressForbidden;
|
||||||
* <p>NOTE: the only way to create a TwoLongs output is to
|
* <p>NOTE: the only way to create a TwoLongs output is to
|
||||||
* add the same input to the FST twice in a row. This is
|
* add the same input to the FST twice in a row. This is
|
||||||
* how the FST maps a single input to two outputs (e.g. you
|
* how the FST maps a single input to two outputs (e.g. you
|
||||||
* cannot pass a TwoLongs to {@link Builder#add}. If you
|
* cannot pass a TwoLongs to {@link FSTCompiler#add}. If you
|
||||||
* need more than two then use {@link ListOfOutputs}, but if
|
* need more than two then use {@link ListOfOutputs}, but if
|
||||||
* you only have at most 2 then this implementation will
|
* you only have at most 2 then this implementation will
|
||||||
* require fewer bytes as it steals one bit from each long
|
* require fewer bytes as it steals one bit from each long
|
||||||
* value.
|
* value.
|
||||||
*
|
*
|
||||||
* <p>NOTE: the resulting FST is not guaranteed to be minimal!
|
* <p>NOTE: the resulting FST is not guaranteed to be minimal!
|
||||||
* See {@link Builder}.
|
* See {@link FSTCompiler}.
|
||||||
*
|
*
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -164,16 +164,16 @@ public class TestFSTsMisc extends LuceneTestCase {
|
||||||
public void testListOfOutputs() throws Exception {
|
public void testListOfOutputs() throws Exception {
|
||||||
PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
|
||||||
ListOfOutputs<Long> outputs = new ListOfOutputs<>(_outputs);
|
ListOfOutputs<Long> outputs = new ListOfOutputs<>(_outputs);
|
||||||
final Builder<Object> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
final IntsRefBuilder scratch = new IntsRefBuilder();
|
final IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
// Add the same input more than once and the outputs
|
// Add the same input more than once and the outputs
|
||||||
// are merged:
|
// are merged:
|
||||||
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
|
fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
|
||||||
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
|
fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
|
||||||
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
|
fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
|
||||||
builder.add(Util.toIntsRef(new BytesRef("b"), scratch), 17L);
|
fstCompiler.add(Util.toIntsRef(new BytesRef("b"), scratch), 17L);
|
||||||
final FST<Object> fst = builder.finish();
|
final FST<Object> fst = fstCompiler.compile();
|
||||||
|
|
||||||
Object output = Util.get(fst, new BytesRef("a"));
|
Object output = Util.get(fst, new BytesRef("a"));
|
||||||
assertNotNull(output);
|
assertNotNull(output);
|
||||||
|
@ -193,20 +193,20 @@ public class TestFSTsMisc extends LuceneTestCase {
|
||||||
public void testListOfOutputsEmptyString() throws Exception {
|
public void testListOfOutputsEmptyString() throws Exception {
|
||||||
PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton();
|
||||||
ListOfOutputs<Long> outputs = new ListOfOutputs<>(_outputs);
|
ListOfOutputs<Long> outputs = new ListOfOutputs<>(_outputs);
|
||||||
final Builder<Object> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
final IntsRefBuilder scratch = new IntsRefBuilder();
|
final IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
builder.add(scratch.get(), 0L);
|
fstCompiler.add(scratch.get(), 0L);
|
||||||
builder.add(scratch.get(), 1L);
|
fstCompiler.add(scratch.get(), 1L);
|
||||||
builder.add(scratch.get(), 17L);
|
fstCompiler.add(scratch.get(), 17L);
|
||||||
builder.add(scratch.get(), 1L);
|
fstCompiler.add(scratch.get(), 1L);
|
||||||
|
|
||||||
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
|
fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L);
|
||||||
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
|
fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L);
|
||||||
builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
|
fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L);
|
||||||
builder.add(Util.toIntsRef(new BytesRef("b"), scratch), 0L);
|
fstCompiler.add(Util.toIntsRef(new BytesRef("b"), scratch), 0L);
|
||||||
|
|
||||||
final FST<Object> fst = builder.finish();
|
final FST<Object> fst = fstCompiler.compile();
|
||||||
|
|
||||||
Object output = Util.get(fst, new BytesRef(""));
|
Object output = Util.get(fst, new BytesRef(""));
|
||||||
assertNotNull(output);
|
assertNotNull(output);
|
||||||
|
|
|
@ -43,7 +43,7 @@ import org.apache.lucene.util.FixedBitSet;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.StringHelper;
|
import org.apache.lucene.util.StringHelper;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
||||||
import org.apache.lucene.util.fst.BytesRefFSTEnum;
|
import org.apache.lucene.util.fst.BytesRefFSTEnum;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
@ -350,29 +350,28 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final Builder<Pair<BytesRef,Long>> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
|
final FSTCompiler<Pair<BytesRef,Long>> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS)
|
||||||
0, 0, true, false, Integer.MAX_VALUE,
|
.shouldShareNonSingletonNodes(false).build();
|
||||||
FST_OUTPUTS, true, 15);
|
|
||||||
//if (DEBUG) {
|
//if (DEBUG) {
|
||||||
// System.out.println(" compile index for prefix=" + prefix);
|
// System.out.println(" compile index for prefix=" + prefix);
|
||||||
//}
|
//}
|
||||||
//indexBuilder.DEBUG = false;
|
//indexBuilder.DEBUG = false;
|
||||||
final byte[] bytes = scratchBytes.toArrayCopy();
|
final byte[] bytes = scratchBytes.toArrayCopy();
|
||||||
assert bytes.length > 0;
|
assert bytes.length > 0;
|
||||||
indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), FST_OUTPUTS.newPair(new BytesRef(bytes, 0, bytes.length), Long.MAX_VALUE - maxVersionIndex));
|
fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef), FST_OUTPUTS.newPair(new BytesRef(bytes, 0, bytes.length), Long.MAX_VALUE - maxVersionIndex));
|
||||||
scratchBytes.reset();
|
scratchBytes.reset();
|
||||||
|
|
||||||
// Copy over index for all sub-blocks
|
// Copy over index for all sub-blocks
|
||||||
for(PendingBlock block : blocks) {
|
for(PendingBlock block : blocks) {
|
||||||
if (block.subIndices != null) {
|
if (block.subIndices != null) {
|
||||||
for(FST<Pair<BytesRef,Long>> subIndex : block.subIndices) {
|
for(FST<Pair<BytesRef,Long>> subIndex : block.subIndices) {
|
||||||
append(indexBuilder, subIndex, scratchIntsRef);
|
append(fstCompiler, subIndex, scratchIntsRef);
|
||||||
}
|
}
|
||||||
block.subIndices = null;
|
block.subIndices = null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
index = indexBuilder.finish();
|
index = fstCompiler.compile();
|
||||||
|
|
||||||
assert subIndices == null;
|
assert subIndices == null;
|
||||||
|
|
||||||
|
@ -387,14 +386,14 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
|
||||||
// TODO: maybe we could add bulk-add method to
|
// TODO: maybe we could add bulk-add method to
|
||||||
// Builder? Takes FST and unions it w/ current
|
// Builder? Takes FST and unions it w/ current
|
||||||
// FST.
|
// FST.
|
||||||
private void append(Builder<Pair<BytesRef,Long>> builder, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
|
private void append(FSTCompiler<Pair<BytesRef,Long>> fstCompiler, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
|
||||||
final BytesRefFSTEnum<Pair<BytesRef,Long>> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
|
final BytesRefFSTEnum<Pair<BytesRef,Long>> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
|
||||||
BytesRefFSTEnum.InputOutput<Pair<BytesRef,Long>> indexEnt;
|
BytesRefFSTEnum.InputOutput<Pair<BytesRef,Long>> indexEnt;
|
||||||
while((indexEnt = subIndexEnum.next()) != null) {
|
while((indexEnt = subIndexEnum.next()) != null) {
|
||||||
//if (DEBUG) {
|
//if (DEBUG) {
|
||||||
// System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
|
// System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
|
||||||
//}
|
//}
|
||||||
builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
|
fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -52,7 +52,7 @@ import org.apache.lucene.util.automaton.Automaton;
|
||||||
import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
|
import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
|
||||||
import org.apache.lucene.util.automaton.Operations;
|
import org.apache.lucene.util.automaton.Operations;
|
||||||
import org.apache.lucene.util.automaton.Transition;
|
import org.apache.lucene.util.automaton.Transition;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
||||||
import org.apache.lucene.util.fst.FST.BytesReader;
|
import org.apache.lucene.util.fst.FST.BytesReader;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
@ -496,7 +496,7 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
|
||||||
reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
|
reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
|
||||||
|
|
||||||
PairOutputs<Long,BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
|
PairOutputs<Long,BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
|
||||||
Builder<Pair<Long,BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
FSTCompiler<Pair<Long,BytesRef>> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
// Build FST:
|
// Build FST:
|
||||||
BytesRefBuilder previousAnalyzed = null;
|
BytesRefBuilder previousAnalyzed = null;
|
||||||
|
@ -570,7 +570,7 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
|
||||||
Util.toIntsRef(analyzed.get(), scratchInts);
|
Util.toIntsRef(analyzed.get(), scratchInts);
|
||||||
//System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
|
//System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
|
||||||
if (!hasPayloads) {
|
if (!hasPayloads) {
|
||||||
builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
|
fstCompiler.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
|
||||||
} else {
|
} else {
|
||||||
int payloadOffset = input.getPosition() + surface.length;
|
int payloadOffset = input.getPosition() + surface.length;
|
||||||
int payloadLength = bytes.length - payloadOffset;
|
int payloadLength = bytes.length - payloadOffset;
|
||||||
|
@ -579,10 +579,10 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
|
||||||
br.bytes[surface.length] = PAYLOAD_SEP;
|
br.bytes[surface.length] = PAYLOAD_SEP;
|
||||||
System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);
|
System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);
|
||||||
br.length = br.bytes.length;
|
br.length = br.bytes.length;
|
||||||
builder.add(scratchInts.get(), outputs.newPair(cost, br));
|
fstCompiler.add(scratchInts.get(), outputs.newPair(cost, br));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fst = builder.finish();
|
fst = fstCompiler.compile();
|
||||||
|
|
||||||
//Util.dotToFile(fst, "/tmp/suggest.dot");
|
//Util.dotToFile(fst, "/tmp/suggest.dot");
|
||||||
} finally {
|
} finally {
|
||||||
|
|
|
@ -66,7 +66,7 @@ import org.apache.lucene.util.CharsRefBuilder;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.FST.Arc;
|
import org.apache.lucene.util.fst.FST.Arc;
|
||||||
import org.apache.lucene.util.fst.FST.BytesReader;
|
import org.apache.lucene.util.fst.FST.BytesReader;
|
||||||
|
@ -304,7 +304,7 @@ public class FreeTextSuggester extends Lookup implements Accountable {
|
||||||
TermsEnum termsEnum = terms.iterator();
|
TermsEnum termsEnum = terms.iterator();
|
||||||
|
|
||||||
Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
|
Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
|
||||||
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
||||||
while (true) {
|
while (true) {
|
||||||
|
@ -320,10 +320,10 @@ public class FreeTextSuggester extends Lookup implements Accountable {
|
||||||
totTokens += termsEnum.totalTermFreq();
|
totTokens += termsEnum.totalTermFreq();
|
||||||
}
|
}
|
||||||
|
|
||||||
builder.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
|
fstCompiler.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
|
||||||
}
|
}
|
||||||
|
|
||||||
fst = builder.finish();
|
fst = fstCompiler.compile();
|
||||||
if (fst == null) {
|
if (fst == null) {
|
||||||
throw new IllegalArgumentException("need at least one suggestion");
|
throw new IllegalArgumentException("need at least one suggestion");
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,7 +25,7 @@ import org.apache.lucene.store.IndexInput;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.BytesRefBuilder;
|
import org.apache.lucene.util.BytesRefBuilder;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.PairOutputs;
|
import org.apache.lucene.util.fst.PairOutputs;
|
||||||
|
@ -53,7 +53,7 @@ final class NRTSuggesterBuilder {
|
||||||
public static final int END_BYTE = 0x0;
|
public static final int END_BYTE = 0x0;
|
||||||
|
|
||||||
private final PairOutputs<Long, BytesRef> outputs;
|
private final PairOutputs<Long, BytesRef> outputs;
|
||||||
private final Builder<PairOutputs.Pair<Long, BytesRef>> builder;
|
private final FSTCompiler<PairOutputs.Pair<Long, BytesRef>> fstCompiler;
|
||||||
private final IntsRefBuilder scratchInts = new IntsRefBuilder();
|
private final IntsRefBuilder scratchInts = new IntsRefBuilder();
|
||||||
private final BytesRefBuilder analyzed = new BytesRefBuilder();
|
private final BytesRefBuilder analyzed = new BytesRefBuilder();
|
||||||
private final PriorityQueue<Entry> entries;
|
private final PriorityQueue<Entry> entries;
|
||||||
|
@ -70,7 +70,7 @@ final class NRTSuggesterBuilder {
|
||||||
this.endByte = END_BYTE;
|
this.endByte = END_BYTE;
|
||||||
this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
|
this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
|
||||||
this.entries = new PriorityQueue<>();
|
this.entries = new PriorityQueue<>();
|
||||||
this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -108,7 +108,7 @@ final class NRTSuggesterBuilder {
|
||||||
}
|
}
|
||||||
analyzed.setByteAt(analyzed.length() - 1, (byte) numArcs++);
|
analyzed.setByteAt(analyzed.length() - 1, (byte) numArcs++);
|
||||||
Util.toIntsRef(analyzed.get(), scratchInts);
|
Util.toIntsRef(analyzed.get(), scratchInts);
|
||||||
builder.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload));
|
fstCompiler.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload));
|
||||||
}
|
}
|
||||||
maxAnalyzedPathsPerOutput = Math.max(maxAnalyzedPathsPerOutput, entries.size());
|
maxAnalyzedPathsPerOutput = Math.max(maxAnalyzedPathsPerOutput, entries.size());
|
||||||
entries.clear();
|
entries.clear();
|
||||||
|
@ -119,11 +119,11 @@ final class NRTSuggesterBuilder {
|
||||||
* {@link NRTSuggester#load(IndexInput, CompletionPostingsFormat.FSTLoadMode)})}
|
* {@link NRTSuggester#load(IndexInput, CompletionPostingsFormat.FSTLoadMode)})}
|
||||||
*/
|
*/
|
||||||
public boolean store(DataOutput output) throws IOException {
|
public boolean store(DataOutput output) throws IOException {
|
||||||
final FST<PairOutputs.Pair<Long, BytesRef>> build = builder.finish();
|
final FST<PairOutputs.Pair<Long, BytesRef>> fst = fstCompiler.compile();
|
||||||
if (build == null) {
|
if (fst == null) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
build.save(output);
|
fst.save(output);
|
||||||
|
|
||||||
/* write some more meta-info */
|
/* write some more meta-info */
|
||||||
assert maxAnalyzedPathsPerOutput > 0;
|
assert maxAnalyzedPathsPerOutput > 0;
|
||||||
|
|
|
@ -169,7 +169,7 @@ public class FSTCompletionBuilder {
|
||||||
* @param shareMaxTailLength
|
* @param shareMaxTailLength
|
||||||
* Max shared suffix sharing length.
|
* Max shared suffix sharing length.
|
||||||
*
|
*
|
||||||
* See the description of this parameter in {@link Builder}'s constructor.
|
* See the description of this parameter in {@link org.apache.lucene.util.fst.FSTCompiler.Builder}.
|
||||||
* In general, for very large inputs you'll want to construct a non-minimal
|
* In general, for very large inputs you'll want to construct a non-minimal
|
||||||
* automaton which will be larger, but the construction will take far less ram.
|
* automaton which will be larger, but the construction will take far less ram.
|
||||||
* For minimal automata, set it to {@link Integer#MAX_VALUE}.
|
* For minimal automata, set it to {@link Integer#MAX_VALUE}.
|
||||||
|
@ -234,10 +234,9 @@ public class FSTCompletionBuilder {
|
||||||
// Build the automaton.
|
// Build the automaton.
|
||||||
final Outputs<Object> outputs = NoOutputs.getSingleton();
|
final Outputs<Object> outputs = NoOutputs.getSingleton();
|
||||||
final Object empty = outputs.getNoOutput();
|
final Object empty = outputs.getNoOutput();
|
||||||
final Builder<Object> builder = new Builder<>(
|
final FSTCompiler<Object> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
|
||||||
FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
|
.shareMaxTailLength(shareMaxTailLength).build();
|
||||||
shareMaxTailLength, outputs, true, 15);
|
|
||||||
|
|
||||||
BytesRefBuilder scratch = new BytesRefBuilder();
|
BytesRefBuilder scratch = new BytesRefBuilder();
|
||||||
BytesRef entry;
|
BytesRef entry;
|
||||||
final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
|
final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
|
||||||
|
@ -246,11 +245,11 @@ public class FSTCompletionBuilder {
|
||||||
while((entry = iter.next()) != null) {
|
while((entry = iter.next()) != null) {
|
||||||
count++;
|
count++;
|
||||||
if (scratch.get().compareTo(entry) != 0) {
|
if (scratch.get().compareTo(entry) != 0) {
|
||||||
builder.add(Util.toIntsRef(entry, scratchIntsRef), empty);
|
fstCompiler.add(Util.toIntsRef(entry, scratchIntsRef), empty);
|
||||||
scratch.copyBytes(entry);
|
scratch.copyBytes(entry);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return count == 0 ? null : builder.finish();
|
return count == 0 ? null : fstCompiler.compile();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -40,7 +40,7 @@ import org.apache.lucene.util.BytesRefBuilder;
|
||||||
import org.apache.lucene.util.CharsRefBuilder;
|
import org.apache.lucene.util.CharsRefBuilder;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
|
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
|
||||||
import org.apache.lucene.util.fst.Builder;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.FST.Arc;
|
import org.apache.lucene.util.fst.FST.Arc;
|
||||||
import org.apache.lucene.util.fst.FST.BytesReader;
|
import org.apache.lucene.util.fst.FST.BytesReader;
|
||||||
|
@ -116,7 +116,7 @@ public class WFSTCompletionLookup extends Lookup implements Accountable {
|
||||||
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
||||||
BytesRefBuilder previous = null;
|
BytesRefBuilder previous = null;
|
||||||
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||||
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
while ((scratch = iter.next()) != null) {
|
while ((scratch = iter.next()) != null) {
|
||||||
long cost = iter.weight();
|
long cost = iter.weight();
|
||||||
|
|
||||||
|
@ -127,11 +127,11 @@ public class WFSTCompletionLookup extends Lookup implements Accountable {
|
||||||
// added
|
// added
|
||||||
}
|
}
|
||||||
Util.toIntsRef(scratch, scratchInts);
|
Util.toIntsRef(scratch, scratchInts);
|
||||||
builder.add(scratchInts.get(), cost);
|
fstCompiler.add(scratchInts.get(), cost);
|
||||||
previous.copyBytes(scratch);
|
previous.copyBytes(scratch);
|
||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
fst = builder.finish();
|
fst = fstCompiler.compile();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -272,27 +272,26 @@ public class FSTTester<T> {
|
||||||
System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
|
System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
|
||||||
}
|
}
|
||||||
|
|
||||||
final Builder<T> builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
|
final FSTCompiler<T> fstCompiler = new FSTCompiler.Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs)
|
||||||
prune1, prune2,
|
.minSuffixCount1(prune1)
|
||||||
prune1==0 && prune2==0,
|
.minSuffixCount2(prune2)
|
||||||
allowRandomSuffixSharing ? random.nextBoolean() : true,
|
.shouldShareSuffix(prune1==0 && prune2==0)
|
||||||
allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
|
.shouldShareNonSingletonNodes(allowRandomSuffixSharing ? random.nextBoolean() : true)
|
||||||
outputs,
|
.shareMaxTailLength(allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE)
|
||||||
true,
|
.build();
|
||||||
15);
|
|
||||||
|
|
||||||
for(InputOutput<T> pair : pairs) {
|
for(InputOutput<T> pair : pairs) {
|
||||||
if (pair.output instanceof List) {
|
if (pair.output instanceof List) {
|
||||||
@SuppressWarnings("unchecked") List<Long> longValues = (List<Long>) pair.output;
|
@SuppressWarnings("unchecked") List<Long> longValues = (List<Long>) pair.output;
|
||||||
@SuppressWarnings("unchecked") final Builder<Object> builderObject = (Builder<Object>) builder;
|
@SuppressWarnings("unchecked") final FSTCompiler<Object> fstCompilerObject = (FSTCompiler<Object>) fstCompiler;
|
||||||
for(Long value : longValues) {
|
for(Long value : longValues) {
|
||||||
builderObject.add(pair.input, value);
|
fstCompilerObject.add(pair.input, value);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
builder.add(pair.input, pair.output);
|
fstCompiler.add(pair.input, pair.output);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
FST<T> fst = builder.finish();
|
FST<T> fst = fstCompiler.compile();
|
||||||
|
|
||||||
if (random.nextBoolean() && fst != null) {
|
if (random.nextBoolean() && fst != null) {
|
||||||
IOContext context = LuceneTestCase.newIOContext(random);
|
IOContext context = LuceneTestCase.newIOContext(random);
|
||||||
|
@ -320,7 +319,7 @@ public class FSTTester<T> {
|
||||||
if (fst == null) {
|
if (fst == null) {
|
||||||
System.out.println(" fst has 0 nodes (fully pruned)");
|
System.out.println(" fst has 0 nodes (fully pruned)");
|
||||||
} else {
|
} else {
|
||||||
System.out.println(" fst has " + builder.getNodeCount() + " nodes and " + builder.getArcCount() + " arcs");
|
System.out.println(" fst has " + fstCompiler.getNodeCount() + " nodes and " + fstCompiler.getArcCount() + " arcs");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -330,8 +329,8 @@ public class FSTTester<T> {
|
||||||
verifyPruned(inputMode, fst, prune1, prune2);
|
verifyPruned(inputMode, fst, prune1, prune2);
|
||||||
}
|
}
|
||||||
|
|
||||||
nodeCount = builder.getNodeCount();
|
nodeCount = fstCompiler.getNodeCount();
|
||||||
arcCount = builder.getArcCount();
|
arcCount = fstCompiler.getArcCount();
|
||||||
|
|
||||||
return fst;
|
return fst;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue