diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index ccc7e91da6e..39daefefb3b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -38,6 +38,9 @@ API Changes * LUCENE-8905: Better defence against malformed arguments in TopDocsCollector (Atri Sharma) +* LUCENE-9089: FST Builder renamed FSTCompiler with fluent-style Builder. + (Bruno Roustant) + Improvements * LUCENE-8757: When provided with an ExecutorService to run queries across @@ -64,6 +67,8 @@ Other * LUCENE-8768: Fix Javadocs build in Java 11. (Namgyu Kim) +* LUCENE-9092: upgrade randomizedtesting to 2.7.5 (Dawid Weiss) + ======================= Lucene 8.5.0 ======================= API Changes diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt index a4c181ba365..f2a3e961bbe 100644 --- a/lucene/MIGRATE.txt +++ b/lucene/MIGRATE.txt @@ -1,5 +1,10 @@ # Apache Lucene Migration Guide +## o.a.l.util.fst.Builder is renamed FSTCompiler with fluent-style Builder (LUCENE-9089) ## + +Simply use FSTCompiler instead of the previous Builder. Use either the simple constructor with default settings, or +the FSTCompiler.Builder to tune and tweak any parameter. + ## Kuromoji user dictionary now forbids illegal segmentation (LUCENE-8933) ## User dictionary now strictly validates if the (concatenated) segment is the same as the surface form. 
This change avoids diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java index b3efcf73f86..a217b4c5382 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java @@ -25,6 +25,7 @@ import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.CharSequenceOutputs; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.Outputs; import org.apache.lucene.util.fst.Util; @@ -106,13 +107,13 @@ public class NormalizeCharMap { final FST map; try { final Outputs outputs = CharSequenceOutputs.getSingleton(); - final org.apache.lucene.util.fst.Builder builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE2, outputs); + final FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs); final IntsRefBuilder scratch = new IntsRefBuilder(); for(Map.Entry ent : pendingPairs.entrySet()) { - builder.add(Util.toUTF16(ent.getKey(), scratch), + fstCompiler.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue())); } - map = builder.finish(); + map = fstCompiler.compile(); pendingPairs.clear(); } catch (IOException ioe) { // Bogus FST IOExceptions!! 
(will never happen) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 443f010047f..ff4a800e81d 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -64,7 +64,7 @@ import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter; import org.apache.lucene.util.OfflineSorter; import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.RegExp; -import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.CharSequenceOutputs; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.IntSequenceOutputs; @@ -231,9 +231,9 @@ public class Dictionary { // read dictionary entries IntSequenceOutputs o = IntSequenceOutputs.getSingleton(); - Builder b = new Builder<>(FST.INPUT_TYPE.BYTE4, o); - readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, b); - words = b.finish(); + FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, o); + readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, fstCompiler); + words = fstCompiler.compile(); aliases = null; // no longer needed morphAliases = null; // no longer needed success = true; @@ -414,7 +414,7 @@ public class Dictionary { private FST affixFST(TreeMap> affixes) throws IOException { IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); - Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs); + FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs); IntsRefBuilder scratch = new IntsRefBuilder(); for (Map.Entry> entry : affixes.entrySet()) { Util.toUTF32(entry.getKey(), scratch); @@ -423,9 +423,9 @@ public class Dictionary { for (Integer c : entries) { 
output.ints[output.length++] = c; } - builder.add(scratch.get(), output); + fstCompiler.add(scratch.get(), output); } - return builder.finish(); + return fstCompiler.compile(); } static String escapeDash(String re) { @@ -608,14 +608,14 @@ public class Dictionary { } Outputs outputs = CharSequenceOutputs.getSingleton(); - Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs); + FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (Map.Entry entry : mappings.entrySet()) { Util.toUTF16(entry.getKey(), scratchInts); - builder.add(scratchInts.get(), new CharsRef(entry.getValue())); + fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue())); } - return builder.finish(); + return fstCompiler.compile(); } /** pattern accepts optional BOM + SET + any whitespace */ @@ -776,7 +776,7 @@ public class Dictionary { * @param decoder CharsetDecoder used to decode the contents of the file * @throws IOException Can be thrown while reading from the file */ - private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List dictionaries, CharsetDecoder decoder, Builder words) throws IOException { + private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List dictionaries, CharsetDecoder decoder, FSTCompiler words) throws IOException { BytesRefBuilder flagsScratch = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java index 078865fc344..7a061d3cc4a 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java @@ -35,6 +35,7 @@ import 
org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST.Arc; import org.apache.lucene.util.fst.FST.BytesReader; +import org.apache.lucene.util.fst.FSTCompiler; /** * Provides the ability to override any {@link KeywordAttribute} aware stemmer @@ -203,7 +204,7 @@ public final class StemmerOverrideFilter extends TokenFilter { */ public StemmerOverrideMap build() throws IOException { ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); - org.apache.lucene.util.fst.Builder builder = new org.apache.lucene.util.fst.Builder<>( + FSTCompiler fstCompiler = new FSTCompiler<>( FST.INPUT_TYPE.BYTE4, outputs); final int[] sort = hash.sort(); IntsRefBuilder intsSpare = new IntsRefBuilder(); @@ -213,9 +214,9 @@ public final class StemmerOverrideFilter extends TokenFilter { int id = sort[i]; BytesRef bytesRef = hash.get(id, spare); intsSpare.copyUTF8Bytes(bytesRef); - builder.add(intsSpare.get(), new BytesRef(outputValues.get(id))); + fstCompiler.add(intsSpare.get(), new BytesRef(outputValues.get(id))); } - return new StemmerOverrideMap(builder.finish(), ignoreCase); + return new StemmerOverrideMap(fstCompiler.compile(), ignoreCase); } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java index 97db1deb3e4..ed64e0d9215 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java @@ -39,6 +39,7 @@ import org.apache.lucene.util.CharsRefBuilder; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.Util; /** @@ -213,8 +214,8 @@ public class SynonymMap { public SynonymMap build() throws 
IOException { ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); // TODO: are we using the best sharing options? - org.apache.lucene.util.fst.Builder builder = - new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, outputs); + FSTCompiler fstCompiler = + new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs); BytesRefBuilder scratch = new BytesRefBuilder(); ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); @@ -278,10 +279,10 @@ public class SynonymMap { scratch.setLength(scratchOutput.getPosition()); //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count); - builder.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef()); + fstCompiler.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef()); } - FST fst = builder.finish(); + FST fst = fstCompiler.compile(); return new SynonymMap(fst, words, maxHorizontalContext); } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java index b7312cb7563..908b0275491 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java @@ -30,7 +30,7 @@ import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.CharSequenceOutputs; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.Outputs; @@ -196,26 +196,26 @@ public class TestDictionary extends LuceneTestCase { public void testReplacements() throws Exception { Outputs outputs = CharSequenceOutputs.getSingleton(); - Builder builder = 
new Builder<>(FST.INPUT_TYPE.BYTE2, outputs); + FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs); IntsRefBuilder scratchInts = new IntsRefBuilder(); // a -> b Util.toUTF16("a", scratchInts); - builder.add(scratchInts.get(), new CharsRef("b")); + fstCompiler.add(scratchInts.get(), new CharsRef("b")); // ab -> c Util.toUTF16("ab", scratchInts); - builder.add(scratchInts.get(), new CharsRef("c")); + fstCompiler.add(scratchInts.get(), new CharsRef("c")); // c -> de Util.toUTF16("c", scratchInts); - builder.add(scratchInts.get(), new CharsRef("de")); + fstCompiler.add(scratchInts.get(), new CharsRef("de")); // def -> gh Util.toUTF16("def", scratchInts); - builder.add(scratchInts.get(), new CharsRef("gh")); + fstCompiler.add(scratchInts.get(), new CharsRef("gh")); - FST fst = builder.finish(); + FST fst = fstCompiler.compile(); StringBuilder sb = new StringBuilder("atestanother"); Dictionary.applyMappings(fst, sb); diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java index fef24f948e1..120fbb580df 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java @@ -29,7 +29,7 @@ import java.util.TreeMap; import org.apache.lucene.analysis.ja.util.CSVUtil; import org.apache.lucene.util.IntsRefBuilder; -import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PositiveIntOutputs; @@ -99,7 +99,7 @@ public final class UserDictionary implements Dictionary { List segmentations = new ArrayList<>(featureEntries.size()); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); - Builder fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput); + FSTCompiler fstCompiler = 
new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput); IntsRefBuilder scratch = new IntsRefBuilder(); long ord = 0; @@ -136,11 +136,11 @@ public final class UserDictionary implements Dictionary { for (int i = 0; i < token.length(); i++) { scratch.setIntAt(i, (int) token.charAt(i)); } - fstBuilder.add(scratch.get(), ord); + fstCompiler.add(scratch.get(), ord); segmentations.add(wordIdAndLength); ord++; } - this.fst = new TokenInfoFST(fstBuilder.finish(), false); + this.fst = new TokenInfoFST(fstCompiler.compile(), false); this.data = data.toArray(new String[data.size()]); this.segmentations = segmentations.toArray(new int[segmentations.size()][]); } diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java index 4bb8d59b9ef..8f1befdeb0e 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java @@ -31,7 +31,7 @@ import java.util.stream.Stream; import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat; import org.apache.lucene.util.IntsRefBuilder; -import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PositiveIntOutputs; @@ -97,7 +97,7 @@ class TokenInfoDictionaryBuilder { lines.sort(Comparator.comparing(entry -> entry[0])); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); - Builder fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15); + FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput); IntsRefBuilder scratch = new IntsRefBuilder(); long ord = -1; // first ord will be 0 String lastValue = null; @@ -120,12 +120,12 @@ class 
TokenInfoDictionaryBuilder { for (int i = 0; i < token.length(); i++) { scratch.setIntAt(i, (int) token.charAt(i)); } - fstBuilder.add(scratch.get(), ord); + fstCompiler.add(scratch.get(), ord); } dictionary.addMapping((int) ord, offset); offset = next; } - dictionary.setFST(fstBuilder.finish()); + dictionary.setFST(fstCompiler.compile()); return dictionary; } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java index 186990e0b57..931ce484b50 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java @@ -25,7 +25,7 @@ import java.util.List; import org.apache.lucene.analysis.ko.POS; import org.apache.lucene.util.IntsRefBuilder; -import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PositiveIntOutputs; @@ -83,7 +83,7 @@ public final class UserDictionary implements Dictionary { entries.sort(Comparator.comparing(e -> e.split("\\s+")[0])); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); - Builder fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput); + FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput); IntsRefBuilder scratch = new IntsRefBuilder(); String lastToken = null; @@ -129,11 +129,11 @@ public final class UserDictionary implements Dictionary { for (int i = 0; i < token.length(); i++) { scratch.setIntAt(i, token.charAt(i)); } - fstBuilder.add(scratch.get(), ord); + fstCompiler.add(scratch.get(), ord); lastToken = token; ord ++; } - this.fst = new TokenInfoFST(fstBuilder.finish()); + this.fst = new TokenInfoFST(fstCompiler.compile()); this.segmentations = segmentations.toArray(new int[segmentations.size()][]); this.rightIds = new 
short[rightIds.size()]; for (int i = 0; i < rightIds.size(); i++) { diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java index 4f4f0b7be01..0c6d5f1f088 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java @@ -30,7 +30,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.lucene.util.IntsRefBuilder; -import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PositiveIntOutputs; @@ -90,7 +90,7 @@ class TokenInfoDictionaryBuilder { lines.sort(Comparator.comparing(left -> left[0])); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); - Builder fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15); + FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput); IntsRefBuilder scratch = new IntsRefBuilder(); long ord = -1; // first ord will be 0 String lastValue = null; @@ -116,12 +116,12 @@ class TokenInfoDictionaryBuilder { for (int i = 0; i < surfaceForm.length(); i++) { scratch.setIntAt(i, surfaceForm.charAt(i)); } - fstBuilder.add(scratch.get(), ord); + fstCompiler.add(scratch.get(), ord); } dictionary.addMapping((int) ord, offset); offset = next; } - dictionary.setFST(fstBuilder.finish()); + dictionary.setFST(fstCompiler.compile()); return dictionary; } } diff --git a/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java index 00083750b11..87c9064ffb1 100644 --- 
a/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java @@ -41,7 +41,7 @@ import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.IntsRefBuilder; -import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util; @@ -183,15 +183,15 @@ public class BooleanPerceptronClassifier implements Classifier { private void updateFST(SortedMap weights) throws IOException { PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - Builder fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); BytesRefBuilder scratchBytes = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (Map.Entry entry : weights.entrySet()) { scratchBytes.copyChars(entry.getKey()); - fstBuilder.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry + fstCompiler.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry .getValue().longValue()); } - fst = fstBuilder.finish(); + fst = fstCompiler.compile(); } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java index 47c6e7610ea..b8785050b90 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java @@ -33,7 +33,7 @@ import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; import 
org.apache.lucene.util.IntsRefBuilder; -import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util; @@ -219,7 +219,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase { } private class FSTFieldWriter extends FieldWriter { - private final Builder fstBuilder; + private final FSTCompiler fstCompiler; private final PositiveIntOutputs fstOutputs; private final long startTermsFilePointer; @@ -233,12 +233,12 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase { public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException { this.fieldInfo = fieldInfo; fstOutputs = PositiveIntOutputs.getSingleton(); - fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs); + fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, fstOutputs); indexStart = out.getFilePointer(); ////System.out.println("VGW: field=" + fieldInfo.name); // Always put empty string in - fstBuilder.add(new IntsRef(), termsFilePointer); + fstCompiler.add(new IntsRef(), termsFilePointer); startTermsFilePointer = termsFilePointer; } @@ -269,7 +269,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase { final int lengthSave = text.length; text.length = indexedTermPrefixLength(lastTerm.get(), text); try { - fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer); + fstCompiler.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer); } finally { text.length = lengthSave; } @@ -278,7 +278,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase { @Override public void finish(long termsFilePointer) throws IOException { - fst = fstBuilder.finish(); + fst = fstCompiler.compile(); if (fst != null) { fst.save(out); } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java 
b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java index 769c7323832..b305e61db7a 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java @@ -45,7 +45,7 @@ import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.StringHelper; -import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.Util; @@ -361,16 +361,14 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { } } - final Builder indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, - 0, 0, true, false, Integer.MAX_VALUE, - FST_OUTPUTS, true, 15); + final FSTCompiler fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS).shouldShareNonSingletonNodes(false).build(); //if (DEBUG) { // System.out.println(" compile index for prefix=" + prefix); //} //indexBuilder.DEBUG = false; final byte[] bytes = scratchBytes.toArrayCopy(); assert bytes.length > 0; - indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), + fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef), FST_OUTPUTS.newOutput(new BytesRef(bytes, 0, bytes.length), 0, Long.MAX_VALUE-(sumTotalTermCount-1))); scratchBytes.reset(); @@ -381,7 +379,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { for(PendingBlock block : blocks) { if (block.subIndices != null) { for(SubIndex subIndex : block.subIndices) { - append(indexBuilder, subIndex.index, termOrdOffset + subIndex.termOrdStart, scratchIntsRef); + append(fstCompiler, subIndex.index, termOrdOffset + subIndex.termOrdStart, scratchIntsRef); } block.subIndices = null; } @@ -391,7 +389,7 @@ public final class 
OrdsBlockTreeTermsWriter extends FieldsConsumer { assert sumTotalTermCount == totFloorTermCount; - index = indexBuilder.finish(); + index = fstCompiler.compile(); assert subIndices == null; /* @@ -405,7 +403,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { // TODO: maybe we could add bulk-add method to // Builder? Takes FST and unions it w/ current // FST. - private void append(Builder builder, FST subIndex, long termOrdOffset, IntsRefBuilder scratchIntsRef) throws IOException { + private void append(FSTCompiler fstCompiler, FST subIndex, long termOrdOffset, IntsRefBuilder scratchIntsRef) throws IOException { final BytesRefFSTEnum subIndexEnum = new BytesRefFSTEnum<>(subIndex); BytesRefFSTEnum.InputOutput indexEnt; while ((indexEnt = subIndexEnum.next()) != null) { @@ -416,7 +414,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { //long blockTermCount = output.endOrd - output.startOrd + 1; Output newOutput = FST_OUTPUTS.newOutput(output.bytes, termOrdOffset+output.startOrd, output.endOrd-termOrdOffset); //System.out.println(" append sub=" + indexEnt.input + " output=" + indexEnt.output + " termOrdOffset=" + termOrdOffset + " blockTermCount=" + blockTermCount + " newOutput=" + newOutput + " endOrd=" + (termOrdOffset+Long.MAX_VALUE-output.endOrd)); - builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), newOutput); + fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), newOutput); } } } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java index 0077045229d..a31a2f940b3 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java @@ -41,7 +41,7 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; import 
org.apache.lucene.util.IntsRefBuilder; -import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util; @@ -287,7 +287,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer { } final class TermsWriter { - private final Builder builder; + private final FSTCompiler fstCompiler; private final PositiveIntOutputs outputs; private final FieldInfo fieldInfo; private final int longsSize; @@ -311,7 +311,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer { this.fieldInfo = fieldInfo; this.longsSize = postingsWriter.setField(fieldInfo); this.outputs = PositiveIntOutputs.getSingleton(); - this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); this.lastBlockStatsFP = 0; this.lastBlockMetaLongsFP = 0; @@ -346,7 +346,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer { } metaLongsOut.writeVLong(metaBytesOut.size() - lastMetaBytesFP); - builder.add(Util.toIntsRef(text, scratchTerm), numTerms); + fstCompiler.add(Util.toIntsRef(text, scratchTerm), numTerms); numTerms++; lastMetaBytesFP = metaBytesOut.size(); @@ -365,7 +365,7 @@ public class FSTOrdTermsWriter extends FieldsConsumer { metadata.statsOut = statsOut; metadata.metaLongsOut = metaLongsOut; metadata.metaBytesOut = metaBytesOut; - metadata.dict = builder.finish(); + metadata.dict = fstCompiler.compile(); fields.add(metadata); } } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java index 22eff0b7547..2ef15651041 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java @@ -41,7 +41,7 @@ import org.apache.lucene.util.BytesRef; import 
org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRefBuilder; -import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.Util; @@ -247,7 +247,7 @@ public class FSTTermsWriter extends FieldsConsumer { } final class TermsWriter { - private final Builder builder; + private final FSTCompiler fstCompiler; private final FSTTermOutputs outputs; private final FieldInfo fieldInfo; private final int longsSize; @@ -261,7 +261,7 @@ public class FSTTermsWriter extends FieldsConsumer { this.fieldInfo = fieldInfo; this.longsSize = postingsWriter.setField(fieldInfo); this.outputs = new FSTTermOutputs(fieldInfo, longsSize); - this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); } public void finishTerm(BytesRef text, BlockTermState state) throws IOException { @@ -276,14 +276,14 @@ public class FSTTermsWriter extends FieldsConsumer { meta.bytes = metaWriter.toArrayCopy(); metaWriter.reset(); } - builder.add(Util.toIntsRef(text, scratchTerm), meta); + fstCompiler.add(Util.toIntsRef(text, scratchTerm), meta); numTerms++; } public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { // save FST dict if (numTerms > 0) { - final FST fst = builder.finish(); + final FST fst = fstCompiler.compile(); fields.add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, fst)); } } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java index 1dec0c869d2..a5464fcdb80 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java @@ 
-52,7 +52,7 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.StringHelper; -import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PairOutputs; @@ -539,11 +539,11 @@ class SimpleTextFieldsReader extends FieldsProducer { private void loadTerms() throws IOException { PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(); - final Builder>> b; + final FSTCompiler>> fstCompiler; final PairOutputs outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs); final PairOutputs> outputs = new PairOutputs<>(posIntOutputs, outputsInner); - b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); IndexInput in = SimpleTextFieldsReader.this.in.clone(); in.seek(termsStart); final BytesRefBuilder lastTerm = new BytesRefBuilder(); @@ -556,7 +556,7 @@ class SimpleTextFieldsReader extends FieldsProducer { SimpleTextUtil.readLine(in, scratch); if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) { if (lastDocsStart != -1) { - b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), + fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq))); sumTotalTermFreq += totalTermFreq; @@ -574,7 +574,7 @@ class SimpleTextFieldsReader extends FieldsProducer { totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length()) - 1; } else if (StringHelper.startsWith(scratch.get(), TERM)) { if (lastDocsStart != -1) { - b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart, + fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart, 
outputsInner.newPair((long) docFreq, totalTermFreq))); } lastDocsStart = in.getFilePointer(); @@ -589,7 +589,7 @@ class SimpleTextFieldsReader extends FieldsProducer { } } docCount = visitedDocs.cardinality(); - fst = b.finish(); + fst = fstCompiler.compile(); /* PrintStream ps = new PrintStream("out.dot"); fst.toDot(ps); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java index 3f173acf27e..2ffb687e08b 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java @@ -30,6 +30,7 @@ import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util; @@ -202,19 +203,19 @@ public class FSTDictionary implements IndexDictionary { */ public static class Builder implements IndexDictionary.Builder { - protected final org.apache.lucene.util.fst.Builder fstBuilder; + protected final FSTCompiler fstCompiler; protected final IntsRefBuilder scratchInts; public Builder() { PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - fstBuilder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); scratchInts = new IntsRefBuilder(); } @Override public void add(BytesRef blockKey, long blockFilePointer) { try { - fstBuilder.add(Util.toIntsRef(blockKey, scratchInts), blockFilePointer); + fstCompiler.add(Util.toIntsRef(blockKey, scratchInts), blockFilePointer); } catch (IOException e) { // Should never happen. 
throw new RuntimeException(e); @@ -224,7 +225,7 @@ public class FSTDictionary implements IndexDictionary { @Override public FSTDictionary build() { try { - return new FSTDictionary(fstBuilder.finish()); + return new FSTDictionary(fstCompiler.compile()); } catch (IOException e) { // Should never happen. throw new RuntimeException(e); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java index 3059d5a33b2..dde6810af1d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java @@ -44,7 +44,7 @@ import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.StringHelper; -import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.FST; @@ -454,29 +454,27 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { } final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); - final Builder indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, - 0, 0, true, false, Integer.MAX_VALUE, - outputs, true, 15); + final FSTCompiler fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).shouldShareNonSingletonNodes(false).build(); //if (DEBUG) { // System.out.println(" compile index for prefix=" + prefix); //} //indexBuilder.DEBUG = false; final byte[] bytes = scratchBytes.toArrayCopy(); assert bytes.length > 0; - indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length)); + fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length)); scratchBytes.reset(); // Copy over 
index for all sub-blocks for(PendingBlock block : blocks) { if (block.subIndices != null) { for(FST subIndex : block.subIndices) { - append(indexBuilder, subIndex, scratchIntsRef); + append(fstCompiler, subIndex, scratchIntsRef); } block.subIndices = null; } } - index = indexBuilder.finish(); + index = fstCompiler.compile(); assert subIndices == null; @@ -491,14 +489,14 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { // TODO: maybe we could add bulk-add method to // Builder? Takes FST and unions it w/ current // FST. - private void append(Builder builder, FST subIndex, IntsRefBuilder scratchIntsRef) throws IOException { + private void append(FSTCompiler fstCompiler, FST subIndex, IntsRefBuilder scratchIntsRef) throws IOException { final BytesRefFSTEnum subIndexEnum = new BytesRefFSTEnum<>(subIndex); BytesRefFSTEnum.InputOutput indexEnt; while((indexEnt = subIndexEnum.next()) != null) { //if (DEBUG) { // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output); //} - builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); + fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); } } } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index 46d4b781fc3..571c1e5e23f 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -605,7 +605,7 @@ public final class FST implements Accountable { // serializes new node by appending its bytes to the end // of the current byte[] - long addNode(Builder builder, Builder.UnCompiledNode nodeIn) throws IOException { + long addNode(FSTCompiler fstCompiler, FSTCompiler.UnCompiledNode nodeIn) throws IOException { T NO_OUTPUT = outputs.getNoOutput(); //System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs); @@ -616,28 +616,28 @@ public 
final class FST implements Accountable { return NON_FINAL_END_NODE; } } - final long startAddress = builder.bytes.getPosition(); + final long startAddress = fstCompiler.bytes.getPosition(); //System.out.println(" startAddr=" + startAddress); - final boolean doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(builder, nodeIn); + final boolean doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(fstCompiler, nodeIn); if (doFixedLengthArcs) { //System.out.println(" fixed length arcs"); - if (builder.numBytesPerArc.length < nodeIn.numArcs) { - builder.numBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, Integer.BYTES)]; - builder.numLabelBytesPerArc = new int[builder.numBytesPerArc.length]; + if (fstCompiler.numBytesPerArc.length < nodeIn.numArcs) { + fstCompiler.numBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, Integer.BYTES)]; + fstCompiler.numLabelBytesPerArc = new int[fstCompiler.numBytesPerArc.length]; } } - builder.arcCount += nodeIn.numArcs; + fstCompiler.arcCount += nodeIn.numArcs; final int lastArc = nodeIn.numArcs-1; - long lastArcStart = builder.bytes.getPosition(); + long lastArcStart = fstCompiler.bytes.getPosition(); int maxBytesPerArc = 0; int maxBytesPerArcWithoutLabel = 0; for(int arcIdx=0; arcIdx < nodeIn.numArcs; arcIdx++) { - final Builder.Arc arc = nodeIn.arcs[arcIdx]; - final Builder.CompiledNode target = (Builder.CompiledNode) arc.target; + final FSTCompiler.Arc arc = nodeIn.arcs[arcIdx]; + final FSTCompiler.CompiledNode target = (FSTCompiler.CompiledNode) arc.target; int flags = 0; //System.out.println(" arc " + arcIdx + " label=" + arc.label + " -> target=" + target.node); @@ -645,7 +645,7 @@ public final class FST implements Accountable { flags += BIT_LAST_ARC; } - if (builder.lastFrozenNode == target.node && !doFixedLengthArcs) { + if (fstCompiler.lastFrozenNode == target.node && !doFixedLengthArcs) { // TODO: for better perf (but more RAM used) we // could avoid this except when arc is "near" the // last arc: @@ 
-671,36 +671,36 @@ public final class FST implements Accountable { flags += BIT_ARC_HAS_OUTPUT; } - builder.bytes.writeByte((byte) flags); - long labelStart = builder.bytes.getPosition(); - writeLabel(builder.bytes, arc.label); - int numLabelBytes = (int) (builder.bytes.getPosition() - labelStart); + fstCompiler.bytes.writeByte((byte) flags); + long labelStart = fstCompiler.bytes.getPosition(); + writeLabel(fstCompiler.bytes, arc.label); + int numLabelBytes = (int) (fstCompiler.bytes.getPosition() - labelStart); // System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + bytes.getPosition() + " output=" + outputs.outputToString(arc.output)); if (arc.output != NO_OUTPUT) { - outputs.write(arc.output, builder.bytes); + outputs.write(arc.output, fstCompiler.bytes); //System.out.println(" write output"); } if (arc.nextFinalOutput != NO_OUTPUT) { //System.out.println(" write final output"); - outputs.writeFinalOutput(arc.nextFinalOutput, builder.bytes); + outputs.writeFinalOutput(arc.nextFinalOutput, fstCompiler.bytes); } if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) { assert target.node > 0; //System.out.println(" write target"); - builder.bytes.writeVLong(target.node); + fstCompiler.bytes.writeVLong(target.node); } // just write the arcs "like normal" on first pass, but record how many bytes each one took // and max byte size: if (doFixedLengthArcs) { - int numArcBytes = (int) (builder.bytes.getPosition() - lastArcStart); - builder.numBytesPerArc[arcIdx] = numArcBytes; - builder.numLabelBytesPerArc[arcIdx] = numLabelBytes; - lastArcStart = builder.bytes.getPosition(); + int numArcBytes = (int) (fstCompiler.bytes.getPosition() - lastArcStart); + fstCompiler.numBytesPerArc[arcIdx] = numArcBytes; + fstCompiler.numLabelBytesPerArc[arcIdx] = numLabelBytes; + lastArcStart = fstCompiler.bytes.getPosition(); maxBytesPerArc = Math.max(maxBytesPerArc, numArcBytes); maxBytesPerArcWithoutLabel = 
Math.max(maxBytesPerArcWithoutLabel, numArcBytes - numLabelBytes); //System.out.println(" arcBytes=" + numArcBytes + " labelBytes=" + numLabelBytes); @@ -733,18 +733,18 @@ public final class FST implements Accountable { int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1; assert labelRange > 0; - if (shouldExpandNodeWithDirectAddressing(builder, nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) { - writeNodeForDirectAddressing(builder, nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange); - builder.directAddressingNodeCount++; + if (shouldExpandNodeWithDirectAddressing(fstCompiler, nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) { + writeNodeForDirectAddressing(fstCompiler, nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange); + fstCompiler.directAddressingNodeCount++; } else { - writeNodeForBinarySearch(builder, nodeIn, startAddress, maxBytesPerArc); - builder.binarySearchNodeCount++; + writeNodeForBinarySearch(fstCompiler, nodeIn, startAddress, maxBytesPerArc); + fstCompiler.binarySearchNodeCount++; } } - final long thisNodeAddress = builder.bytes.getPosition()-1; - builder.bytes.reverse(startAddress, thisNodeAddress); - builder.nodeCount++; + final long thisNodeAddress = fstCompiler.bytes.getPosition()-1; + fstCompiler.bytes.reverse(startAddress, thisNodeAddress); + fstCompiler.nodeCount++; return thisNodeAddress; } @@ -757,8 +757,8 @@ public final class FST implements Accountable { * of bytes, but they allow either binary search or direct addressing on the arcs (instead of linear * scan) on lookup by arc label. 
*/ - private boolean shouldExpandNodeWithFixedLengthArcs(Builder builder, Builder.UnCompiledNode node) { - return builder.allowFixedLengthArcs && + private boolean shouldExpandNodeWithFixedLengthArcs(FSTCompiler fstCompiler, FSTCompiler.UnCompiledNode node) { + return fstCompiler.allowFixedLengthArcs && ((node.depth <= FIXED_LENGTH_ARC_SHALLOW_DEPTH && node.numArcs >= FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS) || node.numArcs >= FIXED_LENGTH_ARC_DEEP_NUM_ARCS); } @@ -769,18 +769,18 @@ public final class FST implements Accountable { * Prefer direct addressing for performance if it does not oversize binary search byte size too much, * so that the arcs can be directly addressed by label. * - * @see Builder#getDirectAddressingMaxOversizingFactor() + * @see FSTCompiler#getDirectAddressingMaxOversizingFactor() */ - private boolean shouldExpandNodeWithDirectAddressing(Builder builder, Builder.UnCompiledNode nodeIn, + private boolean shouldExpandNodeWithDirectAddressing(FSTCompiler fstCompiler, FSTCompiler.UnCompiledNode nodeIn, int numBytesPerArc, int maxBytesPerArcWithoutLabel, int labelRange) { // Anticipate precisely the size of the encodings. int sizeForBinarySearch = numBytesPerArc * nodeIn.numArcs; - int sizeForDirectAddressing = getNumPresenceBytes(labelRange) + builder.numLabelBytesPerArc[0] + int sizeForDirectAddressing = getNumPresenceBytes(labelRange) + fstCompiler.numLabelBytesPerArc[0] + maxBytesPerArcWithoutLabel * nodeIn.numArcs; // Determine the allowed oversize compared to binary search. // This is defined by a parameter of FST Builder (default 1: no oversize). 
- int allowedOversize = (int) (sizeForBinarySearch * builder.getDirectAddressingMaxOversizingFactor()); + int allowedOversize = (int) (sizeForBinarySearch * fstCompiler.getDirectAddressingMaxOversizingFactor()); int expansionCost = sizeForDirectAddressing - allowedOversize; // Select direct addressing if either: @@ -790,46 +790,46 @@ public final class FST implements Accountable { // In this case, decrement the credit by the oversize. // In addition, do not try to oversize to a clearly too large node size // (this is the DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR parameter). - if (expansionCost <= 0 || (builder.directAddressingExpansionCredit >= expansionCost + if (expansionCost <= 0 || (fstCompiler.directAddressingExpansionCredit >= expansionCost && sizeForDirectAddressing <= allowedOversize * DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR)) { - builder.directAddressingExpansionCredit -= expansionCost; + fstCompiler.directAddressingExpansionCredit -= expansionCost; return true; } return false; } - private void writeNodeForBinarySearch(Builder builder, Builder.UnCompiledNode nodeIn, long startAddress, int maxBytesPerArc) { + private void writeNodeForBinarySearch(FSTCompiler fstCompiler, FSTCompiler.UnCompiledNode nodeIn, long startAddress, int maxBytesPerArc) { // Build the header in a buffer. // It is a false/special arc which is in fact a node header with node flags followed by node metadata. - builder.fixedLengthArcsBuffer + fstCompiler.fixedLengthArcsBuffer .resetPosition() .writeByte(ARCS_FOR_BINARY_SEARCH) .writeVInt(nodeIn.numArcs) .writeVInt(maxBytesPerArc); - int headerLen = builder.fixedLengthArcsBuffer.getPosition(); + int headerLen = fstCompiler.fixedLengthArcsBuffer.getPosition(); // Expand the arcs in place, backwards. 
- long srcPos = builder.bytes.getPosition(); + long srcPos = fstCompiler.bytes.getPosition(); long destPos = startAddress + headerLen + nodeIn.numArcs * maxBytesPerArc; assert destPos >= srcPos; if (destPos > srcPos) { - builder.bytes.skipBytes((int) (destPos - srcPos)); + fstCompiler.bytes.skipBytes((int) (destPos - srcPos)); for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) { destPos -= maxBytesPerArc; - int arcLen = builder.numBytesPerArc[arcIdx]; + int arcLen = fstCompiler.numBytesPerArc[arcIdx]; srcPos -= arcLen; if (srcPos != destPos) { assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " arcLen=" + arcLen + " nodeIn.numArcs=" + nodeIn.numArcs; - builder.bytes.copyBytes(srcPos, destPos, arcLen); + fstCompiler.bytes.copyBytes(srcPos, destPos, arcLen); } } } // Write the header. - builder.bytes.writeBytes(startAddress, builder.fixedLengthArcsBuffer.getBytes(), 0, headerLen); + fstCompiler.bytes.writeBytes(startAddress, fstCompiler.fixedLengthArcsBuffer.getBytes(), 0, headerLen); } - private void writeNodeForDirectAddressing(Builder builder, Builder.UnCompiledNode nodeIn, long startAddress, int maxBytesPerArcWithoutLabel, int labelRange) { + private void writeNodeForDirectAddressing(FSTCompiler fstCompiler, FSTCompiler.UnCompiledNode nodeIn, long startAddress, int maxBytesPerArcWithoutLabel, int labelRange) { // Expand the arcs backwards in a buffer because we remove the labels. // So the obtained arcs might occupy less space. This is the reason why this // whole method is more complex. @@ -837,64 +837,64 @@ public final class FST implements Accountable { // the presence bits, and the first label. Keep the first label. 
int headerMaxLen = 11; int numPresenceBytes = getNumPresenceBytes(labelRange); - long srcPos = builder.bytes.getPosition(); - int totalArcBytes = builder.numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel; + long srcPos = fstCompiler.bytes.getPosition(); + int totalArcBytes = fstCompiler.numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel; int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes; - byte[] buffer = builder.fixedLengthArcsBuffer.ensureCapacity(bufferOffset).getBytes(); + byte[] buffer = fstCompiler.fixedLengthArcsBuffer.ensureCapacity(bufferOffset).getBytes(); // Copy the arcs to the buffer, dropping all labels except first one. for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) { bufferOffset -= maxBytesPerArcWithoutLabel; - int srcArcLen = builder.numBytesPerArc[arcIdx]; + int srcArcLen = fstCompiler.numBytesPerArc[arcIdx]; srcPos -= srcArcLen; - int labelLen = builder.numLabelBytesPerArc[arcIdx]; + int labelLen = fstCompiler.numLabelBytesPerArc[arcIdx]; // Copy the flags. - builder.bytes.copyBytes(srcPos, buffer, bufferOffset, 1); + fstCompiler.bytes.copyBytes(srcPos, buffer, bufferOffset, 1); // Skip the label, copy the remaining. int remainingArcLen = srcArcLen - 1 - labelLen; if (remainingArcLen != 0) { - builder.bytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen); + fstCompiler.bytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen); } if (arcIdx == 0) { // Copy the label of the first arc only. bufferOffset -= labelLen; - builder.bytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen); + fstCompiler.bytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen); } } assert bufferOffset == headerMaxLen + numPresenceBytes; // Build the header in the buffer. // It is a false/special arc which is in fact a node header with node flags followed by node metadata. 
- builder.fixedLengthArcsBuffer + fstCompiler.fixedLengthArcsBuffer .resetPosition() .writeByte(ARCS_FOR_DIRECT_ADDRESSING) .writeVInt(labelRange) // labelRange instead of numArcs. .writeVInt(maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc. - int headerLen = builder.fixedLengthArcsBuffer.getPosition(); + int headerLen = fstCompiler.fixedLengthArcsBuffer.getPosition(); // Prepare the builder byte store. Enlarge or truncate if needed. long nodeEnd = startAddress + headerLen + numPresenceBytes + totalArcBytes; - long currentPosition = builder.bytes.getPosition(); + long currentPosition = fstCompiler.bytes.getPosition(); if (nodeEnd >= currentPosition) { - builder.bytes.skipBytes((int) (nodeEnd - currentPosition)); + fstCompiler.bytes.skipBytes((int) (nodeEnd - currentPosition)); } else { - builder.bytes.truncate(nodeEnd); + fstCompiler.bytes.truncate(nodeEnd); } - assert builder.bytes.getPosition() == nodeEnd; + assert fstCompiler.bytes.getPosition() == nodeEnd; // Write the header. long writeOffset = startAddress; - builder.bytes.writeBytes(writeOffset, builder.fixedLengthArcsBuffer.getBytes(), 0, headerLen); + fstCompiler.bytes.writeBytes(writeOffset, fstCompiler.fixedLengthArcsBuffer.getBytes(), 0, headerLen); writeOffset += headerLen; // Write the presence bits - writePresenceBits(builder, nodeIn, writeOffset, numPresenceBytes); + writePresenceBits(fstCompiler, nodeIn, writeOffset, numPresenceBytes); writeOffset += numPresenceBytes; // Write the first label and the arcs. 
- builder.bytes.writeBytes(writeOffset, builder.fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes); + fstCompiler.bytes.writeBytes(writeOffset, fstCompiler.fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes); } - private void writePresenceBits(Builder builder, Builder.UnCompiledNode nodeIn, long dest, int numPresenceBytes) { + private void writePresenceBits(FSTCompiler fstCompiler, FSTCompiler.UnCompiledNode nodeIn, long dest, int numPresenceBytes) { long bytePos = dest; byte presenceBits = 1; // The first arc is always present. int presenceIndex = 0; @@ -904,7 +904,7 @@ public final class FST implements Accountable { assert label > previousLabel; presenceIndex += label - previousLabel; while (presenceIndex >= Byte.SIZE) { - builder.bytes.writeByte(bytePos++, presenceBits); + fstCompiler.bytes.writeByte(bytePos++, presenceBits); presenceBits = 0; presenceIndex -= Byte.SIZE; } @@ -915,7 +915,7 @@ public final class FST implements Accountable { assert presenceIndex == (nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label) % 8; assert presenceBits != 0; // The last byte is not 0. assert (presenceBits & (1 << presenceIndex)) != 0; // The last arc is always present. 
- builder.bytes.writeByte(bytePos++, presenceBits); + fstCompiler.bytes.writeByte(bytePos++, presenceBits); assert bytePos - dest == numPresenceBytes; } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java similarity index 74% rename from lucene/core/src/java/org/apache/lucene/util/fst/Builder.java rename to lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index a35c4d7afb3..556e5c2bd9f 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -49,31 +49,9 @@ import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc * @lucene.experimental */ -public class Builder { +public class FSTCompiler { - /** - * Default oversizing factor used to decide whether to encode a node with direct addressing or binary search. - * Default is 1: ensure no oversizing on average. - *

- * This factor does not determine whether to encode a node with a list of variable length arcs or with - * fixed length arcs. It only determines the effective encoding of a node that is already known to be - * encoded with fixed length arcs. - * See {@code FST.shouldExpandNodeWithFixedLengthArcs()} - * and {@code FST.shouldExpandNodeWithDirectAddressing()}. - *

- * For English words we measured 217K nodes, only 3.27% nodes are encoded with fixed length arcs, - * and 99.99% of them with direct addressing. Overall FST memory reduced by 1.67%. - *

- * For worst case we measured 168K nodes, 50% of them are encoded with fixed length arcs, - * and 14% of them with direct encoding. Overall FST memory reduced by 0.8%. - *

- * Use {@code TestFstDirectAddressing.main()} - * and {@code TestFstDirectAddressing.testWorstCaseForDirectAddressing()} - * to evaluate a change. - * - * @see #setDirectAddressingMaxOversizingFactor - */ - static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1.0f; + static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1f; private final NodeHash dedupHash; final FST fst; @@ -117,75 +95,29 @@ public class Builder { long binarySearchNodeCount; long directAddressingNodeCount; - boolean allowFixedLengthArcs; - float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR; + final boolean allowFixedLengthArcs; + final float directAddressingMaxOversizingFactor; long directAddressingExpansionCredit; - BytesStore bytes; + final BytesStore bytes; /** - * Instantiates an FST/FSA builder without any pruning. A shortcut to {@link - * #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs, boolean, int)} with - * pruning options turned off. + * Instantiates an FST/FSA builder with default settings and pruning options turned off. + * For more tuning and tweaking, see {@link Builder}. */ - public Builder(FST.INPUT_TYPE inputType, Outputs outputs) { - this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); + public FSTCompiler(FST.INPUT_TYPE inputType, Outputs outputs) { + this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, 1f); } - /** - * Instantiates an FST/FSA builder with all the possible tuning and construction - * tweaks. Read parameter documentation carefully. - * - * @param inputType - * The input type (transition labels). Can be anything from {@link INPUT_TYPE} - * enumeration. Shorter types will consume less memory. Strings (character sequences) are - * represented as {@link INPUT_TYPE#BYTE4} (full unicode codepoints). - * - * @param minSuffixCount1 - * If pruning the input graph during construction, this threshold is used for telling - * if a node is kept or pruned. 
If transition_count(node) >= minSuffixCount1, the node - * is kept. - * - * @param minSuffixCount2 - * (Note: only Mike McCandless knows what this one is really doing...) - * - * @param doShareSuffix - * If true, the shared suffixes will be compacted into unique paths. - * This requires an additional RAM-intensive hash map for lookups in memory. Setting this parameter to - * false creates a single suffix path for all input sequences. This will result in a larger - * FST, but requires substantially less memory and CPU during building. - * - * @param doShareNonSingletonNodes - * Only used if doShareSuffix is true. Set this to - * true to ensure FST is fully minimal, at cost of more - * CPU and more RAM during building. - * - * @param shareMaxTailLength - * Only used if doShareSuffix is true. Set this to - * Integer.MAX_VALUE to ensure FST is fully minimal, at cost of more - * CPU and more RAM during building. - * - * @param outputs The output type for each input sequence. Applies only if building an FST. For - * FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the - * singleton output object. - * - * @param allowFixedLengthArcs Pass false to disable the fixed length arc optimization (binary search or - * direct addressing) while building the FST; this will make the resulting FST smaller but slower to - * traverse. - * - * @param bytesPageBits How many bits wide to make each - * byte[] block in the BytesStore; if you know the FST - * will be large then make this larger. For example 15 - * bits = 32768 byte pages. 
- */ - public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix, - boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs outputs, - boolean allowFixedLengthArcs, int bytesPageBits) { + private FSTCompiler(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix, + boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs outputs, + boolean allowFixedLengthArcs, int bytesPageBits, float directAddressingMaxOversizingFactor) { this.minSuffixCount1 = minSuffixCount1; this.minSuffixCount2 = minSuffixCount2; this.doShareNonSingletonNodes = doShareNonSingletonNodes; this.shareMaxTailLength = shareMaxTailLength; this.allowFixedLengthArcs = allowFixedLengthArcs; + this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor; fst = new FST<>(inputType, outputs, bytesPageBits); bytes = fst.bytes; assert bytes != null; @@ -205,22 +137,145 @@ public class Builder { } /** - * Overrides the default the maximum oversizing of fixed array allowed to enable direct addressing - * of arcs instead of binary search. + * Fluent-style constructor for FST {@link FSTCompiler}. *

- * Setting this factor to a negative value (e.g. -1) effectively disables direct addressing, - * only binary search nodes will be created. - * - * @see #DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR + * Creates an FST/FSA builder with all the possible tuning and construction tweaks. + * Read parameter documentation carefully. */ - public Builder setDirectAddressingMaxOversizingFactor(float factor) { - directAddressingMaxOversizingFactor = factor; - return this; + public static class Builder { + + private final INPUT_TYPE inputType; + private final Outputs outputs; + private int minSuffixCount1; + private int minSuffixCount2; + private boolean shouldShareSuffix = true; + private boolean shouldShareNonSingletonNodes = true; + private int shareMaxTailLength = Integer.MAX_VALUE; + private boolean allowFixedLengthArcs = true; + private int bytesPageBits = 15; + private float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR; + + /** + * @param inputType The input type (transition labels). Can be anything from {@link INPUT_TYPE} + * enumeration. Shorter types will consume less memory. Strings (character sequences) are + * represented as {@link INPUT_TYPE#BYTE4} (full unicode codepoints). + * @param outputs The output type for each input sequence. Applies only if building an FST. For + * FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the + * singleton output object. + */ + public Builder(FST.INPUT_TYPE inputType, Outputs outputs) { + this.inputType = inputType; + this.outputs = outputs; + } + + /** + * If pruning the input graph during construction, this threshold is used for telling if a node is kept + * or pruned. If transition_count(node) >= minSuffixCount1, the node is kept. + *

+ * Default = 0. + */ + public Builder minSuffixCount1(int minSuffixCount1) { + this.minSuffixCount1 = minSuffixCount1; + return this; + } + + /** + * Better pruning: we prune node (and all following nodes) if the prior node has less than this number + * of terms go through it. + *

+ * Default = 0. + */ + public Builder minSuffixCount2(int minSuffixCount2) { + this.minSuffixCount2 = minSuffixCount2; + return this; + } + + /** + * If {@code true}, the shared suffixes will be compacted into unique paths. + * This requires an additional RAM-intensive hash map for lookups in memory. Setting this parameter to + * {@code false} creates a single suffix path for all input sequences. This will result in a larger + * FST, but requires substantially less memory and CPU during building. + *

+ * Default = {@code true}. + */ + public Builder shouldShareSuffix(boolean shouldShareSuffix) { + this.shouldShareSuffix = shouldShareSuffix; + return this; + } + + /** + * Only used if {@code shouldShareSuffix} is true. Set this to true to ensure FST is fully minimal, + * at cost of more CPU and more RAM during building. + *

+ * Default = {@code true}. + */ + public Builder shouldShareNonSingletonNodes(boolean shouldShareNonSingletonNodes) { + this.shouldShareNonSingletonNodes = shouldShareNonSingletonNodes; + return this; + } + + /** + * Only used if {@code shouldShareSuffix} is true. Set this to Integer.MAX_VALUE to ensure FST is + * fully minimal, at cost of more CPU and more RAM during building. + *

+ * Default = {@link Integer#MAX_VALUE}. + */ + public Builder shareMaxTailLength(int shareMaxTailLength) { + this.shareMaxTailLength = shareMaxTailLength; + return this; + } + + /** + * Pass {@code false} to disable the fixed length arc optimization (binary search or direct addressing) + * while building the FST; this will make the resulting FST smaller but slower to traverse. + *

+ * Default = {@code true}. + */ + public Builder allowFixedLengthArcs(boolean allowFixedLengthArcs) { + this.allowFixedLengthArcs = allowFixedLengthArcs; + return this; + } + + /** + * How many bits wide to make each byte[] block in the BytesStore; if you know the FST + * will be large then make this larger. For example 15 bits = 32768 byte pages. + *

+ * Default = 15. + */ + public Builder bytesPageBits(int bytesPageBits) { + this.bytesPageBits = bytesPageBits; + return this; + } + + /** + * Overrides the default maximum oversizing of fixed array allowed to enable direct addressing + * of arcs instead of binary search. + *

+ * Setting this factor to a negative value (e.g. -1) effectively disables direct addressing, + * only binary search nodes will be created. + *

+ * This factor does not determine whether to encode a node with a list of variable length arcs or with + * fixed length arcs. It only determines the effective encoding of a node that is already known to be + * encoded with fixed length arcs. + *

+ * Default = 1. + */ + public Builder directAddressingMaxOversizingFactor(float factor) { + this.directAddressingMaxOversizingFactor = factor; + return this; + } + + /** + * Creates a new {@link FSTCompiler}. + */ + public FSTCompiler build() { + FSTCompiler fstCompiler = new FSTCompiler<>(inputType, minSuffixCount1, minSuffixCount2, shouldShareSuffix, + shouldShareNonSingletonNodes, shareMaxTailLength, outputs, allowFixedLengthArcs, bytesPageBits, + directAddressingMaxOversizingFactor); + return fstCompiler; + } } - /** - * @see #setDirectAddressingMaxOversizingFactor(float) - */ public float getDirectAddressingMaxOversizingFactor() { return directAddressingMaxOversizingFactor; } @@ -514,7 +569,7 @@ public class Builder { /** Returns final FST. NOTE: this will return null if * nothing is accepted by the FST. */ - public FST finish() throws IOException { + public FST compile() throws IOException { final UnCompiledNode root = frontier[0]; @@ -554,19 +609,19 @@ public class Builder { } /** Expert: holds a pending (seen but not yet serialized) arc. */ - public static class Arc { - public int label; // really an "unsigned" byte - public Node target; - public boolean isFinal; - public T output; - public T nextFinalOutput; + static class Arc { + int label; // really an "unsigned" byte + Node target; + boolean isFinal; + T output; + T nextFinalOutput; } // NOTE: not many instances of Node or CompiledNode are in // memory while the FST is being built; it's only the // current "frontier": - static interface Node { + interface Node { boolean isCompiled(); } @@ -583,20 +638,20 @@ public class Builder { } /** Expert: holds a pending (seen but not yet serialized) Node. 
*/ - public static final class UnCompiledNode implements Node { - final Builder owner; - public int numArcs; - public Arc[] arcs; + static final class UnCompiledNode implements Node { + final FSTCompiler owner; + int numArcs; + Arc[] arcs; // TODO: instead of recording isFinal/output on the // node, maybe we should use -1 arc to mean "end" (like // we do when reading the FST). Would simplify much // code here... - public T output; - public boolean isFinal; - public long inputCount; + T output; + boolean isFinal; + long inputCount; /** This node's depth, starting from the automaton root. */ - public final int depth; + final int depth; /** * @param depth @@ -605,7 +660,7 @@ public class Builder { * fanout size). */ @SuppressWarnings({"rawtypes","unchecked"}) - public UnCompiledNode(Builder owner, int depth) { + UnCompiledNode(FSTCompiler owner, int depth) { this.owner = owner; arcs = (Arc[]) new Arc[1]; arcs[0] = new Arc<>(); @@ -618,7 +673,7 @@ public class Builder { return false; } - public void clear() { + void clear() { numArcs = 0; isFinal = false; output = owner.NO_OUTPUT; @@ -628,13 +683,13 @@ public class Builder { // for nodes on the frontier (even when reused). 
} - public T getLastOutput(int labelToMatch) { + T getLastOutput(int labelToMatch) { assert numArcs > 0; assert arcs[numArcs-1].label == labelToMatch; return arcs[numArcs-1].output; } - public void addArc(int label, Node target) { + void addArc(int label, Node target) { assert label >= 0; assert numArcs == 0 || label > arcs[numArcs-1].label: "arc[numArcs-1].label=" + arcs[numArcs-1].label + " new label=" + label + " numArcs=" + numArcs; if (numArcs == arcs.length) { @@ -651,7 +706,7 @@ public class Builder { arc.isFinal = false; } - public void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) { + void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) { assert numArcs > 0; final Arc arc = arcs[numArcs-1]; assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch; @@ -661,14 +716,14 @@ public class Builder { arc.isFinal = isFinal; } - public void deleteLast(int label, Node target) { + void deleteLast(int label, Node target) { assert numArcs > 0; assert label == arcs[numArcs-1].label; assert target == arcs[numArcs-1].target; numArcs--; } - public void setLastOutput(int labelToMatch, T newOutput) { + void setLastOutput(int labelToMatch, T newOutput) { assert owner.validOutput(newOutput); assert numArcs > 0; final Arc arc = arcs[numArcs-1]; @@ -677,7 +732,7 @@ public class Builder { } // pushes an output prefix forward onto all arcs - public void prependOutput(T outputPrefix) { + void prependOutput(T outputPrefix) { assert owner.validOutput(outputPrefix); for(int arcIdx=0;arcIdx { this.in = in; } - private boolean nodesEqual(Builder.UnCompiledNode node, long address) throws IOException { + private boolean nodesEqual(FSTCompiler.UnCompiledNode node, long address) throws IOException { fst.readFirstRealTargetArc(address, scratchArc, in); // Fail fast for a node with fixed length arcs. 
@@ -58,10 +58,10 @@ final class NodeHash { } for(int arcUpto=0; arcUpto < node.numArcs; arcUpto++) { - final Builder.Arc arc = node.arcs[arcUpto]; + final FSTCompiler.Arc arc = node.arcs[arcUpto]; if (arc.label != scratchArc.label() || !arc.output.equals(scratchArc.output()) || - ((Builder.CompiledNode) arc.target).node != scratchArc.target() || + ((FSTCompiler.CompiledNode) arc.target).node != scratchArc.target() || !arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) || arc.isFinal != scratchArc.isFinal()) { return false; @@ -82,16 +82,16 @@ final class NodeHash { // hash code for an unfrozen node. This must be identical // to the frozen case (below)!! - private long hash(Builder.UnCompiledNode node) { + private long hash(FSTCompiler.UnCompiledNode node) { final int PRIME = 31; //System.out.println("hash unfrozen"); long h = 0; // TODO: maybe if number of arcs is high we can safely subsample? for (int arcIdx=0; arcIdx < node.numArcs; arcIdx++) { - final Builder.Arc arc = node.arcs[arcIdx]; + final FSTCompiler.Arc arc = node.arcs[arcIdx]; //System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal); h = PRIME * h + arc.label; - long n = ((Builder.CompiledNode) arc.target).node; + long n = ((FSTCompiler.CompiledNode) arc.target).node; h = PRIME * h + (int) (n^(n>>32)); h = PRIME * h + arc.output.hashCode(); h = PRIME * h + arc.nextFinalOutput.hashCode(); @@ -127,7 +127,7 @@ final class NodeHash { return h & Long.MAX_VALUE; } - public long add(Builder builder, Builder.UnCompiledNode nodeIn) throws IOException { + public long add(FSTCompiler fstCompiler, FSTCompiler.UnCompiledNode nodeIn) throws IOException { //System.out.println("hash: add count=" + count + " vs " + table.size() + " mask=" + mask); final long h = hash(nodeIn); long pos = h & mask; @@ -136,7 +136,7 @@ final class NodeHash { final long v = table.get(pos); if (v == 
0) { // freeze & add - final long node = fst.addNode(builder, nodeIn); + final long node = fst.addNode(fstCompiler, nodeIn); //System.out.println(" now freeze node=" + node); assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h; count++; diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java index cc093b3cdba..9ee6947b2e3 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java @@ -54,8 +54,7 @@ public class Test2BFST extends LuceneTestCase { System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS"); Outputs outputs = NoOutputs.getSingleton(); Object NO_OUTPUT = outputs.getNoOutput(); - final Builder b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, - true, 15); + final FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); int count = 0; Random r = new Random(seed); @@ -66,21 +65,21 @@ public class Test2BFST extends LuceneTestCase { for(int i=10;i Integer.MAX_VALUE + 100L * 1024 * 1024) { + if (fstCompiler.getNodeCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) { break; } nextInput(r, ints2); } - FST fst = b.finish(); + FST fst = fstCompiler.compile(); for(int verify=0;verify<2;verify++) { - System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]"); + System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + fstCompiler.getNodeCount() + "; arcCount=" + fstCompiler.getArcCount() + "]"); Arrays.fill(ints2, 0); r = new Random(seed); @@ -136,8 +135,7 @@ public class Test2BFST extends LuceneTestCase { { System.out.println("\nTEST: 3 GB size; outputs=bytes"); Outputs outputs = ByteSequenceOutputs.getSingleton(); - final Builder b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, 
Integer.MAX_VALUE, outputs, - true, 15); + final FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); byte[] outputBytes = new byte[20]; BytesRef output = new BytesRef(outputBytes); @@ -147,10 +145,10 @@ public class Test2BFST extends LuceneTestCase { while(true) { r.nextBytes(outputBytes); //System.out.println("add: " + input + " -> " + output); - b.add(input, BytesRef.deepCopyOf(output)); + fstCompiler.add(input, BytesRef.deepCopyOf(output)); count++; if (count % 10000 == 0) { - long size = b.fstRamBytesUsed(); + long size = fstCompiler.fstRamBytesUsed(); if (count % 1000000 == 0) { System.out.println(count + "...: " + size + " bytes"); } @@ -161,10 +159,10 @@ public class Test2BFST extends LuceneTestCase { nextInput(r, ints); } - FST fst = b.finish(); + FST fst = fstCompiler.compile(); for(int verify=0;verify<2;verify++) { - System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]"); + System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + fstCompiler.getNodeCount() + "; arcCount=" + fstCompiler.getArcCount() + "]"); r = new Random(seed); Arrays.fill(ints, 0); @@ -216,8 +214,7 @@ public class Test2BFST extends LuceneTestCase { { System.out.println("\nTEST: 3 GB size; outputs=long"); Outputs outputs = PositiveIntOutputs.getSingleton(); - final Builder b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, - true, 15); + final FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); long output = 1; @@ -226,11 +223,11 @@ public class Test2BFST extends LuceneTestCase { Random r = new Random(seed); while(true) { //System.out.println("add: " + input + " -> " + output); - b.add(input, output); + fstCompiler.add(input, output); output += 1+r.nextInt(10); count++; if (count % 10000 == 0) { - long size = b.fstRamBytesUsed(); + long size = fstCompiler.fstRamBytesUsed(); if 
(count % 1000000 == 0) { System.out.println(count + "...: " + size + " bytes"); } @@ -241,11 +238,11 @@ public class Test2BFST extends LuceneTestCase { nextInput(r, ints); } - FST fst = b.finish(); + FST fst = fstCompiler.compile(); for(int verify=0;verify<2;verify++) { - System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]"); + System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + fstCompiler.getNodeCount() + "; arcCount=" + fstCompiler.getArcCount() + "]"); Arrays.fill(ints, 0); diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFstDirectAddressing.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDirectAddressing.java similarity index 77% rename from lucene/core/src/test/org/apache/lucene/util/fst/TestFstDirectAddressing.java rename to lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDirectAddressing.java index a1e18a35007..c6cfa134990 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFstDirectAddressing.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDirectAddressing.java @@ -35,7 +35,7 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.LuceneTestCase; -public class TestFstDirectAddressing extends LuceneTestCase { +public class TestFSTDirectAddressing extends LuceneTestCase { public void testDenseWithGap() throws Exception { List words = Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm"); @@ -86,13 +86,13 @@ public class TestFstDirectAddressing extends LuceneTestCase { Collections.sort(wordList); // Disable direct addressing and measure the FST size. 
- Builder builder = createBuilder(-1f); - FST fst = buildFST(wordList, builder); + FSTCompiler fstCompiler = createFSTCompiler(-1f); + FST fst = buildFST(wordList, fstCompiler); long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed(); // Enable direct addressing and measure the FST size. - builder = createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR); - fst = buildFST(wordList, builder); + fstCompiler = createFSTCompiler(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR); + fst = buildFST(wordList, fstCompiler); long ramBytesUsed = fst.ramBytesUsed(); // Compute the size increase in percents. @@ -107,42 +107,43 @@ public class TestFstDirectAddressing extends LuceneTestCase { directAddressingMemoryIncreasePercent < MEMORY_INCREASE_LIMIT_PERCENT); } - private static void printStats(Builder builder, long ramBytesUsed, double directAddressingMemoryIncreasePercent) { - System.out.println("directAddressingMaxOversizingFactor = " + builder.getDirectAddressingMaxOversizingFactor()); + private static void printStats(FSTCompiler fstCompiler, long ramBytesUsed, double directAddressingMemoryIncreasePercent) { + System.out.println("directAddressingMaxOversizingFactor = " + fstCompiler.getDirectAddressingMaxOversizingFactor()); System.out.println("ramBytesUsed = " + String.format(Locale.ENGLISH, "%.2f MB", ramBytesUsed / 1024d / 1024d) + String.format(Locale.ENGLISH, " (%.2f %% increase with direct addressing)", directAddressingMemoryIncreasePercent)); - System.out.println("num nodes = " + builder.nodeCount); - long fixedLengthArcNodeCount = builder.directAddressingNodeCount + builder.binarySearchNodeCount; + System.out.println("num nodes = " + fstCompiler.nodeCount); + long fixedLengthArcNodeCount = fstCompiler.directAddressingNodeCount + fstCompiler.binarySearchNodeCount; System.out.println("num fixed-length-arc nodes = " + fixedLengthArcNodeCount + String.format(Locale.ENGLISH, " (%.2f %% of all nodes)", - ((double) fixedLengthArcNodeCount / builder.nodeCount * 
100))); - System.out.println("num binary-search nodes = " + (builder.binarySearchNodeCount) + ((double) fixedLengthArcNodeCount / fstCompiler.nodeCount * 100))); + System.out.println("num binary-search nodes = " + (fstCompiler.binarySearchNodeCount) + String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)", - ((double) (builder.binarySearchNodeCount) / fixedLengthArcNodeCount * 100))); - System.out.println("num direct-addressing nodes = " + (builder.directAddressingNodeCount) + ((double) (fstCompiler.binarySearchNodeCount) / fixedLengthArcNodeCount * 100))); + System.out.println("num direct-addressing nodes = " + (fstCompiler.directAddressingNodeCount) + String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)", - ((double) (builder.directAddressingNodeCount) / fixedLengthArcNodeCount * 100))); + ((double) (fstCompiler.directAddressingNodeCount) / fixedLengthArcNodeCount * 100))); } - private static Builder createBuilder(float directAddressingMaxOversizingFactor) { - return new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, NoOutputs.getSingleton(), true, 15) - .setDirectAddressingMaxOversizingFactor(directAddressingMaxOversizingFactor); + private static FSTCompiler createFSTCompiler(float directAddressingMaxOversizingFactor) { + return new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, NoOutputs.getSingleton()) + .directAddressingMaxOversizingFactor(directAddressingMaxOversizingFactor) + .build(); } private FST buildFST(List entries) throws Exception { - return buildFST(entries, createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR)); + return buildFST(entries, createFSTCompiler(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR)); } - private static FST buildFST(List entries, Builder builder) throws Exception { + private static FST buildFST(List entries, FSTCompiler fstCompiler) throws Exception { BytesRef last = null; for (BytesRef entry : entries) { if (entry.equals(last) == false) { - 
builder.add(Util.toIntsRef(entry, new IntsRefBuilder()), NoOutputs.getSingleton().getNoOutput()); + fstCompiler.add(Util.toIntsRef(entry, new IntsRefBuilder()), NoOutputs.getSingleton().getNoOutput()); } last = entry; } - return builder.finish(); + return fstCompiler.compile(); } public static void main(String... args) throws Exception { @@ -195,18 +196,18 @@ public class TestFstDirectAddressing extends LuceneTestCase { Collections.sort(wordList); // Disable direct addressing and measure the FST size. - Builder builder = createBuilder(-1f); - FST fst = buildFST(wordList, builder); + FSTCompiler fstCompiler = createFSTCompiler(-1f); + FST fst = buildFST(wordList, fstCompiler); long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed(); // Enable direct addressing and measure the FST size. - builder = createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR); - fst = buildFST(wordList, builder); + fstCompiler = createFSTCompiler(FSTCompiler.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR); + fst = buildFST(wordList, fstCompiler); long ramBytesUsed = fst.ramBytesUsed(); // Compute the size increase in percents. 
double directAddressingMemoryIncreasePercent = ((double) ramBytesUsed / ramBytesUsedNoDirectAddressing - 1) * 100; - printStats(builder, ramBytesUsed, directAddressingMemoryIncreasePercent); + printStats(fstCompiler, ramBytesUsed, directAddressingMemoryIncreasePercent); } } diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java index 316cbdf1e42..72c09e9814a 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -327,7 +327,7 @@ public class TestFSTs extends LuceneTestCase { writer.close(); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); + FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); boolean storeOrd = random().nextBoolean(); if (VERBOSE) { @@ -373,15 +373,15 @@ public class TestFSTs extends LuceneTestCase { } else { output = termsEnum.docFreq(); } - builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output); + fstCompiler.add(Util.toIntsRef(term, scratchIntsRef), (long) output); ord++; if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) { System.out.println(ord + " terms..."); } } - FST fst = builder.finish(); + FST fst = fstCompiler.compile(); if (VERBOSE) { - System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes"); + System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fstCompiler.getNodeCount() + " nodes; " + fstCompiler.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes"); } if (ord > 0) { @@ -460,7 +460,7 @@ public class TestFSTs extends LuceneTestCase { private final Path wordsFileIn; private int inputMode; private final Outputs outputs; - private final 
Builder builder; + private final FSTCompiler fstCompiler; public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs outputs, boolean noArcArrays) { this.dirOut = dirOut; @@ -468,7 +468,11 @@ public class TestFSTs extends LuceneTestCase { this.inputMode = inputMode; this.outputs = outputs; - builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, !noArcArrays, 15); + fstCompiler = new FSTCompiler.Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs) + .minSuffixCount2(prune) + .shouldShareSuffix(prune == 0) + .allowFixedLengthArcs(!noArcArrays) + .build(); } protected abstract T getOutput(IntsRef input, int ord) throws IOException; @@ -486,7 +490,7 @@ public class TestFSTs extends LuceneTestCase { break; } toIntsRef(w, inputMode, intsRef); - builder.add(intsRef.get(), + fstCompiler.add(intsRef.get(), getOutput(intsRef.get(), ord)); ord++; @@ -503,8 +507,8 @@ public class TestFSTs extends LuceneTestCase { long tMid = System.currentTimeMillis(); System.out.println(((tMid-tStart) / 1000.0) + " sec to add all terms"); - assert builder.getTermCount() == ord; - FST fst = builder.finish(); + assert fstCompiler.getTermCount() == ord; + FST fst = fstCompiler.compile(); long tEnd = System.currentTimeMillis(); System.out.println(((tEnd-tMid) / 1000.0) + " sec to finish/pack"); if (fst == null) { @@ -516,8 +520,8 @@ public class TestFSTs extends LuceneTestCase { return; } - System.out.println(ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs; tot size " + fst.ramBytesUsed()); - if (builder.getNodeCount() < 100) { + System.out.println(ord + " terms; " + fstCompiler.getNodeCount() + " nodes; " + fstCompiler.getArcCount() + " arcs; tot size " + fst.ramBytesUsed()); + if (fstCompiler.getNodeCount() < 100) { Writer w = Files.newBufferedWriter(Paths.get("out.dot"), StandardCharsets.UTF_8); Util.toDot(fst, w, 
false, false); w.close(); @@ -717,9 +721,9 @@ public class TestFSTs extends LuceneTestCase { public void testSingleString() throws Exception { final Outputs outputs = NoOutputs.getSingleton(); - final Builder b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); - b.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRefBuilder()), outputs.getNoOutput()); - final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum<>(b.finish()); + final FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); + fstCompiler.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRefBuilder()), outputs.getNoOutput()); + final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum<>(fstCompiler.compile()); assertNull(fstEnum.seekFloor(new BytesRef("foo"))); assertNull(fstEnum.seekCeil(new BytesRef("foobaz"))); } @@ -728,12 +732,12 @@ public class TestFSTs extends LuceneTestCase { public void testDuplicateFSAString() throws Exception { String str = "foobar"; final Outputs outputs = NoOutputs.getSingleton(); - final Builder b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + final FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); IntsRefBuilder ints = new IntsRefBuilder(); for(int i=0; i<10; i++) { - b.add(Util.toIntsRef(new BytesRef(str), ints), outputs.getNoOutput()); + fstCompiler.add(Util.toIntsRef(new BytesRef(str), ints), outputs.getNoOutput()); } - FST fst = b.finish(); + FST fst = fstCompiler.compile(); // count the input paths int count = 0; @@ -797,17 +801,17 @@ public class TestFSTs extends LuceneTestCase { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); // Build an FST mapping BytesRef -> Long - final Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + final FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); final BytesRef a = new BytesRef("a"); final BytesRef b = new BytesRef("b"); final BytesRef c = new BytesRef("c"); - builder.add(Util.toIntsRef(a, new IntsRefBuilder()), 17L); - 
builder.add(Util.toIntsRef(b, new IntsRefBuilder()), 42L); - builder.add(Util.toIntsRef(c, new IntsRefBuilder()), 13824324872317238L); + fstCompiler.add(Util.toIntsRef(a, new IntsRefBuilder()), 17L); + fstCompiler.add(Util.toIntsRef(b, new IntsRefBuilder()), 42L); + fstCompiler.add(Util.toIntsRef(c, new IntsRefBuilder()), 13824324872317238L); - final FST fst = builder.finish(); + final FST fst = fstCompiler.compile(); assertEquals(13824324872317238L, (long) Util.get(fst, c)); assertEquals(42, (long) Util.get(fst, b)); @@ -1035,7 +1039,7 @@ public class TestFSTs extends LuceneTestCase { FST compile(String[] lines) throws IOException { final NoOutputs outputs = NoOutputs.getSingleton(); final Object nothing = outputs.getNoOutput(); - final Builder b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + final FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); int line = 0; final BytesRefBuilder term = new BytesRefBuilder(); @@ -1046,10 +1050,10 @@ public class TestFSTs extends LuceneTestCase { break; } term.copyChars(w); - b.add(Util.toIntsRef(term.get(), scratchIntsRef), nothing); + fstCompiler.add(Util.toIntsRef(term.get(), scratchIntsRef), nothing); } - return b.finish(); + return fstCompiler.compile(); } void generate(ArrayList out, StringBuilder b, char from, char to, @@ -1110,10 +1114,10 @@ public class TestFSTs extends LuceneTestCase { public void testFinalOutputOnEndState() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - final Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); - builder.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L); - builder.add(Util.toUTF32("station", new IntsRefBuilder()), 10L); - final FST fst = builder.finish(); + final FSTCompiler fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).minSuffixCount1(2).build(); + fstCompiler.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L); + 
fstCompiler.add(Util.toUTF32("station", new IntsRefBuilder()), 10L); + final FST fst = fstCompiler.compile(); //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot")); StringWriter w = new StringWriter(); Util.toDot(fst, w, false, false); @@ -1124,10 +1128,10 @@ public class TestFSTs extends LuceneTestCase { public void testInternalFinalState() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - final Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); - builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput()); - builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput()); - final FST fst = builder.finish(); + final FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); + fstCompiler.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput()); + fstCompiler.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput()); + final FST fst = fstCompiler.compile(); StringWriter w = new StringWriter(); //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); Util.toDot(fst, w, false, false); @@ -1145,20 +1149,20 @@ public class TestFSTs extends LuceneTestCase { public void testNonFinalStopNode() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); final Long nothing = outputs.getNoOutput(); - final Builder b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + final FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); //final FST fst = new FST<>(FST.INPUT_TYPE.BYTE1, outputs, false, PackedInts.COMPACT, 15); - final FST fst = b.fst; + final FST fst = fstCompiler.fst; - final Builder.UnCompiledNode rootNode = new Builder.UnCompiledNode<>(b, 0); + final FSTCompiler.UnCompiledNode rootNode = new FSTCompiler.UnCompiledNode<>(fstCompiler, 
0); // Add final stop node { - final Builder.UnCompiledNode node = new Builder.UnCompiledNode<>(b, 0); + final FSTCompiler.UnCompiledNode node = new FSTCompiler.UnCompiledNode<>(fstCompiler, 0); node.isFinal = true; rootNode.addArc('a', node); - final Builder.CompiledNode frozen = new Builder.CompiledNode(); - frozen.node = fst.addNode(b, node); + final FSTCompiler.CompiledNode frozen = new FSTCompiler.CompiledNode(); + frozen.node = fst.addNode(fstCompiler, node); rootNode.arcs[0].nextFinalOutput = 17L; rootNode.arcs[0].isFinal = true; rootNode.arcs[0].output = nothing; @@ -1167,16 +1171,16 @@ public class TestFSTs extends LuceneTestCase { // Add non-final stop node { - final Builder.UnCompiledNode node = new Builder.UnCompiledNode<>(b, 0); + final FSTCompiler.UnCompiledNode node = new FSTCompiler.UnCompiledNode<>(fstCompiler, 0); rootNode.addArc('b', node); - final Builder.CompiledNode frozen = new Builder.CompiledNode(); - frozen.node = fst.addNode(b, node); + final FSTCompiler.CompiledNode frozen = new FSTCompiler.CompiledNode(); + frozen.node = fst.addNode(fstCompiler, node); rootNode.arcs[1].nextFinalOutput = nothing; rootNode.arcs[1].output = 42L; rootNode.arcs[1].target = frozen; } - fst.finish(fst.addNode(b, rootNode)); + fst.finish(fst.addNode(fstCompiler, rootNode)); StringWriter w = new StringWriter(); //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot")); @@ -1225,13 +1229,13 @@ public class TestFSTs extends LuceneTestCase { public void testShortestPaths() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - final Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + final FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); final IntsRefBuilder scratch = new IntsRefBuilder(); - builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L); - builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L); - builder.add(Util.toIntsRef(new BytesRef("ax"), 
scratch), 17L); - final FST fst = builder.finish(); + fstCompiler.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L); + fstCompiler.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L); + fstCompiler.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L); + final FST fst = fstCompiler.compile(); //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); //Util.toDot(fst, w, false, false); //w.close(); @@ -1256,16 +1260,16 @@ public class TestFSTs extends LuceneTestCase { public void testRejectNoLimits() throws IOException { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); + final FSTCompiler fstCompiler = new FSTCompiler(FST.INPUT_TYPE.BYTE1, outputs); final IntsRefBuilder scratch = new IntsRefBuilder(); - builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L); - builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L); - builder.add(Util.toIntsRef(new BytesRef("adcd"), scratch), 17L); - builder.add(Util.toIntsRef(new BytesRef("adcde"), scratch), 17L); + fstCompiler.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L); + fstCompiler.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L); + fstCompiler.add(Util.toIntsRef(new BytesRef("adcd"), scratch), 17L); + fstCompiler.add(Util.toIntsRef(new BytesRef("adcde"), scratch), 17L); - builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L); - final FST fst = builder.finish(); + fstCompiler.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L); + final FST fst = fstCompiler.compile(); final AtomicInteger rejectCount = new AtomicInteger(); Util.TopNSearcher searcher = new Util.TopNSearcher(fst, 2, 6, minLongComparator) { @Override @@ -1320,13 +1324,13 @@ public class TestFSTs extends LuceneTestCase { PositiveIntOutputs.getSingleton() // output ); - final Builder> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + final FSTCompiler> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, 
outputs); final IntsRefBuilder scratch = new IntsRefBuilder(); - builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), outputs.newPair(22L, 57L)); - builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), outputs.newPair(7L, 36L)); - builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), outputs.newPair(17L, 85L)); - final FST> fst = builder.finish(); + fstCompiler.add(Util.toIntsRef(new BytesRef("aab"), scratch), outputs.newPair(22L, 57L)); + fstCompiler.add(Util.toIntsRef(new BytesRef("aac"), scratch), outputs.newPair(7L, 36L)); + fstCompiler.add(Util.toIntsRef(new BytesRef("ax"), scratch), outputs.newPair(17L, 85L)); + final FST> fst = fstCompiler.compile(); //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); //Util.toDot(fst, w, false, false); //w.close(); @@ -1361,7 +1365,7 @@ public class TestFSTs extends LuceneTestCase { final TreeSet allPrefixes = new TreeSet<>(); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - final Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + final FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); final IntsRefBuilder scratch = new IntsRefBuilder(); for (int i = 0; i < numWords; i++) { @@ -1382,10 +1386,10 @@ public class TestFSTs extends LuceneTestCase { for (Map.Entry e : slowCompletor.entrySet()) { //System.out.println("add: " + e); - builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), e.getValue()); + fstCompiler.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), e.getValue()); } - final FST fst = builder.finish(); + final FST fst = fstCompiler.compile(); //System.out.println("SAVE out.dot"); //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); //Util.toDot(fst, w, false, false); @@ -1479,7 +1483,7 @@ public class TestFSTs extends LuceneTestCase { PositiveIntOutputs.getSingleton(), // weight PositiveIntOutputs.getSingleton() // output ); - final Builder> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 
outputs); + final FSTCompiler> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); final IntsRefBuilder scratch = new IntsRefBuilder(); Random random = random(); @@ -1504,10 +1508,10 @@ public class TestFSTs extends LuceneTestCase { //System.out.println("add: " + e); long weight = e.getValue().a; long output = e.getValue().b; - builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), outputs.newPair(weight, output)); + fstCompiler.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), outputs.newPair(weight, output)); } - final FST> fst = builder.finish(); + final FST> fst = fstCompiler.compile(); //System.out.println("SAVE out.dot"); //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); //Util.toDot(fst, w, false, false); @@ -1563,7 +1567,7 @@ public class TestFSTs extends LuceneTestCase { public void testLargeOutputsOnArrayArcs() throws Exception { final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); - final Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + final FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); final byte[] bytes = new byte[300]; final IntsRefBuilder input = new IntsRefBuilder(); @@ -1572,10 +1576,10 @@ public class TestFSTs extends LuceneTestCase { for(int arc=0;arc<6;arc++) { input.setIntAt(0, arc); output.bytes[0] = (byte) arc; - builder.add(input.get(), BytesRef.deepCopyOf(output)); + fstCompiler.add(input.get(), BytesRef.deepCopyOf(output)); } - final FST fst = builder.finish(); + final FST fst = fstCompiler.compile(); for(int arc=0;arc<6;arc++) { input.setIntAt(0, arc); final BytesRef result = Util.get(fst, input.get()); @@ -1608,15 +1612,15 @@ public class TestFSTs extends LuceneTestCase { Collections.sort(termsList); ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); - Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); IntsRefBuilder input 
= new IntsRefBuilder(); for(BytesRef term : termsList) { Util.toIntsRef(term, input); - builder.add(input.get(), term); + fstCompiler.add(input.get(), term); } - FST fst = builder.finish(); + FST fst = fstCompiler.compile(); Arc arc = new FST.Arc<>(); fst.getFirstArc(arc); @@ -1638,17 +1642,17 @@ public class TestFSTs extends LuceneTestCase { public void testSimpleDepth() throws Exception { PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); BytesRef ab = new BytesRef("ab"); BytesRef ac = new BytesRef("ac"); BytesRef bd = new BytesRef("bd"); - builder.add(Util.toIntsRef(ab, new IntsRefBuilder()), 3L); - builder.add(Util.toIntsRef(ac, new IntsRefBuilder()), 5L); - builder.add(Util.toIntsRef(bd, new IntsRefBuilder()), 7L); + fstCompiler.add(Util.toIntsRef(ab, new IntsRefBuilder()), 3L); + fstCompiler.add(Util.toIntsRef(ac, new IntsRefBuilder()), 5L); + fstCompiler.add(Util.toIntsRef(bd, new IntsRefBuilder()), 7L); - FST fst = builder.finish(); + FST fst = fstCompiler.compile(); assertEquals(3, (long) Util.get(fst, ab)); assertEquals(5, (long) Util.get(fst, ac)); diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestUtil.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestUtil.java index 2793e7e60af..a14be3c9c1b 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestUtil.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestUtil.java @@ -83,15 +83,17 @@ public class TestUtil extends LuceneTestCase { private FST buildFST(List words, boolean allowArrayArcs, boolean allowDirectAddressing) throws Exception { final Outputs outputs = NoOutputs.getSingleton(); - final Builder b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, allowArrayArcs, 15); + final FSTCompiler.Builder builder = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs) + 
.allowFixedLengthArcs(allowArrayArcs); if (!allowDirectAddressing) { - b.setDirectAddressingMaxOversizingFactor(-1f); + builder.directAddressingMaxOversizingFactor(-1f); } + final FSTCompiler fstCompiler = builder.build(); for (String word : words) { - b.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput()); + fstCompiler.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput()); } - return b.finish(); + return fstCompiler.compile(); } private List createRandomDictionary(int width, int depth) { diff --git a/lucene/misc/src/java/org/apache/lucene/util/fst/ListOfOutputs.java b/lucene/misc/src/java/org/apache/lucene/util/fst/ListOfOutputs.java index aa08344646b..da5ce4db2bc 100644 --- a/lucene/misc/src/java/org/apache/lucene/util/fst/ListOfOutputs.java +++ b/lucene/misc/src/java/org/apache/lucene/util/fst/ListOfOutputs.java @@ -30,7 +30,7 @@ import org.apache.lucene.util.RamUsageEstimator; * more of its output values. You can use this when a single * input may need to map to more than one output, * maintaining order: pass the same input with a different - * output by calling {@link Builder#add(IntsRef,Object)} multiple + * output by calling {@link FSTCompiler#add(IntsRef,Object)} multiple * times. The builder will then combine the outputs using * the {@link Outputs#merge(Object,Object)} method. * @@ -41,7 +41,7 @@ import org.apache.lucene.util.RamUsageEstimator; *

NOTE: the only way to create multiple outputs is to * add the same input to the FST multiple times in a row. This is * how the FST maps a single input to multiple outputs (e.g. you - * cannot pass a List<Object> to {@link Builder#add}). If + * cannot pass a List<Object> to {@link FSTCompiler#add}). If * your outputs are longs, and you need at most 2, then use * {@link UpToTwoPositiveIntOutputs} instead since it stores * the outputs more compactly (by stealing a bit from each diff --git a/lucene/misc/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java b/lucene/misc/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java index 327de1ff5f4..af1766e23b5 100644 --- a/lucene/misc/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java +++ b/lucene/misc/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java @@ -35,14 +35,14 @@ import org.apache.lucene.util.SuppressForbidden; *

NOTE: the only way to create a TwoLongs output is to * add the same input to the FST twice in a row. This is * how the FST maps a single input to two outputs (e.g. you - * cannot pass a TwoLongs to {@link Builder#add}. If you + * cannot pass a TwoLongs to {@link FSTCompiler#add}. If you * need more than two then use {@link ListOfOutputs}, but if * you only have at most 2 then this implementation will * require fewer bytes as it steals one bit from each long * value. * *

NOTE: the resulting FST is not guaranteed to be minimal! - * See {@link Builder}. + * See {@link FSTCompiler}. * * @lucene.experimental */ diff --git a/lucene/misc/src/test/org/apache/lucene/util/fst/TestFSTsMisc.java b/lucene/misc/src/test/org/apache/lucene/util/fst/TestFSTsMisc.java index 91a8001f10e..9911182d366 100644 --- a/lucene/misc/src/test/org/apache/lucene/util/fst/TestFSTsMisc.java +++ b/lucene/misc/src/test/org/apache/lucene/util/fst/TestFSTsMisc.java @@ -164,16 +164,16 @@ public class TestFSTsMisc extends LuceneTestCase { public void testListOfOutputs() throws Exception { PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton(); ListOfOutputs outputs = new ListOfOutputs<>(_outputs); - final Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + final FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); final IntsRefBuilder scratch = new IntsRefBuilder(); // Add the same input more than once and the outputs // are merged: - builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L); - builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L); - builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L); - builder.add(Util.toIntsRef(new BytesRef("b"), scratch), 17L); - final FST fst = builder.finish(); + fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L); + fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L); + fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L); + fstCompiler.add(Util.toIntsRef(new BytesRef("b"), scratch), 17L); + final FST fst = fstCompiler.compile(); Object output = Util.get(fst, new BytesRef("a")); assertNotNull(output); @@ -193,20 +193,20 @@ public class TestFSTsMisc extends LuceneTestCase { public void testListOfOutputsEmptyString() throws Exception { PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton(); ListOfOutputs outputs = new ListOfOutputs<>(_outputs); - final Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + final 
FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); final IntsRefBuilder scratch = new IntsRefBuilder(); - builder.add(scratch.get(), 0L); - builder.add(scratch.get(), 1L); - builder.add(scratch.get(), 17L); - builder.add(scratch.get(), 1L); + fstCompiler.add(scratch.get(), 0L); + fstCompiler.add(scratch.get(), 1L); + fstCompiler.add(scratch.get(), 17L); + fstCompiler.add(scratch.get(), 1L); - builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L); - builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L); - builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L); - builder.add(Util.toIntsRef(new BytesRef("b"), scratch), 0L); + fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L); + fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L); + fstCompiler.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L); + fstCompiler.add(Util.toIntsRef(new BytesRef("b"), scratch), 0L); - final FST fst = builder.finish(); + final FST fst = fstCompiler.compile(); Object output = Util.get(fst, new BytesRef("")); assertNotNull(output); diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java index 3f0c2bd249a..9e2f7549f4a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java @@ -43,7 +43,7 @@ import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.StringHelper; -import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.FST; @@ -350,29 +350,28 @@ public final class 
VersionBlockTreeTermsWriter extends FieldsConsumer { } } - final Builder> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, - 0, 0, true, false, Integer.MAX_VALUE, - FST_OUTPUTS, true, 15); + final FSTCompiler> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS) + .shouldShareNonSingletonNodes(false).build(); //if (DEBUG) { // System.out.println(" compile index for prefix=" + prefix); //} //indexBuilder.DEBUG = false; final byte[] bytes = scratchBytes.toArrayCopy(); assert bytes.length > 0; - indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), FST_OUTPUTS.newPair(new BytesRef(bytes, 0, bytes.length), Long.MAX_VALUE - maxVersionIndex)); + fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef), FST_OUTPUTS.newPair(new BytesRef(bytes, 0, bytes.length), Long.MAX_VALUE - maxVersionIndex)); scratchBytes.reset(); // Copy over index for all sub-blocks for(PendingBlock block : blocks) { if (block.subIndices != null) { for(FST> subIndex : block.subIndices) { - append(indexBuilder, subIndex, scratchIntsRef); + append(fstCompiler, subIndex, scratchIntsRef); } block.subIndices = null; } } - index = indexBuilder.finish(); + index = fstCompiler.compile(); assert subIndices == null; @@ -387,14 +386,14 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { // TODO: maybe we could add bulk-add method to // Builder? Takes FST and unions it w/ current // FST. 
- private void append(Builder> builder, FST> subIndex, IntsRefBuilder scratchIntsRef) throws IOException { + private void append(FSTCompiler> fstCompiler, FST> subIndex, IntsRefBuilder scratchIntsRef) throws IOException { final BytesRefFSTEnum> subIndexEnum = new BytesRefFSTEnum<>(subIndex); BytesRefFSTEnum.InputOutput> indexEnt; while((indexEnt = subIndexEnum.next()) != null) { //if (DEBUG) { // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output); //} - builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); + fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); } } } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java index 6ca21e7ee3c..a03ae229f92 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java @@ -52,7 +52,7 @@ import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator; import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.Transition; -import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.FST.BytesReader; import org.apache.lucene.util.fst.FST; @@ -496,7 +496,7 @@ public class AnalyzingSuggester extends Lookup implements Accountable { reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName); PairOutputs outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()); - Builder> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + 
FSTCompiler> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); // Build FST: BytesRefBuilder previousAnalyzed = null; @@ -570,7 +570,7 @@ public class AnalyzingSuggester extends Lookup implements Accountable { Util.toIntsRef(analyzed.get(), scratchInts); //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString()); if (!hasPayloads) { - builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface))); + fstCompiler.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface))); } else { int payloadOffset = input.getPosition() + surface.length; int payloadLength = bytes.length - payloadOffset; @@ -579,10 +579,10 @@ public class AnalyzingSuggester extends Lookup implements Accountable { br.bytes[surface.length] = PAYLOAD_SEP; System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength); br.length = br.bytes.length; - builder.add(scratchInts.get(), outputs.newPair(cost, br)); + fstCompiler.add(scratchInts.get(), outputs.newPair(cost, br)); } } - fst = builder.finish(); + fst = fstCompiler.compile(); //Util.dotToFile(fst, "/tmp/suggest.dot"); } finally { diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java index 10e9bb40593..81c079e8519 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java @@ -66,7 +66,7 @@ import org.apache.lucene.util.CharsRefBuilder; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRefBuilder; -import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST.Arc; import 
org.apache.lucene.util.fst.FST.BytesReader; @@ -304,7 +304,7 @@ public class FreeTextSuggester extends Lookup implements Accountable { TermsEnum termsEnum = terms.iterator(); Outputs outputs = PositiveIntOutputs.getSingleton(); - Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); IntsRefBuilder scratchInts = new IntsRefBuilder(); while (true) { @@ -320,10 +320,10 @@ public class FreeTextSuggester extends Lookup implements Accountable { totTokens += termsEnum.totalTermFreq(); } - builder.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq())); + fstCompiler.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq())); } - fst = builder.finish(); + fst = fstCompiler.compile(); if (fst == null) { throw new IllegalArgumentException("need at least one suggestion"); } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java index 51338556616..34b1508a552 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java @@ -25,7 +25,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.IntsRefBuilder; -import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PairOutputs; @@ -53,7 +53,7 @@ final class NRTSuggesterBuilder { public static final int END_BYTE = 0x0; private final PairOutputs outputs; - private final Builder> builder; + private final FSTCompiler> fstCompiler; private final IntsRefBuilder 
scratchInts = new IntsRefBuilder(); private final BytesRefBuilder analyzed = new BytesRefBuilder(); private final PriorityQueue entries; @@ -70,7 +70,7 @@ final class NRTSuggesterBuilder { this.endByte = END_BYTE; this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()); this.entries = new PriorityQueue<>(); - this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); } /** @@ -108,7 +108,7 @@ final class NRTSuggesterBuilder { } analyzed.setByteAt(analyzed.length() - 1, (byte) numArcs++); Util.toIntsRef(analyzed.get(), scratchInts); - builder.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload)); + fstCompiler.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload)); } maxAnalyzedPathsPerOutput = Math.max(maxAnalyzedPathsPerOutput, entries.size()); entries.clear(); @@ -119,11 +119,11 @@ final class NRTSuggesterBuilder { * {@link NRTSuggester#load(IndexInput, CompletionPostingsFormat.FSTLoadMode)})} */ public boolean store(DataOutput output) throws IOException { - final FST> build = builder.finish(); - if (build == null) { + final FST> fst = fstCompiler.compile(); + if (fst == null) { return false; } - build.save(output); + fst.save(output); /* write some more meta-info */ assert maxAnalyzedPathsPerOutput > 0; diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java index 3d20412152a..e3237db3f74 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java @@ -169,7 +169,7 @@ public class FSTCompletionBuilder { * @param shareMaxTailLength * Max shared suffix sharing length. * - * See the description of this parameter in {@link Builder}'s constructor. 
+ * See the description of this parameter in {@link org.apache.lucene.util.fst.FSTCompiler.Builder}. * In general, for very large inputs you'll want to construct a non-minimal * automaton which will be larger, but the construction will take far less ram. * For minimal automata, set it to {@link Integer#MAX_VALUE}. @@ -234,10 +234,9 @@ public class FSTCompletionBuilder { // Build the automaton. final Outputs outputs = NoOutputs.getSingleton(); final Object empty = outputs.getNoOutput(); - final Builder builder = new Builder<>( - FST.INPUT_TYPE.BYTE1, 0, 0, true, true, - shareMaxTailLength, outputs, true, 15); - + final FSTCompiler fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs) + .shareMaxTailLength(shareMaxTailLength).build(); + BytesRefBuilder scratch = new BytesRefBuilder(); BytesRef entry; final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); @@ -246,11 +245,11 @@ public class FSTCompletionBuilder { while((entry = iter.next()) != null) { count++; if (scratch.get().compareTo(entry) != 0) { - builder.add(Util.toIntsRef(entry, scratchIntsRef), empty); + fstCompiler.add(Util.toIntsRef(entry, scratchIntsRef), empty); scratch.copyBytes(entry); } } - return count == 0 ? null : builder.finish(); + return count == 0 ? 
null : fstCompiler.compile(); } } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java index cb87492cd09..9c8ebb8cda1 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java @@ -40,7 +40,7 @@ import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.CharsRefBuilder; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter; -import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST.Arc; import org.apache.lucene.util.fst.FST.BytesReader; @@ -116,7 +116,7 @@ public class WFSTCompletionLookup extends Lookup implements Accountable { IntsRefBuilder scratchInts = new IntsRefBuilder(); BytesRefBuilder previous = null; PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); while ((scratch = iter.next()) != null) { long cost = iter.weight(); @@ -127,11 +127,11 @@ public class WFSTCompletionLookup extends Lookup implements Accountable { // added } Util.toIntsRef(scratch, scratchInts); - builder.add(scratchInts.get(), cost); + fstCompiler.add(scratchInts.get(), cost); previous.copyBytes(scratch); count++; } - fst = builder.finish(); + fst = fstCompiler.compile(); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java index 590841d8e78..19111d13d9e 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java +++ 
b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java @@ -272,27 +272,26 @@ public class FSTTester { System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2); } - final Builder builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, - prune1, prune2, - prune1==0 && prune2==0, - allowRandomSuffixSharing ? random.nextBoolean() : true, - allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE, - outputs, - true, - 15); + final FSTCompiler fstCompiler = new FSTCompiler.Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs) + .minSuffixCount1(prune1) + .minSuffixCount2(prune2) + .shouldShareSuffix(prune1==0 && prune2==0) + .shouldShareNonSingletonNodes(allowRandomSuffixSharing ? random.nextBoolean() : true) + .shareMaxTailLength(allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE) + .build(); for(InputOutput pair : pairs) { if (pair.output instanceof List) { @SuppressWarnings("unchecked") List longValues = (List) pair.output; - @SuppressWarnings("unchecked") final Builder builderObject = (Builder) builder; + @SuppressWarnings("unchecked") final FSTCompiler fstCompilerObject = (FSTCompiler) fstCompiler; for(Long value : longValues) { - builderObject.add(pair.input, value); + fstCompilerObject.add(pair.input, value); } } else { - builder.add(pair.input, pair.output); + fstCompiler.add(pair.input, pair.output); } } - FST fst = builder.finish(); + FST fst = fstCompiler.compile(); if (random.nextBoolean() && fst != null) { IOContext context = LuceneTestCase.newIOContext(random); @@ -320,7 +319,7 @@ public class FSTTester { if (fst == null) { System.out.println(" fst has 0 nodes (fully pruned)"); } else { - System.out.println(" fst has " + builder.getNodeCount() + " nodes and " + builder.getArcCount() + " arcs"); + System.out.println(" fst has " + fstCompiler.getNodeCount() + " nodes and " + fstCompiler.getArcCount() + " arcs"); } } @@ 
-330,8 +329,8 @@ public class FSTTester { verifyPruned(inputMode, fst, prune1, prune2); } - nodeCount = builder.getNodeCount(); - arcCount = builder.getArcCount(); + nodeCount = fstCompiler.getNodeCount(); + arcCount = fstCompiler.getArcCount(); return fst; } diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 5e2a014ed22..944ba095358 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -89,6 +89,8 @@ Other Changes * SOLR-13797: SolrResourceLoader no longer caches bad results when asked for wrong type (Mike Drob) +* LUCENE-9092: Upgrade Carrot2 to 3.16.2 (Dawid Weiss). + ================== 8.5.0 ================== Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.