From b7846ffaa1ff8cd050feaa924e3a6ab12a43549b Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Sat, 20 Aug 2011 19:20:45 +0000 Subject: [PATCH] LUCENE-3030: add BlockTreeTermsReader/Writer, a more efficient tree-structure for the terms dictionary git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1159905 13f79535-47bb-0310-9956-ffa450edef68 --- .../sandbox/queries/FuzzyLikeThisQuery.java | 2 +- .../{search => index}/AutomatonTermsEnum.java | 34 +- .../lucene/index/BufferedDeletesStream.java | 1 + .../org/apache/lucene/index/CheckIndex.java | 59 +- .../lucene/index/FreqProxTermsWriter.java | 6 +- .../org/apache/lucene/index/MultiTerms.java | 27 +- .../apache/lucene/index/SegmentMerger.java | 4 +- .../java/org/apache/lucene/index/Terms.java | 37 +- .../lucene/index/codecs/BlockTermState.java | 10 +- .../lucene/index/codecs/BlockTermsReader.java | 66 +- .../lucene/index/codecs/BlockTermsWriter.java | 10 +- .../index/codecs/BlockTreeTermsReader.java | 2832 +++++++++++++++++ .../index/codecs/BlockTreeTermsWriter.java | 943 ++++++ .../lucene/index/codecs/CodecProvider.java | 2 +- .../index/codecs/PostingsReaderBase.java | 9 +- .../index/codecs/PostingsWriterBase.java | 6 +- .../intblock/FixedIntBlockIndexOutput.java | 10 +- .../intblock/VariableIntBlockIndexOutput.java | 10 +- .../index/codecs/pulsing/PulsingCodec.java | 114 +- ...erImpl.java => PulsingPostingsReader.java} | 75 +- ...erImpl.java => PulsingPostingsWriter.java} | 115 +- .../index/codecs/sep/IntIndexOutput.java | 2 +- ...ReaderImpl.java => SepPostingsReader.java} | 68 +- ...WriterImpl.java => SepPostingsWriter.java} | 173 +- .../index/codecs/sep/SepSkipListWriter.java | 6 +- .../index/codecs/standard/StandardCodec.java | 97 +- .../standard/StandardPostingsReader.java | 54 +- .../standard/StandardPostingsWriter.java | 179 +- .../apache/lucene/search/AutomatonQuery.java | 86 +- .../lucene/search/FilteredTermsEnum.java | 18 +- .../org/apache/lucene/search/FuzzyQuery.java | 6 +- .../apache/lucene/search/FuzzyTermsEnum.java | 107 +- .../apache/lucene/search/IndexSearcher.java | 2 +- .../org/apache/lucene/search/PrefixQuery.java | 2 +- .../apache/lucene/search/PrefixTermsEnum.java | 5 +- .../apache/lucene/search/SingleTermsEnum.java | 7 +- .../org/apache/lucene/search/TermQuery.java | 7 +- .../org/apache/lucene/search/TermScorer.java | 1 + .../apache/lucene/search/TopTermsRewrite.java | 26 +- .../lucene/store/ByteArrayDataInput.java | 14 + .../lucene/store/CompoundFileDirectory.java | 14 +- .../org/apache/lucene/store/FSDirectory.java | 1 + .../java/org/apache/lucene/util/BytesRef.java | 12 + .../org/apache/lucene/util/TermContext.java | 11 +- .../util/automaton/CompiledAutomaton.java | 313 ++ .../org/apache/lucene/util/fst/Builder.java | 251 +- .../lucene/util/fst/ByteSequenceOutputs.java | 2 +- .../java/org/apache/lucene/util/fst/FST.java | 19 +- .../util/fst/UpToTwoPositiveIntOutputs.java | 4 +- .../java/org/apache/lucene/util/fst/Util.java | 92 +- .../lucene/index/RandomIndexWriter.java | 2 +- .../mockintblock/MockFixedIntBlockCodec.java | 18 +- .../MockVariableIntBlockCodec.java | 18 +- .../codecs/mockrandom/MockRandomCodec.java | 337 +- .../index/codecs/mocksep/MockSepCodec.java | 12 +- .../mocksep/MockSingleIntIndexOutput.java | 12 +- .../apache/lucene/util/LuceneTestCase.java | 22 +- .../org/apache/lucene/util/_TestUtil.java | 2 +- .../DaciukMihovAutomatonBuilder.java | 361 +++ .../lucene/TestSearchForDuplicates.java | 2 +- .../org/apache/lucene/index/Test2BTerms.java | 3 +- 
.../org/apache/lucene/index/TestCodecs.java | 8 +- .../apache/lucene/index/TestDocTermOrds.java | 2 +- .../lucene/index/TestDocsAndPositions.java | 5 +- .../lucene/index/TestIndexFileDeleter.java | 3 +- .../lucene/index/TestIndexWriterDelete.java | 7 + .../lucene/index/TestIndexWriterMerging.java | 2 + .../apache/lucene/index/TestLongPostings.java | 4 +- .../apache/lucene/index/TestMultiFields.java | 28 +- .../apache/lucene/index/TestTermsEnum.java | 576 ++++ .../apache/lucene/index/TestTermsEnum2.java | 175 + .../index/values/TestDocValuesIndexing.java | 1 + .../lucene/search/TestAutomatonQuery.java | 4 +- .../search/TestBooleanMinShouldMatch.java | 1 + .../apache/lucene/search/TestFuzzyQuery2.java | 8 + .../search/TestMultiTermConstantScore.java | 10 + .../lucene/search/TestRegexpRandom2.java | 23 +- .../apache/lucene/search/TestWildcard.java | 2 +- .../lucene/search/TestWildcardRandom.java | 9 +- .../search/payloads/TestPayloadNearQuery.java | 5 +- .../util/automaton/TestCompiledAutomaton.java | 121 + .../org/apache/lucene/util/fst/TestFSTs.java | 257 +- .../search/spell/DirectSpellChecker.java | 2 +- 83 files changed, 7001 insertions(+), 992 deletions(-) rename lucene/src/java/org/apache/lucene/{search => index}/AutomatonTermsEnum.java (92%) create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java rename lucene/src/java/org/apache/lucene/index/codecs/pulsing/{PulsingPostingsReaderImpl.java => PulsingPostingsReader.java} (86%) rename lucene/src/java/org/apache/lucene/index/codecs/pulsing/{PulsingPostingsWriterImpl.java => PulsingPostingsWriter.java} (71%) rename lucene/src/java/org/apache/lucene/index/codecs/sep/{SepPostingsReaderImpl.java => SepPostingsReader.java} (90%) rename lucene/src/java/org/apache/lucene/index/codecs/sep/{SepPostingsWriterImpl.java => SepPostingsWriter.java} (68%) create mode 100644 lucene/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java create mode 100644 lucene/src/test-framework/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java create mode 100644 lucene/src/test/org/apache/lucene/index/TestTermsEnum2.java create mode 100644 lucene/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java diff --git a/lucene/contrib/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java b/lucene/contrib/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java index c6909cd9b99..bef95219193 100644 --- a/lucene/contrib/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java +++ b/lucene/contrib/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java @@ -209,7 +209,7 @@ public class FuzzyLikeThisQuery extends Query AttributeSource atts = new AttributeSource(); MaxNonCompetitiveBoostAttribute maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); - FuzzyTermsEnum fe = new FuzzyTermsEnum(MultiFields.getTerms(reader, startTerm.field()).iterator(), atts, startTerm, f.minSimilarity, f.prefixLength); + FuzzyTermsEnum fe = new FuzzyTermsEnum(MultiFields.getTerms(reader, startTerm.field()), atts, startTerm, f.minSimilarity, f.prefixLength); //store the df so all variants use same idf int df = reader.docFreq(startTerm); int numVariants=0; diff --git a/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java b/lucene/src/java/org/apache/lucene/index/AutomatonTermsEnum.java similarity index 92% rename from 
lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java rename to lucene/src/java/org/apache/lucene/index/AutomatonTermsEnum.java index 58cb5dd851b..9bd0554b539 100644 --- a/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java +++ b/lucene/src/java/org/apache/lucene/index/AutomatonTermsEnum.java @@ -1,4 +1,4 @@ -package org.apache.lucene.search; +package org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -20,14 +20,12 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.Comparator; -import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.FilteredTermsEnum; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.ByteRunAutomaton; -import org.apache.lucene.util.automaton.SpecialOperations; +import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.Transition; -import org.apache.lucene.util.automaton.UTF32ToUTF8; /** * A FilteredTermsEnum that enumerates terms based upon what is accepted by a @@ -46,7 +44,7 @@ import org.apache.lucene.util.automaton.UTF32ToUTF8; *

* @lucene.experimental */ -public class AutomatonTermsEnum extends FilteredTermsEnum { +class AutomatonTermsEnum extends FilteredTermsEnum { // a tableized array-based form of the DFA private final ByteRunAutomaton runAutomaton; // common suffix of the automaton @@ -81,6 +79,7 @@ public class AutomatonTermsEnum extends FilteredTermsEnum { super(tenum); this.finite = compiled.finite; this.runAutomaton = compiled.runAutomaton; + assert this.runAutomaton != null; this.commonSuffixRef = compiled.commonSuffixRef; this.allTransitions = compiled.sortedTransitions; @@ -110,6 +109,7 @@ public class AutomatonTermsEnum extends FilteredTermsEnum { @Override protected BytesRef nextSeekTerm(final BytesRef term) throws IOException { + //System.out.println("ATE.nextSeekTerm term=" + term); if (term == null) { assert seekBytesRef.length == 0; // return the empty term, as its valid @@ -318,26 +318,4 @@ public class AutomatonTermsEnum extends FilteredTermsEnum { } return -1; /* all solutions exhausted */ } - - /** - * immutable class with everything this enum needs. - */ - public static class CompiledAutomaton { - public final ByteRunAutomaton runAutomaton; - public final Transition[][] sortedTransitions; - public final BytesRef commonSuffixRef; - public final boolean finite; - - public CompiledAutomaton(Automaton automaton, boolean finite) { - Automaton utf8 = new UTF32ToUTF8().convert(automaton); - runAutomaton = new ByteRunAutomaton(utf8, true); - sortedTransitions = utf8.getSortedTransitions(); - this.finite = finite; - if (finite) { - commonSuffixRef = null; - } else { - commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(utf8); - } - } - } } diff --git a/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java b/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java index 603fac808c2..295222991df 100644 --- a/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java +++ b/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java @@ -398,6 +398,7 @@ class BufferedDeletesStream { if (termsEnum.seekExact(term.bytes(), false)) { DocsEnum docsEnum = termsEnum.docs(reader.getLiveDocs(), docs); + //System.out.println("BDS: got docsEnum=" + docsEnum); if (docsEnum != null) { while (true) { diff --git a/lucene/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/src/java/org/apache/lucene/index/CheckIndex.java index 82477ea21e5..81866290e56 100644 --- a/lucene/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/src/java/org/apache/lucene/index/CheckIndex.java @@ -17,13 +17,6 @@ package org.apache.lucene.index; * limitations under the License. 
*/ -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.IOContext; -import org.apache.lucene.store.IndexInput; import java.io.File; import java.io.IOException; import java.io.PrintStream; @@ -31,16 +24,25 @@ import java.text.NumberFormat; import java.util.ArrayList; import java.util.Collection; import java.util.Comparator; +import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.lucene.document.AbstractField; // for javadocs import org.apache.lucene.document.Document; +import org.apache.lucene.index.codecs.BlockTreeTermsReader; import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.index.codecs.DefaultSegmentInfosWriter; import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.index.values.IndexDocValues; import org.apache.lucene.index.values.ValuesEnum; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.StringHelper; @@ -237,6 +239,8 @@ public class CheckIndex { /** Exception thrown during term index test (null on success) */ public Throwable error = null; + + public Map blockTreeStats = null; } /** @@ -285,10 +289,19 @@ public class CheckIndex { infoStream = null; } + private boolean verbose; + /** Set infoStream where messages should go. If null, no - * messages are printed */ - public void setInfoStream(PrintStream out) { + * messages are printed. If verbose is true then more + * details are printed. */ + public void setInfoStream(PrintStream out, boolean verbose) { infoStream = out; + this.verbose = verbose; + } + + /** Set infoStream where messages should go. See {@link setInfoStream(PrintStream,boolean)}. 
*/ + public void setInfoStream(PrintStream out) { + setInfoStream(out, false); } private void msg(String msg) { @@ -871,6 +884,16 @@ public class CheckIndex { } } } + + final Terms fieldTerms = fields.terms(field); + if (fieldTerms instanceof BlockTreeTermsReader.FieldReader) { + final BlockTreeTermsReader.Stats stats = ((BlockTreeTermsReader.FieldReader) fieldTerms).computeStats(); + assert stats != null; + if (status.blockTreeStats == null) { + status.blockTreeStats = new HashMap(); + } + status.blockTreeStats.put(field, stats); + } if (sumTotalTermFreq != 0) { final long v = fields.terms(field).getSumTotalTermFreq(); @@ -888,7 +911,7 @@ public class CheckIndex { // Test seek to last term: if (lastTerm != null) { - if (terms.seekCeil(lastTerm) != TermsEnum.SeekStatus.FOUND) { + if (terms.seekCeil(lastTerm) != TermsEnum.SeekStatus.FOUND) { throw new RuntimeException("seek to last term " + lastTerm + " failed"); } @@ -951,6 +974,13 @@ public class CheckIndex { msg("OK [" + status.termCount + " terms; " + status.totFreq + " terms/docs pairs; " + status.totPos + " tokens]"); + if (verbose && status.blockTreeStats != null && infoStream != null && status.termCount > 0) { + for(Map.Entry ent : status.blockTreeStats.entrySet()) { + infoStream.println(" field \"" + ent.getKey() + "\":"); + infoStream.println(" " + ent.getValue().toString().replace("\n", "\n ")); + } + } + } catch (Throwable e) { msg("ERROR: " + e); status.error = e; @@ -1131,7 +1161,7 @@ public class CheckIndex {

Run it like this:

-    java -ea:org.apache.lucene... org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-segment X] [-segment Y]
+    java -ea:org.apache.lucene... org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-verbose] [-segment X] [-segment Y]
     
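The -verbose flag added above feeds the new per-field BlockTree statistics (Status.blockTreeStats) to the infoStream. A minimal sketch of driving the same check programmatically through the setInfoStream(PrintStream, boolean) overload introduced in this patch; the index path is a placeholder:

    import java.io.File;
    import org.apache.lucene.index.CheckIndex;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class VerboseCheckIndex {
      public static void main(String[] args) throws Exception {
        // Placeholder path; point this at a real index directory.
        Directory dir = FSDirectory.open(new File("/path/to/index"));
        CheckIndex checker = new CheckIndex(dir);
        // verbose=true prints the per-field BlockTree block stats, same as the -verbose flag.
        checker.setInfoStream(System.out, true);
        CheckIndex.Status status = checker.checkIndex();  // checks every segment
        System.out.println(status.clean ? "index is OK" : "index has problems");
        dir.close();
      }
    }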
  • -fix: actually write a new segments_N file, removing any problematic segments @@ -1161,6 +1191,7 @@ public class CheckIndex { public static void main(String[] args) throws IOException, InterruptedException { boolean doFix = false; + boolean verbose = false; List onlySegments = new ArrayList(); String indexPath = null; int i = 0; @@ -1168,6 +1199,9 @@ public class CheckIndex { if (args[i].equals("-fix")) { doFix = true; i++; + } else if (args[i].equals("-verbose")) { + verbose = true; + i++; } else if (args[i].equals("-segment")) { if (i == args.length-1) { System.out.println("ERROR: missing name for -segment option"); @@ -1190,6 +1224,7 @@ public class CheckIndex { System.out.println("\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-segment X] [-segment Y]\n" + "\n" + " -fix: actually write a new segments_N file, removing any problematic segments\n" + + " -verbose: print additional details\n" + " -segment X: only check the specified segments. This can be specified multiple\n" + " times, to check more than one segment, eg '-segment _2 -segment _a'.\n" + " You can't use this with the -fix option\n" + @@ -1231,7 +1266,7 @@ public class CheckIndex { } CheckIndex checker = new CheckIndex(dir); - checker.setInfoStream(System.out); + checker.setInfoStream(System.out, verbose); Status result = checker.checkIndex(onlySegments); if (result.missingSegments) { diff --git a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java index dae350aae3a..fc99a719202 100644 --- a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java @@ -26,6 +26,7 @@ import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CollectionUtil; +import org.apache.lucene.util.IOUtils; final class FreqProxTermsWriter extends TermsHashConsumer { @@ -58,6 +59,8 @@ final class FreqProxTermsWriter extends TermsHashConsumer { final FieldsConsumer consumer = state.segmentCodecs.codec().fieldsConsumer(state); + boolean success = false; + try { TermsHash termsHash = null; @@ -100,8 +103,9 @@ final class FreqProxTermsWriter extends TermsHashConsumer { if (termsHash != null) { termsHash.reset(); } + success = true; } finally { - consumer.close(); + IOUtils.closeSafely(!success, consumer); } } diff --git a/lucene/src/java/org/apache/lucene/index/MultiTerms.java b/lucene/src/java/org/apache/lucene/index/MultiTerms.java index 2785170dfb3..168a142dd65 100644 --- a/lucene/src/java/org/apache/lucene/index/MultiTerms.java +++ b/lucene/src/java/org/apache/lucene/index/MultiTerms.java @@ -17,13 +17,15 @@ package org.apache.lucene.index; * limitations under the License. 
*/ +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; + import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.ReaderUtil; -import java.io.IOException; -import java.util.List; -import java.util.ArrayList; -import java.util.Comparator; +import org.apache.lucene.util.automaton.CompiledAutomaton; /** * Exposes flex API, merged from flex API of @@ -58,6 +60,23 @@ public final class MultiTerms extends Terms { termComp = _termComp; } + @Override + public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + final List termsEnums = new ArrayList(); + for(int i=0;i 0) { + return new MultiTermsEnum(subSlices).reset(termsEnums.toArray(MultiTermsEnum.TermsEnumIndex.EMPTY_ARRAY)); + } else { + return TermsEnum.EMPTY; + } + } + @Override public TermsEnum iterator() throws IOException { diff --git a/lucene/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/src/java/org/apache/lucene/index/SegmentMerger.java index 52159ab316b..96908747a35 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentMerger.java +++ b/lucene/src/java/org/apache/lucene/index/SegmentMerger.java @@ -562,12 +562,14 @@ final class SegmentMerger { } codec = segmentWriteState.segmentCodecs.codec(); final FieldsConsumer consumer = codec.fieldsConsumer(segmentWriteState); + boolean success = false; try { consumer.merge(mergeState, new MultiFields(fields.toArray(Fields.EMPTY_ARRAY), slices.toArray(ReaderUtil.Slice.EMPTY_ARRAY))); + success = true; } finally { - consumer.close(); + IOUtils.closeSafely(!success, consumer); } } diff --git a/lucene/src/java/org/apache/lucene/index/Terms.java b/lucene/src/java/org/apache/lucene/index/Terms.java index b7a27e0d0c9..6cdf1f353db 100644 --- a/lucene/src/java/org/apache/lucene/index/Terms.java +++ b/lucene/src/java/org/apache/lucene/index/Terms.java @@ -19,9 +19,11 @@ package org.apache.lucene.index; import java.io.IOException; import java.util.Comparator; + import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CloseableThreadLocal; +import org.apache.lucene.util.automaton.CompiledAutomaton; /** * Access to the terms in a specific field. See {@link Fields}. @@ -37,7 +39,40 @@ public abstract class Terms { /** Returns an iterator that will step through all * terms. This method will not return null.*/ public abstract TermsEnum iterator() throws IOException; - + + /** Returns a TermsEnum that iterates over all terms that + * are accepted by the provided {@link + * CompiledAutomaton}. If the startTerm is + * provided then the returned enum will only accept terms + * > startTerm, but you still must call + * next() first to get to the first term. Note that the + * provided startTerm must be accepted by + * the automaton. + * + *

+   * NOTE: the returned TermsEnum cannot
+   * seek
    . */ + public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) throws IOException { + // TODO: eventually we could support seekCeil/Exact on + // the returned enum, instead of only being able to seek + // at the start + if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); + } + if (startTerm == null) { + return new AutomatonTermsEnum(iterator(), compiled); + } else { + return new AutomatonTermsEnum(iterator(), compiled) { + @Override + protected BytesRef nextSeekTerm(BytesRef term) throws IOException { + if (term == null) { + term = startTerm; + } + return super.nextSeekTerm(term); + } + }; + } + } + /** Return the BytesRef Comparator used to sort terms * provided by the iterator. This method may return null * if there are no terms. This method may be invoked diff --git a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermState.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermState.java index 36e24c2ec25..2a070f260c7 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermState.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermState.java @@ -29,10 +29,8 @@ public class BlockTermState extends OrdTermState { public int docFreq; // how many docs have this term public long totalTermFreq; // total number of occurrences of this term - public int termCount; // term ord are in the current block - public long blockFilePointer; // fp into the terms dict primary file (_X.tib) that holds this term - - public int blockTermCount; // how many terms in current block + public int termBlockOrd; // the term's ord in the current block + public long blockFilePointer; // fp into the terms dict primary file (_X.tim) that holds this term @Override public void copyFrom(TermState _other) { @@ -41,7 +39,7 @@ public class BlockTermState extends OrdTermState { super.copyFrom(_other); docFreq = other.docFreq; totalTermFreq = other.totalTermFreq; - termCount = other.termCount; + termBlockOrd = other.termBlockOrd; blockFilePointer = other.blockFilePointer; // NOTE: don't copy blockTermCount; @@ -51,6 +49,6 @@ public class BlockTermState extends OrdTermState { @Override public String toString() { - return "ord=" + ord + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " termCount=" + termCount + " blockFP=" + blockFilePointer; + return "docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " termBlockOrd=" + termBlockOrd + " blockFP=" + blockFilePointer; } } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java index c827926e90c..ffd685c7c2e 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java @@ -106,7 +106,7 @@ public class BlockTermsReader extends FieldsProducer { } } - //private String segment; + // private String segment; public BlockTermsReader(TermsIndexReaderBase indexReader, Directory dir, FieldInfos fieldInfos, String segment, PostingsReaderBase postingsReader, IOContext context, int termsCacheSize, int codecId) @@ -115,7 +115,7 @@ public class BlockTermsReader extends FieldsProducer { this.postingsReader = postingsReader; termsCache = new DoubleBarrelLRUCache(termsCacheSize); - //this.segment = segment; + // this.segment = segment; in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, 
BlockTermsWriter.TERMS_EXTENSION), context); @@ -321,6 +321,9 @@ public class BlockTermsReader extends FieldsProducer { /* Common prefix used for all terms in this block. */ private int termBlockPrefix; + /* How many terms in current block */ + private int blockTermCount; + private byte[] docFreqBytes; private final ByteArrayDataInput freqReader = new ByteArrayDataInput(); private int metaDataUpto; @@ -358,16 +361,14 @@ public class BlockTermsReader extends FieldsProducer { throw new IllegalStateException("terms index was not loaded"); } - /* - System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " useCache=" + useCache + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this); + //System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " useCache=" + useCache + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this); if (didIndexNext) { if (nextIndexTerm == null) { - System.out.println(" nextIndexTerm=null"); + //System.out.println(" nextIndexTerm=null"); } else { - System.out.println(" nextIndexTerm=" + nextIndexTerm.utf8ToString()); + //System.out.println(" nextIndexTerm=" + nextIndexTerm.utf8ToString()); } } - */ // Check cache if (useCache) { @@ -444,7 +445,7 @@ public class BlockTermsReader extends FieldsProducer { //System.out.println(" seek: term=" + term.utf8ToString()); } else { //System.out.println(" skip seek"); - if (state.termCount == state.blockTermCount && !nextBlock()) { + if (state.termBlockOrd == blockTermCount && !nextBlock()) { indexIsCurrent = false; return SeekStatus.END; } @@ -480,9 +481,9 @@ public class BlockTermsReader extends FieldsProducer { // but it could be in next block. 
We // must scan to end-of-block to set common // prefix for next block: - if (state.termCount < state.blockTermCount) { - while(state.termCount < state.blockTermCount-1) { - state.termCount++; + if (state.termBlockOrd < blockTermCount) { + while(state.termBlockOrd < blockTermCount-1) { + state.termBlockOrd++; state.ord++; termSuffixesReader.skipBytes(termSuffixesReader.readVInt()); } @@ -505,7 +506,7 @@ public class BlockTermsReader extends FieldsProducer { // Target's prefix is before the common prefix // of this block, so we position to start of // block and return NOT_FOUND: - assert state.termCount == 0; + assert state.termBlockOrd == 0; final int suffix = termSuffixesReader.readVInt(); term.length = termBlockPrefix + suffix; @@ -523,7 +524,7 @@ public class BlockTermsReader extends FieldsProducer { // Test every term in this block while (true) { - state.termCount++; + state.termBlockOrd++; state.ord++; final int suffix = termSuffixesReader.readVInt(); @@ -581,7 +582,7 @@ public class BlockTermsReader extends FieldsProducer { } } - if (state.termCount == state.blockTermCount) { + if (state.termBlockOrd == blockTermCount) { // Must pre-fill term for next block's common prefix term.length = termBlockPrefix + suffix; if (term.bytes.length < term.length) { @@ -613,7 +614,7 @@ public class BlockTermsReader extends FieldsProducer { @Override public BytesRef next() throws IOException { - //System.out.println("BTR.next() seekPending=" + seekPending + " pendingSeekCount=" + state.termCount); + //System.out.println("BTR.next() seekPending=" + seekPending + " pendingSeekCount=" + state.termBlockOrd); // If seek was previously called and the term was cached, // usually caller is just going to pull a D/&PEnum or get @@ -623,7 +624,7 @@ public class BlockTermsReader extends FieldsProducer { if (seekPending) { assert !indexIsCurrent; in.seek(state.blockFilePointer); - final int pendingSeekCount = state.termCount; + final int pendingSeekCount = state.termBlockOrd; boolean result = nextBlock(); final long savOrd = state.ord; @@ -633,7 +634,7 @@ public class BlockTermsReader extends FieldsProducer { // on a real term: assert result; - while(state.termCount < pendingSeekCount) { + while(state.termBlockOrd < pendingSeekCount) { BytesRef nextResult = _next(); assert nextResult != null; } @@ -647,8 +648,8 @@ public class BlockTermsReader extends FieldsProducer { metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily) decode all metadata up to the current term. 
*/ private BytesRef _next() throws IOException { - //System.out.println("BTR._next seg=" + segment + " this=" + this + " termCount=" + state.termCount + " (vs " + state.blockTermCount + ")"); - if (state.termCount == state.blockTermCount && !nextBlock()) { + //System.out.println("BTR._next seg=" + segment + " this=" + this + " termCount=" + state.termBlockOrd + " (vs " + blockTermCount + ")"); + if (state.termBlockOrd == blockTermCount && !nextBlock()) { //System.out.println(" eof"); indexIsCurrent = false; return null; @@ -663,12 +664,12 @@ public class BlockTermsReader extends FieldsProducer { term.grow(term.length); } termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix); - state.termCount++; + state.termBlockOrd++; // NOTE: meaningless in the non-ord case state.ord++; - //System.out.println(" return term=" + fieldInfo.name + ":" + term.utf8ToString() + " " + term); + //System.out.println(" return term=" + fieldInfo.name + ":" + term.utf8ToString() + " " + term + " tbOrd=" + state.termBlockOrd); return term; } @@ -695,9 +696,10 @@ public class BlockTermsReader extends FieldsProducer { public DocsEnum docs(Bits liveDocs, DocsEnum reuse) throws IOException { //System.out.println("BTR.docs this=" + this); decodeMetaData(); - //System.out.println(" state.docFreq=" + state.docFreq); + //System.out.println("BTR.docs: state.docFreq=" + state.docFreq); final DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, liveDocs, reuse); assert docsEnum != null; + //System.out.println("BTR.docs: return docsEnum=" + docsEnum); return docsEnum; } @@ -716,7 +718,7 @@ public class BlockTermsReader extends FieldsProducer { @Override public void seekExact(BytesRef target, TermState otherState) throws IOException { - //System.out.println("BTR.seek termState target=" + target.utf8ToString() + " " + target + " this=" + this); + //System.out.println("BTR.seekExact termState target=" + target.utf8ToString() + " " + target + " this=" + this); assert otherState != null && otherState instanceof BlockTermState; assert !doOrd || ((BlockTermState) otherState).ord < numTerms; state.copyFrom(otherState); @@ -800,9 +802,9 @@ public class BlockTermsReader extends FieldsProducer { //System.out.println("BTR.nextBlock() fp=" + in.getFilePointer() + " this=" + this); state.blockFilePointer = in.getFilePointer(); - state.blockTermCount = in.readVInt(); - //System.out.println(" blockTermCount=" + state.blockTermCount); - if (state.blockTermCount == 0) { + blockTermCount = in.readVInt(); + //System.out.println(" blockTermCount=" + blockTermCount); + if (blockTermCount == 0) { return false; } termBlockPrefix = in.readVInt(); @@ -826,7 +828,7 @@ public class BlockTermsReader extends FieldsProducer { freqReader.reset(docFreqBytes, 0, len); metaDataUpto = 0; - state.termCount = 0; + state.termBlockOrd = 0; postingsReader.readTermsBlock(in, fieldInfo, state); @@ -838,7 +840,7 @@ public class BlockTermsReader extends FieldsProducer { } private void decodeMetaData() throws IOException { - //System.out.println("BTR.decodeMetadata mdUpto=" + metaDataUpto + " vs termCount=" + state.termCount + " state=" + state); + //System.out.println("BTR.decodeMetadata mdUpto=" + metaDataUpto + " vs termCount=" + state.termBlockOrd + " state=" + state); if (!seekPending) { // TODO: cutover to random-access API // here.... really stupid that we have to decode N @@ -846,10 +848,10 @@ public class BlockTermsReader extends FieldsProducer { // that we really need... 
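Besides the lazy metadata decode described above, this hunk shows the seekExact(BytesRef, TermState) variant: a caller can capture a term's state (blockFilePointer, termBlockOrd, ...) and later jump straight back to that term without rescanning the dictionary. A minimal caller-side sketch, assuming terms is a single-segment Terms instance and the term "lucene" (a placeholder) exists:

    TermsEnum te = terms.iterator();
    if (te.seekExact(new BytesRef("lucene"), false)) {
      TermState cached = te.termState();       // snapshot of the term's block state

      // ... later, a fresh enum over the same segment can seek without scanning:
      TermsEnum te2 = terms.iterator();
      te2.seekExact(new BytesRef("lucene"), cached);
      DocsEnum docs = te2.docs(null, null);     // no live-docs filter, no reuse
      while (docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
        // consume matching documents
      }
    }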
// lazily catch up on metadata decode: - final int limit = state.termCount; + final int limit = state.termBlockOrd; // We must set/incr state.termCount because // postings impl can look at this - state.termCount = metaDataUpto; + state.termBlockOrd = metaDataUpto; // TODO: better API would be "jump straight to term=N"??? while (metaDataUpto < limit) { //System.out.println(" decode mdUpto=" + metaDataUpto); @@ -870,9 +872,9 @@ public class BlockTermsReader extends FieldsProducer { postingsReader.nextTerm(fieldInfo, state); metaDataUpto++; - state.termCount++; + state.termBlockOrd++; } - //} else { + } else { //System.out.println(" skip! seekPending"); } } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java index ff1af7ba040..950af269d76 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java @@ -66,7 +66,7 @@ public class BlockTermsWriter extends FieldsConsumer { private final TermsIndexWriterBase termsIndexWriter; private final List fields = new ArrayList(); - //private final String segment; + // private final String segment; public BlockTermsWriter(TermsIndexWriterBase termsIndexWriter, SegmentWriteState state, PostingsWriterBase postingsWriter) @@ -80,7 +80,7 @@ public class BlockTermsWriter extends FieldsConsumer { writeHeader(out); currentField = null; this.postingsWriter = postingsWriter; - //segment = state.segmentName; + // segment = state.segmentName; //System.out.println("BTW.init seg=" + state.segmentName); @@ -188,7 +188,7 @@ public class BlockTermsWriter extends FieldsConsumer { @Override public PostingsConsumer startTerm(BytesRef text) throws IOException { - //System.out.println("BTW.startTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment); + //System.out.println("BTW: startTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment); postingsWriter.startTerm(); return postingsWriter; } @@ -199,7 +199,7 @@ public class BlockTermsWriter extends FieldsConsumer { public void finishTerm(BytesRef text, TermStats stats) throws IOException { assert stats.docFreq > 0; - //System.out.println("BTW.finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment + " df=" + stats.docFreq); + //System.out.println("BTW: finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment + " df=" + stats.docFreq); final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, stats); @@ -308,7 +308,7 @@ public class BlockTermsWriter extends FieldsConsumer { bytesWriter.writeTo(out); bytesWriter.reset(); - postingsWriter.flushTermsBlock(); + postingsWriter.flushTermsBlock(pendingCount, pendingCount); lastPrevTerm.copy(pendingTerms[pendingCount-1].term); pendingCount = 0; } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java new file mode 100644 index 00000000000..850a39e6710 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java @@ -0,0 +1,2832 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.ByteArrayOutputStream; +import java.io.Closeable; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.PrintStream; +import java.io.Writer; +import java.util.Collection; +import java.util.Comparator; +import java.util.Iterator; +import java.util.TreeMap; + +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.FieldsEnum; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.RunAutomaton; +import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Outputs; +import org.apache.lucene.util.fst.Util; + +/** A block-based terms index and dictionary that assigns + * terms to variable length blocks according to how they + * share prefixes. The terms index is a prefix trie + * whose leaves are term blocks. The advantage of this + * approach is that {@link #seekExact} is often able to + * determine a term cannot exist without doing any IO, and + * intersection with Automata is very fast. Note that this + * terms dictionary has it's own fixed terms index (ie, it + * does not support a pluggable terms index + * implementation). + * + *

+ * NOTE: this terms dictionary does not support
+ * index divisor when opening an IndexReader.  Instead, you
+ * can change the min/maxItemsPerBlock during indexing.
+ *
+ * The data structure used by this implementation is very
+ * similar to a burst trie
+ * (http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499),
+ * but with added logic to break up too-large blocks of all
+ * terms sharing a given prefix into smaller ones.
+ *
    Use {@link CheckIndex} with the -verbose + * option to see summary statistics on the blocks in the + * dictionary. + * + * See {@link BlockTreeTermsWriter}. + * + * @lucene.experimental + */ + +public class BlockTreeTermsReader extends FieldsProducer { + + // Open input to the main terms dict file (_X.tib) + private final IndexInput in; + + //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG; + + // Reads the terms dict entries, to gather state to + // produce DocsEnum on demand + private final PostingsReaderBase postingsReader; + + private final TreeMap fields = new TreeMap(); + + // keeps the dirStart offset + protected long dirOffset; + protected long indexDirOffset; + + private String segment; + + public BlockTreeTermsReader(Directory dir, FieldInfos fieldInfos, String segment, + PostingsReaderBase postingsReader, IOContext ioContext, + int codecId, int indexDivisor) + throws IOException { + + this.postingsReader = postingsReader; + + this.segment = segment; + in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, BlockTreeTermsWriter.TERMS_EXTENSION), + ioContext); + + boolean success = false; + IndexInput indexIn = null; + + try { + readHeader(in); + if (indexDivisor != -1) { + indexIn = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, BlockTreeTermsWriter.TERMS_INDEX_EXTENSION), + ioContext); + readIndexHeader(indexIn); + } + + // Have PostingsReader init itself + postingsReader.init(in); + + // Read per-field details + seekDir(in, dirOffset); + if (indexDivisor != -1) { + seekDir(indexIn, indexDirOffset); + } + + final int numFields = in.readVInt(); + + for(int i=0;i= 0; + final int numBytes = in.readVInt(); + final BytesRef rootCode = new BytesRef(new byte[numBytes]); + in.readBytes(rootCode.bytes, 0, numBytes); + rootCode.length = numBytes; + final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + assert fieldInfo != null: "field=" + field; + final long sumTotalTermFreq = fieldInfo.indexOptions == IndexOptions.DOCS_ONLY ? -1 : in.readVLong(); + final long sumDocFreq = in.readVLong(); + final long indexStartFP = indexDivisor != -1 ? 
indexIn.readVLong() : 0; + assert !fields.containsKey(fieldInfo.name); + fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, indexStartFP, indexIn)); + } + success = true; + } finally { + if (!success) { + IOUtils.closeSafely(true, indexIn, this); + } else if (indexDivisor != -1) { + indexIn.close(); + } + } + } + + protected void readHeader(IndexInput input) throws IOException { + CodecUtil.checkHeader(input, BlockTreeTermsWriter.CODEC_NAME, + BlockTreeTermsWriter.VERSION_START, + BlockTreeTermsWriter.VERSION_CURRENT); + dirOffset = input.readLong(); + } + + protected void readIndexHeader(IndexInput input) throws IOException { + CodecUtil.checkHeader(input, BlockTreeTermsWriter.CODEC_NAME, + BlockTreeTermsWriter.VERSION_START, + BlockTreeTermsWriter.VERSION_CURRENT); + indexDirOffset = input.readLong(); + } + + protected void seekDir(IndexInput input, long dirOffset) + throws IOException { + input.seek(dirOffset); + } + + // for debugging + // private static String toHex(int v) { + // return "0x" + Integer.toHexString(v); + // } + + @Override + public void close() throws IOException { + try { + IOUtils.closeSafely(false, in, postingsReader); + } finally { + for(FieldReader field : fields.values()) { + field.close(); + } + // Clear so refs to terms index is GCable even if + // app hangs onto us: + fields.clear(); + } + } + + public static void files(Directory dir, SegmentInfo segmentInfo, int codecID, Collection files) { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecID, BlockTreeTermsWriter.TERMS_EXTENSION)); + files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecID, BlockTreeTermsWriter.TERMS_INDEX_EXTENSION)); + } + + public static void getExtensions(Collection extensions) { + extensions.add(BlockTreeTermsWriter.TERMS_EXTENSION); + extensions.add(BlockTreeTermsWriter.TERMS_INDEX_EXTENSION); + } + + @Override + public FieldsEnum iterator() { + return new TermFieldsEnum(); + } + + @Override + public Terms terms(String field) throws IOException { + return fields.get(field); + } + + // Iterates through all fields + private class TermFieldsEnum extends FieldsEnum { + final Iterator it; + FieldReader current; + + TermFieldsEnum() { + it = fields.values().iterator(); + } + + @Override + public String next() { + if (it.hasNext()) { + current = it.next(); + return current.fieldInfo.name; + } else { + current = null; + return null; + } + } + + @Override + public TermsEnum terms() throws IOException { + return current.iterator(); + } + } + + // for debugging + String brToString(BytesRef b) { + if (b == null) { + return "null"; + } else { + try { + return b.utf8ToString() + " " + b; + } catch (Throwable t) { + // If BytesRef isn't actually UTF8, or it's eg a + // prefix of UTF8 that ends mid-unicode-char, we + // fallback to hex: + return b.toString(); + } + } + } + + public static class Stats { + public int indexNodeCount; + public int indexArcCount; + public int indexNumBytes; + + public long totalTermCount; + public long totalTermBytes; + + + public int nonFloorBlockCount; + public int floorBlockCount; + public int floorSubBlockCount; + public int mixedBlockCount; + public int termsOnlyBlockCount; + public int subBlocksOnlyBlockCount; + public int totalBlockCount; + + public int[] blockCountByPrefixLen = new int[10]; + private int startBlockCount; + private int endBlockCount; + public long totalBlockSuffixBytes; + public long totalBlockStatsBytes; + + // Postings impl plus the other few vInts stored in + // the 
frame: + public long totalBlockOtherBytes; + + public final String segment; + public final String field; + + public Stats(String segment, String field) { + this.segment = segment; + this.field = field; + } + + void startBlock(FieldReader.SegmentTermsEnum.Frame frame, boolean isFloor) { + totalBlockCount++; + if (isFloor) { + if (frame.fp == frame.fpOrig) { + floorBlockCount++; + } + floorSubBlockCount++; + } else { + nonFloorBlockCount++; + } + + if (blockCountByPrefixLen.length <= frame.prefix) { + blockCountByPrefixLen = ArrayUtil.grow(blockCountByPrefixLen, 1+frame.prefix); + } + blockCountByPrefixLen[frame.prefix]++; + startBlockCount++; + totalBlockSuffixBytes += frame.suffixesReader.length(); + totalBlockStatsBytes += frame.statsReader.length(); + } + + void endBlock(FieldReader.SegmentTermsEnum.Frame frame) { + final int termCount = frame.isLeafBlock ? frame.entCount : frame.state.termBlockOrd; + final int subBlockCount = frame.entCount - termCount; + totalTermCount += termCount; + if (termCount != 0 && subBlockCount != 0) { + mixedBlockCount++; + } else if (termCount != 0) { + termsOnlyBlockCount++; + } else if (subBlockCount != 0) { + subBlocksOnlyBlockCount++; + } else { + throw new IllegalStateException(); + } + endBlockCount++; + final long otherBytes = frame.fpEnd - frame.fp - frame.suffixesReader.length() - frame.statsReader.length(); + assert otherBytes > 0 : "otherBytes=" + otherBytes + " frame.fp=" + frame.fp + " frame.fpEnd=" + frame.fpEnd; + totalBlockOtherBytes += otherBytes; + } + + void term(BytesRef term) { + totalTermBytes += term.length; + } + + void finish() { + assert startBlockCount == endBlockCount: "startBlockCount=" + startBlockCount + " endBlockCount=" + endBlockCount; + assert totalBlockCount == floorSubBlockCount + nonFloorBlockCount: "floorSubBlockCount=" + floorSubBlockCount + " nonFloorBlockCount=" + nonFloorBlockCount + " totalBlockCount=" + totalBlockCount; + assert totalBlockCount == mixedBlockCount + termsOnlyBlockCount + subBlocksOnlyBlockCount: "totalBlockCount=" + totalBlockCount + " mixedBlockCount=" + mixedBlockCount + " subBlocksOnlyBlockCount=" + subBlocksOnlyBlockCount + " termsOnlyBlockCount=" + termsOnlyBlockCount; + } + + @Override + public String toString() { + final ByteArrayOutputStream bos = new ByteArrayOutputStream(1024); + final PrintStream out = new PrintStream(bos); + + out.println(" index FST:"); + out.println(" " + indexNodeCount + " nodes"); + out.println(" " + indexArcCount + " arcs"); + out.println(" " + indexNumBytes + " bytes"); + out.println(" terms:"); + out.println(" " + totalTermCount + " terms"); + out.println(" " + totalTermBytes + " bytes" + (totalTermCount != 0 ? " (" + String.format("%.1f", ((double) totalTermBytes)/totalTermCount) + " bytes/term)" : "")); + out.println(" blocks:"); + out.println(" " + totalBlockCount + " blocks"); + out.println(" " + termsOnlyBlockCount + " terms-only blocks"); + out.println(" " + subBlocksOnlyBlockCount + " sub-block-only blocks"); + out.println(" " + mixedBlockCount + " mixed blocks"); + out.println(" " + floorBlockCount + " floor blocks"); + out.println(" " + (totalBlockCount-floorSubBlockCount) + " non-floor blocks"); + out.println(" " + floorSubBlockCount + " floor sub-blocks"); + out.println(" " + totalBlockSuffixBytes + " term suffix bytes" + (totalBlockCount != 0 ? " (" + String.format("%.1f", ((double) totalBlockSuffixBytes)/totalBlockCount) + " suffix-bytes/block)" : "")); + out.println(" " + totalBlockStatsBytes + " term stats bytes" + (totalBlockCount != 0 ? 
" (" + String.format("%.1f", ((double) totalBlockStatsBytes)/totalBlockCount) + " stats-bytes/block)" : "")); + out.println(" " + totalBlockOtherBytes + " other bytes" + (totalBlockCount != 0 ? " (" + String.format("%.1f", ((double) totalBlockOtherBytes)/totalBlockCount) + " other-bytes/block)" : "")); + if (totalBlockCount != 0) { + out.println(" by prefix length:"); + int total = 0; + for(int prefix=0;prefix fstOutputs = ByteSequenceOutputs.getSingleton(); + final BytesRef NO_OUTPUT = fstOutputs.getNoOutput(); + + public final class FieldReader extends Terms implements Closeable { + final long numTerms; + final FieldInfo fieldInfo; + final long sumTotalTermFreq; + final long sumDocFreq; + final long indexStartFP; + final long rootBlockFP; + final BytesRef rootCode; + private FST index; + + //private boolean DEBUG; + + FieldReader(FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, long indexStartFP, IndexInput indexIn) throws IOException { + assert numTerms > 0; + this.fieldInfo = fieldInfo; + //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id"); + this.numTerms = numTerms; + this.sumTotalTermFreq = sumTotalTermFreq; + this.sumDocFreq = sumDocFreq; + this.indexStartFP = indexStartFP; + this.rootCode = rootCode; + // if (DEBUG) { + // System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootCode + " divisor=" + indexDivisor); + // } + + rootBlockFP = (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong() >>> BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS; + + if (indexIn != null) { + final IndexInput clone = (IndexInput) indexIn.clone(); + //System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name); + clone.seek(indexStartFP); + index = new FST(clone, ByteSequenceOutputs.getSingleton()); + + /* + if (false) { + final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; + Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); + Util.toDot(index, w, false, false); + System.out.println("FST INDEX: SAVED to " + dotFileName); + w.close(); + } + */ + } + } + + /** For debugging -- used by CheckIndex too*/ + // TODO: maybe push this into Terms? + public Stats computeStats() throws IOException { + return new SegmentTermsEnum().computeBlockStats(); + } + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public void close() { + super.close(); + } + + @Override + public TermsEnum iterator() throws IOException { + return new SegmentTermsEnum(); + } + + @Override + public long getUniqueTermCount() { + return numTerms; + } + + @Override + public long getSumTotalTermFreq() { + return sumTotalTermFreq; + } + + @Override + public long getSumDocFreq() { + return sumDocFreq; + } + + @Override + public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); + } + return new IntersectEnum(compiled, startTerm); + } + + // NOTE: cannot seek! 
+ private final class IntersectEnum extends TermsEnum { + private final IndexInput in; + + private Frame[] stack; + + @SuppressWarnings("unchecked") private FST.Arc[] arcs = new FST.Arc[5]; + + private final RunAutomaton runAutomaton; + private final CompiledAutomaton compiledAutomaton; + + private Frame currentFrame; + + private final BytesRef term = new BytesRef(); + + // TODO: can we share this with the frame in STE? + private final class Frame { + final int ord; + long fp; + long fpOrig; + long fpEnd; + long lastSubFP; + + // State in automaton + int state; + + int metaDataUpto; + + byte[] suffixBytes = new byte[128]; + final ByteArrayDataInput suffixesReader = new ByteArrayDataInput(); + + byte[] statBytes = new byte[64]; + final ByteArrayDataInput statsReader = new ByteArrayDataInput(); + + byte[] floorData = new byte[32]; + final ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); + + // Length of prefix shared by all terms in this block + int prefix; + + // Number of entries (term or sub-block) in this block + int entCount; + + // Which term we will next read + int nextEnt; + + // True if this block is either not a floor block, + // or, it's the last sub-block of a floor block + boolean isLastInFloor; + + // True if all entries are terms + boolean isLeafBlock; + + int numFollowFloorBlocks; + int nextFloorLabel; + + Transition[] transitions; + int curTransitionMax; + int transitionIndex; + + FST.Arc arc; + + final BlockTermState termState; + + // Cumulative output so far + BytesRef outputPrefix; + + private int startBytePos; + private int suffix; + + public Frame(int ord) throws IOException { + this.ord = ord; + termState = postingsReader.newTermState(); + termState.totalTermFreq = -1; + } + + void loadNextFloorBlock() throws IOException { + assert numFollowFloorBlocks > 0; + //if (DEBUG) System.out.println(" loadNextFoorBlock trans=" + transitions[transitionIndex]); + + do { + fp = fpOrig + (floorDataReader.readVLong() >>> 1); + numFollowFloorBlocks--; + // if (DEBUG) System.out.println(" skip floor block2! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[transitionIndex].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks); + if (numFollowFloorBlocks != 0) { + nextFloorLabel = floorDataReader.readByte() & 0xff; + } else { + nextFloorLabel = 256; + } + // if (DEBUG) System.out.println(" nextFloorLabel=" + (char) nextFloorLabel); + } while (numFollowFloorBlocks != 0 && nextFloorLabel <= transitions[transitionIndex].getMin()); + + load(null); + } + + public void setState(int state) { + this.state = state; + transitionIndex = 0; + transitions = compiledAutomaton.sortedTransitions[state]; + if (transitions.length != 0) { + curTransitionMax = transitions[0].getMax(); + } else { + curTransitionMax = -1; + } + } + + void load(BytesRef frameIndexData) throws IOException { + + // if (DEBUG) System.out.println(" load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? 
transitions[0] : "n/a" + " state=" + state)); + + if (frameIndexData != null && transitions.length != 0) { + // Floor frame + if (floorData.length < frameIndexData.length) { + this.floorData = new byte[ArrayUtil.oversize(frameIndexData.length, 1)]; + } + System.arraycopy(frameIndexData.bytes, frameIndexData.offset, floorData, 0, frameIndexData.length); + floorDataReader.reset(floorData, 0, frameIndexData.length); + // Skip first long -- has redundant fp, hasTerms + // flag, isFloor flag + final long code = floorDataReader.readVLong(); + if ((code & BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0) { + numFollowFloorBlocks = floorDataReader.readVInt(); + nextFloorLabel = floorDataReader.readByte() & 0xff; + // if (DEBUG) System.out.println(" numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + nextFloorLabel); + + // If current state is accept, we must process + // first block in case it has empty suffix: + if (!runAutomaton.isAccept(state)) { + // Maybe skip floor blocks: + while (numFollowFloorBlocks != 0 && nextFloorLabel <= transitions[0].getMin()) { + fp = fpOrig + (floorDataReader.readVLong() >>> 1); + numFollowFloorBlocks--; + // if (DEBUG) System.out.println(" skip floor block! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[0].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks); + if (numFollowFloorBlocks != 0) { + nextFloorLabel = floorDataReader.readByte() & 0xff; + } else { + nextFloorLabel = 256; + } + } + } + } + } + + in.seek(fp); + int code = in.readVInt(); + entCount = code >>> 1; + assert entCount > 0; + isLastInFloor = (code & 1) != 0; + + // term suffixes: + code = in.readVInt(); + isLeafBlock = (code & 1) != 0; + int numBytes = code >>> 1; + // if (DEBUG) System.out.println(" entCount=" + entCount + " lastInFloor?=" + isLastInFloor + " leafBlock?=" + isLeafBlock + " numSuffixBytes=" + numBytes); + if (suffixBytes.length < numBytes) { + suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + in.readBytes(suffixBytes, 0, numBytes); + suffixesReader.reset(suffixBytes, 0, numBytes); + + // stats + numBytes = in.readVInt(); + if (statBytes.length < numBytes) { + statBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + in.readBytes(statBytes, 0, numBytes); + statsReader.reset(statBytes, 0, numBytes); + metaDataUpto = 0; + + termState.termBlockOrd = 0; + nextEnt = 0; + + postingsReader.readTermsBlock(in, fieldInfo, termState); + + if (!isLastInFloor) { + // Sub-blocks of a single floor block are always + // written one after another -- tail recurse: + fpEnd = in.getFilePointer(); + } + } + + // TODO: maybe add scanToLabel; should give perf boost + + public boolean next() { + return isLeafBlock ? 
nextLeaf() : nextNonLeaf(); + } + + // Decodes next entry; returns true if it's a sub-block + public boolean nextLeaf() { + //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount); + assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + suffix = suffixesReader.readVInt(); + startBytePos = suffixesReader.getPosition(); + suffixesReader.skipBytes(suffix); + return false; + } + + public boolean nextNonLeaf() { + //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount); + assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + final int code = suffixesReader.readVInt(); + suffix = code >>> 1; + startBytePos = suffixesReader.getPosition(); + suffixesReader.skipBytes(suffix); + if ((code & 1) == 0) { + // A normal term + termState.termBlockOrd++; + return false; + } else { + // A sub-block; make sub-FP absolute: + lastSubFP = fp - suffixesReader.readVLong(); + return true; + } + } + + public int getTermBlockOrd() { + return isLeafBlock ? nextEnt : termState.termBlockOrd; + } + + public void decodeMetaData() throws IOException { + + // lazily catch up on metadata decode: + final int limit = getTermBlockOrd(); + assert limit > 0; + + // We must set/incr state.termCount because + // postings impl can look at this + termState.termBlockOrd = metaDataUpto; + + // TODO: better API would be "jump straight to term=N"??? + while (metaDataUpto < limit) { + + // TODO: we could make "tiers" of metadata, ie, + // decode docFreq/totalTF but don't decode postings + // metadata; this way caller could get + // docFreq/totalTF w/o paying decode cost for + // postings + + // TODO: if docFreq were bulk decoded we could + // just skipN here: + termState.docFreq = statsReader.readVInt(); + //if (DEBUG) System.out.println(" dF=" + state.docFreq); + if (fieldInfo.indexOptions != IndexOptions.DOCS_ONLY) { + termState.totalTermFreq = termState.docFreq + statsReader.readVLong(); + //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq); + } + + postingsReader.nextTerm(fieldInfo, termState); + metaDataUpto++; + termState.termBlockOrd++; + } + } + } + + private final BytesRef savedStartTerm; + + // TODO: in some cases we can filter by length? eg + // regexp foo*bar must be at least length 6 bytes + public IntersectEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + // if (DEBUG) { + // System.out.println("\nintEnum.init seg=" + segment + " commonSuffix=" + brToString(compiled.commonSuffixRef)); + // } + runAutomaton = compiled.runAutomaton; + compiledAutomaton = compiled; + in = (IndexInput) BlockTreeTermsReader.this.in.clone(); + stack = new Frame[5]; + for(int idx=0;idx(); + } + + // TODO: if the automaon is "smallish" we really + // should use the terms index to seek at least to + // the initial term and likely to subsequent terms + // (or, maybe just fallback to ATE for such cases). + // Else the seek cost of loading the frames will be + // too costly. + + final FST.Arc arc = index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! 
+ assert arc.isFinal(); + + // Special pushFrame since it's the first one: + final Frame f = stack[0]; + f.fp = f.fpOrig = rootBlockFP; + f.prefix = 0; + f.setState(runAutomaton.getInitialState()); + f.arc = arc; + f.outputPrefix = arc.output; + f.load(rootCode); + + // for assert: + savedStartTerm = startTerm == null ? null : new BytesRef(startTerm); + + currentFrame = f; + if (startTerm != null) { + seekToStartTerm(startTerm); + } + } + + @Override + public TermState termState() throws IOException { + currentFrame.decodeMetaData(); + return (TermState) currentFrame.termState.clone(); + } + + private Frame getFrame(int ord) throws IOException { + if (ord >= stack.length) { + final Frame[] next = new Frame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(stack, 0, next, 0, stack.length); + for(int stackOrd=stack.length;stackOrd getArc(int ord) { + if (ord >= arcs.length) { + @SuppressWarnings("unchecked") final FST.Arc[] next = new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(arcs, 0, next, 0, arcs.length); + for(int arcOrd=arcs.length;arcOrd(); + } + arcs = next; + } + return arcs[ord]; + } + + private Frame pushFrame(int state) throws IOException { + final Frame f = getFrame(currentFrame == null ? 0 : 1+currentFrame.ord); + + f.fp = f.fpOrig = currentFrame.lastSubFP; + f.prefix = currentFrame.prefix + currentFrame.suffix; + // if (DEBUG) System.out.println(" pushFrame state=" + state + " prefix=" + f.prefix); + f.setState(state); + + // Walk the arc through the index -- we only + // "bother" with this so we can get the floor data + // from the index and skip floor blocks when + // possible: + FST.Arc arc = currentFrame.arc; + int idx = currentFrame.prefix; + assert currentFrame.suffix > 0; + BytesRef output = currentFrame.outputPrefix; + while (idx < f.prefix) { + final int target = term.bytes[idx] & 0xff; + // TODO: we could be more efficient for the next() + // case by using current arc as starting point, + // passed to findTargetArc + arc = index.findTargetArc(target, arc, getArc(1+idx)); + assert arc != null; + output = fstOutputs.add(output, arc.output); + idx++; + } + + f.arc = arc; + f.outputPrefix = output; + assert arc.isFinal(); + f.load(fstOutputs.add(output, arc.nextFinalOutput)); + return f; + } + + @Override + public BytesRef term() throws IOException { + return term; + } + + @Override + public int docFreq() throws IOException { + //if (DEBUG) System.out.println("BTIR.docFreq"); + currentFrame.decodeMetaData(); + //if (DEBUG) System.out.println(" return " + currentFrame.termState.docFreq); + return currentFrame.termState.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + currentFrame.decodeMetaData(); + return currentFrame.termState.totalTermFreq; + } + + @Override + public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { + currentFrame.decodeMetaData(); + return postingsReader.docs(fieldInfo, currentFrame.termState, skipDocs, reuse); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException { + if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + return null; + } else { + currentFrame.decodeMetaData(); + return postingsReader.docsAndPositions(fieldInfo, currentFrame.termState, skipDocs, reuse); + } + } + + private int getState() { + int state = currentFrame.state; + for(int idx=0;idx 0) { + // A prefix of the common suffix overlaps with + // the 
suffix of the block prefix so we first + // test whether the prefix part matches: + final byte[] termBytes = term.bytes; + int termBytesPos = currentFrame.prefix - lenInPrefix; + assert termBytesPos >= 0; + final int termBytesPosEnd = currentFrame.prefix; + while (termBytesPos < termBytesPosEnd) { + if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) { + // if (DEBUG) { + // System.out.println(" skip: common suffix mismatch (in prefix)"); + // } + continue nextTerm; + } + } + suffixBytesPos = currentFrame.startBytePos; + } else { + suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - compiledAutomaton.commonSuffixRef.length; + } + + // Test overlapping suffix part: + final int commonSuffixBytesPosEnd = compiledAutomaton.commonSuffixRef.length; + while (commonSuffixBytesPos < commonSuffixBytesPosEnd) { + if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) { + // if (DEBUG) { + // System.out.println(" skip: common suffix mismatch"); + // } + continue nextTerm; + } + } + } + + // TODO: maybe we should do the same linear test + // that AutomatonTermsEnum does, so that if we + // reach a part of the automaton where .* is + // "temporarily" accepted, we just blindly .next() + // until the limit + + // See if the term prefix matches the automaton: + int state = currentFrame.state; + for (int idx=0;idx[] arcs = new FST.Arc[5]; + + public SegmentTermsEnum() throws IOException { + //if (DEBUG) System.out.println("BTTR.init seg=" + segment); + in = (IndexInput) BlockTreeTermsReader.this.in.clone(); + stack = new Frame[5]; + for(int stackOrd=0;stackOrd(); + } + + // Init w/ root block; don't use index since it may + // not (and need not) have been loaded + //final FST.Arc arc = index.getFirstArc(arcs[0]); + + // Empty string prefix must have an output in the index! + //assert arc.isFinal(); + + currentFrame = staticFrame; + final FST.Arc arc; + if (index != null) { + arc = index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! + assert arc.isFinal(); + } else { + arc = null; + } + currentFrame = pushFrame(arc, rootCode, 0); + currentFrame.loadBlock(); + validIndexPrefix = 0; + // if (DEBUG) { + // System.out.println("init frame state " + currentFrame.ord); + // printSeekState(); + // } + + //System.out.println(); + // computeBlockStats().print(System.out); + } + + /** Runs next() through the entire terms dict, + * computing aggregate statistics. */ + public Stats computeBlockStats() throws IOException { + + Stats stats = new Stats(segment, fieldInfo.name); + if (index != null) { + stats.indexNodeCount = index.getNodeCount(); + stats.indexArcCount = index.getArcCount(); + stats.indexNumBytes = index.sizeInBytes(); + } + + currentFrame = staticFrame; + FST.Arc arc; + if (index != null) { + arc = index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! + assert arc.isFinal(); + } else { + arc = null; + } + + // Empty string prefix must have an output in the + // index! 
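(For readers tracking the on-disk block layout while following this walk: the two packed vInts that Frame.load() earlier in this file reads at the start of every block can be unpacked as in the following illustrative helper; this is a sketch only, not part of the patch.)

  // Illustrative unpack of the two block-header vInts read in Frame.load():
  static void decodeBlockHeader(int entCountCode, int suffixCode) {
    final int entCount        = entCountCode >>> 1;       // number of term/sub-block entries
    final boolean lastInFloor = (entCountCode & 1) != 0;  // last (or only) block for this prefix
    final boolean isLeafBlock = (suffixCode & 1) != 0;    // true if every entry is a term
    final int numSuffixBytes  = suffixCode >>> 1;         // size of the packed suffix bytes that follow
    System.out.println("entCount=" + entCount + " lastInFloor=" + lastInFloor +
                       " leaf=" + isLeafBlock + " suffixBytes=" + numSuffixBytes);
  }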
+ currentFrame = pushFrame(arc, rootCode, 0); + currentFrame.fpOrig = currentFrame.fp; + currentFrame.loadBlock(); + validIndexPrefix = 0; + + stats.startBlock(currentFrame, !currentFrame.isLastInFloor); + + allTerms: + while (true) { + + // Pop finished blocks + while (currentFrame.nextEnt == currentFrame.entCount) { + stats.endBlock(currentFrame); + if (!currentFrame.isLastInFloor) { + currentFrame.loadNextFloorBlock(); + stats.startBlock(currentFrame, true); + } else { + if (currentFrame.ord == 0) { + break allTerms; + } + final long lastFP = currentFrame.fpOrig; + currentFrame = stack[currentFrame.ord-1]; + assert lastFP == currentFrame.lastSubFP; + // if (DEBUG) { + // System.out.println(" reset validIndexPrefix=" + validIndexPrefix); + // } + } + } + + while(true) { + if (currentFrame.next()) { + // Push to new block: + currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length); + currentFrame.fpOrig = currentFrame.fp; + // This is a "next" frame -- even if it's + // floor'd we must pretend it isn't so we don't + // try to scan to the right floor frame: + currentFrame.isFloor = false; + //currentFrame.hasTerms = true; + currentFrame.loadBlock(); + stats.startBlock(currentFrame, !currentFrame.isLastInFloor); + } else { + stats.term(term); + break; + } + } + } + + stats.finish(); + + // Put root frame back: + currentFrame = staticFrame; + if (index != null) { + arc = index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! + assert arc.isFinal(); + } else { + arc = null; + } + currentFrame = pushFrame(arc, rootCode, 0); + currentFrame.rewind(); + currentFrame.loadBlock(); + validIndexPrefix = 0; + term.length = 0; + + return stats; + } + + private Frame getFrame(int ord) throws IOException { + if (ord >= stack.length) { + final Frame[] next = new Frame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(stack, 0, next, 0, stack.length); + for(int stackOrd=stack.length;stackOrd getArc(int ord) { + if (ord >= arcs.length) { + @SuppressWarnings("unchecked") final FST.Arc[] next = new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(arcs, 0, next, 0, arcs.length); + for(int arcOrd=arcs.length;arcOrd(); + } + arcs = next; + } + return arcs[ord]; + } + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + // Pushes a frame we seek'd to + Frame pushFrame(FST.Arc arc, BytesRef frameData, int length) throws IOException { + scratchReader.reset(frameData.bytes, frameData.offset, frameData.length); + final long code = scratchReader.readVLong(); + final long fpSeek = code >>> BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS; + final Frame f = getFrame(1+currentFrame.ord); + f.hasTerms = (code & BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0; + f.hasTermsOrig = f.hasTerms; + f.isFloor = (code & BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0; + if (f.isFloor) { + f.setFloorData(scratchReader, frameData); + } + pushFrame(arc, fpSeek, length); + + return f; + } + + // Pushes next'd frame or seek'd frame; we later + // lazy-load the frame only when needed + Frame pushFrame(FST.Arc arc, long fp, int length) throws IOException { + final Frame f = getFrame(1+currentFrame.ord); + f.arc = arc; + if (f.fpOrig == fp && f.nextEnt != -1) { + //if (DEBUG) System.out.println(" push reused frame ord=" + f.ord + " fp=" + f.fp + " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" + f.nextEnt + " 
targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" + term.length + " vs prefix=" + f.prefix); + if (f.prefix > targetBeforeCurrentLength) { + f.rewind(); + } else { + // if (DEBUG) { + // System.out.println(" skip rewind!"); + // } + } + assert length == f.prefix; + } else { + f.nextEnt = -1; + f.prefix = length; + f.state.termBlockOrd = 0; + f.fpOrig = f.fp = fp; + f.lastSubFP = -1; + // if (DEBUG) { + // final int sav = term.length; + // term.length = length; + // System.out.println(" push new frame ord=" + f.ord + " fp=" + f.fp + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " pref=" + brToString(term)); + // term.length = sav; + // } + } + + return f; + } + + // asserts only + private boolean clearEOF() { + eof = false; + return true; + } + + // asserts only + private boolean setEOF() { + eof = true; + return true; + } + + @Override + public boolean seekExact(final BytesRef target, final boolean useCache) throws IOException { + + if (index == null) { + throw new IllegalStateException("terms index was not loaded"); + } + + if (term.bytes.length <= target.length) { + term.bytes = ArrayUtil.grow(term.bytes, 1+target.length); + } + + assert clearEOF(); + + // if (DEBUG) { + // System.out.println("\nBTTR.seekExact seg=" + segment + " target=" + fieldInfo.name + ":" + brToString(target) + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix=" + validIndexPrefix); + // printSeekState(); + // } + + FST.Arc arc; + int targetUpto; + BytesRef output; + + targetBeforeCurrentLength = currentFrame.ord; + + if (currentFrame != staticFrame) { + + // We are already seek'd; find the common + // prefix of new seek term vs current term and + // re-use the corresponding seek state. For + // example, if app first seeks to foobar, then + // seeks to foobaz, we can re-use the seek state + // for the first 5 bytes. 
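(As a standalone illustration of the seek-state re-use described just above -- the "foobar" then "foobaz" example -- the shared-prefix computation can be sketched as below; the helper is hypothetical and works on plain byte arrays rather than the term/target BytesRefs that the real code walks in lock-step with its frame stack.)

  // Hypothetical sketch: length of the byte prefix shared by the current term
  // and the new seek target, which bounds how many already-pushed seek frames
  // can be kept.
  static int sharedPrefixLength(byte[] current, int currentLen, byte[] target, int targetLen) {
    final int limit = Math.min(currentLen, targetLen);
    int upto = 0;
    while (upto < limit && (current[upto] & 0xFF) == (target[upto] & 0xFF)) {
      upto++;
    }
    return upto;   // e.g. "foobar" vs "foobaz" share the first 5 bytes
  }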
+ + // if (DEBUG) { + // System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix); + // } + + arc = arcs[0]; + assert arc.isFinal(); + output = arc.output; + targetUpto = 0; + + Frame lastFrame = stack[0]; + assert validIndexPrefix <= term.length; + + final int targetLimit = Math.min(target.length, validIndexPrefix); + + int cmp = 0; + + // TODO: reverse vLong byte order for better FST + // prefix output sharing + + // First compare up to valid seek frames: + while (targetUpto < targetLimit) { + cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); + // if (DEBUG) { + // System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output); + // } + if (cmp != 0) { + break; + } + arc = arcs[1+targetUpto]; + //if (arc.label != (target.bytes[target.offset + targetUpto] & 0xFF)) { + //System.out.println("FAIL: arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF)); + //} + assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF); + if (arc.output != NO_OUTPUT) { + output = fstOutputs.add(output, arc.output); + } + if (arc.isFinal()) { + lastFrame = stack[1+lastFrame.ord]; + } + targetUpto++; + } + + if (cmp == 0) { + final int targetUptoMid = targetUpto; + + // Second compare the rest of the term, but + // don't save arc/output/frame; we only do this + // to find out if the target term is before, + // equal or after the current term + final int targetLimit2 = Math.min(target.length, term.length); + while (targetUpto < targetLimit2) { + cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); + // if (DEBUG) { + // System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"); + // } + if (cmp != 0) { + break; + } + targetUpto++; + } + + if (cmp == 0) { + cmp = term.length - target.length; + } + targetUpto = targetUptoMid; + } + + if (cmp < 0) { + // Common case: target term is after current + // term, ie, app is seeking multiple terms + // in sorted order + // if (DEBUG) { + // System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); frame.ord=" + lastFrame.ord); + // } + currentFrame = lastFrame; + + } else if (cmp > 0) { + // Uncommon case: target term + // is before current term; this means we can + // keep the currentFrame but we must rewind it + // (so we scan from the start) + targetBeforeCurrentLength = 0; + // if (DEBUG) { + // System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord); + // } + currentFrame = lastFrame; + currentFrame.rewind(); + } else { + // Target is exactly the same as current term + assert term.length == target.length; + if (termExists) { + // if (DEBUG) { + // System.out.println(" target is same as current; return true"); + // } + return true; + } else { + // if (DEBUG) { + // System.out.println(" target is same as current but term doesn't exist"); + // } + } + //validIndexPrefix = currentFrame.depth; + //term.length = target.length; 
+ //return termExists; + } + + } else { + + targetBeforeCurrentLength = -1; + arc = index.getFirstArc(arcs[0]); + + // Empty string prefix must have an output (block) in the index! + assert arc.isFinal(); + assert arc.output != null; + + // if (DEBUG) { + // System.out.println(" no seek state; push root frame"); + // } + + output = arc.output; + + currentFrame = staticFrame; + + //term.length = 0; + targetUpto = 0; + currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), 0); + } + + // if (DEBUG) { + // System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength); + // } + + while (targetUpto < target.length) { + + final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF; + + final FST.Arc nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto)); + + if (nextArc == null) { + + // Index is exhausted + // if (DEBUG) { + // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel)); + // } + + validIndexPrefix = currentFrame.prefix; + //validIndexPrefix = targetUpto; + + currentFrame.scanToFloorFrame(target); + + if (!currentFrame.hasTerms) { + termExists = false; + term.bytes[targetUpto] = (byte) targetLabel; + term.length = 1+targetUpto; + // if (DEBUG) { + // System.out.println(" FAST NOT_FOUND term=" + brToString(term)); + // } + return false; + } + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, true); + if (result == SeekStatus.FOUND) { + // if (DEBUG) { + // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); + // } + return true; + } else { + // if (DEBUG) { + // System.out.println(" got " + result + "; return NOT_FOUND term=" + brToString(term)); + // } + return false; + } + } else { + // Follow this arc + arc = nextArc; + term.bytes[targetUpto] = (byte) targetLabel; + // Aggregate output as we go: + assert arc.output != null; + if (arc.output != NO_OUTPUT) { + output = fstOutputs.add(output, arc.output); + } + + // if (DEBUG) { + // System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput); + // } + targetUpto++; + + if (arc.isFinal()) { + //if (DEBUG) System.out.println(" arc is final!"); + currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), targetUpto); + //if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms); + } + } + } + + //validIndexPrefix = targetUpto; + validIndexPrefix = currentFrame.prefix; + + currentFrame.scanToFloorFrame(target); + + // Target term is entirely contained in the index: + if (!currentFrame.hasTerms) { + termExists = false; + term.length = targetUpto; + // if (DEBUG) { + // System.out.println(" FAST NOT_FOUND term=" + brToString(term)); + // } + return false; + } + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, true); + if (result == SeekStatus.FOUND) { + // if (DEBUG) { + // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); + // } + return true; + } else { + // if (DEBUG) { + // System.out.println(" got result " + result + "; return NOT_FOUND term=" + term.utf8ToString()); + // } + + return false; + } + } + + @Override + public SeekStatus seekCeil(final BytesRef target, final boolean useCache) throws IOException { + if 
(index == null) { + throw new IllegalStateException("terms index was not loaded"); + } + + if (term.bytes.length <= target.length) { + term.bytes = ArrayUtil.grow(term.bytes, 1+target.length); + } + + assert clearEOF(); + + //if (DEBUG) { + //System.out.println("\nBTTR.seekCeil seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix= " + validIndexPrefix); + //printSeekState(); + //} + + FST.Arc arc; + int targetUpto; + BytesRef output; + + targetBeforeCurrentLength = currentFrame.ord; + + if (currentFrame != staticFrame) { + + // We are already seek'd; find the common + // prefix of new seek term vs current term and + // re-use the corresponding seek state. For + // example, if app first seeks to foobar, then + // seeks to foobaz, we can re-use the seek state + // for the first 5 bytes. + + //if (DEBUG) { + //System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix); + //} + + arc = arcs[0]; + assert arc.isFinal(); + output = arc.output; + targetUpto = 0; + + Frame lastFrame = stack[0]; + assert validIndexPrefix <= term.length; + + final int targetLimit = Math.min(target.length, validIndexPrefix); + + int cmp = 0; + + // TOOD: we should write our vLong backwards (MSB + // first) to get better sharing from the FST + + // First compare up to valid seek frames: + while (targetUpto < targetLimit) { + cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); + //if (DEBUG) { + //System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output); + //} + if (cmp != 0) { + break; + } + arc = arcs[1+targetUpto]; + assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF); + // TOOD: we could save the outputs in local + // byte[][] instead of making new objs ever + // seek; but, often the FST doesn't have any + // shared bytes (but this could change if we + // reverse vLong byte order) + if (arc.output != NO_OUTPUT) { + output = fstOutputs.add(output, arc.output); + } + if (arc.isFinal()) { + lastFrame = stack[1+lastFrame.ord]; + } + targetUpto++; + } + + + if (cmp == 0) { + final int targetUptoMid = targetUpto; + // Second compare the rest of the term, but + // don't save arc/output/frame: + final int targetLimit2 = Math.min(target.length, term.length); + while (targetUpto < targetLimit2) { + cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); + //if (DEBUG) { + //System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"); + //} + if (cmp != 0) { + break; + } + targetUpto++; + } + + if (cmp == 0) { + cmp = term.length - target.length; + } + targetUpto = targetUptoMid; + } + + if (cmp < 0) { + // Common case: target term is after current + // term, ie, app is seeking multiple terms + // in sorted order + //if (DEBUG) { + //System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); clear frame.scanned ord=" + lastFrame.ord); + //} + currentFrame = lastFrame; + + } 
else if (cmp > 0) { + // Uncommon case: target term + // is before current term; this means we can + // keep the currentFrame but we must rewind it + // (so we scan from the start) + targetBeforeCurrentLength = 0; + //if (DEBUG) { + //System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord); + //} + currentFrame = lastFrame; + currentFrame.rewind(); + } else { + // Target is exactly the same as current term + assert term.length == target.length; + if (termExists) { + //if (DEBUG) { + //System.out.println(" target is same as current; return FOUND"); + //} + return SeekStatus.FOUND; + } else { + //if (DEBUG) { + //System.out.println(" target is same as current but term doesn't exist"); + //} + } + } + + } else { + + targetBeforeCurrentLength = -1; + arc = index.getFirstArc(arcs[0]); + + // Empty string prefix must have an output (block) in the index! + assert arc.isFinal(); + assert arc.output != null; + + //if (DEBUG) { + //System.out.println(" no seek state; push root frame"); + //} + + output = arc.output; + + currentFrame = staticFrame; + + //term.length = 0; + targetUpto = 0; + currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), 0); + } + + //if (DEBUG) { + //System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord+1=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength); + //} + + while (targetUpto < target.length) { + + final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF; + + final FST.Arc nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto)); + + if (nextArc == null) { + + // Index is exhausted + // if (DEBUG) { + // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel)); + // } + + validIndexPrefix = currentFrame.prefix; + //validIndexPrefix = targetUpto; + + currentFrame.scanToFloorFrame(target); + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, false); + if (result == SeekStatus.END) { + term.copy(target); + termExists = false; + + if (next() != null) { + //if (DEBUG) { + //System.out.println(" return NOT_FOUND term=" + brToString(term) + " " + term); + //} + return SeekStatus.NOT_FOUND; + } else { + //if (DEBUG) { + //System.out.println(" return END"); + //} + return SeekStatus.END; + } + } else { + //if (DEBUG) { + //System.out.println(" return " + result + " term=" + brToString(term) + " " + term); + //} + return result; + } + } else { + // Follow this arc + term.bytes[targetUpto] = (byte) targetLabel; + arc = nextArc; + // Aggregate output as we go: + assert arc.output != null; + if (arc.output != NO_OUTPUT) { + output = fstOutputs.add(output, arc.output); + } + + //if (DEBUG) { + //System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput); + //} + targetUpto++; + + if (arc.isFinal()) { + //if (DEBUG) System.out.println(" arc is final!"); + currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), targetUpto); + //if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms); + } + } + } + + //validIndexPrefix = targetUpto; + validIndexPrefix = currentFrame.prefix; + + currentFrame.scanToFloorFrame(target); + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, false); + + if (result == 
SeekStatus.END) { + term.copy(target); + termExists = false; + if (next() != null) { + //if (DEBUG) { + //System.out.println(" return NOT_FOUND term=" + term.utf8ToString() + " " + term); + //} + return SeekStatus.NOT_FOUND; + } else { + //if (DEBUG) { + //System.out.println(" return END"); + //} + return SeekStatus.END; + } + } else { + return result; + } + } + + private void printSeekState() throws IOException { + if (currentFrame == staticFrame) { + System.out.println(" no prior seek"); + } else { + System.out.println(" prior seek state:"); + int ord = 0; + boolean isSeekFrame = true; + while(true) { + Frame f = getFrame(ord); + assert f != null; + final BytesRef prefix = new BytesRef(term.bytes, 0, f.prefix); + if (f.nextEnt == -1) { + System.out.println(" frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp< 0 || fp != fpOrig) { + if (DEBUG) { + System.out.println(" rewind frame ord=" + ord + " fpOrig=" + fpOrig + " fp=" + fp + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " nextEnt=" + nextEnt + " prefixLen=" + prefix); + } + if (fp != fpOrig) { + fp = fpOrig; + nextEnt = -1; + } else { + nextEnt = 0; + } + hasTerms = hasTermsOrig; + if (isFloor) { + floorDataReader.rewind(); + numFollowFloorBlocks = floorDataReader.readVInt(); + nextFloorLabel = floorDataReader.readByte() & 0xff; + } + assert suffixBytes != null; + suffixesReader.rewind(); + assert statBytes != null; + statsReader.rewind(); + metaDataUpto = 0; + state.termBlockOrd = 0; + // TODO: skip this if !hasTerms? Then postings + // impl wouldn't have to write useless 0 byte + postingsReader.resetTermsBlock(fieldInfo, state); + lastSubFP = -1; + } else if (DEBUG) { + System.out.println(" skip rewind fp=" + fp + " fpOrig=" + fpOrig + " nextEnt=" + nextEnt + " ord=" + ord); + } + */ + } + + public boolean next() { + return isLeafBlock ? 
nextLeaf() : nextNonLeaf(); + } + + // Decodes next entry; returns true if it's a sub-block + public boolean nextLeaf() { + //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount); + assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + suffix = suffixesReader.readVInt(); + startBytePos = suffixesReader.getPosition(); + term.length = prefix + suffix; + if (term.bytes.length < term.length) { + term.grow(term.length); + } + suffixesReader.readBytes(term.bytes, prefix, suffix); + // A normal term + termExists = true; + return false; + } + + public boolean nextNonLeaf() { + //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount); + assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + final int code = suffixesReader.readVInt(); + suffix = code >>> 1; + startBytePos = suffixesReader.getPosition(); + term.length = prefix + suffix; + if (term.bytes.length < term.length) { + term.grow(term.length); + } + suffixesReader.readBytes(term.bytes, prefix, suffix); + if ((code & 1) == 0) { + // A normal term + termExists = true; + state.termBlockOrd++; + return false; + } else { + // A sub-block; make sub-FP absolute: + termExists = false; + lastSubFP = fp - suffixesReader.readVLong(); + //if (DEBUG) { + //System.out.println(" lastSubFP=" + lastSubFP); + //} + return true; + } + } + + // TODO: make this array'd so we can do bin search? + // likely not worth it? need to measure how many + // floor blocks we "typically" get + public void scanToFloorFrame(BytesRef target) { + + if (!isFloor || target.length <= prefix) { + // if (DEBUG) { + // System.out.println(" scanToFloorFrame skip: isFloor=" + isFloor + " target.length=" + target.length + " vs prefix=" + prefix); + // } + return; + } + + final int targetLabel = target.bytes[target.offset + prefix] & 0xFF; + + // if (DEBUG) { + // System.out.println(" scanToFloorFrame fpOrig=" + fpOrig + " targetLabel=" + toHex(targetLabel) + " vs nextFloorLabel=" + toHex(nextFloorLabel) + " numFollowFloorBlocks=" + numFollowFloorBlocks); + // } + + if (targetLabel < nextFloorLabel) { + // if (DEBUG) { + // System.out.println(" already on correct block"); + // } + return; + } + + assert numFollowFloorBlocks != 0; + + long newFP = fpOrig; + while (true) { + final long code = floorDataReader.readVLong(); + newFP = fpOrig + (code >>> 1); + hasTerms = (code & 1) != 0; + // if (DEBUG) { + // System.out.println(" label=" + toHex(nextFloorLabel) + " fp=" + newFP + " hasTerms?=" + hasTerms + " numFollowFloor=" + numFollowFloorBlocks); + // } + + isLastInFloor = numFollowFloorBlocks == 1; + numFollowFloorBlocks--; + + if (isLastInFloor) { + nextFloorLabel = 256; + // if (DEBUG) { + // System.out.println(" stop! last block nextFloorLabel=" + toHex(nextFloorLabel)); + // } + break; + } else { + nextFloorLabel = floorDataReader.readByte() & 0xff; + if (targetLabel < nextFloorLabel) { + // if (DEBUG) { + // System.out.println(" stop! 
nextFloorLabel=" + toHex(nextFloorLabel)); + // } + break; + } + } + } + + if (newFP != fp) { + // Force re-load of the block: + // if (DEBUG) { + // System.out.println(" force switch to fp=" + newFP + " oldFP=" + fp); + // } + nextEnt = -1; + fp = newFP; + } else { + // if (DEBUG) { + // System.out.println(" stay on same fp=" + newFP); + // } + } + } + + public void decodeMetaData() throws IOException { + + //if (DEBUG) System.out.println("\nBTTR.decodeMetadata seg=" + segment + " mdUpto=" + metaDataUpto + " vs termBlockOrd=" + state.termBlockOrd); + + // lazily catch up on metadata decode: + final int limit = getTermBlockOrd(); + assert limit > 0; + + // We must set/incr state.termCount because + // postings impl can look at this + state.termBlockOrd = metaDataUpto; + + // TODO: better API would be "jump straight to term=N"??? + while (metaDataUpto < limit) { + + // TODO: we could make "tiers" of metadata, ie, + // decode docFreq/totalTF but don't decode postings + // metadata; this way caller could get + // docFreq/totalTF w/o paying decode cost for + // postings + + // TODO: if docFreq were bulk decoded we could + // just skipN here: + state.docFreq = statsReader.readVInt(); + //if (DEBUG) System.out.println(" dF=" + state.docFreq); + if (fieldInfo.indexOptions != IndexOptions.DOCS_ONLY) { + state.totalTermFreq = state.docFreq + statsReader.readVLong(); + //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq); + } + + postingsReader.nextTerm(fieldInfo, state); + metaDataUpto++; + state.termBlockOrd++; + } + } + + // Used only by assert + private boolean prefixMatches(BytesRef target) { + for(int bytePos=0;bytePos fields = new ArrayList(); + // private final String segment; + + /** Create a new writer. The number of items (terms or + * sub-blocks) per block will aim to be between + * minItemsPerBlock and maxItemsPerBlock, though in some + * cases the blocks may be smaller than the min. 
*/ + public BlockTreeTermsWriter( + SegmentWriteState state, + PostingsWriterBase postingsWriter, + int minItemsInBlock, + int maxItemsInBlock) + throws IOException + { + if (minItemsInBlock <= 1) { + throw new IllegalArgumentException("minItemsInBlock must be >= 2; got " + minItemsInBlock); + } + if (maxItemsInBlock <= 0) { + throw new IllegalArgumentException("maxItemsInBlock must be >= 1; got " + maxItemsInBlock); + } + if (minItemsInBlock > maxItemsInBlock) { + throw new IllegalArgumentException("maxItemsInBlock must be >= minItemsInBlock; got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock); + } + if (2*(minItemsInBlock-1) > maxItemsInBlock) { + throw new IllegalArgumentException("maxItemsInBlock must be at least 2*(minItemsInBlock-1); got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock); + } + + final String termsFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_EXTENSION); + out = state.directory.createOutput(termsFileName, state.context); + boolean success = false; + IndexOutput indexOut = null; + try { + fieldInfos = state.fieldInfos; + this.minItemsInBlock = minItemsInBlock; + this.maxItemsInBlock = maxItemsInBlock; + writeHeader(out); + + //DEBUG = state.segmentName.equals("_4a"); + + final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_INDEX_EXTENSION); + indexOut = state.directory.createOutput(termsIndexFileName, state.context); + writeIndexHeader(indexOut); + + currentField = null; + this.postingsWriter = postingsWriter; + // segment = state.segmentName; + + // System.out.println("BTW.init seg=" + state.segmentName); + + postingsWriter.start(out); // have consumer write its format/header + success = true; + } finally { + if (!success) { + IOUtils.closeSafely(true, out, indexOut); + } + } + this.indexOut = indexOut; + } + + protected void writeHeader(IndexOutput out) throws IOException { + CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); + out.writeLong(0); // leave space for end index pointer + } + + protected void writeIndexHeader(IndexOutput out) throws IOException { + CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); + out.writeLong(0); // leave space for end index pointer + } + + protected void writeTrailer(long dirStart) throws IOException { + out.seek(CodecUtil.headerLength(CODEC_NAME)); + out.writeLong(dirStart); + } + + protected void writeIndexTrailer(long dirStart) throws IOException { + indexOut.seek(CodecUtil.headerLength(CODEC_NAME)); + indexOut.writeLong(dirStart); + } + + @Override + public TermsConsumer addField(FieldInfo field) throws IOException { + //DEBUG = field.name.equals("id"); + //if (DEBUG) System.out.println("\nBTTW.addField seg=" + segment + " field=" + field.name); + assert currentField == null || currentField.name.compareTo(field.name) < 0; + currentField = field; + final TermsWriter terms = new TermsWriter(field); + fields.add(terms); + return terms; + } + + static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) { + assert fp < (1L << 62); + return (fp << 2) | (hasTerms ? OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? 
OUTPUT_FLAG_IS_FLOOR : 0); + } + + private static class PendingEntry { + public final boolean isTerm; + + protected PendingEntry(boolean isTerm) { + this.isTerm = isTerm; + } + } + + private static final class PendingTerm extends PendingEntry { + public final BytesRef term; + public final TermStats stats; + + public PendingTerm(BytesRef term, TermStats stats) { + super(true); + this.term = term; + this.stats = stats; + } + + @Override + public String toString() { + return term.utf8ToString(); + } + } + + private static final class PendingBlock extends PendingEntry { + public final BytesRef prefix; + public final long fp; + public FST index; + public List> subIndices; + public final boolean hasTerms; + public final boolean isFloor; + public final int floorLeadByte; + + public PendingBlock(BytesRef prefix, long fp, boolean hasTerms, boolean isFloor, int floorLeadByte, List> subIndices) { + super(false); + this.prefix = prefix; + this.fp = fp; + this.hasTerms = hasTerms; + this.isFloor = isFloor; + this.floorLeadByte = floorLeadByte; + this.subIndices = subIndices; + } + + @Override + public String toString() { + return "BLOCK: " + prefix.utf8ToString(); + } + + public void compileIndex(List floorBlocks, RAMOutputStream scratchBytes) throws IOException { + + assert (isFloor && floorBlocks != null && floorBlocks.size() != 0) || (!isFloor && floorBlocks == null): "isFloor=" + isFloor + " floorBlocks=" + floorBlocks; + + assert scratchBytes.getFilePointer() == 0; + + // TODO: try writing the leading vLong in MSB order + // (opposite of what Lucene does today), for better + // outputs sharing in the FST + scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor)); + if (isFloor) { + scratchBytes.writeVInt(floorBlocks.size()); + for (PendingBlock sub : floorBlocks) { + assert sub.floorLeadByte != -1; + //if (DEBUG) { + // System.out.println(" write floorLeadByte=" + Integer.toHexString(sub.floorLeadByte&0xff)); + //} + scratchBytes.writeByte((byte) sub.floorLeadByte); + assert sub.fp > fp; + scratchBytes.writeVLong((sub.fp - fp) << 1 | (sub.hasTerms ? 1 : 0)); + } + } + + final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); + final Builder indexBuilder = new Builder(FST.INPUT_TYPE.BYTE1, + 0, 0, true, false, Integer.MAX_VALUE, + outputs, null); + //if (DEBUG) { + // System.out.println(" compile index for prefix=" + prefix); + //} + //indexBuilder.DEBUG = false; + final byte[] bytes = new byte[(int) scratchBytes.getFilePointer()]; + assert bytes.length > 0; + scratchBytes.writeTo(bytes, 0); + indexBuilder.add(prefix, new BytesRef(bytes, 0, bytes.length)); + scratchBytes.reset(); + + // Copy over index for all sub-blocks + + if (subIndices != null) { + for(FST subIndex : subIndices) { + append(indexBuilder, subIndex); + } + } + + if (floorBlocks != null) { + for (PendingBlock sub : floorBlocks) { + if (sub.subIndices != null) { + for(FST subIndex : sub.subIndices) { + append(indexBuilder, subIndex); + } + } + sub.subIndices = null; + } + } + + index = indexBuilder.finish(); + subIndices = null; + + /* + Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); + Util.toDot(index, w, false, false); + System.out.println("SAVED to out.dot"); + w.close(); + */ + } + + // TODO: maybe we could add bulk-add method to + // Builder? Takes FST and unions it w/ current + // FST. 
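(To make the floor data written by compileIndex above easier to follow, here is a rough sketch of the per-floor-block entry encoding and its reader-side decode; the method names are illustrative, and the only layout assumed is what the shifts in this file already show: FP delta in the high bits, hasTerms in the low bit.)

  // One floor entry as compileIndex writes it: the lead suffix byte, then a
  // vLong holding the FP delta from the primary block shifted left by one with
  // hasTerms in the low bit. Frame.load()/loadNextFloorBlock reverse this by
  // adding (code >>> 1) back onto fpOrig.
  static long encodeFloorEntry(long subFP, long primaryFP, boolean hasTerms) {
    assert subFP > primaryFP;
    return ((subFP - primaryFP) << 1) | (hasTerms ? 1 : 0);
  }
  static long decodeFloorFP(long code, long primaryFP) {
    return primaryFP + (code >>> 1);
  }
  static boolean decodeFloorHasTerms(long code) {
    return (code & 1) != 0;
  }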
+ private void append(Builder builder, FST subIndex) throws IOException { + final BytesRefFSTEnum subIndexEnum = new BytesRefFSTEnum(subIndex); + BytesRefFSTEnum.InputOutput indexEnt; + while((indexEnt = subIndexEnum.next()) != null) { + //if (DEBUG) { + // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output); + //} + builder.add(indexEnt.input, indexEnt.output); + } + } + } + + final RAMOutputStream scratchBytes = new RAMOutputStream(); + + class TermsWriter extends TermsConsumer { + private final FieldInfo fieldInfo; + private long numTerms; + long sumTotalTermFreq; + long sumDocFreq; + long indexStartFP; + + // Used only to partition terms into the block tree; we + // don't pull an FST from this builder: + private final NoOutputs noOutputs; + private final Builder blockBuilder; + + // PendingTerm or PendingBlock: + private final List pending = new ArrayList(); + + // Index into pending of most recently written block + private int lastBlockIndex = -1; + + // Re-used when segmenting a too-large block into floor + // blocks: + private int[] subBytes = new int[10]; + private int[] subTermCounts = new int[10]; + private int[] subTermCountSums = new int[10]; + private int[] subSubCounts = new int[10]; + + // This class assigns terms to blocks "naturally", ie, + // according to the number of terms under a given prefix + // that we encounter: + private class FindBlocks extends Builder.FreezeTail { + + @Override + public void freeze(final Builder.UnCompiledNode[] frontier, int prefixLenPlus1, final IntsRef lastInput) throws IOException { + + //if (DEBUG) System.out.println(" freeze prefixLenPlus1=" + prefixLenPlus1); + + for(int idx=lastInput.length; idx >= prefixLenPlus1; idx--) { + final Builder.UnCompiledNode node = frontier[idx]; + + long totCount = 0; + + if (node.isFinal) { + totCount++; + } + + for(int arcIdx=0;arcIdx target = (Builder.UnCompiledNode) node.arcs[arcIdx].target; + totCount += target.inputCount; + target.clear(); + node.arcs[arcIdx].target = null; + } + node.numArcs = 0; + + if (totCount >= minItemsInBlock || idx == 0) { + // We are on a prefix node that has enough + // entries (terms or sub-blocks) under it to let + // us write a new block or multiple blocks (main + // block + follow on floor blocks): + //if (DEBUG) { + // if (totCount < minItemsInBlock && idx != 0) { + // System.out.println(" force block has terms"); + // } + //} + writeBlocks(lastInput, idx, (int) totCount); + node.inputCount = 1; + } else { + // stragglers! carry count upwards + node.inputCount = totCount; + } + frontier[idx] = new Builder.UnCompiledNode(blockBuilder, idx); + } + } + } + + // Write the top count entries on the pending stack as + // one or more blocks. Returns how many blocks were + // written. If the entry count is <= maxItemsPerBlock + // we just write a single block; else we break into + // primary (initial) block and then one or more + // following floor blocks: + + void writeBlocks(IntsRef prevTerm, int prefixLength, int count) throws IOException { + if (prefixLength == 0 || count <= maxItemsInBlock) { + // Easy case: not floor block. Eg, prefix is "foo", + // and we found 30 terms/sub-blocks starting w/ that + // prefix, and minItemsInBlock <= 30 <= + // maxItemsInBlock. + final PendingBlock nonFloorBlock = writeBlock(prevTerm, prefixLength, prefixLength, count, count, 0, false, -1, true); + nonFloorBlock.compileIndex(null, scratchBytes); + pending.add(nonFloorBlock); + } else { + // Floor block case. 
Eg, prefix is "foo" but we + // have 100 terms/sub-blocks starting w/ that + // prefix. We segment the entries into a primary + // block and following floor blocks using the first + // label in the suffix to assign to floor blocks. + + // TODO: we could store min & max suffix start byte + // in each block, to make floor blocks authoritative + + //if (DEBUG) { + // final BytesRef prefix = new BytesRef(prefixLength); + // for(int m=0;m= minItemsInBlock) { + final int curPrefixLength; + if (startLabel == -1) { + curPrefixLength = prefixLength; + } else { + curPrefixLength = 1+prefixLength; + // floor term: + prevTerm.ints[prevTerm.offset + prefixLength] = startLabel; + } + //System.out.println(" " + subCount + " subs"); + final PendingBlock floorBlock = writeBlock(prevTerm, prefixLength, curPrefixLength, curStart, pendingCount, subTermCountSums[1+sub], true, startLabel, curStart == pendingCount); + if (firstBlock == null) { + firstBlock = floorBlock; + } else { + floorBlocks.add(floorBlock); + } + curStart -= pendingCount; + //System.out.println(" = " + pendingCount); + pendingCount = 0; + + assert minItemsInBlock == 1 || subCount > 1: "minItemsInBlock=" + minItemsInBlock + " subCount=" + subCount + " sub=" + sub + " of " + numSubs + " subTermCount=" + subTermCountSums[sub] + " subSubCount=" + subSubCounts[sub] + " depth=" + prefixLength; + subCount = 0; + startLabel = subBytes[sub+1]; + + if (curStart == 0) { + break; + } + + if (curStart <= maxItemsInBlock) { + // remainder is small enough to fit into a + // block. NOTE that this may be too small (< + // minItemsInBlock); need a true segmenter + // here + assert startLabel != -1; + assert firstBlock != null; + prevTerm.ints[prevTerm.offset + prefixLength] = startLabel; + //System.out.println(" final " + (numSubs-sub-1) + " subs"); + /* + for(sub++;sub < numSubs;sub++) { + System.out.println(" " + (subTermCounts[sub] + subSubCounts[sub])); + } + System.out.println(" = " + curStart); + if (curStart < minItemsInBlock) { + System.out.println(" **"); + } + */ + floorBlocks.add(writeBlock(prevTerm, prefixLength, prefixLength+1, curStart, curStart, 0, true, startLabel, true)); + break; + } + } + } + + prevTerm.ints[prevTerm.offset + prefixLength] = savLabel; + + assert firstBlock != null; + firstBlock.compileIndex(floorBlocks, scratchBytes); + + pending.add(firstBlock); + //if (DEBUG) System.out.println(" done pending.size()=" + pending.size()); + } + lastBlockIndex = pending.size()-1; + } + + // for debugging + private String toString(BytesRef b) { + try { + return b.utf8ToString() + " " + b; + } catch (Throwable t) { + // If BytesRef isn't actually UTF8, or it's eg a + // prefix of UTF8 that ends mid-unicode-char, we + // fallback to hex: + return b.toString(); + } + } + + // Writes all entries in the pending slice as a single + // block: + private PendingBlock writeBlock(IntsRef prevTerm, int prefixLength, int indexPrefixLength, int startBackwards, int length, + int futureTermCount, boolean isFloor, int floorLeadByte, boolean isLastInFloor) throws IOException { + + assert length > 0; + + final int start = pending.size()-startBackwards; + + assert start >= 0: "pending.size()=" + pending.size() + " startBackwards=" + startBackwards + " length=" + length; + + final List slice = pending.subList(start, start + length); + + final long startFP = out.getFilePointer(); + + final BytesRef prefix = new BytesRef(indexPrefixLength); + for(int m=0;m(FST.INPUT_TYPE.BYTE1, + 0, 0, true, + true, Integer.MAX_VALUE, + noOutputs, + new FindBlocks()); + + 
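(The floor-block case described above segments the entries under a too-large prefix by the first byte of their suffixes; a much-simplified sketch of that grouping idea follows. It is hypothetical, ignores the sub-block counts, hasTerms tracking and maxItemsInBlock tail handling of the real writeBlocks, and requires java.util.List/ArrayList.)

  // Group sorted suffixes (prefix already stripped) into a primary block plus
  // floor blocks, starting a new block only on a lead-byte boundary once the
  // current block holds at least minItemsInBlock entries.
  static List<List<byte[]>> segmentByLeadByte(List<byte[]> sortedSuffixes, int minItemsInBlock) {
    final List<List<byte[]>> blocks = new ArrayList<List<byte[]>>();
    List<byte[]> current = new ArrayList<byte[]>();
    int lastLead = -2;                       // sentinel: no lead byte seen yet
    for (byte[] suffix : sortedSuffixes) {
      final int lead = suffix.length == 0 ? -1 : suffix[0] & 0xFF;
      if (lead != lastLead && current.size() >= minItemsInBlock) {
        blocks.add(current);                 // close the current (primary or floor) block
        current = new ArrayList<byte[]>();
      }
      current.add(suffix);
      lastLead = lead;
    }
    if (!current.isEmpty()) {
      blocks.add(current);
    }
    return blocks;
  }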
postingsWriter.setField(fieldInfo); + } + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public PostingsConsumer startTerm(BytesRef text) throws IOException { + //if (DEBUG) System.out.println("\nBTTW.startTerm term=" + fieldInfo.name + ":" + toString(text) + " seg=" + segment); + postingsWriter.startTerm(); + /* + if (fieldInfo.name.equals("id")) { + postingsWriter.termID = Integer.parseInt(text.utf8ToString()); + } else { + postingsWriter.termID = -1; + } + */ + return postingsWriter; + } + + @Override + public void finishTerm(BytesRef text, TermStats stats) throws IOException { + + assert stats.docFreq > 0; + //if (DEBUG) System.out.println("BTTW.finishTerm term=" + fieldInfo.name + ":" + toString(text) + " seg=" + segment + " df=" + stats.docFreq); + + blockBuilder.add(text, noOutputs.getNoOutput()); + pending.add(new PendingTerm(new BytesRef(text), stats)); + postingsWriter.finishTerm(stats); + numTerms++; + } + + // Finishes all terms in this field + @Override + public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException { + if (numTerms > 0) { + blockBuilder.finish(); + + // We better have one final "root" block: + assert pending.size() == 1 && !pending.get(0).isTerm: "pending.size()=" + pending.size() + " pending=" + pending; + final PendingBlock root = (PendingBlock) pending.get(0); + assert root.prefix.length == 0; + assert root.index.getEmptyOutput() != null; + + this.sumTotalTermFreq = sumTotalTermFreq; + this.sumDocFreq = sumDocFreq; + + // Write FST to index + indexStartFP = indexOut.getFilePointer(); + root.index.save(indexOut); + //System.out.println(" write FST " + indexStartFP + " field=" + fieldInfo.name); + + // if (SAVE_DOT_FILES || DEBUG) { + // final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; + // Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); + // Util.toDot(root.index, w, false, false); + // System.out.println("SAVED to " + dotFileName); + // w.close(); + // } + } + } + + private final RAMOutputStream bytesWriter = new RAMOutputStream(); + private final RAMOutputStream bytesWriter2 = new RAMOutputStream(); + } + + @Override + public void close() throws IOException { + + IOException ioe = null; + try { + + int nonZeroCount = 0; + for(TermsWriter field : fields) { + if (field.numTerms > 0) { + nonZeroCount++; + } + } + + final long dirStart = out.getFilePointer(); + final long indexDirStart = indexOut.getFilePointer(); + + out.writeVInt(nonZeroCount); + + for(TermsWriter field : fields) { + if (field.numTerms > 0) { + //System.out.println(" field " + field.fieldInfo.name + " " + field.numTerms + " terms"); + out.writeVInt(field.fieldInfo.number); + out.writeVLong(field.numTerms); + final BytesRef rootCode = ((PendingBlock) field.pending.get(0)).index.getEmptyOutput(); + assert rootCode != null: "field=" + field.fieldInfo.name + " numTerms=" + field.numTerms; + out.writeVInt(rootCode.length); + out.writeBytes(rootCode.bytes, rootCode.offset, rootCode.length); + if (field.fieldInfo.indexOptions != IndexOptions.DOCS_ONLY) { + out.writeVLong(field.sumTotalTermFreq); + } + out.writeVLong(field.sumDocFreq); + indexOut.writeVLong(field.indexStartFP); + } + } + writeTrailer(dirStart); + writeIndexTrailer(indexDirStart); + } catch (IOException ioe2) { + ioe = ioe2; + } finally { + IOUtils.closeSafely(ioe, out, indexOut, postingsWriter); + } + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java 
b/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java index 4236120563d..f4cc7e37bb3 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java @@ -84,7 +84,7 @@ public class CodecProvider { public synchronized Codec lookup(String name) { final Codec codec = codecs.get(name); if (codec == null) { - throw new IllegalArgumentException("required codec '" + name + "' not found"); + throw new IllegalArgumentException("required codec '" + name + "' not found; known codecs: " + codecs.keySet()); } return codec; } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java b/lucene/src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java index ffdfdc38616..05aec946cd4 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java @@ -26,9 +26,8 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Bits; -import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; // javadocs - -/** BlockTermsReader interacts with a single instance +/** The core terms dictionaries (BlockTermsReader, + * BlockTreeTermsReader) interact with a single instance * of this class to manage creation of {@link DocsEnum} and * {@link DocsAndPositionsEnum} instances. It provides an * IndexInput (termsIn) where this class may read any @@ -49,11 +48,11 @@ public abstract class PostingsReaderBase implements Closeable { /** Must fully consume state, since after this call that * TermState may be reused. */ - public abstract DocsEnum docs(FieldInfo fieldInfo, BlockTermState state, Bits liveDocs, DocsEnum reuse) throws IOException; + public abstract DocsEnum docs(FieldInfo fieldInfo, BlockTermState state, Bits skipDocs, DocsEnum reuse) throws IOException; /** Must fully consume state, since after this call that * TermState may be reused. */ - public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState state, Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException; + public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState state, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException; public abstract void close() throws IOException; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java b/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java index de59a46525a..5e6ea14b98d 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java @@ -33,7 +33,11 @@ public abstract class PostingsWriterBase extends PostingsConsumer implements Clo public abstract void startTerm() throws IOException; - public abstract void flushTermsBlock() throws IOException; + /** Flush count terms starting at start "backwards", as a + * block. start is a negative offset from the end of the + * terms stack, ie bigger start means further back in + * the stack. 
*/ + public abstract void flushTermsBlock(int start, int count) throws IOException; /** Finishes the current term */ public abstract void finishTerm(TermStats stats) throws IOException; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java b/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java index ffbce61c493..436190871f2 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java @@ -68,10 +68,14 @@ public abstract class FixedIntBlockIndexOutput extends IntIndexOutput { } @Override - public void set(IntIndexOutput.Index other) throws IOException { + public void copyFrom(IntIndexOutput.Index other, boolean copyLast) throws IOException { Index idx = (Index) other; - lastFP = fp = idx.fp; - lastUpto = upto = idx.upto; + fp = idx.fp; + upto = idx.upto; + if (copyLast) { + lastFP = fp; + lastUpto = upto; + } } @Override diff --git a/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexOutput.java b/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexOutput.java index 46112404636..da48ad19b47 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexOutput.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexOutput.java @@ -77,10 +77,14 @@ public abstract class VariableIntBlockIndexOutput extends IntIndexOutput { } @Override - public void set(IntIndexOutput.Index other) throws IOException { + public void copyFrom(IntIndexOutput.Index other, boolean copyLast) throws IOException { Index idx = (Index) other; - lastFP = fp = idx.fp; - lastUpto = upto = idx.upto; + fp = idx.fp; + upto = idx.upto; + if (copyLast) { + lastFP = fp; + lastUpto = upto; + } } @Override diff --git a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java index 3c05f495914..169804cc680 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java @@ -22,28 +22,23 @@ import java.util.Set; import org.apache.lucene.index.PerDocWriteState; import org.apache.lucene.index.SegmentInfo; -import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.codecs.Codec; -import org.apache.lucene.index.codecs.PostingsWriterBase; -import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; +import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.codecs.PostingsReaderBase; -import org.apache.lucene.index.codecs.standard.StandardPostingsReader; +import org.apache.lucene.index.codecs.PostingsWriterBase; +import org.apache.lucene.index.codecs.BlockTreeTermsReader; +import org.apache.lucene.index.codecs.BlockTreeTermsWriter; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.DefaultDocValuesConsumer; import org.apache.lucene.index.codecs.DefaultDocValuesProducer; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsProducer; import org.apache.lucene.index.codecs.PerDocConsumer; -import org.apache.lucene.index.codecs.DefaultDocValuesConsumer; import org.apache.lucene.index.codecs.PerDocValues; -import 
org.apache.lucene.index.codecs.VariableGapTermsIndexReader; -import org.apache.lucene.index.codecs.VariableGapTermsIndexWriter; -import org.apache.lucene.index.codecs.BlockTermsReader; -import org.apache.lucene.index.codecs.BlockTermsWriter; -import org.apache.lucene.index.codecs.TermsIndexReaderBase; -import org.apache.lucene.index.codecs.TermsIndexWriterBase; import org.apache.lucene.index.codecs.standard.StandardCodec; +import org.apache.lucene.index.codecs.standard.StandardPostingsReader; +import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; import org.apache.lucene.store.Directory; -import org.apache.lucene.util.IOUtils; /** This codec "inlines" the postings for terms that have * low docFreq. It wraps another codec, which is used for @@ -56,64 +51,52 @@ import org.apache.lucene.util.IOUtils; public class PulsingCodec extends Codec { private final int freqCutoff; + private final int minBlockSize; + private final int maxBlockSize; - /** - * Creates a {@link PulsingCodec} with freqCutoff = 1 - * - * @see PulsingCodec#PulsingCodec(int) - */ public PulsingCodec() { this(1); } - /** @lucene.internal */ - public int getFreqCutoff() { - return freqCutoff; + public PulsingCodec(int freqCutoff) { + this(freqCutoff, BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); } /** Terms with freq <= freqCutoff are inlined into terms * dict. */ - public PulsingCodec(int freqCutoff) { + public PulsingCodec(int freqCutoff, int minBlockSize, int maxBlockSize) { super("Pulsing"); this.freqCutoff = freqCutoff; + this.minBlockSize = minBlockSize; + assert minBlockSize > 1; + this.maxBlockSize = maxBlockSize; } @Override public String toString() { - return name + "(freqCutoff=" + freqCutoff + ")"; + return name + "(freqCutoff=" + freqCutoff + " minBlockSize=" + minBlockSize + " maxBlockSize=" + maxBlockSize + ")"; } @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - // We wrap StandardPostingsWriter, but any StandardPostingsWriter + // We wrap StandardPostingsWriter, but any PostingsWriterBase // will work: + PostingsWriterBase docsWriter = new StandardPostingsWriter(state); // Terms that have <= freqCutoff number of docs are // "pulsed" (inlined): - PostingsWriterBase pulsingWriter = new PulsingPostingsWriterImpl(freqCutoff, docsWriter); - - // Terms dict index - TermsIndexWriterBase indexWriter; - boolean success = false; - try { - indexWriter = new VariableGapTermsIndexWriter(state, new VariableGapTermsIndexWriter.EveryNTermSelector(state.termIndexInterval)); - success = true; - } finally { - if (!success) { - IOUtils.closeSafely(true, pulsingWriter); - } - } + PostingsWriterBase pulsingWriter = new PulsingPostingsWriter(freqCutoff, docsWriter); // Terms dict - success = false; + boolean success = false; try { - FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, pulsingWriter); + FieldsConsumer ret = new BlockTreeTermsWriter(state, pulsingWriter, minBlockSize, maxBlockSize); success = true; return ret; } finally { if (!success) { - IOUtils.closeSafely(true, pulsingWriter, indexWriter); + pulsingWriter.close(); } } } @@ -124,53 +107,34 @@ public class PulsingCodec extends Codec { // We wrap StandardPostingsReader, but any StandardPostingsReader // will work: PostingsReaderBase docsReader = new StandardPostingsReader(state.dir, state.segmentInfo, state.context, state.codecId); - PostingsReaderBase pulsingReader = new PulsingPostingsReaderImpl(docsReader); - - // Terms dict index reader - 
TermsIndexReaderBase indexReader; + PostingsReaderBase pulsingReader = new PulsingPostingsReader(docsReader); boolean success = false; try { - indexReader = new VariableGapTermsIndexReader(state.dir, - state.fieldInfos, - state.segmentInfo.name, - state.termsIndexDivisor, - state.codecId, state.context); + FieldsProducer ret = new BlockTreeTermsReader( + state.dir, state.fieldInfos, state.segmentInfo.name, + pulsingReader, + state.context, + state.codecId, + state.termsIndexDivisor); success = true; + return ret; } finally { if (!success) { pulsingReader.close(); } } + } - // Terms dict reader - success = false; - try { - FieldsProducer ret = new BlockTermsReader(indexReader, - state.dir, state.fieldInfos, state.segmentInfo.name, - pulsingReader, - state.context, - StandardCodec.TERMS_CACHE_SIZE, - state.codecId); - success = true; - return ret; - } finally { - if (!success) { - try { - pulsingReader.close(); - } finally { - indexReader.close(); - } - } - } + public int getFreqCutoff() { + return freqCutoff; } @Override - public void files(Directory dir, SegmentInfo segmentInfo, int id, Set files) throws IOException { - StandardPostingsReader.files(dir, segmentInfo, id, files); - BlockTermsReader.files(dir, segmentInfo, id, files); - VariableGapTermsIndexReader.files(dir, segmentInfo, id, files); - DefaultDocValuesConsumer.files(dir, segmentInfo, id, files, getDocValuesUseCFS()); + public void files(Directory dir, SegmentInfo segmentInfo, int codecID, Set files) throws IOException { + StandardPostingsReader.files(dir, segmentInfo, codecID, files); + BlockTreeTermsReader.files(dir, segmentInfo, codecID, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, codecID, files, getDocValuesUseCFS()); } @Override @@ -178,7 +142,7 @@ public class PulsingCodec extends Codec { StandardCodec.getStandardExtensions(extensions); DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS()); } - + @Override public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS()); diff --git a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReader.java similarity index 86% rename from lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java rename to lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReader.java index c76b4cefe19..aefad1077b1 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReader.java @@ -41,20 +41,20 @@ import org.apache.lucene.util.CodecUtil; // create two separate docs readers, one that also reads // prox and one that doesn't? 
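// Editor's sketch (not part of the patch): a hypothetical construction of the reworked
// Pulsing codec described above, which now sits directly on the BlockTree terms dictionary
// and takes the min/max block sizes in its constructor.  Registering the codec and wiring
// it into an IndexWriter are omitted here.
import org.apache.lucene.index.codecs.BlockTreeTermsWriter;
import org.apache.lucene.index.codecs.pulsing.PulsingCodec;

public class PulsingCodecSketch {
  public static void main(String[] args) {
    // Terms with docFreq <= 1 get their postings inlined into the terms dict; the block
    // sizes reuse the BlockTree defaults that the one-arg PulsingCodec constructor also uses.
    PulsingCodec codec = new PulsingCodec(1,
                                          BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
                                          BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
    System.out.println(codec);  // Pulsing(freqCutoff=1 minBlockSize=... maxBlockSize=...)
  }
}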
-public class PulsingPostingsReaderImpl extends PostingsReaderBase { +public class PulsingPostingsReader extends PostingsReaderBase { // Fallback reader for non-pulsed terms: final PostingsReaderBase wrappedPostingsReader; int maxPositions; - public PulsingPostingsReaderImpl(PostingsReaderBase wrappedPostingsReader) throws IOException { + public PulsingPostingsReader(PostingsReaderBase wrappedPostingsReader) throws IOException { this.wrappedPostingsReader = wrappedPostingsReader; } @Override public void init(IndexInput termsIn) throws IOException { - CodecUtil.checkHeader(termsIn, PulsingPostingsWriterImpl.CODEC, - PulsingPostingsWriterImpl.VERSION_START, PulsingPostingsWriterImpl.VERSION_START); + CodecUtil.checkHeader(termsIn, PulsingPostingsWriter.CODEC, + PulsingPostingsWriter.VERSION_START, PulsingPostingsWriter.VERSION_START); maxPositions = termsIn.readVInt(); wrappedPostingsReader.init(termsIn); } @@ -69,8 +69,15 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { @Override public Object clone() { - PulsingTermState clone = new PulsingTermState(); - clone.copyFrom(this); + PulsingTermState clone; + clone = (PulsingTermState) super.clone(); + if (postingsSize != -1) { + clone.postings = new byte[postingsSize]; + System.arraycopy(postings, 0, clone.postings, 0, postingsSize); + } else { + assert wrappedTermState != null; + clone.wrappedTermState = (BlockTermState) wrappedTermState.clone(); + } return clone; } @@ -84,10 +91,8 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { postings = new byte[ArrayUtil.oversize(other.postingsSize, 1)]; } System.arraycopy(other.postings, 0, postings, 0, other.postingsSize); - } else if (wrappedTermState != null) { - wrappedTermState.copyFrom(other.wrappedTermState); } else { - wrappedTermState = (BlockTermState) other.wrappedTermState.clone(); + wrappedTermState.copyFrom(other.wrappedTermState); } // NOTE: we do not copy the @@ -108,18 +113,20 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { @Override public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException { + //System.out.println("PR.readTermsBlock state=" + _termState); final PulsingTermState termState = (PulsingTermState) _termState; if (termState.inlinedBytes == null) { termState.inlinedBytes = new byte[128]; termState.inlinedBytesReader = new ByteArrayDataInput(); } int len = termsIn.readVInt(); + //System.out.println(" len=" + len + " fp=" + termsIn.getFilePointer()); if (termState.inlinedBytes.length < len) { termState.inlinedBytes = new byte[ArrayUtil.oversize(len, 1)]; } termsIn.readBytes(termState.inlinedBytes, 0, len); termState.inlinedBytesReader.reset(termState.inlinedBytes); - termState.wrappedTermState.termCount = 0; + termState.wrappedTermState.termBlockOrd = 0; wrappedPostingsReader.readTermsBlock(termsIn, fieldInfo, termState.wrappedTermState); } @@ -140,7 +147,6 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { //System.out.println(" count=" + count + " threshold=" + maxPositions); if (count <= maxPositions) { - //System.out.println(" inlined pos=" + termState.inlinedBytesReader.getPosition()); // Inlined into terms dict -- just read the byte[] blob in, // but don't decode it now (we only decode when a DocsEnum @@ -154,6 +160,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { // current term block) into another byte[] (just the // blob for this term)... 
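// Editor's sketch (not part of the patch): the inline-vs-delegate decision nextTerm() makes,
// stated on its own.  The exact "count" computation is outside this hunk; per the writer's
// javadoc it is assumed to be the total number of positions across all docs for the term,
// falling back to docFreq when positions are not indexed.
static boolean isPulsed(long totalPositions, int docFreq, boolean hasPositions, int maxPositions) {
  final long count = hasPositions ? totalPositions : docFreq;
  // <= maxPositions: the postings blob was inlined into the terms dict and is decoded lazily;
  // otherwise the wrapped (on-disk) postings reader is consulted instead.
  return count <= maxPositions;
}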
termState.inlinedBytesReader.readBytes(termState.postings, 0, termState.postingsSize); + //System.out.println(" inlined bytes=" + termState.postingsSize); } else { //System.out.println(" not inlined"); termState.postingsSize = -1; @@ -161,7 +168,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { termState.wrappedTermState.docFreq = termState.docFreq; termState.wrappedTermState.totalTermFreq = termState.totalTermFreq; wrappedPostingsReader.nextTerm(fieldInfo, termState.wrappedTermState); - termState.wrappedTermState.termCount++; + termState.wrappedTermState.termBlockOrd++; } } @@ -223,6 +230,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { } private static class PulsingDocsEnum extends DocsEnum { + private byte[] postingsBytes; private final ByteArrayDataInput postings = new ByteArrayDataInput(); private final IndexOptions indexOptions; private final boolean storePayloads; @@ -239,9 +247,16 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { public PulsingDocsEnum reset(Bits liveDocs, PulsingTermState termState) { //System.out.println("PR docsEnum termState=" + termState + " docFreq=" + termState.docFreq); assert termState.postingsSize != -1; - final byte[] bytes = new byte[termState.postingsSize]; - System.arraycopy(termState.postings, 0, bytes, 0, termState.postingsSize); - postings.reset(bytes); + + // Must make a copy of termState's byte[] so that if + // app does TermsEnum.next(), this DocsEnum is not affected + if (postingsBytes == null) { + postingsBytes = new byte[termState.postingsSize]; + } else if (postingsBytes.length < termState.postingsSize) { + postingsBytes = ArrayUtil.grow(postingsBytes, termState.postingsSize); + } + System.arraycopy(termState.postings, 0, postingsBytes, 0, termState.postingsSize); + postings.reset(postingsBytes, 0, termState.postingsSize); docID = 0; payloadLength = 0; freq = 1; @@ -263,6 +278,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { } final int code = postings.readVInt(); + //System.out.println(" read code=" + code); if (indexOptions == IndexOptions.DOCS_ONLY) { docID += code; } else { @@ -295,7 +311,6 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { } if (liveDocs == null || liveDocs.get(docID)) { - //System.out.println(" return docID=" + docID + " freq=" + freq); return docID; } } @@ -323,6 +338,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { } private static class PulsingDocsAndPositionsEnum extends DocsAndPositionsEnum { + private byte[] postingsBytes; private final ByteArrayDataInput postings = new ByteArrayDataInput(); private final boolean storePayloads; @@ -346,9 +362,13 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { public PulsingDocsAndPositionsEnum reset(Bits liveDocs, PulsingTermState termState) { assert termState.postingsSize != -1; - final byte[] bytes = new byte[termState.postingsSize]; - System.arraycopy(termState.postings, 0, bytes, 0, termState.postingsSize); - postings.reset(bytes); + if (postingsBytes == null) { + postingsBytes = new byte[termState.postingsSize]; + } else if (postingsBytes.length < termState.postingsSize) { + postingsBytes = ArrayUtil.grow(postingsBytes, termState.postingsSize); + } + System.arraycopy(termState.postings, 0, postingsBytes, 0, termState.postingsSize); + postings.reset(postingsBytes, 0, termState.postingsSize); this.liveDocs = liveDocs; payloadLength = 0; posPending = 0; @@ -359,7 +379,7 @@ public class PulsingPostingsReaderImpl extends 
PostingsReaderBase { @Override public int nextDoc() throws IOException { - //System.out.println("PR.nextDoc this=" + this); + //System.out.println("PR d&p nextDoc this=" + this); while(true) { //System.out.println(" cycle skip posPending=" + posPending); @@ -367,16 +387,15 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { skipPositions(); if (postings.eof()) { - //System.out.println(" END"); + //System.out.println("PR END"); return docID = NO_MORE_DOCS; } - //System.out.println(" read doc code"); + final int code = postings.readVInt(); docID += code >>> 1; // shift off low bit if ((code & 1) != 0) { // if low bit is set freq = 1; // freq is one } else { - //System.out.println(" read freq"); freq = postings.readVInt(); // else read freq } posPending = freq; @@ -401,10 +420,8 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { @Override public int advance(int target) throws IOException { - //System.out.println("PR.advance target=" + target); int doc; while((doc=nextDoc()) != NO_MORE_DOCS) { - //System.out.println(" nextDoc got doc=" + doc); if (doc >= target) { return docID = doc; } @@ -414,7 +431,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { @Override public int nextPosition() throws IOException { - //System.out.println("PR.nextPosition posPending=" + posPending + " vs freq=" + freq); + //System.out.println("PR d&p nextPosition posPending=" + posPending + " vs freq=" + freq); assert posPending > 0; posPending--; @@ -424,7 +441,6 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { //System.out.println("PR skip payload=" + payloadLength); postings.skipBytes(payloadLength); } - //System.out.println(" read pos code"); final int code = postings.readVInt(); //System.out.println("PR code=" + code); if ((code & 1) != 0) { @@ -437,17 +453,16 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { position += postings.readVInt(); } - //System.out.println(" return pos=" + position + " hasPayload=" + !payloadRetrieved + " posPending=" + posPending + " this=" + this); + //System.out.println("PR d&p nextPos return pos=" + position + " this=" + this); return position; } private void skipPositions() throws IOException { - //System.out.println("PR.skipPositions: posPending=" + posPending); while(posPending != 0) { nextPosition(); } if (storePayloads && !payloadRetrieved) { - //System.out.println(" skip last payload len=" + payloadLength); + //System.out.println(" skip payload len=" + payloadLength); postings.skipBytes(payloadLength); payloadRetrieved = true; } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriter.java similarity index 71% rename from lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java rename to lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriter.java index 8fb3fe95021..c61fd2e1ed6 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriter.java @@ -18,6 +18,8 @@ package org.apache.lucene.index.codecs.pulsing; */ import java.io.IOException; +import java.util.List; +import java.util.ArrayList; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo.IndexOptions; @@ -35,9 +37,9 @@ import org.apache.lucene.util.CodecUtil; // presumably rare in practice... 
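// Editor's sketch (not part of the patch): the doc/freq framing that the pulsed enums above
// decode, shown for the DOCS_AND_FREQS case.  Each entry starts with a vInt whose low bit
// flags freq == 1; otherwise a second vInt carries the frequency.  Plain int arrays stand in
// for the vInt-encoded byte[] blob here.
static int[] encodeDocEntry(int docDelta, int freq) {
  return freq == 1
      ? new int[] { (docDelta << 1) | 1 }      // low bit set: freq is implicitly one
      : new int[] { docDelta << 1, freq };     // low bit clear: freq follows explicitly
}

static int[] decodeDocEntry(int[] entry, int prevDocID) {
  final int code = entry[0];
  final int docID = prevDocID + (code >>> 1);  // shift off the freq-is-one bit
  final int freq = (code & 1) != 0 ? 1 : entry[1];
  return new int[] { docID, freq };
}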
/** @lucene.experimental */ -public final class PulsingPostingsWriterImpl extends PostingsWriterBase { +public final class PulsingPostingsWriter extends PostingsWriterBase { - final static String CODEC = "PulsedPostings"; + final static String CODEC = "PulsedPostingsWriter"; // To add a new version, increment from the last one, and // change VERSION_CURRENT to point to your new version: @@ -50,6 +52,15 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase { private IndexOptions indexOptions; private boolean storePayloads; + private static class PendingTerm { + private final byte[] bytes; + public PendingTerm(byte[] bytes) { + this.bytes = bytes; + } + } + + private final List pendingTerms = new ArrayList(); + // one entry per position private final Position[] pending; private int pendingCount = 0; // -1 once we've hit too many positions @@ -71,7 +82,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase { /** If the total number of positions (summed across all docs * for this term) is <= maxPositions, then the postings are * inlined into terms dict */ - public PulsingPostingsWriterImpl(int maxPositions, PostingsWriterBase wrappedPostingsWriter) throws IOException { + public PulsingPostingsWriter(int maxPositions, PostingsWriterBase wrappedPostingsWriter) throws IOException { pending = new Position[maxPositions]; for(int i=0;i files) throws IOException { - files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, SepPostingsWriterImpl.DOC_EXTENSION)); - files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, SepPostingsWriterImpl.SKIP_EXTENSION)); + files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, SepPostingsWriter.DOC_EXTENSION)); + files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, SepPostingsWriter.SKIP_EXTENSION)); if (segmentInfo.getFieldInfos().hasFreq()) { - files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, SepPostingsWriterImpl.FREQ_EXTENSION)); + files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, SepPostingsWriter.FREQ_EXTENSION)); } if (segmentInfo.getHasProx()) { - files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, SepPostingsWriterImpl.POS_EXTENSION)); - files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, SepPostingsWriterImpl.PAYLOAD_EXTENSION)); + files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, SepPostingsWriter.POS_EXTENSION)); + files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, SepPostingsWriter.PAYLOAD_EXTENSION)); } } @Override public void init(IndexInput termsIn) throws IOException { // Make sure we are talking to the matching past writer - CodecUtil.checkHeader(termsIn, SepPostingsWriterImpl.CODEC, - SepPostingsWriterImpl.VERSION_START, SepPostingsWriterImpl.VERSION_START); + CodecUtil.checkHeader(termsIn, SepPostingsWriter.CODEC, + SepPostingsWriter.VERSION_START, SepPostingsWriter.VERSION_START); skipInterval = termsIn.readInt(); maxSkipLevels = termsIn.readInt(); skipMinimum = termsIn.readInt(); @@ -153,6 +153,10 @@ public class SepPostingsReaderImpl extends PostingsReaderBase { // Only used for "primary" term state; these are never // copied on clone: + + // TODO: these should somehow be stored per-TermsEnum + // not per TermState; maybe somehow the terms dict + // should load/manage the byte[]/DataReader for us? 
byte[] bytes; ByteArrayDataInput bytesReader; @@ -216,8 +220,9 @@ public class SepPostingsReaderImpl extends PostingsReaderBase { @Override public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException { final SepTermState termState = (SepTermState) _termState; + //System.out.println("SEPR: readTermsBlock termsIn.fp=" + termsIn.getFilePointer()); final int len = termsIn.readVInt(); - //System.out.println("SepR.readTermsBlock len=" + len); + //System.out.println(" numBytes=" + len); if (termState.bytes == null) { termState.bytes = new byte[ArrayUtil.oversize(len, 1)]; termState.bytesReader = new ByteArrayDataInput(termState.bytes); @@ -231,30 +236,30 @@ public class SepPostingsReaderImpl extends PostingsReaderBase { @Override public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState) throws IOException { final SepTermState termState = (SepTermState) _termState; - //System.out.println("SepR.nextTerm termCount=" + termState.termCount); + final boolean isFirstTerm = termState.termBlockOrd == 0; + //System.out.println("SEPR.nextTerm termCount=" + termState.termBlockOrd + " isFirstTerm=" + isFirstTerm + " bytesReader.pos=" + termState.bytesReader.getPosition()); //System.out.println(" docFreq=" + termState.docFreq); - final boolean isFirstTerm = termState.termCount == 0; termState.docIndex.read(termState.bytesReader, isFirstTerm); //System.out.println(" docIndex=" + termState.docIndex); if (fieldInfo.indexOptions != IndexOptions.DOCS_ONLY) { termState.freqIndex.read(termState.bytesReader, isFirstTerm); - } - - if (fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { - //System.out.println(" freqIndex=" + termState.freqIndex); - termState.posIndex.read(termState.bytesReader, isFirstTerm); - //System.out.println(" posIndex=" + termState.posIndex); - if (fieldInfo.storePayloads) { - if (isFirstTerm) { - termState.payloadFP = termState.bytesReader.readVLong(); - } else { - termState.payloadFP += termState.bytesReader.readVLong(); + if (fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + //System.out.println(" freqIndex=" + termState.freqIndex); + termState.posIndex.read(termState.bytesReader, isFirstTerm); + //System.out.println(" posIndex=" + termState.posIndex); + if (fieldInfo.storePayloads) { + if (isFirstTerm) { + termState.payloadFP = termState.bytesReader.readVLong(); + } else { + termState.payloadFP += termState.bytesReader.readVLong(); + } + //System.out.println(" payloadFP=" + termState.payloadFP); } - //System.out.println(" payloadFP=" + termState.payloadFP); } } + if (termState.docFreq >= skipMinimum) { - //System.out.println(" readSkip @ " + termState.bytesReader.pos); + //System.out.println(" readSkip @ " + termState.bytesReader.getPosition()); if (isFirstTerm) { termState.skipFP = termState.bytesReader.readVLong(); } else { @@ -538,7 +543,7 @@ public class SepPostingsReaderImpl extends PostingsReaderBase { freqIndex = freqIn.index(); posReader = posIn.reader(); posIndex = posIn.index(); - payloadIn = (IndexInput) SepPostingsReaderImpl.this.payloadIn.clone(); + payloadIn = (IndexInput) SepPostingsReader.this.payloadIn.clone(); } SepDocsAndPositionsEnum init(FieldInfo fieldInfo, SepTermState termState, Bits liveDocs) throws IOException { @@ -656,6 +661,7 @@ public class SepPostingsReaderImpl extends PostingsReaderBase { // Skipper did move skipper.getFreqIndex().seek(freqReader); skipper.getDocIndex().seek(docReader); + //System.out.println(" doc seek'd to " + skipper.getDocIndex()); // 
NOTE: don't seek pos here; do it lazily // instead. Eg a PhraseQuery may skip to many // docs before finally asking for positions... diff --git a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriter.java similarity index 68% rename from lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java rename to lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriter.java index 4aa91c945ac..4b6ee5f240c 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriter.java @@ -18,6 +18,8 @@ package org.apache.lucene.index.codecs.sep; */ import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import java.util.Set; import org.apache.lucene.index.CorruptIndexException; @@ -38,8 +40,8 @@ import org.apache.lucene.util.IOUtils; * to .pyl, skip data to .skp * * @lucene.experimental */ -public final class SepPostingsWriterImpl extends PostingsWriterBase { - final static String CODEC = "SepDocFreqSkip"; +public final class SepPostingsWriter extends PostingsWriterBase { + final static String CODEC = "SepPostingsWriter"; final static String DOC_EXTENSION = "doc"; final static String SKIP_EXTENSION = "skp"; @@ -89,26 +91,22 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase { boolean storePayloads; IndexOptions indexOptions; - long lastSkipFP; - FieldInfo fieldInfo; int lastPayloadLength; int lastPosition; long payloadStart; - long lastPayloadStart; int lastDocID; int df; - private int pendingTermCount; // Holds pending byte[] blob for the current terms block private final RAMOutputStream indexBytesWriter = new RAMOutputStream(); - public SepPostingsWriterImpl(SegmentWriteState state, IntStreamFactory factory) throws IOException { + public SepPostingsWriter(SegmentWriteState state, IntStreamFactory factory) throws IOException { this(state, factory, DEFAULT_SKIP_INTERVAL); } - public SepPostingsWriterImpl(SegmentWriteState state, IntStreamFactory factory, int skipInterval) throws IOException { + public SepPostingsWriter(SegmentWriteState state, IntStreamFactory factory, int skipInterval) throws IOException { freqOut = null; freqIndex = null; posOut = null; @@ -171,6 +169,7 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase { @Override public void startTerm() throws IOException { docIndex.mark(); + //System.out.println("SEPW: startTerm docIndex=" + docIndex); if (indexOptions != IndexOptions.DOCS_ONLY) { freqIndex.mark(); @@ -201,7 +200,7 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase { public void startDoc(int docID, int termDocFreq) throws IOException { final int delta = docID - lastDocID; - //System.out.println("SepW startDoc: write doc=" + docID + " delta=" + delta); + //System.out.println("SEPW: startDoc: write doc=" + docID + " delta=" + delta + " out.fp=" + docOut); if (docID < 0 || (df > 0 && delta <= 0)) { throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )"); @@ -223,15 +222,6 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase { } } - @Override - public void flushTermsBlock() throws IOException { - //System.out.println("SepW.flushTermsBlock: pendingTermCount=" + pendingTermCount + " bytesUsed=" + indexBytesWriter.getFilePointer()); - termsOut.writeVLong((int) indexBytesWriter.getFilePointer()); - indexBytesWriter.writeTo(termsOut); - 
indexBytesWriter.reset(); - pendingTermCount = 0; - } - /** Add a new position & payload */ @Override public void addPosition(int position, BytesRef payload) throws IOException { @@ -269,6 +259,24 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase { lastPosition = 0; } + private static class PendingTerm { + public final IntIndexOutput.Index docIndex; + public final IntIndexOutput.Index freqIndex; + public final IntIndexOutput.Index posIndex; + public final long payloadFP; + public final long skipFP; + + public PendingTerm(IntIndexOutput.Index docIndex, IntIndexOutput.Index freqIndex, IntIndexOutput.Index posIndex, long payloadFP, long skipFP) { + this.docIndex = docIndex; + this.freqIndex = freqIndex; + this.posIndex = posIndex; + this.payloadFP = payloadFP; + this.skipFP = skipFP; + } + } + + private final List pendingTerms = new ArrayList(); + /** Called when we are done adding docs to this term */ @Override public void finishTerm(TermStats stats) throws IOException { @@ -276,50 +284,107 @@ public final class SepPostingsWriterImpl extends PostingsWriterBase { assert stats.docFreq > 0; assert stats.docFreq == df; - final boolean isFirstTerm = pendingTermCount == 0; - //System.out.println("SepW.finishTerm: isFirstTerm=" + isFirstTerm); - - docIndex.write(indexBytesWriter, isFirstTerm); - //System.out.println(" docIndex=" + docIndex); + final IntIndexOutput.Index docIndexCopy = docOut.index(); + docIndexCopy.copyFrom(docIndex, false); + final IntIndexOutput.Index freqIndexCopy; + final IntIndexOutput.Index posIndexCopy; if (indexOptions != IndexOptions.DOCS_ONLY) { - freqIndex.write(indexBytesWriter, isFirstTerm); - //System.out.println(" freqIndex=" + freqIndex); - } - - if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { - posIndex.write(indexBytesWriter, isFirstTerm); - //System.out.println(" posIndex=" + posIndex); - if (storePayloads) { - if (isFirstTerm) { - indexBytesWriter.writeVLong(payloadStart); - } else { - indexBytesWriter.writeVLong(payloadStart - lastPayloadStart); - } - lastPayloadStart = payloadStart; - //System.out.println(" payloadFP=" + payloadStart); - } - } - - if (df >= skipMinimum) { - //System.out.println(" skipFP=" + skipStart); - final long skipFP = skipOut.getFilePointer(); - skipListWriter.writeSkip(skipOut); - //System.out.println(" writeSkip @ " + indexBytesWriter.getFilePointer()); - if (isFirstTerm) { - indexBytesWriter.writeVLong(skipFP); + freqIndexCopy = freqOut.index(); + freqIndexCopy.copyFrom(freqIndex, false); + if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + posIndexCopy = posOut.index(); + posIndexCopy.copyFrom(posIndex, false); } else { - indexBytesWriter.writeVLong(skipFP - lastSkipFP); + posIndexCopy = null; } - lastSkipFP = skipFP; - } else if (isFirstTerm) { - // lazily write an absolute delta if a term in this block requires skip data. 
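// Editor's sketch (not part of the patch): the copyFrom(other, copyLast) contract that the
// Fixed/VariableIntBlockIndexOutput.Index changes above introduce and that the pending-term
// snapshots in finishTerm rely on.  Copying without the "last" markers preserves the old
// baseline, so a later delta-coded write is still relative to what was last written out.
static class IndexPosition {
  long fp, lastFP;
  int upto, lastUpto;

  void copyFrom(IndexPosition other, boolean copyLast) {
    fp = other.fp;
    upto = other.upto;
    if (copyLast) {        // only sync the delta baseline when explicitly requested
      lastFP = fp;
      lastUpto = upto;
    }
  }
}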
- lastSkipFP = 0; + } else { + freqIndexCopy = null; + posIndexCopy = null; + } + + final long skipFP; + if (df >= skipMinimum) { + skipFP = skipOut.getFilePointer(); + //System.out.println(" skipFP=" + skipFP); + skipListWriter.writeSkip(skipOut); + //System.out.println(" numBytes=" + (skipOut.getFilePointer()-skipFP)); + } else { + skipFP = -1; } lastDocID = 0; df = 0; - pendingTermCount++; + + pendingTerms.add(new PendingTerm(docIndexCopy, + freqIndexCopy, + posIndexCopy, + payloadStart, + skipFP)); + } + + @Override + public void flushTermsBlock(int start, int count) throws IOException { + //System.out.println("SEPW: flushTermsBlock: start=" + start + " count=" + count + " pendingTerms.size()=" + pendingTerms.size() + " termsOut.fp=" + termsOut.getFilePointer()); + assert indexBytesWriter.getFilePointer() == 0; + final int absStart = pendingTerms.size() - start; + final List slice = pendingTerms.subList(absStart, absStart+count); + + long lastPayloadFP = 0; + long lastSkipFP = 0; + + if (count == 0) { + termsOut.writeByte((byte) 0); + return; + } + + final PendingTerm firstTerm = slice.get(0); + final IntIndexOutput.Index docIndexFlush = firstTerm.docIndex; + final IntIndexOutput.Index freqIndexFlush = firstTerm.freqIndex; + final IntIndexOutput.Index posIndexFlush = firstTerm.posIndex; + + for(int idx=0;idx 1; + this.maxBlockSize = maxBlockSize; } @Override @@ -57,29 +63,14 @@ public class StandardCodec extends Codec { // pluggable? Ie so that this codec would record which // index impl was used, and switch on loading? // Or... you must make a new Codec for this? - TermsIndexWriterBase indexWriter; boolean success = false; try { - indexWriter = new VariableGapTermsIndexWriter(state, new VariableGapTermsIndexWriter.EveryNTermSelector(state.termIndexInterval)); - success = true; - } finally { - if (!success) { - docs.close(); - } - } - - success = false; - try { - FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, docs); + FieldsConsumer ret = new BlockTreeTermsWriter(state, docs, minBlockSize, maxBlockSize); success = true; return ret; } finally { if (!success) { - try { - docs.close(); - } finally { - indexWriter.close(); - } + docs.close(); } } } @@ -89,41 +80,22 @@ public class StandardCodec extends Codec { @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { PostingsReaderBase postings = new StandardPostingsReader(state.dir, state.segmentInfo, state.context, state.codecId); - TermsIndexReaderBase indexReader; boolean success = false; try { - indexReader = new VariableGapTermsIndexReader(state.dir, + FieldsProducer ret = new BlockTreeTermsReader( + state.dir, state.fieldInfos, state.segmentInfo.name, - state.termsIndexDivisor, - state.codecId, state.context); - success = true; - } finally { - if (!success) { - postings.close(); - } - } - - success = false; - try { - FieldsProducer ret = new BlockTermsReader(indexReader, - state.dir, - state.fieldInfos, - state.segmentInfo.name, - postings, - state.context, - TERMS_CACHE_SIZE, - state.codecId); + postings, + state.context, + state.codecId, + state.termsIndexDivisor); success = true; return ret; } finally { if (!success) { - try { - postings.close(); - } finally { - indexReader.close(); - } + postings.close(); } } } @@ -135,11 +107,10 @@ public class StandardCodec extends Codec { static final String PROX_EXTENSION = "prx"; @Override - public void files(Directory dir, SegmentInfo segmentInfo, int id, Set files) throws IOException { - StandardPostingsReader.files(dir, segmentInfo, id, 
files); - BlockTermsReader.files(dir, segmentInfo, id, files); - VariableGapTermsIndexReader.files(dir, segmentInfo, id, files); - DefaultDocValuesConsumer.files(dir, segmentInfo, id, files, getDocValuesUseCFS()); + public void files(Directory dir, SegmentInfo segmentInfo, int codecID, Set files) throws IOException { + StandardPostingsReader.files(dir, segmentInfo, codecID, files); + BlockTreeTermsReader.files(dir, segmentInfo, codecID, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, codecID, files, getDocValuesUseCFS()); } @Override @@ -151,8 +122,12 @@ public class StandardCodec extends Codec { public static void getStandardExtensions(Set extensions) { extensions.add(FREQ_EXTENSION); extensions.add(PROX_EXTENSION); - BlockTermsReader.getExtensions(extensions); - VariableGapTermsIndexReader.getIndexExtensions(extensions); + BlockTreeTermsReader.getExtensions(extensions); + } + + @Override + public String toString() { + return name + "(minBlockSize=" + minBlockSize + " maxBlockSize=" + maxBlockSize + ")"; } @Override diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java index 04479e8dbd9..488c81ff62b 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java @@ -27,8 +27,8 @@ import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.TermState; -import org.apache.lucene.index.codecs.BlockTermState; import org.apache.lucene.index.codecs.PostingsReaderBase; +import org.apache.lucene.index.codecs.BlockTermState; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; @@ -46,22 +46,23 @@ public class StandardPostingsReader extends PostingsReaderBase { private final IndexInput freqIn; private final IndexInput proxIn; + // public static boolean DEBUG = BlockTreeTermsWriter.DEBUG; int skipInterval; int maxSkipLevels; int skipMinimum; - //private String segment; + // private String segment; - public StandardPostingsReader(Directory dir, SegmentInfo segmentInfo, IOContext context, int codecId) throws IOException { + public StandardPostingsReader(Directory dir, SegmentInfo segmentInfo, IOContext ioContext, int codecId) throws IOException { freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, codecId, StandardCodec.FREQ_EXTENSION), - context); - //this.segment = segmentInfo.name; + ioContext); + // this.segment = segmentInfo.name; if (segmentInfo.getHasProx()) { boolean success = false; try { proxIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, codecId, StandardCodec.PROX_EXTENSION), - context); + ioContext); success = true; } finally { if (!success) { @@ -73,10 +74,10 @@ public class StandardPostingsReader extends PostingsReaderBase { } } - public static void files(Directory dir, SegmentInfo segmentInfo, int id, Collection files) throws IOException { - files.add(IndexFileNames.segmentFileName(segmentInfo.name, id, StandardCodec.FREQ_EXTENSION)); + public static void files(Directory dir, SegmentInfo segmentInfo, int codecID, Collection files) throws IOException { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecID, StandardCodec.FREQ_EXTENSION)); if (segmentInfo.getHasProx()) { - 
files.add(IndexFileNames.segmentFileName(segmentInfo.name, id, StandardCodec.PROX_EXTENSION)); + files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecID, StandardCodec.PROX_EXTENSION)); } } @@ -100,7 +101,7 @@ public class StandardPostingsReader extends PostingsReaderBase { // Only used by the "primary" TermState -- clones don't // copy this (basically they are "transient"): - ByteArrayDataInput bytesReader; + ByteArrayDataInput bytesReader; // TODO: should this NOT be in the TermState...? byte[] bytes; @Override @@ -155,7 +156,8 @@ public class StandardPostingsReader extends PostingsReaderBase { final StandardTermState termState = (StandardTermState) _termState; final int len = termsIn.readVInt(); - //System.out.println("SPR.readTermsBlock termsIn.fp=" + termsIn.getFilePointer()); + + // if (DEBUG) System.out.println(" SPR.readTermsBlock bytes=" + len + " ts=" + _termState); if (termState.bytes == null) { termState.bytes = new byte[ArrayUtil.oversize(len, 1)]; termState.bytesReader = new ByteArrayDataInput(); @@ -171,21 +173,25 @@ public class StandardPostingsReader extends PostingsReaderBase { public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState) throws IOException { final StandardTermState termState = (StandardTermState) _termState; - //System.out.println("StandardR.nextTerm seg=" + segment); - final boolean isFirstTerm = termState.termCount == 0; + // if (DEBUG) System.out.println("SPR: nextTerm seg=" + segment + " tbOrd=" + termState.termBlockOrd + " bytesReader.fp=" + termState.bytesReader.getPosition()); + final boolean isFirstTerm = termState.termBlockOrd == 0; if (isFirstTerm) { termState.freqOffset = termState.bytesReader.readVLong(); } else { termState.freqOffset += termState.bytesReader.readVLong(); } - //System.out.println(" dF=" + termState.docFreq); - //System.out.println(" freqFP=" + termState.freqOffset); + /* + if (DEBUG) { + System.out.println(" dF=" + termState.docFreq); + System.out.println(" freqFP=" + termState.freqOffset); + } + */ assert termState.freqOffset < freqIn.length(); if (termState.docFreq >= skipMinimum) { termState.skipOffset = termState.bytesReader.readVInt(); - //System.out.println(" skipOffset=" + termState.skipOffset + " vs freqIn.length=" + freqIn.length()); + // if (DEBUG) System.out.println(" skipOffset=" + termState.skipOffset + " vs freqIn.length=" + freqIn.length()); assert termState.freqOffset + termState.skipOffset < freqIn.length(); } else { // undefined @@ -197,7 +203,7 @@ public class StandardPostingsReader extends PostingsReaderBase { } else { termState.proxOffset += termState.bytesReader.readVLong(); } - //System.out.println(" proxFP=" + termState.proxOffset); + // if (DEBUG) System.out.println(" proxFP=" + termState.proxOffset); } } @@ -215,6 +221,7 @@ public class StandardPostingsReader extends PostingsReaderBase { docsEnum = new SegmentDocsEnum(freqIn); } } + // if (DEBUG) System.out.println("SPR.docs ts=" + termState); return docsEnum.reset(fieldInfo, (StandardTermState) termState, liveDocs); } @@ -300,7 +307,7 @@ public class StandardPostingsReader extends PostingsReaderBase { assert limit > 0; ord = 0; doc = 0; - //System.out.println(" sde limit=" + limit + " freqFP=" + freqOffset); + // if (DEBUG) System.out.println(" sde limit=" + limit + " freqFP=" + freqOffset); skipped = false; @@ -309,8 +316,10 @@ public class StandardPostingsReader extends PostingsReaderBase { @Override public int nextDoc() throws IOException { + //if (DEBUG) System.out.println(" stpr.nextDoc seg=" + segment + " fp=" + 
freqIn.getFilePointer()); while(true) { if (ord == limit) { + //if (DEBUG) System.out.println(" return doc=" + NO_MORE_DOCS); return doc = NO_MORE_DOCS; } @@ -318,6 +327,7 @@ public class StandardPostingsReader extends PostingsReaderBase { // Decode next doc/freq pair final int code = freqIn.readVInt(); + // if (DEBUG) System.out.println(" code=" + code); if (omitTF) { doc += code; } else { @@ -334,6 +344,7 @@ public class StandardPostingsReader extends PostingsReaderBase { } } + //if (DEBUG) System.out.println(" stpr.nextDoc return doc=" + doc); return doc; } @@ -480,16 +491,17 @@ public class StandardPostingsReader extends PostingsReaderBase { freqOffset = termState.freqOffset; proxOffset = termState.proxOffset; skipOffset = termState.skipOffset; - //System.out.println("StandardR.D&PE reset seg=" + segment + " limit=" + limit + " freqFP=" + freqOffset + " proxFP=" + proxOffset); + // if (DEBUG) System.out.println("StandardR.D&PE reset seg=" + segment + " limit=" + limit + " freqFP=" + freqOffset + " proxFP=" + proxOffset); return this; } @Override public int nextDoc() throws IOException { + // if (DEBUG) System.out.println("SPR.nextDoc seg=" + segment + " freqIn.fp=" + freqIn.getFilePointer()); while(true) { if (ord == limit) { - //System.out.println("StandardR.D&PE seg=" + segment + " nextDoc return doc=END"); + // if (DEBUG) System.out.println(" return END"); return doc = NO_MORE_DOCS; } @@ -513,7 +525,7 @@ public class StandardPostingsReader extends PostingsReaderBase { position = 0; - //System.out.println("StandardR.D&PE nextDoc seg=" + segment + " return doc=" + doc); + // if (DEBUG) System.out.println(" return doc=" + doc); return doc; } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java index 7268edff966..1a316933db3 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java @@ -21,6 +21,8 @@ package org.apache.lucene.index.codecs.standard; * index file format */ import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.DocsEnum; @@ -34,18 +36,19 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CodecUtil; -import org.apache.lucene.util.IOUtils; /** @lucene.experimental */ public final class StandardPostingsWriter extends PostingsWriterBase { - final static String CODEC = "StandardPostingsWriterImpl"; + final static String CODEC = "StandardPostingsWriter"; + + //private static boolean DEBUG = BlockTreeTermsWriter.DEBUG; // Increment version to change it: final static int VERSION_START = 0; final static int VERSION_CURRENT = VERSION_START; - IndexOutput freqOut; - IndexOutput proxOut; + final IndexOutput freqOut; + final IndexOutput proxOut; final DefaultSkipListWriter skipListWriter; /** Expert: The fraction of TermDocs entries stored in skip tables, * used to accelerate {@link DocsEnum#advance(int)}. 
Larger values result in @@ -70,52 +73,42 @@ public final class StandardPostingsWriter extends PostingsWriterBase { IndexOptions indexOptions; boolean storePayloads; // Starts a new term - long lastFreqStart; long freqStart; - long lastProxStart; long proxStart; FieldInfo fieldInfo; int lastPayloadLength; int lastPosition; - private int pendingCount; - - //private String segment; - - private RAMOutputStream bytesWriter = new RAMOutputStream(); + // private String segment; public StandardPostingsWriter(SegmentWriteState state) throws IOException { this(state, DEFAULT_SKIP_INTERVAL); } public StandardPostingsWriter(SegmentWriteState state, int skipInterval) throws IOException { + super(); this.skipInterval = skipInterval; this.skipMinimum = skipInterval; /* set to the same for now */ - //this.segment = state.segmentName; + // this.segment = state.segmentName; String fileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, StandardCodec.FREQ_EXTENSION); freqOut = state.directory.createOutput(fileName, state.context); - boolean success = false; - try { - if (state.fieldInfos.hasProx()) { - // At least one field does not omit TF, so create the - // prox file - fileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, StandardCodec.PROX_EXTENSION); - proxOut = state.directory.createOutput(fileName, state.context); - } else { - // Every field omits TF so we will write no prox file - proxOut = null; - } - - totalNumDocs = state.numDocs; - - skipListWriter = new DefaultSkipListWriter(skipInterval, maxSkipLevels, - state.numDocs, freqOut, proxOut); - success = true; - } finally { - if (!success) { - IOUtils.closeSafely(true, freqOut, proxOut); - } + if (state.fieldInfos.hasProx()) { + // At least one field does not omit TF, so create the + // prox file + fileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, StandardCodec.PROX_EXTENSION); + proxOut = state.directory.createOutput(fileName, state.context); + } else { + // Every field omits TF so we will write no prox file + proxOut = null; } + + totalNumDocs = state.numDocs; + + skipListWriter = new DefaultSkipListWriter(skipInterval, + maxSkipLevels, + state.numDocs, + freqOut, + proxOut); } @Override @@ -129,8 +122,8 @@ public final class StandardPostingsWriter extends PostingsWriterBase { @Override public void startTerm() { - //System.out.println("StandardW: startTerm seg=" + segment + " pendingCount=" + pendingCount); freqStart = freqOut.getFilePointer(); + //if (DEBUG) System.out.println("SPW: startTerm freqOut.fp=" + freqStart); if (proxOut != null) { proxStart = proxOut.getFilePointer(); // force first payload to write its length @@ -144,6 +137,13 @@ public final class StandardPostingsWriter extends PostingsWriterBase { @Override public void setField(FieldInfo fieldInfo) { //System.out.println("SPW: setField"); + /* + if (BlockTreeTermsWriter.DEBUG && fieldInfo.name.equals("id")) { + DEBUG = true; + } else { + DEBUG = false; + } + */ this.fieldInfo = fieldInfo; indexOptions = fieldInfo.indexOptions; storePayloads = fieldInfo.storePayloads; @@ -158,7 +158,7 @@ public final class StandardPostingsWriter extends PostingsWriterBase { * then we just skip consuming positions/payloads. 
*/ @Override public void startDoc(int docID, int termDocFreq) throws IOException { - //System.out.println("StandardW: startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq); + // if (DEBUG) System.out.println("SPW: startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq + " freqOut.fp=" + freqOut.getFilePointer()); final int delta = docID - lastDocID; @@ -189,13 +189,13 @@ public final class StandardPostingsWriter extends PostingsWriterBase { /** Add a new position & payload */ @Override public void addPosition(int position, BytesRef payload) throws IOException { - //System.out.println("StandardW: addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer()); + //if (DEBUG) System.out.println("SPW: addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer()); assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS: "invalid indexOptions: " + indexOptions; assert proxOut != null; final int delta = position - lastPosition; - - assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition; + + assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it) lastPosition = position; @@ -222,57 +222,104 @@ public final class StandardPostingsWriter extends PostingsWriterBase { public void finishDoc() { } + private static class PendingTerm { + public final long freqStart; + public final long proxStart; + public final int skipOffset; + + public PendingTerm(long freqStart, long proxStart, int skipOffset) { + this.freqStart = freqStart; + this.proxStart = proxStart; + this.skipOffset = skipOffset; + } + } + + private final List pendingTerms = new ArrayList(); + /** Called when we are done adding docs to this term */ @Override public void finishTerm(TermStats stats) throws IOException { - //System.out.println("StandardW.finishTerm seg=" + segment); + + // if (DEBUG) System.out.println("SPW: finishTerm seg=" + segment + " freqStart=" + freqStart); assert stats.docFreq > 0; // TODO: wasteful we are counting this (counting # docs // for this term) in two places? 
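// Editor's sketch (not part of the patch): the buffering scheme that finishTerm and the new
// flushTermsBlock(start, count) below implement.  Each finished term parks its start file
// pointers on a pending list; when the terms dict flushes a block, the first term's pointer
// is written absolute and the rest as deltas against the previous term.  A List<Long> and
// plain subtraction stand in for the RAMOutputStream/vLong encoding here.
static java.util.List<Long> flushBlock(java.util.List<Long> pendingFreqStarts, int start, int count) {
  final java.util.List<Long> encoded = new java.util.ArrayList<Long>();
  final int limit = pendingFreqStarts.size() - start + count;
  long last = 0;
  for (int idx = limit - count; idx < limit; idx++) {
    final long freqStart = pendingFreqStarts.get(idx);
    encoded.add(idx == limit - count ? freqStart : freqStart - last);  // abs first, delta after
    last = freqStart;
  }
  return encoded;
}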
assert stats.docFreq == df; - final boolean isFirstTerm = pendingCount == 0; - //System.out.println(" isFirstTerm=" + isFirstTerm); - - //System.out.println(" freqFP=" + freqStart); - if (isFirstTerm) { - bytesWriter.writeVLong(freqStart); - } else { - bytesWriter.writeVLong(freqStart-lastFreqStart); - } - lastFreqStart = freqStart; - + final int skipOffset; if (df >= skipMinimum) { - bytesWriter.writeVInt((int) (skipListWriter.writeSkip(freqOut)-freqStart)); + skipOffset = (int) (skipListWriter.writeSkip(freqOut)-freqStart); + } else { + skipOffset = -1; } - if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { - //System.out.println(" proxFP=" + proxStart); - if (isFirstTerm) { - bytesWriter.writeVLong(proxStart); - } else { - bytesWriter.writeVLong(proxStart - lastProxStart); - } - lastProxStart = proxStart; - } - + pendingTerms.add(new PendingTerm(freqStart, proxStart, skipOffset)); + lastDocID = 0; df = 0; - pendingCount++; } + private final RAMOutputStream bytesWriter = new RAMOutputStream(); + @Override - public void flushTermsBlock() throws IOException { - //System.out.println("SPW.flushBlock pendingCount=" + pendingCount); + public void flushTermsBlock(int start, int count) throws IOException { + //if (DEBUG) System.out.println("SPW: flushTermsBlock start=" + start + " count=" + count + " left=" + (pendingTerms.size()-count) + " pendingTerms.size()=" + pendingTerms.size()); + + if (count == 0) { + termsOut.writeByte((byte) 0); + return; + } + + assert start <= pendingTerms.size(); + assert count <= start; + + final int limit = pendingTerms.size() - start + count; + final PendingTerm firstTerm = pendingTerms.get(limit - count); + // First term in block is abs coded: + bytesWriter.writeVLong(firstTerm.freqStart); + + if (firstTerm.skipOffset != -1) { + assert firstTerm.skipOffset > 0; + bytesWriter.writeVInt(firstTerm.skipOffset); + } + if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + bytesWriter.writeVLong(firstTerm.proxStart); + } + long lastFreqStart = firstTerm.freqStart; + long lastProxStart = firstTerm.proxStart; + for(int idx=limit-count+1; idx runAutomata = initAutomata(editDistance); if (editDistance < runAutomata.size()) { - return new AutomatonFuzzyTermsEnum(runAutomata.subList(0, editDistance + 1) - .toArray(new CompiledAutomaton[editDistance + 1]), lastTerm); + //if (BlockTreeTermsWriter.DEBUG) System.out.println("FuzzyTE.getAEnum: ed=" + editDistance + " lastTerm=" + (lastTerm==null ? "null" : lastTerm.utf8ToString())); + final CompiledAutomaton compiled = runAutomata.get(editDistance); + return new AutomatonFuzzyTermsEnum(terms.intersect(compiled, lastTerm == null ? 
null : compiled.floor(lastTerm, new BytesRef())), + runAutomata.subList(0, editDistance + 1).toArray(new CompiledAutomaton[editDistance + 1])); } else { return null; } @@ -153,6 +156,7 @@ public final class FuzzyTermsEnum extends TermsEnum { /** initialize levenshtein DFAs up to maxDistance, if possible */ private List initAutomata(int maxDistance) { final List runAutomata = dfaAtt.automata(); + //System.out.println("cached automata size: " + runAutomata.size()); if (runAutomata.size() <= maxDistance && maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { LevenshteinAutomata builder = @@ -160,13 +164,14 @@ public final class FuzzyTermsEnum extends TermsEnum { for (int i = runAutomata.size(); i <= maxDistance; i++) { Automaton a = builder.toAutomaton(i); + //System.out.println("compute automaton n=" + i); // constant prefix if (realPrefixLength > 0) { Automaton prefix = BasicAutomata.makeString( UnicodeUtil.newString(termText, 0, realPrefixLength)); a = BasicOperations.concatenate(prefix, a); } - runAutomata.add(new CompiledAutomaton(a, true)); + runAutomata.add(new CompiledAutomaton(a, true, false)); } } return runAutomata; @@ -301,65 +306,65 @@ public final class FuzzyTermsEnum extends TermsEnum { public BytesRef term() throws IOException { return actualEnum.term(); } - + /** - * Implement fuzzy enumeration with automaton. + * Implement fuzzy enumeration with Terms.intersect. *

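The flushTermsBlock change to SepPostingsWriter further up buffers PendingTerm entries and writes each block's metadata with the first term's file pointers absolute-coded and the remaining entries delta-coded as vLongs. A self-contained sketch of that encoding idea in plain Java (writeVLong here is a local helper for illustration, not Lucene's DataOutput method):

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;

public class BlockDeltaSketch {

  // vLong-style: 7 payload bits per byte, high bit set means another byte follows.
  static void writeVLong(OutputStream out, long v) throws IOException {
    while ((v & ~0x7FL) != 0) {
      out.write((int) ((v & 0x7FL) | 0x80L));
      v >>>= 7;
    }
    out.write((int) v);
  }

  // First file pointer in the block is absolute, the rest are deltas from the
  // previous pointer, so small gaps encode in one or two bytes.
  static byte[] encodeBlock(long[] filePointers) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    writeVLong(out, filePointers[0]);
    for (int i = 1; i < filePointers.length; i++) {
      writeVLong(out, filePointers[i] - filePointers[i - 1]);
    }
    return out.toByteArray();
  }

  public static void main(String[] args) throws IOException {
    long[] freqPointers = {1000000L, 1000120L, 1000130L, 1002000L};
    // the reader reverses this: read one absolute vLong, then keep adding deltas
    System.out.println(encodeBlock(freqPointers).length + " bytes for 4 pointers");
  }
}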
    * This is the fastest method as opposed to LinearFuzzyTermsEnum: * as enumeration is logarithmic to the number of terms (instead of linear) * and comparison is linear to length of the term (rather than quadratic) */ - private class AutomatonFuzzyTermsEnum extends AutomatonTermsEnum { + private class AutomatonFuzzyTermsEnum extends FilteredTermsEnum { private final ByteRunAutomaton matchers[]; private final BytesRef termRef; - private final BytesRef lastTerm; private final BoostAttribute boostAtt = attributes().addAttribute(BoostAttribute.class); - public AutomatonFuzzyTermsEnum(CompiledAutomaton compiled[], - BytesRef lastTerm) throws IOException { - super(tenum, compiled[compiled.length - 1]); + public AutomatonFuzzyTermsEnum(TermsEnum tenum, CompiledAutomaton compiled[]) + throws IOException { + super(tenum, false); this.matchers = new ByteRunAutomaton[compiled.length]; for (int i = 0; i < compiled.length; i++) this.matchers[i] = compiled[i].runAutomaton; - this.lastTerm = lastTerm; termRef = new BytesRef(term.text()); } - + /** finds the smallest Lev(n) DFA that accepts the term. */ @Override protected AcceptStatus accept(BytesRef term) { + //System.out.println("AFTE.accept term=" + term); int ed = matchers.length - 1; - if (matches(term, ed)) { // we match the outer dfa - // now compute exact edit distance - while (ed > 0) { - if (matches(term, ed - 1)) { - ed--; - } else { - break; - } - } - - // scale to a boost and return (if similarity > minSimilarity) - if (ed == 0) { // exact match - boostAtt.setBoost(1.0F); - return AcceptStatus.YES_AND_SEEK; + // we are wrapping either an intersect() TermsEnum or an AutomatonTermsENum, + // so we know the outer DFA always matches. + // now compute exact edit distance + while (ed > 0) { + if (matches(term, ed - 1)) { + ed--; } else { - final int codePointCount = UnicodeUtil.codePointCount(term); - final float similarity = 1.0f - ((float) ed / (float) - (Math.min(codePointCount, termLength))); - if (similarity > minSimilarity) { - boostAtt.setBoost((similarity - minSimilarity) * scale_factor); - return AcceptStatus.YES_AND_SEEK; - } else { - return AcceptStatus.NO_AND_SEEK; - } + break; } + } + //System.out.println("CHECK term=" + term.utf8ToString() + " ed=" + ed); + + // scale to a boost and return (if similarity > minSimilarity) + if (ed == 0) { // exact match + boostAtt.setBoost(1.0F); + //System.out.println(" yes"); + return AcceptStatus.YES; } else { - return AcceptStatus.NO_AND_SEEK; + final int codePointCount = UnicodeUtil.codePointCount(term); + final float similarity = 1.0f - ((float) ed / (float) + (Math.min(codePointCount, termLength))); + if (similarity > minSimilarity) { + boostAtt.setBoost((similarity - minSimilarity) * scale_factor); + //System.out.println(" yes"); + return AcceptStatus.YES; + } else { + return AcceptStatus.NO; + } } } @@ -367,16 +372,8 @@ public final class FuzzyTermsEnum extends TermsEnum { final boolean matches(BytesRef term, int k) { return k == 0 ? term.equals(termRef) : matchers[k].run(term.bytes, term.offset, term.length); } - - /** defers to superclass, except can start at an arbitrary location */ - @Override - protected BytesRef nextSeekTerm(BytesRef term) throws IOException { - if (term == null) - term = lastTerm; - return super.nextSeekTerm(term); - } } - + /** * Implement fuzzy enumeration with linear brute force. 
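The accept() method above first finds the smallest Levenshtein DFA that still matches, then converts that edit distance into a boost: similarity = 1 - ed / min(termCodePoints, queryTermLength), boost = (similarity - minSimilarity) * scale_factor. A standalone sketch of just that scaling (the parameter names stand in for the enum's own fields):

public class FuzzyBoostSketch {

  static float boostFor(int editDistance, int termCodePoints, int queryTermLength,
                        float minSimilarity, float scaleFactor) {
    if (editDistance == 0) {
      return 1.0f;                                   // exact match
    }
    float similarity =
        1.0f - ((float) editDistance / (float) Math.min(termCodePoints, queryTermLength));
    if (similarity <= minSimilarity) {
      return 0.0f;                                   // would be AcceptStatus.NO
    }
    return (similarity - minSimilarity) * scaleFactor;
  }

  public static void main(String[] args) {
    // query term of 6 code points, candidate one edit away, minSimilarity 0.5;
    // a scaleFactor of 1/(1-minSimilarity) keeps boosts in (0, 1] (an assumption
    // for this sketch, not read off this patch).
    System.out.println(boostFor(1, 6, 6, 0.5f, 2.0f));  // ~0.67
  }
}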
*/ @@ -408,7 +405,7 @@ public final class FuzzyTermsEnum extends TermsEnum { * @throws IOException */ public LinearFuzzyTermsEnum() throws IOException { - super(tenum); + super(terms.iterator()); this.text = new int[termLength - realPrefixLength]; System.arraycopy(termText, realPrefixLength, text, 0, text.length); diff --git a/lucene/src/java/org/apache/lucene/search/IndexSearcher.java b/lucene/src/java/org/apache/lucene/search/IndexSearcher.java index a50d7713978..95da0819c88 100644 --- a/lucene/src/java/org/apache/lucene/search/IndexSearcher.java +++ b/lucene/src/java/org/apache/lucene/search/IndexSearcher.java @@ -882,6 +882,6 @@ public class IndexSearcher implements Closeable { @Override public String toString() { - return "IndexSearcher(" + reader + ")"; + return "IndexSearcher(" + reader + "; executor=" + executor + ")"; } } diff --git a/lucene/src/java/org/apache/lucene/search/PrefixQuery.java b/lucene/src/java/org/apache/lucene/search/PrefixQuery.java index 27a89bb3083..42cdebfdce9 100644 --- a/lucene/src/java/org/apache/lucene/search/PrefixQuery.java +++ b/lucene/src/java/org/apache/lucene/search/PrefixQuery.java @@ -51,7 +51,7 @@ public class PrefixQuery extends MultiTermQuery { // no prefix -- match all terms for this field: return tenum; } - return new PrefixTermsEnum(tenum, prefix); + return new PrefixTermsEnum(tenum, prefix.bytes()); } /** Prints a user-readable version of this query. */ diff --git a/lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java b/lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java index a5f8fcfdbab..23717be751d 100644 --- a/lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java +++ b/lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java @@ -19,7 +19,6 @@ package org.apache.lucene.search; import java.io.IOException; -import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.BytesRef; @@ -34,9 +33,9 @@ public class PrefixTermsEnum extends FilteredTermsEnum { private final BytesRef prefixRef; - public PrefixTermsEnum(TermsEnum tenum, Term prefix) throws IOException { + public PrefixTermsEnum(TermsEnum tenum, BytesRef prefixText) throws IOException { super(tenum); - setInitialSeekTerm(prefixRef = prefix.bytes()); + setInitialSeekTerm(this.prefixRef = prefixText); } @Override diff --git a/lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java b/lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java index 55fadef5a12..c890f504f2e 100644 --- a/lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java +++ b/lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java @@ -19,7 +19,6 @@ package org.apache.lucene.search; import java.io.IOException; -import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.BytesRef; @@ -39,10 +38,10 @@ public final class SingleTermsEnum extends FilteredTermsEnum { * After calling the constructor the enumeration is already pointing to the term, * if it exists. 
*/ - public SingleTermsEnum(TermsEnum tenum, Term singleTerm) throws IOException { + public SingleTermsEnum(TermsEnum tenum, BytesRef termText) throws IOException { super(tenum); - singleRef = singleTerm.bytes(); - setInitialSeekTerm(singleRef); + singleRef = termText; + setInitialSeekTerm(termText); } @Override diff --git a/lucene/src/java/org/apache/lucene/search/TermQuery.java b/lucene/src/java/org/apache/lucene/search/TermQuery.java index aecd866ca62..048588878c0 100644 --- a/lucene/src/java/org/apache/lucene/search/TermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/TermQuery.java @@ -98,17 +98,18 @@ public class TermQuery extends Query { TermsEnum getTermsEnum(AtomicReaderContext context) throws IOException { final TermState state = termStates.get(context.ord); if (state == null) { // term is not present in that reader - assert termNotInReader(context.reader, term.field(), term.bytes()) : "no termstate found but term exists in reader"; + assert termNotInReader(context.reader, term.field(), term.bytes()) : "no termstate found but term exists in reader term=" + term; return null; } - final TermsEnum termsEnum = context.reader.terms(term.field()) - .getThreadTermsEnum(); + //System.out.println("LD=" + reader.getLiveDocs() + " set?=" + (reader.getLiveDocs() != null ? reader.getLiveDocs().get(0) : "null")); + final TermsEnum termsEnum = context.reader.terms(term.field()).getThreadTermsEnum(); termsEnum.seekExact(term.bytes(), state); return termsEnum; } private boolean termNotInReader(IndexReader reader, String field, BytesRef bytes) throws IOException { // only called from assert + //System.out.println("TQ.termNotInReader reader=" + reader + " term=" + field + ":" + bytes.utf8ToString()); final Terms terms = reader.terms(field); return terms == null || terms.docFreq(bytes) == 0; } diff --git a/lucene/src/java/org/apache/lucene/search/TermScorer.java b/lucene/src/java/org/apache/lucene/search/TermScorer.java index 3534079fb34..066ce66821e 100644 --- a/lucene/src/java/org/apache/lucene/search/TermScorer.java +++ b/lucene/src/java/org/apache/lucene/search/TermScorer.java @@ -70,6 +70,7 @@ final class TermScorer extends Scorer { public boolean score(Collector c, int end, int firstDocID) throws IOException { c.setScorer(this); while (doc < end) { // for docs in window + //System.out.println("TS: collect doc=" + doc); c.collect(doc); // collect score if (++pointer >= pointerMax) { refillBuffer(); diff --git a/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java b/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java index 4ad6222b801..0a75f37e727 100644 --- a/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java +++ b/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java @@ -78,15 +78,38 @@ public abstract class TopTermsRewrite extends TermCollectingRew public void setNextEnum(TermsEnum termsEnum) throws IOException { this.termsEnum = termsEnum; this.termComp = termsEnum.getComparator(); + + assert compareToLastTerm(null); + // lazy init the initial ScoreTerm because comparator is not known on ctor: if (st == null) st = new ScoreTerm(this.termComp, new TermContext(topReaderContext)); boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class); } + // for assert: + private BytesRef lastTerm; + private boolean compareToLastTerm(BytesRef t) throws IOException { + if (lastTerm == null && t != null) { + lastTerm = new BytesRef(t); + } else if (t == null) { + lastTerm = null; + } else { + assert termsEnum.getComparator().compare(lastTerm, t) < 0: 
"lastTerm=" + lastTerm + " t=" + t; + lastTerm.copy(t); + } + return true; + } + @Override public boolean collect(BytesRef bytes) throws IOException { final float boost = boostAtt.getBoost(); + + // make sure within a single seg we always collect + // terms in order + assert compareToLastTerm(bytes); + + //System.out.println("TTR.collect term=" + bytes.utf8ToString() + " boost=" + boost + " ord=" + readerContext.ord); // ignore uncompetitive hits if (stQueue.size() == maxSize) { final ScoreTerm t = stQueue.peek(); @@ -134,9 +157,10 @@ public abstract class TopTermsRewrite extends TermCollectingRew final Q q = getTopLevelQuery(); final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]); ArrayUtil.mergeSort(scoreTerms, scoreTermSortByTermComp); + for (final ScoreTerm st : scoreTerms) { final Term term = new Term(query.field, st.bytes); - assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs " + st.termState.docFreq(); + assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs " + st.termState.docFreq() + " term=" + term; addClause(q, term, st.termState.docFreq(), query.getBoost() * st.boost, st.termState); // add to query } query.incTotalNumberOfTerms(scoreTerms.length); diff --git a/lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java b/lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java index 2ebdf896c5f..0779a168f33 100644 --- a/lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java +++ b/lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java @@ -43,16 +43,30 @@ public final class ByteArrayDataInput extends DataInput { reset(bytes, 0, bytes.length); } + // NOTE: sets pos to 0, which is not right if you had + // called reset w/ non-zero offset!! 
+ public void rewind() { + pos = 0; + } + public int getPosition() { return pos; } + public void setPosition(int pos) { + this.pos = pos; + } + public void reset(byte[] bytes, int offset, int len) { this.bytes = bytes; pos = offset; limit = offset + len; } + public int length() { + return limit; + } + public boolean eof() { return pos == limit; } diff --git a/lucene/src/java/org/apache/lucene/store/CompoundFileDirectory.java b/lucene/src/java/org/apache/lucene/store/CompoundFileDirectory.java index b9ec9aaa3fa..1f9eb222b03 100644 --- a/lucene/src/java/org/apache/lucene/store/CompoundFileDirectory.java +++ b/lucene/src/java/org/apache/lucene/store/CompoundFileDirectory.java @@ -19,10 +19,6 @@ package org.apache.lucene.store; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.Lock; import org.apache.lucene.util.IOUtils; import java.util.Collection; @@ -189,14 +185,14 @@ public abstract class CompoundFileDirectory extends Directory { } @Override - public synchronized IndexInput openInput(String id, IOContext context) throws IOException { + public synchronized IndexInput openInput(String fileName, IOContext context) throws IOException { ensureOpen(); assert !openForWrite; - id = IndexFileNames.stripSegmentName(id); + final String id = IndexFileNames.stripSegmentName(fileName); final FileEntry entry = entries.get(id); - if (entry == null) - throw new IOException("No sub-file with id " + id + " found (files: " + entries.keySet() + ")"); - + if (entry == null) { + throw new IOException("No sub-file with id " + id + " found (fileName=" + fileName + " files: " + entries.keySet() + ")"); + } return openInputSlice(id, entry.offset, entry.length, readBufferSize); } diff --git a/lucene/src/java/org/apache/lucene/store/FSDirectory.java b/lucene/src/java/org/apache/lucene/store/FSDirectory.java index e3a36b57708..7d5e7dec4da 100644 --- a/lucene/src/java/org/apache/lucene/store/FSDirectory.java +++ b/lucene/src/java/org/apache/lucene/store/FSDirectory.java @@ -448,6 +448,7 @@ public abstract class FSDirectory extends Directory { /** output methods: */ @Override public void flushBuffer(byte[] b, int offset, int size) throws IOException { + assert isOpen; if (rateLimiter != null) { rateLimiter.pause(size); } diff --git a/lucene/src/java/org/apache/lucene/util/BytesRef.java b/lucene/src/java/org/apache/lucene/util/BytesRef.java index de40c3ce2a9..a70432a32c5 100644 --- a/lucene/src/java/org/apache/lucene/util/BytesRef.java +++ b/lucene/src/java/org/apache/lucene/util/BytesRef.java @@ -65,6 +65,18 @@ public final class BytesRef implements Comparable { this.bytes = new byte[capacity]; } + /** Incoming IntsRef values must be Byte.MIN_VALUE - + * Byte.MAX_VALUE. */ + public BytesRef(IntsRef intsRef) { + bytes = new byte[intsRef.length]; + for(int idx=0;idx= Byte.MIN_VALUE && v <= Byte.MAX_VALUE; + bytes[idx] = (byte) v; + } + length = intsRef.length; + } + /** * @param text Initialize the byte[] from the UTF8 bytes * for the provided Sring. 
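The new positioning accessors on ByteArrayDataInput above (rewind, getPosition, setPosition, length) make it cheap to re-read metadata out of an in-memory block. A minimal usage sketch, relying only on the methods visible in this hunk plus DataInput.readVInt(); note the caveat that rewind() always goes back to position 0, not to a non-zero reset offset:

import java.io.IOException;

import org.apache.lucene.store.ByteArrayDataInput;

public class ByteArrayDataInputSketch {
  public static void main(String[] args) throws IOException {
    // three vInts below 128 encode as one byte each
    byte[] block = new byte[] {5, 17, 42};
    ByteArrayDataInput in = new ByteArrayDataInput(block);

    int first = in.readVInt();          // 5
    int mark = in.getPosition();        // remember where the 2nd entry starts
    int second = in.readVInt();         // 17

    in.setPosition(mark);               // jump back and re-decode the 2nd entry
    assert in.readVInt() == second;

    in.rewind();                        // back to position 0
    while (!in.eof()) {
      System.out.println(in.readVInt());
    }
  }
}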
This must be well-formed diff --git a/lucene/src/java/org/apache/lucene/util/TermContext.java b/lucene/src/java/org/apache/lucene/util/TermContext.java index 746405c353d..aed51f57c9f 100644 --- a/lucene/src/java/org/apache/lucene/util/TermContext.java +++ b/lucene/src/java/org/apache/lucene/util/TermContext.java @@ -21,14 +21,13 @@ import java.io.IOException; import java.util.Arrays; import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.index.IndexReader.AtomicReaderContext; -import org.apache.lucene.index.IndexReader.ReaderContext; -import org.apache.lucene.index.TermsEnum.SeekStatus; /** * Maintains a {@link IndexReader} {@link TermState} view over @@ -45,6 +44,9 @@ public final class TermContext { private int docFreq; private long totalTermFreq; + //public static boolean DEBUG = BlockTreeTermsWriter.DEBUG; + + /** * Creates an empty {@link TermContext} from a {@link ReaderContext} */ @@ -85,7 +87,9 @@ public final class TermContext { final BytesRef bytes = term.bytes(); final TermContext perReaderTermState = new TermContext(context); final AtomicReaderContext[] leaves = ReaderUtil.leaves(context); + //if (DEBUG) System.out.println("prts.build term=" + term); for (int i = 0; i < leaves.length; i++) { + //if (DEBUG) System.out.println(" r=" + leaves[i].reader); final Fields fields = leaves[i].reader.fields(); if (fields != null) { final Terms terms = fields.terms(field); @@ -93,6 +97,7 @@ public final class TermContext { final TermsEnum termsEnum = terms.getThreadTermsEnum(); // thread-private don't share! if (termsEnum.seekExact(bytes, cache)) { final TermState termState = termsEnum.termState(); + //if (DEBUG) System.out.println(" found"); perReaderTermState.register(termState, leaves[i].ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); } } diff --git a/lucene/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java b/lucene/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java new file mode 100644 index 00000000000..48b0e537459 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java @@ -0,0 +1,313 @@ +package org.apache.lucene.util.automaton; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.PrefixTermsEnum; +import org.apache.lucene.search.SingleTermsEnum; +import org.apache.lucene.util.BytesRef; + +/** + * Immutable class holding compiled details for a given + * Automaton. The Automaton is deterministic, must not have + * dead states but may not be minimal. + * + * @lucene.experimental + */ +public class CompiledAutomaton { + public enum AUTOMATON_TYPE {NONE, ALL, SINGLE, PREFIX, NORMAL}; + public final AUTOMATON_TYPE type; + + // For PREFIX, this is the prefix term; for SINGLE this is + // the singleton term: + public final BytesRef term; + + // NOTE: the next 4 members are only non-null if type == + // NORMAL: + public final ByteRunAutomaton runAutomaton; + // TODO: would be nice if these sortedTransitions had "int + // to;" instead of "State to;" somehow: + public final Transition[][] sortedTransitions; + public final BytesRef commonSuffixRef; + public final Boolean finite; + + public CompiledAutomaton(Automaton automaton) { + this(automaton, null, true); + } + + public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) { + + if (simplify) { + // Test whether the automaton is a "simple" form and + // if so, don't create a runAutomaton. Note that on a + // large automaton these tests could be costly: + if (BasicOperations.isEmpty(automaton)) { + // matches nothing + type = AUTOMATON_TYPE.NONE; + term = null; + commonSuffixRef = null; + runAutomaton = null; + sortedTransitions = null; + this.finite = null; + return; + } else if (BasicOperations.isTotal(automaton)) { + // matches all possible strings + type = AUTOMATON_TYPE.ALL; + term = null; + commonSuffixRef = null; + runAutomaton = null; + sortedTransitions = null; + this.finite = null; + return; + } else { + final String commonPrefix; + final String singleton; + if (automaton.getSingleton() == null) { + commonPrefix = SpecialOperations.getCommonPrefix(automaton); + if (commonPrefix.length() > 0 && BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) { + singleton = commonPrefix; + } else { + singleton = null; + } + } else { + commonPrefix = null; + singleton = automaton.getSingleton(); + } + + if (singleton != null) { + // matches a fixed string in singleton or expanded + // representation + type = AUTOMATON_TYPE.SINGLE; + term = new BytesRef(singleton); + commonSuffixRef = null; + runAutomaton = null; + sortedTransitions = null; + this.finite = null; + return; + } else if (BasicOperations.sameLanguage(automaton, BasicOperations.concatenate( + BasicAutomata.makeString(commonPrefix), BasicAutomata.makeAnyString()))) { + // matches a constant prefix + type = AUTOMATON_TYPE.PREFIX; + term = new BytesRef(commonPrefix); + commonSuffixRef = null; + runAutomaton = null; + sortedTransitions = null; + this.finite = null; + return; + } + } + } + + type = AUTOMATON_TYPE.NORMAL; + term = null; + if (finite == null) { + this.finite = SpecialOperations.isFinite(automaton); + } else { + this.finite = finite; + } + Automaton utf8 = new UTF32ToUTF8().convert(automaton); + if (this.finite) { + commonSuffixRef = null; + } else { + commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(utf8); + } + runAutomaton = new ByteRunAutomaton(utf8, true); + sortedTransitions = utf8.getSortedTransitions(); + } + + //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG; + + private 
BytesRef addTail(int state, BytesRef term, int idx, int leadLabel) { + + // Find biggest transition that's < label + // TODO: use binary search here + Transition maxTransition = null; + for (Transition transition : sortedTransitions[state]) { + if (transition.min < leadLabel) { + maxTransition = transition; + } + } + + assert maxTransition != null; + + // Append floorLabel + final int floorLabel; + if (maxTransition.max > leadLabel-1) { + floorLabel = leadLabel-1; + } else { + floorLabel = maxTransition.max; + } + if (idx >= term.bytes.length) { + term.grow(1+idx); + } + //if (DEBUG) System.out.println(" add floorLabel=" + (char) floorLabel + " idx=" + idx); + term.bytes[idx] = (byte) floorLabel; + + state = maxTransition.to.getNumber(); + idx++; + + // Push down to last accept state + while (true) { + Transition[] transitions = sortedTransitions[state]; + if (transitions.length == 0) { + assert runAutomaton.isAccept(state); + term.length = idx; + //if (DEBUG) System.out.println(" return " + term.utf8ToString()); + return term; + } else { + // We are pushing "top" -- so get last label of + // last transition: + assert transitions.length != 0; + Transition lastTransition = transitions[transitions.length-1]; + if (idx >= term.bytes.length) { + term.grow(1+idx); + } + //if (DEBUG) System.out.println(" push maxLabel=" + (char) lastTransition.max + " idx=" + idx); + term.bytes[idx] = (byte) lastTransition.max; + state = lastTransition.to.getNumber(); + idx++; + } + } + } + + // TODO: should this take startTerm too? This way + // Terms.intersect could forward to this method if type != + // NORMAL: + public TermsEnum getTermsEnum(Terms terms) throws IOException { + switch(type) { + case NONE: + return TermsEnum.EMPTY; + case ALL: + return terms.iterator(); + case SINGLE: + return new SingleTermsEnum(terms.iterator(), term); + case PREFIX: + // TODO: this is very likely faster than .intersect, + // but we should test and maybe cutover + return new PrefixTermsEnum(terms.iterator(), term); + case NORMAL: + return terms.intersect(this, null); + default: + // unreachable + throw new RuntimeException("unhandled case"); + } + } + + /** Finds largest term accepted by this Automaton, that's + * <= the provided input term. The result is placed in + * output; it's fine for output and input to point to + * the same BytesRef. The returned result is either the + * provided output, or null if there is no floor term + * (ie, the provided input term is before the first term + * accepted by this Automaton). 
*/ + public BytesRef floor(BytesRef input, BytesRef output) { + + output.offset = 0; + //if (DEBUG) System.out.println("CA.floor input=" + input.utf8ToString()); + + int state = runAutomaton.getInitialState(); + + // Special case empty string: + if (input.length == 0) { + if (runAutomaton.isAccept(state)) { + output.length = 0; + return output; + } else { + return null; + } + } + + final List stack = new ArrayList(); + + int idx = 0; + while (true) { + int label = input.bytes[input.offset + idx] & 0xff; + int nextState = runAutomaton.step(state, label); + //if (DEBUG) System.out.println(" cycle label=" + (char) label + " nextState=" + nextState); + + if (idx == input.length-1) { + if (nextState != -1 && runAutomaton.isAccept(nextState)) { + // Input string is accepted + if (idx >= output.bytes.length) { + output.grow(1+idx); + } + output.bytes[idx] = (byte) label; + output.length = input.length; + //if (DEBUG) System.out.println(" input is accepted; return term=" + output.utf8ToString()); + return output; + } else { + nextState = -1; + } + } + + if (nextState == -1) { + + // Pop back to a state that has a transition + // <= our label: + while (true) { + Transition[] transitions = sortedTransitions[state]; + if (transitions.length == 0) { + assert runAutomaton.isAccept(state); + output.length = idx; + //if (DEBUG) System.out.println(" return " + output.utf8ToString()); + return output; + } else if (label-1 < transitions[0].min) { + + if (runAutomaton.isAccept(state)) { + output.length = idx; + //if (DEBUG) System.out.println(" return " + output.utf8ToString()); + return output; + } + // pop + if (stack.size() == 0) { + //if (DEBUG) System.out.println(" pop ord=" + idx + " return null"); + return null; + } else { + state = stack.remove(stack.size()-1); + idx--; + //if (DEBUG) System.out.println(" pop ord=" + (idx+1) + " label=" + (char) label + " first trans.min=" + (char) transitions[0].min); + label = input.bytes[input.offset + idx] & 0xff; + } + + } else { + //if (DEBUG) System.out.println(" stop pop ord=" + idx + " first trans.min=" + (char) transitions[0].min); + break; + } + } + + //if (DEBUG) System.out.println(" label=" + (char) label + " idx=" + idx); + + return addTail(state, output, idx, label); + + } else { + if (idx >= output.bytes.length) { + output.grow(1+idx); + } + output.bytes[idx] = (byte) label; + stack.add(state); + state = nextState; + idx++; + } + } + } +} diff --git a/lucene/src/java/org/apache/lucene/util/fst/Builder.java b/lucene/src/java/org/apache/lucene/util/fst/Builder.java index 0fb0b82727b..39105a6c849 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/Builder.java +++ b/lucene/src/java/org/apache/lucene/util/fst/Builder.java @@ -53,6 +53,8 @@ public class Builder { private final FST fst; private final T NO_OUTPUT; + // private static final boolean DEBUG = false; + // simplistic pruning: we prune node (and all following // nodes) if less than this number of terms go through it: private final int minSuffixCount1; @@ -73,13 +75,21 @@ public class Builder { // current "frontier" private UnCompiledNode[] frontier; + // Expert: you pass an instance of this if you want to do + // something "custom" as suffixes are "frozen": + public static abstract class FreezeTail { + public abstract void freeze(final UnCompiledNode[] frontier, int prefixLenPlus1, IntsRef prevInput) throws IOException; + } + + private final FreezeTail freezeTail; + /** * Instantiates an FST/FSA builder without any pruning. 
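The new CompiledAutomaton above first tries to simplify the automaton to NONE/ALL/SINGLE/PREFIX before building a full ByteRunAutomaton, exposes getTermsEnum(Terms) to pick the cheapest enumeration for each case, and floor() to find the largest accepted term that is <= a given input. A hedged usage sketch; the Terms instance is assumed to come from an open reader (for example via MultiFields.getTerms):

import java.io.IOException;

import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.CompiledAutomaton;

public class CompiledAutomatonSketch {

  // Enumerate all terms matching foo*; the constructor should detect this as
  // AUTOMATON_TYPE.PREFIX and hand back a PrefixTermsEnum rather than intersect().
  static void listPrefixMatches(Terms terms) throws IOException {
    Automaton a = BasicOperations.concatenate(BasicAutomata.makeString("foo"),
                                              BasicAutomata.makeAnyString());
    CompiledAutomaton compiled = new CompiledAutomaton(a);
    System.out.println("detected type: " + compiled.type);

    TermsEnum te = compiled.getTermsEnum(terms);
    BytesRef term;
    while ((term = te.next()) != null) {
      System.out.println(term.utf8ToString());
    }
  }

  // floor() only applies to the NORMAL case, where runAutomaton is non-null;
  // it returns null if no accepted term is <= the input.
  static BytesRef floorExample(CompiledAutomaton normal, BytesRef input) {
    return normal.floor(input, new BytesRef());
  }
}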
A shortcut * to {@link #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs)} with * pruning options turned off. */ public Builder(FST.INPUT_TYPE inputType, Outputs outputs) { - this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs); + this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null); } /** @@ -120,9 +130,11 @@ public class Builder { * singleton output object. */ public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix, - boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs outputs) { + boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs outputs, + FreezeTail freezeTail) { this.minSuffixCount1 = minSuffixCount1; this.minSuffixCount2 = minSuffixCount2; + this.freezeTail = freezeTail; this.doShareNonSingletonNodes = doShareNonSingletonNodes; this.shareMaxTailLength = shareMaxTailLength; fst = new FST(inputType, outputs); @@ -179,94 +191,100 @@ public class Builder { return fn; } - private void compilePrevTail(int prefixLenPlus1) throws IOException { - assert prefixLenPlus1 >= 1; - //System.out.println(" compileTail " + prefixLenPlus1); - for(int idx=lastInput.length; idx >= prefixLenPlus1; idx--) { - boolean doPrune = false; - boolean doCompile = false; + private void freezeTail(int prefixLenPlus1) throws IOException { + if (freezeTail != null) { + // Custom plugin: + freezeTail.freeze(frontier, prefixLenPlus1, lastInput); + } else { + //System.out.println(" compileTail " + prefixLenPlus1); + final int downTo = Math.max(1, prefixLenPlus1); + for(int idx=lastInput.length; idx >= downTo; idx--) { - final UnCompiledNode node = frontier[idx]; - final UnCompiledNode parent = frontier[idx-1]; + boolean doPrune = false; + boolean doCompile = false; - if (node.inputCount < minSuffixCount1) { - doPrune = true; - doCompile = true; - } else if (idx > prefixLenPlus1) { - // prune if parent's inputCount is less than suffixMinCount2 - if (parent.inputCount < minSuffixCount2 || minSuffixCount2 == 1 && parent.inputCount == 1) { - // my parent, about to be compiled, doesn't make the cut, so - // I'm definitely pruned + final UnCompiledNode node = frontier[idx]; + final UnCompiledNode parent = frontier[idx-1]; - // if pruneCount2 is 1, we keep only up - // until the 'distinguished edge', ie we keep only the - // 'divergent' part of the FST. if my parent, about to be - // compiled, has inputCount 1 then we are already past the - // distinguished edge. NOTE: this only works if - // the FST outputs are not "compressible" (simple - // ords ARE compressible). + if (node.inputCount < minSuffixCount1) { doPrune = true; + doCompile = true; + } else if (idx > prefixLenPlus1) { + // prune if parent's inputCount is less than suffixMinCount2 + if (parent.inputCount < minSuffixCount2 || (minSuffixCount2 == 1 && parent.inputCount == 1 && idx > 1)) { + // my parent, about to be compiled, doesn't make the cut, so + // I'm definitely pruned + + // if minSuffixCount2 is 1, we keep only up + // until the 'distinguished edge', ie we keep only the + // 'divergent' part of the FST. if my parent, about to be + // compiled, has inputCount 1 then we are already past the + // distinguished edge. NOTE: this only works if + // the FST outputs are not "compressible" (simple + // ords ARE compressible). 
+ doPrune = true; + } else { + // my parent, about to be compiled, does make the cut, so + // I'm definitely not pruned + doPrune = false; + } + doCompile = true; } else { - // my parent, about to be compiled, does make the cut, so - // I'm definitely not pruned - doPrune = false; + // if pruning is disabled (count is 0) we can always + // compile current node + doCompile = minSuffixCount2 == 0; } - doCompile = true; - } else { - // if pruning is disabled (count is 0) we can always - // compile current node - doCompile = minSuffixCount2 == 0; - } - //System.out.println(" label=" + ((char) lastInput.ints[lastInput.offset+idx-1]) + " idx=" + idx + " inputCount=" + frontier[idx].inputCount + " doCompile=" + doCompile + " doPrune=" + doPrune); + //System.out.println(" label=" + ((char) lastInput.ints[lastInput.offset+idx-1]) + " idx=" + idx + " inputCount=" + frontier[idx].inputCount + " doCompile=" + doCompile + " doPrune=" + doPrune); - if (node.inputCount < minSuffixCount2 || minSuffixCount2 == 1 && node.inputCount == 1) { - // drop all arcs - for(int arcIdx=0;arcIdx target = (UnCompiledNode) node.arcs[arcIdx].target; - target.clear(); + if (node.inputCount < minSuffixCount2 || (minSuffixCount2 == 1 && node.inputCount == 1 && idx > 1)) { + // drop all arcs + for(int arcIdx=0;arcIdx target = (UnCompiledNode) node.arcs[arcIdx].target; + target.clear(); + } + node.numArcs = 0; } - node.numArcs = 0; - } - if (doPrune) { - // this node doesn't make it -- deref it - node.clear(); - parent.deleteLast(lastInput.ints[lastInput.offset+idx-1], node); - } else { - - if (minSuffixCount2 != 0) { - compileAllTargets(node, lastInput.length-idx); - } - final T nextFinalOutput = node.output; - - // We "fake" the node as being final if it has no - // outgoing arcs; in theory we could leave it - // as non-final (the FST can represent this), but - // FSTEnum, Util, etc., have trouble w/ non-final - // dead-end states: - final boolean isFinal = node.isFinal || node.numArcs == 0; - - if (doCompile) { - // this node makes it and we now compile it. first, - // compile any targets that were previously - // undecided: - parent.replaceLast(lastInput.ints[lastInput.offset + idx-1], - compileNode(node, 1+lastInput.length-idx), - nextFinalOutput, - isFinal); + if (doPrune) { + // this node doesn't make it -- deref it + node.clear(); + parent.deleteLast(lastInput.ints[lastInput.offset+idx-1], node); } else { - // replaceLast just to install - // nextFinalOutput/isFinal onto the arc - parent.replaceLast(lastInput.ints[lastInput.offset + idx-1], - node, - nextFinalOutput, - isFinal); - // this node will stay in play for now, since we are - // undecided on whether to prune it. later, it - // will be either compiled or pruned, so we must - // allocate a new node: - frontier[idx] = new UnCompiledNode(this, idx); + + if (minSuffixCount2 != 0) { + compileAllTargets(node, lastInput.length-idx); + } + final T nextFinalOutput = node.output; + + // We "fake" the node as being final if it has no + // outgoing arcs; in theory we could leave it + // as non-final (the FST can represent this), but + // FSTEnum, Util, etc., have trouble w/ non-final + // dead-end states: + final boolean isFinal = node.isFinal || node.numArcs == 0; + + if (doCompile) { + // this node makes it and we now compile it. 
first, + // compile any targets that were previously + // undecided: + parent.replaceLast(lastInput.ints[lastInput.offset + idx-1], + compileNode(node, 1+lastInput.length-idx), + nextFinalOutput, + isFinal); + } else { + // replaceLast just to install + // nextFinalOutput/isFinal onto the arc + parent.replaceLast(lastInput.ints[lastInput.offset + idx-1], + node, + nextFinalOutput, + isFinal); + // this node will stay in play for now, since we are + // undecided on whether to prune it. later, it + // will be either compiled or pruned, so we must + // allocate a new node: + frontier[idx] = new UnCompiledNode(this, idx); + } } } } @@ -320,11 +338,36 @@ public class Builder { add(scratchIntsRef, output); } + // for debugging + /* + private String toString(BytesRef b) { + try { + return b.utf8ToString() + " " + b; + } catch (Throwable t) { + return b.toString(); + } + } + */ + /** It's OK to add the same input twice in a row with * different outputs, as long as outputs impls the merge * method. */ public void add(IntsRef input, T output) throws IOException { - //System.out.println("\nFST ADD: input=" + input + " output=" + fst.outputs.outputToString(output)); + /* + if (DEBUG) { + BytesRef b = new BytesRef(input.length); + for(int x=0;x { * nothing is accepted by the FST. */ public FST finish() throws IOException { + final UnCompiledNode root = frontier[0]; + // minimize nodes in the last word's suffix - compilePrevTail(1); - //System.out.println("finish: inputCount=" + frontier[0].inputCount); - if (frontier[0].inputCount < minSuffixCount1 || frontier[0].inputCount < minSuffixCount2 || frontier[0].numArcs == 0) { + freezeTail(0); + if (root.inputCount < minSuffixCount1 || root.inputCount < minSuffixCount2 || root.numArcs == 0) { if (fst.emptyOutput == null) { return null; } else if (minSuffixCount1 > 0 || minSuffixCount2 > 0) { // empty string got pruned return null; - } else { - fst.finish(compileNode(frontier[0], lastInput.length).address); - //System.out.println("compile addr = " + fst.getStartNode()); - return fst; } } else { if (minSuffixCount2 != 0) { - compileAllTargets(frontier[0], lastInput.length); + compileAllTargets(root, lastInput.length); } - //System.out.println("NOW: " + frontier[0].numArcs); - fst.finish(compileNode(frontier[0], lastInput.length).address); } + //if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output); + fst.finish(compileNode(root, lastInput.length).address); - /* - if (dedupHash != null) { - System.out.println("NH: " + dedupHash.count()); - } - */ - return fst; } @@ -479,7 +512,7 @@ public class Builder { } } - static class Arc { + public static class Arc { public int label; // really an "unsigned" byte public Node target; public boolean isFinal; @@ -502,16 +535,20 @@ public class Builder { } } - static final class UnCompiledNode implements Node { + public static final class UnCompiledNode implements Node { final Builder owner; - int numArcs; - Arc[] arcs; - T output; - boolean isFinal; - long inputCount; + public int numArcs; + public Arc[] arcs; + // TODO: instead of recording isFinal/output on the + // node, maybe we should use -1 arc to mean "end" (like + // we do when reading the FST). Would simplify much + // code here... + public T output; + public boolean isFinal; + public long inputCount; /** This node's depth, starting from the automaton root. 
*/ - final int depth; + public final int depth; /** * @param depth diff --git a/lucene/src/java/org/apache/lucene/util/fst/ByteSequenceOutputs.java b/lucene/src/java/org/apache/lucene/util/fst/ByteSequenceOutputs.java index 89690908201..10f566c5396 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/ByteSequenceOutputs.java +++ b/lucene/src/java/org/apache/lucene/util/fst/ByteSequenceOutputs.java @@ -133,6 +133,6 @@ public final class ByteSequenceOutputs extends Outputs { @Override public String outputToString(BytesRef output) { - return output.utf8ToString(); + return output.toString(); } } diff --git a/lucene/src/java/org/apache/lucene/util/fst/FST.java b/lucene/src/java/org/apache/lucene/util/fst/FST.java index 3422382ac28..308dbf19c83 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/src/java/org/apache/lucene/util/fst/FST.java @@ -123,7 +123,7 @@ public class FST { public int label; public T output; - int target; + public int target; byte flags; public T nextFinalOutput; @@ -274,6 +274,10 @@ public class FST { } } + public T getEmptyOutput() { + return emptyOutput; + } + void setEmptyOutput(T v) throws IOException { if (emptyOutput != null) { emptyOutput = outputs.merge(emptyOutput, v); @@ -597,9 +601,9 @@ public class FST { arc.label = END_LABEL; arc.output = follow.nextFinalOutput; if (follow.target <= 0) { - arc.flags = BIT_LAST_ARC; + arc.flags = BIT_LAST_ARC | BIT_FINAL_ARC; } else { - arc.flags = 0; + arc.flags = BIT_FINAL_ARC; arc.nextArc = follow.target; } //System.out.println(" insert isFinal; nextArc=" + follow.target + " isLast=" + arc.isLast() + " output=" + outputs.outputToString(arc.output)); @@ -609,8 +613,7 @@ public class FST { } } - // Not private because NodeHash needs access: - Arc readFirstRealArc(int address, Arc arc) throws IOException { + public Arc readFirstRealArc(int address, Arc arc) throws IOException { final BytesReader in = getBytesReader(address); @@ -693,7 +696,9 @@ public class FST { return readLabel(in); } - Arc readNextRealArc(Arc arc, final BytesReader in) throws IOException { + /** Never returns null, but you should never call this if + * arc.isLast() is true. */ + public Arc readNextRealArc(Arc arc, final BytesReader in) throws IOException { // this is a continuing arc in a fixed array if (arc.bytesPerArc != 0) { // arcs are at fixed entries @@ -925,7 +930,7 @@ public class FST { } } - final BytesReader getBytesReader(int pos) { + public final BytesReader getBytesReader(int pos) { // TODO: maybe re-use via ThreadLocal? 
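Builder's expanded constructor above adds an optional FreezeTail hook (BlockTreeTermsWriter supplies its own; null keeps the default freezeTail() behaviour). A minimal sketch of driving the new constructor directly; it assumes PositiveIntOutputs' boolean-arg getSingleton factory from this codebase, and inputs must be added in sorted order:

import java.io.IOException;

import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;

public class BuilderSketch {

  // Map an ASCII string onto the int labels a BYTE1 FST expects (0..255 each).
  static IntsRef toIntsRef(String s) {
    IntsRef ref = new IntsRef();
    ref.ints = new int[s.length()];
    for (int i = 0; i < s.length(); i++) {
      ref.ints[i] = s.charAt(i) & 0xFF;
    }
    ref.length = s.length();
    return ref;
  }

  public static void main(String[] args) throws IOException {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
    // Same arguments the no-pruning shortcut constructor passes; the trailing
    // null means "no custom FreezeTail", i.e. the default freezeTail() logic.
    Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
                                              Integer.MAX_VALUE, outputs, null);
    // inputs must arrive in sorted order
    builder.add(toIntsRef("cat"), 5L);
    builder.add(toIntsRef("dog"), 7L);
    builder.add(toIntsRef("dogs"), 12L);

    FST<Long> fst = builder.finish();
    System.out.println("built FST: " + (fst != null));
  }
}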
return new BytesReader(pos); } diff --git a/lucene/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java b/lucene/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java index 1bae8f9a6cc..4908301f99d 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java +++ b/lucene/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java @@ -39,8 +39,8 @@ import org.apache.lucene.store.DataOutput; public final class UpToTwoPositiveIntOutputs extends Outputs { public final static class TwoLongs { - final long first; - final long second; + public final long first; + public final long second; public TwoLongs(long first, long second) { this.first = first; diff --git a/lucene/src/java/org/apache/lucene/util/fst/Util.java b/lucene/src/java/org/apache/lucene/util/fst/Util.java index 2101f894def..8ec209cf7b1 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/Util.java +++ b/lucene/src/java/org/apache/lucene/util/fst/Util.java @@ -213,6 +213,7 @@ public final class Util { // Shape for states. final String stateShape = "circle"; + final String finalStateShape = "doublecircle"; // Emit DOT prologue. out.write("digraph FST {\n"); @@ -223,12 +224,34 @@ public final class Util { } emitDotState(out, "initial", "point", "white", ""); - emitDotState(out, Integer.toString(startArc.target), stateShape, - fst.isExpandedTarget(startArc) ? expandedNodeColor : null, - ""); - out.write(" initial -> " + startArc.target + "\n"); final T NO_OUTPUT = fst.outputs.getNoOutput(); + + // final FST.Arc scratchArc = new FST.Arc(); + + { + final String stateColor; + if (fst.isExpandedTarget(startArc)) { + stateColor = expandedNodeColor; + } else { + stateColor = null; + } + + final boolean isFinal; + final T finalOutput; + if (startArc.isFinal()) { + isFinal = true; + finalOutput = startArc.nextFinalOutput == NO_OUTPUT ? null : startArc.nextFinalOutput; + } else { + isFinal = false; + finalOutput = null; + } + + emitDotState(out, Integer.toString(startArc.target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.outputs.outputToString(finalOutput)); + } + + out.write(" initial -> " + startArc.target + "\n"); + int level = 0; while (!nextLevelQueue.isEmpty()) { @@ -240,19 +263,48 @@ public final class Util { out.write("\n // Transitions and states at level: " + level + "\n"); while (!thisLevelQueue.isEmpty()) { final FST.Arc arc = thisLevelQueue.remove(thisLevelQueue.size() - 1); - if (fst.targetHasArcs(arc)) { // scan all arcs final int node = arc.target; fst.readFirstTargetArc(arc, arc); - + + if (arc.label == FST.END_LABEL) { + // Skip it -- prior recursion took this into account already + assert !arc.isLast(); + fst.readNextArc(arc); + } + while (true) { + // Emit the unseen state and add it to the queue for the next level. if (arc.target >= 0 && !seen.get(arc.target)) { - final boolean isExpanded = fst.isExpandedTarget(arc); - emitDotState(out, Integer.toString(arc.target), stateShape, - isExpanded ? expandedNodeColor : null, - labelStates ? Integer.toString(arc.target) : ""); + + /* + boolean isFinal = false; + T finalOutput = null; + fst.readFirstTargetArc(arc, scratchArc); + if (scratchArc.isFinal() && fst.targetHasArcs(scratchArc)) { + // target is final + isFinal = true; + finalOutput = scratchArc.output == NO_OUTPUT ? 
null : scratchArc.output; + System.out.println("dot hit final label=" + (char) scratchArc.label); + } + */ + final String stateColor; + if (fst.isExpandedTarget(arc)) { + stateColor = expandedNodeColor; + } else { + stateColor = null; + } + + final String finalOutput; + if (arc.nextFinalOutput != null && arc.nextFinalOutput != NO_OUTPUT) { + finalOutput = fst.outputs.outputToString(arc.nextFinalOutput); + } else { + finalOutput = ""; + } + + emitDotState(out, Integer.toString(arc.target), arc.isFinal() ? finalStateShape : stateShape, stateColor, finalOutput); seen.set(arc.target); nextLevelQueue.add(new FST.Arc().copyFrom(arc)); sameLevelStates.add(arc.target); @@ -265,15 +317,19 @@ public final class Util { outs = ""; } - final String cl; - if (arc.label == FST.END_LABEL) { - cl = "~"; - } else { - cl = printableLabel(arc.label); + if (!fst.targetHasArcs(arc) && arc.isFinal() && arc.nextFinalOutput != NO_OUTPUT) { + // Tricky special case: sometimes, due to + // pruning, the builder can [sillily] produce + // an FST with an arc into the final end state + // (-1) but also with a next final output; in + // this case we pull that output up onto this + // arc + outs = outs + "/[" + fst.outputs.outputToString(arc.nextFinalOutput) + "]"; } - out.write(" " + node + " -> " + arc.target + " [label=\"" + cl + outs + "\"]\n"); - + assert arc.label != FST.END_LABEL; + out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"]\n"); + // Break the loop if we're on the last arc of this state. if (arc.isLast()) { break; @@ -295,7 +351,7 @@ public final class Util { } // Emit terminating state (always there anyway). - out.write(" -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n"); + out.write(" -1 [style=filled, color=black, shape=doublecircle, label=\"\"]\n\n"); out.write(" {rank=sink; -1 }\n"); out.write("}\n"); diff --git a/lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java b/lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java index 40b9fa30b38..77896070ad2 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java +++ b/lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java @@ -60,7 +60,7 @@ public class RandomIndexWriter implements Closeable { private final Random r; - public MockIndexWriter(Random r,Directory dir, IndexWriterConfig conf) throws IOException { + public MockIndexWriter(Random r, Directory dir, IndexWriterConfig conf) throws IOException { super(dir, conf); // must make a private random since our methods are // called from different threads; else test failures may diff --git a/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java b/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java index e665e82f02c..087d3be419b 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java +++ b/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java @@ -30,8 +30,8 @@ import org.apache.lucene.index.codecs.FieldsProducer; import org.apache.lucene.index.codecs.sep.IntStreamFactory; import org.apache.lucene.index.codecs.sep.IntIndexInput; import org.apache.lucene.index.codecs.sep.IntIndexOutput; -import org.apache.lucene.index.codecs.sep.SepPostingsReaderImpl; -import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl; +import org.apache.lucene.index.codecs.sep.SepPostingsReader; 
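The Util.toDot changes above render final states as doublecircle nodes and pull final outputs onto the emitted states and arcs. A small sketch of dumping an FST to GraphViz, assuming the existing toDot(FST, Writer, boolean sameRank, boolean labelStates) entry point; the parameter order is an assumption here:

import java.io.IOException;
import java.io.StringWriter;

import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;

public class ToDotSketch {
  // Render any FST as GraphViz "dot"; feed the result to dot -Tpng to inspect
  // how final states (now doublecircle) and final outputs are drawn.
  static <T> String toDot(FST<T> fst) throws IOException {
    StringWriter w = new StringWriter();
    Util.toDot(fst, w, false, true);   // sameRank=false, labelStates=true (assumed order)
    return w.toString();
  }
}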
+import org.apache.lucene.index.codecs.sep.SepPostingsWriter; import org.apache.lucene.index.codecs.standard.StandardCodec; import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexInput; import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexOutput; @@ -127,7 +127,7 @@ public class MockFixedIntBlockCodec extends Codec { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new SepPostingsWriterImpl(state, new MockIntFactory(blockSize)); + PostingsWriterBase postingsWriter = new SepPostingsWriter(state, new MockIntFactory(blockSize)); boolean success = false; TermsIndexWriterBase indexWriter; @@ -158,10 +158,10 @@ public class MockFixedIntBlockCodec extends Codec { @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new SepPostingsReaderImpl(state.dir, - state.segmentInfo, - state.context, - new MockIntFactory(blockSize), state.codecId); + PostingsReaderBase postingsReader = new SepPostingsReader(state.dir, + state.segmentInfo, + state.context, + new MockIntFactory(blockSize), state.codecId); TermsIndexReaderBase indexReader; boolean success = false; @@ -204,7 +204,7 @@ public class MockFixedIntBlockCodec extends Codec { @Override public void files(Directory dir, SegmentInfo segmentInfo, int codecId, Set files) throws IOException { - SepPostingsReaderImpl.files(segmentInfo, codecId, files); + SepPostingsReader.files(segmentInfo, codecId, files); BlockTermsReader.files(dir, segmentInfo, codecId, files); FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files, getDocValuesUseCFS()); @@ -212,7 +212,7 @@ public class MockFixedIntBlockCodec extends Codec { @Override public void getExtensions(Set extensions) { - SepPostingsWriterImpl.getExtensions(extensions); + SepPostingsWriter.getExtensions(extensions); BlockTermsReader.getExtensions(extensions); FixedGapTermsIndexReader.getIndexExtensions(extensions); DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS()); diff --git a/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java b/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java index 6d15b92cd43..38950cae14c 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java +++ b/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java @@ -30,8 +30,8 @@ import org.apache.lucene.index.codecs.FieldsProducer; import org.apache.lucene.index.codecs.sep.IntStreamFactory; import org.apache.lucene.index.codecs.sep.IntIndexInput; import org.apache.lucene.index.codecs.sep.IntIndexOutput; -import org.apache.lucene.index.codecs.sep.SepPostingsReaderImpl; -import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl; +import org.apache.lucene.index.codecs.sep.SepPostingsReader; +import org.apache.lucene.index.codecs.sep.SepPostingsWriter; import org.apache.lucene.index.codecs.standard.StandardCodec; import org.apache.lucene.index.codecs.intblock.VariableIntBlockIndexInput; import org.apache.lucene.index.codecs.intblock.VariableIntBlockIndexOutput; @@ -150,7 +150,7 @@ public class MockVariableIntBlockCodec extends Codec { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = 
new SepPostingsWriterImpl(state, new MockIntFactory(baseBlockSize)); + PostingsWriterBase postingsWriter = new SepPostingsWriter(state, new MockIntFactory(baseBlockSize)); boolean success = false; TermsIndexWriterBase indexWriter; @@ -181,10 +181,10 @@ public class MockVariableIntBlockCodec extends Codec { @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new SepPostingsReaderImpl(state.dir, - state.segmentInfo, - state.context, - new MockIntFactory(baseBlockSize), state.codecId); + PostingsReaderBase postingsReader = new SepPostingsReader(state.dir, + state.segmentInfo, + state.context, + new MockIntFactory(baseBlockSize), state.codecId); TermsIndexReaderBase indexReader; boolean success = false; @@ -227,7 +227,7 @@ public class MockVariableIntBlockCodec extends Codec { @Override public void files(Directory dir, SegmentInfo segmentInfo, int codecId, Set files) throws IOException { - SepPostingsReaderImpl.files(segmentInfo, codecId, files); + SepPostingsReader.files(segmentInfo, codecId, files); BlockTermsReader.files(dir, segmentInfo, codecId, files); FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files, getDocValuesUseCFS()); @@ -235,7 +235,7 @@ public class MockVariableIntBlockCodec extends Codec { @Override public void getExtensions(Set extensions) { - SepPostingsWriterImpl.getExtensions(extensions); + SepPostingsWriter.getExtensions(extensions); BlockTermsReader.getExtensions(extensions); FixedGapTermsIndexReader.getIndexExtensions(extensions); DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS()); diff --git a/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java b/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java index c15865c7aa8..23a542e7e37 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java +++ b/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java @@ -30,6 +30,8 @@ import org.apache.lucene.index.PerDocWriteState; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.codecs.BlockTreeTermsReader; +import org.apache.lucene.index.codecs.BlockTreeTermsWriter; import org.apache.lucene.index.codecs.BlockTermsReader; import org.apache.lucene.index.codecs.BlockTermsWriter; import org.apache.lucene.index.codecs.Codec; @@ -51,13 +53,13 @@ import org.apache.lucene.index.codecs.VariableGapTermsIndexWriter; import org.apache.lucene.index.codecs.mockintblock.MockFixedIntBlockCodec; import org.apache.lucene.index.codecs.mockintblock.MockVariableIntBlockCodec; import org.apache.lucene.index.codecs.mocksep.MockSingleIntFactory; -import org.apache.lucene.index.codecs.pulsing.PulsingPostingsReaderImpl; -import org.apache.lucene.index.codecs.pulsing.PulsingPostingsWriterImpl; +import org.apache.lucene.index.codecs.pulsing.PulsingPostingsReader; +import org.apache.lucene.index.codecs.pulsing.PulsingPostingsWriter; import org.apache.lucene.index.codecs.sep.IntIndexInput; import org.apache.lucene.index.codecs.sep.IntIndexOutput; import org.apache.lucene.index.codecs.sep.IntStreamFactory; -import org.apache.lucene.index.codecs.sep.SepPostingsReaderImpl; -import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl; +import 
org.apache.lucene.index.codecs.sep.SepPostingsReader; +import org.apache.lucene.index.codecs.sep.SepPostingsWriter; import org.apache.lucene.index.codecs.standard.StandardPostingsReader; import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; import org.apache.lucene.store.Directory; @@ -137,7 +139,7 @@ public class MockRandomCodec extends Codec { final long seed = seedRandom.nextLong(); if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: writing to seg=" + state.segmentName + " seed=" + seed); + System.out.println("MockRandomCodec: writing to seg=" + state.segmentName + " codecID=" + state.codecId + " seed=" + seed); } final String seedFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, SEED_EXT); @@ -151,11 +153,10 @@ public class MockRandomCodec extends Codec { final Random random = new Random(seed); random.nextInt(); // consume a random for buffersize - - PostingsWriterBase postingsWriter; + PostingsWriterBase postingsWriter; if (random.nextBoolean()) { - postingsWriter = new SepPostingsWriterImpl(state, new MockIntStreamFactory(random), skipInterval); + postingsWriter = new SepPostingsWriter(state, new MockIntStreamFactory(random), skipInterval); } else { if (LuceneTestCase.VERBOSE) { System.out.println("MockRandomCodec: writing Standard postings"); @@ -166,76 +167,107 @@ public class MockRandomCodec extends Codec { if (random.nextBoolean()) { final int totTFCutoff = _TestUtil.nextInt(random, 1, 20); if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: pulsing postings with totTFCutoff=" + totTFCutoff); + System.out.println("MockRandomCodec: writing pulsing postings with totTFCutoff=" + totTFCutoff); } - postingsWriter = new PulsingPostingsWriterImpl(totTFCutoff, postingsWriter); + postingsWriter = new PulsingPostingsWriter(totTFCutoff, postingsWriter); } - final TermsIndexWriterBase indexWriter; - boolean success = false; + final FieldsConsumer fields; - try { - if (random.nextBoolean()) { - state.termIndexInterval = _TestUtil.nextInt(random, 1, 100); - if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: fixed-gap terms index (tii=" + state.termIndexInterval + ")"); - } - indexWriter = new FixedGapTermsIndexWriter(state); - } else { - final VariableGapTermsIndexWriter.IndexTermSelector selector; - final int n2 = random.nextInt(3); - if (n2 == 0) { - final int tii = _TestUtil.nextInt(random, 1, 100); - selector = new VariableGapTermsIndexWriter.EveryNTermSelector(tii); - if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: variable-gap terms index (tii=" + tii + ")"); - } - } else if (n2 == 1) { - final int docFreqThresh = _TestUtil.nextInt(random, 2, 100); - final int tii = _TestUtil.nextInt(random, 1, 100); - selector = new VariableGapTermsIndexWriter.EveryNOrDocFreqTermSelector(docFreqThresh, tii); - } else { - final long seed2 = random.nextLong(); - final int gap = _TestUtil.nextInt(random, 2, 40); - if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: random-gap terms index (max gap=" + gap + ")"); - } - selector = new VariableGapTermsIndexWriter.IndexTermSelector() { - final Random rand = new Random(seed2); + if (random.nextBoolean()) { + // Use BlockTree terms dict - @Override - public boolean isIndexTerm(BytesRef term, TermStats stats) { - return rand.nextInt(gap) == gap/2; - } - - @Override - public void newField(FieldInfo fieldInfo) { - } - }; - } - indexWriter = new VariableGapTermsIndexWriter(state, selector); + if (LuceneTestCase.VERBOSE) { + 
System.out.println("MockRandomCodec: writing BlockTree terms dict"); } - success = true; - } finally { - if (!success) { - postingsWriter.close(); - } - } - success = false; - try { - FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter); - success = true; - return ret; - } finally { - if (!success) { - try { + // TODO: would be nice to allow 1 but this is very + // slow to write + final int minTermsInBlock = _TestUtil.nextInt(random, 2, 100); + final int maxTermsInBlock = Math.max(2, (minTermsInBlock-1)*2 + random.nextInt(100)); + + boolean success = false; + try { + fields = new BlockTreeTermsWriter(state, postingsWriter, minTermsInBlock, maxTermsInBlock); + success = true; + } finally { + if (!success) { postingsWriter.close(); - } finally { - indexWriter.close(); + } + } + } else { + + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: writing Block terms dict"); + } + + boolean success = false; + + final TermsIndexWriterBase indexWriter; + try { + if (random.nextBoolean()) { + state.termIndexInterval = _TestUtil.nextInt(random, 1, 100); + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: fixed-gap terms index (tii=" + state.termIndexInterval + ")"); + } + indexWriter = new FixedGapTermsIndexWriter(state); + } else { + final VariableGapTermsIndexWriter.IndexTermSelector selector; + final int n2 = random.nextInt(3); + if (n2 == 0) { + final int tii = _TestUtil.nextInt(random, 1, 100); + selector = new VariableGapTermsIndexWriter.EveryNTermSelector(tii); + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: variable-gap terms index (tii=" + tii + ")"); + } + } else if (n2 == 1) { + final int docFreqThresh = _TestUtil.nextInt(random, 2, 100); + final int tii = _TestUtil.nextInt(random, 1, 100); + selector = new VariableGapTermsIndexWriter.EveryNOrDocFreqTermSelector(docFreqThresh, tii); + } else { + final long seed2 = random.nextLong(); + final int gap = _TestUtil.nextInt(random, 2, 40); + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: random-gap terms index (max gap=" + gap + ")"); + } + selector = new VariableGapTermsIndexWriter.IndexTermSelector() { + final Random rand = new Random(seed2); + + @Override + public boolean isIndexTerm(BytesRef term, TermStats stats) { + return rand.nextInt(gap) == gap/2; + } + + @Override + public void newField(FieldInfo fieldInfo) { + } + }; + } + indexWriter = new VariableGapTermsIndexWriter(state, selector); + } + success = true; + } finally { + if (!success) { + postingsWriter.close(); + } + } + + success = false; + try { + fields = new BlockTermsWriter(indexWriter, state, postingsWriter); + success = true; + } finally { + if (!success) { + try { + postingsWriter.close(); + } finally { + indexWriter.close(); + } } } } + + return fields; } @Override @@ -245,7 +277,7 @@ public class MockRandomCodec extends Codec { final IndexInput in = state.dir.openInput(seedFileName, state.context); final long seed = in.readLong(); if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: reading from seg=" + state.segmentInfo.name + " seed=" + seed); + System.out.println("MockRandomCodec: reading from seg=" + state.segmentInfo.name + " codecID=" + state.codecId + " seed=" + seed); } in.close(); @@ -259,8 +291,11 @@ public class MockRandomCodec extends Codec { PostingsReaderBase postingsReader; if (random.nextBoolean()) { - postingsReader = new SepPostingsReaderImpl(state.dir, state.segmentInfo, - state.context, new MockIntStreamFactory(random), 
state.codecId); + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: reading Sep postings"); + } + postingsReader = new SepPostingsReader(state.dir, state.segmentInfo, + state.context, new MockIntStreamFactory(random), state.codecId); } else { if (LuceneTestCase.VERBOSE) { System.out.println("MockRandomCodec: reading Standard postings"); @@ -273,86 +308,119 @@ public class MockRandomCodec extends Codec { if (LuceneTestCase.VERBOSE) { System.out.println("MockRandomCodec: reading pulsing postings with totTFCutoff=" + totTFCutoff); } - postingsReader = new PulsingPostingsReaderImpl(postingsReader); + postingsReader = new PulsingPostingsReader(postingsReader); } - final TermsIndexReaderBase indexReader; - boolean success = false; + final FieldsProducer fields; - try { - if (random.nextBoolean()) { - // if termsIndexDivisor is set to -1, we should not touch it. It means a - // test explicitly instructed not to load the terms index. - if (state.termsIndexDivisor != -1) { - state.termsIndexDivisor = _TestUtil.nextInt(random, 1, 10); - } - if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: fixed-gap terms index (divisor=" + state.termsIndexDivisor + ")"); - } - indexReader = new FixedGapTermsIndexReader(state.dir, - state.fieldInfos, - state.segmentInfo.name, - state.termsIndexDivisor, - BytesRef.getUTF8SortedAsUnicodeComparator(), - state.codecId, state.context); - } else { - final int n2 = random.nextInt(3); - if (n2 == 1) { - random.nextInt(); - } else if (n2 == 2) { - random.nextLong(); - } - if (LuceneTestCase.VERBOSE) { - System.out.println("MockRandomCodec: variable-gap terms index (divisor=" + state.termsIndexDivisor + ")"); - } - if (state.termsIndexDivisor != -1) { - state.termsIndexDivisor = _TestUtil.nextInt(random, 1, 10); - } - indexReader = new VariableGapTermsIndexReader(state.dir, - state.fieldInfos, - state.segmentInfo.name, - state.termsIndexDivisor, - state.codecId, state.context); + if (random.nextBoolean()) { + // Use BlockTree terms dict + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: reading BlockTree terms dict"); } - success = true; - } finally { - if (!success) { - postingsReader.close(); - } - } - final int termsCacheSize = _TestUtil.nextInt(random, 1, 1024); - - success = false; - try { - FieldsProducer ret = new BlockTermsReader(indexReader, - state.dir, - state.fieldInfos, - state.segmentInfo.name, - postingsReader, - state.context, - termsCacheSize, - state.codecId); - success = true; - return ret; - } finally { - if (!success) { - try { + boolean success = false; + try { + fields = new BlockTreeTermsReader(state.dir, + state.fieldInfos, + state.segmentInfo.name, + postingsReader, + state.context, + state.codecId, + state.termsIndexDivisor); + success = true; + } finally { + if (!success) { postingsReader.close(); - } finally { - indexReader.close(); + } + } + } else { + + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: reading Block terms dict"); + } + final TermsIndexReaderBase indexReader; + boolean success = false; + try { + final boolean doFixedGap = random.nextBoolean(); + + // randomness diverges from writer, here: + if (state.termsIndexDivisor != -1) { + state.termsIndexDivisor = _TestUtil.nextInt(random, 1, 10); + } + + if (doFixedGap) { + // if termsIndexDivisor is set to -1, we should not touch it. It means a + // test explicitly instructed not to load the terms index. 
+ if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: fixed-gap terms index (divisor=" + state.termsIndexDivisor + ")"); + } + indexReader = new FixedGapTermsIndexReader(state.dir, + state.fieldInfos, + state.segmentInfo.name, + state.termsIndexDivisor, + BytesRef.getUTF8SortedAsUnicodeComparator(), + state.codecId, state.context); + } else { + final int n2 = random.nextInt(3); + if (n2 == 1) { + random.nextInt(); + } else if (n2 == 2) { + random.nextLong(); + } + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: variable-gap terms index (divisor=" + state.termsIndexDivisor + ")"); + } + indexReader = new VariableGapTermsIndexReader(state.dir, + state.fieldInfos, + state.segmentInfo.name, + state.termsIndexDivisor, + state.codecId, state.context); + + } + + success = true; + } finally { + if (!success) { + postingsReader.close(); + } + } + + final int termsCacheSize = _TestUtil.nextInt(random, 1, 1024); + + success = false; + try { + fields = new BlockTermsReader(indexReader, + state.dir, + state.fieldInfos, + state.segmentInfo.name, + postingsReader, + state.context, + termsCacheSize, + state.codecId); + success = true; + } finally { + if (!success) { + try { + postingsReader.close(); + } finally { + indexReader.close(); + } } } } + + return fields; } @Override public void files(Directory dir, SegmentInfo segmentInfo, int codecId, Set files) throws IOException { final String seedFileName = IndexFileNames.segmentFileName(segmentInfo.name, codecId, SEED_EXT); files.add(seedFileName); - SepPostingsReaderImpl.files(segmentInfo, codecId, files); + SepPostingsReader.files(segmentInfo, codecId, files); StandardPostingsReader.files(dir, segmentInfo, codecId, files); BlockTermsReader.files(dir, segmentInfo, codecId, files); + BlockTreeTermsReader.files(dir, segmentInfo, codecId, files); FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); VariableGapTermsIndexReader.files(dir, segmentInfo, codecId, files); DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files, getDocValuesUseCFS()); @@ -369,8 +437,9 @@ public class MockRandomCodec extends Codec { @Override public void getExtensions(Set extensions) { - SepPostingsWriterImpl.getExtensions(extensions); + SepPostingsWriter.getExtensions(extensions); BlockTermsReader.getExtensions(extensions); + BlockTreeTermsReader.getExtensions(extensions); FixedGapTermsIndexReader.getIndexExtensions(extensions); VariableGapTermsIndexReader.getIndexExtensions(extensions); DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS()); diff --git a/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java b/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java index 30cd3643657..6b2884e2ff2 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java +++ b/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java @@ -40,8 +40,8 @@ import org.apache.lucene.index.codecs.BlockTermsWriter; import org.apache.lucene.index.codecs.TermsIndexReaderBase; import org.apache.lucene.index.codecs.TermsIndexWriterBase; import org.apache.lucene.index.codecs.standard.StandardCodec; -import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl; -import org.apache.lucene.index.codecs.sep.SepPostingsReaderImpl; +import org.apache.lucene.index.codecs.sep.SepPostingsWriter; +import org.apache.lucene.index.codecs.sep.SepPostingsReader; import org.apache.lucene.store.Directory; import 
org.apache.lucene.util.BytesRef; @@ -60,7 +60,7 @@ public class MockSepCodec extends Codec { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new SepPostingsWriterImpl(state, new MockSingleIntFactory()); + PostingsWriterBase postingsWriter = new SepPostingsWriter(state, new MockSingleIntFactory()); boolean success = false; TermsIndexWriterBase indexWriter; @@ -92,7 +92,7 @@ public class MockSepCodec extends Codec { @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new SepPostingsReaderImpl(state.dir, state.segmentInfo, + PostingsReaderBase postingsReader = new SepPostingsReader(state.dir, state.segmentInfo, state.context, new MockSingleIntFactory(), state.codecId); TermsIndexReaderBase indexReader; @@ -136,7 +136,7 @@ public class MockSepCodec extends Codec { @Override public void files(Directory dir, SegmentInfo segmentInfo, int codecId, Set files) throws IOException { - SepPostingsReaderImpl.files(segmentInfo, codecId, files); + SepPostingsReader.files(segmentInfo, codecId, files); BlockTermsReader.files(dir, segmentInfo, codecId, files); FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files, getDocValuesUseCFS()); @@ -149,7 +149,7 @@ public class MockSepCodec extends Codec { } public static void getSepExtensions(Set extensions) { - SepPostingsWriterImpl.getExtensions(extensions); + SepPostingsWriter.getExtensions(extensions); BlockTermsReader.getExtensions(extensions); FixedGapTermsIndexReader.getIndexExtensions(extensions); } diff --git a/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexOutput.java b/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexOutput.java index 7830b786a57..006299e9039 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexOutput.java +++ b/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexOutput.java @@ -66,6 +66,11 @@ public class MockSingleIntIndexOutput extends IntIndexOutput { out.close(); } + @Override + public String toString() { + return "MockSingleIntIndexOutput fp=" + out.getFilePointer(); + } + private class Index extends IntIndexOutput.Index { long fp; long lastFP; @@ -74,8 +79,11 @@ public class MockSingleIntIndexOutput extends IntIndexOutput { fp = out.getFilePointer(); } @Override - public void set(IntIndexOutput.Index other) { - lastFP = fp = ((Index) other).fp; + public void copyFrom(IntIndexOutput.Index other, boolean copyLast) { + fp = ((Index) other).fp; + if (copyLast) { + lastFP = ((Index) other).fp; + } } @Override public void write(IndexOutput indexOut, boolean absolute) diff --git a/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java b/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java index b9c8aceffb6..6c92fe5b14b 100644 --- a/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java +++ b/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java @@ -280,7 +280,11 @@ public abstract class LuceneTestCase extends Assert { } swapCodec(new MockSepCodec(), cp); - swapCodec(new PulsingCodec(codecHasParam && "Pulsing".equals(codec) ? 
codecParam : 1 + random.nextInt(20)), cp); + // TODO: make it possible to specify min/max iterms per + // block via CL: + int minItemsPerBlock = _TestUtil.nextInt(random, 2, 100); + int maxItemsPerBlock = 2*(Math.max(2, minItemsPerBlock-1)) + random.nextInt(100); + swapCodec(new PulsingCodec(codecHasParam && "Pulsing".equals(codec) ? codecParam : 1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock), cp); swapCodec(new MockFixedIntBlockCodec(codecHasParam && "MockFixedIntBlock".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 2000)), cp); // baseBlockSize cannot be over 127: swapCodec(new MockVariableIntBlockCodec(codecHasParam && "MockVariableIntBlock".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 127)), cp); @@ -307,7 +311,7 @@ public abstract class LuceneTestCase extends Assert { cp.unregister(cp.lookup("MockFixedIntBlock")); cp.unregister(cp.lookup("MockVariableIntBlock")); cp.unregister(cp.lookup("MockRandom")); - swapCodec(new PulsingCodec(1), cp); + swapCodec(new PulsingCodec(), cp); cp.setDefaultFieldCodec(savedDefaultCodec); } @@ -485,7 +489,7 @@ public abstract class LuceneTestCase extends Assert { System.err.println("NOTE: test params are: codec=" + codecDescription + ", locale=" + locale + ", timezone=" + (timeZone == null ? "(null)" : timeZone.getID())); - if (testsFailed) { + if (VERBOSE || testsFailed) { System.err.println("NOTE: all tests run in this JVM:"); System.err.println(Arrays.toString(testClassesRun.toArray())); System.err.println("NOTE: " + System.getProperty("os.name") + " " @@ -1561,9 +1565,17 @@ public abstract class LuceneTestCase extends Assert { RandomCodecProvider(Random random) { this.perFieldSeed = random.nextInt(); - register(randomizCodec(random, new StandardCodec())); + // TODO: make it possible to specify min/max iterms per + // block via CL: + int minItemsPerBlock = _TestUtil.nextInt(random, 2, 100); + int maxItemsPerBlock = 2*(Math.max(2, minItemsPerBlock-1)) + random.nextInt(100); + register(randomizCodec(random, new StandardCodec(minItemsPerBlock, maxItemsPerBlock))); register(randomizCodec(random, new PreFlexCodec())); - register(randomizCodec(random, new PulsingCodec( 1 + random.nextInt(20)))); + // TODO: make it possible to specify min/max iterms per + // block via CL: + minItemsPerBlock = _TestUtil.nextInt(random, 2, 100); + maxItemsPerBlock = 2*(Math.max(1, minItemsPerBlock-1)) + random.nextInt(100); + register(randomizCodec(random, new PulsingCodec( 1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock))); register(randomizCodec(random, new SimpleTextCodec())); register(randomizCodec(random, new MemoryCodec())); Collections.shuffle(knownCodecs, random); diff --git a/lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java b/lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java index 1977aa2d78b..0eee8b0d343 100644 --- a/lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java +++ b/lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java @@ -154,7 +154,7 @@ public class _TestUtil { public static CheckIndex.Status checkIndex(Directory dir, CodecProvider codecs) throws IOException { ByteArrayOutputStream bos = new ByteArrayOutputStream(1024); CheckIndex checker = new CheckIndex(dir); - checker.setInfoStream(new PrintStream(bos)); + checker.setInfoStream(new PrintStream(bos), false); CheckIndex.Status indexStatus = checker.checkIndex(null, codecs); if (indexStatus == null || indexStatus.clean == false) { System.out.println("CheckIndex failed"); diff --git 
a/lucene/src/test-framework/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java b/lucene/src/test-framework/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java new file mode 100644 index 00000000000..dad8236be29 --- /dev/null +++ b/lucene/src/test-framework/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java @@ -0,0 +1,361 @@ +package org.apache.lucene.util.automaton; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.*; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.UnicodeUtil; + +/** + * Builds a minimal deterministic automaton that accepts a set of strings. The + * algorithm requires sorted input data, but is very fast (nearly linear with + * the input size). + */ +public final class DaciukMihovAutomatonBuilder { + /** + * DFSA state with char labels on transitions. + */ + final static class State { + + /** An empty set of labels. */ + private final static int[] NO_LABELS = new int[0]; + + /** An empty set of states. */ + private final static State[] NO_STATES = new State[0]; + + /** + * Labels of outgoing transitions. Indexed identically to {@link #states}. + * Labels must be sorted lexicographically. + */ + int[] labels = NO_LABELS; + + /** + * States reachable from outgoing transitions. Indexed identically to + * {@link #labels}. + */ + State[] states = NO_STATES; + + /** + * true if this state corresponds to the end of at least one + * input sequence. + */ + boolean is_final; + + /** + * Returns the target state of a transition leaving this state and labeled + * with label. If no such transition exists, returns + * null. + */ + public State getState(int label) { + final int index = Arrays.binarySearch(labels, label); + return index >= 0 ? states[index] : null; + } + + /** + * Returns an array of outgoing transition labels. The array is sorted in + * lexicographic order and indexes correspond to states returned from + * {@link #getStates()}. + */ + public int[] getTransitionLabels() { + return this.labels; + } + + /** + * Returns an array of outgoing transitions from this state. The returned + * array must not be changed. + */ + public State[] getStates() { + return this.states; + } + + /** + * Two states are equal if: + *
+ * <ul> + * <li>they have an identical number of outgoing transitions, labeled with + * the same labels</li> + * <li>corresponding outgoing transitions lead to the same states (to states + * with an identical right-language).</li> + * </ul>
    + */ + @Override + public boolean equals(Object obj) { + final State other = (State) obj; + return is_final == other.is_final + && Arrays.equals(this.labels, other.labels) + && referenceEquals(this.states, other.states); + } + + /** + * Return true if this state has any children (outgoing + * transitions). + */ + public boolean hasChildren() { + return labels.length > 0; + } + + /** + * Is this state a final state in the automaton? + */ + public boolean isFinal() { + return is_final; + } + + /** + * Compute the hash code of the current status of this state. + */ + @Override + public int hashCode() { + int hash = is_final ? 1 : 0; + + hash ^= hash * 31 + this.labels.length; + for (int c : this.labels) + hash ^= hash * 31 + c; + + /* + * Compare the right-language of this state using reference-identity of + * outgoing states. This is possible because states are interned (stored + * in registry) and traversed in post-order, so any outgoing transitions + * are already interned. + */ + for (State s : this.states) { + hash ^= System.identityHashCode(s); + } + + return hash; + } + + /** + * Create a new outgoing transition labeled label and return + * the newly created target state for this transition. + */ + State newState(int label) { + assert Arrays.binarySearch(labels, label) < 0 : "State already has transition labeled: " + + label; + + labels = copyOf(labels, labels.length + 1); + states = copyOf(states, states.length + 1); + + labels[labels.length - 1] = label; + return states[states.length - 1] = new State(); + } + + /** + * Return the most recent transitions's target state. + */ + State lastChild() { + assert hasChildren() : "No outgoing transitions."; + return states[states.length - 1]; + } + + /** + * Return the associated state if the most recent transition is labeled with + * label. + */ + State lastChild(int label) { + final int index = labels.length - 1; + State s = null; + if (index >= 0 && labels[index] == label) { + s = states[index]; + } + assert s == getState(label); + return s; + } + + /** + * Replace the last added outgoing transition's target state with the given + * state. + */ + void replaceLastChild(State state) { + assert hasChildren() : "No outgoing transitions."; + states[states.length - 1] = state; + } + + /** + * JDK1.5-replacement of {@link Arrays#copyOf(int[], int)} + */ + private static int[] copyOf(int[] original, int newLength) { + int[] copy = new int[newLength]; + System.arraycopy(original, 0, copy, 0, + Math.min(original.length, newLength)); + return copy; + } + + /** + * JDK1.5-replacement of {@link Arrays#copyOf(char[], int)} + */ + public static State[] copyOf(State[] original, int newLength) { + State[] copy = new State[newLength]; + System.arraycopy(original, 0, copy, 0, + Math.min(original.length, newLength)); + return copy; + } + + /** + * Compare two lists of objects for reference-equality. + */ + private static boolean referenceEquals(Object[] a1, Object[] a2) { + if (a1.length != a2.length) return false; + + for (int i = 0; i < a1.length; i++) + if (a1[i] != a2[i]) return false; + + return true; + } + } + + /** + * "register" for state interning. + */ + private HashMap register = new HashMap(); + + /** + * Root automaton state. + */ + private State root = new State(); + + /** + * Previous sequence added to the automaton in {@link #add(CharSequence)}. + */ + private CharsRef previous; + + private static final Comparator comparator = CharsRef.getUTF16SortedAsUTF8Comparator(); + + /** + * Add another character sequence to this automaton. 
The sequence must be + * lexicographically larger or equal compared to any previous sequences added + * to this automaton (the input must be sorted). + */ + public void add(CharsRef current) { + assert register != null : "Automaton already built."; + assert previous == null + || comparator.compare(previous, current) <= 0 : "Input must be sorted: " + + previous + " >= " + current; + assert setPrevious(current); + + // Descend in the automaton (find matching prefix). + int pos = 0, max = current.length(); + State next, state = root; + while (pos < max && (next = state.lastChild(Character.codePointAt(current, pos))) != null) { + state = next; + // todo, optimize me + pos += Character.charCount(Character.codePointAt(current, pos)); + } + + if (state.hasChildren()) replaceOrRegister(state); + + addSuffix(state, current, pos); + } + + /** + * Finalize the automaton and return the root state. No more strings can be + * added to the builder after this call. + * + * @return Root automaton state. + */ + public State complete() { + if (this.register == null) throw new IllegalStateException(); + + if (root.hasChildren()) replaceOrRegister(root); + + register = null; + return root; + } + + /** + * Internal recursive traversal for conversion. + */ + private static org.apache.lucene.util.automaton.State convert(State s, + IdentityHashMap visited) { + org.apache.lucene.util.automaton.State converted = visited.get(s); + if (converted != null) return converted; + + converted = new org.apache.lucene.util.automaton.State(); + converted.setAccept(s.is_final); + + visited.put(s, converted); + int i = 0; + int[] labels = s.labels; + for (DaciukMihovAutomatonBuilder.State target : s.states) { + converted.addTransition(new Transition(labels[i++], convert(target, + visited))); + } + + return converted; + } + + /** + * Build a minimal, deterministic automaton from a sorted list of strings. + */ + public static Automaton build(Collection input) { + final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder(); + + CharsRef scratch = new CharsRef(); + for (BytesRef b : input) { + UnicodeUtil.UTF8toUTF16(b, scratch); + builder.add(scratch); + } + + Automaton a = new Automaton(); + a.initial = convert(builder.complete(), new IdentityHashMap()); + a.deterministic = true; + return a; + } + + /** + * Copy current into an internal buffer. + */ + private boolean setPrevious(CharsRef current) { + // don't need to copy, once we fix https://issues.apache.org/jira/browse/LUCENE-3277 + // still, called only from assert + previous = new CharsRef(current); + return true; + } + + /** + * Replace last child of state with an already registered state + * or register the last child state. + */ + private void replaceOrRegister(State state) { + final State child = state.lastChild(); + + if (child.hasChildren()) replaceOrRegister(child); + + final State registered = register.get(child); + if (registered != null) { + state.replaceLastChild(registered); + } else { + register.put(child, child); + } + } + + /** + * Add a suffix of current starting at fromIndex + * (inclusive) to state state. 
+ */ + private void addSuffix(State state, CharSequence current, int fromIndex) { + final int len = current.length(); + while (fromIndex < len) { + int cp = Character.codePointAt(current, fromIndex); + state = state.newState(cp); + fromIndex += Character.charCount(cp); + } + state.is_final = true; + } +} diff --git a/lucene/src/test/org/apache/lucene/TestSearchForDuplicates.java b/lucene/src/test/org/apache/lucene/TestSearchForDuplicates.java index 7a5c32beb3c..00e9c96ae88 100644 --- a/lucene/src/test/org/apache/lucene/TestSearchForDuplicates.java +++ b/lucene/src/test/org/apache/lucene/TestSearchForDuplicates.java @@ -86,7 +86,7 @@ public class TestSearchForDuplicates extends LuceneTestCase { } IndexWriter writer = new IndexWriter(directory, conf); if (VERBOSE) { - System.out.println("TEST: now build index"); + System.out.println("TEST: now build index MAX_DOCS=" + MAX_DOCS); writer.setInfoStream(System.out); } diff --git a/lucene/src/test/org/apache/lucene/index/Test2BTerms.java b/lucene/src/test/org/apache/lucene/index/Test2BTerms.java index 20eb96b7389..346c6158b28 100644 --- a/lucene/src/test/org/apache/lucene/index/Test2BTerms.java +++ b/lucene/src/test/org/apache/lucene/index/Test2BTerms.java @@ -155,9 +155,9 @@ public class Test2BTerms extends LuceneTestCase { List savedTerms = null; MockDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BTerms")); + //MockDirectoryWrapper dir = newFSDirectory(new File("/p/lucene/indices/2bindex")); dir.setThrottling(MockDirectoryWrapper.Throttling.NEVER); dir.setCheckIndexOnClose(false); // don't double-checkindex - //Directory dir = newFSDirectory(new File("/p/lucene/indices/2bindex")); if (true) { @@ -169,6 +169,7 @@ public class Test2BTerms extends LuceneTestCase { .setMergePolicy(newLogMergePolicy(false, 10)) .setOpenMode(IndexWriterConfig.OpenMode.CREATE)); + w.setInfoStream(VERBOSE ? 
System.out : null); MergePolicy mp = w.getConfig().getMergePolicy(); if (mp instanceof LogByteSizeMergePolicy) { // 1 petabyte: diff --git a/lucene/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/src/test/org/apache/lucene/index/TestCodecs.java index ca4d89b57b3..142d3e8fdc7 100644 --- a/lucene/src/test/org/apache/lucene/index/TestCodecs.java +++ b/lucene/src/test/org/apache/lucene/index/TestCodecs.java @@ -40,8 +40,6 @@ import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.store.Directory; -import org.apache.lucene.store.IOContext; -import org.apache.lucene.store.IOContext.Context; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.Version; @@ -504,6 +502,9 @@ public class TestCodecs extends LuceneTestCase { } // Test seek to non-existent terms: + if (VERBOSE) { + System.out.println("TEST: seek non-exist terms"); + } for(int i=0;i<100;i++) { final String text2 = _TestUtil.randomUnicodeString(random) + "."; status = termsEnum.seekCeil(new BytesRef(text2)); @@ -512,6 +513,9 @@ public class TestCodecs extends LuceneTestCase { } // Seek to each term, backwards: + if (VERBOSE) { + System.out.println("TEST: seek terms backwards"); + } for(int i=field.terms.length-1;i>=0;i--) { assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef(field.terms[i].text2))); assertEquals(field.terms[i].docs.length, termsEnum.docFreq()); diff --git a/lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java b/lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java index b568279d572..0bdb2ff86ea 100644 --- a/lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java +++ b/lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java @@ -462,7 +462,7 @@ public class TestDocTermOrds extends LuceneTestCase { */ if (VERBOSE) { - System.out.println("TEST: verify prefix=" + prefixRef.utf8ToString()); + System.out.println("TEST: verify prefix=" + (prefixRef==null ? "null" : prefixRef.utf8ToString())); System.out.println("TEST: all TERMS:"); TermsEnum allTE = MultiFields.getTerms(r, "field").iterator(); int ord = 0; diff --git a/lucene/src/test/org/apache/lucene/index/TestDocsAndPositions.java b/lucene/src/test/org/apache/lucene/index/TestDocsAndPositions.java index fb96139eac5..11f383461d2 100644 --- a/lucene/src/test/org/apache/lucene/index/TestDocsAndPositions.java +++ b/lucene/src/test/org/apache/lucene/index/TestDocsAndPositions.java @@ -194,7 +194,8 @@ public class TestDocsAndPositions extends LuceneTestCase { public void testRandomDocs() throws IOException { Directory dir = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random, dir, - newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())); + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())); + writer.w.setInfoStream(VERBOSE ? 
System.out : null); int numDocs = atLeast(49); int max = 15678; int term = random.nextInt(max); @@ -290,7 +291,7 @@ public class TestDocsAndPositions extends LuceneTestCase { writer.addDocument(doc); } - // now do seaches + // now do searches IndexReader reader = writer.getReader(); writer.close(); diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java b/lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java index a87bcb9e776..ffc86add930 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java @@ -19,7 +19,6 @@ package org.apache.lucene.index; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.search.DefaultSimilarity; -import org.apache.lucene.search.Similarity; import org.apache.lucene.store.CompoundFileDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; @@ -169,7 +168,7 @@ public class TestIndexFileDeleter extends LuceneTestCase { Set dif = difFiles(files, files2); if (!Arrays.equals(files, files2)) { - fail("IndexFileDeleter failed to delete unreferenced extra files: should have deleted " + (filesPre.length-files.length) + " files but only deleted " + (filesPre.length - files2.length) + "; expected files:\n " + asString(files) + "\n actual files:\n " + asString(files2)+"\ndif: "+dif); + fail("IndexFileDeleter failed to delete unreferenced extra files: should have deleted " + (filesPre.length-files.length) + " files but only deleted " + (filesPre.length - files2.length) + "; expected files:\n " + asString(files) + "\n actual files:\n " + asString(files2)+"\ndiff: "+dif); } } diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java b/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java index b6c021d4102..1173f03731f 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java @@ -74,8 +74,15 @@ public class TestIndexWriterDelete extends LuceneTestCase { Term term = new Term("city", "Amsterdam"); int hitCount = getHitCount(dir, term); assertEquals(1, hitCount); + if (VERBOSE) { + System.out.println("\nTEST: now delete by term=" + term); + } modifier.deleteDocuments(term); modifier.commit(); + + if (VERBOSE) { + System.out.println("\nTEST: now getHitCount"); + } hitCount = getHitCount(dir, term); assertEquals(0, hitCount); diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexWriterMerging.java b/lucene/src/test/org/apache/lucene/index/TestIndexWriterMerging.java index 909149ef027..9e6569b169d 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexWriterMerging.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexWriterMerging.java @@ -221,6 +221,8 @@ public class TestIndexWriterMerging extends LuceneTestCase setMergePolicy(newLogMergePolicy(50)) ); + writer.setInfoStream(VERBOSE ? 
System.out : null); + Document document = new Document(); document = new Document(); diff --git a/lucene/src/test/org/apache/lucene/index/TestLongPostings.java b/lucene/src/test/org/apache/lucene/index/TestLongPostings.java index 3791968fa57..9bc5d5b7eeb 100644 --- a/lucene/src/test/org/apache/lucene/index/TestLongPostings.java +++ b/lucene/src/test/org/apache/lucene/index/TestLongPostings.java @@ -355,7 +355,7 @@ public class TestLongPostings extends LuceneTestCase { } if (VERBOSE) { - System.out.println("\nTEST: iter=" + iter + " doS1=" + doS1); + System.out.println("\nTEST: iter=" + iter + " doS1=" + doS1 + " term=" + term); } final DocsEnum postings = MultiFields.getTermDocsEnum(r, null, "field", new BytesRef(term)); @@ -426,7 +426,7 @@ public class TestLongPostings extends LuceneTestCase { if (random.nextInt(6) == 3) { final int freq = postings.freq(); - assertTrue(freq >=1 && freq <= 4); + assertTrue("got invalid freq=" + freq, freq >=1 && freq <= 4); } } } diff --git a/lucene/src/test/org/apache/lucene/index/TestMultiFields.java b/lucene/src/test/org/apache/lucene/index/TestMultiFields.java index 7066efad104..3bf59b1ce22 100644 --- a/lucene/src/test/org/apache/lucene/index/TestMultiFields.java +++ b/lucene/src/test/org/apache/lucene/index/TestMultiFields.java @@ -29,10 +29,15 @@ public class TestMultiFields extends LuceneTestCase { int num = atLeast(2); for (int iter = 0; iter < num; iter++) { + if (VERBOSE) { + System.out.println("TEST: iter=" + iter); + } + Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(NoMergePolicy.COMPOUND_FILES)); _TestUtil.keepFullyDeletedSegments(w); + w.setInfoStream(VERBOSE ? System.out : null); Map> docs = new HashMap>(); Set deleted = new HashSet(); @@ -46,6 +51,9 @@ public class TestMultiFields extends LuceneTestCase { doc.add(id); boolean onlyUniqueTerms = random.nextBoolean(); + if (VERBOSE) { + System.out.println("TEST: onlyUniqueTerms=" + onlyUniqueTerms + " numDocs=" + numDocs); + } Set uniqueTerms = new HashSet(); for(int i=0;i termsList = new ArrayList(uniqueTerms); Collections.sort(termsList, BytesRef.getUTF8SortedAsUTF16Comparator()); - System.out.println("UTF16 order:"); + System.out.println("TEST: terms in UTF16 order:"); for(BytesRef b : termsList) { - System.out.println(" " + UnicodeUtil.toHexString(b.utf8ToString())); + System.out.println(" " + UnicodeUtil.toHexString(b.utf8ToString()) + " " + b); + for(int docID : docs.get(b)) { + if (deleted.contains(docID)) { + System.out.println(" " + docID + " (deleted)"); + } else { + System.out.println(" " + docID); + } + } } } IndexReader reader = w.getReader(); w.close(); - //System.out.println("TEST reader=" + reader); + if (VERBOSE) { + System.out.println("TEST: reader=" + reader); + } Bits liveDocs = MultiFields.getLiveDocs(reader); for(int delDoc : deleted) { @@ -99,7 +119,7 @@ public class TestMultiFields extends LuceneTestCase { for(int i=0;i<100;i++) { BytesRef term = terms.get(random.nextInt(terms.size())); if (VERBOSE) { - System.out.println("TEST: seek to term= "+ UnicodeUtil.toHexString(term.utf8ToString())); + System.out.println("TEST: seek term="+ UnicodeUtil.toHexString(term.utf8ToString()) + " " + term); } DocsEnum docsEnum = terms2.docs(liveDocs, term, null); diff --git a/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java b/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java index b134fffb340..18845e7f470 100644 --- 
a/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java +++ b/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java @@ -17,15 +17,32 @@ package org.apache.lucene.index; * limitations under the License. */ +import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericField; +import org.apache.lucene.search.FieldCache; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LineFileDocs; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder; public class TestTermsEnum extends LuceneTestCase { @@ -140,4 +157,563 @@ public class TestTermsEnum extends LuceneTestCase { r.close(); d.close(); } + + private String randomString() { + //return _TestUtil.randomSimpleString(random); + return _TestUtil.randomRealisticUnicodeString(random); + } + + private void addDoc(RandomIndexWriter w, Collection terms, Map termToID, int id) throws IOException { + Document doc = new Document(); + doc.add(new NumericField("id").setIntValue(id)); + if (VERBOSE) { + System.out.println("TEST: addDoc id:" + id + " terms=" + terms); + } + for (String s2 : terms) { + doc.add(newField("f", s2, Field.Index.NOT_ANALYZED)); + termToID.put(new BytesRef(s2), id); + } + w.addDocument(doc); + terms.clear(); + } + + private boolean accepts(CompiledAutomaton c, BytesRef b) { + int state = c.runAutomaton.getInitialState(); + for(int idx=0;idx terms = new HashSet(); + final Collection pendingTerms = new ArrayList(); + final Map termToID = new HashMap(); + int id = 0; + while(terms.size() != numTerms) { + final String s = randomString(); + if (!terms.contains(s)) { + terms.add(s); + pendingTerms.add(s); + if (random.nextInt(20) == 7) { + addDoc(w, pendingTerms, termToID, id++); + } + } + } + addDoc(w, pendingTerms, termToID, id++); + + final BytesRef[] termsArray = new BytesRef[terms.size()]; + final Set termsSet = new HashSet(); + { + int upto = 0; + for(String s : terms) { + final BytesRef b = new BytesRef(s); + termsArray[upto++] = b; + termsSet.add(b); + } + Arrays.sort(termsArray); + } + + if (VERBOSE) { + System.out.println("\nTEST: indexed terms (unicode order):"); + for(BytesRef t : termsArray) { + System.out.println(" " + t.utf8ToString() + " -> id:" + termToID.get(t)); + } + } + + final IndexReader r = w.getReader(); + w.close(); + + // NOTE: intentional insanity!! + final int[] docIDToID = FieldCache.DEFAULT.getInts(r, "id"); + + for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) { + + // TODO: can we also test infinite As here...? 
+ + // From the random terms, pick some ratio and compile an + // automaton: + final Set acceptTerms = new HashSet(); + final TreeSet sortedAcceptTerms = new TreeSet(); + final double keepPct = random.nextDouble(); + Automaton a; + if (iter == 0) { + if (VERBOSE) { + System.out.println("\nTEST: empty automaton"); + } + a = BasicAutomata.makeEmpty(); + } else { + if (VERBOSE) { + System.out.println("\nTEST: keepPct=" + keepPct); + } + for (String s : terms) { + final String s2; + if (random.nextDouble() <= keepPct) { + s2 = s; + } else { + s2 = randomString(); + } + acceptTerms.add(s2); + sortedAcceptTerms.add(new BytesRef(s2)); + } + a = DaciukMihovAutomatonBuilder.build(sortedAcceptTerms); + } + final CompiledAutomaton c = new CompiledAutomaton(a, true, false); + + final BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.size()]; + final Set acceptTermsSet = new HashSet(); + int upto = 0; + for(String s : acceptTerms) { + final BytesRef b = new BytesRef(s); + acceptTermsArray[upto++] = b; + acceptTermsSet.add(b); + assertTrue(accepts(c, b)); + } + Arrays.sort(acceptTermsArray); + + if (VERBOSE) { + System.out.println("\nTEST: accept terms (unicode order):"); + for(BytesRef t : acceptTermsArray) { + System.out.println(" " + t.utf8ToString() + (termsSet.contains(t) ? " (exists)" : "")); + } + System.out.println(a.toDot()); + } + + for(int iter2=0;iter2<100;iter2++) { + final BytesRef startTerm = acceptTermsArray.length == 0 || random.nextBoolean() ? null : acceptTermsArray[random.nextInt(acceptTermsArray.length)]; + + final TermsEnum te = MultiFields.getTerms(r, "f").intersect(c, startTerm); + + if (VERBOSE) { + System.out.println("\nTEST: iter2=" + iter2 + " startTerm=" + (startTerm == null ? "" : startTerm.utf8ToString())); + } + + int loc; + if (startTerm == null) { + loc = 0; + } else { + loc = Arrays.binarySearch(termsArray, new BytesRef(startTerm)); + if (loc < 0) { + loc = -(loc+1); + } else { + // startTerm exists in index + loc++; + } + } + while(loc < termsArray.length && !acceptTermsSet.contains(termsArray[loc])) { + loc++; + } + + DocsEnum docsEnum = null; + while (loc < termsArray.length) { + final BytesRef expected = termsArray[loc]; + final BytesRef actual = te.next(); + if (VERBOSE) { + System.out.println("TEST: next() expected=" + expected.utf8ToString() + " actual=" + actual.utf8ToString()); + } + assertEquals(expected, actual); + assertEquals(1, te.docFreq()); + docsEnum = te.docs(null, docsEnum); + final int docID = docsEnum.nextDoc(); + assertTrue(docID != DocsEnum.NO_MORE_DOCS); + assertEquals(docIDToID[docID], termToID.get(expected).intValue()); + do { + loc++; + } while (loc < termsArray.length && !acceptTermsSet.contains(termsArray[loc])); + } + + assertNull(te.next()); + } + } + + r.close(); + dir.close(); + } + + private Directory d; + private IndexReader r; + + private final String FIELD = "field"; + + private IndexReader makeIndex(String... terms) throws Exception { + d = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)); + + /* + CoreCodecProvider cp = new CoreCodecProvider(); + cp.unregister(cp.lookup("Standard")); + cp.register(new StandardCodec(minTermsInBlock, maxTermsInBlock)); + cp.setDefaultFieldCodec("Standard"); + iwc.setCodecProvider(cp); + */ + + final RandomIndexWriter w = new RandomIndexWriter(random, d, iwc); + w.w.setInfoStream(VERBOSE ? 
System.out : null); + for(String term : terms) { + Document doc = new Document(); + Field f = newField(FIELD, term, Field.Index.NOT_ANALYZED_NO_NORMS); + doc.add(f); + w.addDocument(doc); + } + if (r != null) { + close(); + } + r = w.getReader(); + w.close(); + return r; + } + + private void close() throws Exception { + final Directory d = ((SegmentReader) r.getSequentialSubReaders()[0]).directory(); + r.close(); + d.close(); + } + + private int docFreq(IndexReader r, String term) throws Exception { + return r.docFreq(new Term(FIELD, term)); + } + + public void testEasy() throws Exception { + // No floor arcs: + r = makeIndex("aa0", "aa1", "aa2", "aa3", "bb0", "bb1", "bb2", "bb3", "aa"); + + // First term in block: + assertEquals(1, docFreq(r, "aa0")); + + // Scan forward to another term in same block + assertEquals(1, docFreq(r, "aa2")); + + assertEquals(1, docFreq(r, "aa")); + + // Reset same block then scan forwards + assertEquals(1, docFreq(r, "aa1")); + + // Not found, in same block + assertEquals(0, docFreq(r, "aa5")); + + // Found, in same block + assertEquals(1, docFreq(r, "aa2")); + + // Not found in index: + assertEquals(0, docFreq(r, "b0")); + + // Found: + assertEquals(1, docFreq(r, "aa2")); + + // Found, rewind: + assertEquals(1, docFreq(r, "aa0")); + + + // First term in block: + assertEquals(1, docFreq(r, "bb0")); + + // Scan forward to another term in same block + assertEquals(1, docFreq(r, "bb2")); + + // Reset same block then scan forwards + assertEquals(1, docFreq(r, "bb1")); + + // Not found, in same block + assertEquals(0, docFreq(r, "bb5")); + + // Found, in same block + assertEquals(1, docFreq(r, "bb2")); + + // Not found in index: + assertEquals(0, docFreq(r, "b0")); + + // Found: + assertEquals(1, docFreq(r, "bb2")); + + // Found, rewind: + assertEquals(1, docFreq(r, "bb0")); + + close(); + } + + // tests: + // - test same prefix has non-floor block and floor block (ie, has 2 long outputs on same term prefix) + // - term that's entirely in the index + + public void testFloorBlocks() throws Exception { + final String[] terms = new String[] {"aa0", "aa1", "aa2", "aa3", "aa4", "aa5", "aa6", "aa7", "aa8", "aa9", "aa", "xx"}; + r = makeIndex(terms); + //r = makeIndex("aa0", "aa1", "aa2", "aa3", "aa4", "aa5", "aa6", "aa7", "aa8", "aa9"); + + // First term in first block: + assertEquals(1, docFreq(r, "aa0")); + assertEquals(1, docFreq(r, "aa4")); + + // No block + assertEquals(0, docFreq(r, "bb0")); + + // Second block + assertEquals(1, docFreq(r, "aa4")); + + // Backwards to prior floor block: + assertEquals(1, docFreq(r, "aa0")); + + // Forwards to last floor block: + assertEquals(1, docFreq(r, "aa9")); + + assertEquals(0, docFreq(r, "a")); + assertEquals(1, docFreq(r, "aa")); + assertEquals(0, docFreq(r, "a")); + assertEquals(1, docFreq(r, "aa")); + + // Forwards to last floor block: + assertEquals(1, docFreq(r, "xx")); + assertEquals(1, docFreq(r, "aa1")); + assertEquals(0, docFreq(r, "yy")); + + assertEquals(1, docFreq(r, "xx")); + assertEquals(1, docFreq(r, "aa9")); + + assertEquals(1, docFreq(r, "xx")); + assertEquals(1, docFreq(r, "aa4")); + + final TermsEnum te = MultiFields.getTerms(r, FIELD).iterator(); + while(te.next() != null) { + //System.out.println("TEST: next term=" + te.term().utf8ToString()); + } + + assertTrue(seekExact(te, "aa1")); + assertEquals("aa2", next(te)); + assertTrue(seekExact(te, "aa8")); + assertEquals("aa9", next(te)); + assertEquals("xx", next(te)); + + testRandomSeeks(r, terms); + close(); + } + + public void testZeroTerms() throws 
Exception { + d = newDirectory(); + final RandomIndexWriter w = new RandomIndexWriter(random, d); + w.w.setInfoStream(VERBOSE ? System.out : null); + Document doc = new Document(); + doc.add(newField("field", "one two three", Field.Index.ANALYZED)); + doc = new Document(); + doc.add(newField("field2", "one two three", Field.Index.ANALYZED)); + w.addDocument(doc); + w.commit(); + w.deleteDocuments(new Term("field", "one")); + w.optimize(); + IndexReader r = w.getReader(); + w.close(); + assertEquals(1, r.numDocs()); + assertEquals(1, r.maxDoc()); + Terms terms = MultiFields.getTerms(r, "field"); + if (terms != null) { + assertNull(terms.iterator().next()); + } + r.close(); + d.close(); + } + + private String getRandomString() { + //return _TestUtil.randomSimpleString(random); + return _TestUtil.randomRealisticUnicodeString(random); + } + + public void testRandomTerms() throws Exception { + final String[] terms = new String[_TestUtil.nextInt(random, 1, atLeast(1000))]; + final Set seen = new HashSet(); + + final boolean allowEmptyString = random.nextBoolean(); + + if (random.nextInt(10) == 7 && terms.length > 2) { + // Sometimes add a bunch of terms sharing a longish common prefix: + final int numTermsSamePrefix = random.nextInt(terms.length/2); + if (numTermsSamePrefix > 0) { + String prefix; + while(true) { + prefix = getRandomString(); + if (prefix.length() < 5) { + continue; + } else { + break; + } + } + while(seen.size() < numTermsSamePrefix) { + final String t = prefix + getRandomString(); + if (!seen.contains(t)) { + terms[seen.size()] = t; + seen.add(t); + } + } + } + } + + while(seen.size() < terms.length) { + final String t = getRandomString(); + if (!seen.contains(t) && (allowEmptyString || t.length() != 0)) { + terms[seen.size()] = t; + seen.add(t); + } + } + r = makeIndex(terms); + testRandomSeeks(r, terms); + close(); + } + + // sugar + private boolean seekExact(TermsEnum te, String term) throws IOException { + return te.seekExact(new BytesRef(term), random.nextBoolean()); + } + + // sugar + private String next(TermsEnum te) throws IOException { + final BytesRef br = te.next(); + if (br == null) { + return null; + } else { + return br.utf8ToString(); + } + } + + private BytesRef getNonExistTerm(BytesRef[] terms) { + BytesRef t = null; + while(true) { + final String ts = getRandomString(); + t = new BytesRef(ts); + if (Arrays.binarySearch(terms, t) < 0) { + return t; + } + } + } + + private static class TermAndState { + public final BytesRef term; + public final TermState state; + + public TermAndState(BytesRef term, TermState state) { + this.term = term; + this.state = state; + } + } + + private void testRandomSeeks(IndexReader r, String... 
validTermStrings) throws IOException { + final BytesRef[] validTerms = new BytesRef[validTermStrings.length]; + for(int termIDX=0;termIDX termStates = new ArrayList(); + + for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) { + + final BytesRef t; + int loc; + final TermState termState; + if (random.nextInt(6) == 4) { + // pick term that doens't exist: + t = getNonExistTerm(validTerms); + termState = null; + if (VERBOSE) { + System.out.println("\nTEST: invalid term=" + t.utf8ToString()); + } + loc = Arrays.binarySearch(validTerms, t); + } else if (termStates.size() != 0 && random.nextInt(4) == 1) { + final TermAndState ts = termStates.get(random.nextInt(termStates.size())); + t = ts.term; + loc = Arrays.binarySearch(validTerms, t); + assertTrue(loc >= 0); + termState = ts.state; + if (VERBOSE) { + System.out.println("\nTEST: valid termState term=" + t.utf8ToString()); + } + } else { + // pick valid term + loc = random.nextInt(validTerms.length); + t = new BytesRef(validTerms[loc]); + termState = null; + if (VERBOSE) { + System.out.println("\nTEST: valid term=" + t.utf8ToString()); + } + } + + // seekCeil or seekExact: + final boolean doSeekExact = random.nextBoolean(); + if (termState != null) { + if (VERBOSE) { + System.out.println(" seekExact termState"); + } + te.seekExact(t, termState); + } else if (doSeekExact) { + if (VERBOSE) { + System.out.println(" seekExact"); + } + assertEquals(loc >= 0, te.seekExact(t, random.nextBoolean())); + } else { + if (VERBOSE) { + System.out.println(" seekCeil"); + } + + final TermsEnum.SeekStatus result = te.seekCeil(t, random.nextBoolean()); + if (VERBOSE) { + System.out.println(" got " + result); + } + + if (loc >= 0) { + assertEquals(TermsEnum.SeekStatus.FOUND, result); + } else if (loc == END_LOC) { + assertEquals(TermsEnum.SeekStatus.END, result); + } else { + assert loc >= -validTerms.length; + assertEquals(TermsEnum.SeekStatus.NOT_FOUND, result); + } + } + + if (loc >= 0) { + assertEquals(t, te.term()); + } else if (doSeekExact) { + // TermsEnum is unpositioned if seekExact returns false + continue; + } else if (loc == END_LOC) { + continue; + } else { + loc = -loc-1; + assertEquals(validTerms[loc], te.term()); + } + + // Do a bunch of next's after the seek + final int numNext = random.nextInt(validTerms.length); + + for(int nextCount=0;nextCount terms; // the terms we put in the index + private Automaton termsAutomaton; // automata of the same + int numIterations; + + public void setUp() throws Exception { + super.setUp(); + // we generate aweful regexps: good for testing. + // but for preflex codec, the test can be very slow, so use less iterations. + numIterations = CodecProvider.getDefault().getFieldCodec("field").equals("PreFlex") ? 
10 * RANDOM_MULTIPLIER : atLeast(50); + dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random, dir, + newIndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer(random, MockTokenizer.KEYWORD, false)) + .setMaxBufferedDocs(_TestUtil.nextInt(random, 50, 1000))); + Document doc = new Document(); + Field field = newField("field", "", Field.Store.YES, Field.Index.NOT_ANALYZED); + doc.add(field); + terms = new TreeSet(); + + int num = atLeast(200); + for (int i = 0; i < num; i++) { + String s = _TestUtil.randomUnicodeString(random); + field.setValue(s); + terms.add(new BytesRef(s)); + writer.addDocument(doc); + } + + termsAutomaton = DaciukMihovAutomatonBuilder.build(terms); + + reader = writer.getReader(); + searcher = newSearcher(reader); + writer.close(); + } + + public void tearDown() throws Exception { + searcher.close(); + reader.close(); + dir.close(); + super.tearDown(); + } + + /** tests a pre-intersected automaton against the original */ + public void testFiniteVersusInfinite() throws Exception { + for (int i = 0; i < numIterations; i++) { + String reg = AutomatonTestUtil.randomRegexp(random); + Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton(); + final List matchedTerms = new ArrayList(); + for(BytesRef t : terms) { + if (BasicOperations.run(automaton, t.utf8ToString())) { + matchedTerms.add(t); + } + } + + Automaton alternate = DaciukMihovAutomatonBuilder.build(matchedTerms); + //System.out.println("match " + matchedTerms.size() + " " + alternate.getNumberOfStates() + " states, sigma=" + alternate.getStartPoints().length); + //AutomatonTestUtil.minimizeSimple(alternate); + //System.out.println("minmize done"); + AutomatonQuery a1 = new AutomatonQuery(new Term("field", ""), automaton); + AutomatonQuery a2 = new AutomatonQuery(new Term("field", ""), alternate); + CheckHits.checkEqual(a1, searcher.search(a1, 25).scoreDocs, searcher.search(a2, 25).scoreDocs); + } + } + + /** seeks to every term accepted by some automata */ + public void testSeeking() throws Exception { + for (int i = 0; i < numIterations; i++) { + String reg = AutomatonTestUtil.randomRegexp(random); + Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton(); + TermsEnum te = MultiFields.getTerms(reader, "field").iterator(); + ArrayList unsortedTerms = new ArrayList(terms); + Collections.shuffle(unsortedTerms, random); + + for (BytesRef term : unsortedTerms) { + if (BasicOperations.run(automaton, term.utf8ToString())) { + // term is accepted + if (random.nextBoolean()) { + // seek exact + assertTrue(te.seekExact(term, random.nextBoolean())); + } else { + // seek ceil + assertEquals(SeekStatus.FOUND, te.seekCeil(term, random.nextBoolean())); + assertEquals(term, te.term()); + } + } + } + } + } + + /** mixes up seek and next for all terms */ + public void testSeekingAndNexting() throws Exception { + for (int i = 0; i < numIterations; i++) { + TermsEnum te = MultiFields.getTerms(reader, "field").iterator(); + + for (BytesRef term : terms) { + int c = random.nextInt(3); + if (c == 0) { + assertEquals(term, te.next()); + } else if (c == 1) { + assertEquals(SeekStatus.FOUND, te.seekCeil(term, random.nextBoolean())); + assertEquals(term, te.term()); + } else { + assertTrue(te.seekExact(term, random.nextBoolean())); + } + } + } + } + + /** tests intersect: TODO start at a random term! 
*/ + public void testIntersect() throws Exception { + for (int i = 0; i < numIterations; i++) { + String reg = AutomatonTestUtil.randomRegexp(random); + Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton(); + CompiledAutomaton ca = new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton), false); + TermsEnum te = MultiFields.getTerms(reader, "field").intersect(ca, null); + Automaton expected = BasicOperations.intersection(termsAutomaton, automaton); + TreeSet found = new TreeSet(); + while (te.next() != null) { + found.add(new BytesRef(te.term())); + } + + Automaton actual = DaciukMihovAutomatonBuilder.build(found); + assertTrue(BasicOperations.sameLanguage(expected, actual)); + } + } +} diff --git a/lucene/src/test/org/apache/lucene/index/values/TestDocValuesIndexing.java b/lucene/src/test/org/apache/lucene/index/values/TestDocValuesIndexing.java index 623cc7c8018..5fde0397f69 100644 --- a/lucene/src/test/org/apache/lucene/index/values/TestDocValuesIndexing.java +++ b/lucene/src/test/org/apache/lucene/index/values/TestDocValuesIndexing.java @@ -223,6 +223,7 @@ public class TestDocValuesIndexing extends LuceneTestCase { return cfg; } + @SuppressWarnings("fallthrough") public void runTestNumerics(IndexWriterConfig cfg, boolean withDeletions) throws IOException { Directory d = newDirectory(); diff --git a/lucene/src/test/org/apache/lucene/search/TestAutomatonQuery.java b/lucene/src/test/org/apache/lucene/search/TestAutomatonQuery.java index 9b57fb64834..a148c237387 100644 --- a/lucene/src/test/org/apache/lucene/search/TestAutomatonQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestAutomatonQuery.java @@ -154,9 +154,7 @@ public class TestAutomatonQuery extends LuceneTestCase { assertEquals(a1, a2); assertEquals(a1, a3); - - assertEquals(a1.toString(), a3.toString()); - + // different class AutomatonQuery w1 = new WildcardQuery(newTerm("foobar")); // different class diff --git a/lucene/src/test/org/apache/lucene/search/TestBooleanMinShouldMatch.java b/lucene/src/test/org/apache/lucene/search/TestBooleanMinShouldMatch.java index 463729cc7c3..339d2fc548d 100644 --- a/lucene/src/test/org/apache/lucene/search/TestBooleanMinShouldMatch.java +++ b/lucene/src/test/org/apache/lucene/search/TestBooleanMinShouldMatch.java @@ -87,6 +87,7 @@ public class TestBooleanMinShouldMatch extends LuceneTestCase { printHits(getName(), h, s); } assertEquals("result count", expected, h.length); + //System.out.println("TEST: now check"); QueryUtils.check(random, q,s); } diff --git a/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java b/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java index 1bbd21b8963..10ae2109154 100644 --- a/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java +++ b/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java @@ -75,6 +75,9 @@ public class TestFuzzyQuery2 extends LuceneTestCase { } public void assertFromTestData(int codePointTable[]) throws Exception { + if (VERBOSE) { + System.out.println("TEST: codePointTable=" + codePointTable); + } InputStream stream = getClass().getResourceAsStream("fuzzyTestData.txt"); BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); @@ -83,6 +86,8 @@ public class TestFuzzyQuery2 extends LuceneTestCase { Directory dir = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random, dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.KEYWORD, false)).setMergePolicy(newLogMergePolicy())); + + 
writer.w.setInfoStream(VERBOSE ? System.out : null); Document doc = new Document(); Field field = newField("field", "", Field.Store.NO, Field.Index.ANALYZED); @@ -95,6 +100,9 @@ public class TestFuzzyQuery2 extends LuceneTestCase { IndexReader r = writer.getReader(); IndexSearcher searcher = newSearcher(r); + if (VERBOSE) { + System.out.println("TEST: searcher=" + searcher); + } writer.close(); String line; while ((line = reader.readLine()) != null) { diff --git a/lucene/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java b/lucene/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java index 59b05e17439..b6a1b6ea72b 100644 --- a/lucene/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java +++ b/lucene/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java @@ -94,12 +94,18 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter { public static Query csrq(String f, String l, String h, boolean il, boolean ih) { TermRangeQuery query = TermRangeQuery.newStringRange(f, l, h, il, ih); query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE); + if (VERBOSE) { + System.out.println("TEST: query=" + query); + } return query; } public static Query csrq(String f, String l, String h, boolean il, boolean ih, MultiTermQuery.RewriteMethod method) { TermRangeQuery query = TermRangeQuery.newStringRange(f, l, h, il, ih); query.setRewriteMethod(method); + if (VERBOSE) { + System.out.println("TEST: query=" + query + " method=" + method); + } return query; } @@ -275,6 +281,10 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter { IndexReader reader = signedIndexReader; IndexSearcher search = newSearcher(reader); + if (VERBOSE) { + System.out.println("TEST: reader=" + reader); + } + int medId = ((maxId - minId) / 2); String minIP = pad(minId); diff --git a/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java b/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java index 7f2e5934906..4e9acbe71c4 100644 --- a/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java +++ b/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java @@ -18,28 +18,27 @@ package org.apache.lucene.search; */ import java.io.IOException; +import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.ArrayList; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util._TestUtil; -import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.AutomatonTestUtil; import org.apache.lucene.util.automaton.CharacterRunAutomaton; @@ -143,6 +142,9 @@ public class TestRegexpRandom2 extends LuceneTestCase { int num = 
CodecProvider.getDefault().getFieldCodec("field").equals("PreFlex") ? 100 * RANDOM_MULTIPLIER : atLeast(1000); for (int i = 0; i < num; i++) { String reg = AutomatonTestUtil.randomRegexp(random); + if (VERBOSE) { + System.out.println("TEST: regexp=" + reg); + } assertSame(reg); } } @@ -153,18 +155,7 @@ public class TestRegexpRandom2 extends LuceneTestCase { protected void assertSame(String regexp) throws IOException { RegexpQuery smart = new RegexpQuery(new Term("field", regexp), RegExp.NONE); DumbRegexpQuery dumb = new DumbRegexpQuery(new Term("field", regexp), RegExp.NONE); - - // we can't compare the two if automaton rewrites to a simpler enum. - // for example: "a\uda07\udcc7?.*?" gets rewritten to a simpler query: - // a\uda07* prefixquery. Prefixquery then does the "wrong" thing, which - // isn't really wrong as the query was undefined to begin with... but not - // automatically comparable. - - // TODO: does this check even matter anymore?! - Terms terms = MultiFields.getTerms(searcher1.getIndexReader(), "field"); - if (!(smart.getTermsEnum(terms) instanceof AutomatonTermsEnum)) - return; - + TopDocs smartDocs = searcher1.search(smart, 25); TopDocs dumbDocs = searcher2.search(dumb, 25); diff --git a/lucene/src/test/org/apache/lucene/search/TestWildcard.java b/lucene/src/test/org/apache/lucene/search/TestWildcard.java index 123cd028c72..0863bc234ed 100644 --- a/lucene/src/test/org/apache/lucene/search/TestWildcard.java +++ b/lucene/src/test/org/apache/lucene/search/TestWildcard.java @@ -135,7 +135,7 @@ public class TestWildcard wq = new WildcardQuery(new Term("field", "*")); assertMatches(searcher, wq, 2); assertFalse(wq.getTermsEnum(terms) instanceof PrefixTermsEnum); - assertFalse(wq.getTermsEnum(terms) instanceof AutomatonTermsEnum); + assertFalse(wq.getTermsEnum(terms).getClass().getSimpleName().contains("AutomatonTermsEnum")); searcher.close(); indexStore.close(); } diff --git a/lucene/src/test/org/apache/lucene/search/TestWildcardRandom.java b/lucene/src/test/org/apache/lucene/search/TestWildcardRandom.java index 89626d414f0..b4f4bae6bb0 100644 --- a/lucene/src/test/org/apache/lucene/search/TestWildcardRandom.java +++ b/lucene/src/test/org/apache/lucene/search/TestWildcardRandom.java @@ -63,6 +63,9 @@ public class TestWildcardRandom extends LuceneTestCase { reader = writer.getReader(); searcher = newSearcher(reader); writer.close(); + if (VERBOSE) { + System.out.println("TEST: setUp searcher=" + searcher); + } } private char N() { @@ -85,7 +88,11 @@ public class TestWildcardRandom extends LuceneTestCase { private void assertPatternHits(String pattern, int numHits) throws Exception { // TODO: run with different rewrites - Query wq = new WildcardQuery(new Term("field", fillPattern(pattern))); + final String filledPattern = fillPattern(pattern); + if (VERBOSE) { + System.out.println("TEST: run wildcard pattern=" + pattern + " filled=" + filledPattern); + } + Query wq = new WildcardQuery(new Term("field", filledPattern)); TopDocs docs = searcher.search(wq, 25); assertEquals("Incorrect hits for pattern: " + pattern, numHits, docs.totalHits); } diff --git a/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java b/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java index 2e1da3af04f..ee7e4f061dd 100644 --- a/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java +++ b/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java @@ -152,11 +152,14 @@ public class TestPayloadNearQuery extends LuceneTestCase 
{ } for (int i=1;i<10;i++) { query = newPhraseQuery("field", English.intToEnglish(i)+" hundred", true, new AveragePayloadFunction()); + if (VERBOSE) { + System.out.println("TEST: run query=" + query); + } // all should have score = 3 because adjacent terms have payloads of 2,4 // and all the similarity factors are set to 1 hits = searcher.search(query, null, 100); assertTrue("hits is null and it shouldn't be", hits != null); - assertTrue("should be 100 hits", hits.totalHits == 100); + assertEquals("should be 100 hits", 100, hits.totalHits); for (int j = 0; j < hits.scoreDocs.length; j++) { ScoreDoc doc = hits.scoreDocs[j]; // System.out.println("Doc: " + doc.toString()); diff --git a/lucene/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java b/lucene/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java new file mode 100644 index 00000000000..f346d4cf5f9 --- /dev/null +++ b/lucene/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java @@ -0,0 +1,121 @@ +package org.apache.lucene.util.automaton; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; + +public class TestCompiledAutomaton extends LuceneTestCase { + + private CompiledAutomaton build(String... 
strings) { + final List as = new ArrayList(); + for(String s : strings) { + as.add(BasicAutomata.makeString(s)); + } + Automaton a = BasicOperations.union(as); + a.determinize(); + return new CompiledAutomaton(a, true, false); + } + + private void testFloor(CompiledAutomaton c, String input, String expected) { + final BytesRef b = new BytesRef(input); + final BytesRef result = c.floor(b, b); + if (expected == null) { + assertNull(result); + } else { + assertNotNull(result); + assertEquals("actual=" + result.utf8ToString() + " vs expected=" + expected + " (input=" + input + ")", + result, new BytesRef(expected)); + } + } + + private void testTerms(String[] terms) throws Exception { + final CompiledAutomaton c = build(terms); + final BytesRef[] termBytes = new BytesRef[terms.length]; + for(int idx=0;idx= 0) { + expected = s; + } else { + // term doesn't exist + loc = -(loc+1); + if (loc == 0) { + expected = null; + } else { + expected = termBytes[loc-1].utf8ToString(); + } + } + if (VERBOSE) { + System.out.println(" expected=" + expected); + } + testFloor(c, s, expected); + } + } + + public void testRandom() throws Exception { + final int numTerms = atLeast(1000); + final Set terms = new HashSet(); + while(terms.size() != numTerms) { + terms.add(randomString()); + } + testTerms(terms.toArray(new String[terms.size()])); + } + + private String randomString() { + // return _TestUtil.randomSimpleString(random); + return _TestUtil.randomRealisticUnicodeString(random); + } + + public void testBasic() throws Exception { + CompiledAutomaton c = build("foo", "fob", "goo"); + testFloor(c, "goo", "goo"); + testFloor(c, "ga", "foo"); + testFloor(c, "g", "foo"); + testFloor(c, "foc", "fob"); + testFloor(c, "foz", "foo"); + testFloor(c, "f", null); + testFloor(c, "", null); + testFloor(c, "aa", null); + testFloor(c, "zzz", "goo"); + } +} diff --git a/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java index c31d4dd41ab..2e11bd21405 100644 --- a/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ b/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -24,19 +24,25 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; +import java.io.StringWriter; import java.io.Writer; import java.util.*; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; import org.apache.lucene.store.IOContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.IndexInput; @@ -456,7 +462,8 @@ public class TestFSTs extends LuceneTestCase { prune1==0 && prune2==0, allowRandomSuffixSharing ? random.nextBoolean() : true, allowRandomSuffixSharing ? 
_TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE, - outputs); + outputs, + null); for(InputOutput pair : pairs) { if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) { @@ -872,15 +879,15 @@ public class TestFSTs extends LuceneTestCase { } } - //System.out.println("TEST: after prune"); - /* - for(Map.Entry ent : prefixes.entrySet()) { - System.out.println(" " + inputToString(inputMode, ent.getKey()) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal); - if (ent.getValue().isFinal) { - System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput)); + if (VERBOSE) { + System.out.println("TEST: after prune"); + for(Map.Entry> ent : prefixes.entrySet()) { + System.out.println(" " + inputToString(inputMode, ent.getKey()) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal); + if (ent.getValue().isFinal) { + System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput)); + } } - } - */ + } if (prefixes.size() <= 1) { assertNull(fst); @@ -1081,7 +1088,7 @@ public class TestFSTs extends LuceneTestCase { final BytesRef randomTerm = new BytesRef(getRandomString()); if (VERBOSE) { - System.out.println("TEST: seek " + randomTerm.utf8ToString() + " " + randomTerm); + System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm); } final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm); @@ -1133,10 +1140,10 @@ public class TestFSTs extends LuceneTestCase { assertEquals(termsEnum.term().utf8ToString() + " != " + fstEnum.current().input.utf8ToString(), termsEnum.term(), fstEnum.current().input); if (storeOrd) { // fst stored the ord - assertEquals(termsEnum.ord(), ((Long) fstEnum.current().output).longValue()); + assertEquals("term=" + termsEnum.term().utf8ToString() + " " + termsEnum.term(), termsEnum.ord(), ((Long) fstEnum.current().output).longValue()); } else { // fst stored the docFreq - assertEquals(termsEnum.docFreq(), (int) (((Long) fstEnum.current().output).longValue())); + assertEquals("term=" + termsEnum.term().utf8ToString() + " " + termsEnum.term(), termsEnum.docFreq(), (int) (((Long) fstEnum.current().output).longValue())); } } } @@ -1154,7 +1161,7 @@ public class TestFSTs extends LuceneTestCase { this.inputMode = inputMode; this.outputs = outputs; - builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs); + builder = new Builder(inputMode == 0 ? 
FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null); } protected abstract T getOutput(IntsRef input, int ord) throws IOException; @@ -1254,7 +1261,7 @@ public class TestFSTs extends LuceneTestCase { } } - // java -cp build/classes/test:build/classes/java:build/classes/test-framework:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out + // java -cp build/classes/test:build/classes/test-framework:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs /x/tmp/allTerms3.txt out public static void main(String[] args) throws IOException { int prune = 0; int limit = Integer.MAX_VALUE; @@ -1411,6 +1418,198 @@ public class TestFSTs extends LuceneTestCase { assertEquals(42, (long) seekResult.output); } + public void testPrimaryKeys() throws Exception { + Directory dir = newDirectory(); + + for(int cycle=0;cycle<2;cycle++) { + if (VERBOSE) { + System.out.println("TEST: cycle=" + cycle); + } + RandomIndexWriter w = new RandomIndexWriter(random, dir, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(IndexWriterConfig.OpenMode.CREATE)); + Document doc = new Document(); + Field idField = newField("id", "", Field.Index.NOT_ANALYZED); + doc.add(idField); + + final int NUM_IDS = (int) (1000*RANDOM_MULTIPLIER*(1.0+random.nextDouble())); + //final int NUM_IDS = (int) (377 * (1.0+random.nextDouble())); + if (VERBOSE) { + System.out.println("TEST: NUM_IDS=" + NUM_IDS); + } + final Set allIDs = new HashSet(); + for(int id=0;id allIDsList = new ArrayList(allIDs); + final List sortedAllIDsList = new ArrayList(allIDsList); + Collections.sort(sortedAllIDsList); + + // Sprinkle in some non-existent PKs: + Set outOfBounds = new HashSet(); + for(int idx=0;idx builder = new Builder(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null); + builder.add("stat", outputs.get(17)); + builder.add("station", outputs.get(10)); + final FST fst = builder.finish(); + //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); + StringWriter w = new StringWriter(); + Util.toDot(fst, w, false, false); + w.close(); + //System.out.println(w.toString()); + assertTrue(w.toString().indexOf("label=\"t/[7]\"") != -1); + } + + public void testInternalFinalState() throws Exception { + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); + + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null); + builder.add(new BytesRef("stat"), outputs.getNoOutput()); + builder.add(new BytesRef("station"), outputs.getNoOutput()); + final FST fst = builder.finish(); + StringWriter w = new StringWriter(); + //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); + Util.toDot(fst, w, false, false); + w.close(); + //System.out.println(w.toString()); + assertTrue(w.toString().indexOf("6 [shape=doublecircle") != -1); + } + // Make sure raw FST can differentiate between final vs // non-final end nodes public void testNonFinalStopNodes() throws Exception { diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java b/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java index 36804eed92a..66b76d7d18c 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java +++ b/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java @@ -400,7 +400,7 @@ public class DirectSpellChecker { if 
(terms == null) { return Collections.emptyList(); } - FuzzyTermsEnum e = new FuzzyTermsEnum(terms.iterator(), atts, term, editDistance, Math.max(minPrefix, editDistance-1)); + FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.max(minPrefix, editDistance-1)); final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>(); BytesRef queryTerm = new BytesRef(term.text());
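
The tests added in this patch exercise the reworked TermsEnum seek API: seekExact(BytesRef, boolean) returns whether the exact term exists, seekExact(BytesRef, TermState) repositions the enum from a previously captured state, and seekCeil(BytesRef, boolean) reports FOUND / NOT_FOUND / END. The hunks directly above also show FuzzyTermsEnum now taking a Terms instance rather than a pre-built TermsEnum iterator. Below is a minimal caller-side sketch of the seek pattern, not part of the patch; the class name, the "field" name, the termState() capture, and the exact import locations (e.g. org.apache.lucene.index.TermState) are illustrative assumptions against a trunk build containing this change.

// Sketch only: illustrates the TermsEnum seek pattern the new tests verify.
import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class SeekWithTermStateSketch {
  public static void demo(IndexReader reader) throws IOException {
    // Assumes the index has a "field" field; MultiFields.getTerms can return null otherwise.
    TermsEnum te = MultiFields.getTerms(reader, "field").iterator();
    BytesRef target = new BytesRef("lucene");

    // seekExact returns true only if the exact term exists (second arg: use the seek cache).
    if (te.seekExact(target, true)) {
      // Capture the TermState so a later enum can jump straight back to this term
      // without re-walking the terms dictionary (the pattern TestTermsEnum checks).
      TermState state = te.termState();

      TermsEnum te2 = MultiFields.getTerms(reader, "field").iterator();
      te2.seekExact(target, state);   // positions te2 on "lucene" directly
      System.out.println("docFreq=" + te2.docFreq());
    } else {
      // seekCeil positions on the smallest term >= target and reports where it landed.
      TermsEnum.SeekStatus status = te.seekCeil(target, true);
      if (status != TermsEnum.SeekStatus.END) {
        System.out.println("status=" + status + " term=" + te.term().utf8ToString());
      }
    }
  }
}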