diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/AutoPrefixPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/AutoPrefixPostingsFormat.java deleted file mode 100644 index b83866a8175..00000000000 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/AutoPrefixPostingsFormat.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.codecs.autoprefix; - - -import java.io.IOException; - -import org.apache.lucene.codecs.FieldsConsumer; -import org.apache.lucene.codecs.FieldsProducer; -import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.blocktree.BlockTreeTermsReader; -import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter; -import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat; -import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader; -import org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter; -import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.util.IOUtils; - -/** - * Just like {@link Lucene50PostingsFormat} except this format - * exposes the experimental auto-prefix terms. - * - * @lucene.experimental - */ - -public final class AutoPrefixPostingsFormat extends PostingsFormat { - - private final int minItemsInBlock; - private final int maxItemsInBlock; - private final int minItemsInAutoPrefix; - private final int maxItemsInAutoPrefix; - - /** Creates {@code AutoPrefixPostingsFormat} with default settings. */ - public AutoPrefixPostingsFormat() { - this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, - BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE, - 25, 48); - } - - /** Creates {@code Lucene50PostingsFormat} with custom - * values for {@code minBlockSize} and {@code - * maxBlockSize} passed to block terms dictionary. - * @see BlockTreeTermsWriter#BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) */ - public AutoPrefixPostingsFormat(int minItemsInAutoPrefix, int maxItemsInAutoPrefix) { - this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, - BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE, - minItemsInAutoPrefix, - maxItemsInAutoPrefix); - } - - /** Creates {@code Lucene50PostingsFormat} with custom - * values for {@code minBlockSize}, {@code - * maxBlockSize}, {@code minItemsInAutoPrefix} and {@code maxItemsInAutoPrefix}, passed - * to block tree terms dictionary. 
- * @see BlockTreeTermsWriter#BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int,int,int) */ - public AutoPrefixPostingsFormat(int minItemsInBlock, int maxItemsInBlock, int minItemsInAutoPrefix, int maxItemsInAutoPrefix) { - super("AutoPrefix"); - BlockTreeTermsWriter.validateSettings(minItemsInBlock, - maxItemsInBlock); - BlockTreeTermsWriter.validateAutoPrefixSettings(minItemsInAutoPrefix, - maxItemsInAutoPrefix); - this.minItemsInBlock = minItemsInBlock; - this.maxItemsInBlock = maxItemsInBlock; - this.minItemsInAutoPrefix = minItemsInAutoPrefix; - this.maxItemsInAutoPrefix = maxItemsInAutoPrefix; - } - - @Override - public String toString() { - return getName(); - } - - @Override - public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(state); - - boolean success = false; - try { - FieldsConsumer ret = new BlockTreeTermsWriter(state, - postingsWriter, - minItemsInBlock, - maxItemsInBlock, - minItemsInAutoPrefix, - maxItemsInAutoPrefix); - success = true; - return ret; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(postingsWriter); - } - } - } - - @Override - public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new Lucene50PostingsReader(state); - boolean success = false; - try { - FieldsProducer ret = new BlockTreeTermsReader(postingsReader, state); - success = true; - return ret; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(postingsReader); - } - } - } -} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/package-info.java b/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/package-info.java deleted file mode 100644 index b15f2fad7b2..00000000000 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * An experimental postings format that automatically indexes appropriate - * prefix terms for fast range and prefix queries. 
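For context, a hedged sketch (not part of this patch) of how this format was typically selected before its removal; it mirrors the TestUtil.alwaysPostingsFormat pattern used by the tests later in this diff. StandardAnalyzer and the openWriter helper are stand-ins, and 25/48 are the defaults from the no-arg constructor:

    import java.io.IOException;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.util.TestUtil;

    static IndexWriter openWriter(Directory dir) throws IOException {
      IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
      // Apply the format to every field; 25/48 == default min/max items in auto-prefix:
      iwc.setCodec(TestUtil.alwaysPostingsFormat(new AutoPrefixPostingsFormat(25, 48)));
      return new IndexWriter(dir, iwc);
    }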
- */ -package org.apache.lucene.codecs.autoprefix; diff --git a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index 753b6d7b3ca..710c6050c67 100644 --- a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -19,4 +19,3 @@ org.apache.lucene.codecs.memory.DirectPostingsFormat org.apache.lucene.codecs.memory.FSTOrdPostingsFormat org.apache.lucene.codecs.memory.FSTPostingsFormat org.apache.lucene.codecs.memory.MemoryPostingsFormat -org.apache.lucene.codecs.autoprefix.AutoPrefixPostingsFormat diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixPostingsFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixPostingsFormat.java deleted file mode 100644 index 3c7059edb71..00000000000 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixPostingsFormat.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.codecs.autoprefix; - - -import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.RandomPostingsTester; -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.TestUtil; - -/** - * Tests AutoPrefix's postings - */ - -// NOTE: we don't extend BasePostingsFormatTestCase becase we can only handle DOCS_ONLY fields: - -public class TestAutoPrefixPostingsFormat extends LuceneTestCase { - public void test() throws Exception { - new RandomPostingsTester(random()).testFull(TestUtil.alwaysPostingsFormat(new AutoPrefixPostingsFormat()), - createTempDir("autoprefix"), - IndexOptions.DOCS, - false); - } -} diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixTerms.java b/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixTerms.java deleted file mode 100644 index 316fc9a5709..00000000000 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/autoprefix/TestAutoPrefixTerms.java +++ /dev/null @@ -1,941 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.codecs.autoprefix; - - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Set; - -import org.apache.lucene.analysis.MockAnalyzer; -import org.apache.lucene.codecs.Codec; -import org.apache.lucene.document.BinaryDocValuesField; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.FieldType; -import org.apache.lucene.document.NumericDocValuesField; -import org.apache.lucene.document.StringField; -import org.apache.lucene.index.BinaryDocValues; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.MultiDocValues; -import org.apache.lucene.index.MultiFields; -import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.index.RandomIndexWriter; -import org.apache.lucene.index.SerialMergeScheduler; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.PrefixQuery; -import org.apache.lucene.search.TermRangeQuery; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.FixedBitSet; -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.MathUtil; -import org.apache.lucene.util.StringHelper; -import org.apache.lucene.util.TestUtil; -import org.apache.lucene.util.automaton.Automata; -import org.apache.lucene.util.automaton.CompiledAutomaton; - -public class TestAutoPrefixTerms extends LuceneTestCase { - - private int minItemsPerBlock = TestUtil.nextInt(random(), 2, 100); - private int maxItemsPerBlock = 2*(Math.max(2, minItemsPerBlock-1)) + random().nextInt(100); - private int minTermsAutoPrefix = TestUtil.nextInt(random(), 2, 100); - private int maxTermsAutoPrefix = random().nextBoolean() ? 
Math.max(2, (minTermsAutoPrefix-1)*2 + random().nextInt(100)) : Integer.MAX_VALUE; - - public TestAutoPrefixTerms() { - if (LuceneTestCase.VERBOSE) { - System.out.println("TEST: using minItemsPerBlock=" + minItemsPerBlock + " maxItemsPerBlock=" + maxItemsPerBlock); - System.out.println("TEST: using minTermsAutoPrefix=" + minTermsAutoPrefix + " maxTermsAutoPrefix=" + maxTermsAutoPrefix); - } - } - - private final Codec codec = TestUtil.alwaysPostingsFormat(new AutoPrefixPostingsFormat(minItemsPerBlock, maxItemsPerBlock, - minTermsAutoPrefix, maxTermsAutoPrefix)); - - // Numbers in a restricted range, encoded in decimal, left-0-padded: - public void testBasicNumericRanges() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); - iwc.setCodec(codec); - IndexWriter w = new IndexWriter(dir, iwc); - int numTerms = TestUtil.nextInt(random(), 3000, 50000); - Set terms = new HashSet<>(); - int digits = TestUtil.nextInt(random(), 5, 10); - int maxValue = 1; - for(int i=0;i sortedTerms = new ArrayList<>(terms); - Collections.sort(sortedTerms); - - if (VERBOSE) { - System.out.println("TEST: sorted terms:"); - int idx = 0; - for(String term : sortedTerms) { - System.out.println(idx + ": " + term); - idx++; - } - } - - int iters = atLeast(100); - for(int iter=0;iter max) { - int x = min; - min = max; - max = x; - } - break; - } - - if (VERBOSE) { - System.out.println("\nTEST: iter=" + iter + " min=" + min + " max=" + max); - } - - boolean minInclusive = random().nextBoolean(); - boolean maxInclusive = random().nextBoolean(); - BytesRef minTerm = new BytesRef(String.format(Locale.ROOT, format, min)); - BytesRef maxTerm = new BytesRef(String.format(Locale.ROOT, format, max)); - CompiledAutomaton ca = new CompiledAutomaton(Automata.makeBinaryInterval(minTerm, minInclusive, maxTerm, maxInclusive), - true, false, Integer.MAX_VALUE, true); - - TermsEnum te = ca.getTermsEnum(MultiFields.getTerms(r, "field")); - NumericDocValues docValues = MultiDocValues.getNumericValues(r, "field"); - PostingsEnum postingsEnum = null; - - VerifyAutoPrefixTerms verifier = new VerifyAutoPrefixTerms(r.maxDoc(), minTerm, maxTerm); - - while (te.next() != null) { - if (VERBOSE) { - System.out.println(" got term=" + te.term().utf8ToString()); - } - verifier.sawTerm(te.term()); - postingsEnum = te.postings(postingsEnum); - int docID; - while ((docID = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) { - long v = docValues.get(docID); - assert v >= min && v <= max: "docID=" + docID + " v=" + v; - // The auto-prefix terms should never "overlap" one another, so we should only ever see a given docID one time: - if (VERBOSE) { - System.out.println(" got docID=" + docID + " v=" + v); - } - verifier.sawDoc(docID); - } - } - - int startLoc = Collections.binarySearch(sortedTerms, String.format(Locale.ROOT, format, min)); - if (startLoc < 0) { - startLoc = -startLoc-1; - } else if (minInclusive == false) { - startLoc++; - } - int endLoc = Collections.binarySearch(sortedTerms, String.format(Locale.ROOT, format, max)); - if (endLoc < 0) { - endLoc = -endLoc-2; - } else if (maxInclusive == false) { - endLoc--; - } - verifier.finish(endLoc-startLoc+1, maxTermsAutoPrefix); - } - - r.close(); - w.close(); - dir.close(); - } - - private static BytesRef intToBytes(int v) { - int sortableBits = v ^ 0x80000000; - BytesRef token = new BytesRef(4); - token.length = 4; - int index = 3; - while (index >= 0) { - token.bytes[index] = (byte) (sortableBits & 0xff); - index--; - 
sortableBits >>>= 8; - } - return token; - } - - // Numbers are encoded in full binary (4 byte ints): - public void testBinaryNumericRanges() throws Exception { - if (VERBOSE) { - System.out.println("TEST: minItemsPerBlock=" + minItemsPerBlock); - System.out.println("TEST: maxItemsPerBlock=" + maxItemsPerBlock); - System.out.println("TEST: minTermsAutoPrefix=" + minTermsAutoPrefix); - System.out.println("TEST: maxTermsAutoPrefix=" + maxTermsAutoPrefix); - } - Directory dir = newDirectory(); - IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); - iwc.setCodec(codec); - IndexWriter w = new IndexWriter(dir, iwc); - int numTerms = TestUtil.nextInt(random(), 3000, 50000); - Set terms = new HashSet<>(); - while (terms.size() < numTerms) { - terms.add(random().nextInt()); - } - - for(Integer term : terms) { - Document doc = new Document(); - doc.add(newStringField("field", intToBytes(term), Field.Store.NO)); - doc.add(new NumericDocValuesField("field", term)); - w.addDocument(doc); - } - - if (random().nextBoolean()) { - if (VERBOSE) System.out.println("TEST: now force merge"); - w.forceMerge(1); - } - - IndexReader r = DirectoryReader.open(w); - - List sortedTerms = new ArrayList<>(terms); - Collections.sort(sortedTerms); - - if (VERBOSE) { - System.out.println("TEST: sorted terms:"); - int idx = 0; - for(Integer term : sortedTerms) { - System.out.println(idx + ": " + term); - idx++; - } - } - - int iters = atLeast(100); - for(int iter=0;iter max) { - int x = min; - min = max; - max = x; - } - break; - } - - if (VERBOSE) { - System.out.println("\nTEST: iter=" + iter + " min=" + min + " (" + intToBytes(min) + ") max=" + max + " (" + intToBytes(max) + ")"); - } - - boolean minInclusive = random().nextBoolean(); - BytesRef minTerm = intToBytes(min); - boolean maxInclusive = random().nextBoolean(); - BytesRef maxTerm = intToBytes(max); - CompiledAutomaton ca = new CompiledAutomaton(Automata.makeBinaryInterval(minTerm, minInclusive, maxTerm, maxInclusive), - true, false, Integer.MAX_VALUE, true); - - TermsEnum te = ca.getTermsEnum(MultiFields.getTerms(r, "field")); - NumericDocValues docValues = MultiDocValues.getNumericValues(r, "field"); - PostingsEnum postingsEnum = null; - VerifyAutoPrefixTerms verifier = new VerifyAutoPrefixTerms(r.maxDoc(), minTerm, maxTerm); - while (te.next() != null) { - if (VERBOSE) { - System.out.println(" got term=" + te.term() + " docFreq=" + te.docFreq()); - } - verifier.sawTerm(te.term()); - postingsEnum = te.postings(postingsEnum); - int docID; - while ((docID = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) { - long v = docValues.get(docID); - assert v >= min && v <= max: "docID=" + docID + " v=" + v; - verifier.sawDoc(docID); - } - } - - int startLoc = Collections.binarySearch(sortedTerms, min); - if (startLoc < 0) { - startLoc = -startLoc-1; - } else if (minInclusive == false) { - startLoc++; - } - int endLoc = Collections.binarySearch(sortedTerms, max); - if (endLoc < 0) { - endLoc = -endLoc-2; - } else if (maxInclusive == false) { - endLoc--; - } - int expectedHits = endLoc-startLoc+1; - try { - verifier.finish(expectedHits, maxTermsAutoPrefix); - } catch (AssertionError ae) { - for(int i=0;i min || (v == min && minInclusive)) && - (v < max || (v == max && maxInclusive)); - if (accept) { - System.out.println("MISSING: docID=" + i + " v=" + v + " term=" + intToBytes(v)); - } - } - } - - throw ae; - } - } - - r.close(); - w.close(); - dir.close(); - } - - // Non-numeric, simple prefix query - public void testBasicPrefixTerms() throws 
Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); - iwc.setCodec(codec); - iwc.setMergeScheduler(new SerialMergeScheduler()); - IndexWriter w = new IndexWriter(dir, iwc); - int numTerms = TestUtil.nextInt(random(), 3000, 50000); - Set terms = new HashSet<>(); - while (terms.size() < numTerms) { - terms.add(TestUtil.randomSimpleString(random())); - } - - for(String term : terms) { - Document doc = new Document(); - doc.add(new StringField("field", term, Field.Store.NO)); - doc.add(new BinaryDocValuesField("field", new BytesRef(term))); - w.addDocument(doc); - } - - if (random().nextBoolean()) { - if (VERBOSE) { - System.out.println("TEST: now force merge"); - } - w.forceMerge(1); - } - - IndexReader r = DirectoryReader.open(w); - - List sortedTerms = new ArrayList<>(terms); - Collections.sort(sortedTerms); - - if (VERBOSE) { - System.out.println("TEST: sorted terms:"); - int idx = 0; - for(String term : sortedTerms) { - System.out.println(idx + ": " + term); - idx++; - } - } - - if (VERBOSE) { - System.out.println("TEST: r=" + r); - } - - int iters = atLeast(100); - for(int iter=0;iter prefixCounts = new HashMap<>(); - private int totPrefixCount; - private final BytesRef[] bounds; - private int totTermCount; - private BytesRef lastTerm; - - public VerifyAutoPrefixTerms(int maxDoc, BytesRef... bounds) { - allHits = new FixedBitSet(maxDoc); - assert bounds.length > 0; - this.bounds = bounds; - } - - public void sawTerm(BytesRef term) { - //System.out.println("saw term=" + term); - if (lastTerm != null) { - assertTrue(lastTerm.compareTo(term) < 0); - } - lastTerm = BytesRef.deepCopyOf(term); - totTermCount++; - totPrefixCount += term.length; - for(int i=1;i<=term.length;i++) { - BytesRef prefix = BytesRef.deepCopyOf(term); - prefix.length = i; - Integer count = prefixCounts.get(prefix); - if (count == null) { - count = 1; - } else { - count += 1; - } - prefixCounts.put(prefix, count); - } - } - - public void sawDoc(int docID) { - // The auto-prefix terms should never "overlap" one another, so we should only ever see a given docID one time: - assertFalse(allHits.getAndSet(docID)); - } - - public void finish(int expectedNumHits, int maxPrefixCount) { - - if (maxPrefixCount != -1) { - // Auto-prefix terms were used in this test - long allowedMaxTerms; - - if (bounds.length == 1) { - // Simple prefix query: we should never see more than maxPrefixCount terms, except for the empty string: - if (bounds[0].length == 0) { - allowedMaxTerms = Integer.MAX_VALUE; - } else { - allowedMaxTerms = maxPrefixCount; - } - } else { - // Trickier: we need to allow for maxPrefixTerms for each different leading byte in the min and max: - assert bounds.length == 2; - BytesRef minTerm = bounds[0]; - BytesRef maxTerm = bounds[1]; - - int commonPrefix = 0; - for(int i=0;i min) { - // When maxPrefixCount is small (< 16), each byte of the term can require more than one "level" of auto-prefixing: - // NOTE: this is still only approximate ... 
it's tricky to get a closed form max bound that's "tight" - allowedMaxTerms += MathUtil.log(max-min, maxPrefixCount); - } - } - } - - assertTrue("totTermCount=" + totTermCount + " is > allowedMaxTerms=" + allowedMaxTerms, totTermCount <= allowedMaxTerms); - } - - assertEquals(expectedNumHits, allHits.cardinality()); - int sum = 0; - for(Map.Entry ent : prefixCounts.entrySet()) { - - BytesRef prefix = ent.getKey(); - if (VERBOSE) { - System.out.println(" verify prefix=" + TestUtil.bytesRefToString(prefix) + " count=" + ent.getValue()); - } - - if (maxPrefixCount != -1) { - // Auto-prefix terms were used in this test - - int sumLeftoverSuffix = 0; - for(BytesRef bound : bounds) { - - int minSharedLength = Math.min(bound.length, prefix.length); - int commonPrefix = minSharedLength; - for(int i=0;i { - w.commit(); - }); - assertEquals("ranges can only be indexed with IndexOptions.DOCS (field: foo)", expected.getMessage()); - w.close(); - dir.close(); - } - - /** Make sure you get clear exc. if you try to use this within anything but IndexOptions.DOCS fields. */ - public void testWithPositions() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); - iwc.setCodec(codec); - IndexWriter w = new IndexWriter(dir, iwc); - FieldType ft = new FieldType(StringField.TYPE_NOT_STORED); - ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); - Document doc = new Document(); - doc.add(new Field("foo", "bar bar", ft)); - w.addDocument(doc); - IllegalStateException expected = expectThrows(IllegalStateException.class, () -> { - w.commit(); - }); - assertEquals("ranges can only be indexed with IndexOptions.DOCS (field: foo)", expected.getMessage()); - - w.close(); - dir.close(); - } - - /** Make sure you get clear exc. if you try to use this within anything but IndexOptions.DOCS fields. 
*/ - public void testWithOffsets() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); - iwc.setCodec(codec); - IndexWriter w = new IndexWriter(dir, iwc); - FieldType ft = new FieldType(StringField.TYPE_NOT_STORED); - ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Document doc = new Document(); - doc.add(new Field("foo", "bar bar", ft)); - w.addDocument(doc); - IllegalStateException expected = expectThrows(IllegalStateException.class, () -> { - w.commit(); - }); - assertEquals("ranges can only be indexed with IndexOptions.DOCS (field: foo)", expected.getMessage()); - - w.close(); - dir.close(); - } - - /** Make sure auto prefix terms are used with TermRangeQuery */ - public void testTermRange() throws Exception { - - List prefixes = new ArrayList<>(); - for(int i=1;i<5;i++) { - char[] chars = new char[i]; - Arrays.fill(chars, 'a'); - prefixes.add(new String(chars)); - } - - Set randomTerms = new HashSet<>(); - int numTerms = atLeast(10000); - while (randomTerms.size() < numTerms) { - for(String prefix : prefixes) { - randomTerms.add(prefix + TestUtil.randomSimpleString(random())); - } - } - - // We make term range aa - aa - char start; - char end; - - int actualCount; - boolean startInclusive = random().nextBoolean(); - boolean endInclusive = random().nextBoolean(); - String startTerm; - String endTerm; - - while (true) { - start = (char) TestUtil.nextInt(random(), 'a', 'm'); - end = (char) TestUtil.nextInt(random(), start+1, 'z'); - - actualCount = 0; - - startTerm = "aa" + start; - endTerm = "aa" + end; - - for(String term : randomTerms) { - int cmpStart = startTerm.compareTo(term); - int cmpEnd = endTerm.compareTo(term); - if ((cmpStart < 0 || (startInclusive && cmpStart == 0)) && - (cmpEnd > 0 || (endInclusive && cmpEnd == 0))) { - actualCount++; - } - } - - if (actualCount > 2000) { - break; - } - } - - if (VERBOSE) { - System.out.println("start " + startTerm + " inclusive? " + startInclusive); - System.out.println("end " + endTerm + " inclusive? " + endInclusive); - System.out.println("actual count " + actualCount); - } - - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - int minTermsInBlock = TestUtil.nextInt(random(), 2, 100); - int maxTermsInBlock = Math.max(2, (minTermsInBlock-1)*2 + random().nextInt(100)); - - int minTermsAutoPrefix = TestUtil.nextInt(random(), 2, 100); - int maxTermsAutoPrefix = random().nextBoolean() ? 
Math.max(2, (minTermsAutoPrefix-1)*2 + random().nextInt(100)) : Integer.MAX_VALUE; - - if (VERBOSE) { - System.out.println("minTermsAutoPrefix " + minTermsAutoPrefix); - System.out.println("maxTermsAutoPrefix " + maxTermsAutoPrefix); - } - - iwc.setCodec(TestUtil.alwaysPostingsFormat(new AutoPrefixPostingsFormat(minTermsInBlock, maxTermsInBlock, - minTermsAutoPrefix, maxTermsAutoPrefix))); - RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); - - if (VERBOSE) { - System.out.println("TEST: index terms"); - } - for (String term : randomTerms) { - Document doc = new Document(); - doc.add(new StringField("field", term, Field.Store.NO)); - w.addDocument(doc); - if (VERBOSE) { - System.out.println(" " + term); - } - } - - if (VERBOSE) { - System.out.println("TEST: now force merge"); - } - - w.forceMerge(1); - IndexReader r = w.getReader(); - final Terms terms = MultiFields.getTerms(r, "field"); - IndexSearcher s = new IndexSearcher(r); - final int finalActualCount = actualCount; - if (VERBOSE) { - System.out.println("start=" + startTerm + " end=" + endTerm + " startIncl=" + startInclusive + " endIncl=" + endInclusive); - } - TermRangeQuery q = new TermRangeQuery("field", new BytesRef(startTerm), new BytesRef(endTerm), startInclusive, endInclusive) { - public TermRangeQuery checkTerms() throws IOException { - TermsEnum termsEnum = getTermsEnum(terms, new AttributeSource()); - int count = 0; - while (termsEnum.next() != null) { - if (VERBOSE) { - System.out.println("got term: " + termsEnum.term().utf8ToString()); - } - count++; - } - if (VERBOSE) { - System.out.println("count " + count + " vs finalActualCount=" + finalActualCount); - } - - // Auto-prefix term(s) should have kicked in, so we should have visited fewer than the total number of aa* terms: - assertTrue(count < finalActualCount); - - return this; - } - }.checkTerms(); - - int maxClauseCount = BooleanQuery.getMaxClauseCount(); - - try { - - // TODO test with boolean rewrite as well once we can create term - // queries on fake terms - /*if (random().nextBoolean()) { - q.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_REWRITE); - BooleanQuery.setMaxClauseCount(actualCount); - } else if (random().nextBoolean()) { - q.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE); - BooleanQuery.setMaxClauseCount(actualCount); - }*/ - - if (VERBOSE) { - System.out.println("TEST: use rewrite method " + q.getRewriteMethod()); - } - assertEquals(actualCount, s.search(q, 1).totalHits); - } finally { - BooleanQuery.setMaxClauseCount(maxClauseCount); - } - - // Test when min == max: - List randomTermsList = new ArrayList<>(randomTerms); - for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) { - String term = randomTermsList.get(random().nextInt(randomTermsList.size())); - q = new TermRangeQuery("field", new BytesRef(term), new BytesRef(term), true, true); - assertEquals(1, s.search(q, 1).totalHits); - } - - r.close(); - w.close(); - dir.close(); - } - - - /** Make sure auto prefix terms are used with PrefixQuery. 
*/ - public void testPrefixQuery() throws Exception { - - List prefixes = new ArrayList<>(); - for(int i=1;i<5;i++) { - char[] chars = new char[i]; - Arrays.fill(chars, 'a'); - prefixes.add(new String(chars)); - } - - Set randomTerms = new HashSet<>(); - int numTerms = atLeast(10000); - while (randomTerms.size() < numTerms) { - for(String prefix : prefixes) { - randomTerms.add(prefix + TestUtil.randomRealisticUnicodeString(random())); - } - } - - int actualCount = 0; - for(String term : randomTerms) { - if (term.startsWith("aa")) { - actualCount++; - } - } - - //System.out.println("actual count " + actualCount); - - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - int minTermsInBlock = TestUtil.nextInt(random(), 2, 100); - int maxTermsInBlock = Math.max(2, (minTermsInBlock-1)*2 + random().nextInt(100)); - - // As long as this is never > actualCount, aa should always see at least one auto-prefix term: - int minTermsAutoPrefix = TestUtil.nextInt(random(), 2, actualCount); - int maxTermsAutoPrefix = random().nextBoolean() ? Math.max(2, (minTermsAutoPrefix-1)*2 + random().nextInt(100)) : Integer.MAX_VALUE; - - iwc.setCodec(TestUtil.alwaysPostingsFormat(new AutoPrefixPostingsFormat(minTermsInBlock, maxTermsInBlock, - minTermsAutoPrefix, maxTermsAutoPrefix))); - RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); - - for (String term : randomTerms) { - Document doc = new Document(); - doc.add(new StringField("field", term, Field.Store.NO)); - w.addDocument(doc); - } - - w.forceMerge(1); - IndexReader r = w.getReader(); - final Terms terms = MultiFields.getTerms(r, "field"); - IndexSearcher s = new IndexSearcher(r); - final int finalActualCount = actualCount; - PrefixQuery q = new PrefixQuery(new Term("field", "aa")) { - public PrefixQuery checkTerms() throws IOException { - TermsEnum termsEnum = getTermsEnum(terms, new AttributeSource()); - int count = 0; - while (termsEnum.next() != null) { - //System.out.println("got term: " + termsEnum.term().utf8ToString()); - count++; - } - - // Auto-prefix term(s) should have kicked in, so we should have visited fewer than the total number of aa* terms: - assertTrue(count < finalActualCount); - - return this; - } - }.checkTerms(); - - int x = BooleanQuery.getMaxClauseCount(); - try { - // TODO test with boolean rewrite as well once we can create term - // queries on fake terms - /*BooleanQuery.setMaxClauseCount(randomTerms.size()); - if (random().nextBoolean()) { - q.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_REWRITE); - } else if (random().nextBoolean()) { - q.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE); - }*/ - - assertEquals(actualCount, s.search(q, 1).totalHits); - } finally { - BooleanQuery.setMaxClauseCount(x); - } - - r.close(); - w.close(); - dir.close(); - } -} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java b/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java index 595bfbb8ad5..c317668a12c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java @@ -16,7 +16,6 @@ */ package org.apache.lucene.codecs; -import org.apache.lucene.codecs.blocktree.BlockTreeTermsReader; // javadocs import org.apache.lucene.index.OrdTermState; import org.apache.lucene.index.TermState; @@ -39,11 +38,6 @@ public class BlockTermState extends OrdTermState { // TODO: update BTR to nuke this public long blockFilePointer; - /** True if this 
term is "real" (e.g., not an auto-prefix term or - * some other "secret" term; currently only {@link BlockTreeTermsReader} - * sets this). */ - public boolean isRealTerm = true; - /** Sole constructor. (For invocation by subclass * constructors, typically implicit.) */ protected BlockTermState() { @@ -58,16 +52,10 @@ public class BlockTermState extends OrdTermState { totalTermFreq = other.totalTermFreq; termBlockOrd = other.termBlockOrd; blockFilePointer = other.blockFilePointer; - isRealTerm = other.isRealTerm; - } - - @Override - public boolean isRealTerm() { - return isRealTerm; } @Override public String toString() { - return "docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " termBlockOrd=" + termBlockOrd + " blockFP=" + blockFilePointer + " isRealTerm=" + isRealTerm; + return "docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " termBlockOrd=" + termBlockOrd + " blockFP=" + blockFilePointer; } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java deleted file mode 100644 index 501068d627a..00000000000 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java +++ /dev/null @@ -1,438 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.codecs.blocktree; - - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -import org.apache.lucene.index.FilteredTermsEnum; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefBuilder; -import org.apache.lucene.util.StringHelper; - -// TODO: instead of inlining auto-prefix terms with normal terms, -// we could write them into their own virtual/private field. This -// would make search time a bit more complex, since we'd need to -// merge sort between two TermEnums, but it would also make stats -// API (used by CheckIndex -verbose) easier to implement since we could -// just walk this virtual field and gather its stats) - -/** Used in the first pass when writing a segment to locate - * "appropriate" auto-prefix terms to pre-compile into the index. - * This visits every term in the index to find prefixes that - * match {@code >= min} and {@code <= max} number of terms. 
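To make the thresholding concrete, a self-contained toy (made-up corpus; not the removed implementation) computing the per-prefix term counts that the writer below compares against minItemsInPrefix/maxItemsInPrefix:

    import java.util.List;
    import java.util.Map;
    import java.util.TreeMap;

    public class PrefixCountSketch {
      public static void main(String[] args) {
        // For every prefix, count how many terms start with it:
        Map<String, Integer> counts = new TreeMap<>();
        for (String term : List.of("aaa", "aab", "aac", "abb", "b")) {
          for (int i = 1; i <= term.length(); i++) {
            counts.merge(term.substring(0, i), 1, Integer::sum);
          }
        }
        // Prints {a=4, aa=3, aaa=1, aab=1, aac=1, ab=1, abb=1, b=1}.
        // With min=2 and max=3, "aa" (3 terms) qualifies for an auto-prefix
        // term aa*, while "a" (4 terms) exceeds max and is floor-split.
        System.out.println(counts);
      }
    }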
*/ - -class AutoPrefixTermsWriter { - - //static boolean DEBUG = BlockTreeTermsWriter.DEBUG; - //static boolean DEBUG = false; - //static boolean DEBUG2 = BlockTreeTermsWriter.DEBUG2; - //static boolean DEBUG2 = true; - - /** Describes a range of term-space to match, either a simple prefix - * (foo*) or a floor-block range of a prefix (e.g. foo[a-m]*, - * foo[n-z]*) when there are too many terms starting with foo*. */ - public static final class PrefixTerm implements Comparable { - /** Common prefix */ - public final byte[] prefix; - - /** If this is -2, this is a normal prefix (foo *), else it's the minimum lead byte of the suffix (e.g. 'd' in foo[d-m]*). */ - public final int floorLeadStart; - - /** The lead byte (inclusive) of the suffix for the term range we match (e.g. 'm' in foo[d-m*]); this is ignored when - * floorLeadStart is -2. */ - public final int floorLeadEnd; - - public final BytesRef term; - - /** Sole constructor. */ - public PrefixTerm(byte[] prefix, int floorLeadStart, int floorLeadEnd) { - this.prefix = prefix; - this.floorLeadStart = floorLeadStart; - this.floorLeadEnd = floorLeadEnd; - this.term = toBytesRef(prefix, floorLeadStart); - - assert floorLeadEnd >= floorLeadStart; - assert floorLeadEnd >= 0; - assert floorLeadStart == -2 || floorLeadStart >= 0; - - // We should never create empty-string prefix term: - assert prefix.length > 0 || floorLeadStart != -2 || floorLeadEnd != 0xff; - } - - @Override - public String toString() { - String s = brToString(new BytesRef(prefix)); - if (floorLeadStart == -2) { - s += "[-" + Integer.toHexString(floorLeadEnd) + "]"; - } else { - s += "[" + Integer.toHexString(floorLeadStart) + "-" + Integer.toHexString(floorLeadEnd) + "]"; - } - return s; - } - - @Override - public int compareTo(PrefixTerm other) { - int cmp = term.compareTo(other.term); - if (cmp == 0) { - if (prefix.length != other.prefix.length) { - return prefix.length - other.prefix.length; - } - - // On tie, sort the bigger floorLeadEnd, earlier, since it - // spans more terms, so during intersect, we want to encounter this one - // first so we can use it if the automaton accepts the larger range: - cmp = other.floorLeadEnd - floorLeadEnd; - } - - return cmp; - } - - /** Returns the leading term for this prefix term, e.g. "foo" (for - * the foo* prefix) or "foom" (for the foo[m-z]* case). 
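To make this encoding concrete, a sketch using the constructor as declared above (char literals stand in for suffix lead bytes):

    byte[] foo = "foo".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    // foo*      : floorLeadStart == -2, so term == "foo"
    PrefixTerm all   = new PrefixTerm(foo, -2, 0xff);
    // foo[m-z]* : the floor lead start byte is appended, so term == "foom",
    // sorting after "foo" and after any foo[a-l]* floor sibling:
    PrefixTerm floor = new PrefixTerm(foo, 'm', 'z');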
*/ - private static BytesRef toBytesRef(byte[] prefix, int floorLeadStart) { - BytesRef br; - if (floorLeadStart != -2) { - assert floorLeadStart >= 0; - br = new BytesRef(prefix.length+1); - } else { - br = new BytesRef(prefix.length); - } - System.arraycopy(prefix, 0, br.bytes, 0, prefix.length); - br.length = prefix.length; - if (floorLeadStart != -2) { - assert floorLeadStart >= 0; - br.bytes[br.length++] = (byte) floorLeadStart; - } - - return br; - } - - public int compareTo(BytesRef term) { - return this.term.compareTo(term); - } - - public TermsEnum getTermsEnum(TermsEnum in) { - - final BytesRef prefixRef = new BytesRef(prefix); - - return new FilteredTermsEnum(in) { - { - setInitialSeekTerm(term); - } - - @Override - protected AcceptStatus accept(BytesRef term) { - if (StringHelper.startsWith(term, prefixRef) && - (floorLeadEnd == -1 || term.length == prefixRef.length || (term.bytes[term.offset + prefixRef.length] & 0xff) <= floorLeadEnd)) { - return AcceptStatus.YES; - } else { - return AcceptStatus.END; - } - } - }; - } - } - - // for debugging - static String brToString(BytesRef b) { - try { - return b.utf8ToString() + " " + b; - } catch (Throwable t) { - // If BytesRef isn't actually UTF8, or it's eg a - // prefix of UTF8 that ends mid-unicode-char, we - // fallback to hex: - return b.toString(); - } - } - - final List prefixes = new ArrayList<>(); - private final int minItemsInPrefix; - private final int maxItemsInPrefix; - - // Records index into pending where the current prefix at that - // length "started"; for example, if current term starts with 't', - // startsByPrefix[0] is the index into pending for the first - // term/sub-block starting with 't'. We use this to figure out when - // to write a new block: - private final BytesRefBuilder lastTerm = new BytesRefBuilder(); - private int[] prefixStarts = new int[8]; - private List pending = new ArrayList<>(); - - //private final String segment; - - public AutoPrefixTermsWriter(Terms terms, int minItemsInPrefix, int maxItemsInPrefix) throws IOException { - this.minItemsInPrefix = minItemsInPrefix; - this.maxItemsInPrefix = maxItemsInPrefix; - //this.segment = segment; - - TermsEnum termsEnum = terms.iterator(); - while (true) { - BytesRef term = termsEnum.next(); - if (term == null) { - break; - } - //if (DEBUG) System.out.println("pushTerm: " + brToString(term)); - pushTerm(term); - } - - if (pending.size() > 1) { - pushTerm(BlockTreeTermsWriter.EMPTY_BYTES_REF); - - // Also maybe save floor prefixes in root block; this can be a biggish perf gain for large ranges: - /* - System.out.println("root block pending.size=" + pending.size()); - for(Object o : pending) { - System.out.println(" " + o); - } - */ - while (pending.size() >= minItemsInPrefix) { - savePrefixes(0, pending.size()); - } - } - - // Even though we visited terms in already-sorted order, the prefixes - // can be slightly unsorted, e.g. aaaaa will be before aaa, so we - // must sort here so our caller can do merge sort into actual terms - // when writing. Probably we should use CollectionUtil.timSort here? - Collections.sort(prefixes); - } - - /** Pushes the new term to the top of the stack, and writes new blocks. 
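An assumed walk-through of this stack discipline, with minItemsInPrefix=2 and the sorted input terms aaa, aab, ab, b:

    // pushTerm(aaa): nothing to close                  -> pending = [aaa]
    // pushTerm(aab): shares "aa" with the last term    -> pending = [aaa, aab]
    // pushTerm(ab):  closes suffix "aa"; two entries share it (>= min),
    //                so savePrefixes emits aa*         -> pending = [aa*, ab]
    // pushTerm(b):   closes suffix "a"; the aa* prefix term plus ab reach
    //                min again, so savePrefixes emits a* -> pending = [a*, b]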
*/ - private void pushTerm(BytesRef text) throws IOException { - int limit = Math.min(lastTerm.length(), text.length); - //if (DEBUG) System.out.println("\nterm: " + text.utf8ToString()); - - // Find common prefix between last term and current term: - int pos = 0; - while (pos < limit && lastTerm.byteAt(pos) == text.bytes[text.offset+pos]) { - pos++; - } - - //if (DEBUG) System.out.println(" shared=" + pos + " lastTerm.length=" + lastTerm.length()); - - // Close the "abandoned" suffix now: - for(int i=lastTerm.length()-1;i>=pos;i--) { - - // How many items on top of the stack share the current suffix - // we are closing: - int prefixTopSize = pending.size() - prefixStarts[i]; - - while (prefixTopSize >= minItemsInPrefix) { - //if (DEBUG) System.out.println(" pop: i=" + i + " prefixTopSize=" + prefixTopSize + " minItemsInBlock=" + minItemsInPrefix); - savePrefixes(i+1, prefixTopSize); - //prefixStarts[i] -= prefixTopSize; - //if (DEBUG) System.out.println(" after savePrefixes: " + (pending.size() - prefixStarts[i]) + " pending.size()=" + pending.size() + " start=" + prefixStarts[i]); - - // For large floor blocks, it's possible we should now re-run on the new prefix terms we just created: - prefixTopSize = pending.size() - prefixStarts[i]; - } - } - - if (prefixStarts.length < text.length) { - prefixStarts = ArrayUtil.grow(prefixStarts, text.length); - } - - // Init new tail: - for(int i=pos;i 0 || pending.isEmpty()) { - byte[] termBytes = new byte[text.length]; - System.arraycopy(text.bytes, text.offset, termBytes, 0, text.length); - pending.add(termBytes); - } - } - - void savePrefixes(int prefixLength, int count) throws IOException { - - assert count > 0; - - /* - if (DEBUG2) { - BytesRef br = new BytesRef(lastTerm.bytes()); - br.length = prefixLength; - //System.out.println(" savePrefixes: seg=" + segment + " " + brToString(br) + " count=" + count + " pending.size()=" + pending.size()); - System.out.println(" savePrefixes: " + brToString(br) + " count=" + count + " pending.size()=" + pending.size()); - } - */ - - int lastSuffixLeadLabel = -2; - - int start = pending.size()-count; - assert start >=0; - - // Special case empty-string suffix case: we are being asked to build prefix terms for all aaa* terms, but - // the exact term aaa is here, and we must skip it (it is handled "higher", under the aa* terms): - Object o = pending.get(start); - boolean skippedEmptyStringSuffix = false; - if (o instanceof byte[]) { - if (((byte[]) o).length == prefixLength) { - start++; - count--; - //if (DEBUG) System.out.println(" skip empty-string term suffix"); - skippedEmptyStringSuffix = true; - } - } else { - PrefixTerm prefix = (PrefixTerm) o; - if (prefix.term.bytes.length == prefixLength) { - start++; - count--; - //if (DEBUG) System.out.println(" skip empty-string PT suffix"); - skippedEmptyStringSuffix = true; - } - } - - int end = pending.size(); - int nextBlockStart = start; - int nextFloorLeadLabel = -1; - int prefixCount = 0; - - PrefixTerm lastPTEntry = null; - - for (int i=start; i prefixLength; - ptEntry = null; - } - } - - //if (DEBUG) System.out.println(" check term=" + brToString(new BytesRef(termBytes)) + " o=" + o); - - // We handled the empty-string suffix case up front: - assert termBytes.length > prefixLength; - - int suffixLeadLabel = termBytes[prefixLength] & 0xff; - - //if (DEBUG) System.out.println(" i=" + i + " o=" + o + " suffixLeadLabel=" + Integer.toHexString(suffixLeadLabel) + " pendingCount=" + (i - nextBlockStart) + " min=" + minItemsInPrefix); - - if (suffixLeadLabel != 
lastSuffixLeadLabel) { - // This is a boundary, a chance to make an auto-prefix term if we want: - - // When we are "recursing" (generating auto-prefix terms on a block of - // floor'd auto-prefix terms), this assert is non-trivial because it - // ensures the floorLeadEnd of the previous terms is in fact less - // than the lead start of the current entry: - assert suffixLeadLabel > lastSuffixLeadLabel: "suffixLeadLabel=" + suffixLeadLabel + " vs lastSuffixLeadLabel=" + lastSuffixLeadLabel; - - int itemsInBlock = i - nextBlockStart; - - if (itemsInBlock >= minItemsInPrefix && end-nextBlockStart > maxItemsInPrefix) { - // The count is too large for one block, so we must break it into "floor" blocks, where we record - // the leading label of the suffix of the first term in each floor block, so at search time we can - // jump to the right floor block. We just use a naive greedy segmenter here: make a new floor - // block as soon as we have at least minItemsInBlock. This is not always best: it often produces - // a too-small block as the final block: - - // If the last entry was another prefix term of the same length, then it represents a range of terms, so we must use its ending - // prefix label as our ending label: - if (lastPTEntry != null) { - //if (DEBUG) System.out.println(" use last"); - lastSuffixLeadLabel = lastPTEntry.floorLeadEnd; - } - savePrefix(prefixLength, nextFloorLeadLabel, lastSuffixLeadLabel); - - prefixCount++; - nextFloorLeadLabel = suffixLeadLabel; - nextBlockStart = i; - } - - if (nextFloorLeadLabel == -1) { - nextFloorLeadLabel = suffixLeadLabel; - //if (DEBUG) System.out.println("set first lead label=" + nextFloorLeadLabel); - } - - lastSuffixLeadLabel = suffixLeadLabel; - } - - lastPTEntry = ptEntry; - } - - // Write last block, if any: - if (nextBlockStart < end) { - //System.out.println(" lastPTEntry=" + lastPTEntry + " lastSuffixLeadLabel=" + lastSuffixLeadLabel); - if (lastPTEntry != null) { - lastSuffixLeadLabel = lastPTEntry.floorLeadEnd; - } - assert lastSuffixLeadLabel >= nextFloorLeadLabel: "lastSuffixLeadLabel=" + lastSuffixLeadLabel + " nextFloorLeadLabel=" + nextFloorLeadLabel; - if (prefixCount == 0) { - if (prefixLength > 0) { - savePrefix(prefixLength, -2, 0xff); - prefixCount++; - - // If we skipped empty string suffix, e.g. term aaa for prefix aaa*, since we - // are now writing the full aaa* prefix term, we include it here: - if (skippedEmptyStringSuffix) { - count++; - } - } else { - // Don't add a prefix term for all terms in the index! 
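    // (Presumably because a zero-length prefix matches every term in the
    // field, so it could never narrow what Terms.intersect must visit.)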
- } - } else { - if (lastSuffixLeadLabel == -2) { - // Special case when closing the empty string root block: - lastSuffixLeadLabel = 0xff; - } - savePrefix(prefixLength, nextFloorLeadLabel, lastSuffixLeadLabel); - prefixCount++; - } - } - - // Remove slice from the top of the pending stack, that we just wrote: - - pending.subList(pending.size()-count, pending.size()).clear(); - - // Append prefix terms for each prefix, since these count like real terms that also need to be "rolled up": - for(int i=0;i= VERSION_AUTO_PREFIX_TERMS_REMOVED) { + // Old (pre-5.2.0) or recent (6.2.0+) index, no auto-prefix terms: this.anyAutoPrefixTerms = false; } else if (version == VERSION_AUTO_PREFIX_TERMS) { // 5.2.x index, might have auto-prefix terms: this.anyAutoPrefixTerms = true; } else { // 5.3.x index, we record up front if we may have written any auto-prefix terms: - assert version >= VERSION_AUTO_PREFIX_TERMS_COND; + assert version == VERSION_AUTO_PREFIX_TERMS_COND; byte b = termsIn.readByte(); if (b == 0) { this.anyAutoPrefixTerms = false; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java index 340b0c94549..a4a150ba14b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java @@ -25,13 +25,11 @@ import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.blocktree.AutoPrefixTermsWriter.PrefixTerm; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; @@ -90,15 +88,6 @@ import org.apache.lucene.util.packed.PackedInts; * between two index terms. *
 *
- * If {@code minItemsInAutoPrefix} is not zero, then for
- * {@link IndexOptions#DOCS} fields we detect prefixes that match
- * "enough" terms and insert auto-prefix terms into the index, which are
- * used by {@link Terms#intersect} at search time to speed up prefix
- * and range queries.  Besides {@link Terms#intersect}, these
- * auto-prefix terms are invisible to all other APIs (don't change terms
- * stats, don't show up in normal {@link TermsEnum}s, etc.).
- *
- *
 * Files:
 * <ul>
 *   <li><tt>.tim</tt>: Term Dictionary</li>
@@ -124,7 +113,7 @@ import org.apache.lucene.util.packed.PackedInts;
 * and decoding the Postings Metadata and Term Metadata sections.
 *
 * <ul>
- *    <li>TermsDict (.tim) --&gt; Header, <i>HasAutoPrefixTerms</i>, PostingsHeader, NodeBlock<sup>NumBlocks</sup>,
+ *    <li>TermsDict (.tim) --&gt; Header, PostingsHeader, NodeBlock<sup>NumBlocks</sup>,
 *        FieldSummary, DirOffset, Footer</li>
 *    <li>NodeBlock --&gt; (OuterNode | InnerNode)</li>
 *    <li>OuterNode --&gt; EntryCount, SuffixLength, Byte<sup>SuffixLength</sup>, StatsLength, &lt;TermStats&gt;<sup>EntryCount</sup>, MetaLength, &lt;<i>TermMetadata</i>&gt;<sup>EntryCount</sup></li>
@@ -145,7 +134,6 @@ import org.apache.lucene.util.packed.PackedInts;
 *
 *    <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
 *        for the BlockTree implementation.</li>
- *    <li>HasAutoPrefixTerms is a single byte; 1 means there may be auto-prefix terms and 0 means there are none.</li>
 *    <li>DirOffset is a pointer to the FieldSummary section.</li>
 *    <li>DocFreq is the count of documents which contain the term.</li>
      • TotalTermFreq is the total number of occurrences of the term. This is encoded @@ -223,8 +211,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { final int maxDoc; final int minItemsInBlock; final int maxItemsInBlock; - final int minItemsInAutoPrefix; - final int maxItemsInAutoPrefix; final PostingsWriterBase postingsWriter; final FieldInfos fieldInfos; @@ -260,43 +246,14 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { private final List fields = new ArrayList<>(); - // private final String segment; - final FixedBitSet prefixDocs; - - /** Reused in getAutoPrefixTermsEnum: */ - final BitSetTermsEnum prefixFixedBitsTermsEnum; - - /** Reused in getAutoPrefixTermsEnum: */ - private TermsEnum prefixTermsEnum; - - /** Reused in getAutoPrefixTermsEnum: */ - private PostingsEnum prefixDocsEnum; - - /** Create a new writer, using default values for auto-prefix terms. */ - public BlockTreeTermsWriter(SegmentWriteState state, - PostingsWriterBase postingsWriter, - int minItemsInBlock, - int maxItemsInBlock) throws IOException { - this(state, postingsWriter, minItemsInBlock, maxItemsInBlock, 0, 0); - } - /** Create a new writer. The number of items (terms or * sub-blocks) per block will aim to be between * minItemsPerBlock and maxItemsPerBlock, though in some - * cases the blocks may be smaller than the min. - * For DOCS_ONLY fields, this terms dictionary will - * insert automatically generated prefix terms for common - * prefixes, as long as each prefix matches at least - * {@code minItemsInAutoPrefix} other terms or prefixes, - * and at most {@code maxItemsInAutoPrefix} other terms - * or prefixes. Set {@code minItemsInAutoPrefix} to 0 - * to disable auto-prefix terms. */ + * cases the blocks may be smaller than the min. */ public BlockTreeTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter, int minItemsInBlock, - int maxItemsInBlock, - int minItemsInAutoPrefix, - int maxItemsInAutoPrefix) + int maxItemsInBlock) throws IOException { validateSettings(minItemsInBlock, @@ -305,21 +262,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { this.minItemsInBlock = minItemsInBlock; this.maxItemsInBlock = maxItemsInBlock; - validateAutoPrefixSettings(minItemsInAutoPrefix, - maxItemsInAutoPrefix); - - if (minItemsInAutoPrefix != 0) { - // TODO: can we used compressed bitset instead? that auto-upgrades if it's dense enough... 
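    // prefixDocs accumulates the union of postings of every term matching the
    // current prefix (filled in getAutoPrefixTermsEnum below); the reusable
    // BitSetTermsEnum then replays that union as the auto-prefix term's postings.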
- prefixDocs = new FixedBitSet(state.segmentInfo.maxDoc()); - prefixFixedBitsTermsEnum = new BitSetTermsEnum(prefixDocs); - } else { - prefixDocs = null; - prefixFixedBitsTermsEnum = null; - } - - this.minItemsInAutoPrefix = minItemsInAutoPrefix; - this.maxItemsInAutoPrefix = maxItemsInAutoPrefix; - this.maxDoc = state.segmentInfo.maxDoc(); this.fieldInfos = state.fieldInfos; this.postingsWriter = postingsWriter; @@ -332,13 +274,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { CodecUtil.writeIndexHeader(termsOut, BlockTreeTermsReader.TERMS_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); - // So at read time we know, globally, that there will be no auto-prefix terms: - if (minItemsInAutoPrefix == 0) { - termsOut.writeByte((byte) 0); - } else { - termsOut.writeByte((byte) 1); - } - final String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockTreeTermsReader.TERMS_INDEX_EXTENSION); indexOut = state.directory.createOutput(indexName, state.context); CodecUtil.writeIndexHeader(indexOut, BlockTreeTermsReader.TERMS_INDEX_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT, @@ -380,25 +315,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { } } - /** Throws {@code IllegalArgumentException} if any of these settings - * is invalid. */ - public static void validateAutoPrefixSettings(int minItemsInAutoPrefix, - int maxItemsInAutoPrefix) { - if (minItemsInAutoPrefix != 0) { - if (minItemsInAutoPrefix < 2) { - throw new IllegalArgumentException("minItemsInAutoPrefix must be at least 2; got minItemsInAutoPrefix=" + minItemsInAutoPrefix); - } - if (minItemsInAutoPrefix > maxItemsInAutoPrefix) { - throw new IllegalArgumentException("maxItemsInAutoPrefix must be >= minItemsInAutoPrefix; got maxItemsInAutoPrefix=" + maxItemsInAutoPrefix + " minItemsInAutoPrefix=" + minItemsInAutoPrefix); - } - if (2*(minItemsInAutoPrefix-1) > maxItemsInAutoPrefix) { - throw new IllegalArgumentException("maxItemsInAutoPrefix must be at least 2*(minItemsInAutoPrefix-1); got maxItemsInAutoPrefix=" + maxItemsInAutoPrefix + " minItemsInAutoPrefix=" + minItemsInAutoPrefix); - } - } else if (maxItemsInAutoPrefix != 0) { - throw new IllegalArgumentException("maxItemsInAutoPrefix must be 0 (disabled) when minItemsInAutoPrefix is 0"); - } - } - @Override public void write(Fields fields) throws IOException { //if (DEBUG) System.out.println("\nBTTW.write seg=" + segment); @@ -413,75 +329,26 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { if (terms == null) { continue; } - FieldInfo fieldInfo = fieldInfos.fieldInfo(field); - - // First pass to find all prefix terms we should compile into the index: - List prefixTerms; - if (minItemsInAutoPrefix != 0) { - if (fieldInfo.getIndexOptions() != IndexOptions.DOCS) { - throw new IllegalStateException("ranges can only be indexed with IndexOptions.DOCS (field: " + fieldInfo.name + ")"); - } - prefixTerms = new AutoPrefixTermsWriter(terms, minItemsInAutoPrefix, maxItemsInAutoPrefix).prefixes; - //if (DEBUG) { - // for(PrefixTerm term : prefixTerms) { - // System.out.println("field=" + fieldInfo.name + " PREFIX TERM: " + term); - // } - //} - } else { - prefixTerms = null; - } TermsEnum termsEnum = terms.iterator(); TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field)); - int prefixTermUpto = 0; while (true) { BytesRef term = termsEnum.next(); //if (DEBUG) System.out.println("BTTW: next term " + term); - // Insert (merge sort) next 
@@ -413,75 +329,26 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
       if (terms == null) {
         continue;
       }
-      FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
-
-      // First pass to find all prefix terms we should compile into the index:
-      List<PrefixTerm> prefixTerms;
-      if (minItemsInAutoPrefix != 0) {
-        if (fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
-          throw new IllegalStateException("ranges can only be indexed with IndexOptions.DOCS (field: " + fieldInfo.name + ")");
-        }
-        prefixTerms = new AutoPrefixTermsWriter(terms, minItemsInAutoPrefix, maxItemsInAutoPrefix).prefixes;
-        //if (DEBUG) {
-        //  for(PrefixTerm term : prefixTerms) {
-        //    System.out.println("field=" + fieldInfo.name + " PREFIX TERM: " + term);
-        //  }
-        //}
-      } else {
-        prefixTerms = null;
-      }
 
       TermsEnum termsEnum = terms.iterator();
       TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field));
-      int prefixTermUpto = 0;
       while (true) {
         BytesRef term = termsEnum.next();
         //if (DEBUG) System.out.println("BTTW: next term " + term);
 
-        // Insert (merge sort) next prefix term(s):
-        if (prefixTerms != null) {
-          while (prefixTermUpto < prefixTerms.size() && (term == null || prefixTerms.get(prefixTermUpto).compareTo(term) <= 0)) {
-            PrefixTerm prefixTerm = prefixTerms.get(prefixTermUpto);
-            //if (DEBUG) System.out.println("seg=" + segment + " field=" + fieldInfo.name + " NOW INSERT prefix=" + prefixTerm);
-            termsWriter.write(prefixTerm.term, getAutoPrefixTermsEnum(terms, prefixTerm), prefixTerm);
-            prefixTermUpto++;
-          }
-        }
-
         if (term == null) {
           break;
         }
 
         //if (DEBUG) System.out.println("write field=" + fieldInfo.name + " term=" + brToString(term));
-        termsWriter.write(term, termsEnum, null);
+        termsWriter.write(term, termsEnum);
       }
 
-      assert prefixTerms == null || prefixTermUpto == prefixTerms.size();
-
       termsWriter.finish();
 
       //if (DEBUG) System.out.println("\nBTTW.write done seg=" + segment + " field=" + field);
     }
   }
-
-  private TermsEnum getAutoPrefixTermsEnum(Terms terms, final PrefixTerm prefix) throws IOException {
-    assert prefixDocs != null;
-    prefixDocs.clear(0, prefixDocs.length());
-
-    prefixTermsEnum = prefix.getTermsEnum(terms.iterator());
-
-    //System.out.println("BTTW.getAutoPrefixTE: prefix=" + prefix);
-    while (prefixTermsEnum.next() != null) {
-      //System.out.println("  got term=" + prefixTermsEnum.term().utf8ToString());
-      //termCount++;
-      prefixDocsEnum = prefixTermsEnum.postings(prefixDocsEnum, 0);
-      //System.out.println("  " + prefixDocsEnum + " doc=" + prefixDocsEnum.docID());
-      prefixDocs.or(prefixDocsEnum);
-    }
-
-    //System.out.println("  done terms: " + prefixDocs.cardinality() + " doc seen; " + termCount + " terms seen");
-    return prefixFixedBitsTermsEnum;
-  }
 
   static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) {
     assert fp < (1L << 62);
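
The getAutoPrefixTermsEnum method removed above boiled down to OR'ing the docs-only postings of every term under a prefix into one FixedBitSet, then replaying that bitset as a single synthetic term. The core pattern as a standalone, hedged sketch (imports from org.apache.lucene.index and org.apache.lucene.util assumed; the 0 flags value in the removed code is PostingsEnum.NONE):

    // Union the postings of all terms in an enum into one bitset of docIDs:
    static FixedBitSet unionPostings(TermsEnum termsEnum, int maxDoc) throws IOException {
      FixedBitSet docs = new FixedBitSet(maxDoc);
      PostingsEnum postings = null;                      // reused across terms
      while (termsEnum.next() != null) {
        postings = termsEnum.postings(postings, PostingsEnum.NONE);
        docs.or(postings);                               // FixedBitSet.or(DocIdSetIterator)
      }
      return docs;
    }
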
@@ -500,16 +367,12 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
 
     public final byte[] termBytes;
     // stats + metadata
     public final BlockTermState state;
-    // Non-null if this is an auto-prefix-term:
-    public final PrefixTerm prefixTerm;
-    public PendingTerm other;
 
-    public PendingTerm(BytesRef term, BlockTermState state, PrefixTerm prefixTerm) {
+    public PendingTerm(BytesRef term, BlockTermState state) {
       super(true);
       this.termBytes = new byte[term.length];
       System.arraycopy(term.bytes, term.offset, termBytes, 0, term.length);
       this.state = state;
-      this.prefixTerm = prefixTerm;
     }
 
     @Override
@@ -698,7 +561,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
       // only points to sub-blocks in the terms index so we can avoid seeking
      // to it when we are looking for a term):
      boolean hasTerms = false;
-      boolean hasPrefixTerms = false;
      boolean hasSubBlocks = false;
 
      int start = pending.size()-count;
@@ -739,11 +601,10 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
          // block as soon as we have at least minItemsInBlock.  This is not always best: it often produces
          // a too-small block as the final block:
          boolean isFloor = itemsInBlock < count;
-          newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, i, hasTerms, hasPrefixTerms, hasSubBlocks));
+          newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, i, hasTerms, hasSubBlocks));
 
          hasTerms = false;
          hasSubBlocks = false;
-          hasPrefixTerms = false;
          nextFloorLeadLabel = suffixLeadLabel;
          nextBlockStart = i;
        }
@@ -753,7 +614,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
 
        if (ent.isTerm) {
          hasTerms = true;
-          hasPrefixTerms |= ((PendingTerm) ent).prefixTerm != null;
        } else {
          hasSubBlocks = true;
        }
@@ -763,7 +623,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
      if (nextBlockStart < end) {
        int itemsInBlock = end - nextBlockStart;
        boolean isFloor = itemsInBlock < count;
-        newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, end, hasTerms, hasPrefixTerms, hasSubBlocks));
+        newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, end, hasTerms, hasSubBlocks));
      }
 
      assert newBlocks.isEmpty() == false;
@@ -789,7 +649,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
     *  same prefix, and so we broke it into multiple floor blocks where
     *  we record the starting label of the suffix of each floor block. */
    private PendingBlock writeBlock(int prefixLength, boolean isFloor, int floorLeadLabel, int start, int end,
-                                    boolean hasTerms, boolean hasPrefixTerms, boolean hasSubBlocks) throws IOException {
+                                    boolean hasTerms, boolean hasSubBlocks) throws IOException {
 
      assert end > start;
@@ -823,7 +683,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
 
      // We optimize the leaf block case (block has only terms), writing a more
      // compact format in this case:
-      boolean isLeafBlock = hasSubBlocks == false && hasPrefixTerms == false;
+      boolean isLeafBlock = hasSubBlocks == false;
 
      //System.out.println("  isLeaf=" + isLeafBlock);
@@ -839,7 +699,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
          assert ent.isTerm: "i=" + i;
 
          PendingTerm term = (PendingTerm) ent;
-          assert term.prefixTerm == null;
 
          assert StringHelper.startsWith(term.termBytes, prefix): "term.term=" + term.termBytes + " prefix=" + prefix;
          BlockTermState state = term.state;
@@ -876,7 +735,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
      } else {
        // Block has at least one prefix term or a sub block:
        subIndices = new ArrayList<>();
-        boolean sawAutoPrefixTerm = false;
        for (int i=start;i<end;i++) {
          PendingEntry ent = pending.get(i);
          if (ent.isTerm) {
            PendingTerm term = (PendingTerm) ent;
 
            assert StringHelper.startsWith(term.termBytes, prefix): "term.term=" + term.termBytes + " prefix=" + prefix;
            BlockTermState state = term.state;
            final int suffix = term.termBytes.length - prefixLength;
-            if (minItemsInAutoPrefix == 0) {
-              suffixWriter.writeVInt(suffix << 1);
-              suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
-            } else {
-              int code = suffix<<2;
-              int floorLeadEnd = -1;
-              if (term.prefixTerm != null) {
-                assert minItemsInAutoPrefix > 0;
-                sawAutoPrefixTerm = true;
-                PrefixTerm prefixTerm = term.prefixTerm;
-                floorLeadEnd = prefixTerm.floorLeadEnd;
-                assert floorLeadEnd != -1;
-
-                if (prefixTerm.floorLeadStart == -2) {
-                  // Starts with empty string
-                  code |= 2;
-                } else {
-                  code |= 3;
-                }
-              }
-              suffixWriter.writeVInt(code);
-              suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
-              if (floorLeadEnd != -1) {
-                suffixWriter.writeByte((byte) floorLeadEnd);
-              }
-              assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel;
-            }
+            suffixWriter.writeVInt(suffix << 1);
+            suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
 
            // Write term stats, to separate byte[] blob:
            statsWriter.writeVInt(state.docFreq);
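
With the prefix-term flag gone, a term entry in a non-leaf block is again a single vInt carrying the suffix length plus one flag bit. A hedged reader-side sketch, where suffixesReader stands in for a hypothetical DataInput positioned on the entry:

    int code = suffixesReader.readVInt();
    int suffixLength = code >>> 1;           // upper bits: suffix byte count
    boolean isSubBlock = (code & 1) != 0;    // low bit: term vs. sub-block
    // The removed encoding was (suffix<<2)|flags, spending a second bit on
    // "is this an auto-prefix term" plus an optional trailing floorLeadEnd byte.
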
@@ -961,13 +791,8 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
 
            assert suffix > 0;
 
            // For non-leaf block we borrow 1 bit to record
-            // if entry is term or sub-block, and 1 bit (unset here) to
-            // record if it's a prefix term:
-            if (minItemsInAutoPrefix == 0) {
-              suffixWriter.writeVInt((suffix<<1)|1);
-            } else {
-              suffixWriter.writeVInt((suffix<<2)|1);
-            }
+            // if entry is term or sub-block:
+            suffixWriter.writeVInt((suffix<<1)|1);
            suffixWriter.writeBytes(block.prefix.bytes, prefixLength, suffix);
 
            //if (DEBUG2) {
@@ -985,7 +810,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
          }
        }
 
-        assert subIndices.size() != 0 || sawAutoPrefixTerm;
+        assert subIndices.size() != 0;
      }
 
      // TODO: we could block-write the term suffix pointers;
@@ -1029,7 +854,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
    }
 
    /** Writes one term's worth of postings. */
-    public void write(BytesRef text, TermsEnum termsEnum, PrefixTerm prefixTerm) throws IOException {
+    public void write(BytesRef text, TermsEnum termsEnum) throws IOException {
      /*
      if (DEBUG) {
        int[] tmp = new int[lastTerm.length];
@@ -1045,20 +870,17 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
        assert fieldInfo.getIndexOptions() == IndexOptions.DOCS || state.totalTermFreq >= state.docFreq: "postingsWriter=" + postingsWriter;
        pushTerm(text);
 
-        PendingTerm term = new PendingTerm(text, state, prefixTerm);
+        PendingTerm term = new PendingTerm(text, state);
        pending.add(term);
        //if (DEBUG) System.out.println("    add pending term = " + text + " pending.size()=" + pending.size());
 
-        if (prefixTerm == null) {
-          // Only increment stats for real terms:
-          sumDocFreq += state.docFreq;
-          sumTotalTermFreq += state.totalTermFreq;
-          numTerms++;
-          if (firstPendingTerm == null) {
-            firstPendingTerm = term;
-          }
-          lastPendingTerm = term;
+        sumDocFreq += state.docFreq;
+        sumTotalTermFreq += state.totalTermFreq;
+        numTerms++;
+        if (firstPendingTerm == null) {
+          firstPendingTerm = term;
        }
+        lastPendingTerm = term;
      }
    }
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java
index 4d5b4cebffa..3934c553d9b 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java
@@ -348,7 +348,6 @@ final class IntersectTermsEnum extends TermsEnum {
   private boolean skipPastLastAutoPrefixTerm() throws IOException {
     assert currentFrame.isAutoPrefixTerm;
     useAutoPrefixTerm = false;
-    currentFrame.termState.isRealTerm = true;
 
     // If we last returned an auto-prefix term, we must now skip all
     // actual terms sharing that prefix.  At most, that skipping
@@ -694,7 +693,6 @@ final class IntersectTermsEnum extends TermsEnum {
       if (useAutoPrefixTerm) {
         // All suffixes of this auto-prefix term are accepted by the automaton, so we can use it:
         copyTerm();
-        currentFrame.termState.isRealTerm = false;
         return term;
       } else {
         // We move onto the next term
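
After these two removals, IntersectTermsEnum never hands back a "fake" term, so consumers of Terms.intersect need no isRealTerm bookkeeping. A hedged usage sketch, assuming a Terms instance named terms and the automaton classes from org.apache.lucene.util.automaton:

    CompiledAutomaton ca = new CompiledAutomaton(new RegExp("foo.*").toAutomaton());
    TermsEnum it = terms.intersect(ca, null);   // null startTerm: iterate from the beginning
    BytesRef term;
    while ((term = it.next()) != null) {
      // every term surfaced here is a real indexed term
    }
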
diff --git a/lucene/core/src/java/org/apache/lucene/index/TermContext.java b/lucene/core/src/java/org/apache/lucene/index/TermContext.java
index ee5163eba6e..e55aeba7c75 100644
--- a/lucene/core/src/java/org/apache/lucene/index/TermContext.java
+++ b/lucene/core/src/java/org/apache/lucene/index/TermContext.java
@@ -177,18 +177,6 @@ public final class TermContext {
     return totalTermFreq;
   }
 
-  /** Returns true if all terms stored here are real (e.g., not auto-prefix terms).
-   *
-   *  @lucene.internal */
-  public boolean hasOnlyRealTerms() {
-    for (TermState termState : states) {
-      if (termState != null && termState.isRealTerm() == false) {
-        return false;
-      }
-    }
-    return true;
-  }
-
   @Override
   public String toString() {
     StringBuilder sb = new StringBuilder();
diff --git a/lucene/core/src/java/org/apache/lucene/index/TermState.java b/lucene/core/src/java/org/apache/lucene/index/TermState.java
index d8e9be4c619..7d3f1330b57 100644
--- a/lucene/core/src/java/org/apache/lucene/index/TermState.java
+++ b/lucene/core/src/java/org/apache/lucene/index/TermState.java
@@ -48,12 +48,6 @@ public abstract class TermState implements Cloneable {
       // should not happen
       throw new RuntimeException(cnse);
     }
-  }
-
-  /** Returns true if this term is real (e.g., not an auto-prefix term).
-   *  @lucene.internal */
-  public boolean isRealTerm() {
-    return true;
   }
 
   @Override
diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java b/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java
index 66577570143..ed4c43073ec 100644
--- a/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java
@@ -122,10 +122,6 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
           return true;
         }
         TermState state = termsEnum.termState();
-        if (state.isRealTerm() == false) {
-          // TermQuery does not accept fake terms for now
-          return false;
-        }
         terms.add(new TermAndState(BytesRef.deepCopyOf(term), state, termsEnum.docFreq(), termsEnum.totalTermFreq()));
       }
       return termsEnum.next() == null;
diff --git a/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java b/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java
index 3a62e1599f4..38345b30a04 100644
--- a/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java
+++ b/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java
@@ -115,7 +115,7 @@ public abstract class ScoringRewrite<B> extends TermCollectingRewrite<B> {
       for (int i = 0; i < size; i++) {
         final int pos = sort[i];
         final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef()));
-        assert termStates[pos].hasOnlyRealTerms() == false || reader.docFreq(term) == termStates[pos].docFreq();
+        assert reader.docFreq(term) == termStates[pos].docFreq();
         addClause(builder, term, termStates[pos].docFreq(), boost[pos], termStates[pos]);
       }
     }
diff --git a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java
index e815ff61a8a..6547b10439a 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java
@@ -53,8 +53,6 @@ public class TermQuery extends Query {
       super(TermQuery.this);
       this.needsScores = needsScores;
       assert termStates != null : "TermContext must not be null";
-      // checked with a real exception in TermQuery constructor
-      assert termStates.hasOnlyRealTerms();
       this.termStates = termStates;
       this.similarity = searcher.getSimilarity(needsScores);
@@ -166,12 +164,6 @@ public class TermQuery extends Query {
   public TermQuery(Term t, TermContext states) {
     assert states != null;
     term = Objects.requireNonNull(t);
-    if (states.hasOnlyRealTerms() == false) {
-      // The reason for this is that fake terms might have the same bytes as
-      // real terms, and this confuses query caching because they don't match
-      // the same documents
-      throw new IllegalArgumentException("Term queries must be created on real terms");
-    }
     perReaderTermState = Objects.requireNonNull(states);
   }
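
The net effect of the TermQuery change: building a query from a previously gathered TermContext can no longer throw, because every collected TermState is now a real term. A hedged sketch with a hypothetical reader and field/value:

    Term t = new Term("field", "value");
    TermContext states = TermContext.build(reader.getContext(), t);
    TermQuery query = new TermQuery(t, states);   // no IllegalArgumentException path remains

The same now-unreachable isRealTerm guard is dropped from Solr's range query below.
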
diff --git a/solr/core/src/java/org/apache/solr/query/SolrRangeQuery.java b/solr/core/src/java/org/apache/solr/query/SolrRangeQuery.java
index 132dbceff11..ee6340d9f7e 100644
--- a/solr/core/src/java/org/apache/solr/query/SolrRangeQuery.java
+++ b/solr/core/src/java/org/apache/solr/query/SolrRangeQuery.java
@@ -345,10 +345,6 @@ public final class SolrRangeQuery extends ExtendedQueryBase implements DocSetProducer {
         return -count - 1;
       }
       TermState state = termsEnum.termState();
-      if (state.isRealTerm() == false) {
-        // TermQuery does not accept fake terms for now
-        return count;
-      }
       int df = termsEnum.docFreq();
       count += df;
       terms.add(new TermAndState(BytesRef.deepCopyOf(term), state, df, termsEnum.totalTermFreq()));