LUCENE-7317: Remove auto-prefix terms.

This commit is contained in:
Adrien Grand 2016-06-09 18:44:25 +02:00
parent d3fcac779b
commit bac521d1aa
16 changed files with 31 additions and 1819 deletions

View File

@ -1,125 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.autoprefix;
import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsReader;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
/**
 * Just like {@link Lucene50PostingsFormat} except this format
 * exposes the experimental auto-prefix terms.
 *
 * <p>Postings are written with {@link Lucene50PostingsWriter} and read with
 * {@link Lucene50PostingsReader}. The terms dictionary is written with
 * {@link BlockTreeTermsWriter}, which receives the auto-prefix settings, and
 * read with {@link BlockTreeTermsReader}.
 *
 * @lucene.experimental
 */
public final class AutoPrefixPostingsFormat extends PostingsFormat {

  /** {@code minItemsInAutoPrefix} used by the no-arg constructor. */
  private static final int DEFAULT_MIN_ITEMS_IN_AUTO_PREFIX = 25;

  /** {@code maxItemsInAutoPrefix} used by the no-arg constructor. */
  private static final int DEFAULT_MAX_ITEMS_IN_AUTO_PREFIX = 48;

  private final int minItemsInBlock;
  private final int maxItemsInBlock;
  private final int minItemsInAutoPrefix;
  private final int maxItemsInAutoPrefix;

  /** Creates {@code AutoPrefixPostingsFormat} with default settings. */
  public AutoPrefixPostingsFormat() {
    this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
         BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE,
         DEFAULT_MIN_ITEMS_IN_AUTO_PREFIX,
         DEFAULT_MAX_ITEMS_IN_AUTO_PREFIX);
  }

  /** Creates {@code AutoPrefixPostingsFormat} with custom
   *  values for {@code minItemsInAutoPrefix} and {@code
   *  maxItemsInAutoPrefix}; the block sizes passed to the block tree
   *  terms dictionary are {@link BlockTreeTermsWriter#DEFAULT_MIN_BLOCK_SIZE}
   *  and {@link BlockTreeTermsWriter#DEFAULT_MAX_BLOCK_SIZE}.
   *  @see BlockTreeTermsWriter#BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int,int,int) */
  public AutoPrefixPostingsFormat(int minItemsInAutoPrefix, int maxItemsInAutoPrefix) {
    this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
         BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE,
         minItemsInAutoPrefix,
         maxItemsInAutoPrefix);
  }

  /** Creates {@code AutoPrefixPostingsFormat} with custom
   *  values for {@code minItemsInBlock}, {@code
   *  maxItemsInBlock}, {@code minItemsInAutoPrefix} and {@code maxItemsInAutoPrefix}, passed
   *  to block tree terms dictionary.
   *  <p>The block settings are checked with {@link BlockTreeTermsWriter#validateSettings}
   *  and the auto-prefix settings with {@link BlockTreeTermsWriter#validateAutoPrefixSettings}
   *  before anything is stored.
   *  @see BlockTreeTermsWriter#BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int,int,int) */
  public AutoPrefixPostingsFormat(int minItemsInBlock, int maxItemsInBlock, int minItemsInAutoPrefix, int maxItemsInAutoPrefix) {
    super("AutoPrefix");
    BlockTreeTermsWriter.validateSettings(minItemsInBlock,
                                          maxItemsInBlock);
    BlockTreeTermsWriter.validateAutoPrefixSettings(minItemsInAutoPrefix,
                                                    maxItemsInAutoPrefix);
    this.minItemsInBlock = minItemsInBlock;
    this.maxItemsInBlock = maxItemsInBlock;
    this.minItemsInAutoPrefix = minItemsInAutoPrefix;
    this.maxItemsInAutoPrefix = maxItemsInAutoPrefix;
  }

  /** Returns the name of this format, as reported by {@code getName()}. */
  @Override
  public String toString() {
    return getName();
  }

  /**
   * Creates the writer side: a {@link Lucene50PostingsWriter} wrapped by a
   * {@link BlockTreeTermsWriter} configured with this format's settings.
   */
  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(state);

    boolean success = false;
    try {
      FieldsConsumer termsWriter = new BlockTreeTermsWriter(state,
                                                            postingsWriter,
                                                            minItemsInBlock,
                                                            maxItemsInBlock,
                                                            minItemsInAutoPrefix,
                                                            maxItemsInAutoPrefix);
      success = true;
      return termsWriter;
    } finally {
      // If the terms writer could not be created, nobody owns the postings
      // writer yet, so close it here without masking the original exception.
      if (!success) {
        IOUtils.closeWhileHandlingException(postingsWriter);
      }
    }
  }

  /**
   * Creates the reader side: a {@link Lucene50PostingsReader} wrapped by a
   * {@link BlockTreeTermsReader}.
   */
  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase postingsReader = new Lucene50PostingsReader(state);

    boolean success = false;
    try {
      FieldsProducer termsReader = new BlockTreeTermsReader(postingsReader, state);
      success = true;
      return termsReader;
    } finally {
      // Same ownership rule as in fieldsConsumer: close the postings reader
      // only when the terms reader failed to take it over.
      if (!success) {
        IOUtils.closeWhileHandlingException(postingsReader);
      }
    }
  }
}

View File

@ -1,22 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* An experimental postings format that automatically indexes appropriate
* prefix terms for fast range and prefix queries.
*/
package org.apache.lucene.codecs.autoprefix;

View File

@ -19,4 +19,3 @@ org.apache.lucene.codecs.memory.DirectPostingsFormat
org.apache.lucene.codecs.memory.FSTOrdPostingsFormat
org.apache.lucene.codecs.memory.FSTPostingsFormat
org.apache.lucene.codecs.memory.MemoryPostingsFormat
org.apache.lucene.codecs.autoprefix.AutoPrefixPostingsFormat

View File

@ -1,38 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.autoprefix;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.RandomPostingsTester;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/**
 * Tests AutoPrefix's postings
 */
// NOTE: we don't extend BasePostingsFormatTestCase because we can only handle DOCS_ONLY fields:
public class TestAutoPrefixPostingsFormat extends LuceneTestCase {

  /**
   * Runs {@link RandomPostingsTester#testFull} against a codec that always uses
   * {@link AutoPrefixPostingsFormat}, with {@link IndexOptions#DOCS} as the index options.
   */
  public void test() throws Exception {
    RandomPostingsTester tester = new RandomPostingsTester(random());
    tester.testFull(
        TestUtil.alwaysPostingsFormat(new AutoPrefixPostingsFormat()),
        createTempDir("autoprefix"),
        IndexOptions.DOCS,
        false);
  }
}

View File

@ -1,941 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.autoprefix;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CompiledAutomaton;
public class TestAutoPrefixTerms extends LuceneTestCase {
// Randomized terms-dictionary settings. The initializers run in declaration order,
// so the sequence of random() draws below is significant for seed reproducibility.
private int minItemsPerBlock = TestUtil.nextInt(random(), 2, 100);
private int maxItemsPerBlock = 2 * Math.max(2, minItemsPerBlock - 1) + random().nextInt(100);
private int minTermsAutoPrefix = TestUtil.nextInt(random(), 2, 100);
private int maxTermsAutoPrefix =
    random().nextBoolean()
        ? Math.max(2, (minTermsAutoPrefix - 1) * 2 + random().nextInt(100))
        : Integer.MAX_VALUE;

// Codec that routes every field through AutoPrefixPostingsFormat with the settings above.
private final Codec codec =
    TestUtil.alwaysPostingsFormat(
        new AutoPrefixPostingsFormat(minItemsPerBlock, maxItemsPerBlock, minTermsAutoPrefix, maxTermsAutoPrefix));

/** Prints the randomly chosen settings when running verbosely. */
public TestAutoPrefixTerms() {
  if (!LuceneTestCase.VERBOSE) {
    return;
  }
  System.out.println("TEST: using minItemsPerBlock=" + minItemsPerBlock + " maxItemsPerBlock=" + maxItemsPerBlock);
  System.out.println("TEST: using minTermsAutoPrefix=" + minTermsAutoPrefix + " maxTermsAutoPrefix=" + maxTermsAutoPrefix);
}
// Numbers in a restricted range, encoded in decimal, left-0-padded:
/**
 * Indexes random zero-padded decimal terms, then runs random binary-interval
 * automata over them and checks, via {@link VerifyAutoPrefixTerms}, that every
 * matching document is seen exactly once and that the expected number of hits is found.
 */
public void testBasicNumericRanges() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
  iwc.setCodec(codec);
  IndexWriter w = new IndexWriter(dir, iwc);

  int numTerms = TestUtil.nextInt(random(), 3000, 50000);
  Set<String> terms = new HashSet<>();

  // Cap at 9 digits: maxValue is an int, and 10^10 would silently wrap around
  // (to 1410065408) instead of describing the full 10-digit range.
  int digits = TestUtil.nextInt(random(), 5, 9);
  int maxValue = 1;
  for (int i = 0; i < digits; i++) {
    maxValue *= 10;
  }
  String format = "%0" + digits + "d";

  while (terms.size() < numTerms) {
    terms.add(String.format(Locale.ROOT, format, random().nextInt(maxValue)));
  }

  // One document per term; the doc value carries the same number so hits can be checked.
  for (String term : terms) {
    Document doc = new Document();
    doc.add(new StringField("field", term, Field.Store.NO));
    doc.add(new NumericDocValuesField("field", Long.parseLong(term)));
    w.addDocument(doc);
  }

  if (VERBOSE) System.out.println("\nTEST: now optimize");
  if (random().nextBoolean()) {
    w.forceMerge(1);
  }
  if (VERBOSE) System.out.println("\nTEST: now done");

  IndexReader r = DirectoryReader.open(w);

  List<String> sortedTerms = new ArrayList<>(terms);
  Collections.sort(sortedTerms);

  if (VERBOSE) {
    System.out.println("TEST: sorted terms:");
    int idx = 0;
    for (String term : sortedTerms) {
      System.out.println(idx + ": " + term);
      idx++;
    }
  }

  int iters = atLeast(100);
  for (int iter = 0; iter < iters; iter++) {
    // Pick two distinct values and order them so that min < max.
    int min, max;
    while (true) {
      min = random().nextInt(maxValue);
      max = random().nextInt(maxValue);
      if (min == max) {
        continue;
      } else if (min > max) {
        int x = min;
        min = max;
        max = x;
      }
      break;
    }

    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + iter + " min=" + min + " max=" + max);
    }

    boolean minInclusive = random().nextBoolean();
    boolean maxInclusive = random().nextBoolean();
    BytesRef minTerm = new BytesRef(String.format(Locale.ROOT, format, min));
    BytesRef maxTerm = new BytesRef(String.format(Locale.ROOT, format, max));
    CompiledAutomaton ca = new CompiledAutomaton(Automata.makeBinaryInterval(minTerm, minInclusive, maxTerm, maxInclusive),
                                                 true, false, Integer.MAX_VALUE, true);

    TermsEnum te = ca.getTermsEnum(MultiFields.getTerms(r, "field"));
    NumericDocValues docValues = MultiDocValues.getNumericValues(r, "field");
    PostingsEnum postingsEnum = null;

    VerifyAutoPrefixTerms verifier = new VerifyAutoPrefixTerms(r.maxDoc(), minTerm, maxTerm);

    while (te.next() != null) {
      if (VERBOSE) {
        System.out.println("  got term=" + te.term().utf8ToString());
      }
      verifier.sawTerm(te.term());
      postingsEnum = te.postings(postingsEnum);
      int docID;
      while ((docID = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
        long v = docValues.get(docID);
        // Use a real test assertion so the check does not depend on -ea being set.
        assertTrue("docID=" + docID + " v=" + v, v >= min && v <= max);
        // The auto-prefix terms should never "overlap" one another, so we should only ever see a given docID one time:
        if (VERBOSE) {
          System.out.println("    got docID=" + docID + " v=" + v);
        }
        verifier.sawDoc(docID);
      }
    }

    // Locate the first and last indexed term inside the range to derive the expected hit count.
    int startLoc = Collections.binarySearch(sortedTerms, String.format(Locale.ROOT, format, min));
    if (startLoc < 0) {
      startLoc = -startLoc-1;
    } else if (minInclusive == false) {
      startLoc++;
    }
    int endLoc = Collections.binarySearch(sortedTerms, String.format(Locale.ROOT, format, max));
    if (endLoc < 0) {
      endLoc = -endLoc-2;
    } else if (maxInclusive == false) {
      endLoc--;
    }
    verifier.finish(endLoc-startLoc+1, maxTermsAutoPrefix);
  }

  r.close();
  w.close();
  dir.close();
}
/**
 * Encodes {@code v} as 4 bytes, most significant byte first, after flipping
 * the sign bit.
 */
private static BytesRef intToBytes(int v) {
  final int flipped = v ^ 0x80000000;
  BytesRef encoded = new BytesRef(4);
  encoded.length = 4;
  for (int pos = 0; pos < 4; pos++) {
    // Position 0 receives the top byte, position 3 the lowest one.
    encoded.bytes[pos] = (byte) (flipped >>> (8 * (3 - pos)));
  }
  return encoded;
}
// Numbers are encoded in full binary (4 byte ints):
/**
 * Indexes random ints as 4-byte binary terms, then intersects random binary
 * intervals with the terms and verifies hits through {@link VerifyAutoPrefixTerms}.
 * On a verification failure, the documents that should have matched but were
 * never visited are printed before the error is rethrown.
 */
public void testBinaryNumericRanges() throws Exception {
  if (VERBOSE) {
    System.out.println("TEST: minItemsPerBlock=" + minItemsPerBlock);
    System.out.println("TEST: maxItemsPerBlock=" + maxItemsPerBlock);
    System.out.println("TEST: minTermsAutoPrefix=" + minTermsAutoPrefix);
    System.out.println("TEST: maxTermsAutoPrefix=" + maxTermsAutoPrefix);
  }

  Directory directory = newDirectory();
  IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random()));
  config.setCodec(codec);
  IndexWriter writer = new IndexWriter(directory, config);

  final int numTerms = TestUtil.nextInt(random(), 3000, 50000);
  Set<Integer> values = new HashSet<>();
  while (values.size() < numTerms) {
    values.add(random().nextInt());
  }

  for (Integer value : values) {
    Document document = new Document();
    document.add(newStringField("field", intToBytes(value), Field.Store.NO));
    document.add(new NumericDocValuesField("field", value));
    writer.addDocument(document);
  }

  if (random().nextBoolean()) {
    if (VERBOSE) System.out.println("TEST: now force merge");
    writer.forceMerge(1);
  }

  IndexReader reader = DirectoryReader.open(writer);

  List<Integer> sortedValues = new ArrayList<>(values);
  Collections.sort(sortedValues);

  if (VERBOSE) {
    System.out.println("TEST: sorted terms:");
    for (int idx = 0; idx < sortedValues.size(); idx++) {
      System.out.println(idx + ": " + sortedValues.get(idx));
    }
  }

  final int iters = atLeast(100);
  for (int iter = 0; iter < iters; iter++) {

    // Draw pairs until the two values differ, then order them.
    int min, max;
    do {
      min = random().nextInt();
      max = random().nextInt();
    } while (min == max);
    if (min > max) {
      int swap = min;
      min = max;
      max = swap;
    }

    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + iter + " min=" + min + " (" + intToBytes(min) + ") max=" + max + " (" + intToBytes(max) + ")");
    }

    boolean minInclusive = random().nextBoolean();
    boolean maxInclusive = random().nextBoolean();
    BytesRef minTerm = intToBytes(min);
    BytesRef maxTerm = intToBytes(max);

    CompiledAutomaton automaton =
        new CompiledAutomaton(Automata.makeBinaryInterval(minTerm, minInclusive, maxTerm, maxInclusive),
                              true, false, Integer.MAX_VALUE, true);

    TermsEnum termsEnum = automaton.getTermsEnum(MultiFields.getTerms(reader, "field"));
    NumericDocValues docValues = MultiDocValues.getNumericValues(reader, "field");
    VerifyAutoPrefixTerms verifier = new VerifyAutoPrefixTerms(reader.maxDoc(), minTerm, maxTerm);

    PostingsEnum postings = null;
    while (termsEnum.next() != null) {
      if (VERBOSE) {
        System.out.println("  got term=" + termsEnum.term() + " docFreq=" + termsEnum.docFreq());
      }
      verifier.sawTerm(termsEnum.term());
      postings = termsEnum.postings(postings);
      for (int docID = postings.nextDoc(); docID != PostingsEnum.NO_MORE_DOCS; docID = postings.nextDoc()) {
        long v = docValues.get(docID);
        assert v >= min && v <= max: "docID=" + docID + " v=" + v;
        verifier.sawDoc(docID);
      }
    }

    // Index of the first in-range value in the sorted list.
    int startLoc = Collections.binarySearch(sortedValues, min);
    if (startLoc < 0) {
      startLoc = -startLoc - 1;
    } else if (!minInclusive) {
      startLoc++;
    }

    // Index of the last in-range value in the sorted list.
    int endLoc = Collections.binarySearch(sortedValues, max);
    if (endLoc < 0) {
      endLoc = -endLoc - 2;
    } else if (!maxInclusive) {
      endLoc--;
    }

    final int expectedHits = endLoc - startLoc + 1;
    try {
      verifier.finish(expectedHits, maxTermsAutoPrefix);
    } catch (AssertionError ae) {
      // Diagnostics: report every unvisited document whose value lies inside the range.
      for (int docID = 0; docID < numTerms; docID++) {
        if (verifier.allHits.get(docID)) {
          continue;
        }
        int v = (int) docValues.get(docID);
        boolean aboveMin = v > min || (v == min && minInclusive);
        boolean belowMax = v < max || (v == max && maxInclusive);
        if (aboveMin && belowMax) {
          System.out.println("MISSING: docID=" + docID + " v=" + v + " term=" + intToBytes(v));
        }
      }
      throw ae;
    }
  }

  reader.close();
  writer.close();
  directory.close();
}
// Non-numeric, simple prefix query
/**
 * Indexes random simple strings, then intersects random prefix automata with
 * the terms and verifies hits through {@link VerifyAutoPrefixTerms}. On a
 * verification failure, unvisited documents whose value starts with the prefix
 * are printed before the error is rethrown.
 */
public void testBasicPrefixTerms() throws Exception {
  Directory directory = newDirectory();
  IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random()));
  config.setCodec(codec);
  config.setMergeScheduler(new SerialMergeScheduler());
  IndexWriter writer = new IndexWriter(directory, config);

  final int numTerms = TestUtil.nextInt(random(), 3000, 50000);
  Set<String> uniqueTerms = new HashSet<>();
  while (uniqueTerms.size() < numTerms) {
    uniqueTerms.add(TestUtil.randomSimpleString(random()));
  }

  for (String value : uniqueTerms) {
    Document document = new Document();
    document.add(new StringField("field", value, Field.Store.NO));
    document.add(new BinaryDocValuesField("field", new BytesRef(value)));
    writer.addDocument(document);
  }

  if (random().nextBoolean()) {
    if (VERBOSE) {
      System.out.println("TEST: now force merge");
    }
    writer.forceMerge(1);
  }

  IndexReader reader = DirectoryReader.open(writer);

  List<String> sortedTerms = new ArrayList<>(uniqueTerms);
  Collections.sort(sortedTerms);

  if (VERBOSE) {
    System.out.println("TEST: sorted terms:");
    for (int idx = 0; idx < sortedTerms.size(); idx++) {
      System.out.println(idx + ": " + sortedTerms.get(idx));
    }
    System.out.println("TEST: r=" + reader);
  }

  final int iters = atLeast(100);
  for (int iter = 0; iter < iters; iter++) {
    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + iter);
    }

    // Occasionally exercise the empty prefix.
    String prefix = random().nextInt(100) == 42 ? "" : TestUtil.randomSimpleString(random(), 1, 4);
    BytesRef prefixBR = new BytesRef(prefix);
    if (VERBOSE) {
      System.out.println("  prefix=" + prefix);
    }

    CompiledAutomaton automaton =
        new CompiledAutomaton(PrefixQuery.toAutomaton(prefixBR), true, false, Integer.MAX_VALUE, true);
    TermsEnum termsEnum = automaton.getTermsEnum(MultiFields.getTerms(reader, "field"));
    BinaryDocValues docValues = MultiDocValues.getBinaryValues(reader, "field");
    VerifyAutoPrefixTerms verifier = new VerifyAutoPrefixTerms(reader.maxDoc(), prefixBR);

    PostingsEnum postings = null;
    while (termsEnum.next() != null) {
      if (VERBOSE) {
        System.out.println("TEST: got term=" + termsEnum.term().utf8ToString() + " docFreq=" + termsEnum.docFreq());
      }
      verifier.sawTerm(termsEnum.term());
      postings = termsEnum.postings(postings);
      for (int docID = postings.nextDoc(); docID != PostingsEnum.NO_MORE_DOCS; docID = postings.nextDoc()) {
        assertTrue("prefixBR=" + prefixBR + " docBR=" + docValues.get(docID), StringHelper.startsWith(docValues.get(docID), prefixBR));
        // The auto-prefix terms should never "overlap" one another, so we should only ever see a given docID one time:
        verifier.sawDoc(docID);
      }
    }

    // First sorted term that is >= the prefix.
    int startLoc = Collections.binarySearch(sortedTerms, prefix);
    if (startLoc < 0) {
      startLoc = -startLoc - 1;
    }

    // Last sorted term below prefix + the character right after 'z'.
    int endLoc = Collections.binarySearch(sortedTerms, prefix + (char) ('z'+1));
    if (endLoc < 0) {
      endLoc = -endLoc - 2;
    }

    final int expectedHits = endLoc - startLoc + 1;
    try {
      verifier.finish(expectedHits, maxTermsAutoPrefix);
    } catch (AssertionError ae) {
      // Diagnostics: report every unvisited document whose value starts with the prefix.
      for (int docID = 0; docID < numTerms; docID++) {
        if (verifier.allHits.get(docID)) {
          continue;
        }
        String s = docValues.get(docID).utf8ToString();
        if (s.startsWith(prefix)) {
          System.out.println("MISSING: docID=" + docID + " term=" + s);
        }
      }
      throw ae;
    }
  }

  reader.close();
  writer.close();
  directory.close();
}
/**
 * Small fixed scenario: 30 single-character terms starting at 'a' plus 30
 * two-character terms "a?" are indexed, then the prefix "a" is intersected
 * with the terms and also searched with a {@link PrefixQuery}; both must
 * account for 31 documents.
 */
public void testDemoPrefixTerms() throws Exception {
  if (VERBOSE) {
    System.out.println("\nTEST: minTermsAutoPrefix=" + minTermsAutoPrefix + " maxTermsAutoPrefix=" + maxTermsAutoPrefix);
    System.out.println("\nTEST: minItemsPerBlock=" + minItemsPerBlock + " maxItemsPerBlock=" + maxItemsPerBlock);
  }

  Directory directory = newDirectory();
  IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random()));
  config.setCodec(codec);
  IndexWriter writer = new IndexWriter(directory, config);

  final int numDocs = 30;
  for (int i = 0; i < numDocs; i++) {
    final char c = (char) ('a' + i);

    Document single = new Document();
    single.add(new StringField("field", String.valueOf(c), Field.Store.NO));
    writer.addDocument(single);

    Document prefixed = new Document();
    prefixed.add(new StringField("field", "a" + c, Field.Store.NO));
    writer.addDocument(prefixed);
  }

  if (random().nextBoolean()) {
    writer.forceMerge(1);
  }

  IndexReader reader = DirectoryReader.open(writer);
  Terms terms = MultiFields.getTerms(reader, "field");
  if (VERBOSE) {
    System.out.println("\nTEST: now intersect");
  }

  CompiledAutomaton automaton =
      new CompiledAutomaton(PrefixQuery.toAutomaton(new BytesRef("a")), false, false, Integer.MAX_VALUE, true);
  TermsEnum termsEnum = automaton.getTermsEnum(terms);
  VerifyAutoPrefixTerms verifier = new VerifyAutoPrefixTerms(reader.maxDoc(), new BytesRef("a"));

  PostingsEnum postings = null;
  while (termsEnum.next() != null) {
    verifier.sawTerm(termsEnum.term());
    postings = termsEnum.postings(postings);
    for (int docID = postings.nextDoc(); docID != PostingsEnum.NO_MORE_DOCS; docID = postings.nextDoc()) {
      // The auto-prefix terms should never "overlap" one another, so we should only ever see a given docID one time:
      verifier.sawDoc(docID);
    }
  }

  // 1 document has exactly "a", and 30 documents had "a?"
  verifier.finish(31, maxTermsAutoPrefix);

  PrefixQuery query = new PrefixQuery(new Term("field", "a"));
  assertEquals(31, newSearcher(reader).search(query, 1).totalHits);

  reader.close();
  writer.close();
  directory.close();
}
/**
 * Helper class to ensure auto-prefix terms 1) never overlap one another, and 2) are used when they should be.
 *
 * <p>Usage: call {@link #sawTerm} for every term returned by the enum (in order),
 * {@link #sawDoc} for every document of every term, then {@link #finish} once.
 */
private static class VerifyAutoPrefixTerms {
  /** One bit per document; set the first time the document is visited. */
  final FixedBitSet allHits;
  /** For every prefix of every seen term, the number of seen terms that start with it. */
  private final Map<BytesRef,Integer> prefixCounts = new HashMap<>();
  /** Sum of the lengths of all seen terms; must equal the sum of all prefix counts. */
  private int totPrefixCount;
  /** Either a single prefix, or the min and max term of a range. */
  private final BytesRef[] bounds;
  private int totTermCount;
  private BytesRef lastTerm;

  /**
   * @param maxDoc size of the hit bitset
   * @param bounds one element for a prefix query, two (min, max) for a range
   */
  public VerifyAutoPrefixTerms(int maxDoc, BytesRef... bounds) {
    allHits = new FixedBitSet(maxDoc);
    assert bounds.length > 0;
    this.bounds = bounds;
  }

  /** Records a term; asserts that terms arrive in strictly increasing order. */
  public void sawTerm(BytesRef term) {
    //System.out.println("saw term=" + term);
    if (lastTerm != null) {
      assertTrue(lastTerm.compareTo(term) < 0);
    }
    lastTerm = BytesRef.deepCopyOf(term);
    totTermCount++;
    totPrefixCount += term.length;
    // Count this term once under each of its non-empty prefixes.
    for (int i = 1; i <= term.length; i++) {
      BytesRef prefix = BytesRef.deepCopyOf(term);
      prefix.length = i;
      prefixCounts.merge(prefix, 1, Integer::sum);
    }
  }

  /** Records a document hit; asserts the document was not seen before. */
  public void sawDoc(int docID) {
    // The auto-prefix terms should never "overlap" one another, so we should only ever see a given docID one time:
    assertFalse(allHits.getAndSet(docID));
  }

  /**
   * Checks the totals collected so far.
   *
   * @param expectedNumHits number of distinct documents that must have been seen
   * @param maxPrefixCount the max-items-in-auto-prefix setting, or -1 to skip the term-count bounds
   */
  public void finish(int expectedNumHits, int maxPrefixCount) {

    if (maxPrefixCount != -1) {
      // Auto-prefix terms were used in this test
      long allowedMaxTerms;

      if (bounds.length == 1) {
        // Simple prefix query: we should never see more than maxPrefixCount terms, except for the empty string:
        if (bounds[0].length == 0) {
          allowedMaxTerms = Integer.MAX_VALUE;
        } else {
          allowedMaxTerms = maxPrefixCount;
        }
      } else {
        // Trickier: we need to allow for maxPrefixTerms for each different leading byte in the min and max:
        assert bounds.length == 2;
        BytesRef minTerm = bounds[0];
        BytesRef maxTerm = bounds[1];

        // Start from the full shared length so that "no mismatch found" (one bound is a
        // prefix of the other) is not mistaken for "nothing in common"; this mirrors the
        // per-prefix loop further down.
        int sharedLength = Math.min(minTerm.length, maxTerm.length);
        int commonPrefix = sharedLength;
        for (int i = 0; i < sharedLength; i++) {
          if (minTerm.bytes[minTerm.offset+i] != maxTerm.bytes[maxTerm.offset+i]) {
            commonPrefix = i;
            break;
          }
        }

        allowedMaxTerms = maxPrefixCount * (long) ((minTerm.length-commonPrefix) + (maxTerm.length-commonPrefix));
        if (commonPrefix == 0) {
          int min;
          if (minTerm.length == 0) {
            min = 0;
          } else {
            min = minTerm.bytes[minTerm.offset] & 0xff;
          }
          int max;
          if (maxTerm.length == 0) {
            max = 0;
          } else {
            max = maxTerm.bytes[maxTerm.offset] & 0xff;
          }
          if (max > min) {
            // When maxPrefixCount is small (< 16), each byte of the term can require more than one "level" of auto-prefixing:
            // NOTE: this is still only approximate ... it's tricky to get a closed form max bound that's "tight"
            allowedMaxTerms += MathUtil.log(max-min, maxPrefixCount);
          }
        }
      }

      assertTrue("totTermCount=" + totTermCount + " is > allowedMaxTerms=" + allowedMaxTerms, totTermCount <= allowedMaxTerms);
    }

    assertEquals(expectedNumHits, allHits.cardinality());

    int sum = 0;
    for (Map.Entry<BytesRef,Integer> ent : prefixCounts.entrySet()) {

      BytesRef prefix = ent.getKey();
      if (VERBOSE) {
        System.out.println("  verify prefix=" + TestUtil.bytesRefToString(prefix) + " count=" + ent.getValue());
      }

      if (maxPrefixCount != -1) {
        // Auto-prefix terms were used in this test

        // For each bound, count the bytes of the bound that lie beyond what it shares with this prefix.
        int sumLeftoverSuffix = 0;
        for (BytesRef bound : bounds) {

          int minSharedLength = Math.min(bound.length, prefix.length);
          int commonPrefix = minSharedLength;
          for (int i = 0; i < minSharedLength; i++) {
            if (bound.bytes[bound.offset+i] != prefix.bytes[prefix.offset+i]) {
              commonPrefix = i;
              break;
            }
          }
          sumLeftoverSuffix += bound.length - commonPrefix;
        }

        long limit = (1+sumLeftoverSuffix) * (long) maxPrefixCount;

        assertTrue("maxPrefixCount=" + maxPrefixCount + " prefix=" + prefix + " sumLeftoverSuffix=" + sumLeftoverSuffix + " limit=" + limit + " vs actual=" +ent.getValue(),
                   ent.getValue() <= limit);
      }

      sum += ent.getValue();
    }

    // Make sure no test bug:
    assertEquals(totPrefixCount, sum);
  }
}
/**
 * Indexing a {@code DOCS_AND_FREQS} field with the auto-prefix codec must fail
 * at commit time with a descriptive {@link IllegalStateException}.
 */
public void testWithFreqs() throws Exception {
  Directory directory = newDirectory();
  IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random()));
  config.setCodec(codec);
  IndexWriter w = new IndexWriter(directory, config);

  FieldType withFreqs = new FieldType(StringField.TYPE_NOT_STORED);
  withFreqs.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

  Document document = new Document();
  document.add(new Field("foo", "bar bar", withFreqs));
  w.addDocument(document);

  IllegalStateException failure = expectThrows(IllegalStateException.class, w::commit);
  assertEquals("ranges can only be indexed with IndexOptions.DOCS (field: foo)", failure.getMessage());

  w.close();
  directory.close();
}
/**
 * Indexing a {@code DOCS_AND_FREQS_AND_POSITIONS} field with the auto-prefix
 * codec must fail at commit time with a descriptive {@link IllegalStateException}.
 */
public void testWithPositions() throws Exception {
  Directory directory = newDirectory();
  IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random()));
  config.setCodec(codec);
  IndexWriter w = new IndexWriter(directory, config);

  FieldType withPositions = new FieldType(StringField.TYPE_NOT_STORED);
  withPositions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);

  Document document = new Document();
  document.add(new Field("foo", "bar bar", withPositions));
  w.addDocument(document);

  IllegalStateException failure = expectThrows(IllegalStateException.class, w::commit);
  assertEquals("ranges can only be indexed with IndexOptions.DOCS (field: foo)", failure.getMessage());

  w.close();
  directory.close();
}
/**
 * Indexing a {@code DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} field with the
 * auto-prefix codec must fail at commit time with a descriptive
 * {@link IllegalStateException}.
 */
public void testWithOffsets() throws Exception {
  Directory directory = newDirectory();
  IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random()));
  config.setCodec(codec);
  IndexWriter w = new IndexWriter(directory, config);

  FieldType withOffsets = new FieldType(StringField.TYPE_NOT_STORED);
  withOffsets.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

  Document document = new Document();
  document.add(new Field("foo", "bar bar", withOffsets));
  w.addDocument(document);

  IllegalStateException failure = expectThrows(IllegalStateException.class, w::commit);
  assertEquals("ranges can only be indexed with IndexOptions.DOCS (field: foo)", failure.getMessage());

  w.close();
  directory.close();
}
/**
 * Make sure auto prefix terms are used with TermRangeQuery.
 *
 * <p>Generates random terms that all start with one to four 'a' characters,
 * picks a random range aa&lt;start&gt; - aa&lt;end&gt; matching more than 2000
 * of them, indexes everything with a freshly randomized
 * {@code AutoPrefixPostingsFormat}, and then checks that (1) enumerating the
 * range query's terms visits fewer terms than there are matching documents and
 * (2) searching still returns every matching document. Finally, single-term
 * ranges (min == max, both inclusive) must each match exactly one document.
 */
public void testTermRange() throws Exception {
// Build the prefixes "a", "aa", "aaa" and "aaaa":
List<String> prefixes = new ArrayList<>();
for(int i=1;i<5;i++) {
char[] chars = new char[i];
Arrays.fill(chars, 'a');
prefixes.add(new String(chars));
}
// Each pass tries to add one random term per prefix, until at least numTerms distinct terms exist:
Set<String> randomTerms = new HashSet<>();
int numTerms = atLeast(10000);
while (randomTerms.size() < numTerms) {
for(String prefix : prefixes) {
randomTerms.add(prefix + TestUtil.randomSimpleString(random()));
}
}
// We make term range aa<start> - aa<end>
char start;
char end;
int actualCount;
// The inclusive flags are drawn once, before the retry loop below:
boolean startInclusive = random().nextBoolean();
boolean endInclusive = random().nextBoolean();
String startTerm;
String endTerm;
// Retry random start/end characters until the range covers more than 2000 of the generated terms:
while (true) {
start = (char) TestUtil.nextInt(random(), 'a', 'm');
end = (char) TestUtil.nextInt(random(), start+1, 'z');
actualCount = 0;
startTerm = "aa" + start;
endTerm = "aa" + end;
// Count the terms inside the range, honoring the inclusive flags at both ends:
for(String term : randomTerms) {
int cmpStart = startTerm.compareTo(term);
int cmpEnd = endTerm.compareTo(term);
if ((cmpStart < 0 || (startInclusive && cmpStart == 0)) &&
(cmpEnd > 0 || (endInclusive && cmpEnd == 0))) {
actualCount++;
}
}
if (actualCount > 2000) {
break;
}
}
if (VERBOSE) {
System.out.println("start " + startTerm + " inclusive? " + startInclusive);
System.out.println("end " + endTerm + " inclusive? " + endInclusive);
System.out.println("actual count " + actualCount);
}
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
// This test builds its own randomized postings format instead of using the codec field.
// NOTE: the two *AutoPrefix locals below shadow the same-named instance fields.
int minTermsInBlock = TestUtil.nextInt(random(), 2, 100);
int maxTermsInBlock = Math.max(2, (minTermsInBlock-1)*2 + random().nextInt(100));
int minTermsAutoPrefix = TestUtil.nextInt(random(), 2, 100);
int maxTermsAutoPrefix = random().nextBoolean() ? Math.max(2, (minTermsAutoPrefix-1)*2 + random().nextInt(100)) : Integer.MAX_VALUE;
if (VERBOSE) {
System.out.println("minTermsAutoPrefix " + minTermsAutoPrefix);
System.out.println("maxTermsAutoPrefix " + maxTermsAutoPrefix);
}
iwc.setCodec(TestUtil.alwaysPostingsFormat(new AutoPrefixPostingsFormat(minTermsInBlock, maxTermsInBlock,
minTermsAutoPrefix, maxTermsAutoPrefix)));
RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
if (VERBOSE) {
System.out.println("TEST: index terms");
}
// One document per generated term:
for (String term : randomTerms) {
Document doc = new Document();
doc.add(new StringField("field", term, Field.Store.NO));
w.addDocument(doc);
if (VERBOSE) {
System.out.println(" " + term);
}
}
if (VERBOSE) {
System.out.println("TEST: now force merge");
}
w.forceMerge(1);
IndexReader r = w.getReader();
final Terms terms = MultiFields.getTerms(r, "field");
IndexSearcher s = new IndexSearcher(r);
// Final copy so the anonymous subclass below can read the count:
final int finalActualCount = actualCount;
if (VERBOSE) {
System.out.println("start=" + startTerm + " end=" + endTerm + " startIncl=" + startInclusive + " endIncl=" + endInclusive);
}
// Anonymous subclass so the test can call getTermsEnum itself; checkTerms() runs
// immediately and returns the query for the search below.
TermRangeQuery q = new TermRangeQuery("field", new BytesRef(startTerm), new BytesRef(endTerm), startInclusive, endInclusive) {
public TermRangeQuery checkTerms() throws IOException {
TermsEnum termsEnum = getTermsEnum(terms, new AttributeSource());
// Count how many terms the query's enum actually visits:
int count = 0;
while (termsEnum.next() != null) {
if (VERBOSE) {
System.out.println("got term: " + termsEnum.term().utf8ToString());
}
count++;
}
if (VERBOSE) {
System.out.println("count " + count + " vs finalActualCount=" + finalActualCount);
}
// Auto-prefix term(s) should have kicked in, so we should have visited fewer than the total number of aa* terms:
assertTrue(count < finalActualCount);
return this;
}
}.checkTerms();
// Remember the static max clause count so the finally block can restore it. With the
// rewrite-method code below commented out, nothing in this method currently changes it.
int maxClauseCount = BooleanQuery.getMaxClauseCount();
try {
// TODO test with boolean rewrite as well once we can create term
// queries on fake terms
/*if (random().nextBoolean()) {
q.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_REWRITE);
BooleanQuery.setMaxClauseCount(actualCount);
} else if (random().nextBoolean()) {
q.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE);
BooleanQuery.setMaxClauseCount(actualCount);
}*/
if (VERBOSE) {
System.out.println("TEST: use rewrite method " + q.getRewriteMethod());
}
// The search must still find every document inside the range:
assertEquals(actualCount, s.search(q, 1).totalHits);
} finally {
BooleanQuery.setMaxClauseCount(maxClauseCount);
}
// Test when min == max:
// (each randomly picked indexed term, used as both inclusive bounds, must match exactly one document)
List<String> randomTermsList = new ArrayList<>(randomTerms);
for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
String term = randomTermsList.get(random().nextInt(randomTermsList.size()));
q = new TermRangeQuery("field", new BytesRef(term), new BytesRef(term), true, true);
assertEquals(1, s.search(q, 1).totalHits);
}
r.close();
w.close();
dir.close();
}
/** Checks that a {@code PrefixQuery} for "aa" enumerates fewer terms than really start with "aa"
 *  (meaning auto-prefix terms were substituted for some of them) while the search itself still
 *  matches every document whose term starts with "aa". */
public void testPrefixQuery() throws Exception {
  // Prefixes "a", "aa", "aaa" and "aaaa":
  List<String> prefixes = new ArrayList<>();
  StringBuilder run = new StringBuilder();
  for (int len = 1; len < 5; len++) {
    run.append('a');
    prefixes.add(run.toString());
  }

  // Keep generating prefixed random terms until we have enough distinct ones:
  Set<String> randomTerms = new HashSet<>();
  final int numTerms = atLeast(10000);
  while (randomTerms.size() < numTerms) {
    for (String prefix : prefixes) {
      randomTerms.add(prefix + TestUtil.randomRealisticUnicodeString(random()));
    }
  }

  // Ground truth: how many distinct terms start with "aa"?
  int actualCount = 0;
  for (String candidate : randomTerms) {
    actualCount += candidate.startsWith("aa") ? 1 : 0;
  }

  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));

  final int minTermsInBlock = TestUtil.nextInt(random(), 2, 100);
  final int maxTermsInBlock = Math.max(2, (minTermsInBlock-1)*2 + random().nextInt(100));

  // As long as this is never > actualCount, aa should always see at least one auto-prefix term:
  final int minTermsAutoPrefix = TestUtil.nextInt(random(), 2, actualCount);
  final int maxTermsAutoPrefix;
  if (random().nextBoolean()) {
    maxTermsAutoPrefix = Math.max(2, (minTermsAutoPrefix-1)*2 + random().nextInt(100));
  } else {
    maxTermsAutoPrefix = Integer.MAX_VALUE;
  }

  AutoPrefixPostingsFormat format =
      new AutoPrefixPostingsFormat(minTermsInBlock, maxTermsInBlock, minTermsAutoPrefix, maxTermsAutoPrefix);
  iwc.setCodec(TestUtil.alwaysPostingsFormat(format));
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);

  // One document per term:
  for (String indexed : randomTerms) {
    Document doc = new Document();
    doc.add(new StringField("field", indexed, Field.Store.NO));
    w.addDocument(doc);
  }
  w.forceMerge(1);

  IndexReader r = w.getReader();
  final Terms terms = MultiFields.getTerms(r, "field");
  IndexSearcher s = new IndexSearcher(r);

  final int finalActualCount = actualCount;
  PrefixQuery q = new PrefixQuery(new Term("field", "aa")) {
    // Walks the enum this query would use and checks it visits fewer terms than truly match.
    public PrefixQuery checkTerms() throws IOException {
      TermsEnum matching = getTermsEnum(terms, new AttributeSource());
      int visited = 0;
      while (matching.next() != null) {
        visited++;
      }
      // Auto-prefix term(s) should have kicked in, so we should have visited fewer than the total number of aa* terms:
      assertTrue(visited < finalActualCount);
      return this;
    }
  }.checkTerms();

  // Remember the global clause limit so it is restored no matter how the search ends:
  final int savedMaxClauseCount = BooleanQuery.getMaxClauseCount();
  try {
    // TODO test with boolean rewrite as well once we can create term
    // queries on fake terms
    /*BooleanQuery.setMaxClauseCount(randomTerms.size());
    if (random().nextBoolean()) {
      q.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_REWRITE);
    } else if (random().nextBoolean()) {
      q.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE);
    }*/
    assertEquals(actualCount, s.search(q, 1).totalHits);
  } finally {
    BooleanQuery.setMaxClauseCount(savedMaxClauseCount);
  }

  r.close();
  w.close();
  dir.close();
}
}

View File

@ -16,7 +16,6 @@
*/
package org.apache.lucene.codecs;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsReader; // javadocs
import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.TermState;
@ -39,11 +38,6 @@ public class BlockTermState extends OrdTermState {
// TODO: update BTR to nuke this
public long blockFilePointer;
/** True if this term is "real" (e.g., not an auto-prefix term or
* some other "secret" term; currently only {@link BlockTreeTermsReader}
* sets this). */
public boolean isRealTerm = true;
/** Sole constructor. (For invocation by subclass
* constructors, typically implicit.) */
protected BlockTermState() {
@ -58,16 +52,10 @@ public class BlockTermState extends OrdTermState {
totalTermFreq = other.totalTermFreq;
termBlockOrd = other.termBlockOrd;
blockFilePointer = other.blockFilePointer;
isRealTerm = other.isRealTerm;
}
/** Returns the current value of the {@code isRealTerm} field of this state. */
@Override
public boolean isRealTerm() {
return isRealTerm;
}
@Override
public String toString() {
return "docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " termBlockOrd=" + termBlockOrd + " blockFP=" + blockFilePointer + " isRealTerm=" + isRealTerm;
return "docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " termBlockOrd=" + termBlockOrd + " blockFP=" + blockFilePointer;
}
}

View File

@ -1,438 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.blocktree;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.StringHelper;
// TODO: instead of inlining auto-prefix terms with normal terms,
// we could write them into their own virtual/private field. This
// would make search time a bit more complex, since we'd need to
// merge sort between two TermsEnums, but it would also make the stats
// API (used by CheckIndex -verbose) easier to implement, since we could
// just walk this virtual field and gather its stats.
/** Used in the first pass when writing a segment to locate
* "appropriate" auto-prefix terms to pre-compile into the index.
* This visits every term in the index to find prefixes that
* match {@code >= min} and {@code <= max} number of terms. */
class AutoPrefixTermsWriter {
//static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
//static boolean DEBUG = false;
//static boolean DEBUG2 = BlockTreeTermsWriter.DEBUG2;
//static boolean DEBUG2 = true;
/** One slice of term-space covered by an auto-prefix term: either a plain prefix
 *  (foo*) or, when too many terms start with foo*, a single floor range of that
 *  prefix (e.g. foo[a-m]*, foo[n-z]*). */
public static final class PrefixTerm implements Comparable<PrefixTerm> {

  /** Common prefix */
  public final byte[] prefix;

  /** -2 for a normal prefix (foo*); otherwise the minimum lead byte of the suffix (e.g. 'd' in foo[d-m]*). */
  public final int floorLeadStart;

  /** Inclusive maximum lead byte of the suffix for the range we match (e.g. 'm' in foo[d-m]*);
   *  ignored when floorLeadStart is -2. */
  public final int floorLeadEnd;

  /** The prefix bytes, followed by the floorLeadStart byte when this is a floor range. */
  public final BytesRef term;

  /** Sole constructor. */
  public PrefixTerm(byte[] prefix, int floorLeadStart, int floorLeadEnd) {
    this.prefix = prefix;
    this.floorLeadStart = floorLeadStart;
    this.floorLeadEnd = floorLeadEnd;
    this.term = toBytesRef(prefix, floorLeadStart);

    assert floorLeadEnd >= floorLeadStart;
    assert floorLeadEnd >= 0;
    assert floorLeadStart == -2 || floorLeadStart >= 0;

    // We should never create empty-string prefix term:
    assert prefix.length > 0 || floorLeadStart != -2 || floorLeadEnd != 0xff;
  }

  @Override
  public String toString() {
    // A normal prefix has no lower bound to print, giving "[-ff]" style output:
    final String lower = floorLeadStart == -2 ? "" : Integer.toHexString(floorLeadStart);
    return brToString(new BytesRef(prefix)) + "[" + lower + "-" + Integer.toHexString(floorLeadEnd) + "]";
  }

  @Override
  public int compareTo(PrefixTerm other) {
    final int byTerm = term.compareTo(other.term);
    if (byTerm != 0) {
      return byTerm;
    }

    final int byPrefixLength = prefix.length - other.prefix.length;
    if (byPrefixLength != 0) {
      return byPrefixLength;
    }

    // On tie, sort the bigger floorLeadEnd, earlier, since it
    // spans more terms, so during intersect, we want to encounter this one
    // first so we can use it if the automaton accepts the larger range:
    return other.floorLeadEnd - floorLeadEnd;
  }

  /** Builds the leading term for this prefix term, e.g. "foo" (for
   *  the foo* prefix) or "foom" (for the foo[m-z]* case). */
  private static BytesRef toBytesRef(byte[] prefix, int floorLeadStart) {
    final boolean isFloor = floorLeadStart != -2;
    assert !isFloor || floorLeadStart >= 0;

    // A floor range needs one extra byte for its starting lead label:
    BytesRef leading = new BytesRef(isFloor ? prefix.length+1 : prefix.length);
    System.arraycopy(prefix, 0, leading.bytes, 0, prefix.length);
    leading.length = prefix.length;
    if (isFloor) {
      leading.bytes[leading.length++] = (byte) floorLeadStart;
    }
    return leading;
  }

  /** Compares this prefix term's leading term against {@code term}. */
  public int compareTo(BytesRef term) {
    return this.term.compareTo(term);
  }

  /** Wraps {@code in} so that it starts at this prefix term's leading term and stops at the
   *  first term that no longer starts with the prefix or whose suffix lead byte exceeds
   *  floorLeadEnd. */
  public TermsEnum getTermsEnum(TermsEnum in) {
    final BytesRef prefixRef = new BytesRef(prefix);

    return new FilteredTermsEnum(in) {
      {
        setInitialSeekTerm(term);
      }

      @Override
      protected AcceptStatus accept(BytesRef candidate) {
        if (!StringHelper.startsWith(candidate, prefixRef)) {
          return AcceptStatus.END;
        }
        final boolean withinRange =
            floorLeadEnd == -1
            || candidate.length == prefixRef.length
            || (candidate.bytes[candidate.offset + prefixRef.length] & 0xff) <= floorLeadEnd;
        return withinRange ? AcceptStatus.YES : AcceptStatus.END;
      }
    };
  }
}
// Debug helper: renders a term as its UTF-8 text followed by the BytesRef's own string form.
static String brToString(BytesRef b) {
  String rendered;
  try {
    rendered = b.utf8ToString() + " " + b;
  } catch (Throwable t) {
    // If BytesRef isn't actually UTF8, or it's eg a
    // prefix of UTF8 that ends mid-unicode-char, we
    // fallback to hex:
    rendered = b.toString();
  }
  return rendered;
}
final List<PrefixTerm> prefixes = new ArrayList<>();
private final int minItemsInPrefix;
private final int maxItemsInPrefix;
// prefixStarts records the index into pending where the current prefix at that
// length "started"; for example, if the current term starts with 't',
// prefixStarts[0] is the index into pending of the first
// term/prefix-term starting with 't'. We use this to figure out when
// enough entries share a prefix to save an auto-prefix term:
private final BytesRefBuilder lastTerm = new BytesRefBuilder();
private int[] prefixStarts = new int[8];
private List<Object> pending = new ArrayList<>();
//private final String segment;
/** Walks every term of {@code terms} and collects, into {@code prefixes}, the prefix terms
 *  found using the given min/max item counts; the resulting list is sorted. */
public AutoPrefixTermsWriter(Terms terms, int minItemsInPrefix, int maxItemsInPrefix) throws IOException {
  this.minItemsInPrefix = minItemsInPrefix;
  this.maxItemsInPrefix = maxItemsInPrefix;

  // Feed each term, in enum order, through the prefix stack:
  final TermsEnum allTerms = terms.iterator();
  for (BytesRef next = allTerms.next(); next != null; next = allTerms.next()) {
    pushTerm(next);
  }

  if (pending.size() > 1) {
    // A final empty term closes every prefix that is still open:
    pushTerm(BlockTreeTermsWriter.EMPTY_BYTES_REF);

    // Also maybe save floor prefixes in root block; this can be a biggish perf gain for large ranges:
    while (pending.size() >= minItemsInPrefix) {
      savePrefixes(0, pending.size());
    }
  }

  // Even though we visited terms in already-sorted order, the prefixes
  // can be slightly unsorted, e.g. aaaaa will be before aaa, so we
  // must sort here so our caller can do merge sort into actual terms
  // when writing.  Probably we should use CollectionUtil.timSort here?
  Collections.sort(prefixes);
}
/** Pushes the new term onto the pending stack, first saving prefix terms for every
 *  prefix of the previous term that the new term no longer shares. */
private void pushTerm(BytesRef text) throws IOException {
  // Length of the prefix shared by the previous term and this one:
  final int limit = Math.min(lastTerm.length(), text.length);
  int shared = 0;
  for (; shared < limit; shared++) {
    if (lastTerm.byteAt(shared) != text.bytes[text.offset+shared]) {
      break;
    }
  }

  // Close the "abandoned" suffix now, longest prefix first:
  for (int slot = lastTerm.length()-1; slot >= shared; slot--) {
    // topSize is how many items on top of the stack share the prefix being closed.
    // For large floor blocks, it's possible we should re-run on the new prefix terms
    // we just created, hence the re-computation after every save:
    for (int topSize = pending.size() - prefixStarts[slot];
         topSize >= minItemsInPrefix;
         topSize = pending.size() - prefixStarts[slot]) {
      savePrefixes(slot+1, topSize);
    }
  }

  if (prefixStarts.length < text.length) {
    prefixStarts = ArrayUtil.grow(prefixStarts, text.length);
  }

  // Init new tail: every newly opened prefix length starts at the current stack top.
  final int stackTop = pending.size();
  for (int slot = shared; slot < text.length; slot++) {
    prefixStarts[slot] = stackTop;
  }

  lastTerm.copyBytes(text);

  // Only append the first (optional) empty string, not the fake last one used to close all prefixes:
  if (text.length == 0 && !pending.isEmpty()) {
    return;
  }
  final byte[] copy = new byte[text.length];
  System.arraycopy(text.bytes, text.offset, copy, 0, text.length);
  pending.add(copy);
}
/**
 * Turns the top {@code count} entries of {@code pending} into one or more prefix terms.
 * Each entry is either a raw term ({@code byte[]}) or a previously saved {@link PrefixTerm}.
 * The new prefix terms are recorded through {@link #savePrefix}; the consumed entries are then
 * removed from {@code pending} and the new prefix terms are pushed in their place.
 *
 * <p>If the entries from the current range start to the end of {@code pending} number more than
 * {@code maxItemsInPrefix}, the range is cut into "floor" prefix terms at suffix lead-byte
 * boundaries, each cut happening once at least {@code minItemsInPrefix} entries have accumulated.
 * Otherwise a single whole-prefix term is saved, except when {@code prefixLength} is 0, in which
 * case nothing is saved.
 *
 * @param prefixLength length in bytes of the prefix shared by the entries
 * @param count number of entries on top of {@code pending} to process; must be positive
 */
void savePrefixes(int prefixLength, int count) throws IOException {
assert count > 0;
/*
if (DEBUG2) {
BytesRef br = new BytesRef(lastTerm.bytes());
br.length = prefixLength;
//System.out.println(" savePrefixes: seg=" + segment + " " + brToString(br) + " count=" + count + " pending.size()=" + pending.size());
System.out.println(" savePrefixes: " + brToString(br) + " count=" + count + " pending.size()=" + pending.size());
}
*/
// -2 means no suffix lead byte has been seen yet:
int lastSuffixLeadLabel = -2;
int start = pending.size()-count;
assert start >=0;
// Special case empty-string suffix case: we are being asked to build prefix terms for all aaa* terms, but
// the exact term aaa is here, and we must skip it (it is handled "higher", under the aa* terms):
Object o = pending.get(start);
boolean skippedEmptyStringSuffix = false;
if (o instanceof byte[]) {
if (((byte[]) o).length == prefixLength) {
start++;
count--;
//if (DEBUG) System.out.println(" skip empty-string term suffix");
skippedEmptyStringSuffix = true;
}
} else {
PrefixTerm prefix = (PrefixTerm) o;
if (prefix.term.bytes.length == prefixLength) {
start++;
count--;
//if (DEBUG) System.out.println(" skip empty-string PT suffix");
skippedEmptyStringSuffix = true;
}
}
int end = pending.size();
// Index in pending where the range currently being accumulated begins:
int nextBlockStart = start;
// Lead byte that opens the current range; -1 until the first entry has been seen:
int nextFloorLeadLabel = -1;
// Number of prefix terms saved by this call:
int prefixCount = 0;
// The previous entry, but only if it was a PrefixTerm whose prefix has exactly prefixLength bytes:
PrefixTerm lastPTEntry = null;
for (int i=start; i<end; i++) {
byte[] termBytes;
o = pending.get(i);
PrefixTerm ptEntry;
if (o instanceof byte[]) {
ptEntry = null;
termBytes = (byte[]) o;
} else {
ptEntry = (PrefixTerm) o;
termBytes = ptEntry.term.bytes;
// A prefix term with a longer prefix is treated like a plain term here:
if (ptEntry.prefix.length != prefixLength) {
assert ptEntry.prefix.length > prefixLength;
ptEntry = null;
}
}
//if (DEBUG) System.out.println(" check term=" + brToString(new BytesRef(termBytes)) + " o=" + o);
// We handled the empty-string suffix case up front:
assert termBytes.length > prefixLength;
int suffixLeadLabel = termBytes[prefixLength] & 0xff;
//if (DEBUG) System.out.println(" i=" + i + " o=" + o + " suffixLeadLabel=" + Integer.toHexString(suffixLeadLabel) + " pendingCount=" + (i - nextBlockStart) + " min=" + minItemsInPrefix);
if (suffixLeadLabel != lastSuffixLeadLabel) {
// This is a boundary, a chance to make an auto-prefix term if we want:
// When we are "recursing" (generating auto-prefix terms on a block of
// floor'd auto-prefix terms), this assert is non-trivial because it
// ensures the floorLeadEnd of the previous terms is in fact less
// than the lead start of the current entry:
assert suffixLeadLabel > lastSuffixLeadLabel: "suffixLeadLabel=" + suffixLeadLabel + " vs lastSuffixLeadLabel=" + lastSuffixLeadLabel;
int itemsInBlock = i - nextBlockStart;
if (itemsInBlock >= minItemsInPrefix && end-nextBlockStart > maxItemsInPrefix) {
// The count is too large for one block, so we must break it into "floor" blocks, where we record
// the leading label of the suffix of the first term in each floor block, so at search time we can
// jump to the right floor block. We just use a naive greedy segmenter here: make a new floor
// block as soon as we have at least minItemsInPrefix. This is not always best: it often produces
// a too-small block as the final block:
// If the last entry was another prefix term of the same length, then it represents a range of terms, so we must use its ending
// prefix label as our ending label:
if (lastPTEntry != null) {
//if (DEBUG) System.out.println(" use last");
lastSuffixLeadLabel = lastPTEntry.floorLeadEnd;
}
savePrefix(prefixLength, nextFloorLeadLabel, lastSuffixLeadLabel);
prefixCount++;
// The entry at i opens the next range:
nextFloorLeadLabel = suffixLeadLabel;
nextBlockStart = i;
}
if (nextFloorLeadLabel == -1) {
nextFloorLeadLabel = suffixLeadLabel;
//if (DEBUG) System.out.println("set first lead label=" + nextFloorLeadLabel);
}
lastSuffixLeadLabel = suffixLeadLabel;
}
lastPTEntry = ptEntry;
}
// Write last block, if any:
if (nextBlockStart < end) {
//System.out.println(" lastPTEntry=" + lastPTEntry + " lastSuffixLeadLabel=" + lastSuffixLeadLabel);
if (lastPTEntry != null) {
lastSuffixLeadLabel = lastPTEntry.floorLeadEnd;
}
assert lastSuffixLeadLabel >= nextFloorLeadLabel: "lastSuffixLeadLabel=" + lastSuffixLeadLabel + " nextFloorLeadLabel=" + nextFloorLeadLabel;
if (prefixCount == 0) {
// No floor range was cut above, so one whole-prefix term (floorLeadStart=-2, floorLeadEnd=0xff) covers everything:
if (prefixLength > 0) {
savePrefix(prefixLength, -2, 0xff);
prefixCount++;
// If we skipped empty string suffix, e.g. term aaa for prefix aaa*, since we
// are now writing the full aaa* prefix term, we include it here:
if (skippedEmptyStringSuffix) {
count++;
}
} else {
// Don't add a prefix term for all terms in the index!
}
} else {
if (lastSuffixLeadLabel == -2) {
// Special case when closing the empty string root block:
lastSuffixLeadLabel = 0xff;
}
savePrefix(prefixLength, nextFloorLeadLabel, lastSuffixLeadLabel);
prefixCount++;
}
}
// Remove slice from the top of the pending stack, that we just wrote:
pending.subList(pending.size()-count, pending.size()).clear();
// Append prefix terms for each prefix, since these count like real terms that also need to be "rolled up":
// (they are the last prefixCount elements of prefixes, pushed in the order they were saved)
for(int i=0;i<prefixCount;i++) {
PrefixTerm pt = prefixes.get(prefixes.size()-(prefixCount-i));
pending.add(pt);
}
}
/** Appends to {@code prefixes} a new {@link PrefixTerm} whose prefix is the first
 *  {@code prefixLength} bytes of the last pushed term, with the given floor lead bounds. */
private void savePrefix(int prefixLength, int floorLeadStart, int floorLeadEnd) {
  // Snapshot the leading bytes, since lastTerm is overwritten by the next pushed term:
  final byte[] leading = new byte[prefixLength];
  System.arraycopy(lastTerm.bytes(), 0, leading, 0, prefixLength);

  // -1 is the "not set yet" value savePrefixes starts nextFloorLeadLabel at; it must never get here:
  assert floorLeadStart != -1;
  assert floorLeadEnd != -1;

  prefixes.add(new PrefixTerm(leading, floorLeadStart, floorLeadEnd));
}
}

View File

@ -107,8 +107,11 @@ public final class BlockTreeTermsReader extends FieldsProducer {
* this field did write any auto-prefix terms. */
public static final int VERSION_AUTO_PREFIX_TERMS_COND = 2;
/** Auto-prefix terms have been superseded by points. */
public static final int VERSION_AUTO_PREFIX_TERMS_REMOVED = 3;
/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_AUTO_PREFIX_TERMS_COND;
public static final int VERSION_CURRENT = VERSION_AUTO_PREFIX_TERMS_REMOVED;
/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";
@ -150,15 +153,15 @@ public final class BlockTreeTermsReader extends FieldsProducer {
termsIn = state.directory.openInput(termsName, state.context);
version = CodecUtil.checkIndexHeader(termsIn, TERMS_CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
if (version < VERSION_AUTO_PREFIX_TERMS) {
// Old (pre-5.2.0) index, no auto-prefix terms:
if (version < VERSION_AUTO_PREFIX_TERMS || version >= VERSION_AUTO_PREFIX_TERMS_REMOVED) {
// Old (pre-5.2.0) or recent (6.2.0+) index, no auto-prefix terms:
this.anyAutoPrefixTerms = false;
} else if (version == VERSION_AUTO_PREFIX_TERMS) {
// 5.2.x index, might have auto-prefix terms:
this.anyAutoPrefixTerms = true;
} else {
// 5.3.x index, we record up front if we may have written any auto-prefix terms:
assert version >= VERSION_AUTO_PREFIX_TERMS_COND;
assert version == VERSION_AUTO_PREFIX_TERMS_COND;
byte b = termsIn.readByte();
if (b == 0) {
this.anyAutoPrefixTerms = false;

View File

@ -25,13 +25,11 @@ import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.blocktree.AutoPrefixTermsWriter.PrefixTerm;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
@ -90,15 +88,6 @@ import org.apache.lucene.util.packed.PackedInts;
* between two index terms.
* <p>
*
* If {@code minItemsInAutoPrefix} is not zero, then for
* {@link IndexOptions#DOCS} fields we detect prefixes that match
* "enough" terms and insert auto-prefix terms into the index, which are
* used by {@link Terms#intersect} at search time to speed up prefix
* and range queries. Besides {@link Terms#intersect}, these
* auto-prefix terms are invisible to all other APIs (don't change terms
* stats, don't show up in normal {@link TermsEnum}s, etc.).
* <p>
*
* Files:
* <ul>
* <li><tt>.tim</tt>: <a href="#Termdictionary">Term Dictionary</a></li>
@ -124,7 +113,7 @@ import org.apache.lucene.util.packed.PackedInts;
* and decoding the Postings Metadata and Term Metadata sections.</p>
*
* <ul>
* <li>TermsDict (.tim) --&gt; Header, HasAutoPrefixTerms, <i>PostingsHeader</i>, NodeBlock<sup>NumBlocks</sup>,
* <li>TermsDict (.tim) --&gt; Header, <i>PostingsHeader</i>, NodeBlock<sup>NumBlocks</sup>,
* FieldSummary, DirOffset, Footer</li>
* <li>NodeBlock --&gt; (OuterNode | InnerNode)</li>
* <li>OuterNode --&gt; EntryCount, SuffixLength, Byte<sup>SuffixLength</sup>, StatsLength, &lt; TermStats &gt;<sup>EntryCount</sup>, MetaLength, &lt;<i>TermMetadata</i>&gt;<sup>EntryCount</sup></li>
@ -145,7 +134,6 @@ import org.apache.lucene.util.packed.PackedInts;
* <ul>
* <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
* for the BlockTree implementation.</li>
* <li>HasAutoPrefixTerms is a single byte; 1 means there may be auto-prefix terms and 0 means there are none.
* <li>DirOffset is a pointer to the FieldSummary section.</li>
* <li>DocFreq is the count of documents which contain the term.</li>
* <li>TotalTermFreq is the total number of occurrences of the term. This is encoded
@ -223,8 +211,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
final int maxDoc;
final int minItemsInBlock;
final int maxItemsInBlock;
final int minItemsInAutoPrefix;
final int maxItemsInAutoPrefix;
final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
@ -260,43 +246,14 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
private final List<FieldMetaData> fields = new ArrayList<>();
// private final String segment;
final FixedBitSet prefixDocs;
/** Reused in getAutoPrefixTermsEnum: */
final BitSetTermsEnum prefixFixedBitsTermsEnum;
/** Reused in getAutoPrefixTermsEnum: */
private TermsEnum prefixTermsEnum;
/** Reused in getAutoPrefixTermsEnum: */
private PostingsEnum prefixDocsEnum;
/** Create a new writer, using default values for auto-prefix terms.
 * Delegates to the six-argument constructor, passing 0 for both
 * {@code minItemsInAutoPrefix} and {@code maxItemsInAutoPrefix}. */
public BlockTreeTermsWriter(SegmentWriteState state,
PostingsWriterBase postingsWriter,
int minItemsInBlock,
int maxItemsInBlock) throws IOException {
this(state, postingsWriter, minItemsInBlock, maxItemsInBlock, 0, 0);
}
/** Create a new writer. The number of items (terms or
* sub-blocks) per block will aim to be between
* minItemsPerBlock and maxItemsPerBlock, though in some
* cases the blocks may be smaller than the min.
* For DOCS_ONLY fields, this terms dictionary will
* insert automatically generated prefix terms for common
* prefixes, as long as each prefix matches at least
* {@code minItemsInAutoPrefix} other terms or prefixes,
* and at most {@code maxItemsInAutoPrefix} other terms
* or prefixes. Set {@code minItemsInAutoPrefix} to 0
* to disable auto-prefix terms. */
* cases the blocks may be smaller than the min. */
public BlockTreeTermsWriter(SegmentWriteState state,
PostingsWriterBase postingsWriter,
int minItemsInBlock,
int maxItemsInBlock,
int minItemsInAutoPrefix,
int maxItemsInAutoPrefix)
int maxItemsInBlock)
throws IOException
{
validateSettings(minItemsInBlock,
@ -305,21 +262,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
this.minItemsInBlock = minItemsInBlock;
this.maxItemsInBlock = maxItemsInBlock;
validateAutoPrefixSettings(minItemsInAutoPrefix,
maxItemsInAutoPrefix);
if (minItemsInAutoPrefix != 0) {
// TODO: can we used compressed bitset instead? that auto-upgrades if it's dense enough...
prefixDocs = new FixedBitSet(state.segmentInfo.maxDoc());
prefixFixedBitsTermsEnum = new BitSetTermsEnum(prefixDocs);
} else {
prefixDocs = null;
prefixFixedBitsTermsEnum = null;
}
this.minItemsInAutoPrefix = minItemsInAutoPrefix;
this.maxItemsInAutoPrefix = maxItemsInAutoPrefix;
this.maxDoc = state.segmentInfo.maxDoc();
this.fieldInfos = state.fieldInfos;
this.postingsWriter = postingsWriter;
@ -332,13 +274,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
CodecUtil.writeIndexHeader(termsOut, BlockTreeTermsReader.TERMS_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
// So at read time we know, globally, that there will be no auto-prefix terms:
if (minItemsInAutoPrefix == 0) {
termsOut.writeByte((byte) 0);
} else {
termsOut.writeByte((byte) 1);
}
final String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockTreeTermsReader.TERMS_INDEX_EXTENSION);
indexOut = state.directory.createOutput(indexName, state.context);
CodecUtil.writeIndexHeader(indexOut, BlockTreeTermsReader.TERMS_INDEX_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT,
@ -380,25 +315,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
}
}
/** Throws {@code IllegalArgumentException} if any of these settings
 *  is invalid.
 *
 *  <p>Valid combinations are: both values 0, or {@code minItemsInAutoPrefix >= 2} with
 *  {@code maxItemsInAutoPrefix >= minItemsInAutoPrefix} and
 *  {@code maxItemsInAutoPrefix >= 2*(minItemsInAutoPrefix-1)}.
 *
 *  @param minItemsInAutoPrefix minimum number of items per auto-prefix term, or 0
 *  @param maxItemsInAutoPrefix maximum number of items per auto-prefix term; must be 0
 *         when {@code minItemsInAutoPrefix} is 0
 *  @throws IllegalArgumentException if the combination is not valid */
public static void validateAutoPrefixSettings(int minItemsInAutoPrefix,
                                              int maxItemsInAutoPrefix) {
  if (minItemsInAutoPrefix != 0) {
    if (minItemsInAutoPrefix < 2) {
      throw new IllegalArgumentException("minItemsInAutoPrefix must be at least 2; got minItemsInAutoPrefix=" + minItemsInAutoPrefix);
    }
    if (minItemsInAutoPrefix > maxItemsInAutoPrefix) {
      throw new IllegalArgumentException("maxItemsInAutoPrefix must be >= minItemsInAutoPrefix; got maxItemsInAutoPrefix=" + maxItemsInAutoPrefix + " minItemsInAutoPrefix=" + minItemsInAutoPrefix);
    }
    // Multiply as long: in int arithmetic 2*(min-1) wraps negative once min exceeds 2^30,
    // which would let an invalid min/max pair slip through this check.
    if (2L*(minItemsInAutoPrefix-1) > maxItemsInAutoPrefix) {
      throw new IllegalArgumentException("maxItemsInAutoPrefix must be at least 2*(minItemsInAutoPrefix-1); got maxItemsInAutoPrefix=" + maxItemsInAutoPrefix + " minItemsInAutoPrefix=" + minItemsInAutoPrefix);
    }
  } else if (maxItemsInAutoPrefix != 0) {
    throw new IllegalArgumentException("maxItemsInAutoPrefix must be 0 (disabled) when minItemsInAutoPrefix is 0");
  }
}
@Override
public void write(Fields fields) throws IOException {
//if (DEBUG) System.out.println("\nBTTW.write seg=" + segment);
@ -413,75 +329,26 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
if (terms == null) {
continue;
}
FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
// First pass to find all prefix terms we should compile into the index:
List<PrefixTerm> prefixTerms;
if (minItemsInAutoPrefix != 0) {
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
throw new IllegalStateException("ranges can only be indexed with IndexOptions.DOCS (field: " + fieldInfo.name + ")");
}
prefixTerms = new AutoPrefixTermsWriter(terms, minItemsInAutoPrefix, maxItemsInAutoPrefix).prefixes;
//if (DEBUG) {
// for(PrefixTerm term : prefixTerms) {
// System.out.println("field=" + fieldInfo.name + " PREFIX TERM: " + term);
// }
//}
} else {
prefixTerms = null;
}
TermsEnum termsEnum = terms.iterator();
TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field));
int prefixTermUpto = 0;
while (true) {
BytesRef term = termsEnum.next();
//if (DEBUG) System.out.println("BTTW: next term " + term);
// Insert (merge sort) next prefix term(s):
if (prefixTerms != null) {
while (prefixTermUpto < prefixTerms.size() && (term == null || prefixTerms.get(prefixTermUpto).compareTo(term) <= 0)) {
PrefixTerm prefixTerm = prefixTerms.get(prefixTermUpto);
//if (DEBUG) System.out.println("seg=" + segment + " field=" + fieldInfo.name + " NOW INSERT prefix=" + prefixTerm);
termsWriter.write(prefixTerm.term, getAutoPrefixTermsEnum(terms, prefixTerm), prefixTerm);
prefixTermUpto++;
}
}
if (term == null) {
break;
}
//if (DEBUG) System.out.println("write field=" + fieldInfo.name + " term=" + brToString(term));
termsWriter.write(term, termsEnum, null);
termsWriter.write(term, termsEnum);
}
assert prefixTerms == null || prefixTermUpto == prefixTerms.size();
termsWriter.finish();
//if (DEBUG) System.out.println("\nBTTW.write done seg=" + segment + " field=" + field);
}
}
/** Fills {@code prefixDocs} with the union of the postings of every term that
 *  {@code prefix.getTermsEnum} yields over {@code terms}, then returns the shared
 *  {@code prefixFixedBitsTermsEnum}. The bit set and the enums are reused across calls. */
private TermsEnum getAutoPrefixTermsEnum(Terms terms, final PrefixTerm prefix) throws IOException {
  assert prefixDocs != null;

  // Start from an empty bit set, since it is shared between calls:
  prefixDocs.clear(0, prefixDocs.length());

  prefixTermsEnum = prefix.getTermsEnum(terms.iterator());

  boolean hasTerm = prefixTermsEnum.next() != null;
  while (hasTerm) {
    // OR this term's documents into the accumulated set:
    prefixDocsEnum = prefixTermsEnum.postings(prefixDocsEnum, 0);
    prefixDocs.or(prefixDocsEnum);
    hasTerm = prefixTermsEnum.next() != null;
  }

  return prefixFixedBitsTermsEnum;
}
static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) {
assert fp < (1L << 62);
@ -500,16 +367,12 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
public final byte[] termBytes;
// stats + metadata
public final BlockTermState state;
// Non-null if this is an auto-prefix-term:
public final PrefixTerm prefixTerm;
public PendingTerm other;
public PendingTerm(BytesRef term, BlockTermState state, PrefixTerm prefixTerm) {
public PendingTerm(BytesRef term, BlockTermState state) {
super(true);
this.termBytes = new byte[term.length];
System.arraycopy(term.bytes, term.offset, termBytes, 0, term.length);
this.state = state;
this.prefixTerm = prefixTerm;
}
@Override
@ -698,7 +561,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// only points to sub-blocks in the terms index so we can avoid seeking
// to it when we are looking for a term):
boolean hasTerms = false;
boolean hasPrefixTerms = false;
boolean hasSubBlocks = false;
int start = pending.size()-count;
@ -739,11 +601,10 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// block as soon as we have at least minItemsInBlock. This is not always best: it often produces
// a too-small block as the final block:
boolean isFloor = itemsInBlock < count;
newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, i, hasTerms, hasPrefixTerms, hasSubBlocks));
newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, i, hasTerms, hasSubBlocks));
hasTerms = false;
hasSubBlocks = false;
hasPrefixTerms = false;
nextFloorLeadLabel = suffixLeadLabel;
nextBlockStart = i;
}
@ -753,7 +614,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
if (ent.isTerm) {
hasTerms = true;
hasPrefixTerms |= ((PendingTerm) ent).prefixTerm != null;
} else {
hasSubBlocks = true;
}
@ -763,7 +623,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
if (nextBlockStart < end) {
int itemsInBlock = end - nextBlockStart;
boolean isFloor = itemsInBlock < count;
newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, end, hasTerms, hasPrefixTerms, hasSubBlocks));
newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, end, hasTerms, hasSubBlocks));
}
assert newBlocks.isEmpty() == false;
@ -789,7 +649,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
* same prefix, and so we broke it into multiple floor blocks where
* we record the starting label of the suffix of each floor block. */
private PendingBlock writeBlock(int prefixLength, boolean isFloor, int floorLeadLabel, int start, int end,
boolean hasTerms, boolean hasPrefixTerms, boolean hasSubBlocks) throws IOException {
boolean hasTerms, boolean hasSubBlocks) throws IOException {
assert end > start;
@ -823,7 +683,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// We optimize the leaf block case (block has only terms), writing a more
// compact format in this case:
boolean isLeafBlock = hasSubBlocks == false && hasPrefixTerms == false;
boolean isLeafBlock = hasSubBlocks == false;
//System.out.println(" isLeaf=" + isLeafBlock);
@ -839,7 +699,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
assert ent.isTerm: "i=" + i;
PendingTerm term = (PendingTerm) ent;
assert term.prefixTerm == null;
assert StringHelper.startsWith(term.termBytes, prefix): "term.term=" + term.termBytes + " prefix=" + prefix;
BlockTermState state = term.state;
@ -876,7 +735,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
} else {
// Block has at least one sub block:
subIndices = new ArrayList<>();
boolean sawAutoPrefixTerm = false;
for (int i=start;i<end;i++) {
PendingEntry ent = pending.get(i);
if (ent.isTerm) {
@ -890,9 +748,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// System.arraycopy(term.termBytes, prefixLength, suffixBytes.bytes, 0, suffix);
// suffixBytes.length = suffix;
// System.out.println(" write term suffix=" + brToString(suffixBytes));
// if (term.prefixTerm != null) {
// System.out.println(" ** auto-prefix term: " + term.prefixTerm);
// }
//}
// For non-leaf block we borrow 1 bit to record
@ -900,33 +755,8 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// it's a prefix term. Terms cannot be larger than ~32 KB
// so we won't run out of bits:
if (minItemsInAutoPrefix == 0) {
suffixWriter.writeVInt(suffix << 1);
suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
} else {
code = suffix<<2;
int floorLeadEnd = -1;
if (term.prefixTerm != null) {
assert minItemsInAutoPrefix > 0;
sawAutoPrefixTerm = true;
PrefixTerm prefixTerm = term.prefixTerm;
floorLeadEnd = prefixTerm.floorLeadEnd;
assert floorLeadEnd != -1;
if (prefixTerm.floorLeadStart == -2) {
// Starts with empty string
code |= 2;
} else {
code |= 3;
}
}
suffixWriter.writeVInt(code);
suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
if (floorLeadEnd != -1) {
suffixWriter.writeByte((byte) floorLeadEnd);
}
assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel;
}
suffixWriter.writeVInt(suffix << 1);
suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
// Write term stats, to separate byte[] blob:
statsWriter.writeVInt(state.docFreq);
@ -961,13 +791,8 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
assert suffix > 0;
// For non-leaf block we borrow 1 bit to record
// if entry is term or sub-block, and 1 bit (unset here) to
// record if it's a prefix term:
if (minItemsInAutoPrefix == 0) {
suffixWriter.writeVInt((suffix<<1)|1);
} else {
suffixWriter.writeVInt((suffix<<2)|1);
}
// if entry is term or sub-block:
suffixWriter.writeVInt((suffix<<1)|1);
suffixWriter.writeBytes(block.prefix.bytes, prefixLength, suffix);
//if (DEBUG2) {
@ -985,7 +810,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
}
}
assert subIndices.size() != 0 || sawAutoPrefixTerm;
assert subIndices.size() != 0;
}
// TODO: we could block-write the term suffix pointers;
@ -1029,7 +854,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
}
/** Writes one term's worth of postings. */
public void write(BytesRef text, TermsEnum termsEnum, PrefixTerm prefixTerm) throws IOException {
public void write(BytesRef text, TermsEnum termsEnum) throws IOException {
/*
if (DEBUG) {
int[] tmp = new int[lastTerm.length];
@ -1045,20 +870,17 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
assert fieldInfo.getIndexOptions() == IndexOptions.DOCS || state.totalTermFreq >= state.docFreq: "postingsWriter=" + postingsWriter;
pushTerm(text);
PendingTerm term = new PendingTerm(text, state, prefixTerm);
PendingTerm term = new PendingTerm(text, state);
pending.add(term);
//if (DEBUG) System.out.println(" add pending term = " + text + " pending.size()=" + pending.size());
if (prefixTerm == null) {
// Only increment stats for real terms:
sumDocFreq += state.docFreq;
sumTotalTermFreq += state.totalTermFreq;
numTerms++;
if (firstPendingTerm == null) {
firstPendingTerm = term;
}
lastPendingTerm = term;
sumDocFreq += state.docFreq;
sumTotalTermFreq += state.totalTermFreq;
numTerms++;
if (firstPendingTerm == null) {
firstPendingTerm = term;
}
lastPendingTerm = term;
}
}

View File

@ -348,7 +348,6 @@ final class IntersectTermsEnum extends TermsEnum {
private boolean skipPastLastAutoPrefixTerm() throws IOException {
assert currentFrame.isAutoPrefixTerm;
useAutoPrefixTerm = false;
currentFrame.termState.isRealTerm = true;
// If we last returned an auto-prefix term, we must now skip all
// actual terms sharing that prefix. At most, that skipping
@ -694,7 +693,6 @@ final class IntersectTermsEnum extends TermsEnum {
if (useAutoPrefixTerm) {
// All suffixes of this auto-prefix term are accepted by the automaton, so we can use it:
copyTerm();
currentFrame.termState.isRealTerm = false;
return term;
} else {
// We move onto the next term

View File

@ -177,18 +177,6 @@ public final class TermContext {
return totalTermFreq;
}
/** Returns true if all terms stored here are real (e.g., not auto-prefix terms).
*
* @return {@code false} as soon as any non-null entry of {@code states} reports
* {@code isRealTerm() == false}; {@code true} otherwise, including when every
* entry is null
* @lucene.internal */
public boolean hasOnlyRealTerms() {
for (TermState termState : states) {
// Null slots are skipped rather than treated as non-real.
if (termState != null && termState.isRealTerm() == false) {
return false;
}
}
return true;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();

View File

@ -48,12 +48,6 @@ public abstract class TermState implements Cloneable {
// should not happen
throw new RuntimeException(cnse);
}
}
/** Returns true if this term is real (e.g., not an auto-prefix term).
* This implementation unconditionally returns {@code true}.
* @lucene.internal */
public boolean isRealTerm() {
return true;
}
@Override

View File

@ -122,10 +122,6 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
return true;
}
TermState state = termsEnum.termState();
if (state.isRealTerm() == false) {
// TermQuery does not accept fake terms for now
return false;
}
terms.add(new TermAndState(BytesRef.deepCopyOf(term), state, termsEnum.docFreq(), termsEnum.totalTermFreq()));
}
return termsEnum.next() == null;

View File

@ -115,7 +115,7 @@ public abstract class ScoringRewrite<B> extends TermCollectingRewrite<B> {
for (int i = 0; i < size; i++) {
final int pos = sort[i];
final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef()));
assert termStates[pos].hasOnlyRealTerms() == false || reader.docFreq(term) == termStates[pos].docFreq();
assert reader.docFreq(term) == termStates[pos].docFreq();
addClause(builder, term, termStates[pos].docFreq(), boost[pos], termStates[pos]);
}
}

View File

@ -53,8 +53,6 @@ public class TermQuery extends Query {
super(TermQuery.this);
this.needsScores = needsScores;
assert termStates != null : "TermContext must not be null";
// checked with a real exception in TermQuery constructor
assert termStates.hasOnlyRealTerms();
this.termStates = termStates;
this.similarity = searcher.getSimilarity(needsScores);
@ -166,12 +164,6 @@ public class TermQuery extends Query {
public TermQuery(Term t, TermContext states) {
// Binds this query to a term plus its pre-resolved TermContext, rejecting contexts that
// contain any non-real term.
assert states != null;
// Throws NullPointerException if the term is null.
term = Objects.requireNonNull(t);
if (states.hasOnlyRealTerms() == false) {
// The reason for this is that fake terms might have the same bytes as
// real terms, and this confuses query caching because they don't match
// the same documents
throw new IllegalArgumentException("Term queries must be created on real terms");
}
// states has already been dereferenced above, so a null would have failed earlier;
// this call restates the non-null requirement at the point of assignment.
perReaderTermState = Objects.requireNonNull(states);
}

View File

@ -345,10 +345,6 @@ public final class SolrRangeQuery extends ExtendedQueryBase implements DocSetPro
return -count - 1;
}
TermState state = termsEnum.termState();
if (state.isRealTerm() == false) {
// TermQuery does not accept fake terms for now
return count;
}
int df = termsEnum.docFreq();
count += df;
terms.add(new TermAndState(BytesRef.deepCopyOf(term), state, df, termsEnum.totalTermFreq()));