LUCENE-5879: add auto-prefix terms to block tree, and experimental AutoPrefixTermsPostingsFormat

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1670918 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2015-04-02 15:05:48 +00:00
parent 658a131002
commit 30807709e6
45 changed files with 4867 additions and 1772 deletions

View File

@ -19,6 +19,10 @@ New Features
for counting ranges that align with the underlying terms as defined by the
NumberRangePrefixTree (e.g. familiar date units like days). (David Smiley)
* LUCENE-5879: Added experimental auto-prefix terms to BlockTree terms
dictionary, exposed as AutoPrefixPostingsFormat (Adrien Grand,
Uwe Schindler, Robert Muir, Mike McCandless)
API Changes
* LUCENE-3312: The API of oal.document was restructured to

View File

@ -0,0 +1,125 @@
package org.apache.lucene.codecs.autoprefix;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsReader;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
/**
* Just like {@link Lucene50PostingsFormat} except this format
* exposes the experimental auto-prefix terms.
*
* @lucene.experimental
*/
public final class AutoPrefixPostingsFormat extends PostingsFormat {
private final int minItemsInBlock;
private final int maxItemsInBlock;
private final int minItemsInAutoPrefix;
private final int maxItemsInAutoPrefix;
/** Creates {@code AutoPrefixPostingsFormat} with default settings. */
public AutoPrefixPostingsFormat() {
this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE,
25, 48);
}
/** Creates {@code AutoPrefixPostingsFormat} with default block sizes and
* custom values for {@code minItemsInAutoPrefix} and {@code
* maxItemsInAutoPrefix}, passed to the block tree terms dictionary.
* @see BlockTreeTermsWriter#BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int,int,int) */
public AutoPrefixPostingsFormat(int minItemsInAutoPrefix, int maxItemsInAutoPrefix) {
this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE,
minItemsInAutoPrefix,
maxItemsInAutoPrefix);
}
/** Creates {@code AutoPrefixPostingsFormat} with custom
* values for {@code minItemsInBlock}, {@code
* maxItemsInBlock}, {@code minItemsInAutoPrefix} and {@code maxItemsInAutoPrefix}, passed
* to the block tree terms dictionary.
* @see BlockTreeTermsWriter#BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int,int,int) */
public AutoPrefixPostingsFormat(int minItemsInBlock, int maxItemsInBlock, int minItemsInAutoPrefix, int maxItemsInAutoPrefix) {
super("AutoPrefix");
BlockTreeTermsWriter.validateSettings(minItemsInBlock,
maxItemsInBlock);
BlockTreeTermsWriter.validateAutoPrefixSettings(minItemsInAutoPrefix,
maxItemsInAutoPrefix);
this.minItemsInBlock = minItemsInBlock;
this.maxItemsInBlock = maxItemsInBlock;
this.minItemsInAutoPrefix = minItemsInAutoPrefix;
this.maxItemsInAutoPrefix = maxItemsInAutoPrefix;
}
@Override
public String toString() {
return getName();
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(state);
boolean success = false;
try {
FieldsConsumer ret = new BlockTreeTermsWriter(state,
postingsWriter,
minItemsInBlock,
maxItemsInBlock,
minItemsInAutoPrefix,
maxItemsInAutoPrefix);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene50PostingsReader(state);
boolean success = false;
try {
FieldsProducer ret = new BlockTreeTermsReader(postingsReader, state);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsReader);
}
}
}
}
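For orientation, a minimal sketch (not part of this commit) of wiring the format into an IndexWriter by overriding the default codec's per-field postings format; Lucene50Codec is the default codec on this branch, and dir/analyzer are assumed to exist:
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
iwc.setCodec(new Lucene50Codec() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
// Auto-prefix terms require IndexOptions.DOCS fields:
return new AutoPrefixPostingsFormat();
}
});
IndexWriter w = new IndexWriter(dir, iwc);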

View File

@ -0,0 +1,22 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* An experimental postings format that automatically indexes appropriate
* prefix terms for fast range and prefix queries.
*/
package org.apache.lucene.codecs.autoprefix;

View File

@ -20,3 +20,4 @@ org.apache.lucene.codecs.memory.FSTOrdPostingsFormat
org.apache.lucene.codecs.memory.FSTPostingsFormat
org.apache.lucene.codecs.memory.MemoryPostingsFormat
org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat
org.apache.lucene.codecs.autoprefix.AutoPrefixPostingsFormat
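Registering the class in this META-INF/services file lets SPI resolve the format by the name passed to super("AutoPrefix") in its constructor; a small sketch of that lookup:
PostingsFormat pf = PostingsFormat.forName("AutoPrefix");
assert pf instanceof AutoPrefixPostingsFormat;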

View File

@ -0,0 +1,38 @@
package org.apache.lucene.codecs.autoprefix;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.RandomPostingsTester;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/**
* Tests AutoPrefix's postings
*/
// NOTE: we don't extend BasePostingsFormatTestCase because we can only handle DOCS_ONLY fields:
public class TestAutoPrefixPostingsFormat extends LuceneTestCase {
public void test() throws Exception {
new RandomPostingsTester(random()).testFull(TestUtil.alwaysPostingsFormat(new AutoPrefixPostingsFormat()),
createTempDir("autoprefix"),
IndexOptions.DOCS,
false);
}
}

View File

@ -0,0 +1,738 @@
package org.apache.lucene.codecs.autoprefix;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CompiledAutomaton;
public class TestAutoPrefixTerms extends LuceneTestCase {
private int minItemsPerBlock = TestUtil.nextInt(random(), 2, 100);
private int maxItemsPerBlock = 2*(Math.max(2, minItemsPerBlock-1)) + random().nextInt(100);
private int minTermsAutoPrefix = TestUtil.nextInt(random(), 2, 100);
private int maxTermsAutoPrefix = random().nextBoolean() ? Math.max(2, (minTermsAutoPrefix-1)*2 + random().nextInt(100)) : Integer.MAX_VALUE;
private final Codec codec = TestUtil.alwaysPostingsFormat(new AutoPrefixPostingsFormat(minItemsPerBlock, maxItemsPerBlock,
minTermsAutoPrefix, maxTermsAutoPrefix));
// Numbers in a restricted range, encoded in decimal, left-0-padded:
public void testBasicNumericRanges() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
iwc.setCodec(codec);
IndexWriter w = new IndexWriter(dir, iwc);
int numTerms = TestUtil.nextInt(random(), 3000, 50000);
Set<String> terms = new HashSet<>();
int digits = TestUtil.nextInt(random(), 5, 10);
int maxValue = 1;
for(int i=0;i<digits;i++) {
maxValue *= 10;
}
String format = "%0" + digits + "d";
while (terms.size() < numTerms) {
terms.add(String.format(Locale.ROOT, format, random().nextInt(maxValue)));
}
for(String term : terms) {
Document doc = new Document();
doc.add(new StringField("field", term, Field.Store.NO));
doc.add(new NumericDocValuesField("field", Long.parseLong(term)));
w.addDocument(doc);
}
if (random().nextBoolean()) {
if (VERBOSE) System.out.println("\nTEST: now force merge");
w.forceMerge(1);
}
if (VERBOSE) System.out.println("\nTEST: now done");
IndexReader r = DirectoryReader.open(w, true);
List<String> sortedTerms = new ArrayList<>(terms);
Collections.sort(sortedTerms);
if (VERBOSE) {
System.out.println("TEST: sorted terms:");
int idx = 0;
for(String term : sortedTerms) {
System.out.println(idx + ": " + term);
idx++;
}
}
int iters = atLeast(100);
for(int iter=0;iter<iters;iter++) {
int min, max;
while (true) {
min = random().nextInt(maxValue);
max = random().nextInt(maxValue);
if (min == max) {
continue;
} else if (min > max) {
int x = min;
min = max;
max = x;
}
break;
}
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter + " min=" + min + " max=" + max);
}
boolean minInclusive = random().nextBoolean();
boolean maxInclusive = random().nextBoolean();
BytesRef minTerm = new BytesRef(String.format(Locale.ROOT, format, min));
BytesRef maxTerm = new BytesRef(String.format(Locale.ROOT, format, max));
CompiledAutomaton ca = new CompiledAutomaton(Automata.makeBinaryInterval(minTerm, minInclusive, maxTerm, maxInclusive),
true, false, Integer.MAX_VALUE, true);
TermsEnum te = ca.getTermsEnum(MultiFields.getTerms(r, "field"));
NumericDocValues docValues = MultiDocValues.getNumericValues(r, "field");
PostingsEnum postingsEnum = null;
VerifyAutoPrefixTerms verifier = new VerifyAutoPrefixTerms(r.maxDoc(), minTerm, maxTerm);
while (te.next() != null) {
if (VERBOSE) {
System.out.println(" got term=" + te.term().utf8ToString());
}
verifier.sawTerm(te.term());
postingsEnum = te.postings(null, postingsEnum);
int docID;
while ((docID = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
long v = docValues.get(docID);
assert v >= min && v <= max: "docID=" + docID + " v=" + v;
// The auto-prefix terms should never "overlap" one another, so we should only ever see a given docID one time:
if (VERBOSE) {
System.out.println(" got docID=" + docID + " v=" + v);
}
verifier.sawDoc(docID);
}
}
int startLoc = Collections.binarySearch(sortedTerms, String.format(Locale.ROOT, format, min));
if (startLoc < 0) {
startLoc = -startLoc-1;
} else if (minInclusive == false) {
startLoc++;
}
int endLoc = Collections.binarySearch(sortedTerms, String.format(Locale.ROOT, format, max));
if (endLoc < 0) {
endLoc = -endLoc-2;
} else if (maxInclusive == false) {
endLoc--;
}
verifier.finish(endLoc-startLoc+1, maxTermsAutoPrefix);
}
r.close();
w.close();
dir.close();
}
private static BytesRef intToBytes(int v) {
int sortableBits = v ^ 0x80000000;
BytesRef token = new BytesRef(4);
token.length = 4;
int index = 3;
while (index >= 0) {
token.bytes[index] = (byte) (sortableBits & 0xff);
index--;
sortableBits >>>= 8;
}
return token;
}
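// Illustrative mapping (assumed example, not in the original test):
// XOR-ing the sign bit makes signed ints sort correctly as unsigned
// big-endian bytes, so term order matches numeric order:
// Integer.MIN_VALUE -> 00 00 00 00
// -1 -> 7f ff ff ff
// 0 -> 80 00 00 00
// Integer.MAX_VALUE -> ff ff ff ff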
// Numbers are encoded in full binary (4 byte ints):
public void testBinaryNumericRanges() throws Exception {
if (VERBOSE) {
System.out.println("TEST: minItemsPerBlock=" + minItemsPerBlock);
System.out.println("TEST: maxItemsPerBlock=" + maxItemsPerBlock);
System.out.println("TEST: minTermsAutoPrefix=" + minTermsAutoPrefix);
System.out.println("TEST: maxTermsAutoPrefix=" + maxTermsAutoPrefix);
}
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
iwc.setCodec(codec);
IndexWriter w = new IndexWriter(dir, iwc);
int numTerms = TestUtil.nextInt(random(), 3000, 50000);
Set<Integer> terms = new HashSet<>();
while (terms.size() < numTerms) {
terms.add(random().nextInt());
}
for(Integer term : terms) {
Document doc = new Document();
doc.add(new BinaryField("field", intToBytes(term)));
doc.add(new NumericDocValuesField("field", term));
w.addDocument(doc);
}
if (random().nextBoolean()) {
if (VERBOSE) System.out.println("TEST: now force merge");
w.forceMerge(1);
}
IndexReader r = DirectoryReader.open(w, true);
List<Integer> sortedTerms = new ArrayList<>(terms);
Collections.sort(sortedTerms);
if (VERBOSE) {
System.out.println("TEST: sorted terms:");
int idx = 0;
for(Integer term : sortedTerms) {
System.out.println(idx + ": " + term);
idx++;
}
}
int iters = atLeast(100);
for(int iter=0;iter<iters;iter++) {
int min, max;
while (true) {
min = random().nextInt();
max = random().nextInt();
if (min == max) {
continue;
} else if (min > max) {
int x = min;
min = max;
max = x;
}
break;
}
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter + " min=" + min + " (" + intToBytes(min) + ") max=" + max + " (" + intToBytes(max) + ")");
}
boolean minInclusive = random().nextBoolean();
BytesRef minTerm = intToBytes(min);
boolean maxInclusive = random().nextBoolean();
BytesRef maxTerm = intToBytes(max);
CompiledAutomaton ca = new CompiledAutomaton(Automata.makeBinaryInterval(minTerm, minInclusive, maxTerm, maxInclusive),
true, false, Integer.MAX_VALUE, true);
TermsEnum te = ca.getTermsEnum(MultiFields.getTerms(r, "field"));
NumericDocValues docValues = MultiDocValues.getNumericValues(r, "field");
PostingsEnum postingsEnum = null;
VerifyAutoPrefixTerms verifier = new VerifyAutoPrefixTerms(r.maxDoc(), minTerm, maxTerm);
while (te.next() != null) {
if (VERBOSE) {
System.out.println(" got term=" + te.term() + " docFreq=" + te.docFreq());
}
verifier.sawTerm(te.term());
postingsEnum = te.postings(null, postingsEnum);
int docID;
while ((docID = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
long v = docValues.get(docID);
assert v >= min && v <= max: "docID=" + docID + " v=" + v;
verifier.sawDoc(docID);
}
}
int startLoc = Collections.binarySearch(sortedTerms, min);
if (startLoc < 0) {
startLoc = -startLoc-1;
} else if (minInclusive == false) {
startLoc++;
}
int endLoc = Collections.binarySearch(sortedTerms, max);
if (endLoc < 0) {
endLoc = -endLoc-2;
} else if (maxInclusive == false) {
endLoc--;
}
int expectedHits = endLoc-startLoc+1;
try {
verifier.finish(expectedHits, maxTermsAutoPrefix);
} catch (AssertionError ae) {
for(int i=0;i<numTerms;i++) {
if (verifier.allHits.get(i) == false) {
int v = (int) docValues.get(i);
boolean accept = (v > min || (v == min && minInclusive)) &&
(v < max || (v == max && maxInclusive));
if (accept) {
System.out.println("MISSING: docID=" + i + " v=" + v + " term=" + intToBytes(v));
}
}
}
throw ae;
}
}
r.close();
w.close();
dir.close();
}
// Non-numeric, simple prefix query
public void testBasicPrefixTerms() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
iwc.setCodec(codec);
iwc.setMergeScheduler(new SerialMergeScheduler());
IndexWriter w = new IndexWriter(dir, iwc);
int numTerms = TestUtil.nextInt(random(), 3000, 50000);
Set<String> terms = new HashSet<>();
while (terms.size() < numTerms) {
terms.add(TestUtil.randomSimpleString(random()));
}
for(String term : terms) {
Document doc = new Document();
doc.add(new StringField("field", term, Field.Store.NO));
doc.add(new BinaryDocValuesField("field", new BytesRef(term)));
w.addDocument(doc);
}
if (random().nextBoolean()) {
if (VERBOSE) {
System.out.println("TEST: now force merge");
}
w.forceMerge(1);
}
IndexReader r = DirectoryReader.open(w, true);
List<String> sortedTerms = new ArrayList<>(terms);
Collections.sort(sortedTerms);
if (VERBOSE) {
System.out.println("TEST: sorted terms:");
int idx = 0;
for(String term : sortedTerms) {
System.out.println(idx + ": " + term);
idx++;
}
}
if (VERBOSE) {
System.out.println("TEST: r=" + r);
}
int iters = atLeast(100);
for(int iter=0;iter<iters;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter);
}
String prefix;
if (random().nextInt(100) == 42) {
prefix = "";
} else {
prefix = TestUtil.randomSimpleString(random(), 1, 4);
}
BytesRef prefixBR = new BytesRef(prefix);
if (VERBOSE) {
System.out.println(" prefix=" + prefix);
}
CompiledAutomaton ca = new CompiledAutomaton(PrefixQuery.toAutomaton(prefixBR), true, false, Integer.MAX_VALUE, true);
TermsEnum te = ca.getTermsEnum(MultiFields.getTerms(r, "field"));
BinaryDocValues docValues = MultiDocValues.getBinaryValues(r, "field");
PostingsEnum postingsEnum = null;
VerifyAutoPrefixTerms verifier = new VerifyAutoPrefixTerms(r.maxDoc(), prefixBR);
while (te.next() != null) {
if (VERBOSE) {
System.out.println("TEST: got term=" + te.term().utf8ToString() + " docFreq=" + te.docFreq());
}
verifier.sawTerm(te.term());
postingsEnum = te.postings(null, postingsEnum);
int docID;
while ((docID = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
assertTrue("prefixBR=" + prefixBR + " docBR=" + docValues.get(docID), StringHelper.startsWith(docValues.get(docID), prefixBR));
// The auto-prefix terms should never "overlap" one another, so we should only ever see a given docID one time:
verifier.sawDoc(docID);
}
}
int startLoc = Collections.binarySearch(sortedTerms, prefix);
if (startLoc < 0) {
startLoc = -startLoc-1;
}
int endLoc = Collections.binarySearch(sortedTerms, prefix + (char) ('z'+1));
if (endLoc < 0) {
endLoc = -endLoc-2;
}
int expectedHits = endLoc-startLoc+1;
try {
verifier.finish(expectedHits, maxTermsAutoPrefix);
} catch (AssertionError ae) {
for(int i=0;i<numTerms;i++) {
if (verifier.allHits.get(i) == false) {
String s = docValues.get(i).utf8ToString();
if (s.startsWith(prefix)) {
System.out.println("MISSING: docID=" + i + " term=" + s);
}
}
}
throw ae;
}
}
r.close();
w.close();
dir.close();
}
public void testDemoPrefixTerms() throws Exception {
if (VERBOSE) {
System.out.println("\nTEST: minTermsAutoPrefix=" + minTermsAutoPrefix + " maxTermsAutoPrefix=" + maxTermsAutoPrefix);
System.out.println("\nTEST: minItemsPerBlock=" + minItemsPerBlock + " maxItemsPerBlock=" + maxItemsPerBlock);
}
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
iwc.setCodec(codec);
IndexWriter w = new IndexWriter(dir, iwc);
int numDocs = 30;
for(int i=0;i<numDocs;i++) {
Document doc = new Document();
doc.add(new StringField("field", "" + (char) (97+i), Field.Store.NO));
w.addDocument(doc);
doc = new Document();
doc.add(new StringField("field", "a" + (char) (97+i), Field.Store.NO));
w.addDocument(doc);
}
if (random().nextBoolean()) {
w.forceMerge(1);
}
IndexReader r = DirectoryReader.open(w, true);
Terms terms = MultiFields.getTerms(r, "field");
if (VERBOSE) {
System.out.println("\nTEST: now intersect");
}
CompiledAutomaton ca = new CompiledAutomaton(PrefixQuery.toAutomaton(new BytesRef("a")), false, false, Integer.MAX_VALUE, true);
TermsEnum te = ca.getTermsEnum(terms);
PostingsEnum postingsEnum = null;
VerifyAutoPrefixTerms verifier = new VerifyAutoPrefixTerms(r.maxDoc(), new BytesRef("a"));
//TermsEnum te = terms.intersect(new CompiledAutomaton(a, true, false), null);
while (te.next() != null) {
verifier.sawTerm(te.term());
postingsEnum = te.postings(null, postingsEnum);
int docID;
while ((docID = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
// The auto-prefix terms should never "overlap" one another, so we should only ever see a given docID one time:
verifier.sawDoc(docID);
}
}
// 1 document has exactly "a", and 30 documents have "a" plus one more character:
verifier.finish(31, maxTermsAutoPrefix);
PrefixQuery q = new PrefixQuery(new Term("field", "a"));
q.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE);
assertEquals(31, newSearcher(r).search(q, 1).totalHits);
r.close();
w.close();
dir.close();
}
static final class BinaryTokenStream extends TokenStream {
private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);
private boolean available = true;
public BinaryTokenStream(BytesRef bytes) {
bytesAtt.setBytesRef(bytes);
}
@Override
public boolean incrementToken() {
if (available) {
clearAttributes();
available = false;
return true;
}
return false;
}
@Override
public void reset() {
available = true;
}
public interface ByteTermAttribute extends TermToBytesRefAttribute {
void setBytesRef(BytesRef bytes);
}
public static class ByteTermAttributeImpl extends AttributeImpl implements ByteTermAttribute,TermToBytesRefAttribute {
private BytesRef bytes;
@Override
public void fillBytesRef() {
// no-op: the bytes were already filled by our owner's incrementToken
}
@Override
public BytesRef getBytesRef() {
return bytes;
}
@Override
public void setBytesRef(BytesRef bytes) {
this.bytes = bytes;
}
@Override
public void clear() {}
@Override
public void copyTo(AttributeImpl target) {
ByteTermAttributeImpl other = (ByteTermAttributeImpl) target;
other.bytes = bytes;
}
}
}
/** Basically a StringField that accepts a binary term. */
private static class BinaryField extends Field {
final static FieldType TYPE;
static {
TYPE = new FieldType(StringField.TYPE_NOT_STORED);
// Necessary so our custom tokenStream is used by Field.tokenStream:
TYPE.setTokenized(true);
TYPE.freeze();
}
public BinaryField(String name, BytesRef value) {
super(name, new BinaryTokenStream(value), TYPE);
}
}
/** Helper class to ensure auto-prefix terms 1) never overlap one another, and 2) are used when they should be. */
private static class VerifyAutoPrefixTerms {
final FixedBitSet allHits;
private final Map<BytesRef,Integer> prefixCounts = new HashMap<>();
private int totPrefixCount;
private final BytesRef[] bounds;
private int totTermCount;
private BytesRef lastTerm;
public VerifyAutoPrefixTerms(int maxDoc, BytesRef... bounds) {
allHits = new FixedBitSet(maxDoc);
assert bounds.length > 0;
this.bounds = bounds;
}
public void sawTerm(BytesRef term) {
//System.out.println("saw term=" + term);
if (lastTerm != null) {
assertTrue(lastTerm.compareTo(term) < 0);
}
lastTerm = BytesRef.deepCopyOf(term);
totTermCount++;
totPrefixCount += term.length;
for(int i=1;i<=term.length;i++) {
BytesRef prefix = BytesRef.deepCopyOf(term);
prefix.length = i;
Integer count = prefixCounts.get(prefix);
if (count == null) {
count = 1;
} else {
count += 1;
}
prefixCounts.put(prefix, count);
}
}
public void sawDoc(int docID) {
// The auto-prefix terms should never "overlap" one another, so we should only ever see a given docID one time:
assertFalse(allHits.getAndSet(docID));
}
public void finish(int expectedNumHits, int maxPrefixCount) {
if (maxPrefixCount != -1) {
// Auto-terms were used in this test
long allowedMaxTerms;
if (bounds.length == 1) {
// Simple prefix query: we should never see more than maxPrefixCount terms:
allowedMaxTerms = maxPrefixCount;
} else {
// Trickier: we need to allow for maxPrefixTerms for each different leading byte in the min and max:
assert bounds.length == 2;
BytesRef minTerm = bounds[0];
BytesRef maxTerm = bounds[1];
int commonPrefix = 0;
for(int i=0;i<minTerm.length && i<maxTerm.length;i++) {
if (minTerm.bytes[minTerm.offset+i] != maxTerm.bytes[maxTerm.offset+i]) {
commonPrefix = i;
break;
}
}
allowedMaxTerms = maxPrefixCount * (long) ((minTerm.length-commonPrefix) + (maxTerm.length-commonPrefix));
}
assertTrue("totTermCount=" + totTermCount + " is > allowedMaxTerms=" + allowedMaxTerms, totTermCount <= allowedMaxTerms);
}
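// Worked example (illustrative values): for bounds minTerm="0002345" and
// maxTerm="0002789", the first differing byte is at index 4, so
// commonPrefix=4 and, with maxPrefixCount=48, allowedMaxTerms =
// 48 * ((7-4) + (7-4)) = 288.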
assertEquals(expectedNumHits, allHits.cardinality());
int sum = 0;
for(Map.Entry<BytesRef,Integer> ent : prefixCounts.entrySet()) {
BytesRef prefix = ent.getKey();
if (VERBOSE) {
System.out.println(" verify prefix=" + TestUtil.bytesRefToString(prefix) + " count=" + ent.getValue());
}
if (maxPrefixCount != -1) {
// Auto-terms were used in this test
int sumLeftoverSuffix = 0;
for(BytesRef bound : bounds) {
int minSharedLength = Math.min(bound.length, prefix.length);
int commonPrefix = minSharedLength;
for(int i=0;i<minSharedLength;i++) {
if (bound.bytes[bound.offset+i] != prefix.bytes[prefix.offset+i]) {
commonPrefix = i;
break;
}
}
sumLeftoverSuffix += bound.length - commonPrefix;
}
long limit = (1+sumLeftoverSuffix) * (long) maxPrefixCount;
assertTrue("maxPrefixCount=" + maxPrefixCount + " prefix=" + prefix + " sumLeftoverSuffix=" + sumLeftoverSuffix + " limit=" + limit + " vs actual=" +ent.getValue(),
ent.getValue() <= limit);
}
sum += ent.getValue();
}
// Make sure no test bug:
assertEquals(totPrefixCount, sum);
}
}
/** Make sure you get a clear exception if you try to use this with anything but IndexOptions.DOCS fields. */
public void testWithFreqs() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
iwc.setCodec(codec);
IndexWriter w = new IndexWriter(dir, iwc);
FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
Document doc = new Document();
doc.add(new Field("foo", "bar bar", ft));
w.addDocument(doc);
try {
w.commit();
fail("did not hit expected exception");
} catch (IllegalStateException ise) {
assertEquals("ranges can only be indexed with IndexOptions.DOCS (field: foo)", ise.getMessage());
}
w.close();
dir.close();
}
/** Make sure you get a clear exception if you try to use this with anything but IndexOptions.DOCS fields. */
public void testWithPositions() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
iwc.setCodec(codec);
IndexWriter w = new IndexWriter(dir, iwc);
FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
Document doc = new Document();
doc.add(new Field("foo", "bar bar", ft));
w.addDocument(doc);
try {
w.commit();
fail("did not hit expected exception");
} catch (IllegalStateException ise) {
assertEquals("ranges can only be indexed with IndexOptions.DOCS (field: foo)", ise.getMessage());
}
w.close();
dir.close();
}
/** Make sure you get a clear exception if you try to use this with anything but IndexOptions.DOCS fields. */
public void testWithOffsets() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
iwc.setCodec(codec);
IndexWriter w = new IndexWriter(dir, iwc);
FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Document doc = new Document();
doc.add(new Field("foo", "bar bar", ft));
w.addDocument(doc);
try {
w.commit();
fail("did not hit expected exception");
} catch (IllegalStateException ise) {
assertEquals("ranges can only be indexed with IndexOptions.DOCS (field: foo)", ise.getMessage());
}
w.close();
dir.close();
}
}

View File

@ -16,6 +16,7 @@ package org.apache.lucene.codecs;
* limitations under the License.
*/
import org.apache.lucene.codecs.blocktree.BlockTreeTermsReader; // javadocs
import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.TermState;
@ -23,6 +24,8 @@ import org.apache.lucene.index.TermState;
* Holds all state required for {@link PostingsReaderBase}
* to produce a {@link org.apache.lucene.index.PostingsEnum} without re-seeking the
* terms dict.
*
* @lucene.internal
*/
public class BlockTermState extends OrdTermState {
/** how many docs have this term */
@ -36,6 +39,11 @@ public class BlockTermState extends OrdTermState {
// TODO: update BTR to nuke this
public long blockFilePointer;
/** True if this term is "real" (e.g., not an auto-prefix term or
* some other "secret" term; currently only {@link BlockTreeTermsReader}
* sets this). */
public boolean isRealTerm;
/** Sole constructor. (For invocation by subclass
* constructors, typically implicit.) */
protected BlockTermState() {
@ -50,10 +58,11 @@ public class BlockTermState extends OrdTermState {
totalTermFreq = other.totalTermFreq;
termBlockOrd = other.termBlockOrd;
blockFilePointer = other.blockFilePointer;
isRealTerm = other.isRealTerm;
}
@Override
public String toString() {
return "docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " termBlockOrd=" + termBlockOrd + " blockFP=" + blockFilePointer;
return "docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " termBlockOrd=" + termBlockOrd + " blockFP=" + blockFilePointer + " isRealTerm=" + isRealTerm;
}
}

View File

@ -62,6 +62,7 @@ public abstract class PostingsFormat implements NamedSPILoader.NamedSPI {
* @param name must be all ascii alphanumeric, and less than 128 characters in length.
*/
protected PostingsFormat(String name) {
// TODO: can we somehow detect name conflicts here? Two different classes trying to claim the same name? Otherwise you see confusing errors...
NamedSPILoader.checkServiceName(name);
this.name = name;
}

View File

@ -0,0 +1,415 @@
package org.apache.lucene.codecs.blocktree;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.StringHelper;
// TODO: instead of inlining auto-prefix terms with normal terms,
// we could write them into their own virtual/private field. This
// would make search time a bit more complex, since we'd need to
// merge sort between two TermEnums, but it would also make stats
// API (used by CheckIndex -verbose) easier to implement since we could
// just walk this virtual field and gather its stats.
/** Used in the first pass when writing a segment to locate
* "appropriate" auto-prefix terms to pre-compile into the index.
* This visits every term in the index to find prefixes that
* are shared by at least min and at most max terms. */
class AutoPrefixTermsWriter {
//static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
//static boolean DEBUG = false;
//static boolean DEBUG2 = BlockTreeTermsWriter.DEBUG2;
//static boolean DEBUG2 = true;
/** Describes a range of term-space to match, either a simple prefix
* (foo*) or a floor-block range of a prefix (e.g. foo[a-m]*,
* foo[n-z]*) when there are too many terms starting with foo*. */
public static final class PrefixTerm implements Comparable<PrefixTerm> {
/** Common prefix */
public final byte[] prefix;
/** If this is -2, this is a normal prefix (foo*); else it's the minimum lead byte of the suffix (e.g. 'd' in foo[d-m]*). */
public final int floorLeadStart;
/** The lead byte (inclusive) of the suffix for the term range we match (e.g. 'm' in foo[d-m]*); this is ignored when
* floorLeadStart is -2. */
public final int floorLeadEnd;
public final BytesRef term;
/** Sole constructor. */
public PrefixTerm(byte[] prefix, int floorLeadStart, int floorLeadEnd) {
this.prefix = prefix;
this.floorLeadStart = floorLeadStart;
this.floorLeadEnd = floorLeadEnd;
this.term = toBytesRef(prefix, floorLeadStart);
assert floorLeadEnd >= floorLeadStart;
assert floorLeadEnd >= 0;
assert floorLeadStart == -2 || floorLeadStart >= 0;
// We should never create an empty-string prefix term:
assert prefix.length > 0 || floorLeadStart != -2 || floorLeadEnd != 0xff;
}
@Override
public String toString() {
String s = brToString(new BytesRef(prefix));
if (floorLeadStart == -2) {
s += "[-" + Integer.toHexString(floorLeadEnd) + "]";
} else {
s += "[" + Integer.toHexString(floorLeadStart) + "-" + Integer.toHexString(floorLeadEnd) + "]";
}
return s;
}
@Override
public int compareTo(PrefixTerm other) {
int cmp = term.compareTo(other.term);
if (cmp == 0) {
if (prefix.length != other.prefix.length) {
return prefix.length - other.prefix.length;
}
// On tie, sort the bigger floorLeadEnd earlier, since it
// spans more terms; during intersect, we want to encounter that one
// first so we can use it if the automaton accepts the larger range:
cmp = other.floorLeadEnd - floorLeadEnd;
}
return cmp;
}
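// Illustrative tie-break (assumed example): foo[a-z]* sorts before
// foo[a-m]*, since both compare equal on term ("fooa") and prefix
// length, but [a-z] has the larger floorLeadEnd and spans more terms.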
/** Returns the leading term for this prefix term, e.g. "foo" (for
* the foo* prefix) or "foom" (for the foo[m-z]* case). */
private static BytesRef toBytesRef(byte[] prefix, int floorLeadStart) {
BytesRef br;
if (floorLeadStart != -2) {
assert floorLeadStart >= 0;
br = new BytesRef(prefix.length+1);
} else {
br = new BytesRef(prefix.length);
}
System.arraycopy(prefix, 0, br.bytes, 0, prefix.length);
br.length = prefix.length;
if (floorLeadStart != -2) {
assert floorLeadStart >= 0;
br.bytes[br.length++] = (byte) floorLeadStart;
}
return br;
}
public int compareTo(BytesRef term) {
return this.term.compareTo(term);
}
public TermsEnum getTermsEnum(TermsEnum in) {
final BytesRef prefixRef = new BytesRef(prefix);
return new FilteredTermsEnum(in) {
{
setInitialSeekTerm(term);
}
@Override
protected AcceptStatus accept(BytesRef term) {
if (StringHelper.startsWith(term, prefixRef) &&
(floorLeadEnd == -1 || term.length == prefixRef.length || (term.bytes[term.offset + prefixRef.length] & 0xff) <= floorLeadEnd)) {
return AcceptStatus.YES;
} else {
return AcceptStatus.END;
}
}
};
}
}
// for debugging
static String brToString(BytesRef b) {
try {
return b.utf8ToString() + " " + b;
} catch (Throwable t) {
// If BytesRef isn't actually UTF8, or it's eg a
// prefix of UTF8 that ends mid-unicode-char, we
// fallback to hex:
return b.toString();
}
}
final List<PrefixTerm> prefixes = new ArrayList<>();
private final int minItemsInPrefix;
private final int maxItemsInPrefix;
private final BytesRefBuilder lastTerm = new BytesRefBuilder();
// Records the index into pending where the current prefix at that
// length "started"; for example, if the current term starts with 't',
// prefixStarts[0] is the index into pending of the first
// term/sub-block starting with 't'. We use this to figure out when
// to write a new block:
private int[] prefixStarts = new int[8];
private List<Object> pending = new ArrayList<>();
//private final String segment;
public AutoPrefixTermsWriter(Terms terms, int minItemsInPrefix, int maxItemsInPrefix) throws IOException {
this.minItemsInPrefix = minItemsInPrefix;
this.maxItemsInPrefix = maxItemsInPrefix;
//this.segment = segment;
TermsEnum termsEnum = terms.iterator(null);
while (true) {
BytesRef term = termsEnum.next();
if (term == null) {
break;
}
//if (DEBUG) System.out.println("pushTerm: " + brToString(term));
pushTerm(term);
}
if (pending.size() > 1) {
pushTerm(BlockTreeTermsWriter.EMPTY_BYTES_REF);
// Also maybe save floor prefixes in root block; this can be a biggish perf gain for large ranges:
/*
System.out.println("root block pending.size=" + pending.size());
for(Object o : pending) {
System.out.println(" " + o);
}
*/
while (pending.size() >= minItemsInPrefix) {
savePrefixes(0, pending.size());
}
}
Collections.sort(prefixes);
}
/** Pushes the new term to the top of the stack, and writes new blocks. */
private void pushTerm(BytesRef text) throws IOException {
int limit = Math.min(lastTerm.length(), text.length);
// Find common prefix between last term and current term:
int pos = 0;
while (pos < limit && lastTerm.byteAt(pos) == text.bytes[text.offset+pos]) {
pos++;
}
//if (DEBUG) System.out.println(" shared=" + pos + " lastTerm.length=" + lastTerm.length());
// Close the "abandoned" suffix now:
for(int i=lastTerm.length()-1;i>=pos;i--) {
// How many items on top of the stack share the current suffix
// we are closing:
int prefixTopSize = pending.size() - prefixStarts[i];
while (prefixTopSize >= minItemsInPrefix) {
//if (DEBUG) System.out.println("pushTerm i=" + i + " prefixTopSize=" + prefixTopSize + " minItemsInBlock=" + minItemsInPrefix);
savePrefixes(i+1, prefixTopSize);
//prefixStarts[i] -= prefixTopSize;
//System.out.println(" after savePrefixes: " + (pending.size() - prefixStarts[i]) + " pending.size()=" + pending.size() + " start=" + prefixStarts[i]);
// For large floor blocks, it's possible we should now re-run on the new prefix terms we just created:
prefixTopSize = pending.size() - prefixStarts[i];
}
}
if (prefixStarts.length < text.length) {
prefixStarts = ArrayUtil.grow(prefixStarts, text.length);
}
// Init new tail:
for(int i=pos;i<text.length;i++) {
prefixStarts[i] = pending.size();
}
lastTerm.copyBytes(text);
// Only append the first (optional) empty string, not the fake last one used to close all prefixes:
if (text.length > 0 || pending.isEmpty()) {
byte[] termBytes = new byte[text.length];
System.arraycopy(text.bytes, text.offset, termBytes, 0, text.length);
pending.add(termBytes);
}
}
void savePrefixes(int prefixLength, int count) throws IOException {
assert count > 0;
//if (DEBUG2) {
// BytesRef br = new BytesRef(lastTerm.bytes());
// br.length = prefixLength;
// System.out.println(" savePrefixes: seg=" + segment + " " + brToString(br) + " count=" + count + " pending.size()=" + pending.size());
//}
int lastSuffixLeadLabel = -2;
int start = pending.size()-count;
assert start >=0;
int end = pending.size();
int nextBlockStart = start;
int nextFloorLeadLabel = -1;
int prefixCount = 0;
int pendingCount = 0;
PrefixTerm lastPTEntry = null;
for (int i=start; i<end; i++) {
byte[] termBytes;
Object o = pending.get(i);
PrefixTerm ptEntry;
if (o instanceof byte[]) {
ptEntry = null;
termBytes = (byte[]) o;
} else {
ptEntry = (PrefixTerm) o;
termBytes = ptEntry.term.bytes;
if (ptEntry.prefix.length != prefixLength) {
assert ptEntry.prefix.length > prefixLength;
ptEntry = null;
}
}
pendingCount++;
//if (DEBUG) System.out.println(" check term=" + brToString(new BytesRef(termBytes)));
int suffixLeadLabel;
if (termBytes.length == prefixLength) {
// Suffix is 0, i.e. prefix 'foo' and term is
// 'foo' so the term has empty string suffix
// in this block
assert lastSuffixLeadLabel == -2;
suffixLeadLabel = -2;
} else {
suffixLeadLabel = termBytes[prefixLength] & 0xff;
}
// if (DEBUG) System.out.println(" i=" + i + " ent=" + ent + " suffixLeadLabel=" + suffixLeadLabel);
if (suffixLeadLabel != lastSuffixLeadLabel) {
// This is a boundary, a chance to make an auto-prefix term if we want:
// When we are "recursing" (generating auto-prefix terms on a block of
// floor'd auto-prefix terms), this assert is non-trivial because it
// ensures the floorLeadEnd of the previous terms is in fact less
// than the lead start of the current entry:
assert suffixLeadLabel > lastSuffixLeadLabel: "suffixLeadLabel=" + suffixLeadLabel + " vs lastSuffixLeadLabel=" + lastSuffixLeadLabel;
// NOTE: must check nextFloorLeadLabel in case minItemsInPrefix is 2 and prefix is 'a' and we've seen 'a' and then 'aa'
if (pendingCount >= minItemsInPrefix && end-nextBlockStart > maxItemsInPrefix && nextFloorLeadLabel != -1) {
// The count is too large for one block, so we must break it into "floor" blocks, where we record
// the leading label of the suffix of the first term in each floor block, so at search time we can
// jump to the right floor block. We just use a naive greedy segmenter here: make a new floor
// block as soon as we have at least minItemsInBlock. This is not always best: it often produces
// a too-small block as the final block:
// If the last entry was another prefix term of the same length, then it represents a range of terms, so we must use its ending
// prefix label as our ending label:
if (lastPTEntry != null) {
lastSuffixLeadLabel = lastPTEntry.floorLeadEnd;
}
savePrefix(prefixLength, nextFloorLeadLabel, lastSuffixLeadLabel);
pendingCount = 0;
prefixCount++;
nextFloorLeadLabel = suffixLeadLabel;
nextBlockStart = i;
}
if (nextFloorLeadLabel == -1) {
nextFloorLeadLabel = suffixLeadLabel;
//if (DEBUG) System.out.println("set first lead label=" + nextFloorLeadLabel);
}
lastSuffixLeadLabel = suffixLeadLabel;
}
lastPTEntry = ptEntry;
}
// Write last block, if any:
if (nextBlockStart < end) {
//System.out.println(" lastPTEntry=" + lastPTEntry + " lastSuffixLeadLabel=" + lastSuffixLeadLabel);
if (lastPTEntry != null) {
lastSuffixLeadLabel = lastPTEntry.floorLeadEnd;
}
assert lastSuffixLeadLabel >= nextFloorLeadLabel: "lastSuffixLeadLabel=" + lastSuffixLeadLabel + " nextFloorLeadLabel=" + nextFloorLeadLabel;
if (prefixCount == 0) {
if (prefixLength > 0) {
savePrefix(prefixLength, -2, 0xff);
prefixCount++;
} else {
// Don't add a prefix term for all terms in the index!
}
} else {
if (lastSuffixLeadLabel == -2) {
// Special case when closing the empty string root block:
lastSuffixLeadLabel = 0xff;
}
savePrefix(prefixLength, nextFloorLeadLabel, lastSuffixLeadLabel);
prefixCount++;
}
}
// Remove slice from the top of the pending stack, that we just wrote:
int sizeToClear = count;
if (prefixCount > 1) {
Object o = pending.get(pending.size()-count);
if (o instanceof byte[] && ((byte[]) o).length == prefixLength) {
// If we were just asked to write all f* terms, but there were too many and so we made floor blocks, the exact term 'f' will remain
// as its own item, followed by floor block terms like f[a-m]*, f[n-z]*, so in this case we leave 3 (not 2) items on the pending stack:
sizeToClear--;
}
}
pending.subList(pending.size()-sizeToClear, pending.size()).clear();
// Append prefix terms for each prefix, since these count like real terms that also need to be "rolled up":
for(int i=0;i<prefixCount;i++) {
PrefixTerm pt = prefixes.get(prefixes.size()-(prefixCount-i));
pending.add(pt);
}
}
private void savePrefix(int prefixLength, int floorLeadStart, int floorLeadEnd) {
byte[] prefix = new byte[prefixLength];
System.arraycopy(lastTerm.bytes(), 0, prefix, 0, prefixLength);
assert floorLeadStart != -1;
assert floorLeadEnd != -1;
PrefixTerm pt = new PrefixTerm(prefix, floorLeadStart, floorLeadEnd);
//if (DEBUG2) System.out.println(" savePrefix: seg=" + segment + " " + pt + " count=" + count);
prefixes.add(pt);
}
}

View File

@ -0,0 +1,95 @@
package org.apache.lucene.codecs.blocktree;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet; // javadocs
/** Takes a {@link FixedBitSet} and creates a DOCS {@link PostingsEnum} from it. */
class BitSetPostingsEnum extends PostingsEnum {
private final BitSet bits;
private DocIdSetIterator in;
BitSetPostingsEnum(BitSet bits) {
this.bits = bits;
reset();
}
@Override
public int freq() throws IOException {
return 1;
}
@Override
public int docID() {
if (in == null) {
return -1;
} else {
return in.docID();
}
}
@Override
public int nextDoc() throws IOException {
if (in == null) {
in = new BitSetIterator(bits, 0);
}
return in.nextDoc();
}
@Override
public int advance(int target) throws IOException {
return in.advance(target);
}
@Override
public long cost() {
return in.cost();
}
void reset() {
in = null;
}
@Override
public BytesRef getPayload() {
return null;
}
@Override
public int nextPosition() {
throw new UnsupportedOperationException();
}
@Override
public int startOffset() {
throw new UnsupportedOperationException();
}
@Override
public int endOffset() {
throw new UnsupportedOperationException();
}
}

View File

@ -0,0 +1,87 @@
package org.apache.lucene.codecs.blocktree;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
/** Silly stub class, used only when writing an auto-prefix
* term in order to expose a {@link PostingsEnum} over a FixedBitSet. We
* pass this to {@link PostingsWriterBase#writeTerm} so
* that it can pull {@code postings()} multiple times for the
* current term. */
class BitSetTermsEnum extends TermsEnum {
private final BitSetPostingsEnum postingsEnum;
public BitSetTermsEnum(BitSet docs) {
postingsEnum = new BitSetPostingsEnum(docs);
}
@Override
public SeekStatus seekCeil(BytesRef text) {
throw new UnsupportedOperationException();
}
@Override
public void seekExact(long ord) {
throw new UnsupportedOperationException();
}
@Override
public BytesRef term() {
throw new UnsupportedOperationException();
}
@Override
public BytesRef next() {
throw new UnsupportedOperationException();
}
@Override
public long ord() {
throw new UnsupportedOperationException();
}
@Override
public int docFreq() {
throw new UnsupportedOperationException();
}
@Override
public long totalTermFreq() {
throw new UnsupportedOperationException();
}
@Override
public PostingsEnum postings(Bits liveDocs, PostingsEnum reuse, int flags) {
if (flags != PostingsEnum.NONE) {
// We only work with DOCS_ONLY fields
return null;
}
if (liveDocs != null) {
throw new IllegalArgumentException("cannot handle live docs");
}
postingsEnum.reset();
return postingsEnum;
}
}
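A sketch of the call pattern the javadoc above describes (prefixDocs, prefixTerm and docsSeen are hypothetical locals; writeTerm is the real PostingsWriterBase entry point):
FixedBitSet prefixDocs = new FixedBitSet(maxDoc);
// ... OR each matching term's docIDs into prefixDocs ...
BitSetTermsEnum bitsTermsEnum = new BitSetTermsEnum(prefixDocs);
BlockTermState state = postingsWriter.writeTerm(prefixTerm, bitsTermsEnum, docsSeen);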

View File

@ -34,6 +34,8 @@ import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.PrefixQuery; // javadocs
import org.apache.lucene.search.TermRangeQuery; // javadocs
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
@ -57,6 +59,14 @@ import org.apache.lucene.util.fst.Outputs;
* min/maxItemsPerBlock during indexing to control how
* much memory the terms index uses.</p>
*
* <p>If auto-prefix terms were indexed (see
* {@link BlockTreeTermsWriter}), then the {@link Terms#intersect}
* implementation here will make use of these terms only if the
* automaton has a binary sink state, i.e. an accept state
* which has a transition to itself accepting all byte values.
* For example, both {@link PrefixQuery} and {@link TermRangeQuery}
* pass such automata to {@link Terms#intersect}.</p>
*
* <p>The data structure used by this implementation is very
* similar to a burst trie
* (http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499),
@ -90,8 +100,11 @@ public final class BlockTreeTermsReader extends FieldsProducer {
/** Initial terms format. */
public static final int VERSION_START = 0;
/** Auto-prefix terms. */
public static final int VERSION_AUTO_PREFIX_TERMS = 1;
/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_START;
public static final int VERSION_CURRENT = VERSION_AUTO_PREFIX_TERMS;
/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";
@ -116,7 +129,7 @@ public final class BlockTreeTermsReader extends FieldsProducer {
final String segment;
private final int version;
final int version;
/** Sole constructor. */
public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState state) throws IOException {
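To make the intersect note above concrete, the tests in this commit build such automata like this (sketch; reader is an assumed open IndexReader). PrefixQuery.toAutomaton yields the binary sink state required for auto-prefix terms to kick in:
CompiledAutomaton ca = new CompiledAutomaton(PrefixQuery.toAutomaton(new BytesRef("foo")),
true, false, Integer.MAX_VALUE, true);
TermsEnum te = ca.getTermsEnum(MultiFields.getTerms(reader, "field"));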

View File

@ -25,11 +25,13 @@ import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.blocktree.AutoPrefixTermsWriter.PrefixTerm;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
@ -87,6 +89,16 @@ import org.apache.lucene.util.packed.PackedInts;
* stride) each term's metadata for each set of terms
* between two index terms.
* <p>
*
* If {@code minItemsInAutoPrefix} is not zero, then for
* {@link IndexOptions#DOCS} fields we detect prefixes that match
* "enough" terms and insert auto-prefix terms into the index, which are
* used by {@link Terms#intersect} at search time to speed up prefix
* and range queries. Besides {@link Terms#intersect}, these
* auto-prefix terms are invisible to all other APIs (don't change terms
* stats, don't show up in normal {@link TermsEnum}s, etc.).
* <p>
*
* Files:
* <ul>
* <li><tt>.tim</tt>: <a href="#Termdictionary">Term Dictionary</a></li>
@ -200,7 +212,9 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
* #BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */
public final static int DEFAULT_MAX_BLOCK_SIZE = 48;
// public final static boolean DEBUG = false;
//public static boolean DEBUG = false;
//public static boolean DEBUG2 = false;
//private final static boolean SAVE_DOT_FILES = false;
private final IndexOutput termsOut;
@ -208,6 +222,8 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
final int maxDoc;
final int minItemsInBlock;
final int maxItemsInBlock;
final int minItemsInAutoPrefix;
final int maxItemsInAutoPrefix;
final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
@ -244,23 +260,67 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
private final List<FieldMetaData> fields = new ArrayList<>();
// private final String segment;
final FixedBitSet prefixDocs;
/** Reused in getAutoPrefixTermsEnum: */
final BitSetTermsEnum prefixFixedBitsTermsEnum;
/** Reused in getAutoPrefixTermsEnum: */
private TermsEnum prefixTermsEnum;
/** Reused in getAutoPrefixTermsEnum: */
private PostingsEnum prefixDocsEnum;
/** Create a new writer, using default values for auto-prefix terms. */
public BlockTreeTermsWriter(SegmentWriteState state,
PostingsWriterBase postingsWriter,
int minItemsInBlock,
int maxItemsInBlock) throws IOException {
this(state, postingsWriter, minItemsInBlock, maxItemsInBlock, 0, 0);
}
/** Create a new writer. The number of items (terms or
* sub-blocks) per block will aim to be between
* minItemsPerBlock and maxItemsPerBlock, though in some
* cases the blocks may be smaller than the min. */
* cases the blocks may be smaller than the min.
* For DOCS_ONLY fields, this terms dictionary will
* insert automatically generated prefix terms for common
* prefixes, as long as each prefix matches at least
* {@code minItemsInAutoPrefix} other terms or prefixes,
* and at most {@code maxItemsInAutoPrefix} other terms
* or prefixes. Set {@code minItemsInAutoPrefix} to 0
* to disable auto-prefix terms. */
public BlockTreeTermsWriter(SegmentWriteState state,
PostingsWriterBase postingsWriter,
int minItemsInBlock,
int maxItemsInBlock)
int maxItemsInBlock,
int minItemsInAutoPrefix,
int maxItemsInAutoPrefix)
throws IOException
{
validateSettings(minItemsInBlock, maxItemsInBlock);
validateSettings(minItemsInBlock,
maxItemsInBlock);
this.minItemsInBlock = minItemsInBlock;
this.maxItemsInBlock = maxItemsInBlock;
validateAutoPrefixSettings(minItemsInAutoPrefix,
maxItemsInAutoPrefix);
if (minItemsInAutoPrefix != 0) {
// TODO: can we use a compressed bitset instead? that auto-upgrades if it's dense enough...
prefixDocs = new FixedBitSet(state.segmentInfo.maxDoc());
prefixFixedBitsTermsEnum = new BitSetTermsEnum(prefixDocs);
} else {
prefixDocs = null;
prefixFixedBitsTermsEnum = null;
}
this.minItemsInAutoPrefix = minItemsInAutoPrefix;
this.maxItemsInAutoPrefix = maxItemsInAutoPrefix;
this.maxDoc = state.segmentInfo.maxDoc();
this.fieldInfos = state.fieldInfos;
this.minItemsInBlock = minItemsInBlock;
this.maxItemsInBlock = maxItemsInBlock;
this.postingsWriter = postingsWriter;
final String termsName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockTreeTermsReader.TERMS_EXTENSION);
@ -269,12 +329,13 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
IndexOutput indexOut = null;
try {
CodecUtil.writeIndexHeader(termsOut, BlockTreeTermsReader.TERMS_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
final String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockTreeTermsReader.TERMS_INDEX_EXTENSION);
indexOut = state.directory.createOutput(indexName, state.context);
CodecUtil.writeIndexHeader(indexOut, BlockTreeTermsReader.TERMS_INDEX_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
//segment = state.segmentInfo.name;
postingsWriter.init(termsOut, state); // have consumer write its format/header
@ -311,34 +372,109 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
}
}
/** Throws {@code IllegalArgumentException} if any of these settings
* is invalid. */
public static void validateAutoPrefixSettings(int minItemsInAutoPrefix,
int maxItemsInAutoPrefix) {
if (minItemsInAutoPrefix != 0) {
if (minItemsInAutoPrefix < 2) {
throw new IllegalArgumentException("minItemsInAutoPrefix must be at least 2; got minItemsInAutoPrefix=" + minItemsInAutoPrefix);
}
if (minItemsInAutoPrefix > maxItemsInAutoPrefix) {
throw new IllegalArgumentException("maxItemsInAutoPrefix must be >= minItemsInAutoPrefix; got maxItemsInAutoPrefix=" + maxItemsInAutoPrefix + " minItemsInAutoPrefix=" + minItemsInAutoPrefix);
}
if (2*(minItemsInAutoPrefix-1) > maxItemsInAutoPrefix) {
throw new IllegalArgumentException("maxItemsInAutoPrefix must be at least 2*(minItemsInAutoPrefix-1); got maxItemsInAutoPrefix=" + maxItemsInAutoPrefix + " minItemsInAutoPrefix=" + minItemsInAutoPrefix);
}
} else if (maxItemsInAutoPrefix != 0) {
throw new IllegalArgumentException("maxItemsInAutoPrefix must be 0 (disabled) when minItemsInAutoPrefix is 0");
}
}
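// Hedged illustration (not part of this commit): how the checks above treat a
// few concrete settings. The 2*(min-1) bound presumably guarantees that a
// prefix which overflows maxItemsInAutoPrefix can always be split into two
// floor'd pieces that each still reach the minimum.
@SuppressWarnings("unused")
private static void exampleAutoPrefixSettings() {
  validateAutoPrefixSettings(0, 0);    // OK: auto-prefix terms disabled
  validateAutoPrefixSettings(2, 2);    // OK: 2*(2-1) = 2 <= 2
  validateAutoPrefixSettings(25, 48);  // OK: 2*(25-1) = 48 <= 48
  validateAutoPrefixSettings(25, 47);  // throws: 47 < 2*(25-1)
  validateAutoPrefixSettings(1, 4);    // throws: minItemsInAutoPrefix must be >= 2
}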
@Override
public void write(Fields fields) throws IOException {
//if (DEBUG) System.out.println("\nBTTW.write seg=" + segment);
String lastField = null;
for(String field : fields) {
assert lastField == null || lastField.compareTo(field) < 0;
lastField = field;
//if (DEBUG) System.out.println("\nBTTW.write seg=" + segment + " field=" + field);
Terms terms = fields.terms(field);
if (terms == null) {
continue;
}
FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
// First pass to find all prefix terms we should compile into the index:
List<PrefixTerm> prefixTerms;
if (minItemsInAutoPrefix != 0) {
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
throw new IllegalStateException("ranges can only be indexed with IndexOptions.DOCS (field: " + fieldInfo.name + ")");
}
prefixTerms = new AutoPrefixTermsWriter(terms, minItemsInAutoPrefix, maxItemsInAutoPrefix).prefixes;
//if (DEBUG) {
// for(PrefixTerm term : prefixTerms) {
// System.out.println("field=" + fieldInfo.name + " PREFIX TERM: " + term);
// }
//}
} else {
prefixTerms = null;
}
TermsEnum termsEnum = terms.iterator(null);
TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field));
int prefixTermUpto = 0;
while (true) {
BytesRef term = termsEnum.next();
//if (DEBUG) System.out.println("BTTW: next term " + term);
// Insert (merge sort) next prefix term(s):
if (prefixTerms != null) {
while (prefixTermUpto < prefixTerms.size() && (term == null || prefixTerms.get(prefixTermUpto).compareTo(term) <= 0)) {
PrefixTerm prefixTerm = prefixTerms.get(prefixTermUpto);
//if (DEBUG) System.out.println("seg=" + segment + " field=" + fieldInfo.name + " NOW INSERT prefix=" + prefixTerm);
termsWriter.write(prefixTerm.term, getAutoPrefixTermsEnum(terms, prefixTerm), prefixTerm);
prefixTermUpto++;
}
}
if (term == null) {
break;
}
//if (DEBUG) System.out.println("write field=" + fieldInfo.name + " term=" + brToString(term));
termsWriter.write(term, termsEnum, null);
}
assert prefixTerms == null || prefixTermUpto == prefixTerms.size();
termsWriter.finish();
//if (DEBUG) System.out.println("\nBTTW.write done seg=" + segment + " field=" + field);
}
}
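// Hedged sketch (hypothetical snippet, not in this commit; assumes
// org.apache.lucene.document.{Document,Field,FieldType} imports): the check
// above means a field that should get auto-prefix terms must be indexed with
// IndexOptions.DOCS and nothing more, e.g.:
//
//   FieldType ft = new FieldType();
//   ft.setIndexOptions(IndexOptions.DOCS);
//   ft.setTokenized(false);
//   ft.freeze();
//   doc.add(new Field("id", "foo42", ft));
//
// Any other IndexOptions on such a field hits the IllegalStateException above.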
private TermsEnum getAutoPrefixTermsEnum(Terms terms, final PrefixTerm prefix) throws IOException {
assert prefixDocs != null;
prefixDocs.clear(0, prefixDocs.length());
prefixTermsEnum = prefix.getTermsEnum(terms.iterator(prefixTermsEnum));
//System.out.println("BTTW.getAutoPrefixTE: prefix=" + prefix);
while (prefixTermsEnum.next() != null) {
//System.out.println(" got term=" + prefixTermsEnum.term().utf8ToString());
//termCount++;
prefixDocsEnum = prefixTermsEnum.postings(null, prefixDocsEnum, 0);
//System.out.println(" " + prefixDocsEnum + " doc=" + prefixDocsEnum.docID());
prefixDocs.or(prefixDocsEnum);
}
//System.out.println(" done terms: " + prefixDocs.cardinality() + " doc seen; " + termCount + " terms seen");
return prefixFixedBitsTermsEnum;
}
static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) {
assert fp < (1L << 62);
return (fp << 2) | (hasTerms ? BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0);
@ -356,30 +492,38 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
public final byte[] termBytes;
// stats + metadata
public final BlockTermState state;
// Non-null if this is an auto-prefix-term:
public final PrefixTerm prefixTerm;
public PendingTerm other;
public PendingTerm(BytesRef term, BlockTermState state, PrefixTerm prefixTerm) {
super(true);
this.termBytes = new byte[term.length];
System.arraycopy(term.bytes, term.offset, termBytes, 0, term.length);
this.state = state;
this.prefixTerm = prefixTerm;
}
@Override
public String toString() {
return "TERM: " + brToString(termBytes);
}
}
// for debugging
@SuppressWarnings("unused")
static String brToString(BytesRef b) {
if (b == null) {
return "(null)";
} else {
try {
return b.utf8ToString() + " " + b;
} catch (Throwable t) {
// If BytesRef isn't actually UTF8, or it's eg a
// prefix of UTF8 that ends mid-unicode-char, we
// fallback to hex:
return b.toString();
}
}
}
@ -410,7 +554,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
@Override
public String toString() {
return "BLOCK: prefix=" + brToString(prefix);
}
public void compileIndex(List<PendingBlock> blocks, RAMOutputStream scratchBytes, IntsRefBuilder scratchIntsRef) throws IOException {
@ -493,6 +637,8 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
private final RAMOutputStream scratchBytes = new RAMOutputStream();
private final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
static final BytesRef EMPTY_BYTES_REF = new BytesRef();
class TermsWriter {
private final FieldInfo fieldInfo;
private final int longsSize;
@ -529,14 +675,11 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
assert count > 0;
//if (DEBUG2) {
// BytesRef br = new BytesRef(lastTerm.bytes());
// br.length = prefixLength;
// System.out.println("writeBlocks: seg=" + segment + " prefix=" + brToString(br) + " count=" + count);
//}
// Root block better write all remaining pending entries:
assert prefixLength > 0 || count == pending.size();
@ -547,6 +690,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// only points to sub-blocks in the terms index so we can avoid seeking
// to it when we are looking for a term):
boolean hasTerms = false;
boolean hasPrefixTerms = false;
boolean hasSubBlocks = false;
int start = pending.size()-count;
@ -566,7 +710,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// Suffix is 0, i.e. prefix 'foo' and term is
// 'foo' so the term has empty string suffix
// in this block
assert lastSuffixLeadLabel == -1: "i=" + i + " lastSuffixLeadLabel=" + lastSuffixLeadLabel;
suffixLeadLabel = -1;
} else {
suffixLeadLabel = term.termBytes[prefixLength] & 0xff;
@ -587,10 +731,11 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// block as soon as we have at least minItemsInBlock. This is not always best: it often produces
// a too-small block as the final block:
boolean isFloor = itemsInBlock < count;
newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, i, hasTerms, hasPrefixTerms, hasSubBlocks));
hasTerms = false;
hasSubBlocks = false;
hasPrefixTerms = false;
nextFloorLeadLabel = suffixLeadLabel;
nextBlockStart = i;
}
@ -600,6 +745,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
if (ent.isTerm) {
hasTerms = true;
hasPrefixTerms |= ((PendingTerm) ent).prefixTerm != null;
} else {
hasSubBlocks = true;
}
@ -609,7 +755,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
if (nextBlockStart < end) {
int itemsInBlock = end - nextBlockStart;
boolean isFloor = itemsInBlock < count;
newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, end, hasTerms, hasPrefixTerms, hasSubBlocks));
}
assert newBlocks.isEmpty() == false;
@ -634,7 +780,8 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
* were too many (more than maxItemsInBlock) entries sharing the
* same prefix, and so we broke it into multiple floor blocks where
* we record the starting label of the suffix of each floor block. */
private PendingBlock writeBlock(int prefixLength, boolean isFloor, int floorLeadLabel, int start, int end,
boolean hasTerms, boolean hasPrefixTerms, boolean hasSubBlocks) throws IOException {
assert end > start;
@ -646,6 +793,8 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
System.arraycopy(lastTerm.get().bytes, 0, prefix.bytes, 0, prefixLength);
prefix.length = prefixLength;
//if (DEBUG2) System.out.println(" writeBlock field=" + fieldInfo.name + " prefix=" + brToString(prefix) + " fp=" + startFP + " isFloor=" + isFloor + " isLastInFloor=" + (end == pending.size()) + " floorLeadLabel=" + floorLeadLabel + " start=" + start + " end=" + end + " hasTerms=" + hasTerms + " hasSubBlocks=" + hasSubBlocks);
// Write block header:
int numEntries = end - start;
int code = numEntries << 1;
@ -666,31 +815,34 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// We optimize the leaf block case (block has only terms), writing a more
// compact format in this case:
boolean isLeafBlock = hasSubBlocks == false && hasPrefixTerms == false;
//System.out.println(" isLeaf=" + isLeafBlock);
final List<FST<BytesRef>> subIndices;
boolean absolute = true;
if (isLeafBlock) {
// Block contains only ordinary terms:
subIndices = null;
for (int i=start;i<end;i++) {
PendingEntry ent = pending.get(i);
assert ent.isTerm: "i=" + i;
PendingTerm term = (PendingTerm) ent;
assert term.prefixTerm == null;
assert StringHelper.startsWith(term.termBytes, prefix): "term.term=" + term.termBytes + " prefix=" + prefix;
BlockTermState state = term.state;
final int suffix = term.termBytes.length - prefixLength;
//if (DEBUG2) {
// BytesRef suffixBytes = new BytesRef(suffix);
// System.arraycopy(term.termBytes, prefixLength, suffixBytes.bytes, 0, suffix);
// suffixBytes.length = suffix;
// System.out.println(" write term suffix=" + brToString(suffixBytes));
//}
// For leaf block we write suffix straight
suffixWriter.writeVInt(suffix);
suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
@ -714,27 +866,51 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
absolute = false;
}
} else {
// Block has at least one prefix term or a sub-block:
subIndices = new ArrayList<>();
boolean sawAutoPrefixTerm = false;
for (int i=start;i<end;i++) {
PendingEntry ent = pending.get(i);
if (ent.isTerm) {
PendingTerm term = (PendingTerm) ent;
assert StringHelper.startsWith(term.termBytes, prefix): "term.term=" + term.termBytes + " prefix=" + prefix;
BlockTermState state = term.state;
final int suffix = term.termBytes.length - prefixLength;
//if (DEBUG2) {
// BytesRef suffixBytes = new BytesRef(suffix);
// System.arraycopy(term.termBytes, prefixLength, suffixBytes.bytes, 0, suffix);
// suffixBytes.length = suffix;
// System.out.println(" write term suffix=" + brToString(suffixBytes));
// if (term.prefixTerm != null) {
// System.out.println(" ** auto-prefix term: " + term.prefixTerm);
// }
//}
// For non-leaf block we borrow 1 bit to record
// if entry is term or sub-block, and 1 bit to record if
// it's a prefix term. Terms cannot be larger than ~32 KB
// so we won't run out of bits:
code = suffix<<2;
int floorLeadEnd = -1;
if (term.prefixTerm != null) {
sawAutoPrefixTerm = true;
PrefixTerm prefixTerm = term.prefixTerm;
floorLeadEnd = prefixTerm.floorLeadEnd;
assert floorLeadEnd != -1;
if (prefixTerm.floorLeadStart == -2) {
// Starts with empty string
code |= 2;
} else {
code |= 3;
}
}
suffixWriter.writeVInt(code);
suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
if (floorLeadEnd != -1) {
suffixWriter.writeByte((byte) floorLeadEnd);
}
assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel;
// Write term stats, to separate byte[] blob:
@ -765,33 +941,32 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
PendingBlock block = (PendingBlock) ent;
assert StringHelper.startsWith(block.prefix, prefix);
final int suffix = block.prefix.length - prefixLength;
assert suffix > 0;
// For non-leaf block we borrow 1 bit to record
// if entry is term or sub-block, and 1 bit (unset here) to
// record if it's a prefix term:
suffixWriter.writeVInt((suffix<<2)|1);
suffixWriter.writeBytes(block.prefix.bytes, prefixLength, suffix);
//if (DEBUG2) {
// BytesRef suffixBytes = new BytesRef(suffix);
// System.arraycopy(block.prefix.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
// suffixBytes.length = suffix;
// System.out.println(" write sub-block suffix=" + brToString(suffixBytes) + " subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor);
//}
assert floorLeadLabel == -1 || (block.prefix.bytes[prefixLength] & 0xff) >= floorLeadLabel: "floorLeadLabel=" + floorLeadLabel + " suffixLead=" + (block.prefix.bytes[prefixLength] & 0xff);
assert block.fp < startFP;
suffixWriter.writeVLong(startFP - block.fp);
subIndices.add(block.index);
}
}
assert subIndices.size() != 0 || sawAutoPrefixTerm;
}
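// Summary of the non-leaf entry code written above (hedged; the decoding
// counterpart lives in IntersectTermsEnumFrame.nextNonLeaf):
//   code = suffix << 2, low two bits:
//     0: ordinary term
//     1: sub-block; a VLong file-pointer delta follows the suffix bytes
//     2: auto-prefix term whose floor range starts at the empty suffix;
//        one floorLeadEnd byte (0xff meaning "not floor'd") follows
//     3: floor'd auto-prefix term; the last suffix byte is floorLeadStart,
//        and one floorLeadEnd byte follows
// Leaf blocks still write a plain VInt suffix length with no code bits.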
// TODO: we could block-write the term suffix pointers;
@ -835,7 +1010,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
}
/** Writes one term's worth of postings. */
public void write(BytesRef text, TermsEnum termsEnum, PrefixTerm prefixTerm) throws IOException {
/*
if (DEBUG) {
int[] tmp = new int[lastTerm.length];
@ -846,19 +1021,25 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen);
if (state != null) {
assert state.docFreq != 0;
assert fieldInfo.getIndexOptions() == IndexOptions.DOCS || state.totalTermFreq >= state.docFreq: "postingsWriter=" + postingsWriter;
pushTerm(text);
PendingTerm term = new PendingTerm(text, state, prefixTerm);
pending.add(term);
//if (DEBUG) System.out.println(" add pending term = " + text + " pending.size()=" + pending.size());
if (prefixTerm == null) {
// Only increment stats for real terms:
sumDocFreq += state.docFreq;
sumTotalTermFreq += state.totalTermFreq;
numTerms++;
if (firstPendingTerm == null) {
firstPendingTerm = term;
}
lastPendingTerm = term;
}
}
}
@ -910,6 +1091,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
// TODO: if pending.size() is already 1 with a non-zero prefix length
// we can save writing a "degenerate" root block, but we have to
// fix all the places that assume the root block's prefix is the empty string:
pushTerm(new BytesRef());
writeBlocks(0, pending.size());
// We better have one final "root" block:
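// Hedged usage sketch (hypothetical wiring, not in this commit): enabling
// auto-prefix terms when constructing the terms dictionary, mirroring the
// 25/48 defaults mentioned above; passing 0, 0 for the last two arguments
// disables auto-prefix terms:
//
//   PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(writeState);
//   FieldsConsumer consumer = new BlockTreeTermsWriter(writeState, postingsWriter, 25, 48, 25, 48);
//   consumer.write(fields);
//   consumer.close(); // also closes the postings writer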

View File

@ -41,6 +41,8 @@ import org.apache.lucene.util.fst.FST;
*/
public final class FieldReader extends Terms implements Accountable {
// private final boolean DEBUG = BlockTreeTermsWriter.DEBUG;
private static final long BASE_RAM_BYTES_USED =
RamUsageEstimator.shallowSizeOfInstance(FieldReader.class)
+ 3 * RamUsageEstimator.shallowSizeOfInstance(BytesRef.class);
@ -125,6 +127,7 @@ public final class FieldReader extends Terms implements Accountable {
/** For debugging -- used by CheckIndex too*/
@Override
public Stats getStats() throws IOException {
// TODO: add auto-prefix terms into stats
return new SegmentTermsEnum(this).computeBlockStats();
}
@ -175,10 +178,11 @@ public final class FieldReader extends Terms implements Accountable {
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
//System.out.println("intersect: " + compiled.type + " a=" + compiled.automaton);
// TODO: we could push "it's a range" or "it's a prefix" down into IntersectTermsEnum?
// can we optimize knowing that...?
return new IntersectTermsEnum(this, compiled.automaton, compiled.runAutomaton, compiled.commonSuffixRef, startTerm, compiled.sinkState);
}
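// Hedged sketch (hypothetical, not in this commit; assumes an
// org.apache.lucene.util.automaton.RegExp import): driving intersect with a
// NORMAL-type automaton. With auto-prefix terms in the index, the returned
// enum may hand back one auto-prefix term in place of many real terms.
@SuppressWarnings("unused")
private static void exampleIntersect(Terms terms) throws IOException {
  CompiledAutomaton compiled = new CompiledAutomaton(new RegExp("fo[ox]*").toAutomaton());
  TermsEnum te = terms.intersect(compiled, null);
  for (BytesRef t = te.next(); t != null; t = te.next()) {
    // consume t; an auto-prefix term's stats are not real term stats
  }
}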
@Override

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
@ -28,23 +29,38 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RunAutomaton;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
/** This is used to implement efficient {@link Terms#intersect} for
* block-tree. Note that it cannot seek, except for the initial term on
* init. It just "nexts" through the intersection of the automaton and
* the terms. It does not use the terms index at all: on init, it
* loads the root block, and scans its way to the initial term.
* Likewise, in next it scans until it finds a term that matches the
 * current automaton transition. If the index has auto-prefix terms
 * (currently only for DOCS_ONLY fields) it will visit these terms
 * when possible and then skip the real terms that the auto-prefix
 * term matched. */
final class IntersectTermsEnum extends TermsEnum {
//static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
final IndexInput in;
final static Outputs<BytesRef> fstOutputs = ByteSequenceOutputs.getSingleton();
IntersectTermsEnumFrame[] stack;
@SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc<BytesRef>[] arcs = new FST.Arc[5];
final RunAutomaton runAutomaton;
final Automaton automaton;
final BytesRef commonSuffix;
private IntersectTermsEnumFrame currentFrame;
@ -52,19 +68,34 @@ final class IntersectTermsEnum extends TermsEnum {
private final FST.BytesReader fstReader;
private final boolean allowAutoPrefixTerms;
final FieldReader fr;
/** Which state in the automaton accepts all possible suffixes. */
private final int sinkState;
private BytesRef savedStartTerm;
/** True if we just returned the current auto-prefix term */
private boolean useAutoPrefixTerm;
// TODO: in some cases we can filter by length? eg
// regexp foo*bar must be at least length 6 bytes
public IntersectTermsEnum(FieldReader fr, Automaton automaton, RunAutomaton runAutomaton, BytesRef commonSuffix, BytesRef startTerm, int sinkState) throws IOException {
//if (DEBUG) System.out.println("\nintEnum.init seg=" + fr.parent.segment + " commonSuffix=" + commonSuffix);
this.fr = fr;
this.sinkState = sinkState;
assert automaton != null;
assert runAutomaton != null;
//if (DEBUG) System.out.println("sinkState=" + sinkState + " AUTOMATON:\n" + automaton.toDot());
this.runAutomaton = runAutomaton;
this.allowAutoPrefixTerms = sinkState != -1;
this.automaton = automaton;
this.commonSuffix = commonSuffix;
in = fr.parent.termsIn.clone();
stack = new IntersectTermsEnumFrame[5];
for(int idx=0;idx<stack.length;idx++) {
@ -152,7 +183,7 @@ final class IntersectTermsEnum extends TermsEnum {
f.fp = f.fpOrig = currentFrame.lastSubFP;
f.prefix = currentFrame.prefix + currentFrame.suffix;
//if (DEBUG) System.out.println(" pushFrame state=" + state + " prefix=" + f.prefix);
f.setState(state);
// Walk the arc through the index -- we only
@ -220,7 +251,7 @@ final class IntersectTermsEnum extends TermsEnum {
// arbitrary seekExact/Ceil. Note that this is a
// seekFloor!
private void seekToStartTerm(BytesRef target) throws IOException {
//if (DEBUG) System.out.println("seek to startTerm=" + target.utf8ToString() + " length=" + target.length);
assert currentFrame.ord == 0;
if (term.length < target.length) {
term.bytes = ArrayUtil.grow(term.bytes, target.length);
@ -229,23 +260,29 @@ final class IntersectTermsEnum extends TermsEnum {
assert arc == currentFrame.arc;
for(int idx=0;idx<=target.length;idx++) {
//if (DEBUG) System.out.println("cycle idx=" + idx);
while (true) {
final int savNextEnt = currentFrame.nextEnt;
final int savePos = currentFrame.suffixesReader.getPosition();
final int saveStartBytePos = currentFrame.startBytePos;
final int saveSuffix = currentFrame.suffix;
final long saveLastSubFP = currentFrame.lastSubFP;
final int saveTermBlockOrd = currentFrame.termState.termBlockOrd;
final boolean saveIsAutoPrefixTerm = currentFrame.isAutoPrefixTerm;
//if (DEBUG) System.out.println(" cycle isAutoPrefix=" + saveIsAutoPrefixTerm + " ent=" + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") prefix=" + currentFrame.prefix + " suffix=" + currentFrame.suffix + " firstLabel=" + (currentFrame.suffix == 0 ? "" : (currentFrame.suffixBytes[currentFrame.startBytePos])&0xff));
final boolean isSubBlock = currentFrame.next();
term.length = currentFrame.prefix + currentFrame.suffix;
if (term.bytes.length < term.length) {
term.bytes = ArrayUtil.grow(term.bytes, term.length);
}
System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);
//if (DEBUG) System.out.println(" isSubBlock=" + isSubBlock + " term/prefix=" + brToString(term) + " saveIsAutoPrefixTerm=" + saveIsAutoPrefixTerm + " allowAutoPrefixTerms=" + allowAutoPrefixTerms);
if (isSubBlock && StringHelper.startsWith(target, term)) {
// Recurse
//if (DEBUG) System.out.println(" recurse!");
@ -253,9 +290,11 @@ final class IntersectTermsEnum extends TermsEnum {
break;
} else {
final int cmp = term.compareTo(target);
//if (DEBUG) System.out.println(" cmp=" + cmp);
if (cmp < 0) {
if (currentFrame.nextEnt == currentFrame.entCount) {
if (!currentFrame.isLastInFloor) {
// Advance to next floor block
//if (DEBUG) System.out.println(" load floorBlock");
currentFrame.loadNextFloorBlock();
continue;
@ -266,19 +305,24 @@ final class IntersectTermsEnum extends TermsEnum {
}
continue;
} else if (cmp == 0) {
if (allowAutoPrefixTerms == false && currentFrame.isAutoPrefixTerm) {
continue;
}
//if (DEBUG) System.out.println(" return term=" + brToString(term));
return;
} else if (allowAutoPrefixTerms || currentFrame.isAutoPrefixTerm == false) {
// Fallback to prior entry: the semantics of
// this method is that the first call to
// next() will return the term after the
// requested term
//if (DEBUG) System.out.println(" fallback prior entry");
currentFrame.nextEnt = savNextEnt;
currentFrame.lastSubFP = saveLastSubFP;
currentFrame.startBytePos = saveStartBytePos;
currentFrame.suffix = saveSuffix;
currentFrame.suffixesReader.setPosition(savePos);
currentFrame.termState.termBlockOrd = saveTermBlockOrd;
currentFrame.isAutoPrefixTerm = saveIsAutoPrefixTerm;
System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);
term.length = currentFrame.prefix + currentFrame.suffix;
// If the last entry was a block we don't
@ -297,77 +341,249 @@ final class IntersectTermsEnum extends TermsEnum {
@Override
public BytesRef next() throws IOException {
//if (DEBUG) {
// System.out.println("\nintEnum.next seg=" + fr.parent.segment);
// System.out.println(" frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " outputPrefix=" + currentFrame.outputPrefix + " trans: " + currentFrame.transition + " useAutoPrefix=" + useAutoPrefixTerm);
//}
nextTerm:
while (true) {
boolean isSubBlock;
if (useAutoPrefixTerm) {
assert currentFrame.isAutoPrefixTerm;
useAutoPrefixTerm = false;
currentFrame.termState.isRealTerm = true;
//if (DEBUG) {
// System.out.println(" now scan beyond auto-prefix term=" + brToString(term) + " floorSuffixLeadEnd=" + Integer.toHexString(currentFrame.floorSuffixLeadEnd));
//}
// If we last returned an auto-prefix term, we must now skip all
// actual terms sharing that prefix. At most, that skipping
// requires popping one frame, but it can also require simply
// scanning ahead within the current frame. This scanning will
// skip sub-blocks that contain many terms, which is why the
// optimization "works":
int floorSuffixLeadEnd = currentFrame.floorSuffixLeadEnd;
if (floorSuffixLeadEnd == -1) {
// An ordinary prefix, e.g. foo*
int prefix = currentFrame.prefix;
int suffix = currentFrame.suffix;
//if (DEBUG) System.out.println(" prefix=" + prefix + " suffix=" + suffix);
if (suffix == 0) {
//if (DEBUG) System.out.println(" pop frame & nextTerm");
// Easy case: the prefix term's suffix is the empty string,
// meaning the prefix corresponds to all terms in the
// current block, so we just pop this entire block:
if (currentFrame.ord == 0) {
//if (DEBUG) System.out.println(" return null");
return null;
}
currentFrame = stack[currentFrame.ord-1];
continue nextTerm;
} else {
// Just next() until we hit an entry that doesn't share this
// prefix. The first next should be a sub-block sharing the
// same prefix, because if there are enough terms matching a
// given prefix to warrant an auto-prefix term, then there
                // must also be enough to make a sub-block (assuming
                // minItemsInAutoPrefix > minItemsInBlock):
scanPrefix:
while (true) {
//if (DEBUG) System.out.println(" scan next");
if (currentFrame.nextEnt == currentFrame.entCount) {
if (currentFrame.isLastInFloor == false) {
currentFrame.loadNextFloorBlock();
} else if (currentFrame.ord == 0) {
//if (DEBUG) System.out.println(" return null0");
return null;
} else {
// Pop frame, which also means we've moved beyond this
// auto-prefix term:
//if (DEBUG) System.out.println(" pop; nextTerm");
currentFrame = stack[currentFrame.ord-1];
continue nextTerm;
}
}
isSubBlock = currentFrame.next();
//if (DEBUG) {
// BytesRef suffixBytes = new BytesRef(currentFrame.suffix);
// System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, suffixBytes.bytes, 0, currentFrame.suffix);
// suffixBytes.length = currentFrame.suffix;
// System.out.println(" currentFrame.suffix=" + brToString(suffixBytes));
//}
for(int i=0;i<suffix;i++) {
if (term.bytes[prefix+i] != currentFrame.suffixBytes[currentFrame.startBytePos+i]) {
//if (DEBUG) System.out.println(" done; now stop scan");
break scanPrefix;
}
}
}
}
} else {
// Floor'd auto-prefix term; in this case we must skip all
// terms e.g. matching foo[a-m]*. We are currently "on" fooa,
// which the automaton accepted (fooa* through foom*), and
// floorSuffixLeadEnd is m, so we must now scan to foon:
int prefix = currentFrame.prefix;
int suffix = currentFrame.suffix;
if (currentFrame.floorSuffixLeadStart == -1) {
suffix++;
}
//if (DEBUG) System.out.println(" prefix=" + prefix + " suffix=" + suffix);
if (suffix == 0) {
//if (DEBUG) System.out.println(" pop frame");
// This means current frame is fooa*, so we have to first
// pop the current frame, then scan in parent frame:
if (currentFrame.ord == 0) {
//if (DEBUG) System.out.println(" return null");
return null;
}
currentFrame = stack[currentFrame.ord-1];
// Current (parent) frame is now foo*, so now we just scan
// until the lead suffix byte is > floorSuffixLeadEnd
//assert currentFrame.prefix == prefix-1;
//prefix = currentFrame.prefix;
                // When we pop, the parent block's prefix can be shorter than prefix-1,
                // e.g. we are in block 417* on its first term, the floor prefix term
                // 41[7-9], and we pop all the way back to block 4*:
prefix = currentFrame.prefix;
suffix = term.length - currentFrame.prefix;
} else {
// No need to pop; just scan in currentFrame:
}
//if (DEBUG) System.out.println(" start scan: prefix=" + prefix + " suffix=" + suffix);
// Now we scan until the lead suffix byte is > floorSuffixLeadEnd
scanFloor:
while (true) {
//if (DEBUG) System.out.println(" scan next");
if (currentFrame.nextEnt == currentFrame.entCount) {
if (currentFrame.isLastInFloor == false) {
//if (DEBUG) System.out.println(" next floor block");
currentFrame.loadNextFloorBlock();
} else if (currentFrame.ord == 0) {
//if (DEBUG) System.out.println(" return null");
return null;
} else {
// Pop frame, which also means we've moved beyond this
// auto-prefix term:
currentFrame = stack[currentFrame.ord-1];
//if (DEBUG) System.out.println(" pop, now curFrame.prefix=" + currentFrame.prefix);
continue nextTerm;
}
}
isSubBlock = currentFrame.next();
//if (DEBUG) {
// BytesRef suffixBytes = new BytesRef(currentFrame.suffix);
// System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, suffixBytes.bytes, 0, currentFrame.suffix);
// suffixBytes.length = currentFrame.suffix;
// System.out.println(" currentFrame.suffix=" + brToString(suffixBytes));
//}
for(int i=0;i<suffix-1;i++) {
if (term.bytes[prefix+i] != currentFrame.suffixBytes[currentFrame.startBytePos+i]) {
//if (DEBUG) System.out.println(" done; now stop scan");
break scanFloor;
}
}
//if (DEBUG) {
// if (currentFrame.suffix >= suffix) {
// System.out.println(" cmp label=" + Integer.toHexString(currentFrame.suffixBytes[currentFrame.startBytePos+suffix-1]) + " vs " + floorSuffixLeadEnd);
// }
//}
if (currentFrame.suffix >= suffix && (currentFrame.suffixBytes[currentFrame.startBytePos+suffix-1]&0xff) > floorSuffixLeadEnd) {
// Done scanning: we are now on the first term after all
// terms matched by this auto-prefix term
//if (DEBUG) System.out.println(" done; now stop scan");
break;
}
}
}
} else {
// Pop finished frames
while (currentFrame.nextEnt == currentFrame.entCount) {
if (!currentFrame.isLastInFloor) {
//if (DEBUG) System.out.println(" next-floor-block: trans: " + currentFrame.transition);
// Advance to next floor block
currentFrame.loadNextFloorBlock();
//if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " outputPrefix=" + currentFrame.outputPrefix);
break;
} else {
//if (DEBUG) System.out.println(" pop frame");
if (currentFrame.ord == 0) {
//if (DEBUG) System.out.println(" return null");
return null;
}
final long lastFP = currentFrame.fpOrig;
currentFrame = stack[currentFrame.ord-1];
assert currentFrame.lastSubFP == lastFP;
//if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " outputPrefix=" + currentFrame.outputPrefix);
}
}
isSubBlock = currentFrame.next();
}
//if (DEBUG) {
// final BytesRef suffixRef = new BytesRef();
// suffixRef.bytes = currentFrame.suffixBytes;
// suffixRef.offset = currentFrame.startBytePos;
// suffixRef.length = currentFrame.suffix;
// System.out.println(" " + (isSubBlock ? "sub-block" : "term") + " " + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") suffix=" + brToString(suffixRef));
//}
if (currentFrame.suffix != 0) {
// Advance where we are in the automaton to match what terms
// dict next'd to:
final int label = currentFrame.suffixBytes[currentFrame.startBytePos] & 0xff;
//if (DEBUG) {
// System.out.println(" move automaton to label=" + label + " vs curMax=" + currentFrame.curTransitionMax);
// }
while (label > currentFrame.curTransitionMax) {
if (currentFrame.transitionIndex >= currentFrame.transitionCount-1) {
// Pop this frame: no further matches are possible because
// we've moved beyond what the max transition will allow
//if (DEBUG) System.out.println(" break: trans");
if (currentFrame.ord == 0) {
//if (DEBUG) System.out.println(" return null");
return null;
}
currentFrame = stack[currentFrame.ord-1];
continue nextTerm;
}
currentFrame.transitionIndex++;
automaton.getNextTransition(currentFrame.transition);
currentFrame.curTransitionMax = currentFrame.transition.max;
//if (DEBUG) System.out.println(" next trans");
}
}
// First test the common suffix, if set:
if (commonSuffix != null && !isSubBlock) {
final int termLen = currentFrame.prefix + currentFrame.suffix;
if (termLen < commonSuffix.length) {
// No match
//if (DEBUG) System.out.println(" skip: common suffix length");
continue nextTerm;
}
final byte[] suffixBytes = currentFrame.suffixBytes;
final byte[] commonSuffixBytes = commonSuffix.bytes;
final int lenInPrefix = commonSuffix.length - currentFrame.suffix;
assert commonSuffix.offset == 0;
int suffixBytesPos;
int commonSuffixBytesPos = 0;
@ -381,24 +597,20 @@ final class IntersectTermsEnum extends TermsEnum {
final int termBytesPosEnd = currentFrame.prefix;
while (termBytesPos < termBytesPosEnd) {
if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
//if (DEBUG) System.out.println(" skip: common suffix mismatch (in prefix)");
continue nextTerm;
}
}
suffixBytesPos = currentFrame.startBytePos;
} else {
suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - commonSuffix.length;
}
// Test overlapping suffix part:
final int commonSuffixBytesPosEnd = commonSuffix.length;
while (commonSuffixBytesPos < commonSuffixBytesPosEnd) {
if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
//if (DEBUG) System.out.println(" skip: common suffix mismatch");
continue nextTerm;
}
}
@ -410,10 +622,19 @@ final class IntersectTermsEnum extends TermsEnum {
// "temporarily" accepted, we just blindly .next()
// until the limit
// TODO: for first iter of this loop can't we just use the current trans? we already advanced it and confirmed it matches lead
// byte of the suffix
// See if the term suffix matches the automaton:
int state = currentFrame.state;
int lastState = currentFrame.lastState;
//if (DEBUG) {
// System.out.println(" a state=" + state + " curFrame.suffix.len=" + currentFrame.suffix + " curFrame.prefix=" + currentFrame.prefix);
// }
for (int idx=0;idx<currentFrame.suffix;idx++) {
lastState = state;
//if (DEBUG) System.out.println(" step label=" + (char) (currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff));
state = runAutomaton.step(state, currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff);
if (state == -1) {
// No match
//System.out.println(" no s=" + state);
@ -423,16 +644,59 @@ final class IntersectTermsEnum extends TermsEnum {
}
}
//if (DEBUG) System.out.println(" after suffix: state=" + state + " lastState=" + lastState);
if (isSubBlock) {
// Match! Recurse:
//if (DEBUG) System.out.println(" sub-block match to state=" + state + "; recurse fp=" + currentFrame.lastSubFP);
copyTerm();
currentFrame = pushFrame(state);
currentFrame.lastState = lastState;
//xif (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix);
} else if (currentFrame.isAutoPrefixTerm) {
// We are on an auto-prefix term, meaning this term was compiled
// at indexing time, matching all terms sharing this prefix (or,
// a floor'd subset of them if that count was too high). A
// prefix term represents a range of terms, so we now need to
// test whether, from the current state in the automaton, it
// accepts all terms in that range. As long as it does, we can
// use this term and then later skip ahead past all terms in
// this range:
if (allowAutoPrefixTerms) {
if (currentFrame.floorSuffixLeadEnd == -1) {
// Simple prefix case
useAutoPrefixTerm = state == sinkState;
} else {
if (currentFrame.floorSuffixLeadStart == -1) {
// Must also accept the empty string in this case
if (automaton.isAccept(state)) {
//if (DEBUG) System.out.println(" state is accept");
useAutoPrefixTerm = acceptsSuffixRange(state, 0, currentFrame.floorSuffixLeadEnd);
}
} else {
useAutoPrefixTerm = acceptsSuffixRange(lastState, currentFrame.floorSuffixLeadStart, currentFrame.floorSuffixLeadEnd);
}
}
//if (DEBUG) System.out.println(" useAutoPrefixTerm=" + useAutoPrefixTerm);
if (useAutoPrefixTerm) {
copyTerm();
currentFrame.termState.isRealTerm = false;
//if (DEBUG) System.out.println(" return auto prefix term: " + brToString(term));
return term;
} else {
// We move onto the next term
}
} else {
// We are not allowed to use auto-prefix terms, so we just skip it
}
} else if (runAutomaton.isAccept(state)) {
copyTerm();
//if (DEBUG) System.out.println(" term match to state=" + state);
assert savedStartTerm == null || term.compareTo(savedStartTerm) > 0: "saveStartTerm=" + savedStartTerm.utf8ToString() + " term=" + term.utf8ToString();
//if (DEBUG) System.out.println(" return term=" + brToString(term));
return term;
} else {
//System.out.println(" no s=" + state);
@ -440,6 +704,41 @@ final class IntersectTermsEnum extends TermsEnum {
}
}
private final Transition transition = new Transition();
/** Returns true if, from this state, the automaton accepts all suffixes
 * whose first label is between start and end, inclusive. We just
 * look for a single transition, covering this whole range, to the sink state. */
private boolean acceptsSuffixRange(int state, int start, int end) {
//xif (DEBUG) System.out.println(" acceptsSuffixRange state=" + state + " start=" + start + " end=" + end);
int count = automaton.initTransition(state, transition);
//xif (DEBUG) System.out.println(" transCount=" + count);
//xif (DEBUG) System.out.println(" trans=" + transition);
for(int i=0;i<count;i++) {
automaton.getNextTransition(transition);
if (start >= transition.min && end <= transition.max && transition.dest == sinkState) {
return true;
}
}
return false;
}
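// Hedged illustration (not in this commit): what makes a state the "sink" --
// an accept state whose self-loop covers every byte label, so that every
// suffix from it is accepted. This mirrors the check above:
@SuppressWarnings("unused")
private boolean isSinkState(int state) {
  if (automaton.isAccept(state) == false) {
    return false;
  }
  Transition t = new Transition();
  int count = automaton.initTransition(state, t);
  for (int i = 0; i < count; i++) {
    automaton.getNextTransition(t);
    if (t.dest == state && t.min == 0 && t.max == 0xff) {
      return true;
    }
  }
  return false;
}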
// for debugging
@SuppressWarnings("unused")
static String brToString(BytesRef b) {
try {
return b.utf8ToString() + " " + b;
} catch (Throwable t) {
// If BytesRef isn't actually UTF8, or it's eg a
// prefix of UTF8 that ends mid-unicode-char, we
// fallback to hex:
return b.toString();
}
}
private void copyTerm() {
//System.out.println(" copyTerm cur.prefix=" + currentFrame.prefix + " cur.suffix=" + currentFrame.suffix + " first=" + (char) currentFrame.suffixBytes[currentFrame.startBytePos]);
final int len = currentFrame.prefix + currentFrame.suffix;

View File

@ -35,9 +35,14 @@ final class IntersectTermsEnumFrame {
long fpEnd;
long lastSubFP;
// private static boolean DEBUG = IntersectTermsEnum.DEBUG;
// State in automaton
int state;
// State just before the last label
int lastState;
int metaDataUpto;
byte[] suffixBytes = new byte[128];
@ -73,6 +78,8 @@ final class IntersectTermsEnumFrame {
int transitionIndex;
int transitionCount;
final boolean versionAutoPrefix;
FST.Arc<BytesRef> arc;
final BlockTermState termState;
@ -89,6 +96,17 @@ final class IntersectTermsEnumFrame {
int startBytePos;
int suffix;
// When we are on an auto-prefix term this is the starting lead byte
// of the suffix (e.g. 'a' for the foo[a-m]* case):
int floorSuffixLeadStart;
// When we are on an auto-prefix term this is the ending lead byte
// of the suffix (e.g. 'm' for the foo[a-m]* case):
int floorSuffixLeadEnd;
// True if the term we are currently on is an auto-prefix term:
boolean isAutoPrefixTerm;
private final IntersectTermsEnum ite;
public IntersectTermsEnumFrame(IntersectTermsEnum ite, int ord) throws IOException {
@ -97,35 +115,39 @@ final class IntersectTermsEnumFrame {
this.termState = ite.fr.parent.postingsReader.newTermState();
this.termState.totalTermFreq = -1;
this.longs = new long[ite.fr.longsSize];
this.versionAutoPrefix = ite.fr.parent.version >= BlockTreeTermsReader.VERSION_AUTO_PREFIX_TERMS;
}
void loadNextFloorBlock() throws IOException {
assert numFollowFloorBlocks > 0;
//if (DEBUG) System.out.println(" loadNextFloorBlock transition.min=" + transition.min);
do {
fp = fpOrig + (floorDataReader.readVLong() >>> 1);
numFollowFloorBlocks--;
//if (DEBUG) System.out.println(" skip floor block2! nextFloorLabel=" + (char) nextFloorLabel + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
if (numFollowFloorBlocks != 0) {
nextFloorLabel = floorDataReader.readByte() & 0xff;
} else {
nextFloorLabel = 256;
}
//if (DEBUG) System.out.println(" nextFloorLabel=" + (char) nextFloorLabel);
} while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min);
//if (DEBUG) System.out.println(" done loadNextFloorBlock");
load(null);
}
public void setState(int state) {
this.state = state;
transitionIndex = 0;
transitionCount = ite.automaton.getNumTransitions(state);
if (transitionCount != 0) {
ite.automaton.initTransition(state, transition);
ite.automaton.getNextTransition(transition);
curTransitionMax = transition.max;
//if (DEBUG) System.out.println(" after setState state=" + state + " trans: " + transition + " transCount=" + transitionCount);
} else {
curTransitionMax = -1;
}
@ -133,7 +155,7 @@ final class IntersectTermsEnumFrame {
void load(BytesRef frameIndexData) throws IOException {
//xif (DEBUG) System.out.println(" load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" + state));
if (frameIndexData != null && transitionCount != 0) {
// Floor frame
@ -148,7 +170,7 @@ final class IntersectTermsEnumFrame {
if ((code & BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
numFollowFloorBlocks = floorDataReader.readVInt();
nextFloorLabel = floorDataReader.readByte() & 0xff;
//if (DEBUG) System.out.println(" numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + nextFloorLabel);
// If current state is accept, we must process
// first block in case it has empty suffix:
@ -158,7 +180,7 @@ final class IntersectTermsEnumFrame {
while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min) {
fp = fpOrig + (floorDataReader.readVLong() >>> 1);
numFollowFloorBlocks--;
//xif (DEBUG) System.out.println(" skip floor block! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[0].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
if (numFollowFloorBlocks != 0) {
nextFloorLabel = floorDataReader.readByte() & 0xff;
} else {
@ -179,7 +201,7 @@ final class IntersectTermsEnumFrame {
code = ite.in.readVInt();
isLeafBlock = (code & 1) != 0;
int numBytes = code >>> 1;
//if (DEBUG) System.out.println(" entCount=" + entCount + " lastInFloor?=" + isLastInFloor + " leafBlock?=" + isLeafBlock + " numSuffixBytes=" + numBytes);
if (suffixBytes.length < numBytes) {
suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
@ -214,41 +236,106 @@ final class IntersectTermsEnumFrame {
// written one after another -- tail recurse:
fpEnd = ite.in.getFilePointer();
}
// Necessary in case this ord previously was an auto-prefix
// term but now we recurse to a new leaf block
isAutoPrefixTerm = false;
}
// TODO: maybe add scanToLabel; should give perf boost
// Decodes next entry; returns true if it's a sub-block
public boolean next() {
if (isLeafBlock) {
nextLeaf();
return false;
} else {
return nextNonLeaf();
}
}
public void nextLeaf() {
//if (DEBUG) {
// System.out.println(" frame.nextLeaf ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
//}
assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
suffix = suffixesReader.readVInt();
startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix);
}
public boolean nextNonLeaf() {
//if (DEBUG) {
// System.out.println(" frame.nextNonLeaf ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount + " versionAutoPrefix=" + versionAutoPrefix + " fp=" + suffixesReader.getPosition());
// }
assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
final int code = suffixesReader.readVInt();
if (versionAutoPrefix == false) {
suffix = code >>> 1;
startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix);
if ((code & 1) == 0) {
// A normal term
termState.termBlockOrd++;
return false;
} else {
// A sub-block; make sub-FP absolute:
lastSubFP = fp - suffixesReader.readVLong();
return true;
}
} else {
suffix = code >>> 2;
startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix);
switch (code & 3) {
case 0:
// A normal term
//if (DEBUG) System.out.println(" ret: term");
isAutoPrefixTerm = false;
termState.termBlockOrd++;
return false;
case 1:
// A sub-block; make sub-FP absolute:
isAutoPrefixTerm = false;
lastSubFP = fp - suffixesReader.readVLong();
//if (DEBUG) System.out.println(" ret: sub-block");
return true;
case 2:
// A normal prefix term, suffix leads with empty string
floorSuffixLeadStart = -1;
termState.termBlockOrd++;
floorSuffixLeadEnd = suffixesReader.readByte() & 0xff;
if (floorSuffixLeadEnd == 0xff) {
floorSuffixLeadEnd = -1;
//System.out.println(" fill in -1");
}
//if (DEBUG) System.out.println(" ret: floor prefix term: start=-1 end=" + floorSuffixLeadEnd);
isAutoPrefixTerm = true;
return false;
case 3:
// A floor'd prefix term, suffix leads with real byte
if (suffix == 0) {
// TODO: this is messy, but necessary because we are an auto-prefix term, but our suffix is the empty string here, so we have to
// look at the parent block to get the lead suffix byte:
assert ord > 0;
IntersectTermsEnumFrame parent = ite.stack[ord-1];
floorSuffixLeadStart = parent.suffixBytes[parent.startBytePos+parent.suffix-1] & 0xff;
//if (DEBUG) System.out.println(" peek-parent: suffix=" + floorSuffixLeadStart);
} else {
floorSuffixLeadStart = suffixBytes[startBytePos+suffix-1] & 0xff;
}
termState.termBlockOrd++;
isAutoPrefixTerm = true;
floorSuffixLeadEnd = suffixesReader.readByte() & 0xff;
//if (DEBUG) System.out.println(" ret: floor prefix term start=" + floorSuffixLeadStart + " end=" + floorSuffixLeadEnd);
return false;
default:
// Silly javac:
assert false;
return false;
}
}
}

View File

@ -34,7 +34,9 @@ import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;
/** Iterates through terms in this field. This implementation skips
* any auto-prefix terms it encounters. */
final class SegmentTermsEnum extends TermsEnum {
// Lazy init:
@ -48,7 +50,7 @@ final class SegmentTermsEnum extends TermsEnum {
private int targetBeforeCurrentLength;
// static boolean DEBUG = false;
//static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
private final ByteArrayDataInput scratchReader = new ByteArrayDataInput();
@ -119,6 +121,8 @@ final class SegmentTermsEnum extends TermsEnum {
* computing aggregate statistics. */
public Stats computeBlockStats() throws IOException {
// TODO: add total auto-prefix term count
Stats stats = new Stats(fr.parent.segment, fr.fieldInfo.name);
if (fr.index != null) {
stats.indexNodeCount = fr.index.getNodeCount();
@ -152,8 +156,10 @@ final class SegmentTermsEnum extends TermsEnum {
while (currentFrame.nextEnt == currentFrame.entCount) {
stats.endBlock(currentFrame);
if (!currentFrame.isLastInFloor) {
// Advance to next floor block
currentFrame.loadNextFloorBlock();
stats.startBlock(currentFrame, true);
break;
} else {
if (currentFrame.ord == 0) {
break allTerms;
@ -175,8 +181,6 @@ final class SegmentTermsEnum extends TermsEnum {
// This is a "next" frame -- even if it's
// floor'd we must pretend it isn't so we don't
// try to scan to the right floor frame:
currentFrame.isFloor = false;
//currentFrame.hasTerms = true;
currentFrame.loadBlock();
stats.startBlock(currentFrame, !currentFrame.isLastInFloor);
} else {
@ -294,6 +298,7 @@ final class SegmentTermsEnum extends TermsEnum {
return true;
}
/*
// for debugging
@SuppressWarnings("unused")
static String brToString(BytesRef b) {
@ -307,8 +312,15 @@ final class SegmentTermsEnum extends TermsEnum {
}
}
// for debugging
@SuppressWarnings("unused")
static String brToString(BytesRefBuilder b) {
return brToString(b.get());
}
*/
@Override
public boolean seekExact(final BytesRef target) throws IOException {
public boolean seekExact(BytesRef target) throws IOException {
if (fr.index == null) {
throw new IllegalStateException("terms index was not loaded");
@ -565,7 +577,8 @@ final class SegmentTermsEnum extends TermsEnum {
}
@Override
public SeekStatus seekCeil(final BytesRef target) throws IOException {
public SeekStatus seekCeil(BytesRef target) throws IOException {
if (fr.index == null) {
throw new IllegalStateException("terms index was not loaded");
}
@ -575,7 +588,7 @@ final class SegmentTermsEnum extends TermsEnum {
assert clearEOF();
// if (DEBUG) {
// System.out.println("\nBTTR.seekCeil seg=" + fr.parent.segment + " target=" + fr.fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix= " + validIndexPrefix);
// System.out.println("\nBTTR.seekCeil seg=" + fr.parent.segment + " target=" + fr.fieldInfo.name + ":" + brToString(target) + " " + target + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix= " + validIndexPrefix);
// printSeekState(System.out);
// }
@ -617,7 +630,7 @@ final class SegmentTermsEnum extends TermsEnum {
while (targetUpto < targetLimit) {
cmp = (term.byteAt(targetUpto)&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
//if (DEBUG) {
//System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output);
//System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.byteAt(targetUpto)) + ")" + " arc.output=" + arc.output + " output=" + output);
//}
if (cmp != 0) {
break;
@ -647,7 +660,7 @@ final class SegmentTermsEnum extends TermsEnum {
while (targetUpto < targetLimit2) {
cmp = (term.byteAt(targetUpto)&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
//if (DEBUG) {
//System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")");
//System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.byteAt(targetUpto)) + ")");
//}
if (cmp != 0) {
break;
@ -733,7 +746,7 @@ final class SegmentTermsEnum extends TermsEnum {
// Index is exhausted
// if (DEBUG) {
// System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel));
// System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + targetLabel);
// }
validIndexPrefix = currentFrame.prefix;
@ -743,6 +756,7 @@ final class SegmentTermsEnum extends TermsEnum {
currentFrame.loadBlock();
//if (DEBUG) System.out.println(" now scanToTerm");
final SeekStatus result = currentFrame.scanToTerm(target, false);
if (result == SeekStatus.END) {
term.copyBytes(target);
@ -750,7 +764,7 @@ final class SegmentTermsEnum extends TermsEnum {
if (next() != null) {
//if (DEBUG) {
//System.out.println(" return NOT_FOUND term=" + brToString(term) + " " + term);
//System.out.println(" return NOT_FOUND term=" + brToString(term));
//}
return SeekStatus.NOT_FOUND;
} else {
@ -761,7 +775,7 @@ final class SegmentTermsEnum extends TermsEnum {
}
} else {
//if (DEBUG) {
//System.out.println(" return " + result + " term=" + brToString(term) + " " + term);
//System.out.println(" return " + result + " term=" + brToString(term));
//}
return result;
}
@ -776,7 +790,7 @@ final class SegmentTermsEnum extends TermsEnum {
}
//if (DEBUG) {
//System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
//System.out.println(" index: follow label=" + (target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
//}
targetUpto++;
@ -802,7 +816,7 @@ final class SegmentTermsEnum extends TermsEnum {
termExists = false;
if (next() != null) {
//if (DEBUG) {
//System.out.println(" return NOT_FOUND term=" + term.utf8ToString() + " " + term);
//System.out.println(" return NOT_FOUND term=" + term.get().utf8ToString() + " " + term);
//}
return SeekStatus.NOT_FOUND;
} else {
@ -906,7 +920,9 @@ final class SegmentTermsEnum extends TermsEnum {
// Pop finished blocks
while (currentFrame.nextEnt == currentFrame.entCount) {
if (!currentFrame.isLastInFloor) {
// Advance to next floor block
currentFrame.loadNextFloorBlock();
break;
} else {
//if (DEBUG) System.out.println(" pop frame");
if (currentFrame.ord == 0) {
@ -946,11 +962,9 @@ final class SegmentTermsEnum extends TermsEnum {
// This is a "next" frame -- even if it's
// floor'd we must pretend it isn't so we don't
// try to scan to the right floor frame:
currentFrame.isFloor = false;
//currentFrame.hasTerms = true;
currentFrame.loadBlock();
} else {
//if (DEBUG) System.out.println(" return term=" + term.utf8ToString() + " " + term + " currentFrame.ord=" + currentFrame.ord);
//if (DEBUG) System.out.println(" return term=" + brToString(term) + " currentFrame.ord=" + currentFrame.ord);
return term.get();
}
}

View File

@ -37,6 +37,10 @@ final class SegmentTermsEnumFrame {
FST.Arc<BytesRef> arc;
final boolean versionAutoPrefix;
//static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
// File pointer where this block was loaded from
long fp;
long fpOrig;
@ -96,6 +100,7 @@ final class SegmentTermsEnumFrame {
this.state = ste.fr.parent.postingsReader.newTermState();
this.state.totalTermFreq = -1;
this.longs = new long[ste.fr.longsSize];
this.versionAutoPrefix = ste.fr.parent.version >= BlockTreeTermsReader.VERSION_AUTO_PREFIX_TERMS;
}
public void setFloorData(ByteArrayDataInput in, BytesRef source) {
@ -262,12 +267,17 @@ final class SegmentTermsEnumFrame {
*/
}
// Decodes next entry; returns true if it's a sub-block
public boolean next() throws IOException {
  if (isLeafBlock) {
    nextLeaf();
    return false;
  } else {
    return nextNonLeaf();
  }
}
public void nextLeaf() {
//if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
@ -276,36 +286,78 @@ final class SegmentTermsEnumFrame {
ste.term.setLength(prefix + suffix);
ste.term.grow(ste.term.length());
suffixesReader.readBytes(ste.term.bytes(), prefix, suffix);
// A normal term
ste.termExists = true;
return false;
}
public boolean nextNonLeaf() {
//if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
final int code = suffixesReader.readVInt();
suffix = code >>> 1;
startBytePos = suffixesReader.getPosition();
ste.term.setLength(prefix + suffix);
ste.term.grow(ste.term.length());
suffixesReader.readBytes(ste.term.bytes(), prefix, suffix);
if ((code & 1) == 0) {
// A normal term
ste.termExists = true;
subCode = 0;
state.termBlockOrd++;
return false;
} else {
// A sub-block; make sub-FP absolute:
ste.termExists = false;
subCode = suffixesReader.readVLong();
lastSubFP = fp - subCode;
//if (DEBUG) {
//System.out.println(" lastSubFP=" + lastSubFP);
//}
return true;
public boolean nextNonLeaf() throws IOException {
//if (DEBUG) System.out.println(" stef.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + suffixesReader.getPosition());
while (true) {
if (nextEnt == entCount) {
assert arc == null || (isFloor && isLastInFloor == false): "isFloor=" + isFloor + " isLastInFloor=" + isLastInFloor;
loadNextFloorBlock();
if (isLeafBlock) {
nextLeaf();
return false;
} else {
continue;
}
}
assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
final int code = suffixesReader.readVInt();
if (versionAutoPrefix == false) {
suffix = code >>> 1;
} else {
suffix = code >>> 2;
}
startBytePos = suffixesReader.getPosition();
ste.term.setLength(prefix + suffix);
ste.term.grow(ste.term.length());
suffixesReader.readBytes(ste.term.bytes(), prefix, suffix);
if (versionAutoPrefix == false) {
if ((code & 1) == 0) {
// A normal term
ste.termExists = true;
subCode = 0;
state.termBlockOrd++;
return false;
} else {
// A sub-block; make sub-FP absolute:
ste.termExists = false;
subCode = suffixesReader.readVLong();
lastSubFP = fp - subCode;
//if (DEBUG) {
//System.out.println(" lastSubFP=" + lastSubFP);
//}
return true;
}
} else {
switch(code & 3) {
case 0:
// A normal term
ste.termExists = true;
subCode = 0;
state.termBlockOrd++;
return false;
case 1:
// A sub-block; make sub-FP absolute:
ste.termExists = false;
subCode = suffixesReader.readVLong();
lastSubFP = fp - subCode;
//if (DEBUG) {
//System.out.println(" lastSubFP=" + lastSubFP);
//}
return true;
case 2:
case 3:
// A prefix term: skip it
state.termBlockOrd++;
suffixesReader.readByte();
continue;
}
}
}
}
@ -448,18 +500,38 @@ final class SegmentTermsEnumFrame {
assert nextEnt < entCount;
nextEnt++;
final int code = suffixesReader.readVInt();
suffixesReader.skipBytes(isLeafBlock ? code : code >>> 1);
//if (DEBUG) System.out.println(" " + nextEnt + " (of " + entCount + ") ent isSubBlock=" + ((code&1)==1));
if ((code & 1) != 0) {
final long subCode = suffixesReader.readVLong();
//if (DEBUG) System.out.println(" subCode=" + subCode);
if (targetSubCode == subCode) {
//if (DEBUG) System.out.println(" match!");
lastSubFP = subFP;
return;
if (versionAutoPrefix == false) {
suffixesReader.skipBytes(code >>> 1);
if ((code & 1) != 0) {
final long subCode = suffixesReader.readVLong();
if (targetSubCode == subCode) {
//if (DEBUG) System.out.println(" match!");
lastSubFP = subFP;
return;
}
} else {
state.termBlockOrd++;
}
} else {
int flag = code & 3;
suffixesReader.skipBytes(code >>> 2);
//if (DEBUG) System.out.println(" " + nextEnt + " (of " + entCount + ") ent isSubBlock=" + ((code&1)==1));
if (flag == 1) {
// Sub-block
final long subCode = suffixesReader.readVLong();
//if (DEBUG) System.out.println(" subCode=" + subCode);
if (targetSubCode == subCode) {
//if (DEBUG) System.out.println(" match!");
lastSubFP = subFP;
return;
}
} else {
state.termBlockOrd++;
if (flag == 2 || flag == 3) {
// Floor'd prefix term
suffixesReader.readByte();
}
}
}
}
}
@ -473,6 +545,21 @@ final class SegmentTermsEnumFrame {
private int suffix;
private long subCode;
// for debugging
/*
@SuppressWarnings("unused")
static String brToString(BytesRef b) {
try {
return b.utf8ToString() + " " + b;
} catch (Throwable t) {
// If BytesRef isn't actually UTF8, or it's eg a
// prefix of UTF8 that ends mid-unicode-char, we
// fallback to hex:
return b.toString();
}
}
*/
// Target's prefix matches this block's prefix; we
// scan the entries to check if the suffix matches.
public SeekStatus scanToTermLeaf(BytesRef target, boolean exactOnly) throws IOException {
@ -535,9 +622,6 @@ final class SegmentTermsEnumFrame {
// keep scanning
if (nextEnt == entCount) {
if (exactOnly) {
fillTerm();
}
// We are done scanning this block
break nextTerm;
} else {
@ -590,7 +674,7 @@ final class SegmentTermsEnumFrame {
// scan the entries to check if the suffix matches.
public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws IOException {
//if (DEBUG) System.out.println(" scanToTermNonLeaf: block fp=" + fp + " prefix=" + prefix + " nextEnt=" + nextEnt + " (of " + entCount + ") target=" + brToString(target) + " term=" + brToString(term));
//if (DEBUG) System.out.println(" scanToTermNonLeaf: block fp=" + fp + " prefix=" + prefix + " nextEnt=" + nextEnt + " (of " + entCount + ") target=" + brToString(target) + " term=" + brToString(target));
assert nextEnt != -1;
@ -605,30 +689,60 @@ final class SegmentTermsEnumFrame {
assert prefixMatches(target);
// Loop over each entry (term or sub-block) in this block:
//nextTerm: while(nextEnt < entCount) {
nextTerm: while (true) {
nextTerm: while(nextEnt < entCount) {
nextEnt++;
final int code = suffixesReader.readVInt();
suffix = code >>> 1;
// if (DEBUG) {
// BytesRef suffixBytesRef = new BytesRef();
// suffixBytesRef.bytes = suffixBytes;
// suffixBytesRef.offset = suffixesReader.getPosition();
// suffixBytesRef.length = suffix;
// System.out.println(" cycle: " + ((code&1)==1 ? "sub-block" : "term") + " " + (nextEnt-1) + " (of " + entCount + ") suffix=" + brToString(suffixBytesRef));
// }
if (versionAutoPrefix == false) {
suffix = code >>> 1;
} else {
suffix = code >>> 2;
}
//if (DEBUG) {
// BytesRef suffixBytesRef = new BytesRef();
// suffixBytesRef.bytes = suffixBytes;
// suffixBytesRef.offset = suffixesReader.getPosition();
// suffixBytesRef.length = suffix;
// System.out.println(" cycle: " + ((code&1)==1 ? "sub-block" : "term") + " " + (nextEnt-1) + " (of " + entCount + ") suffix=" + brToString(suffixBytesRef));
//}
ste.termExists = (code & 1) == 0;
final int termLen = prefix + suffix;
startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix);
if (ste.termExists) {
state.termBlockOrd++;
subCode = 0;
if (versionAutoPrefix == false) {
ste.termExists = (code & 1) == 0;
if (ste.termExists) {
state.termBlockOrd++;
subCode = 0;
} else {
subCode = suffixesReader.readVLong();
lastSubFP = fp - subCode;
}
} else {
subCode = suffixesReader.readVLong();
lastSubFP = fp - subCode;
switch (code & 3) {
case 0:
// Normal term
ste.termExists = true;
state.termBlockOrd++;
subCode = 0;
break;
case 1:
// Sub-block
ste.termExists = false;
subCode = suffixesReader.readVLong();
lastSubFP = fp - subCode;
break;
case 2:
case 3:
// Floor prefix term: skip it
//if (DEBUG) System.out.println(" skip floor prefix term");
suffixesReader.readByte();
ste.termExists = false;
state.termBlockOrd++;
continue;
}
}
final int targetLimit = target.offset + (target.length < termLen ? target.length : termLen);
@ -637,7 +751,7 @@ final class SegmentTermsEnumFrame {
// Loop over bytes in the suffix, comparing to
// the target
int bytePos = startBytePos;
while(true) {
while (true) {
final int cmp;
final boolean stop;
if (targetPos < targetLimit) {
@ -652,24 +766,18 @@ final class SegmentTermsEnumFrame {
if (cmp < 0) {
// Current entry is still before the target;
// keep scanning
if (nextEnt == entCount) {
if (exactOnly) {
fillTerm();
//termExists = true;
}
// We are done scanning this block
break nextTerm;
} else {
continue nextTerm;
}
continue nextTerm;
} else if (cmp > 0) {
// Done! Current entry is after target --
// return NOT_FOUND:
fillTerm();
//if (DEBUG) System.out.println(" maybe done exactOnly=" + exactOnly + " ste.termExists=" + ste.termExists);
if (!exactOnly && !ste.termExists) {
//System.out.println(" now pushFrame");
// TODO this
// We are on a sub-block, and caller wants
// us to position to the next term after
// the target, so we must recurse into the

View File

@ -48,6 +48,8 @@ public class Stats {
/** Total number of bytes (sum of term lengths) across all terms in the field. */
public long totalTermBytes;
// TODO: add total auto-prefix term count
/** The number of normal (non-floor) blocks in the terms file. */
public int nonFloorBlockCount;

View File

@ -43,9 +43,9 @@ import org.apache.lucene.util.automaton.Transition;
* completely accepted. This is not possible when the language accepted by the
* FSM is not finite (i.e. * operator).
* </p>
* @lucene.experimental
* @lucene.internal
*/
class AutomatonTermsEnum extends FilteredTermsEnum {
public class AutomatonTermsEnum extends FilteredTermsEnum {
// a tableized array-based form of the DFA
private final ByteRunAutomaton runAutomaton;
// common suffix of the automaton
@ -70,9 +70,8 @@ class AutomatonTermsEnum extends FilteredTermsEnum {
/**
* Construct an enumerator based upon an automaton, enumerating the specified
* field, working on a supplied TermsEnum
* <p>
*
* @lucene.experimental
* <p>
* @param compiled CompiledAutomaton
*/
public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled) {

View File

@ -25,7 +25,9 @@ import java.nio.file.Paths;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
@ -56,6 +58,8 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LongBitSet;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CompiledAutomaton;
/**
* Basic tool and API to check the health of an index and
@ -902,6 +906,180 @@ public class CheckIndex implements Closeable {
return status;
}
/** Visits all terms in the range minTerm (inclusive) to maxTerm (exclusive), marking all doc IDs encountered in the passed-in docsSeen bits, and
* returning the total number of terms visited. */
private static long getDocsFromTermRange(String field, int maxDoc, TermsEnum termsEnum, FixedBitSet docsSeen, BytesRef minTerm, BytesRef maxTerm, boolean isIntersect) throws IOException {
docsSeen.clear(0, docsSeen.length());
long termCount = 0;
PostingsEnum postingsEnum = null;
BytesRefBuilder lastTerm = null;
while (true) {
BytesRef term;
// Kinda messy: for intersect, we must first next(), but for "normal", we are already on our first term:
if (isIntersect || termCount != 0) {
term = termsEnum.next();
} else {
term = termsEnum.term();
}
if (term == null) {
if (isIntersect == false) {
throw new RuntimeException("didn't see max term field=" + field + " term=" + maxTerm);
}
return termCount;
}
assert term.isValid();
if (lastTerm == null) {
lastTerm = new BytesRefBuilder();
lastTerm.copyBytes(term);
} else {
if (lastTerm.get().compareTo(term) >= 0) {
throw new RuntimeException("terms out of order: lastTerm=" + lastTerm + " term=" + term);
}
lastTerm.copyBytes(term);
}
//System.out.println(" term=" + term);
// Caller already ensured terms enum positioned >= minTerm:
if (term.compareTo(minTerm) < 0) {
throw new RuntimeException("saw term before min term field=" + field + " term=" + minTerm);
}
if (isIntersect == false) {
int cmp = term.compareTo(maxTerm);
if (cmp == 0) {
// Done!
return termCount;
} else if (cmp > 0) {
throw new RuntimeException("didn't see end term field=" + field + " term=" + maxTerm);
}
}
postingsEnum = termsEnum.postings(null, postingsEnum, 0);
int lastDoc = -1;
while (true) {
int doc = postingsEnum.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
if (doc <= lastDoc) {
throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
}
if (doc >= maxDoc) {
throw new RuntimeException("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc);
}
//System.out.println(" doc=" + doc);
docsSeen.set(doc);
lastDoc = doc;
}
termCount++;
}
}
/** Tests Terms.intersect on this range, validating that it returns the same doc ids as a straight non-intersect TermsEnum. Returns true if
* any fake terms were seen. */
private static boolean checkSingleTermRange(String field, int maxDoc, Terms terms, BytesRef minTerm, BytesRef maxTerm, FixedBitSet normalDocs, FixedBitSet intersectDocs) throws IOException {
// System.out.println(" check minTerm=" + minTerm + " maxTerm=" + maxTerm);
TermsEnum termsEnum = terms.iterator(null);
TermsEnum.SeekStatus status = termsEnum.seekCeil(minTerm);
if (status != TermsEnum.SeekStatus.FOUND) {
throw new RuntimeException("failed to seek to existing term field=" + field + " term=" + minTerm);
}
// Do "dumb" iteration to visit all terms in the range:
long normalTermCount = getDocsFromTermRange(field, maxDoc, termsEnum, normalDocs, minTerm, maxTerm, false);
// Now do the same operation using intersect:
long intersectTermCount = getDocsFromTermRange(field, maxDoc, terms.intersect(new CompiledAutomaton(Automata.makeBinaryInterval(minTerm, true, maxTerm, false), true, false, Integer.MAX_VALUE, true), null), intersectDocs, minTerm, maxTerm, true);
if (intersectTermCount > normalTermCount) {
throw new RuntimeException("intersect returned too many terms: field=" + field + " intersectTermCount=" + intersectTermCount + " normalTermCount=" + normalTermCount);
}
if (normalDocs.equals(intersectDocs) == false) {
throw new RuntimeException("intersect visited different docs than straight terms enum: " + normalDocs.cardinality() + " for straight enum, vs " + intersectDocs.cardinality() + " for intersect, minTerm=" + minTerm + " maxTerm=" + maxTerm);
}
//System.out.println(" " + intersectTermCount + " vs " + normalTermCount);
return intersectTermCount != normalTermCount;
}
/** Make an effort to visit "fake" (e.g. auto-prefix) terms. We do this by running term range intersections across an initially wide
* interval of terms, at different boundaries, gradually decreasing the interval. This is not guaranteed to hit all non-real
* terms (doing that in general is non-trivial), but it should hit many of them, and validate their postings against the postings of the
* real terms. */
private static void checkTermRanges(String field, int maxDoc, Terms terms, long numTerms) throws IOException {
// We'll target this many terms in our interval for the current level:
double currentInterval = numTerms;
FixedBitSet normalDocs = new FixedBitSet(maxDoc);
FixedBitSet intersectDocs = new FixedBitSet(maxDoc);
TermsEnum termsEnum = null;
//System.out.println("CI.checkTermRanges field=" + field + " numTerms=" + numTerms);
while (currentInterval >= 10.0) {
//System.out.println(" cycle interval=" + currentInterval);
// We iterate this terms enum to locate min/max term for each sliding/overlapping interval we test at the current level:
termsEnum = terms.iterator(termsEnum);
long termCount = 0;
Deque<BytesRef> termBounds = new LinkedList<>();
long lastTermAdded = Long.MIN_VALUE;
BytesRefBuilder lastTerm = null;
while (true) {
BytesRef term = termsEnum.next();
if (term == null) {
break;
}
//System.out.println(" top: term=" + term.utf8ToString());
if (termCount >= lastTermAdded + currentInterval/4) {
termBounds.add(BytesRef.deepCopyOf(term));
lastTermAdded = termCount;
if (termBounds.size() == 5) {
BytesRef minTerm = termBounds.removeFirst();
BytesRef maxTerm = termBounds.getLast();
checkSingleTermRange(field, maxDoc, terms, minTerm, maxTerm, normalDocs, intersectDocs);
}
}
termCount++;
if (lastTerm == null) {
lastTerm = new BytesRefBuilder();
lastTerm.copyBytes(term);
} else {
if (lastTerm.get().compareTo(term) >= 0) {
throw new RuntimeException("terms out of order: lastTerm=" + lastTerm + " term=" + term);
}
lastTerm.copyBytes(term);
}
}
if (lastTerm != null && termBounds.isEmpty() == false) {
BytesRef minTerm = termBounds.removeFirst();
BytesRef maxTerm = lastTerm.get();
checkSingleTermRange(field, maxDoc, terms, minTerm, maxTerm, normalDocs, intersectDocs);
}
currentInterval *= .75;
}
}
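To make the shrinking schedule concrete, a small illustrative loop (not from this commit) showing the pass sizes for numTerms = 1000; bounds are sampled every interval/4 terms and each checked range spans four bounds:

double interval = 1000;
while (interval >= 10.0) {
  System.out.println("pass: interval=" + interval + " boundEvery=" + (interval / 4));
  interval *= .75;
}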
/**
* checks Fields api is consistent with itself.
* searcher is optional, to verify with queries. Can be null.
@ -923,6 +1101,7 @@ public class CheckIndex implements Closeable {
String lastField = null;
for (String field : fields) {
// MultiFieldsEnum relies upon this order...
if (lastField != null && field.compareTo(lastField) <= 0) {
throw new RuntimeException("fields out of order: lastField=" + lastField + " field=" + field);
@ -1032,6 +1211,7 @@ public class CheckIndex implements Closeable {
if (term == null) {
break;
}
// System.out.println("CI: field=" + field + " check term=" + term + " docFreq=" + termsEnum.docFreq());
assert term.isValid();
@ -1332,6 +1512,14 @@ public class CheckIndex implements Closeable {
// docs got deleted and then merged away):
} else {
long fieldTermCount = (status.delTermCount+status.termCount)-termCountStart;
if (hasFreqs == false) {
// For DOCS_ONLY fields we recursively test term ranges:
checkTermRanges(field, maxDoc, fieldTerms, fieldTermCount);
}
final Object stats = fieldTerms.getStats();
assert stats != null;
if (status.blockTreeStats == null) {
@ -1353,11 +1541,9 @@ public class CheckIndex implements Closeable {
}
}
if (fieldTerms != null) {
final int v = fieldTerms.getDocCount();
if (v != -1 && visitedDocs.cardinality() != v) {
throw new RuntimeException("docCount for field " + field + "=" + v + " != recomputed docCount=" + visitedDocs.cardinality());
}
final int v = fieldTerms.getDocCount();
if (v != -1 && visitedDocs.cardinality() != v) {
throw new RuntimeException("docCount for field " + field + "=" + v + " != recomputed docCount=" + visitedDocs.cardinality());
}
// Test seek to last term:
@ -1365,6 +1551,9 @@ public class CheckIndex implements Closeable {
if (termsEnum.seekCeil(lastTerm.get()) != TermsEnum.SeekStatus.FOUND) {
throw new RuntimeException("seek to last term " + lastTerm + " failed");
}
if (termsEnum.term().equals(lastTerm.get()) == false) {
throw new RuntimeException("seek to last term " + lastTerm.get() + " returned FOUND but seeked to the wrong term " + termsEnum.term());
}
int expectedDocFreq = termsEnum.docFreq();
PostingsEnum d = termsEnum.postings(null, null, PostingsEnum.NONE);
@ -1373,18 +1562,18 @@ public class CheckIndex implements Closeable {
docFreq++;
}
if (docFreq != expectedDocFreq) {
throw new RuntimeException("docFreq for last term " + lastTerm + "=" + expectedDocFreq + " != recomputed docFreq=" + docFreq);
throw new RuntimeException("docFreq for last term " + lastTerm.toBytesRef() + "=" + expectedDocFreq + " != recomputed docFreq=" + docFreq);
}
}
// check unique term count
long termCount = -1;
if ((status.delTermCount+status.termCount)-termCountStart > 0) {
if (fieldTermCount > 0) {
termCount = fields.terms(field).size();
if (termCount != -1 && termCount != status.delTermCount + status.termCount - termCountStart) {
throw new RuntimeException("termCount mismatch " + (status.delTermCount + termCount) + " vs " + (status.termCount - termCountStart));
if (termCount != -1 && termCount != fieldTermCount) {
throw new RuntimeException("termCount mismatch " + termCount + " vs " + fieldTermCount);
}
}
@ -1407,6 +1596,9 @@ public class CheckIndex implements Closeable {
if (termsEnum.seekCeil(seekTerms[i]) != TermsEnum.SeekStatus.FOUND) {
throw new RuntimeException("seek to existing term " + seekTerms[i] + " failed");
}
if (termsEnum.term().equals(seekTerms[i]) == false) {
throw new RuntimeException("seek to existing term " + seekTerms[i] + " returned FOUND but seeked to the wrong term " + termsEnum.term());
}
docs = termsEnum.postings(liveDocs, docs, PostingsEnum.NONE);
if (docs == null) {

View File

@ -151,7 +151,6 @@ class FreqProxFields extends Fields {
}
public SeekStatus seekCeil(BytesRef text) {
// TODO: we could instead keep the BytesRefHash
// intact so this is a hash lookup
@ -170,17 +169,19 @@ class FreqProxFields extends Fields {
} else {
// found:
ord = mid;
assert term().compareTo(text) == 0;
return SeekStatus.FOUND;
}
}
// not found:
ord = lo + 1;
ord = lo;
if (ord >= numTerms) {
return SeekStatus.END;
} else {
int textStart = postingsArray.textStarts[sortedTermIDs[ord]];
terms.bytePool.setBytesRef(scratch, textStart);
assert term().compareTo(text) > 0;
return SeekStatus.NOT_FOUND;
}
}
@ -309,7 +310,7 @@ class FreqProxFields extends Fields {
final FreqProxPostingsArray postingsArray;
final ByteSliceReader reader = new ByteSliceReader();
final boolean readTermFreq;
int docID;
int docID = -1;
int freq;
boolean ended;
int termID;
@ -324,7 +325,7 @@ class FreqProxFields extends Fields {
this.termID = termID;
terms.initReader(reader, termID, 0);
ended = false;
docID = 0;
docID = -1;
}
@Override
@ -365,6 +366,9 @@ class FreqProxFields extends Fields {
@Override
public int nextDoc() throws IOException {
if (docID == -1) {
docID = 0;
}
if (reader.eof()) {
if (ended) {
return NO_MORE_DOCS;
@ -412,7 +416,7 @@ class FreqProxFields extends Fields {
final ByteSliceReader reader = new ByteSliceReader();
final ByteSliceReader posReader = new ByteSliceReader();
final boolean readOffsets;
int docID;
int docID = -1;
int freq;
int pos;
int startOffset;
@ -436,7 +440,7 @@ class FreqProxFields extends Fields {
terms.initReader(reader, termID, 0);
terms.initReader(posReader, termID, 1);
ended = false;
docID = 0;
docID = -1;
posLeft = 0;
}
@ -452,6 +456,9 @@ class FreqProxFields extends Fields {
@Override
public int nextDoc() throws IOException {
if (docID == -1) {
docID = 0;
}
while (posLeft != 0) {
nextPosition();
}

View File

@ -49,6 +49,7 @@ final class MappingMultiPostingsEnum extends PostingsEnum {
this.numSubs = postingsEnum.getNumSubs();
this.subs = postingsEnum.getSubs();
upto = -1;
doc = -1;
current = null;
this.multiDocsAndPositionsEnum = postingsEnum;
return this;

View File

@ -17,6 +17,7 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
@ -165,4 +166,30 @@ public final class TermContext {
public void setDocFreq(int docFreq) {
this.docFreq = docFreq;
}
/** Returns true if all terms stored here are real (i.e., not auto-prefix terms).
*
* @lucene.internal */
public boolean hasOnlyRealTerms() {
for(TermState termState : states) {
if (termState instanceof BlockTermState && ((BlockTermState) termState).isRealTerm == false) {
return false;
}
}
return true;
}
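A hedged caller-side sketch (mirroring the assert that ScoringRewrite gains later in this commit; reader, term and termContext are placeholder variables):

// docFreq can only be cross-checked against the reader when no
// auto-prefix (non-real) term contributed to this TermContext:
if (termContext.hasOnlyRealTerms()) {
  assert reader.docFreq(term) == termContext.docFreq();
}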
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("TermContext\n");
for(TermState termState : states) {
sb.append(" state=");
sb.append(termState.toString());
sb.append('\n');
}
return sb.toString();
}
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.automaton.CompiledAutomaton;
@ -42,17 +43,23 @@ public abstract class Terms {
* implementation can do so. */
public abstract TermsEnum iterator(TermsEnum reuse) throws IOException;
/** Returns a TermsEnum that iterates over all terms that
* are accepted by the provided {@link
/** Returns a TermsEnum that iterates over all terms and
* documents that are accepted by the provided {@link
* CompiledAutomaton}. If the <code>startTerm</code> is
* provided then the returned enum will only accept terms
* provided then the returned enum will only return terms
* {@code > startTerm}, but you still must call
* next() first to get to the first term. Note that the
* provided <code>startTerm</code> must be accepted by
* the automaton.
*
* <p><b>NOTE</b>: the returned TermsEnum cannot
* seek</p>. */
* seek</p>.
*
* <p><b>NOTE</b>: the terms dictionary is free to
* return arbitrary terms as long as the resulting visited
* docs are the same. E.g., {@link BlockTreeTermsWriter}
* creates auto-prefix terms during indexing to reduce the
* number of terms visited. */
public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) throws IOException {
// TODO: could we factor out a common interface b/w
@ -64,13 +71,17 @@ public abstract class Terms {
// TODO: eventually we could support seekCeil/Exact on
// the returned enum, instead of only being able to seek
// at the start
TermsEnum termsEnum = iterator(null);
if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
if (startTerm == null) {
return new AutomatonTermsEnum(iterator(null), compiled);
return new AutomatonTermsEnum(termsEnum, compiled);
} else {
return new AutomatonTermsEnum(iterator(null), compiled) {
return new AutomatonTermsEnum(termsEnum, compiled) {
@Override
protected BytesRef nextSeekTerm(BytesRef term) throws IOException {
if (term == null) {
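A hedged usage sketch of the intersect path (reader and field are placeholder variables; the CompiledAutomaton arguments mirror the CheckIndex call earlier in this commit):

Terms terms = MultiFields.getTerms(reader, field);
Automaton a = Automata.makeBinaryInterval(new BytesRef("aaa"), true, new BytesRef("aac"), false);
CompiledAutomaton ca = new CompiledAutomaton(a, true, false, Integer.MAX_VALUE, true);
TermsEnum te = terms.intersect(ca, null);
BytesRef term;
while ((term = te.next()) != null) {
  // te may surface auto-prefix terms; only the set of visited docs is
  // guaranteed to match a straight TermsEnum over the same range.
}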

View File

@ -99,6 +99,7 @@ public class AutomatonQuery extends MultiTermQuery {
super(term.field());
this.term = term;
this.automaton = automaton;
// TODO: we could take isFinite too, to save a bit of CPU in CompiledAutomaton ctor?:
this.compiled = new CompiledAutomaton(automaton, null, true, maxDeterminizedStates, isBinary);
}

View File

@ -17,12 +17,7 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.automaton.Automaton;
@ -33,6 +28,7 @@ import org.apache.lucene.util.automaton.Automaton;
* <p>This query uses the {@link
* MultiTermQuery#CONSTANT_SCORE_REWRITE}
* rewrite method. */
public class PrefixQuery extends AutomatonQuery {
/** Constructs a query for terms starting with <code>prefix</code>. */

View File

@ -18,19 +18,19 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
/**
* Base rewrite method that translates each term into a query, and keeps
@ -112,7 +112,7 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
for (int i = 0; i < size; i++) {
final int pos = sort[i];
final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef()));
assert reader.docFreq(term) == termStates[pos].docFreq();
assert termStates[pos].hasOnlyRealTerms() == false || reader.docFreq(term) == termStates[pos].docFreq();
addClause(result, term, termStates[pos].docFreq(), query.getBoost() * boost[pos], termStates[pos]);
}
}
@ -137,7 +137,7 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
final int e = terms.add(bytes);
final TermState state = termsEnum.termState();
assert state != null;
if (e < 0 ) {
if (e < 0) {
// duplicate term: update docFreq
final int pos = (-e)-1;
array.termState[pos].register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());

View File

@ -17,22 +17,17 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
/**
* A Query that matches documents within an range of terms.
*
* <p>This query matches the documents looking for terms that fall into the
* supplied range according to {@link
* Byte#compareTo(Byte)}. It is not intended
* for numerical ranges; use {@link NumericRangeQuery} instead.
* supplied range according to {@link BytesRef#compareTo(BytesRef)}.
*
* <p>This query uses the {@link
* MultiTermQuery#CONSTANT_SCORE_REWRITE}
@ -40,12 +35,11 @@ import org.apache.lucene.util.ToStringUtils;
* @since 2.9
*/
public class TermRangeQuery extends MultiTermQuery {
private BytesRef lowerTerm;
private BytesRef upperTerm;
private boolean includeLower;
private boolean includeUpper;
public class TermRangeQuery extends AutomatonQuery {
private final BytesRef lowerTerm;
private final BytesRef upperTerm;
private final boolean includeLower;
private final boolean includeUpper;
/**
* Constructs a query selecting all terms greater/equal than <code>lowerTerm</code>
@ -70,13 +64,28 @@ public class TermRangeQuery extends MultiTermQuery {
* included in the range.
*/
public TermRangeQuery(String field, BytesRef lowerTerm, BytesRef upperTerm, boolean includeLower, boolean includeUpper) {
super(field);
super(new Term(field, lowerTerm), toAutomaton(lowerTerm, upperTerm, includeLower, includeUpper), Integer.MAX_VALUE, true);
this.lowerTerm = lowerTerm;
this.upperTerm = upperTerm;
this.includeLower = includeLower;
this.includeUpper = includeUpper;
}
public static Automaton toAutomaton(BytesRef lowerTerm, BytesRef upperTerm, boolean includeLower, boolean includeUpper) {
if (lowerTerm == null) {
// makeBinaryInterval is more picky than we are:
includeLower = true;
}
if (upperTerm == null) {
// makeBinaryInterval is more picky than we are:
includeUpper = true;
}
return Automata.makeBinaryInterval(lowerTerm, includeLower, upperTerm, includeUpper);
}
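For example (a sketch; the field name and bounds are illustrative), a range query and the equivalent automaton now line up directly:

// [m, t) over a hypothetical "id" field:
TermRangeQuery q = new TermRangeQuery("id", new BytesRef("m"), new BytesRef("t"), true, false);
// The same interval expressed as an automaton, via the helper above:
Automaton a = TermRangeQuery.toAutomaton(new BytesRef("m"), new BytesRef("t"), true, false);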
/**
* Factory that creates a new TermRangeQuery using Strings for term text.
*/
@ -98,37 +107,22 @@ public class TermRangeQuery extends MultiTermQuery {
/** Returns <code>true</code> if the upper endpoint is inclusive */
public boolean includesUpper() { return includeUpper; }
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
if (lowerTerm != null && upperTerm != null && lowerTerm.compareTo(upperTerm) > 0) {
return TermsEnum.EMPTY;
}
TermsEnum tenum = terms.iterator(null);
if ((lowerTerm == null || (includeLower && lowerTerm.length == 0)) && upperTerm == null) {
return tenum;
}
return new TermRangeTermsEnum(tenum,
lowerTerm, upperTerm, includeLower, includeUpper);
}
/** Prints a user-readable version of this query. */
@Override
public String toString(String field) {
StringBuilder buffer = new StringBuilder();
if (!getField().equals(field)) {
buffer.append(getField());
buffer.append(":");
}
buffer.append(includeLower ? '[' : '{');
// TODO: all these toStrings for queries should just output the bytes, it might not be UTF-8!
buffer.append(lowerTerm != null ? ("*".equals(Term.toString(lowerTerm)) ? "\\*" : Term.toString(lowerTerm)) : "*");
buffer.append(" TO ");
buffer.append(upperTerm != null ? ("*".equals(Term.toString(upperTerm)) ? "\\*" : Term.toString(upperTerm)) : "*");
buffer.append(includeUpper ? ']' : '}');
buffer.append(ToStringUtils.boost(getBoost()));
return buffer.toString();
StringBuilder buffer = new StringBuilder();
if (!getField().equals(field)) {
buffer.append(getField());
buffer.append(":");
}
buffer.append(includeLower ? '[' : '{');
// TODO: all these toStrings for queries should just output the bytes, it might not be UTF-8!
buffer.append(lowerTerm != null ? ("*".equals(Term.toString(lowerTerm)) ? "\\*" : Term.toString(lowerTerm)) : "*");
buffer.append(" TO ");
buffer.append(upperTerm != null ? ("*".equals(Term.toString(upperTerm)) ? "\\*" : Term.toString(upperTerm)) : "*");
buffer.append(includeUpper ? ']' : '}');
buffer.append(ToStringUtils.boost(getBoost()));
return buffer.toString();
}
@Override
@ -167,5 +161,4 @@ public class TermRangeQuery extends MultiTermQuery {
return false;
return true;
}
}

View File

@ -1,101 +0,0 @@
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
/**
* Subclass of FilteredTermEnum for enumerating all terms that match the
* specified range parameters. Each term in the enumeration is
* greater than all that precede it.
*/
public class TermRangeTermsEnum extends FilteredTermsEnum {
final private boolean includeLower;
final private boolean includeUpper;
final private BytesRef lowerBytesRef;
final private BytesRef upperBytesRef;
/**
* Enumerates all terms greater/equal than <code>lowerTerm</code>
* but less/equal than <code>upperTerm</code>.
*
* If an endpoint is null, it is said to be "open". Either or both
* endpoints may be open. Open endpoints may not be exclusive
* (you can't select all but the first or last term without
* explicitly specifying the term to exclude.)
*
* @param tenum
* TermsEnum to filter
* @param lowerTerm
* The term text at the lower end of the range
* @param upperTerm
* The term text at the upper end of the range
* @param includeLower
* If true, the <code>lowerTerm</code> is included in the range.
* @param includeUpper
* If true, the <code>upperTerm</code> is included in the range.
*/
public TermRangeTermsEnum(TermsEnum tenum, BytesRef lowerTerm, BytesRef upperTerm,
boolean includeLower, boolean includeUpper) {
super(tenum);
// do a little bit of normalization...
// open ended range queries should always be inclusive.
if (lowerTerm == null) {
this.lowerBytesRef = new BytesRef();
this.includeLower = true;
} else {
this.lowerBytesRef = lowerTerm;
this.includeLower = includeLower;
}
if (upperTerm == null) {
this.includeUpper = true;
upperBytesRef = null;
} else {
this.includeUpper = includeUpper;
upperBytesRef = upperTerm;
}
setInitialSeekTerm(lowerBytesRef);
}
@Override
protected AcceptStatus accept(BytesRef term) {
if (!this.includeLower && term.equals(lowerBytesRef))
return AcceptStatus.NO;
// Use this field's default sort ordering
if (upperBytesRef != null) {
final int cmp = upperBytesRef.compareTo(term);
/*
* if beyond the upper term, or is exclusive and this is equal to
* the upper term, break out
*/
if ((cmp < 0) ||
(!includeUpper && cmp==0)) {
return AcceptStatus.END;
}
}
return AcceptStatus.YES;
}
}

View File

@ -73,6 +73,18 @@ final public class Automata {
return a;
}
/**
* Returns a new (deterministic) automaton that accepts all binary terms.
*/
public static Automaton makeAnyBinary() {
Automaton a = new Automaton();
int s = a.createState();
a.setAccept(s, true);
a.addTransition(s, s, 0, 255);
a.finishState();
return a;
}
/**
* Returns a new (deterministic) automaton that accepts any single codepoint.
*/
@ -204,8 +216,172 @@ final public class Automata {
return s;
}
/** Creates a new deterministic, minimal automaton accepting
* all binary terms in the specified interval. Note that unlike
* {@link #makeDecimalInterval}, the returned automaton is infinite,
* because terms behave like floating point numbers leading with
* a decimal point. However, in the special case where min == max,
* and both are inclusive, the automaton will be finite and accept
* exactly one term. */
public static Automaton makeBinaryInterval(BytesRef min, boolean minInclusive, BytesRef max, boolean maxInclusive) {
if (min == null && minInclusive == false) {
throw new IllegalArgumentException("minInclusive must be true when min is null (open ended)");
}
if (max == null && maxInclusive == false) {
throw new IllegalArgumentException("maxInclusive must be true when max is null (open ended)");
}
if (min != null && min.length == 0 && minInclusive == true) {
// Silly empty string corner case:
min = null;
}
if (min == null) {
if (max == null) {
// Accepts all terms:
return makeAnyBinary();
}
min = new BytesRef();
minInclusive = true;
}
int cmp;
if (max != null) {
cmp = min.compareTo(max);
} else {
cmp = -1;
}
if (cmp == 0) {
if (minInclusive == false || maxInclusive == false) {
return makeEmpty();
} else {
return makeBinary(min);
}
} else if (cmp > 0) {
// min > max:
return makeEmpty();
}
Automaton a = new Automaton();
int startState = a.createState();
int sinkState = a.createState();
a.setAccept(sinkState, true);
// This state accepts all suffixes:
a.addTransition(sinkState, sinkState, 0, 255);
boolean equalPrefix = true;
int lastState = startState;
int firstMaxState = -1;
int sharedPrefixLength = 0;
for(int i=0;i<min.length;i++) {
int minLabel = min.bytes[min.offset+i] & 0xff;
int maxLabel;
if (max != null && equalPrefix && i < max.length) {
maxLabel = max.bytes[max.offset+i] & 0xff;
} else {
maxLabel = -1;
}
int nextState;
if (minInclusive && i == min.length-1 && (equalPrefix == false || minLabel != maxLabel)) {
nextState = sinkState;
} else {
nextState = a.createState();
}
if (equalPrefix) {
if (minLabel == maxLabel) {
// Still in shared prefix
a.addTransition(lastState, nextState, minLabel);
} else if (max == null) {
equalPrefix = false;
sharedPrefixLength = 0;
a.addTransition(lastState, sinkState, minLabel+1, 0xff);
a.addTransition(lastState, nextState, minLabel);
} else {
// This is the first point where min & max diverge:
assert maxLabel > minLabel;
a.addTransition(lastState, nextState, minLabel);
if (maxLabel > minLabel + 1) {
a.addTransition(lastState, sinkState, minLabel+1, maxLabel-1);
}
// Now fork off path for max:
if (maxInclusive || i < max.length-1) {
firstMaxState = a.createState();
if (i < max.length-1) {
a.setAccept(firstMaxState, true);
}
a.addTransition(lastState, firstMaxState, maxLabel);
}
equalPrefix = false;
sharedPrefixLength = i;
}
} else {
// OK, already diverged:
a.addTransition(lastState, nextState, minLabel);
if (minLabel < 255) {
a.addTransition(lastState, sinkState, minLabel+1, 255);
}
}
lastState = nextState;
}
// Accept any suffix appended to the min term:
if (equalPrefix == false && lastState != sinkState && lastState != startState) {
a.addTransition(lastState, sinkState, 0, 255);
}
if (minInclusive) {
// Accept exactly the min term:
a.setAccept(lastState, true);
}
if (max != null) {
// Now do max:
if (firstMaxState == -1) {
// Min was a full prefix of max
sharedPrefixLength = min.length;
} else {
lastState = firstMaxState;
sharedPrefixLength++;
}
for(int i=sharedPrefixLength;i<max.length;i++) {
int maxLabel = max.bytes[max.offset+i]&0xff;
if (maxLabel > 0) {
a.addTransition(lastState, sinkState, 0, maxLabel-1);
}
if (maxInclusive || i < max.length-1) {
int nextState = a.createState();
if (i < max.length-1) {
a.setAccept(nextState, true);
}
a.addTransition(lastState, nextState, maxLabel);
lastState = nextState;
}
}
if (maxInclusive) {
a.setAccept(lastState, true);
}
}
a.finishState();
assert a.isDeterministic(): a.toDot();
return a;
}
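A test-style sketch (not from this commit) exercising the interval automaton directly through ByteRunAutomaton:

Automaton a = Automata.makeBinaryInterval(new BytesRef("b"), true, new BytesRef("d"), false);
ByteRunAutomaton run = new ByteRunAutomaton(a, true, Integer.MAX_VALUE);
BytesRef t = new BytesRef("c");
assert run.run(t.bytes, t.offset, t.length);            // "b" <= "c" < "d"
BytesRef u = new BytesRef("d");
assert run.run(u.bytes, u.offset, u.length) == false;   // the max bound is exclusive here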
/**
* Returns a new automaton that accepts strings representing decimal
* Returns a new automaton that accepts strings representing decimal (base 10)
* non-negative integers in the given interval.
*
* @param min minimal value of interval
@ -218,7 +394,7 @@ final public class Automata {
* interval cannot be expressed with the given fixed number of
* digits
*/
public static Automaton makeInterval(int min, int max, int digits)
public static Automaton makeDecimalInterval(int min, int max, int digits)
throws IllegalArgumentException {
String x = Integer.toString(min);
String y = Integer.toString(max);
@ -275,7 +451,30 @@ final public class Automata {
for (int i = 0, cp = 0; i < s.length(); i += Character.charCount(cp)) {
int state = a.createState();
cp = s.codePointAt(i);
a.addTransition(lastState, state, cp, cp);
a.addTransition(lastState, state, cp);
lastState = state;
}
a.setAccept(lastState, true);
a.finishState();
assert a.isDeterministic();
assert Operations.hasDeadStates(a) == false;
return a;
}
/**
* Returns a new (deterministic) automaton that accepts the single given
* binary term.
*/
public static Automaton makeBinary(BytesRef term) {
Automaton a = new Automaton();
int lastState = a.createState();
for (int i=0;i<term.length;i++) {
int state = a.createState();
int label = term.bytes[term.offset+i] & 0xff;
a.addTransition(lastState, state, label);
lastState = state;
}

View File

@ -491,11 +491,50 @@ public class Automaton implements Accountable {
public void getNextTransition(Transition t) {
// Make sure there is still a transition left:
assert (t.transitionUpto+3 - states[2*t.source]) <= 3*states[2*t.source+1];
// Make sure transitions are in fact sorted:
assert transitionSorted(t);
t.dest = transitions[t.transitionUpto++];
t.min = transitions[t.transitionUpto++];
t.max = transitions[t.transitionUpto++];
}
private boolean transitionSorted(Transition t) {
int upto = t.transitionUpto;
if (upto == states[2*t.source]) {
// Transition isn't initialized yet (this is the first transition); don't check:
return true;
}
int nextDest = transitions[upto];
int nextMin = transitions[upto+1];
int nextMax = transitions[upto+2];
if (nextMin > t.min) {
return true;
} else if (nextMin < t.min) {
return false;
}
// Min is equal, now test max:
if (nextMax > t.max) {
return true;
} else if (nextMax < t.max) {
return false;
}
// Max is also equal, now test dest:
if (nextDest > t.dest) {
return true;
} else if (nextDest < t.dest) {
return false;
}
// We should never see fully equal transitions here:
return false;
}
/** Fill the provided {@link Transition} with the index'th
* transition leaving the specified state. */
public void getTransition(int state, int index, Transition t) {
@ -565,7 +604,7 @@ public class Automaton implements Accountable {
//System.out.println("toDot: state " + state + " has " + numTransitions + " transitions; t.nextTrans=" + t.transitionUpto);
for(int i=0;i<numTransitions;i++) {
getNextTransition(t);
//System.out.println(" t.nextTrans=" + t.transitionUpto);
//System.out.println(" t.nextTrans=" + t.transitionUpto + " t=" + t);
assert t.max >= t.min;
b.append(" ");
b.append(state);

View File

@ -28,8 +28,8 @@ public class ByteRunAutomaton extends RunAutomaton {
}
/** expert: if utf8 is true, the input is already byte-based */
public ByteRunAutomaton(Automaton a, boolean utf8, int maxDeterminizedStates) {
super(utf8 ? a : new UTF32ToUTF8().convert(a), 256, true, maxDeterminizedStates);
public ByteRunAutomaton(Automaton a, boolean isBinary, int maxDeterminizedStates) {
super(isBinary ? a : new UTF32ToUTF8().convert(a), 256, true, maxDeterminizedStates);
}
/**

View File

@ -90,12 +90,41 @@ public class CompiledAutomaton {
*/
public final Boolean finite;
/** The state, if any, that accepts all suffixes, else -1. */
public final int sinkState;
/** Create this, passing simplify=true and finite=null, so that we try
* to simplify the automaton and determine if it is finite. */
public CompiledAutomaton(Automaton automaton) {
this(automaton, null, true);
}
/** Returns sink state, if present, else -1. */
private static int findSinkState(Automaton automaton) {
int numStates = automaton.getNumStates();
Transition t = new Transition();
int foundState = -1;
for (int s=0;s<numStates;s++) {
if (automaton.isAccept(s)) {
int count = automaton.initTransition(s, t);
boolean isSinkState = false;
for(int i=0;i<count;i++) {
automaton.getNextTransition(t);
if (t.dest == s && t.min == 0 && t.max == 0xff) {
isSinkState = true;
break;
}
}
if (isSinkState) {
foundState = s;
break;
}
}
}
return foundState;
}
/** Create this. If finite is null, we use {@link Operations#isFinite}
* to determine whether it is finite. If simplify is true, we run
* possibly expensive operations to determine if the automaton is one
@ -134,6 +163,7 @@ public class CompiledAutomaton {
runAutomaton = null;
this.automaton = null;
this.finite = null;
sinkState = -1;
return;
}
@ -154,6 +184,7 @@ public class CompiledAutomaton {
runAutomaton = null;
this.automaton = null;
this.finite = null;
sinkState = -1;
return;
}
@ -174,7 +205,7 @@ public class CompiledAutomaton {
} else {
term = new BytesRef(UnicodeUtil.newString(singleton.ints, singleton.offset, singleton.length));
}
sinkState = -1;
return;
}
}
@ -202,7 +233,8 @@ public class CompiledAutomaton {
if (this.finite) {
commonSuffixRef = null;
} else {
// NOTE: this is a very costly operation! We should test if it's really warranted in practice...
// NOTE: this is a very costly operation! We should test if it's really warranted in practice... we could do a fast match
// by looking for a sink state (which means it has no common suffix). Or maybe we shouldn't do it when simplify is false?:
BytesRef suffix = Operations.getCommonSuffixBytesRef(binary, maxDeterminizedStates);
if (suffix.length == 0) {
commonSuffixRef = null;
@ -215,6 +247,10 @@ public class CompiledAutomaton {
runAutomaton = new ByteRunAutomaton(binary, true, maxDeterminizedStates);
this.automaton = runAutomaton.automaton;
// TODO: this is a bit fragile because if the automaton is not minimized there could be more than 1 sink state but auto-prefix will fail
// to run for those:
sinkState = findSinkState(this.automaton);
}
private Transition transition = new Transition();

View File

@ -599,7 +599,7 @@ public class RegExp {
a = aa;
break;
case REGEXP_INTERVAL:
a = Automata.makeInterval(min, max, digits);
a = Automata.makeDecimalInterval(min, max, digits);
break;
}
return a;

View File

@ -117,8 +117,8 @@ public class TestAutomatonQuery extends LuceneTestCase {
assertAutomatonHits(2, Automata.makeString("doc"));
assertAutomatonHits(1, Automata.makeChar('a'));
assertAutomatonHits(2, Automata.makeCharRange('a', 'b'));
assertAutomatonHits(2, Automata.makeInterval(1233, 2346, 0));
assertAutomatonHits(1, Automata.makeInterval(0, 2000, 0));
assertAutomatonHits(2, Automata.makeDecimalInterval(1233, 2346, 0));
assertAutomatonHits(1, Automata.makeDecimalInterval(0, 2000, 0));
assertAutomatonHits(2, Operations.union(Automata.makeChar('a'),
Automata.makeChar('b')));
assertAutomatonHits(0, Operations.intersection(Automata
@ -194,7 +194,6 @@ public class TestAutomatonQuery extends LuceneTestCase {
Automaton pfx = Automata.makeString("do");
Automaton prefixAutomaton = Operations.concatenate(pfx, Automata.makeAnyString());
AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton);
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), FN);
assertEquals(3, automatonQueryNrHits(aq));
}

View File

@ -17,16 +17,19 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
@ -34,8 +37,6 @@ import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import java.io.IOException;
public class TestMultiTermQueryRewrites extends LuceneTestCase {
static Directory dir, sdir1, sdir2;
@ -152,14 +153,27 @@ public class TestMultiTermQueryRewrites extends LuceneTestCase {
final MultiTermQuery mtq = new MultiTermQuery("data") {
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
return new TermRangeTermsEnum(terms.iterator(null), new BytesRef("2"), new BytesRef("7"), true, true) {
return new FilteredTermsEnum(terms.iterator(null)) {
final BoostAttribute boostAtt =
attributes().addAttribute(BoostAttribute.class);
@Override
protected AcceptStatus accept(BytesRef term) {
boostAtt.setBoost(Float.parseFloat(term.utf8ToString()));
return super.accept(term);
if (term.length == 0) {
return AcceptStatus.NO;
}
char c = (char) (term.bytes[term.offset] & 0xff);
if (c >= '2') {
if (c <= '7') {
return AcceptStatus.YES;
} else {
return AcceptStatus.END;
}
} else {
return AcceptStatus.NO;
}
}
};
}
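A note on the rewrite above, hedged as a reading of the test rather than documented behavior: FilteredTermsEnum visits terms in sorted order, so AcceptStatus.END is a stronger answer than NO; once the first byte sorts past '7' no later term can match, and enumeration can stop outright. A minimal accept method following the same contract (belongs in a FilteredTermsEnum subclass; illustrative only):

  @Override
  protected AcceptStatus accept(BytesRef term) {
    if (term.length == 0) {
      return AcceptStatus.NO;           // skip just this term
    }
    char c = (char) (term.bytes[term.offset] & 0xff);
    if (c < '2') {
      return AcceptStatus.NO;           // before the range: keep scanning
    } else if (c <= '7') {
      return AcceptStatus.YES;          // inside the range: collect it
    } else {
      return AcceptStatus.END;          // past the range: stop the enum entirely
    }
  }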

View File

@ -17,25 +17,32 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.codecs.autoprefix.AutoPrefixPostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.StringHelper;
@ -70,7 +77,6 @@ public class TestPrefixQuery extends LuceneTestCase {
assertEquals("One in /Computers/Mac", 1, hits.length);
query = new PrefixQuery(new Term("category", ""));
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "category");
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("everything", 3, hits.length);
writer.close();
@ -78,6 +84,92 @@ public class TestPrefixQuery extends LuceneTestCase {
directory.close();
}
/** Make sure auto prefix terms are used with PrefixQuery. */
public void testAutoPrefixTermsKickIn() throws Exception {
List<String> prefixes = new ArrayList<>();
for(int i=1;i<5;i++) {
char[] chars = new char[i];
Arrays.fill(chars, 'a');
prefixes.add(new String(chars));
}
Set<String> randomTerms = new HashSet<>();
int numTerms = atLeast(10000);
while (randomTerms.size() < numTerms) {
for(String prefix : prefixes) {
randomTerms.add(prefix + TestUtil.randomRealisticUnicodeString(random()));
}
}
int actualCount = 0;
for(String term : randomTerms) {
if (term.startsWith("aa")) {
actualCount++;
}
}
//System.out.println("actual count " + actualCount);
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
int minTermsInBlock = TestUtil.nextInt(random(), 2, 100);
int maxTermsInBlock = Math.max(2, (minTermsInBlock-1)*2 + random().nextInt(100));
// As long as this is never > actualCount, aa should always see at least one auto-prefix term:
int minTermsAutoPrefix = TestUtil.nextInt(random(), 2, actualCount);
int maxTermsAutoPrefix = random().nextBoolean() ? Math.max(2, (minTermsAutoPrefix-1)*2 + random().nextInt(100)) : Integer.MAX_VALUE;
iwc.setCodec(TestUtil.alwaysPostingsFormat(new AutoPrefixPostingsFormat(minTermsInBlock, maxTermsInBlock,
minTermsAutoPrefix, maxTermsAutoPrefix)));
RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
for (String term : randomTerms) {
Document doc = new Document();
doc.add(new StringField("field", term, Field.Store.NO));
w.addDocument(doc);
}
w.forceMerge(1);
IndexReader r = w.getReader();
final Terms terms = MultiFields.getTerms(r, "field");
IndexSearcher s = new IndexSearcher(r);
final int finalActualCount = actualCount;
PrefixQuery q = new PrefixQuery(new Term("field", "aa")) {
public PrefixQuery checkTerms() throws IOException {
TermsEnum termsEnum = getTermsEnum(terms, new AttributeSource());
int count = 0;
while (termsEnum.next() != null) {
//System.out.println("got term: " + termsEnum.term().utf8ToString());
count++;
}
// Auto-prefix term(s) should have kicked in, so we should have visited fewer than the total number of aa* terms:
assertTrue(count < finalActualCount);
return this;
}
}.checkTerms();
int x = BooleanQuery.getMaxClauseCount();
try {
BooleanQuery.setMaxClauseCount(randomTerms.size());
if (random().nextBoolean()) {
q.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_REWRITE);
} else if (random().nextBoolean()) {
q.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE);
}
assertEquals(actualCount, s.search(q, 1).totalHits);
} finally {
BooleanQuery.setMaxClauseCount(x);
}
r.close();
w.close();
dir.close();
}
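The arithmetic behind the random block sizes above is worth spelling out. Hedged as an inference from the formulas in this test (and the matching ones in TestTermRangeQuery below), not as a documented API contract: BlockTree appears to require maxItemsInBlock >= 2 * (minItemsInBlock - 1), and the auto-prefix bounds are drawn with the same shape.

  // Hedged check of the invariant the random picks appear to respect
  // (assumed from the formulas in this test, not asserted by the API here):
  int minTermsInBlock = 25;                                     // any value >= 2
  int maxTermsInBlock = Math.max(2, (minTermsInBlock - 1) * 2); // 48: smallest value the formula can produce
  assert maxTermsInBlock >= 2 * (minTermsInBlock - 1);          // BlockTree-style bound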
public void testMatchAll() throws Exception {
Directory directory = newDirectory();
@ -92,8 +184,6 @@ public class TestPrefixQuery extends LuceneTestCase {
IndexSearcher searcher = newSearcher(reader);
assertEquals(1, searcher.search(query, 1000).totalHits);
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "field");
writer.close();
reader.close();
directory.close();

View File

@ -18,20 +18,32 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.codecs.autoprefix.AutoPrefixPostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class TestTermRangeQuery extends LuceneTestCase {
@ -104,19 +116,24 @@ public class TestTermRangeQuery extends LuceneTestCase {
initializeIndex(new String[]{"A", "B", "C", "D"});
IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = newSearcher(reader);
TermRangeQuery query = new TermRangeQuery("content", null, null, true, true);
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "content");
// Should return the unfiltered TermsEnum:
assertFalse(query.getTermsEnum(terms) instanceof TermRangeTermsEnum);
assertEquals(4, searcher.search(query, 1000).scoreDocs.length);
query = new TermRangeQuery("content", null, null, false, false);
query = TermRangeQuery.newStringRange("content", "", null, true, true);
// Should return the unfiltered TermsEnum:
assertFalse(query.getTermsEnum(terms) instanceof TermRangeTermsEnum);
assertEquals(4, searcher.search(query, 1000).scoreDocs.length);
query = TermRangeQuery.newStringRange("content", "", null, true, false);
assertFalse(query.getTermsEnum(terms) instanceof TermRangeTermsEnum);
assertEquals(4, searcher.search(query, 1000).scoreDocs.length);
// and now anothe one
query = TermRangeQuery.newStringRange("content", "B", null, true, false);
assertTrue(query.getTermsEnum(terms) instanceof TermRangeTermsEnum);
// and now another one
query = TermRangeQuery.newStringRange("content", "B", null, true, true);
assertEquals(3, searcher.search(query, 1000).scoreDocs.length);
reader.close();
}
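A hedged reading of the asserts above: when the lower bound is null or the inclusive empty string and the upper bound is null, the range matches every term, so filtering buys nothing and the raw enum can be returned. An illustrative dispatch consistent with that (an assumption, not this commit's code; field names hypothetical):

  protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
    if ((lowerTerm == null || (includeLower && lowerTerm.length == 0)) && upperTerm == null) {
      // Matches everything: no need for the TermRangeTermsEnum wrapper.
      return terms.iterator(null);
    }
    return new TermRangeTermsEnum(terms.iterator(null), lowerTerm, upperTerm, includeLower, includeUpper);
  }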
@ -336,4 +353,127 @@ public class TestTermRangeQuery extends LuceneTestCase {
//assertEquals("C added => A,B,<empty string>,C in range", 3, hits.length());
reader.close();
}
/** Make sure auto prefix terms are used with TermRangeQuery. */
public void testAutoPrefixTermsKickIn() throws Exception {
List<String> prefixes = new ArrayList<>();
for(int i=1;i<5;i++) {
char[] chars = new char[i];
Arrays.fill(chars, 'a');
prefixes.add(new String(chars));
}
Set<String> randomTerms = new HashSet<>();
int numTerms = atLeast(10000);
while (randomTerms.size() < numTerms) {
for(String prefix : prefixes) {
randomTerms.add(prefix + TestUtil.randomSimpleString(random()));
}
}
// We make term range aa<start> - aa<end>
char start;
char end;
int actualCount;
boolean startInclusive = random().nextBoolean();
boolean endInclusive = random().nextBoolean();
String startTerm;
String endTerm;
while (true) {
start = (char) TestUtil.nextInt(random(), 'a', 'm');
end = (char) TestUtil.nextInt(random(), start+1, 'z');
actualCount = 0;
startTerm = "aa" + start;
endTerm = "aa" + end;
for(String term : randomTerms) {
int cmpStart = startTerm.compareTo(term);
int cmpEnd = endTerm.compareTo(term);
if ((cmpStart < 0 || (startInclusive && cmpStart == 0)) &&
(cmpEnd > 0 || (endInclusive && cmpEnd == 0))) {
actualCount++;
}
}
if (actualCount > 2000) {
break;
}
}
//System.out.println("start " + startTerm + " inclusive? " + startInclusive);
//System.out.println("end " + endTerm + " inclusive? " + endInclusive);
//System.out.println("actual count " + actualCount);
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
int minTermsInBlock = TestUtil.nextInt(random(), 2, 100);
int maxTermsInBlock = Math.max(2, (minTermsInBlock-1)*2 + random().nextInt(100));
int minTermsAutoPrefix = TestUtil.nextInt(random(), 2, 100);
int maxTermsAutoPrefix = random().nextBoolean() ? Math.max(2, (minTermsAutoPrefix-1)*2 + random().nextInt(100)) : Integer.MAX_VALUE;
//System.out.println("minTermsAutoPrefix " + minTermsAutoPrefix);
//System.out.println("maxTermsAutoPrefix " + maxTermsAutoPrefix);
iwc.setCodec(TestUtil.alwaysPostingsFormat(new AutoPrefixPostingsFormat(minTermsInBlock, maxTermsInBlock,
minTermsAutoPrefix, maxTermsAutoPrefix)));
RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
//System.out.println("TEST: index terms");
for (String term : randomTerms) {
Document doc = new Document();
doc.add(new StringField("field", term, Field.Store.NO));
w.addDocument(doc);
//System.out.println(" " + term);
}
//System.out.println("TEST: now force merge");
w.forceMerge(1);
IndexReader r = w.getReader();
final Terms terms = MultiFields.getTerms(r, "field");
IndexSearcher s = new IndexSearcher(r);
final int finalActualCount = actualCount;
//System.out.println("start=" + startTerm + " end=" + endTerm + " startIncl=" + startInclusive + " endIncl=" + endInclusive);
TermRangeQuery q = new TermRangeQuery("field", new BytesRef(startTerm), new BytesRef(endTerm), startInclusive, endInclusive) {
public TermRangeQuery checkTerms() throws IOException {
TermsEnum termsEnum = getTermsEnum(terms, new AttributeSource());
int count = 0;
while (termsEnum.next() != null) {
//System.out.println("got term: " + termsEnum.term().utf8ToString());
count++;
}
//System.out.println("count " + count + " vs finalActualCount=" + finalActualCount);
// Auto-prefix term(s) should have kicked in, so we should have visited fewer than the total number of aa* terms:
assertTrue(count < finalActualCount);
return this;
}
}.checkTerms();
if (random().nextBoolean()) {
q.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_REWRITE);
} else if (random().nextBoolean()) {
q.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE);
}
assertEquals(actualCount, s.search(q, 1).totalHits);
// Test when min == max:
List<String> randomTermsList = new ArrayList<>(randomTerms);
for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
String term = randomTermsList.get(random().nextInt(randomTermsList.size()));
q = new TermRangeQuery("field", new BytesRef(term), new BytesRef(term), true, true);
assertEquals(1, s.search(q, 1).totalHits);
}
r.close();
w.close();
dir.close();
}
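The inclusive/exclusive bookkeeping in the counting loop above is compact enough to trip over; factored into a hedged helper with a hypothetical name, the same logic reads:

  // Hypothetical refactoring of the membership test used in the loop above:
  static boolean inRange(String term, String startTerm, boolean startInclusive,
                         String endTerm, boolean endInclusive) {
    int cmpStart = startTerm.compareTo(term);
    int cmpEnd = endTerm.compareTo(term);
    return (cmpStart < 0 || (startInclusive && cmpStart == 0))
        && (cmpEnd > 0 || (endInclusive && cmpEnd == 0));
  }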
}

View File

@ -34,13 +34,7 @@ import java.io.IOException;
/**
* TestWildcard tests the '*' and '?' wildcard characters.
*/
public class TestWildcard
extends LuceneTestCase {
@Override
public void setUp() throws Exception {
super.setUp();
}
public class TestWildcard extends LuceneTestCase {
public void testEquals() {
WildcardQuery wq1 = new WildcardQuery(new Term("field", "b*a"));
@ -126,10 +120,10 @@ public class TestWildcard
MultiTermQuery wq = new WildcardQuery(new Term("field", "prefix*"));
assertMatches(searcher, wq, 2);
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "field");
wq = new WildcardQuery(new Term("field", "*"));
assertMatches(searcher, wq, 2);
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "field");
assertFalse(wq.getTermsEnum(terms).getClass().getSimpleName().contains("AutomatonTermsEnum"));
reader.close();
indexStore.close();

View File

@ -232,7 +232,7 @@ public class TestAutomaton extends LuceneTestCase {
}
public void testInterval() throws Exception {
Automaton a = Operations.determinize(Automata.makeInterval(17, 100, 3),
Automaton a = Operations.determinize(Automata.makeDecimalInterval(17, 100, 3),
DEFAULT_MAX_DETERMINIZED_STATES);
assertFalse(Operations.run(a, ""));
assertTrue(Operations.run(a, "017"));
@ -431,7 +431,7 @@ public class TestAutomaton extends LuceneTestCase {
}
public void testOneInterval() throws Exception {
Automaton a = Automata.makeInterval(999, 1032, 0);
Automaton a = Automata.makeDecimalInterval(999, 1032, 0);
a = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
assertTrue(Operations.run(a, "0999"));
assertTrue(Operations.run(a, "00999"));
@ -439,7 +439,7 @@ public class TestAutomaton extends LuceneTestCase {
}
public void testAnotherInterval() throws Exception {
Automaton a = Automata.makeInterval(1, 2, 0);
Automaton a = Automata.makeDecimalInterval(1, 2, 0);
a = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
assertTrue(Operations.run(a, "01"));
}
@ -462,7 +462,7 @@ public class TestAutomaton extends LuceneTestCase {
}
String prefix = b.toString();
Automaton a = Operations.determinize(Automata.makeInterval(min, max, digits),
Automaton a = Operations.determinize(Automata.makeDecimalInterval(min, max, digits),
DEFAULT_MAX_DETERMINIZED_STATES);
if (random().nextBoolean()) {
a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
@ -942,7 +942,7 @@ public class TestAutomaton extends LuceneTestCase {
if (VERBOSE) {
System.out.println(" op=union interval min=" + min + " max=" + max + " digits=" + digits);
}
a = Operations.union(a, Automata.makeInterval(min, max, digits));
a = Operations.union(a, Automata.makeDecimalInterval(min, max, digits));
StringBuilder b = new StringBuilder();
for(int i=0;i<digits;i++) {
b.append('0');
@ -1105,6 +1105,138 @@ public class TestAutomaton extends LuceneTestCase {
}
}
public void testMakeBinaryIntervalRandom() throws Exception {
int iters = atLeast(100);
for(int iter=0;iter<iters;iter++) {
BytesRef minTerm = TestUtil.randomBinaryTerm(random());
boolean minInclusive = random().nextBoolean();
BytesRef maxTerm = TestUtil.randomBinaryTerm(random());
boolean maxInclusive = random().nextBoolean();
if (VERBOSE) {
System.out.println("TEST: iter=" + iter + " minTerm=" + minTerm + " minInclusive=" + minInclusive + " maxTerm=" + maxTerm + " maxInclusive=" + maxInclusive);
}
Automaton a = Automata.makeBinaryInterval(minTerm, minInclusive, maxTerm, maxInclusive);
Automaton minA = MinimizationOperations.minimize(a, Integer.MAX_VALUE);
if (minA.getNumStates() != a.getNumStates()) {
assertTrue(minA.getNumStates() < a.getNumStates());
System.out.println("Original was not minimal:");
System.out.println("Original:\n" + a.toDot());
System.out.println("Minimized:\n" + minA.toDot());
fail("auotmaton was not minimal");
}
if (VERBOSE) {
System.out.println(a.toDot());
}
for(int iter2=0;iter2<500;iter2++) {
BytesRef term = TestUtil.randomBinaryTerm(random());
int minCmp = minTerm.compareTo(term);
int maxCmp = maxTerm.compareTo(term);
boolean expected;
if (minCmp > 0 || maxCmp < 0) {
expected = false;
} else if (minCmp == 0 && maxCmp == 0) {
expected = minInclusive && maxInclusive;
} else if (minCmp == 0) {
expected = minInclusive;
} else if (maxCmp == 0) {
expected = maxInclusive;
} else {
expected = true;
}
if (VERBOSE) {
System.out.println(" check term=" + term + " expected=" + expected);
}
IntsRefBuilder intsBuilder = new IntsRefBuilder();
Util.toIntsRef(term, intsBuilder);
assertEquals(expected, Operations.run(a, intsBuilder.toIntsRef()));
}
}
}
private static IntsRef intsRef(String s) {
IntsRefBuilder intsBuilder = new IntsRefBuilder();
Util.toIntsRef(new BytesRef(s), intsBuilder);
return intsBuilder.toIntsRef();
}
public void testMakeBinaryIntervalBasic() throws Exception {
Automaton a = Automata.makeBinaryInterval(new BytesRef("bar"), true, new BytesRef("foo"), true);
assertTrue(Operations.run(a, intsRef("bar")));
assertTrue(Operations.run(a, intsRef("foo")));
assertTrue(Operations.run(a, intsRef("beep")));
assertFalse(Operations.run(a, intsRef("baq")));
assertTrue(Operations.run(a, intsRef("bara")));
}
public void testMakeBinaryIntervalEqual() throws Exception {
Automaton a = Automata.makeBinaryInterval(new BytesRef("bar"), true, new BytesRef("bar"), true);
assertTrue(Operations.run(a, intsRef("bar")));
assertTrue(Operations.isFinite(a));
assertEquals(1, Operations.getFiniteStrings(a, 10).size());
}
public void testMakeBinaryIntervalCommonPrefix() throws Exception {
Automaton a = Automata.makeBinaryInterval(new BytesRef("bar"), true, new BytesRef("barfoo"), true);
assertFalse(Operations.run(a, intsRef("bam")));
assertTrue(Operations.run(a, intsRef("bar")));
assertTrue(Operations.run(a, intsRef("bara")));
assertTrue(Operations.run(a, intsRef("barf")));
assertTrue(Operations.run(a, intsRef("barfo")));
assertTrue(Operations.run(a, intsRef("barfoo")));
assertTrue(Operations.run(a, intsRef("barfonz")));
assertFalse(Operations.run(a, intsRef("barfop")));
assertFalse(Operations.run(a, intsRef("barfoop")));
}
public void testMakeBinaryIntervalOpenMax() throws Exception {
Automaton a = Automata.makeBinaryInterval(new BytesRef("bar"), true, null, true);
assertFalse(Operations.run(a, intsRef("bam")));
assertTrue(Operations.run(a, intsRef("bar")));
assertTrue(Operations.run(a, intsRef("bara")));
assertTrue(Operations.run(a, intsRef("barf")));
assertTrue(Operations.run(a, intsRef("barfo")));
assertTrue(Operations.run(a, intsRef("barfoo")));
assertTrue(Operations.run(a, intsRef("barfonz")));
assertTrue(Operations.run(a, intsRef("barfop")));
assertTrue(Operations.run(a, intsRef("barfoop")));
assertTrue(Operations.run(a, intsRef("zzz")));
}
public void testMakeBinaryIntervalOpenMin() throws Exception {
Automaton a = Automata.makeBinaryInterval(null, true, new BytesRef("foo"), true);
assertFalse(Operations.run(a, intsRef("foz")));
assertFalse(Operations.run(a, intsRef("zzz")));
assertTrue(Operations.run(a, intsRef("foo")));
assertTrue(Operations.run(a, intsRef("")));
assertTrue(Operations.run(a, intsRef("a")));
assertTrue(Operations.run(a, intsRef("aaa")));
assertTrue(Operations.run(a, intsRef("bz")));
}
public void testMakeBinaryIntervalOpenBoth() throws Exception {
Automaton a = Automata.makeBinaryInterval(null, true, null, true);
assertTrue(Operations.run(a, intsRef("foz")));
assertTrue(Operations.run(a, intsRef("zzz")));
assertTrue(Operations.run(a, intsRef("foo")));
assertTrue(Operations.run(a, intsRef("")));
assertTrue(Operations.run(a, intsRef("a")));
assertTrue(Operations.run(a, intsRef("aaa")));
assertTrue(Operations.run(a, intsRef("bz")));
}
public void testAcceptAllEmptyStringMin() throws Exception {
Automaton a = Automata.makeBinaryInterval(new BytesRef(), true, null, true);
System.out.println("HERE: " + a.toDot());
assertTrue(Operations.sameLanguage(Automata.makeAnyBinary(), a));
}
private static IntsRef toIntsRef(String s) {
IntsRefBuilder b = new IntsRefBuilder();
for (int i = 0, cp = 0; i < s.length(); i += Character.charCount(cp)) {

View File

@ -258,14 +258,14 @@ public class AssertingLeafReader extends FilterLeafReader {
public TermState termState() throws IOException {
assertThread("Terms enums", creationThread);
assert state == State.POSITIONED : "termState() called on unpositioned TermsEnum";
return super.termState();
return in.termState();
}
@Override
public void seekExact(BytesRef term, TermState state) throws IOException {
assertThread("Terms enums", creationThread);
assert term.isValid();
super.seekExact(term, state);
in.seekExact(term, state);
this.state = State.POSITIONED;
}

File diff suppressed because it is too large

View File

@ -831,6 +831,14 @@ public final class TestUtil {
return new String(buffer, 0, i);
}
/** Returns a random binary term. */
public static BytesRef randomBinaryTerm(Random r) {
int length = r.nextInt(15);
BytesRef b = new BytesRef(length);
r.nextBytes(b.bytes);
b.length = length;
return b;
}
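A hedged usage note for the new helper: because the returned bytes are arbitrary and often not valid UTF-8, callers pair it with Util.toIntsRef rather than utf8ToString, as the binary-interval tests above do. Illustrative pairing (seed and automaton are placeholders):

  BytesRef term = TestUtil.randomBinaryTerm(new Random(42));
  IntsRefBuilder b = new IntsRefBuilder();
  Util.toIntsRef(term, b);              // safe for non-UTF-8 bytes
  boolean accepted = Operations.run(someAutomaton, b.toIntsRef());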
/** Return a Codec that can read any of the
* default codecs and formats, but always writes in the specified
@ -858,7 +866,7 @@ public final class TestUtil {
// (and maybe their params, too) to infostream on flush and merge.
// otherwise in a real debugging situation we won't know whats going on!
if (LuceneTestCase.VERBOSE) {
System.out.println("forcing docvalues format to:" + format);
System.out.println("TestUtil: forcing docvalues format to:" + format);
}
return new AssertingCodec() {
@Override
@ -1285,6 +1293,24 @@ public final class TestUtil {
}
}
/** For debugging: tries to include br.utf8ToString(), but if that
* fails (because it's not valid UTF-8, which is fine!), falls back
* to the ordinary toString. */
public static String bytesRefToString(BytesRef br) {
if (br == null) {
return "(null)";
} else {
try {
return br.utf8ToString() + " " + br.toString();
} catch (IllegalArgumentException t) {
// If the BytesRef isn't actually UTF-8, or it's e.g. a
// prefix of UTF-8 that ends mid-character, we
// fall back to hex:
return br.toString();
}
}
}
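Hedged illustration of the two output shapes (hex rendering per BytesRef.toString; exact formatting assumed, not asserted by the commit):

  // bytesRefToString(new BytesRef("foo"))
  //   -> "foo [66 6f 6f]"        (valid UTF-8: both forms)
  // bytesRefToString(new BytesRef(new byte[] {(byte) 0xff}))
  //   -> "[ff]"                  (invalid UTF-8: hex only)
  // bytesRefToString(null)
  //   -> "(null)"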
/** Returns a copy of directory, entirely in RAM */
public static RAMDirectory ramCopyOf(Directory dir) throws IOException {
RAMDirectory ram = new RAMDirectory();