LUCENE-6367: PrefixQuery now subclasses AutomatonQuery

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1669522 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2015-03-27 08:22:54 +00:00
parent ef0209189e
commit 6a6c729920
15 changed files with 409 additions and 137 deletions

View File

@@ -244,6 +244,11 @@ Changes in Runtime Behavior
* LUCENE-6298: SimpleQueryParser returns an empty query rather than
null, if e.g. the terms were all stopwords. (Lee Hinman via Robert Muir)
* LUCENE-6367: PrefixQuery now subclasses AutomatonQuery, removing the
specialized PrefixTermsEnum. PrefixQuery now operates in binary
term space, meaning any binary term (not just valid UTF-8 terms)
is accepted. (Robert Muir, Mike McCandless)
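For example, a prefix query can now be built over an arbitrary binary term. A minimal sketch (the field name and bytes below are illustrative, not taken from this commit):

import org.apache.lucene.index.Term;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.util.BytesRef;

// Any byte sequence is a legal prefix, not just valid UTF-8:
BytesRef binaryPrefix = new BytesRef(new byte[] {(byte) 0xff, 0x00, (byte) 0xab});
PrefixQuery query = new PrefixQuery(new Term("field", binaryPrefix));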
======================= Lucene 5.0.0 =======================
New Features

View File

@@ -78,10 +78,28 @@ public class AutomatonQuery extends MultiTermQuery {
* space but can process more complex automata.
*/
public AutomatonQuery(final Term term, Automaton automaton, int maxDeterminizedStates) {
this(term, automaton, maxDeterminizedStates, false);
}
/**
* Create a new AutomatonQuery from an {@link Automaton}.
*
* @param term Term containing field and possibly some pattern structure. The
* term text is ignored.
* @param automaton Automaton to run, terms that are accepted are considered a
* match.
* @param maxDeterminizedStates maximum number of states in the resulting
* automaton. If the automaton would need more than this many states,
* TooComplexToDeterminizeException is thrown. Higher numbers require more
* space but can process more complex automata.
* @param isBinary if true, this automaton is already binary and
* will not go through the UTF32ToUTF8 conversion
*/
public AutomatonQuery(final Term term, Automaton automaton, int maxDeterminizedStates, boolean isBinary) {
super(term.field());
this.term = term;
this.automaton = automaton;
this.compiled = new CompiledAutomaton(automaton, null, true, maxDeterminizedStates);
this.compiled = new CompiledAutomaton(automaton, null, true, maxDeterminizedStates, isBinary);
}
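As a usage sketch of the new isBinary flag (the automaton and term below are illustrative, not from this commit): a caller that already has an automaton over byte labels 0..255 can pass isBinary=true so the query skips the UTF32ToUTF8 conversion.

import org.apache.lucene.index.Term;
import org.apache.lucene.util.automaton.Automaton;

// Hand-built binary automaton accepting any term whose bytes start with 'a' 'b':
Automaton a = new Automaton();
int s0 = a.createState();
int s1 = a.createState();
int s2 = a.createState();
a.addTransition(s0, s1, 'a');
a.addTransition(s1, s2, 'b');
a.setAccept(s2, true);
a.addTransition(s2, s2, 0, 255);
a.finishState();
AutomatonQuery q = new AutomatonQuery(new Term("field", "ab"), a, Integer.MAX_VALUE, true);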
@Override

View File

@@ -19,11 +19,13 @@ package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.automaton.Automaton;
/** A Query that matches documents containing terms with a specified prefix. A PrefixQuery
* is built by QueryParser for input like <code>app*</code>.
@@ -31,29 +33,38 @@ import org.apache.lucene.util.ToStringUtils;
* <p>This query uses the {@link
* MultiTermQuery#CONSTANT_SCORE_REWRITE}
* rewrite method. */
public class PrefixQuery extends MultiTermQuery {
private Term prefix;
public class PrefixQuery extends AutomatonQuery {
/** Constructs a query for terms starting with <code>prefix</code>. */
public PrefixQuery(Term prefix) {
super(prefix.field());
this.prefix = prefix;
// It's OK to pass unlimited maxDeterminizedStates: the automaton is born small and determinized:
super(prefix, toAutomaton(prefix.bytes()), Integer.MAX_VALUE, true);
if (prefix == null) {
throw new NullPointerException("prefix cannot be null");
}
}
/** Build an automaton accepting all terms with the specified prefix. */
public static Automaton toAutomaton(BytesRef prefix) {
Automaton automaton = new Automaton();
int lastState = automaton.createState();
for(int i=0;i<prefix.length;i++) {
int state = automaton.createState();
automaton.addTransition(lastState, state, prefix.bytes[prefix.offset+i]&0xff);
lastState = state;
}
automaton.setAccept(lastState, true);
automaton.addTransition(lastState, lastState, 0, 255);
automaton.finishState();
assert automaton.isDeterministic();
return automaton;
}
/** Returns the prefix of this query. */
public Term getPrefix() { return prefix; }
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
TermsEnum tenum = terms.iterator(null);
if (prefix.bytes().length == 0) {
// no prefix -- match all terms for this field:
return tenum;
}
return new PrefixTermsEnum(tenum, prefix.bytes());
public Term getPrefix() {
return term;
}
/** Prints a user-readable version of this query. */
@Override
public String toString(String field) {
@@ -62,7 +73,7 @@ public class PrefixQuery extends MultiTermQuery {
buffer.append(getField());
buffer.append(":");
}
buffer.append(prefix.text());
buffer.append(term.text());
buffer.append('*');
buffer.append(ToStringUtils.boost(getBoost()));
return buffer.toString();
@@ -72,25 +83,23 @@ public class PrefixQuery extends MultiTermQuery {
public int hashCode() {
final int prime = 31;
int result = super.hashCode();
result = prime * result + ((prefix == null) ? 0 : prefix.hashCode());
result = prime * result + term.hashCode();
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
if (this == obj) {
return true;
if (!super.equals(obj))
return false;
if (getClass() != obj.getClass())
}
if (!super.equals(obj)) {
return false;
}
// super.equals() ensures we are the same class
PrefixQuery other = (PrefixQuery) obj;
if (prefix == null) {
if (other.prefix != null)
return false;
} else if (!prefix.equals(other.prefix))
if (!term.equals(other.term)) {
return false;
}
return true;
}
}
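A rough sketch of what toAutomaton produces (not part of the change itself; the terms are illustrative): the automaton runs over raw term bytes, so it can be exercised with a binary ByteRunAutomaton.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;

Automaton a = PrefixQuery.toAutomaton(new BytesRef("app"));
// Pass isBinary=true: the automaton already uses byte labels, so no UTF32->UTF8 step is needed.
ByteRunAutomaton run = new ByteRunAutomaton(a, true, Integer.MAX_VALUE);
BytesRef hit = new BytesRef("apple");
BytesRef miss = new BytesRef("banana");
// "apple" starts with "app", "banana" does not:
assert run.run(hit.bytes, hit.offset, hit.length);
assert run.run(miss.bytes, miss.offset, miss.length) == false;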

View File

@@ -1,49 +0,0 @@
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;
/**
* Subclass of FilteredTermEnum for enumerating all terms that match the
* specified prefix filter term.
* <p>Term enumerations are always ordered by
* {@link BytesRef#compareTo}. Each term in the enumeration is
* greater than all that precede it.</p>
*/
public class PrefixTermsEnum extends FilteredTermsEnum {
private final BytesRef prefixRef;
public PrefixTermsEnum(TermsEnum tenum, BytesRef prefixText) {
super(tenum);
setInitialSeekTerm(this.prefixRef = prefixText);
}
@Override
protected AcceptStatus accept(BytesRef term) {
if (StringHelper.startsWith(term, prefixRef)) {
return AcceptStatus.YES;
} else {
return AcceptStatus.END;
}
}
}

View File

@@ -393,4 +393,20 @@ public abstract class StringHelper {
return sb.toString();
}
}
/** Just converts each int in the incoming {@link IntsRef} to each byte
* in the returned {@link BytesRef}, throwing {@code IllegalArgumentException}
* if any int value is out of bounds for a byte. */
public static BytesRef intsRefToBytesRef(IntsRef ints) {
byte[] bytes = new byte[ints.length];
for(int i=0;i<ints.length;i++) {
int x = ints.ints[ints.offset+i];
if (x < 0 || x > 255) {
throw new IllegalArgumentException("int at pos=" + i + " with value=" + x + " is out-of-bounds for byte");
}
bytes[i] = (byte) x;
}
return new BytesRef(bytes);
}
}
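A short usage sketch for the new helper (values chosen for illustration): every int must fit in an unsigned byte, otherwise the call throws IllegalArgumentException.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;

IntsRefBuilder b = new IntsRefBuilder();
b.append(0x66);  // 'f'
b.append(0x6f);  // 'o'
b.append(0x6f);  // 'o'
BytesRef bytes = StringHelper.intsRefToBytesRef(b.get());
assert bytes.utf8ToString().equals("foo");

// An out-of-range value (here a full code point) is rejected:
b.clear();
b.append(0x1F600);
// StringHelper.intsRefToBytesRef(b.get());  // would throw IllegalArgumentException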

View File

@@ -347,6 +347,7 @@ public class Automaton implements Accountable {
/** How many transitions this state has. */
public int getNumTransitions(int state) {
assert state >= 0;
int count = states[2*state+1];
if (count == -1) {
return 0;

View File

@@ -24,9 +24,11 @@ import java.util.List;
import org.apache.lucene.index.SingleTermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.PrefixTermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.UnicodeUtil;
/**
* Immutable class holding compiled details for a given
@@ -47,8 +49,6 @@ public class CompiledAutomaton {
ALL,
/** Automaton that accepts only a single fixed string. */
SINGLE,
/** Automaton that matches all Strings with a constant prefix. */
PREFIX,
/** Catch-all for any other automata. */
NORMAL
};
@@ -57,8 +57,7 @@
public final AUTOMATON_TYPE type;
/**
* For {@link AUTOMATON_TYPE#PREFIX}, this is the prefix term;
* for {@link AUTOMATON_TYPE#SINGLE} this is the singleton term.
* For {@link AUTOMATON_TYPE#SINGLE} this is the singleton term.
*/
public final BytesRef term;
@@ -101,7 +100,7 @@
* possibly expensive operations to determine if the automaton is one
* the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}. */
public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) {
this(automaton, finite, simplify, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
this(automaton, finite, simplify, Operations.DEFAULT_MAX_DETERMINIZED_STATES, false);
}
@@ -114,7 +113,7 @@
* TooComplexToDeterminizeException.
*/
public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify,
int maxDeterminizedStates) {
int maxDeterminizedStates, boolean isBinary) {
if (automaton.getNumStates() == 0) {
automaton = new Automaton();
automaton.createState();
@@ -135,8 +134,18 @@
this.automaton = null;
this.finite = null;
return;
}
boolean isTotal;
// NOTE: only approximate, because automaton may not be minimal:
} else if (Operations.isTotal(automaton)) {
if (isBinary) {
isTotal = Operations.isTotal(automaton, 0, 0xff);
} else {
isTotal = Operations.isTotal(automaton);
}
if (isTotal) {
// matches all possible strings
type = AUTOMATON_TYPE.ALL;
term = null;
@@ -145,43 +154,27 @@
this.automaton = null;
this.finite = null;
return;
} else {
}
automaton = Operations.determinize(automaton, maxDeterminizedStates);
automaton = Operations.determinize(automaton, maxDeterminizedStates);
final String commonPrefix = Operations.getCommonPrefix(automaton);
final String singleton;
IntsRef singleton = Operations.getSingleton(automaton);
if (commonPrefix.length() > 0 && Operations.sameLanguage(automaton, Automata.makeString(commonPrefix))) {
singleton = commonPrefix;
if (singleton != null) {
// matches a fixed string
type = AUTOMATON_TYPE.SINGLE;
commonSuffixRef = null;
runAutomaton = null;
this.automaton = null;
this.finite = null;
if (isBinary) {
term = StringHelper.intsRefToBytesRef(singleton);
} else {
singleton = null;
term = new BytesRef(UnicodeUtil.newString(singleton.ints, singleton.offset, singleton.length));
}
if (singleton != null) {
// matches a fixed string
type = AUTOMATON_TYPE.SINGLE;
term = new BytesRef(singleton);
commonSuffixRef = null;
runAutomaton = null;
this.automaton = null;
this.finite = null;
return;
} else if (commonPrefix.length() > 0) {
Automaton other = Operations.concatenate(Automata.makeString(commonPrefix), Automata.makeAnyString());
other = Operations.determinize(other, maxDeterminizedStates);
assert Operations.hasDeadStates(other) == false;
if (Operations.sameLanguage(automaton, other)) {
// matches a constant prefix
type = AUTOMATON_TYPE.PREFIX;
term = new BytesRef(commonPrefix);
commonSuffixRef = null;
runAutomaton = null;
this.automaton = null;
this.finite = null;
return;
}
}
return;
}
}
@@ -194,14 +187,26 @@
this.finite = finite;
}
Automaton utf8 = new UTF32ToUTF8().convert(automaton);
Automaton binary;
if (isBinary) {
// Caller already built binary automaton themselves, e.g. PrefixQuery
// does this since it can be provided with a binary (not necessarily
// UTF8!) term:
binary = automaton;
} else {
// Incoming automaton is unicode, and we must convert to UTF8 to match what's in the index:
binary = new UTF32ToUTF8().convert(automaton);
}
if (this.finite) {
commonSuffixRef = null;
} else {
// NOTE: this is a very costly operation! We should test if it's really warranted in practice...
commonSuffixRef = Operations.getCommonSuffixBytesRef(utf8, maxDeterminizedStates);
commonSuffixRef = Operations.getCommonSuffixBytesRef(binary, maxDeterminizedStates);
}
runAutomaton = new ByteRunAutomaton(utf8, true, maxDeterminizedStates);
// This will determinize the binary automaton for us:
runAutomaton = new ByteRunAutomaton(binary, true, maxDeterminizedStates);
this.automaton = runAutomaton.automaton;
}
@@ -285,10 +290,6 @@ public class CompiledAutomaton {
return terms.iterator(null);
case SINGLE:
return new SingleTermsEnum(terms.iterator(null), term);
case PREFIX:
// TODO: this is very likely faster than .intersect,
// but we should test and maybe cutover
return new PrefixTermsEnum(terms.iterator(null), term);
case NORMAL:
return terms.intersect(this, null);
default:
@@ -410,7 +411,7 @@ public class CompiledAutomaton {
if (getClass() != obj.getClass()) return false;
CompiledAutomaton other = (CompiledAutomaton) obj;
if (type != other.type) return false;
if (type == AUTOMATON_TYPE.SINGLE || type == AUTOMATON_TYPE.PREFIX) {
if (type == AUTOMATON_TYPE.SINGLE) {
if (!term.equals(other.term)) return false;
} else if (type == AUTOMATON_TYPE.NORMAL) {
if (!runAutomaton.equals(other.runAutomaton)) return false;

View File

@@ -834,11 +834,20 @@ final public class Operations {
* Returns true if the given automaton accepts all strings. The automaton must be minimized.
*/
public static boolean isTotal(Automaton a) {
return isTotal(a, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
}
/**
* Returns true if the given automaton accepts all strings for the specified min/max
* range of the alphabet. The automaton must be minimized.
*/
public static boolean isTotal(Automaton a, int minAlphabet, int maxAlphabet) {
if (a.isAccept(0) && a.getNumTransitions(0) == 1) {
Transition t = new Transition();
a.getTransition(0, 0, t);
return t.dest == 0 && t.min == Character.MIN_CODE_POINT
&& t.max == Character.MAX_CODE_POINT;
return t.dest == 0
&& t.min == minAlphabet
&& t.max == maxAlphabet;
}
return false;
}
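For instance (a small sketch, essentially what the new testBinaryAll below exercises): an automaton that loops over every byte label is total for the byte alphabet but not for the full Unicode alphabet.

import org.apache.lucene.util.automaton.Automaton;

Automaton a = new Automaton();
int s = a.createState();
a.setAccept(s, true);
a.addTransition(s, s, 0, 0xff);
a.finishState();
assert Operations.isTotal(a, 0, 0xff);      // total over bytes
assert Operations.isTotal(a) == false;      // not total over code points 0..0x10FFFF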
@@ -1112,6 +1121,37 @@ final public class Operations {
return builder.get();
}
/** If this automaton accepts a single input, return it. Else, return null.
* The automaton must be deterministic. */
public static IntsRef getSingleton(Automaton a) {
if (a.isDeterministic() == false) {
throw new IllegalArgumentException("input automaton must be deterministic");
}
IntsRefBuilder builder = new IntsRefBuilder();
HashSet<Integer> visited = new HashSet<>();
int s = 0;
boolean done;
Transition t = new Transition();
while (true) {
visited.add(s);
if (a.isAccept(s) == false) {
if (a.getNumTransitions(s) == 1) {
a.getTransition(s, 0, t);
if (t.min == t.max && !visited.contains(t.dest)) {
builder.append(t.min);
s = t.dest;
continue;
}
}
} else if (a.getNumTransitions(s) == 0) {
return builder.get();
}
// Automaton accepts more than one string:
return null;
}
}
/**
* Returns the longest BytesRef that is a suffix of all accepted strings.
* Worst case complexity: exponential in number of states (this calls

View File

@@ -260,7 +260,7 @@ public class TestTermsEnum extends LuceneTestCase {
a = Automata.makeStringUnion(sortedAcceptTerms);
}
final CompiledAutomaton c = new CompiledAutomaton(a, true, false, 1000000);
final CompiledAutomaton c = new CompiledAutomaton(a, true, false, 1000000, false);
final BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.size()];
final Set<BytesRef> acceptTermsSet = new HashSet<>();

View File

@@ -195,7 +195,6 @@ public class TestAutomatonQuery extends LuceneTestCase {
Automaton prefixAutomaton = Operations.concatenate(pfx, Automata.makeAnyString());
AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton);
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), FN);
assertTrue(aq.getTermsEnum(terms) instanceof PrefixTermsEnum);
assertEquals(3, automatonQueryNrHits(aq));
}

View File

@@ -17,15 +17,29 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.document.Document;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.TestUtil;
/**
* Tests {@link PrefixQuery} class.
@@ -57,11 +71,145 @@ public class TestPrefixQuery extends LuceneTestCase {
query = new PrefixQuery(new Term("category", ""));
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "category");
assertFalse(query.getTermsEnum(terms) instanceof PrefixTermsEnum);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("everything", 3, hits.length);
writer.close();
reader.close();
directory.close();
}
public void testMatchAll() throws Exception {
Directory directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
Document doc = new Document();
doc.add(newStringField("field", "field", Field.Store.YES));
writer.addDocument(doc);
IndexReader reader = writer.getReader();
PrefixQuery query = new PrefixQuery(new Term("field", ""));
IndexSearcher searcher = newSearcher(reader);
assertEquals(1, searcher.search(query, 1000).totalHits);
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "field");
writer.close();
reader.close();
directory.close();
}
static final class BinaryTokenStream extends TokenStream {
private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);
private boolean available = true;
public BinaryTokenStream(BytesRef bytes) {
bytesAtt.setBytesRef(bytes);
}
@Override
public boolean incrementToken() {
if (available) {
clearAttributes();
available = false;
return true;
}
return false;
}
@Override
public void reset() {
available = true;
}
public interface ByteTermAttribute extends TermToBytesRefAttribute {
public void setBytesRef(BytesRef bytes);
}
public static class ByteTermAttributeImpl extends AttributeImpl implements ByteTermAttribute,TermToBytesRefAttribute {
private BytesRef bytes;
@Override
public void fillBytesRef() {
// no-op: the bytes was already filled by our owner's incrementToken
}
@Override
public BytesRef getBytesRef() {
return bytes;
}
@Override
public void setBytesRef(BytesRef bytes) {
this.bytes = bytes;
}
@Override
public void clear() {}
@Override
public void copyTo(AttributeImpl target) {
ByteTermAttributeImpl other = (ByteTermAttributeImpl) target;
other.bytes = bytes;
}
}
}
/** Basically a StringField that accepts a binary term. */
private static class BinaryField extends Field {
final static FieldType TYPE;
static {
TYPE = new FieldType(StringField.TYPE_NOT_STORED);
// Necessary so our custom tokenStream is used by Field.tokenStream:
TYPE.setTokenized(true);
TYPE.freeze();
}
public BinaryField(String name, BytesRef value) {
super(name, new BinaryTokenStream(value), TYPE);
}
}
public void testRandomBinaryPrefix() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
int numTerms = atLeast(10000);
Set<BytesRef> terms = new HashSet<>();
while (terms.size() < numTerms) {
byte[] bytes = new byte[TestUtil.nextInt(random(), 1, 10)];
random().nextBytes(bytes);
terms.add(new BytesRef(bytes));
}
List<BytesRef> termsList = new ArrayList<>(terms);
Collections.shuffle(termsList, random());
for(BytesRef term : termsList) {
Document doc = new Document();
doc.add(new BinaryField("field", term));
w.addDocument(doc);
}
IndexReader r = w.getReader();
IndexSearcher s = newSearcher(r);
int iters = atLeast(100);
for(int iter=0;iter<iters;iter++) {
byte[] bytes = new byte[random().nextInt(3)];
random().nextBytes(bytes);
BytesRef prefix = new BytesRef(bytes);
PrefixQuery q = new PrefixQuery(new Term("field", prefix));
int count = 0;
for(BytesRef term : termsList) {
if (StringHelper.startsWith(term, prefix)) {
count++;
}
}
assertEquals(count, s.search(q, 1).totalHits);
}
r.close();
w.close();
dir.close();
}
}

View File

@@ -127,11 +127,9 @@ public class TestWildcard
MultiTermQuery wq = new WildcardQuery(new Term("field", "prefix*"));
assertMatches(searcher, wq, 2);
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "field");
assertTrue(wq.getTermsEnum(terms) instanceof PrefixTermsEnum);
wq = new WildcardQuery(new Term("field", "*"));
assertMatches(searcher, wq, 2);
assertFalse(wq.getTermsEnum(terms) instanceof PrefixTermsEnum);
assertFalse(wq.getTermsEnum(terms).getClass().getSimpleName().contains("AutomatonTermsEnum"));
reader.close();
indexStore.close();

View File

@@ -1104,4 +1104,51 @@ public class TestAutomaton extends LuceneTestCase {
throw ae;
}
}
private static IntsRef toIntsRef(String s) {
IntsRefBuilder b = new IntsRefBuilder();
for (int i = 0, cp = 0; i < s.length(); i += Character.charCount(cp)) {
cp = s.codePointAt(i);
b.append(cp);
}
return b.get();
}
public void testGetSingleton() {
int iters = atLeast(10000);
for(int iter=0;iter<iters;iter++) {
String s = TestUtil.randomRealisticUnicodeString(random());
Automaton a = Automata.makeString(s);
assertEquals(toIntsRef(s), Operations.getSingleton(a));
}
}
public void testGetSingletonEmptyString() {
Automaton a = new Automaton();
int s = a.createState();
a.setAccept(s, true);
a.finishState();
assertEquals(new IntsRef(), Operations.getSingleton(a));
}
public void testGetSingletonNothing() {
Automaton a = new Automaton();
a.createState();
a.finishState();
assertNull(Operations.getSingleton(a));
}
public void testGetSingletonTwo() {
Automaton a = new Automaton();
int s = a.createState();
int x = a.createState();
a.setAccept(x, true);
a.addTransition(s, x, 55);
int y = a.createState();
a.setAccept(y, true);
a.addTransition(s, y, 58);
a.finishState();
assertNull(Operations.getSingleton(a));
}
}

View File

@@ -38,7 +38,7 @@ public class TestCompiledAutomaton extends LuceneTestCase {
}
Collections.sort(terms);
final Automaton a = DaciukMihovAutomatonBuilder.build(terms);
return new CompiledAutomaton(a, true, false, maxDeterminizedStates);
return new CompiledAutomaton(a, true, false, maxDeterminizedStates, false);
}
private void testFloor(CompiledAutomaton c, String input, String expected) {
@@ -121,4 +121,43 @@
testFloor(c, "aa", null);
testFloor(c, "zzz", "goo");
}
// LUCENE-6367
public void testBinaryAll() throws Exception {
Automaton a = new Automaton();
int state = a.createState();
a.setAccept(state, true);
a.addTransition(state, state, 0, 0xff);
a.finishState();
CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, true);
assertEquals(CompiledAutomaton.AUTOMATON_TYPE.ALL, ca.type);
}
// LUCENE-6367
public void testUnicodeAll() throws Exception {
Automaton a = new Automaton();
int state = a.createState();
a.setAccept(state, true);
a.addTransition(state, state, 0, Character.MAX_CODE_POINT);
a.finishState();
CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, false);
assertEquals(CompiledAutomaton.AUTOMATON_TYPE.ALL, ca.type);
}
// LUCENE-6367
public void testBinarySingleton() throws Exception {
// This is just ascii so we can pretend it's binary:
Automaton a = Automata.makeString("foobar");
CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, true);
assertEquals(CompiledAutomaton.AUTOMATON_TYPE.SINGLE, ca.type);
}
// LUCENE-6367
public void testUnicodeSingleton() throws Exception {
Automaton a = Automata.makeString(TestUtil.randomRealisticUnicodeString(random()));
CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, false);
assertEquals(CompiledAutomaton.AUTOMATON_TYPE.SINGLE, ca.type);
}
}

View File

@@ -1238,7 +1238,7 @@ public abstract class BasePostingsFormatTestCase extends BaseIndexFileFormatTest
for(String field : fields.keySet()) {
while (true) {
Automaton a = AutomatonTestUtil.randomAutomaton(random());
CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE);
CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, false);
if (ca.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
// Keep retrying until we get an A that will really "use" the PF's intersect code:
continue;