mirror of https://github.com/apache/lucene.git
LUCENE-6367: PrefixQuery now subclasses AutomatonQuery
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1669522 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ef0209189e
commit
6a6c729920
|
@ -244,6 +244,11 @@ Changes in Runtime Behavior
|
|||
* LUCENE-6298: SimpleQueryParser returns an empty query rather than
|
||||
null, if e.g. the terms were all stopwords. (Lee Hinman via Robert Muir)
|
||||
|
||||
* LUCENE-6367: PrefixQuery now subclasses AutomatonQuery, removing the
|
||||
specialized PrefixTermsEnum. PrefixQuery now operates in binary
|
||||
term space, meaning any binary term (not just valid UTF-8 terms)
|
||||
is accepted. (Robert Muir, Mike McCandless)
|
||||
|
||||
======================= Lucene 5.0.0 =======================
|
||||
|
||||
New Features
|
||||
|
|
|
@ -78,10 +78,28 @@ public class AutomatonQuery extends MultiTermQuery {
|
|||
* space but can process more complex automata.
|
||||
*/
|
||||
public AutomatonQuery(final Term term, Automaton automaton, int maxDeterminizedStates) {
|
||||
this(term, automaton, maxDeterminizedStates, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new AutomatonQuery from an {@link Automaton}.
|
||||
*
|
||||
* @param term Term containing field and possibly some pattern structure. The
|
||||
* term text is ignored.
|
||||
* @param automaton Automaton to run, terms that are accepted are considered a
|
||||
* match.
|
||||
* @param maxDeterminizedStates maximum number of states in the resulting
|
||||
* automata. If the automata would need more than this many states
|
||||
* TooComplexToDeterminizeException is thrown. Higher numbers require more
|
||||
* space but can process more complex automata.
|
||||
* @param isBinary if true, this automaton is already binary and
|
||||
* will not go through the UTF32ToUTF8 conversion
|
||||
*/
|
||||
public AutomatonQuery(final Term term, Automaton automaton, int maxDeterminizedStates, boolean isBinary) {
|
||||
super(term.field());
|
||||
this.term = term;
|
||||
this.automaton = automaton;
|
||||
this.compiled = new CompiledAutomaton(automaton, null, true, maxDeterminizedStates);
|
||||
this.compiled = new CompiledAutomaton(automaton, null, true, maxDeterminizedStates, isBinary);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -19,11 +19,13 @@ package org.apache.lucene.search;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
|
||||
/** A Query that matches documents containing terms with a specified prefix. A PrefixQuery
|
||||
* is built by QueryParser for input like <code>app*</code>.
|
||||
|
@ -31,29 +33,38 @@ import org.apache.lucene.util.ToStringUtils;
|
|||
* <p>This query uses the {@link
|
||||
* MultiTermQuery#CONSTANT_SCORE_REWRITE}
|
||||
* rewrite method. */
|
||||
public class PrefixQuery extends MultiTermQuery {
|
||||
private Term prefix;
|
||||
public class PrefixQuery extends AutomatonQuery {
|
||||
|
||||
/** Constructs a query for terms starting with <code>prefix</code>. */
|
||||
public PrefixQuery(Term prefix) {
|
||||
super(prefix.field());
|
||||
this.prefix = prefix;
|
||||
// It's OK to pass unlimited maxDeterminizedStates: the automaton is born small and determinized:
|
||||
super(prefix, toAutomaton(prefix.bytes()), Integer.MAX_VALUE, true);
|
||||
if (prefix == null) {
|
||||
throw new NullPointerException("prefix cannot be null");
|
||||
}
|
||||
}
|
||||
|
||||
/** Build an automaton accepting all terms with the specified prefix. */
|
||||
public static Automaton toAutomaton(BytesRef prefix) {
|
||||
Automaton automaton = new Automaton();
|
||||
int lastState = automaton.createState();
|
||||
for(int i=0;i<prefix.length;i++) {
|
||||
int state = automaton.createState();
|
||||
automaton.addTransition(lastState, state, prefix.bytes[prefix.offset+i]&0xff);
|
||||
lastState = state;
|
||||
}
|
||||
automaton.setAccept(lastState, true);
|
||||
automaton.addTransition(lastState, lastState, 0, 255);
|
||||
automaton.finishState();
|
||||
assert automaton.isDeterministic();
|
||||
return automaton;
|
||||
}
|
||||
|
||||
/** Returns the prefix of this query. */
|
||||
public Term getPrefix() { return prefix; }
|
||||
|
||||
@Override
|
||||
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
|
||||
TermsEnum tenum = terms.iterator(null);
|
||||
|
||||
if (prefix.bytes().length == 0) {
|
||||
// no prefix -- match all terms for this field:
|
||||
return tenum;
|
||||
}
|
||||
return new PrefixTermsEnum(tenum, prefix.bytes());
|
||||
public Term getPrefix() {
|
||||
return term;
|
||||
}
|
||||
|
||||
|
||||
/** Prints a user-readable version of this query. */
|
||||
@Override
|
||||
public String toString(String field) {
|
||||
|
@ -62,7 +73,7 @@ public class PrefixQuery extends MultiTermQuery {
|
|||
buffer.append(getField());
|
||||
buffer.append(":");
|
||||
}
|
||||
buffer.append(prefix.text());
|
||||
buffer.append(term.text());
|
||||
buffer.append('*');
|
||||
buffer.append(ToStringUtils.boost(getBoost()));
|
||||
return buffer.toString();
|
||||
|
@ -72,25 +83,23 @@ public class PrefixQuery extends MultiTermQuery {
|
|||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = super.hashCode();
|
||||
result = prime * result + ((prefix == null) ? 0 : prefix.hashCode());
|
||||
result = prime * result + term.hashCode();
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj)
|
||||
if (this == obj) {
|
||||
return true;
|
||||
if (!super.equals(obj))
|
||||
return false;
|
||||
if (getClass() != obj.getClass())
|
||||
}
|
||||
if (!super.equals(obj)) {
|
||||
return false;
|
||||
}
|
||||
// super.equals() ensures we are the same class
|
||||
PrefixQuery other = (PrefixQuery) obj;
|
||||
if (prefix == null) {
|
||||
if (other.prefix != null)
|
||||
return false;
|
||||
} else if (!prefix.equals(other.prefix))
|
||||
if (!term.equals(other.term)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,49 +0,0 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.FilteredTermsEnum;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
|
||||
/**
|
||||
* Subclass of FilteredTermEnum for enumerating all terms that match the
|
||||
* specified prefix filter term.
|
||||
* <p>Term enumerations are always ordered by
|
||||
* {@link BytesRef#compareTo}. Each term in the enumeration is
|
||||
* greater than all that precede it.</p>
|
||||
*/
|
||||
public class PrefixTermsEnum extends FilteredTermsEnum {
|
||||
|
||||
private final BytesRef prefixRef;
|
||||
|
||||
public PrefixTermsEnum(TermsEnum tenum, BytesRef prefixText) {
|
||||
super(tenum);
|
||||
setInitialSeekTerm(this.prefixRef = prefixText);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected AcceptStatus accept(BytesRef term) {
|
||||
if (StringHelper.startsWith(term, prefixRef)) {
|
||||
return AcceptStatus.YES;
|
||||
} else {
|
||||
return AcceptStatus.END;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -393,4 +393,20 @@ public abstract class StringHelper {
|
|||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
/** Just converts each int in the incoming {@link IntsRef} to each byte
|
||||
* in the returned {@link BytesRef}, throwing {@code IllegalArgumentException}
|
||||
* if any int value is out of bounds for a byte. */
|
||||
public static BytesRef intsRefToBytesRef(IntsRef ints) {
|
||||
byte[] bytes = new byte[ints.length];
|
||||
for(int i=0;i<ints.length;i++) {
|
||||
int x = ints.ints[ints.offset+i];
|
||||
if (x < 0 || x > 255) {
|
||||
throw new IllegalArgumentException("int at pos=" + i + " with value=" + x + " is out-of-bounds for byte");
|
||||
}
|
||||
bytes[i] = (byte) x;
|
||||
}
|
||||
|
||||
return new BytesRef(bytes);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -347,6 +347,7 @@ public class Automaton implements Accountable {
|
|||
|
||||
/** How many transitions this state has. */
|
||||
public int getNumTransitions(int state) {
|
||||
assert state >= 0;
|
||||
int count = states[2*state+1];
|
||||
if (count == -1) {
|
||||
return 0;
|
||||
|
|
|
@ -24,9 +24,11 @@ import java.util.List;
|
|||
import org.apache.lucene.index.SingleTermsEnum;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.PrefixTermsEnum;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefBuilder;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
|
||||
/**
|
||||
* Immutable class holding compiled details for a given
|
||||
|
@ -47,8 +49,6 @@ public class CompiledAutomaton {
|
|||
ALL,
|
||||
/** Automaton that accepts only a single fixed string. */
|
||||
SINGLE,
|
||||
/** Automaton that matches all Strings with a constant prefix. */
|
||||
PREFIX,
|
||||
/** Catch-all for any other automata. */
|
||||
NORMAL
|
||||
};
|
||||
|
@ -57,8 +57,7 @@ public class CompiledAutomaton {
|
|||
public final AUTOMATON_TYPE type;
|
||||
|
||||
/**
|
||||
* For {@link AUTOMATON_TYPE#PREFIX}, this is the prefix term;
|
||||
* for {@link AUTOMATON_TYPE#SINGLE} this is the singleton term.
|
||||
* For {@link AUTOMATON_TYPE#SINGLE} this is the singleton term.
|
||||
*/
|
||||
public final BytesRef term;
|
||||
|
||||
|
@ -101,7 +100,7 @@ public class CompiledAutomaton {
|
|||
* possibly expensive operations to determine if the automaton is one of
|
||||
* the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}. */
|
||||
public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) {
|
||||
this(automaton, finite, simplify, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
this(automaton, finite, simplify, Operations.DEFAULT_MAX_DETERMINIZED_STATES, false);
|
||||
}
|
||||
|
||||
|
||||
|
@ -114,7 +113,7 @@ public class CompiledAutomaton {
|
|||
* TooComplexToDeterminizeException.
|
||||
*/
|
||||
public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify,
|
||||
int maxDeterminizedStates) {
|
||||
int maxDeterminizedStates, boolean isBinary) {
|
||||
if (automaton.getNumStates() == 0) {
|
||||
automaton = new Automaton();
|
||||
automaton.createState();
|
||||
|
@ -135,8 +134,18 @@ public class CompiledAutomaton {
|
|||
this.automaton = null;
|
||||
this.finite = null;
|
||||
return;
|
||||
}
|
||||
|
||||
boolean isTotal;
|
||||
|
||||
// NOTE: only approximate, because automaton may not be minimal:
|
||||
} else if (Operations.isTotal(automaton)) {
|
||||
if (isBinary) {
|
||||
isTotal = Operations.isTotal(automaton, 0, 0xff);
|
||||
} else {
|
||||
isTotal = Operations.isTotal(automaton);
|
||||
}
|
||||
|
||||
if (isTotal) {
|
||||
// matches all possible strings
|
||||
type = AUTOMATON_TYPE.ALL;
|
||||
term = null;
|
||||
|
@ -145,43 +154,27 @@ public class CompiledAutomaton {
|
|||
this.automaton = null;
|
||||
this.finite = null;
|
||||
return;
|
||||
} else {
|
||||
}
|
||||
|
||||
automaton = Operations.determinize(automaton, maxDeterminizedStates);
|
||||
automaton = Operations.determinize(automaton, maxDeterminizedStates);
|
||||
|
||||
final String commonPrefix = Operations.getCommonPrefix(automaton);
|
||||
final String singleton;
|
||||
IntsRef singleton = Operations.getSingleton(automaton);
|
||||
|
||||
if (commonPrefix.length() > 0 && Operations.sameLanguage(automaton, Automata.makeString(commonPrefix))) {
|
||||
singleton = commonPrefix;
|
||||
if (singleton != null) {
|
||||
// matches a fixed string
|
||||
type = AUTOMATON_TYPE.SINGLE;
|
||||
commonSuffixRef = null;
|
||||
runAutomaton = null;
|
||||
this.automaton = null;
|
||||
this.finite = null;
|
||||
|
||||
if (isBinary) {
|
||||
term = StringHelper.intsRefToBytesRef(singleton);
|
||||
} else {
|
||||
singleton = null;
|
||||
term = new BytesRef(UnicodeUtil.newString(singleton.ints, singleton.offset, singleton.length));
|
||||
}
|
||||
|
||||
if (singleton != null) {
|
||||
// matches a fixed string
|
||||
type = AUTOMATON_TYPE.SINGLE;
|
||||
term = new BytesRef(singleton);
|
||||
commonSuffixRef = null;
|
||||
runAutomaton = null;
|
||||
this.automaton = null;
|
||||
this.finite = null;
|
||||
return;
|
||||
} else if (commonPrefix.length() > 0) {
|
||||
Automaton other = Operations.concatenate(Automata.makeString(commonPrefix), Automata.makeAnyString());
|
||||
other = Operations.determinize(other, maxDeterminizedStates);
|
||||
assert Operations.hasDeadStates(other) == false;
|
||||
if (Operations.sameLanguage(automaton, other)) {
|
||||
// matches a constant prefix
|
||||
type = AUTOMATON_TYPE.PREFIX;
|
||||
term = new BytesRef(commonPrefix);
|
||||
commonSuffixRef = null;
|
||||
runAutomaton = null;
|
||||
this.automaton = null;
|
||||
this.finite = null;
|
||||
return;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -194,14 +187,26 @@ public class CompiledAutomaton {
|
|||
this.finite = finite;
|
||||
}
|
||||
|
||||
Automaton utf8 = new UTF32ToUTF8().convert(automaton);
|
||||
Automaton binary;
|
||||
if (isBinary) {
|
||||
// Caller already built binary automaton themselves, e.g. PrefixQuery
|
||||
// does this since it can be provided with a binary (not necessarily
|
||||
// UTF8!) term:
|
||||
binary = automaton;
|
||||
} else {
|
||||
// Incoming automaton is unicode, and we must convert to UTF8 to match what's in the index:
|
||||
binary = new UTF32ToUTF8().convert(automaton);
|
||||
}
|
||||
|
||||
if (this.finite) {
|
||||
commonSuffixRef = null;
|
||||
} else {
|
||||
// NOTE: this is a very costly operation! We should test if it's really warranted in practice...
|
||||
commonSuffixRef = Operations.getCommonSuffixBytesRef(utf8, maxDeterminizedStates);
|
||||
commonSuffixRef = Operations.getCommonSuffixBytesRef(binary, maxDeterminizedStates);
|
||||
}
|
||||
runAutomaton = new ByteRunAutomaton(utf8, true, maxDeterminizedStates);
|
||||
|
||||
// This will determinize the binary automaton for us:
|
||||
runAutomaton = new ByteRunAutomaton(binary, true, maxDeterminizedStates);
|
||||
|
||||
this.automaton = runAutomaton.automaton;
|
||||
}
|
||||
|
@ -285,10 +290,6 @@ public class CompiledAutomaton {
|
|||
return terms.iterator(null);
|
||||
case SINGLE:
|
||||
return new SingleTermsEnum(terms.iterator(null), term);
|
||||
case PREFIX:
|
||||
// TODO: this is very likely faster than .intersect,
|
||||
// but we should test and maybe cutover
|
||||
return new PrefixTermsEnum(terms.iterator(null), term);
|
||||
case NORMAL:
|
||||
return terms.intersect(this, null);
|
||||
default:
|
||||
|
@ -410,7 +411,7 @@ public class CompiledAutomaton {
|
|||
if (getClass() != obj.getClass()) return false;
|
||||
CompiledAutomaton other = (CompiledAutomaton) obj;
|
||||
if (type != other.type) return false;
|
||||
if (type == AUTOMATON_TYPE.SINGLE || type == AUTOMATON_TYPE.PREFIX) {
|
||||
if (type == AUTOMATON_TYPE.SINGLE) {
|
||||
if (!term.equals(other.term)) return false;
|
||||
} else if (type == AUTOMATON_TYPE.NORMAL) {
|
||||
if (!runAutomaton.equals(other.runAutomaton)) return false;
|
||||
|
|
|
@ -834,11 +834,20 @@ final public class Operations {
|
|||
* Returns true if the given automaton accepts all strings. The automaton must be minimized.
|
||||
*/
|
||||
public static boolean isTotal(Automaton a) {
|
||||
return isTotal(a, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the given automaton accepts all strings for the specified min/max
|
||||
* range of the alphabet. The automaton must be minimized.
|
||||
*/
|
||||
public static boolean isTotal(Automaton a, int minAlphabet, int maxAlphabet) {
|
||||
if (a.isAccept(0) && a.getNumTransitions(0) == 1) {
|
||||
Transition t = new Transition();
|
||||
a.getTransition(0, 0, t);
|
||||
return t.dest == 0 && t.min == Character.MIN_CODE_POINT
|
||||
&& t.max == Character.MAX_CODE_POINT;
|
||||
return t.dest == 0
|
||||
&& t.min == minAlphabet
|
||||
&& t.max == maxAlphabet;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
@ -1112,6 +1121,37 @@ final public class Operations {
|
|||
return builder.get();
|
||||
}
|
||||
|
||||
/** If this automaton accepts a single input, return it. Else, return null.
|
||||
* The automaton must be deterministic. */
|
||||
public static IntsRef getSingleton(Automaton a) {
|
||||
if (a.isDeterministic() == false) {
|
||||
throw new IllegalArgumentException("input automaton must be deterministic");
|
||||
}
|
||||
IntsRefBuilder builder = new IntsRefBuilder();
|
||||
HashSet<Integer> visited = new HashSet<>();
|
||||
int s = 0;
|
||||
boolean done;
|
||||
Transition t = new Transition();
|
||||
while (true) {
|
||||
visited.add(s);
|
||||
if (a.isAccept(s) == false) {
|
||||
if (a.getNumTransitions(s) == 1) {
|
||||
a.getTransition(s, 0, t);
|
||||
if (t.min == t.max && !visited.contains(t.dest)) {
|
||||
builder.append(t.min);
|
||||
s = t.dest;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else if (a.getNumTransitions(s) == 0) {
|
||||
return builder.get();
|
||||
}
|
||||
|
||||
// Automaton accepts more than one string:
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the longest BytesRef that is a suffix of all accepted strings.
|
||||
* Worst case complexity: exponential in number of states (this calls
|
||||
|
|
|
@ -260,7 +260,7 @@ public class TestTermsEnum extends LuceneTestCase {
|
|||
a = Automata.makeStringUnion(sortedAcceptTerms);
|
||||
}
|
||||
|
||||
final CompiledAutomaton c = new CompiledAutomaton(a, true, false, 1000000);
|
||||
final CompiledAutomaton c = new CompiledAutomaton(a, true, false, 1000000, false);
|
||||
|
||||
final BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.size()];
|
||||
final Set<BytesRef> acceptTermsSet = new HashSet<>();
|
||||
|
|
|
@ -195,7 +195,6 @@ public class TestAutomatonQuery extends LuceneTestCase {
|
|||
Automaton prefixAutomaton = Operations.concatenate(pfx, Automata.makeAnyString());
|
||||
AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton);
|
||||
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), FN);
|
||||
assertTrue(aq.getTermsEnum(terms) instanceof PrefixTermsEnum);
|
||||
assertEquals(3, automatonQueryNrHits(aq));
|
||||
}
|
||||
|
||||
|
|
|
@ -17,15 +17,29 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
/**
|
||||
* Tests {@link PrefixQuery} class.
|
||||
|
@ -57,11 +71,145 @@ public class TestPrefixQuery extends LuceneTestCase {
|
|||
|
||||
query = new PrefixQuery(new Term("category", ""));
|
||||
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "category");
|
||||
assertFalse(query.getTermsEnum(terms) instanceof PrefixTermsEnum);
|
||||
hits = searcher.search(query, 1000).scoreDocs;
|
||||
assertEquals("everything", 3, hits.length);
|
||||
writer.close();
|
||||
reader.close();
|
||||
directory.close();
|
||||
}
|
||||
|
||||
public void testMatchAll() throws Exception {
|
||||
Directory directory = newDirectory();
|
||||
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
||||
Document doc = new Document();
|
||||
doc.add(newStringField("field", "field", Field.Store.YES));
|
||||
writer.addDocument(doc);
|
||||
|
||||
IndexReader reader = writer.getReader();
|
||||
|
||||
PrefixQuery query = new PrefixQuery(new Term("field", ""));
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
|
||||
assertEquals(1, searcher.search(query, 1000).totalHits);
|
||||
|
||||
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "field");
|
||||
writer.close();
|
||||
reader.close();
|
||||
directory.close();
|
||||
}
|
||||
|
||||
static final class BinaryTokenStream extends TokenStream {
|
||||
private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);
|
||||
private boolean available = true;
|
||||
|
||||
public BinaryTokenStream(BytesRef bytes) {
|
||||
bytesAtt.setBytesRef(bytes);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() {
|
||||
if (available) {
|
||||
clearAttributes();
|
||||
available = false;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
available = true;
|
||||
}
|
||||
|
||||
public interface ByteTermAttribute extends TermToBytesRefAttribute {
|
||||
public void setBytesRef(BytesRef bytes);
|
||||
}
|
||||
|
||||
public static class ByteTermAttributeImpl extends AttributeImpl implements ByteTermAttribute,TermToBytesRefAttribute {
|
||||
private BytesRef bytes;
|
||||
|
||||
@Override
|
||||
public void fillBytesRef() {
|
||||
// no-op: the bytes was already filled by our owner's incrementToken
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef getBytesRef() {
|
||||
return bytes;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setBytesRef(BytesRef bytes) {
|
||||
this.bytes = bytes;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clear() {}
|
||||
|
||||
@Override
|
||||
public void copyTo(AttributeImpl target) {
|
||||
ByteTermAttributeImpl other = (ByteTermAttributeImpl) target;
|
||||
other.bytes = bytes;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Basically a StringField that accepts binary term. */
|
||||
private static class BinaryField extends Field {
|
||||
|
||||
final static FieldType TYPE;
|
||||
static {
|
||||
TYPE = new FieldType(StringField.TYPE_NOT_STORED);
|
||||
// Necessary so our custom tokenStream is used by Field.tokenStream:
|
||||
TYPE.setTokenized(true);
|
||||
TYPE.freeze();
|
||||
}
|
||||
|
||||
public BinaryField(String name, BytesRef value) {
|
||||
super(name, new BinaryTokenStream(value), TYPE);
|
||||
}
|
||||
}
|
||||
|
||||
public void testRandomBinaryPrefix() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||
|
||||
int numTerms = atLeast(10000);
|
||||
Set<BytesRef> terms = new HashSet<>();
|
||||
while (terms.size() < numTerms) {
|
||||
byte[] bytes = new byte[TestUtil.nextInt(random(), 1, 10)];
|
||||
random().nextBytes(bytes);
|
||||
terms.add(new BytesRef(bytes));
|
||||
}
|
||||
|
||||
List<BytesRef> termsList = new ArrayList<>(terms);
|
||||
Collections.shuffle(termsList, random());
|
||||
for(BytesRef term : termsList) {
|
||||
Document doc = new Document();
|
||||
doc.add(new BinaryField("field", term));
|
||||
w.addDocument(doc);
|
||||
}
|
||||
|
||||
IndexReader r = w.getReader();
|
||||
IndexSearcher s = newSearcher(r);
|
||||
|
||||
int iters = atLeast(100);
|
||||
for(int iter=0;iter<iters;iter++) {
|
||||
byte[] bytes = new byte[random().nextInt(3)];
|
||||
random().nextBytes(bytes);
|
||||
BytesRef prefix = new BytesRef(bytes);
|
||||
PrefixQuery q = new PrefixQuery(new Term("field", prefix));
|
||||
int count = 0;
|
||||
for(BytesRef term : termsList) {
|
||||
if (StringHelper.startsWith(term, prefix)) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
assertEquals(count, s.search(q, 1).totalHits);
|
||||
}
|
||||
r.close();
|
||||
w.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -127,11 +127,9 @@ public class TestWildcard
|
|||
MultiTermQuery wq = new WildcardQuery(new Term("field", "prefix*"));
|
||||
assertMatches(searcher, wq, 2);
|
||||
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "field");
|
||||
assertTrue(wq.getTermsEnum(terms) instanceof PrefixTermsEnum);
|
||||
|
||||
wq = new WildcardQuery(new Term("field", "*"));
|
||||
assertMatches(searcher, wq, 2);
|
||||
assertFalse(wq.getTermsEnum(terms) instanceof PrefixTermsEnum);
|
||||
assertFalse(wq.getTermsEnum(terms).getClass().getSimpleName().contains("AutomatonTermsEnum"));
|
||||
reader.close();
|
||||
indexStore.close();
|
||||
|
|
|
@ -1104,4 +1104,51 @@ public class TestAutomaton extends LuceneTestCase {
|
|||
throw ae;
|
||||
}
|
||||
}
|
||||
|
||||
private static IntsRef toIntsRef(String s) {
|
||||
IntsRefBuilder b = new IntsRefBuilder();
|
||||
for (int i = 0, cp = 0; i < s.length(); i += Character.charCount(cp)) {
|
||||
cp = s.codePointAt(i);
|
||||
b.append(cp);
|
||||
}
|
||||
|
||||
return b.get();
|
||||
}
|
||||
|
||||
public void testGetSingleton() {
|
||||
int iters = atLeast(10000);
|
||||
for(int iter=0;iter<iters;iter++) {
|
||||
String s = TestUtil.randomRealisticUnicodeString(random());
|
||||
Automaton a = Automata.makeString(s);
|
||||
assertEquals(toIntsRef(s), Operations.getSingleton(a));
|
||||
}
|
||||
}
|
||||
|
||||
public void testGetSingletonEmptyString() {
|
||||
Automaton a = new Automaton();
|
||||
int s = a.createState();
|
||||
a.setAccept(s, true);
|
||||
a.finishState();
|
||||
assertEquals(new IntsRef(), Operations.getSingleton(a));
|
||||
}
|
||||
|
||||
public void testGetSingletonNothing() {
|
||||
Automaton a = new Automaton();
|
||||
a.createState();
|
||||
a.finishState();
|
||||
assertNull(Operations.getSingleton(a));
|
||||
}
|
||||
|
||||
public void testGetSingletonTwo() {
|
||||
Automaton a = new Automaton();
|
||||
int s = a.createState();
|
||||
int x = a.createState();
|
||||
a.setAccept(x, true);
|
||||
a.addTransition(s, x, 55);
|
||||
int y = a.createState();
|
||||
a.setAccept(y, true);
|
||||
a.addTransition(s, y, 58);
|
||||
a.finishState();
|
||||
assertNull(Operations.getSingleton(a));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -38,7 +38,7 @@ public class TestCompiledAutomaton extends LuceneTestCase {
|
|||
}
|
||||
Collections.sort(terms);
|
||||
final Automaton a = DaciukMihovAutomatonBuilder.build(terms);
|
||||
return new CompiledAutomaton(a, true, false, maxDeterminizedStates);
|
||||
return new CompiledAutomaton(a, true, false, maxDeterminizedStates, false);
|
||||
}
|
||||
|
||||
private void testFloor(CompiledAutomaton c, String input, String expected) {
|
||||
|
@ -121,4 +121,43 @@ public class TestCompiledAutomaton extends LuceneTestCase {
|
|||
testFloor(c, "aa", null);
|
||||
testFloor(c, "zzz", "goo");
|
||||
}
|
||||
|
||||
// LUCENE-6367
|
||||
public void testBinaryAll() throws Exception {
|
||||
Automaton a = new Automaton();
|
||||
int state = a.createState();
|
||||
a.setAccept(state, true);
|
||||
a.addTransition(state, state, 0, 0xff);
|
||||
a.finishState();
|
||||
|
||||
CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, true);
|
||||
assertEquals(CompiledAutomaton.AUTOMATON_TYPE.ALL, ca.type);
|
||||
}
|
||||
|
||||
// LUCENE-6367
|
||||
public void testUnicodeAll() throws Exception {
|
||||
Automaton a = new Automaton();
|
||||
int state = a.createState();
|
||||
a.setAccept(state, true);
|
||||
a.addTransition(state, state, 0, Character.MAX_CODE_POINT);
|
||||
a.finishState();
|
||||
|
||||
CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, false);
|
||||
assertEquals(CompiledAutomaton.AUTOMATON_TYPE.ALL, ca.type);
|
||||
}
|
||||
|
||||
// LUCENE-6367
|
||||
public void testBinarySingleton() throws Exception {
|
||||
// This is just ascii so we can pretend it's binary:
|
||||
Automaton a = Automata.makeString("foobar");
|
||||
CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, true);
|
||||
assertEquals(CompiledAutomaton.AUTOMATON_TYPE.SINGLE, ca.type);
|
||||
}
|
||||
|
||||
// LUCENE-6367
|
||||
public void testUnicodeSingleton() throws Exception {
|
||||
Automaton a = Automata.makeString(TestUtil.randomRealisticUnicodeString(random()));
|
||||
CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, false);
|
||||
assertEquals(CompiledAutomaton.AUTOMATON_TYPE.SINGLE, ca.type);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1238,7 +1238,7 @@ public abstract class BasePostingsFormatTestCase extends BaseIndexFileFormatTest
|
|||
for(String field : fields.keySet()) {
|
||||
while (true) {
|
||||
Automaton a = AutomatonTestUtil.randomAutomaton(random());
|
||||
CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE);
|
||||
CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, false);
|
||||
if (ca.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
|
||||
// Keep retrying until we get an A that will really "use" the PF's intersect code:
|
||||
continue;
|
||||
|
|
Loading…
Reference in New Issue