mirror of https://github.com/apache/lucene.git
LUCENE-6801: Clarify javadocs of PhraseQuery, MultiPhraseQuery RE terms at same position. Add test.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1715908 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5b0b4194a1
commit
60bb927256
|
@ -375,6 +375,10 @@ Other
|
|||
* LUCENE-6902: Don't retry to fsync files / directories; fail
|
||||
immediately. (Daniel Mitterdorfer, Uwe Schindler)
|
||||
|
||||
* LUCENE-6801: Clarify JavaDocs of PhraseQuery that it in fact supports terms
|
||||
at the same position (as does MultiPhraseQuery), treated like a conjunction.
|
||||
Added test. (David Smiley, Adrien Grand)
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-6732: Improve checker for invalid source patterns to also
|
||||
|
|
|
@ -37,18 +37,19 @@ import org.apache.lucene.util.BytesRef;
|
|||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
/**
|
||||
* MultiPhraseQuery is a generalized version of PhraseQuery, with an added
|
||||
* method {@link #add(Term[])}.
|
||||
* To use this class, to search for the phrase "Microsoft app*" first use
|
||||
* add(Term) on the term "Microsoft", then find all terms that have "app" as
|
||||
* prefix using IndexReader.terms(Term), and use MultiPhraseQuery.add(Term[]
|
||||
* terms) to add them to the query.
|
||||
*
|
||||
* A generalized version of {@link PhraseQuery}, with an added
|
||||
* method {@link #add(Term[])} for adding more than one term at the same position
|
||||
* that are treated as a disjunction (OR).
|
||||
* To use this class to search for the phrase "Microsoft app*" first use
|
||||
* {@link #add(Term)} on the term "microsoft" (assuming lowercase analysis), then
|
||||
* find all terms that have "app" as prefix using {@link LeafReader#terms(String)},
|
||||
* seeking to "app" then iterating and collecting terms until there is no longer
|
||||
* that prefix, and finally use {@link #add(Term[])} to add them to the query.
|
||||
*/
|
||||
public class MultiPhraseQuery extends Query {
|
||||
private String field;
|
||||
private ArrayList<Term[]> termArrays = new ArrayList<>();
|
||||
private ArrayList<Integer> positions = new ArrayList<>();
|
||||
private String field;// becomes non-null on first add() then is unmodified
|
||||
private final ArrayList<Term[]> termArrays = new ArrayList<>();
|
||||
private final ArrayList<Integer> positions = new ArrayList<>();
|
||||
|
||||
private int slop = 0;
|
||||
|
||||
|
@ -72,38 +73,41 @@ public class MultiPhraseQuery extends Query {
|
|||
public void add(Term term) { add(new Term[]{term}); }
|
||||
|
||||
/** Add multiple terms at the next position in the phrase. Any of the terms
|
||||
* may match.
|
||||
* may match (a disjunction).
|
||||
* The array is not copied or mutated, the caller should consider it
|
||||
* immutable subsequent to calling this method.
|
||||
*/
|
||||
public void add(Term[] terms) {
|
||||
int position = 0;
|
||||
if (positions.size() > 0)
|
||||
position = positions.get(positions.size()-1).intValue() + 1;
|
||||
position = positions.get(positions.size() - 1) + 1;
|
||||
|
||||
add(terms, position);
|
||||
}
|
||||
|
||||
/**
|
||||
* Allows to specify the relative position of terms within the phrase.
|
||||
* The array is not copied or mutated, the caller should consider it
|
||||
* immutable subsequent to calling this method.
|
||||
*/
|
||||
public void add(Term[] terms, int position) {
|
||||
Objects.requireNonNull(terms, "Term array must not be null");
|
||||
if (termArrays.size() == 0)
|
||||
field = terms[0].field();
|
||||
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
if (!terms[i].field().equals(field)) {
|
||||
for (Term term : terms) {
|
||||
if (!term.field().equals(field)) {
|
||||
throw new IllegalArgumentException(
|
||||
"All phrase terms must be in the same field (" + field + "): "
|
||||
+ terms[i]);
|
||||
"All phrase terms must be in the same field (" + field + "): " + term);
|
||||
}
|
||||
}
|
||||
|
||||
termArrays.add(terms);
|
||||
positions.add(Integer.valueOf(position));
|
||||
positions.add(position);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a List of the terms in the multiphrase.
|
||||
* Returns a List of the terms in the multi-phrase.
|
||||
* Do not modify the List or its contents.
|
||||
*/
|
||||
public List<Term[]> getTermArrays() {
|
||||
|
@ -116,7 +120,7 @@ public class MultiPhraseQuery extends Query {
|
|||
public int[] getPositions() {
|
||||
int[] result = new int[positions.size()];
|
||||
for (int i = 0; i < positions.size(); i++)
|
||||
result[i] = positions.get(i).intValue();
|
||||
result[i] = positions.get(i);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -154,9 +158,7 @@ public class MultiPhraseQuery extends Query {
|
|||
@Override
|
||||
public void extractTerms(Set<Term> terms) {
|
||||
for (final Term[] arr : termArrays) {
|
||||
for (final Term term: arr) {
|
||||
terms.add(term);
|
||||
}
|
||||
Collections.addAll(terms, arr);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -184,7 +186,8 @@ public class MultiPhraseQuery extends Query {
|
|||
|
||||
// TODO: move this check to createWeight to happen earlier to the user?
|
||||
if (fieldTerms.hasPositions() == false) {
|
||||
throw new IllegalStateException("field \"" + field + "\" was indexed without position data; cannot run MultiPhraseQuery (phrase=" + getQuery() + ")");
|
||||
throw new IllegalStateException("field \"" + field + "\" was indexed without position data;" +
|
||||
" cannot run MultiPhraseQuery (phrase=" + getQuery() + ")");
|
||||
}
|
||||
|
||||
// Reuse single TermsEnum below:
|
||||
|
@ -263,8 +266,8 @@ public class MultiPhraseQuery extends Query {
|
|||
Term[] terms = termArrays.get(0);
|
||||
BooleanQuery.Builder builder = new BooleanQuery.Builder();
|
||||
builder.setDisableCoord(true);
|
||||
for (int i=0; i<terms.length; i++) {
|
||||
builder.add(new TermQuery(terms[i]), BooleanClause.Occur.SHOULD);
|
||||
for (Term term : terms) {
|
||||
builder.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
return builder.build();
|
||||
} else {
|
||||
|
|
|
@ -46,7 +46,11 @@ import org.apache.lucene.util.BytesRef;
|
|||
*
|
||||
* <p>This query may be combined with other terms or queries with a {@link BooleanQuery}.
|
||||
*
|
||||
* <b>NOTE</b>: Leading holes don't have any particular meaning for this query
|
||||
* <p><b>NOTE</b>:
|
||||
* All terms in the phrase must match, even those at the same position. If you
|
||||
* have terms at the same position, perhaps synonyms, you probably want {@link MultiPhraseQuery}
|
||||
* instead which only requires one term at a position to match.
|
||||
* <br >Also, Leading holes don't have any particular meaning for this query
|
||||
* and will be ignored. For instance this query:
|
||||
* <pre class="prettyprint">
|
||||
* PhraseQuery.Builder builder = new PhraseQuery.Builder();
|
||||
|
@ -97,10 +101,12 @@ public class PhraseQuery extends Query {
|
|||
|
||||
/**
|
||||
* Adds a term to the end of the query phrase.
|
||||
* The relative position of the term within the phrase is specified explicitly.
|
||||
* This allows e.g. phrases with more than one term at the same position
|
||||
* or phrases with gaps (e.g. in connection with stopwords).
|
||||
*
|
||||
* The relative position of the term within the phrase is specified explicitly, but must be greater than
|
||||
* or equal to that of the previously added term.
|
||||
* A greater position allows phrases with gaps (e.g. in connection with stopwords).
|
||||
* If the position is equal, you most likely should be using
|
||||
* {@link MultiPhraseQuery} instead which only requires one term at each position to match; this class requires
|
||||
* all of them.
|
||||
*/
|
||||
public Builder add(Term term, int position) {
|
||||
if (position < 0) {
|
||||
|
@ -265,7 +271,7 @@ public class PhraseQuery extends Query {
|
|||
@Override
|
||||
public Query rewrite(IndexReader reader) throws IOException {
|
||||
if (terms.length == 0) {
|
||||
return new MatchNoDocsQuery();
|
||||
return new MatchNoDocsQuery();
|
||||
} else if (terms.length == 1) {
|
||||
return new TermQuery(terms[0]);
|
||||
} else if (positions[0] != 0) {
|
||||
|
|
|
@ -23,19 +23,23 @@ import java.util.List;
|
|||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CannedTokenStream;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.MockTokenFilter;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.similarities.ClassicSimilarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.junit.AfterClass;
|
||||
|
@ -549,6 +553,52 @@ public class TestPhraseQuery extends LuceneTestCase {
|
|||
assertTrue(rewritten instanceof TermQuery);
|
||||
}
|
||||
|
||||
/** Tests PhraseQuery with terms at the same position in the query. */
|
||||
public void testZeroPosIncr() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
final Token[] tokens = new Token[3];
|
||||
tokens[0] = new Token();
|
||||
tokens[0].append("a");
|
||||
tokens[0].setPositionIncrement(1);
|
||||
tokens[1] = new Token();
|
||||
tokens[1].append("aa");
|
||||
tokens[1].setPositionIncrement(0);
|
||||
tokens[2] = new Token();
|
||||
tokens[2].append("b");
|
||||
tokens[2].setPositionIncrement(1);
|
||||
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
|
||||
Document doc = new Document();
|
||||
doc.add(new TextField("field", new CannedTokenStream(tokens)));
|
||||
writer.addDocument(doc);
|
||||
IndexReader r = writer.getReader();
|
||||
writer.close();
|
||||
IndexSearcher searcher = newSearcher(r);
|
||||
|
||||
// Sanity check; simple "a b" phrase:
|
||||
PhraseQuery.Builder pqBuilder = new PhraseQuery.Builder();
|
||||
pqBuilder.add(new Term("field", "a"), 0);
|
||||
pqBuilder.add(new Term("field", "b"), 1);
|
||||
assertEquals(1, searcher.search(pqBuilder.build(), 1).totalHits);
|
||||
|
||||
// Now with "a|aa b"
|
||||
pqBuilder = new PhraseQuery.Builder();
|
||||
pqBuilder.add(new Term("field", "a"), 0);
|
||||
pqBuilder.add(new Term("field", "aa"), 0);
|
||||
pqBuilder.add(new Term("field", "b"), 1);
|
||||
assertEquals(1, searcher.search(pqBuilder.build(), 1).totalHits);
|
||||
|
||||
// Now with "a|z b" which should not match; this isn't a MultiPhraseQuery
|
||||
pqBuilder = new PhraseQuery.Builder();
|
||||
pqBuilder.add(new Term("field", "a"), 0);
|
||||
pqBuilder.add(new Term("field", "z"), 0);
|
||||
pqBuilder.add(new Term("field", "b"), 1);
|
||||
assertEquals(0, searcher.search(pqBuilder.build(), 1).totalHits);
|
||||
|
||||
r.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testRandomPhrases() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
|
|
Loading…
Reference in New Issue