From 60bb927256534e446b4fb75f13c0b8972df4b5be Mon Sep 17 00:00:00 2001 From: David Wayne Smiley Date: Mon, 23 Nov 2015 18:17:08 +0000 Subject: [PATCH] LUCENE-6801: Clarify javadocs of PhraseQuery, MultiPhraseQuery RE terms at same position. Add test. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1715908 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 4 ++ .../lucene/search/MultiPhraseQuery.java | 53 ++++++++++--------- .../org/apache/lucene/search/PhraseQuery.java | 18 ++++--- .../apache/lucene/search/TestPhraseQuery.java | 50 +++++++++++++++++ 4 files changed, 94 insertions(+), 31 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 9c2e886937d..00019fda2be 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -375,6 +375,10 @@ Other * LUCENE-6902: Don't retry to fsync files / directories; fail immediately. (Daniel Mitterdorfer, Uwe Schindler) +* LUCENE-6801: Clarify JavaDocs of PhraseQuery that it in fact supports terms + at the same position (as does MultiPhraseQuery), treated like a conjunction. + Added test. (David Smiley, Adrien Grand) + Build * LUCENE-6732: Improve checker for invalid source patterns to also diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java index f36c1763de3..53ea464d003 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java @@ -37,18 +37,19 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.PriorityQueue; /** - * MultiPhraseQuery is a generalized version of PhraseQuery, with an added - * method {@link #add(Term[])}. - * To use this class, to search for the phrase "Microsoft app*" first use - * add(Term) on the term "Microsoft", then find all terms that have "app" as - * prefix using IndexReader.terms(Term), and use MultiPhraseQuery.add(Term[] - * terms) to add them to the query. - * + * A generalized version of {@link PhraseQuery}, with an added + * method {@link #add(Term[])} for adding more than one term at the same position + * that are treated as a disjunction (OR). + * To use this class to search for the phrase "Microsoft app*" first use + * {@link #add(Term)} on the term "microsoft" (assuming lowercase analysis), then + * find all terms that have "app" as prefix using {@link LeafReader#terms(String)}, + * seeking to "app" then iterating and collecting terms until there is no longer + * that prefix, and finally use {@link #add(Term[])} to add them to the query. */ public class MultiPhraseQuery extends Query { - private String field; - private ArrayList termArrays = new ArrayList<>(); - private ArrayList positions = new ArrayList<>(); + private String field;// becomes non-null on first add() then is unmodified + private final ArrayList termArrays = new ArrayList<>(); + private final ArrayList positions = new ArrayList<>(); private int slop = 0; @@ -72,38 +73,41 @@ public class MultiPhraseQuery extends Query { public void add(Term term) { add(new Term[]{term}); } /** Add multiple terms at the next position in the phrase. Any of the terms - * may match. + * may match (a disjunction). + * The array is not copied or mutated, the caller should consider it + * immutable subsequent to calling this method. */ public void add(Term[] terms) { int position = 0; if (positions.size() > 0) - position = positions.get(positions.size()-1).intValue() + 1; + position = positions.get(positions.size() - 1) + 1; add(terms, position); } /** * Allows to specify the relative position of terms within the phrase. + * The array is not copied or mutated, the caller should consider it + * immutable subsequent to calling this method. */ public void add(Term[] terms, int position) { Objects.requireNonNull(terms, "Term array must not be null"); if (termArrays.size() == 0) field = terms[0].field(); - for (int i = 0; i < terms.length; i++) { - if (!terms[i].field().equals(field)) { + for (Term term : terms) { + if (!term.field().equals(field)) { throw new IllegalArgumentException( - "All phrase terms must be in the same field (" + field + "): " - + terms[i]); + "All phrase terms must be in the same field (" + field + "): " + term); } } termArrays.add(terms); - positions.add(Integer.valueOf(position)); + positions.add(position); } /** - * Returns a List of the terms in the multiphrase. + * Returns a List of the terms in the multi-phrase. * Do not modify the List or its contents. */ public List getTermArrays() { @@ -116,7 +120,7 @@ public class MultiPhraseQuery extends Query { public int[] getPositions() { int[] result = new int[positions.size()]; for (int i = 0; i < positions.size(); i++) - result[i] = positions.get(i).intValue(); + result[i] = positions.get(i); return result; } @@ -154,9 +158,7 @@ public class MultiPhraseQuery extends Query { @Override public void extractTerms(Set terms) { for (final Term[] arr : termArrays) { - for (final Term term: arr) { - terms.add(term); - } + Collections.addAll(terms, arr); } } @@ -184,7 +186,8 @@ public class MultiPhraseQuery extends Query { // TODO: move this check to createWeight to happen earlier to the user? if (fieldTerms.hasPositions() == false) { - throw new IllegalStateException("field \"" + field + "\" was indexed without position data; cannot run MultiPhraseQuery (phrase=" + getQuery() + ")"); + throw new IllegalStateException("field \"" + field + "\" was indexed without position data;" + + " cannot run MultiPhraseQuery (phrase=" + getQuery() + ")"); } // Reuse single TermsEnum below: @@ -263,8 +266,8 @@ public class MultiPhraseQuery extends Query { Term[] terms = termArrays.get(0); BooleanQuery.Builder builder = new BooleanQuery.Builder(); builder.setDisableCoord(true); - for (int i=0; iThis query may be combined with other terms or queries with a {@link BooleanQuery}. * - * NOTE: Leading holes don't have any particular meaning for this query + *

NOTE: + * All terms in the phrase must match, even those at the same position. If you + * have terms at the same position, perhaps synonyms, you probably want {@link MultiPhraseQuery} + * instead which only requires one term at a position to match. + *
Also, Leading holes don't have any particular meaning for this query * and will be ignored. For instance this query: *

  * PhraseQuery.Builder builder = new PhraseQuery.Builder();
@@ -97,10 +101,12 @@ public class PhraseQuery extends Query {
 
     /**
      * Adds a term to the end of the query phrase.
-     * The relative position of the term within the phrase is specified explicitly.
-     * This allows e.g. phrases with more than one term at the same position
-     * or phrases with gaps (e.g. in connection with stopwords).
-     * 
+     * The relative position of the term within the phrase is specified explicitly, but must be greater than
+     * or equal to that of the previously added term.
+     * A greater position allows phrases with gaps (e.g. in connection with stopwords).
+     * If the position is equal, you most likely should be using
+     * {@link MultiPhraseQuery} instead which only requires one term at each position to match; this class requires
+     * all of them.
      */
     public Builder add(Term term, int position) {
       if (position < 0) {
@@ -265,7 +271,7 @@ public class PhraseQuery extends Query {
   @Override
   public Query rewrite(IndexReader reader) throws IOException {
     if (terms.length == 0) {
-      return new MatchNoDocsQuery();
+      return   new MatchNoDocsQuery();
     } else if (terms.length == 1) {
       return new TermQuery(terms[0]);
     } else if (positions[0] != 0) {
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java
index a5848fac1e6..28054fa70c5 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java
@@ -23,19 +23,23 @@ import java.util.List;
 import java.util.Random;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.similarities.ClassicSimilarity;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
 import org.junit.AfterClass;
@@ -549,6 +553,52 @@ public class TestPhraseQuery extends LuceneTestCase {
     assertTrue(rewritten instanceof TermQuery);
   }
 
+  /** Tests PhraseQuery with terms at the same position in the query. */
+  public void testZeroPosIncr() throws IOException {
+    Directory dir = newDirectory();
+    final Token[] tokens = new Token[3];
+    tokens[0] = new Token();
+    tokens[0].append("a");
+    tokens[0].setPositionIncrement(1);
+    tokens[1] = new Token();
+    tokens[1].append("aa");
+    tokens[1].setPositionIncrement(0);
+    tokens[2] = new Token();
+    tokens[2].append("b");
+    tokens[2].setPositionIncrement(1);
+
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+    Document doc = new Document();
+    doc.add(new TextField("field", new CannedTokenStream(tokens)));
+    writer.addDocument(doc);
+    IndexReader r = writer.getReader();
+    writer.close();
+    IndexSearcher searcher = newSearcher(r);
+
+    // Sanity check; simple "a b" phrase:
+    PhraseQuery.Builder pqBuilder = new PhraseQuery.Builder();
+    pqBuilder.add(new Term("field", "a"), 0);
+    pqBuilder.add(new Term("field", "b"), 1);
+    assertEquals(1, searcher.search(pqBuilder.build(), 1).totalHits);
+
+    // Now with "a|aa b"
+    pqBuilder = new PhraseQuery.Builder();
+    pqBuilder.add(new Term("field", "a"), 0);
+    pqBuilder.add(new Term("field", "aa"), 0);
+    pqBuilder.add(new Term("field", "b"), 1);
+    assertEquals(1, searcher.search(pqBuilder.build(), 1).totalHits);
+
+    // Now with "a|z b" which should not match; this isn't a MultiPhraseQuery
+    pqBuilder = new PhraseQuery.Builder();
+    pqBuilder.add(new Term("field", "a"), 0);
+    pqBuilder.add(new Term("field", "z"), 0);
+    pqBuilder.add(new Term("field", "b"), 1);
+    assertEquals(0, searcher.search(pqBuilder.build(), 1).totalHits);
+
+    r.close();
+    dir.close();
+  }
+
   public void testRandomPhrases() throws Exception {
     Directory dir = newDirectory();
     Analyzer analyzer = new MockAnalyzer(random());