LUCENE-8344: TokenStreamToAutomaton doesn't ignore trailing posInc when preservePositionIncrements=false

2018-06-13 23:35:44 -04:00 · 2018-06-13 23:35:44 -04:00 · 228a84fd6d
parent eea4197a3a
commit 228a84fd6d
4 changed files with 139 additions and 59 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -120,6 +120,13 @@ Bug Fixes:

 ======================= Lucene 7.4.0 =======================

+Upgrading
+
+* LUCENE-8344: If you are using AnalyzingSuggester (or its FuzzySuggester subclass) and you
+  explicitly set preservePositionIncrements=false (not the default), then you should rebuild
+  your suggester index. If you don't, queries or indexed data with trailing position gaps
+  (e.g. trailing stop words) may not work correctly. (David Smiley, Jim Ferenczi)
+
 API Changes

 * LUCENE-8242: IndexSearcher.createNormalizedWeight() has been deprecated.
@@ -280,6 +287,10 @@ Bug Fixes
 * LUCENE-8355: Prevent IW from opening an already dropped segment while DV updates
  are written. (Nhat Nguyen via Simon Willnauer)

+* LUCENE-8344: TokenStreamToAutomaton (used by some suggesters) was not ignoring a trailing
+  position increment when the preservePositionIncrements setting is false.
+  (David Smiley, Jim Ferenczi)
+
 Other

 * LUCENE-8301: Update randomizedtesting to 2.6.0. (Dawid Weiss)
--- a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
@@ -208,14 +208,14 @@ public class TokenStreamToAutomaton {

    in.end();

-    int endState = -1;
-
    int endPosInc = posIncAtt.getPositionIncrement();
-
    if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
      endPosInc = 1;
+    } else if (endPosInc > 0 && preservePositionIncrements==false) {
+      endPosInc = 0;
    }
-    
+
+    int endState;
    if (endPosInc > 0) {
      // there were hole(s) after the last token
      endState = builder.createState();
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
@@ -220,34 +220,49 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
   * basic "standardanalyzer" test with stopword removal
   */
  public void testStandard() throws Exception {
+    final String input = "the ghost of christmas past the"; // trailing stopword there just to perturb possible bugs
    Input keys[] = new Input[] {
-        new Input("the ghost of christmas past", 50),
+        new Input(input, 50),
    };
-    
+
    Directory tempDir = getDirectory();
    Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
    AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", standard, standard, 
        AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, false);

    suggester.build(new InputArrayIterator(keys));
-    
-    List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
+    List<LookupResult> results;
+
+    // round-trip
+    results = suggester.lookup(TestUtil.stringToCharSequence(input, random()), false, 1);
    assertEquals(1, results.size());
-    assertEquals("the ghost of christmas past", results.get(0).key.toString());
+    assertEquals(input, results.get(0).key.toString());
+    assertEquals(50, results.get(0).value, 0.01F);
+
+    // prefix of input stopping part way through christmas
+    results = suggester.lookup(TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
+    assertEquals(1, results.size());
+    assertEquals(input, results.get(0).key.toString());
    assertEquals(50, results.get(0).value, 0.01F);

    // omit the 'the' since it's a stopword, it's suggested anyway
    results = suggester.lookup(TestUtil.stringToCharSequence("ghost of chris", random()), false, 1);
    assertEquals(1, results.size());
-    assertEquals("the ghost of christmas past", results.get(0).key.toString());
+    assertEquals(input, results.get(0).key.toString());
    assertEquals(50, results.get(0).value, 0.01F);

    // omit the 'the' and 'of' since they are stopwords, it's suggested anyway
    results = suggester.lookup(TestUtil.stringToCharSequence("ghost chris", random()), false, 1);
    assertEquals(1, results.size());
-    assertEquals("the ghost of christmas past", results.get(0).key.toString());
+    assertEquals(input, results.get(0).key.toString());
    assertEquals(50, results.get(0).value, 0.01F);
-    
+
+    // trailing stopword "the"
+    results = suggester.lookup(TestUtil.stringToCharSequence("ghost christmas past the", random()), false, 1);
+    assertEquals(1, results.size());
+    assertEquals(input, results.get(0).key.toString());
+    assertEquals(50, results.get(0).value, 0.01F);
+
    IOUtils.close(standard, tempDir);
  }

--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestPrefixCompletionQuery.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestPrefixCompletionQuery.java
@@ -253,71 +253,125 @@ public class TestPrefixCompletionQuery extends LuceneTestCase {
    iw.close();
  }

-  public void testAnalyzerWithoutPreservePosAndSep() throws Exception {
+  public void testAnalyzerDefaults() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
-    CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, false, false);
-    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer, "suggest_field_no_p_sep_or_pos_inc"));
+    CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
+    final String field = getTestName();
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer, field));
    Document document = new Document();
-    document.add(new SuggestField("suggest_field_no_p_sep_or_pos_inc", "foobar", 7));
-    document.add(new SuggestField("suggest_field_no_p_sep_or_pos_inc", "foo bar", 8));
-    document.add(new SuggestField("suggest_field_no_p_sep_or_pos_inc", "the fo", 9));
-    document.add(new SuggestField("suggest_field_no_p_sep_or_pos_inc", "the foo bar", 10));
+    document.add(new SuggestField(field, "foobar", 7));
+    document.add(new SuggestField(field, "foo bar", 8));
+    document.add(new SuggestField(field, "the fo", 9));
+    document.add(new SuggestField(field, "the foo bar", 10));
+    document.add(new SuggestField(field, "foo the bar", 11)); // middle stopword
+    document.add(new SuggestField(field, "baz the", 12)); // trailing stopword
+
    iw.addDocument(document);

    DirectoryReader reader = iw.getReader();
    SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
-    CompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_sep_or_pos_inc", "fo"));
-    TopSuggestDocs suggest = indexSearcher.suggest(query, 4, false); // all 4
-    assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
-    query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_sep_or_pos_inc", "foob"));
-    suggest = indexSearcher.suggest(query, 4, false); // not the fo
-    assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("foo bar", 8), new Entry("foobar", 7));
+    CompletionQuery query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "fo"));
+    TopSuggestDocs suggest = indexSearcher.suggest(query, 9, false); //matches all with "fo*"
+    assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("foo bar", 8), new Entry("foobar", 7));
+    // with leading stopword
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the fo")); // becomes "_ fo*"
+    suggest = indexSearcher.suggest(query, 9, false);
+    assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9));
+    // with middle stopword
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foo the bar")); // becomes "foo _ bar*"
+    suggest = indexSearcher.suggest(query, 9, false);
+    assertSuggestions(suggest, new Entry("foo the bar", 11));
+    // no space
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foob"));
+    suggest = indexSearcher.suggest(query, 9, false);
+    assertSuggestions(suggest, new Entry("foobar", 7));
+    // surrounding stopwords
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the baz the")); // becomes "_ baz _"
+    suggest = indexSearcher.suggest(query, 4, false);
+    assertSuggestions(suggest);
    reader.close();
    iw.close();
  }

-  public void testAnalyzerWithSepAndNoPreservePos() throws Exception {
+  public void testAnalyzerWithoutSeparator() throws Exception {
+    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
+    //note: when we don't preserve separators, the choice of preservePosInc is irrelevant
+    CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, false, random().nextBoolean());
+    final String field = getTestName();
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer, field));
+    Document document = new Document();
+    document.add(new SuggestField(field, "foobar", 7));
+    document.add(new SuggestField(field, "foo bar", 8));
+    document.add(new SuggestField(field, "the fo", 9));
+    document.add(new SuggestField(field, "the foo bar", 10));
+    document.add(new SuggestField(field, "foo the bar", 11)); // middle stopword
+    document.add(new SuggestField(field, "baz the", 12)); // trailing stopword
+
+    iw.addDocument(document);
+
+    // note we use the completionAnalyzer with the queries (instead of input analyzer) because of non-default settings
+    DirectoryReader reader = iw.getReader();
+    SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
+    CompletionQuery query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "fo"));
+    TopSuggestDocs suggest = indexSearcher.suggest(query, 9, false); //matches all with fo
+    assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
+    // with leading stopword
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the fo")); // becomes "fo*"
+    suggest = indexSearcher.suggest(query, 9, false);
+    assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
+    // with middle stopword
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foo the bar")); // becomes "foobar*"
+    suggest = indexSearcher.suggest(query, 9, false);
+    assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("foo bar", 8), new Entry("foobar", 7));
+    // no space
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foob"));
+    suggest = indexSearcher.suggest(query, 9, false); // no separators, thus match several
+    assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("foo bar", 8), new Entry("foobar", 7));
+    // surrounding stopwords
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the baz the")); // becomes "baz*"
+    suggest = indexSearcher.suggest(query, 4, false);// stopwords in query get removed so we match
+    assertSuggestions(suggest, new Entry("baz the", 12));
+    reader.close();
+    iw.close();
+  }
+
+  public void testAnalyzerNoPreservePosInc() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
    CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, true, false);
-    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer, "suggest_field_no_p_pos_inc"));
+    final String field = getTestName();
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer, field));
    Document document = new Document();
-    document.add(new SuggestField("suggest_field_no_p_pos_inc", "foobar", 7));
-    document.add(new SuggestField("suggest_field_no_p_pos_inc", "foo bar", 8));
-    document.add(new SuggestField("suggest_field_no_p_pos_inc", "the fo", 9));
-    document.add(new SuggestField("suggest_field_no_p_pos_inc", "the foo bar", 10));
+    document.add(new SuggestField(field, "foobar", 7));
+    document.add(new SuggestField(field, "foo bar", 8));
+    document.add(new SuggestField(field, "the fo", 9));
+    document.add(new SuggestField(field, "the foo bar", 10));
+    document.add(new SuggestField(field, "foo the bar", 11)); // middle stopword
+    document.add(new SuggestField(field, "baz the", 12)); // trailing stopword
+
    iw.addDocument(document);

+    // note we use the completionAnalyzer with the queries (instead of input analyzer) because of non-default settings
    DirectoryReader reader = iw.getReader();
    SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
-    CompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_pos_inc", "fo"));
-    TopSuggestDocs suggest = indexSearcher.suggest(query, 4, false); //matches all 4
-    assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
-    query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_pos_inc", "foob"));
-    suggest = indexSearcher.suggest(query, 4, false); // only foobar
+    CompletionQuery query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "fo"));
+    TopSuggestDocs suggest = indexSearcher.suggest(query, 9, false); //matches all with fo
+    assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
+    // with leading stopword
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the fo")); // becomes "fo*"
+    suggest = indexSearcher.suggest(query, 9, false);
+    assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
+    // with middle stopword
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foo the bar")); // becomes "foo bar*"
+    suggest = indexSearcher.suggest(query, 9, false);
+    assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("foo bar", 8)); // no foobar
+    // no space
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foob"));
+    suggest = indexSearcher.suggest(query, 4, false); // separators, thus only match "foobar"
    assertSuggestions(suggest, new Entry("foobar", 7));
-    reader.close();
-    iw.close();
-  }
-
-  public void testAnalyzerWithPreservePosAndNoSep() throws Exception {
-    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
-    CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, false, true);
-    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer, "suggest_field_no_p_sep"));
-    Document document = new Document();
-    document.add(new SuggestField("suggest_field_no_p_sep", "foobar", 7));
-    document.add(new SuggestField("suggest_field_no_p_sep", "foo bar", 8));
-    document.add(new SuggestField("suggest_field_no_p_sep", "the fo", 9));
-    document.add(new SuggestField("suggest_field_no_p_sep", "the foo bar", 10));
-    iw.addDocument(document);
-
-    DirectoryReader reader = iw.getReader();
-    SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
-    CompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_sep", "fo"));
-    TopSuggestDocs suggest = indexSearcher.suggest(query, 4, false); // matches all 4
-    assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
-    query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_sep", "foob"));
-    suggest = indexSearcher.suggest(query, 4, false); // except the fo
-    assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("foo bar", 8), new Entry("foobar", 7));
+    // surrounding stopwords
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the baz the")); // becomes "baz*"
+    suggest = indexSearcher.suggest(query, 4, false);// stopwords in query get removed so we match
+    assertSuggestions(suggest, new Entry("baz the", 12));
    reader.close();
    iw.close();
  }