LUCENE-8344: TokenStreamToAutomaton doesn't ignore trailing posInc when preservePositionIncrements=false

This commit is contained in:
David Smiley 2018-06-13 23:35:44 -04:00
parent eea4197a3a
commit 228a84fd6d
4 changed files with 139 additions and 59 deletions

View File

@ -120,6 +120,13 @@ Bug Fixes:
======================= Lucene 7.4.0 =======================
Upgrading
* LUCENE-8344: If you are using the AnalyzingSuggester or FuzzySuggester subclass, and if you
explicitly use the preservePositionIncrements=false setting (not the default), then you ought
to rebuild your suggester index. If you don't, queries or indexed data with trailing position
gaps (e.g. stop words) may not work correctly. (David Smiley, Jim Ferenczi)
API Changes
* LUCENE-8242: IndexSearcher.createNormalizedWeight() has been deprecated.
@ -280,6 +287,10 @@ Bug Fixes
* LUCENE-8355: Prevent IW from opening an already dropped segment while DV updates
are written. (Nhat Nguyen via Simon Willnauer)
* LUCENE-8344: TokenStreamToAutomaton (used by some suggesters) was not ignoring a trailing
position increment when the preservePositionIncrements setting is false.
(David Smiley, Jim Ferenczi)
Other
* LUCENE-8301: Update randomizedtesting to 2.6.0. (Dawid Weiss)

View File

@ -208,14 +208,14 @@ public class TokenStreamToAutomaton {
in.end();
int endState = -1;
int endPosInc = posIncAtt.getPositionIncrement();
if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
endPosInc = 1;
} else if (endPosInc > 0 && preservePositionIncrements==false) {
endPosInc = 0;
}
int endState;
if (endPosInc > 0) {
// there were hole(s) after the last token
endState = builder.createState();

View File

@ -220,34 +220,49 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
* basic "standardanalyzer" test with stopword removal
*/
public void testStandard() throws Exception {
final String input = "the ghost of christmas past the"; // trailing stopword there just to perturb possible bugs
Input keys[] = new Input[] {
new Input("the ghost of christmas past", 50),
new Input(input, 50),
};
Directory tempDir = getDirectory();
Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", standard, standard,
AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, false);
suggester.build(new InputArrayIterator(keys));
List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
List<LookupResult> results;
// round-trip
results = suggester.lookup(TestUtil.stringToCharSequence(input, random()), false, 1);
assertEquals(1, results.size());
assertEquals("the ghost of christmas past", results.get(0).key.toString());
assertEquals(input, results.get(0).key.toString());
assertEquals(50, results.get(0).value, 0.01F);
// prefix of input stopping part way through christmas
results = suggester.lookup(TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
assertEquals(1, results.size());
assertEquals(input, results.get(0).key.toString());
assertEquals(50, results.get(0).value, 0.01F);
// omit the 'the' since it's a stopword, it's suggested anyway
results = suggester.lookup(TestUtil.stringToCharSequence("ghost of chris", random()), false, 1);
assertEquals(1, results.size());
assertEquals("the ghost of christmas past", results.get(0).key.toString());
assertEquals(input, results.get(0).key.toString());
assertEquals(50, results.get(0).value, 0.01F);
// omit the 'the' and 'of' since they are stopwords, it's suggested anyway
results = suggester.lookup(TestUtil.stringToCharSequence("ghost chris", random()), false, 1);
assertEquals(1, results.size());
assertEquals("the ghost of christmas past", results.get(0).key.toString());
assertEquals(input, results.get(0).key.toString());
assertEquals(50, results.get(0).value, 0.01F);
// trailing stopword "the"
results = suggester.lookup(TestUtil.stringToCharSequence("ghost christmas past the", random()), false, 1);
assertEquals(1, results.size());
assertEquals(input, results.get(0).key.toString());
assertEquals(50, results.get(0).value, 0.01F);
IOUtils.close(standard, tempDir);
}

View File

@ -253,71 +253,125 @@ public class TestPrefixCompletionQuery extends LuceneTestCase {
iw.close();
}
public void testAnalyzerWithoutPreservePosAndSep() throws Exception {
public void testAnalyzerDefaults() throws Exception {
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, false, false);
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer, "suggest_field_no_p_sep_or_pos_inc"));
CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
final String field = getTestName();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer, field));
Document document = new Document();
document.add(new SuggestField("suggest_field_no_p_sep_or_pos_inc", "foobar", 7));
document.add(new SuggestField("suggest_field_no_p_sep_or_pos_inc", "foo bar", 8));
document.add(new SuggestField("suggest_field_no_p_sep_or_pos_inc", "the fo", 9));
document.add(new SuggestField("suggest_field_no_p_sep_or_pos_inc", "the foo bar", 10));
document.add(new SuggestField(field, "foobar", 7));
document.add(new SuggestField(field, "foo bar", 8));
document.add(new SuggestField(field, "the fo", 9));
document.add(new SuggestField(field, "the foo bar", 10));
document.add(new SuggestField(field, "foo the bar", 11)); // middle stopword
document.add(new SuggestField(field, "baz the", 12)); // trailing stopword
iw.addDocument(document);
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
CompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_sep_or_pos_inc", "fo"));
TopSuggestDocs suggest = indexSearcher.suggest(query, 4, false); // all 4
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_sep_or_pos_inc", "foob"));
suggest = indexSearcher.suggest(query, 4, false); // not the fo
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("foo bar", 8), new Entry("foobar", 7));
CompletionQuery query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "fo"));
TopSuggestDocs suggest = indexSearcher.suggest(query, 9, false); //matches all with "fo*"
assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("foo bar", 8), new Entry("foobar", 7));
// with leading stopword
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the fo")); // becomes "_ fo*"
suggest = indexSearcher.suggest(query, 9, false);
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9));
// with middle stopword
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foo the bar")); // becomes "foo _ bar*"
suggest = indexSearcher.suggest(query, 9, false);
assertSuggestions(suggest, new Entry("foo the bar", 11));
// no space
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foob"));
suggest = indexSearcher.suggest(query, 9, false);
assertSuggestions(suggest, new Entry("foobar", 7));
// surrounding stopwords
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the baz the")); // becomes "_ baz _"
suggest = indexSearcher.suggest(query, 4, false);
assertSuggestions(suggest);
reader.close();
iw.close();
}
public void testAnalyzerWithSepAndNoPreservePos() throws Exception {
/**
 * Checks prefix completion when the {@code CompletionAnalyzer} is constructed with its second
 * argument set to {@code false} (separators not preserved) and a randomly chosen third argument
 * (the preservePosInc choice, which the note below describes as irrelevant in this mode).
 *
 * <p>Six suggestions are indexed into the field named by {@code getTestName()}, including
 * entries with a leading ("the fo"), middle ("foo the bar") and trailing ("baz the") stopword.
 * Every query is built with the same {@code completionAnalyzer} that was used for indexing.
 * The assertions expect matches in descending weight order, and expect stopwords and spaces in
 * the query text not to prevent a match (e.g. "the baz the" still suggests "baz the").
 */
public void testAnalyzerWithoutSeparator() throws Exception {
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
//note: when we don't preserve separators, the choice of preservePosInc is irrelevant
CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, false, random().nextBoolean());
final String field = getTestName();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer, field));
Document document = new Document();
document.add(new SuggestField(field, "foobar", 7));
document.add(new SuggestField(field, "foo bar", 8));
document.add(new SuggestField(field, "the fo", 9));
document.add(new SuggestField(field, "the foo bar", 10));
document.add(new SuggestField(field, "foo the bar", 11)); // middle stopword
document.add(new SuggestField(field, "baz the", 12)); // trailing stopword
iw.addDocument(document);
// note we use the completionAnalyzer with the queries (instead of input analyzer) because of non-default settings
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
CompletionQuery query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "fo"));
TopSuggestDocs suggest = indexSearcher.suggest(query, 9, false); //matches all with fo
assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
// with leading stopword
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the fo")); // becomes "fo*"
suggest = indexSearcher.suggest(query, 9, false);
assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
// with middle stopword
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foo the bar")); // becomes "foobar*"
suggest = indexSearcher.suggest(query, 9, false);
assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("foo bar", 8), new Entry("foobar", 7));
// no space
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foob"));
suggest = indexSearcher.suggest(query, 9, false); // no separators, thus match several
assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("foo bar", 8), new Entry("foobar", 7));
// surrounding stopwords
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the baz the")); // becomes "baz*"
suggest = indexSearcher.suggest(query, 4, false);// stopwords in query get removed so we match
assertSuggestions(suggest, new Entry("baz the", 12));
reader.close();
iw.close();
}
public void testAnalyzerNoPreservePosInc() throws Exception {
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, true, false);
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer, "suggest_field_no_p_pos_inc"));
final String field = getTestName();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer, field));
Document document = new Document();
document.add(new SuggestField("suggest_field_no_p_pos_inc", "foobar", 7));
document.add(new SuggestField("suggest_field_no_p_pos_inc", "foo bar", 8));
document.add(new SuggestField("suggest_field_no_p_pos_inc", "the fo", 9));
document.add(new SuggestField("suggest_field_no_p_pos_inc", "the foo bar", 10));
document.add(new SuggestField(field, "foobar", 7));
document.add(new SuggestField(field, "foo bar", 8));
document.add(new SuggestField(field, "the fo", 9));
document.add(new SuggestField(field, "the foo bar", 10));
document.add(new SuggestField(field, "foo the bar", 11)); // middle stopword
document.add(new SuggestField(field, "baz the", 12)); // trailing stopword
iw.addDocument(document);
// note we use the completionAnalyzer with the queries (instead of input analyzer) because of non-default settings
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
CompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_pos_inc", "fo"));
TopSuggestDocs suggest = indexSearcher.suggest(query, 4, false); //matches all 4
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_pos_inc", "foob"));
suggest = indexSearcher.suggest(query, 4, false); // only foobar
CompletionQuery query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "fo"));
TopSuggestDocs suggest = indexSearcher.suggest(query, 9, false); //matches all with fo
assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
// with leading stopword
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the fo")); // becomes "fo*"
suggest = indexSearcher.suggest(query, 9, false);
assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
// with middle stopword
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foo the bar")); // becomes "foo bar*"
suggest = indexSearcher.suggest(query, 9, false);
assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("foo bar", 8)); // no foobar
// no space
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foob"));
suggest = indexSearcher.suggest(query, 4, false); // separators, thus only match "foobar"
assertSuggestions(suggest, new Entry("foobar", 7));
reader.close();
iw.close();
}
public void testAnalyzerWithPreservePosAndNoSep() throws Exception {
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, false, true);
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer, "suggest_field_no_p_sep"));
Document document = new Document();
document.add(new SuggestField("suggest_field_no_p_sep", "foobar", 7));
document.add(new SuggestField("suggest_field_no_p_sep", "foo bar", 8));
document.add(new SuggestField("suggest_field_no_p_sep", "the fo", 9));
document.add(new SuggestField("suggest_field_no_p_sep", "the foo bar", 10));
iw.addDocument(document);
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
CompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_sep", "fo"));
TopSuggestDocs suggest = indexSearcher.suggest(query, 4, false); // matches all 4
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_sep", "foob"));
suggest = indexSearcher.suggest(query, 4, false); // except the fo
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("foo bar", 8), new Entry("foobar", 7));
// surrounding stopwords
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the baz the")); // becomes "baz*"
suggest = indexSearcher.suggest(query, 4, false);// stopwords in query get removed so we match
assertSuggestions(suggest, new Entry("baz the", 12));
reader.close();
iw.close();
}