mirror of https://github.com/apache/lucene.git
LUCENE-8344: TokenStreamToAutomaton doesn't ignore trailing posInc when preservePositionIncrements=false
This commit is contained in:
parent
eea4197a3a
commit
228a84fd6d
|
@ -120,6 +120,13 @@ Bug Fixes:
|
|||
|
||||
======================= Lucene 7.4.0 =======================
|
||||
|
||||
Upgrading
|
||||
|
||||
* LUCENE-8344: If you are using the AnalyzingSuggester or FuzzySuggester subclass, and if you
|
||||
explicitly use the preservePositionIncrements=false setting (not the default), then you ought
|
||||
to rebuild your suggester index. If you don't, queries or indexed data with trailing position
|
||||
gaps (e.g. stop words) may not work correctly. (David Smiley, Jim Ferenczi)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-8242: IndexSearcher.createNormalizedWeight() has been deprecated.
|
||||
|
@ -280,6 +287,10 @@ Bug Fixes
|
|||
* LUCENE-8355: Prevent IW from opening an already dropped segment while DV updates
|
||||
are written. (Nhat Nguyen via Simon Willnauer)
|
||||
|
||||
* LUCENE-8344: TokenStreamToAutomaton (used by some suggesters) was not ignoring a trailing
|
||||
position increment when the preservePositionIncrements setting is false.
|
||||
(David Smiley, Jim Ferenczi)
|
||||
|
||||
Other
|
||||
|
||||
* LUCENE-8301: Update randomizedtesting to 2.6.0. (Dawid Weiss)
|
||||
|
|
|
@ -208,14 +208,14 @@ public class TokenStreamToAutomaton {
|
|||
|
||||
in.end();
|
||||
|
||||
int endState = -1;
|
||||
|
||||
int endPosInc = posIncAtt.getPositionIncrement();
|
||||
|
||||
if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
|
||||
endPosInc = 1;
|
||||
} else if (endPosInc > 0 && preservePositionIncrements==false) {
|
||||
endPosInc = 0;
|
||||
}
|
||||
|
||||
|
||||
int endState;
|
||||
if (endPosInc > 0) {
|
||||
// there were hole(s) after the last token
|
||||
endState = builder.createState();
|
||||
|
|
|
@ -220,34 +220,49 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
|||
* basic "standardanalyzer" test with stopword removal
|
||||
*/
|
||||
public void testStandard() throws Exception {
|
||||
final String input = "the ghost of christmas past the"; // trailing stopword there just to perturb possible bugs
|
||||
Input keys[] = new Input[] {
|
||||
new Input("the ghost of christmas past", 50),
|
||||
new Input(input, 50),
|
||||
};
|
||||
|
||||
|
||||
Directory tempDir = getDirectory();
|
||||
Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||
AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", standard, standard,
|
||||
AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, false);
|
||||
|
||||
suggester.build(new InputArrayIterator(keys));
|
||||
|
||||
List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
|
||||
List<LookupResult> results;
|
||||
|
||||
// round-trip
|
||||
results = suggester.lookup(TestUtil.stringToCharSequence(input, random()), false, 1);
|
||||
assertEquals(1, results.size());
|
||||
assertEquals("the ghost of christmas past", results.get(0).key.toString());
|
||||
assertEquals(input, results.get(0).key.toString());
|
||||
assertEquals(50, results.get(0).value, 0.01F);
|
||||
|
||||
// prefix of input stopping part way through christmas
|
||||
results = suggester.lookup(TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
|
||||
assertEquals(1, results.size());
|
||||
assertEquals(input, results.get(0).key.toString());
|
||||
assertEquals(50, results.get(0).value, 0.01F);
|
||||
|
||||
// omit the 'the' since it's a stopword, it's suggested anyway
|
||||
results = suggester.lookup(TestUtil.stringToCharSequence("ghost of chris", random()), false, 1);
|
||||
assertEquals(1, results.size());
|
||||
assertEquals("the ghost of christmas past", results.get(0).key.toString());
|
||||
assertEquals(input, results.get(0).key.toString());
|
||||
assertEquals(50, results.get(0).value, 0.01F);
|
||||
|
||||
// omit the 'the' and 'of' since they are stopwords, it's suggested anyway
|
||||
results = suggester.lookup(TestUtil.stringToCharSequence("ghost chris", random()), false, 1);
|
||||
assertEquals(1, results.size());
|
||||
assertEquals("the ghost of christmas past", results.get(0).key.toString());
|
||||
assertEquals(input, results.get(0).key.toString());
|
||||
assertEquals(50, results.get(0).value, 0.01F);
|
||||
|
||||
|
||||
// trailing stopword "the"
|
||||
results = suggester.lookup(TestUtil.stringToCharSequence("ghost christmas past the", random()), false, 1);
|
||||
assertEquals(1, results.size());
|
||||
assertEquals(input, results.get(0).key.toString());
|
||||
assertEquals(50, results.get(0).value, 0.01F);
|
||||
|
||||
IOUtils.close(standard, tempDir);
|
||||
}
|
||||
|
||||
|
|
|
@ -253,71 +253,125 @@ public class TestPrefixCompletionQuery extends LuceneTestCase {
|
|||
iw.close();
|
||||
}
|
||||
|
||||
public void testAnalyzerWithoutPreservePosAndSep() throws Exception {
|
||||
public void testAnalyzerDefaults() throws Exception {
|
||||
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||
CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, false, false);
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer, "suggest_field_no_p_sep_or_pos_inc"));
|
||||
CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
|
||||
final String field = getTestName();
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer, field));
|
||||
Document document = new Document();
|
||||
document.add(new SuggestField("suggest_field_no_p_sep_or_pos_inc", "foobar", 7));
|
||||
document.add(new SuggestField("suggest_field_no_p_sep_or_pos_inc", "foo bar", 8));
|
||||
document.add(new SuggestField("suggest_field_no_p_sep_or_pos_inc", "the fo", 9));
|
||||
document.add(new SuggestField("suggest_field_no_p_sep_or_pos_inc", "the foo bar", 10));
|
||||
document.add(new SuggestField(field, "foobar", 7));
|
||||
document.add(new SuggestField(field, "foo bar", 8));
|
||||
document.add(new SuggestField(field, "the fo", 9));
|
||||
document.add(new SuggestField(field, "the foo bar", 10));
|
||||
document.add(new SuggestField(field, "foo the bar", 11)); // middle stopword
|
||||
document.add(new SuggestField(field, "baz the", 12)); // trailing stopword
|
||||
|
||||
iw.addDocument(document);
|
||||
|
||||
DirectoryReader reader = iw.getReader();
|
||||
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
|
||||
CompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_sep_or_pos_inc", "fo"));
|
||||
TopSuggestDocs suggest = indexSearcher.suggest(query, 4, false); // all 4
|
||||
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
|
||||
query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_sep_or_pos_inc", "foob"));
|
||||
suggest = indexSearcher.suggest(query, 4, false); // not the fo
|
||||
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("foo bar", 8), new Entry("foobar", 7));
|
||||
CompletionQuery query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "fo"));
|
||||
TopSuggestDocs suggest = indexSearcher.suggest(query, 9, false); //matches all with "fo*"
|
||||
assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("foo bar", 8), new Entry("foobar", 7));
|
||||
// with leading stopword
|
||||
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the fo")); // becomes "_ fo*"
|
||||
suggest = indexSearcher.suggest(query, 9, false);
|
||||
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9));
|
||||
// with middle stopword
|
||||
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foo the bar")); // becomes "foo _ bar*"
|
||||
suggest = indexSearcher.suggest(query, 9, false);
|
||||
assertSuggestions(suggest, new Entry("foo the bar", 11));
|
||||
// no space
|
||||
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foob"));
|
||||
suggest = indexSearcher.suggest(query, 9, false);
|
||||
assertSuggestions(suggest, new Entry("foobar", 7));
|
||||
// surrounding stopwords
|
||||
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the baz the")); // becomes "_ baz _"
|
||||
suggest = indexSearcher.suggest(query, 4, false);
|
||||
assertSuggestions(suggest);
|
||||
reader.close();
|
||||
iw.close();
|
||||
}
|
||||
|
||||
public void testAnalyzerWithSepAndNoPreservePos() throws Exception {
|
||||
public void testAnalyzerWithoutSeparator() throws Exception {
|
||||
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||
//note: when we don't preserve separators, the choice of preservePosInc is irrelevant
|
||||
CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, false, random().nextBoolean());
|
||||
final String field = getTestName();
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer, field));
|
||||
Document document = new Document();
|
||||
document.add(new SuggestField(field, "foobar", 7));
|
||||
document.add(new SuggestField(field, "foo bar", 8));
|
||||
document.add(new SuggestField(field, "the fo", 9));
|
||||
document.add(new SuggestField(field, "the foo bar", 10));
|
||||
document.add(new SuggestField(field, "foo the bar", 11)); // middle stopword
|
||||
document.add(new SuggestField(field, "baz the", 12)); // trailing stopword
|
||||
|
||||
iw.addDocument(document);
|
||||
|
||||
// note we use the completionAnalyzer with the queries (instead of input analyzer) because of non-default settings
|
||||
DirectoryReader reader = iw.getReader();
|
||||
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
|
||||
CompletionQuery query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "fo"));
|
||||
TopSuggestDocs suggest = indexSearcher.suggest(query, 9, false); //matches all with fo
|
||||
assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
|
||||
// with leading stopword
|
||||
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the fo")); // becomes "fo*"
|
||||
suggest = indexSearcher.suggest(query, 9, false);
|
||||
assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
|
||||
// with middle stopword
|
||||
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foo the bar")); // becomes "foobar*"
|
||||
suggest = indexSearcher.suggest(query, 9, false);
|
||||
assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("foo bar", 8), new Entry("foobar", 7));
|
||||
// no space
|
||||
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foob"));
|
||||
suggest = indexSearcher.suggest(query, 9, false); // no separators, thus match several
|
||||
assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("foo bar", 8), new Entry("foobar", 7));
|
||||
// surrounding stopwords
|
||||
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the baz the")); // becomes "baz*"
|
||||
suggest = indexSearcher.suggest(query, 4, false);// stopwords in query get removed so we match
|
||||
assertSuggestions(suggest, new Entry("baz the", 12));
|
||||
reader.close();
|
||||
iw.close();
|
||||
}
|
||||
|
||||
public void testAnalyzerNoPreservePosInc() throws Exception {
|
||||
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||
CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, true, false);
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer, "suggest_field_no_p_pos_inc"));
|
||||
final String field = getTestName();
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer, field));
|
||||
Document document = new Document();
|
||||
document.add(new SuggestField("suggest_field_no_p_pos_inc", "foobar", 7));
|
||||
document.add(new SuggestField("suggest_field_no_p_pos_inc", "foo bar", 8));
|
||||
document.add(new SuggestField("suggest_field_no_p_pos_inc", "the fo", 9));
|
||||
document.add(new SuggestField("suggest_field_no_p_pos_inc", "the foo bar", 10));
|
||||
document.add(new SuggestField(field, "foobar", 7));
|
||||
document.add(new SuggestField(field, "foo bar", 8));
|
||||
document.add(new SuggestField(field, "the fo", 9));
|
||||
document.add(new SuggestField(field, "the foo bar", 10));
|
||||
document.add(new SuggestField(field, "foo the bar", 11)); // middle stopword
|
||||
document.add(new SuggestField(field, "baz the", 12)); // trailing stopword
|
||||
|
||||
iw.addDocument(document);
|
||||
|
||||
// note we use the completionAnalyzer with the queries (instead of input analyzer) because of non-default settings
|
||||
DirectoryReader reader = iw.getReader();
|
||||
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
|
||||
CompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_pos_inc", "fo"));
|
||||
TopSuggestDocs suggest = indexSearcher.suggest(query, 4, false); //matches all 4
|
||||
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
|
||||
query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_pos_inc", "foob"));
|
||||
suggest = indexSearcher.suggest(query, 4, false); // only foobar
|
||||
CompletionQuery query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "fo"));
|
||||
TopSuggestDocs suggest = indexSearcher.suggest(query, 9, false); //matches all with fo
|
||||
assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
|
||||
// with leading stopword
|
||||
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the fo")); // becomes "fo*"
|
||||
suggest = indexSearcher.suggest(query, 9, false);
|
||||
assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
|
||||
// with middle stopword
|
||||
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foo the bar")); // becomes "foo bar*"
|
||||
suggest = indexSearcher.suggest(query, 9, false);
|
||||
assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10), new Entry("foo bar", 8)); // no foobar
|
||||
// no space
|
||||
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foob"));
|
||||
suggest = indexSearcher.suggest(query, 4, false); // separators, thus only match "foobar"
|
||||
assertSuggestions(suggest, new Entry("foobar", 7));
|
||||
reader.close();
|
||||
iw.close();
|
||||
}
|
||||
|
||||
public void testAnalyzerWithPreservePosAndNoSep() throws Exception {
|
||||
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||
CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, false, true);
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer, "suggest_field_no_p_sep"));
|
||||
Document document = new Document();
|
||||
document.add(new SuggestField("suggest_field_no_p_sep", "foobar", 7));
|
||||
document.add(new SuggestField("suggest_field_no_p_sep", "foo bar", 8));
|
||||
document.add(new SuggestField("suggest_field_no_p_sep", "the fo", 9));
|
||||
document.add(new SuggestField("suggest_field_no_p_sep", "the foo bar", 10));
|
||||
iw.addDocument(document);
|
||||
|
||||
DirectoryReader reader = iw.getReader();
|
||||
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
|
||||
CompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_sep", "fo"));
|
||||
TopSuggestDocs suggest = indexSearcher.suggest(query, 4, false); // matches all 4
|
||||
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
|
||||
query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_sep", "foob"));
|
||||
suggest = indexSearcher.suggest(query, 4, false); // except the fo
|
||||
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("foo bar", 8), new Entry("foobar", 7));
|
||||
// surrounding stopwords
|
||||
query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the baz the")); // becomes "baz*"
|
||||
suggest = indexSearcher.suggest(query, 4, false);// stopwords in query get removed so we match
|
||||
assertSuggestions(suggest, new Entry("baz the", 12));
|
||||
reader.close();
|
||||
iw.close();
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue