LUCENE-2229: Fix SimpleSpanFragmenter bug with adjacent stop-words

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1722241 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
David Wayne Smiley 2015-12-29 22:00:53 +00:00
parent a23a1f98be
commit ec64ba0029
3 changed files with 35 additions and 6 deletions

View File

@ -179,6 +179,10 @@ Bug Fixes
hide the true docvalues update generation or other properties. hide the true docvalues update generation or other properties.
(Ishan Chattopadhyaya via Robert Muir) (Ishan Chattopadhyaya via Robert Muir)
* LUCENE-2229: Fix Highlighter's SimpleSpanFragmenter, where multiple adjacent
stop words following a span could make the fragment far too long.
(Elmer Garduno, Lukhnos Liu via David Smiley)
Other Other
* LUCENE-6924: Upgrade randomizedtesting to 2.3.2. (Dawid Weiss) * LUCENE-6924: Upgrade randomizedtesting to 2.3.2. (Dawid Weiss)

View File

@ -51,7 +51,7 @@ public class SimpleSpanFragmenter implements Fragmenter {
/** /**
* @param queryScorer QueryScorer that was used to score hits * @param queryScorer QueryScorer that was used to score hits
* @param fragmentSize size in bytes of each fragment * @param fragmentSize size in chars of each fragment
*/ */
public SimpleSpanFragmenter(QueryScorer queryScorer, int fragmentSize) { public SimpleSpanFragmenter(QueryScorer queryScorer, int fragmentSize) {
this.fragmentSize = fragmentSize; this.fragmentSize = fragmentSize;
@ -65,7 +65,7 @@ public class SimpleSpanFragmenter implements Fragmenter {
public boolean isNewFragment() { public boolean isNewFragment() {
position += posIncAtt.getPositionIncrement(); position += posIncAtt.getPositionIncrement();
if (waitForPos == position) { if (waitForPos <= position) {
waitForPos = -1; waitForPos = -1;
} else if (waitForPos != -1) { } else if (waitForPos != -1) {
return false; return false;
@ -76,9 +76,9 @@ public class SimpleSpanFragmenter implements Fragmenter {
if (wSpanTerm != null) { if (wSpanTerm != null) {
List<PositionSpan> positionSpans = wSpanTerm.getPositionSpans(); List<PositionSpan> positionSpans = wSpanTerm.getPositionSpans();
for (int i = 0; i < positionSpans.size(); i++) { for (PositionSpan positionSpan : positionSpans) {
if (positionSpans.get(i).start == position) { if (positionSpan.start == position) {
waitForPos = positionSpans.get(i).end + 1; waitForPos = positionSpan.end + 1;
break; break;
} }
} }

View File

@ -132,7 +132,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
"This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy", "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy",
"JFK has been shot", "John Kennedy has been shot", "JFK has been shot", "John Kennedy has been shot",
"This text has a typo in referring to Keneddy", "This text has a typo in referring to Keneddy",
"wordx wordy wordz wordx wordy wordx worda wordb wordy wordc", "y z x y z a b", "lets is a the lets is a the lets is a the lets" }; "wordx wordy wordz wordx wordy wordx worda wordb wordy wordc", "y z x y z a b", "lets is a the lets is a the lets is a the lets",
"Attribute instances are reused for all tokens of a document. Thus, a TokenStream/-Filter needs to update the appropriate Attribute(s) in incrementToken(). The consumer, commonly the Lucene indexer, consumes the data in the Attributes and then calls incrementToken() again until it retuns false, which indicates that the end of the stream was reached. This means that in each call of incrementToken() a TokenStream/-Filter can safely overwrite the data in the Attribute instances. "
};
// Convenience method for succinct tests; doesn't represent "best practice" // Convenience method for succinct tests; doesn't represent "best practice"
private TokenStream getAnyTokenStream(String fieldName, int docId) private TokenStream getAnyTokenStream(String fieldName, int docId)
@ -347,6 +349,29 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
// throw any exceptions // throw any exceptions
} }
// LUCENE-2229
/**
 * Highlights the phrase "all tokens" using a {@link SimpleSpanFragmenter}
 * configured with a fragment size of 36, and verifies that the single best
 * fragment returned for the one expected hit is shorter than 60 characters.
 */
public void testSimpleSpanHighlighterWithStopWordsStraddlingFragmentBoundaries() throws Exception {
  doSearching(new PhraseQuery(FIELD_NAME, "all", "tokens"));

  final int fragmentLimit = 1;
  final QueryScorer spanScorer = new QueryScorer(query, FIELD_NAME);
  final Highlighter spanHighlighter = new Highlighter(spanScorer);

  assertEquals("Must have one hit", 1, hits.totalHits);

  int hitIndex = 0;
  while (hitIndex < hits.totalHits) {
    final int docId = hits.scoreDocs[hitIndex].doc;
    final String storedText = searcher.doc(docId).get(FIELD_NAME);
    final TokenStream stream = analyzer.tokenStream(FIELD_NAME, storedText);

    // A new fragmenter instance is installed for every hit that is highlighted.
    spanHighlighter.setTextFragmenter(new SimpleSpanFragmenter(spanScorer, 36));
    final String fragment = spanHighlighter.getBestFragments(stream, storedText, fragmentLimit, "...");

    if (VERBOSE) {
      System.out.println("\t" + fragment);
    }

    final boolean shortEnough = fragment.length() < 60;
    assertTrue("Fragment must be less than 60 characters long", shortEnough);
    hitIndex++;
  }
}
// LUCENE-1752 // LUCENE-1752
public void testRepeatingTermsInMultBooleans() throws Exception { public void testRepeatingTermsInMultBooleans() throws Exception {
String content = "x y z a b c d e f g b c g"; String content = "x y z a b c d e f g b c g";