LUCENE-2266: Fixed offset calculations in NGramTokenFilter and EdgeNGramTokenFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@910078 13f79535-47bb-0310-9956-ffa450edef68
Uwe Schindler 2010-02-14 21:33:12 +00:00
parent 23aacd101f
commit 5abaff61fa
5 changed files with 11 additions and 4 deletions


@@ -153,6 +153,9 @@ Bug Fixes
CJKTokenizer, ChineseTokenizer, SmartChinese SentenceTokenizer,
and WikipediaTokenizer. (Koji Sekiguchi, Robert Muir)
+* LUCENE-2266: Fixed offset calculations in NGramTokenFilter and
+  EdgeNGramTokenFilter. (Joe Calderon, Robert Muir via Uwe Schindler)
API Changes
* LUCENE-2108: Add SpellChecker.close, to close the underlying
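
For context, a minimal consumer sketch of the corrected behavior. It is not part of this commit; the class name NGramOffsetDemo is made up, and it assumes the contrib analyzers module plus the 3.x attribute API used in the hunks below. With the fix, grams produced for "fgh", which starts at offset 7 in "abc de fgh", report offsets into the original input rather than offsets relative to the token buffer.

import java.io.StringReader;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class NGramOffsetDemo {
  public static void main(String[] args) throws Exception {
    // Emit 3-grams over whitespace tokens; "de" is shorter than minGram and is skipped.
    NGramTokenFilter filter = new NGramTokenFilter(
        new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abc de fgh")), 3, 3);
    TermAttribute termAtt = filter.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = filter.addAttribute(OffsetAttribute.class);
    filter.reset();
    while (filter.incrementToken()) {
      // Expected with the fix: "abc [0,3)" and "fgh [7,10)"; previously "fgh" reported [0,3).
      System.out.println(termAtt.term() + " [" + offsetAtt.startOffset()
          + "," + offsetAtt.endOffset() + ")");
    }
    filter.close();
  }
}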


@@ -70,6 +70,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
private char[] curTermBuffer;
private int curTermLength;
private int curGramSize;
+private int tokStart;
private final TermAttribute termAtt;
private final OffsetAttribute offsetAtt;
@@ -126,6 +127,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
curTermBuffer = (char[]) termAtt.termBuffer().clone();
curTermLength = termAtt.termLength();
curGramSize = minGram;
+tokStart = offsetAtt.startOffset();
}
}
if (curGramSize <= maxGram) {
@@ -135,7 +137,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
int end = start + curGramSize;
clearAttributes();
-offsetAtt.setOffset(start, end);
+offsetAtt.setOffset(tokStart + start, tokStart + end);
termAtt.setTermBuffer(curTermBuffer, start, curGramSize);
curGramSize++;
return true;
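
A worked sketch of the arithmetic this hunk corrects, using illustrative values rather than anything from the commit: for Side.FRONT the gram's start and end are computed relative to the copied term buffer, so the token's own start offset has to be added back before setOffset is called. The class name EdgeNGramOffsetSketch is hypothetical.

public class EdgeNGramOffsetSketch {
  public static void main(String[] args) {
    // Token "fgh" from "abc de fgh": startOffset() == 7, term length == 3, gram size == 3.
    int tokStart = 7;
    int curGramSize = 3;
    int start = 0;                    // Side.FRONT: grams start at the first char of the buffer
    int end = start + curGramSize;    // end relative to the buffer
    System.out.println("before: [" + start + "," + end + ")");                           // [0,3)
    System.out.println("after:  [" + (tokStart + start) + "," + (tokStart + end) + ")"); // [7,10)
  }
}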


@@ -37,6 +37,7 @@ public final class NGramTokenFilter extends TokenFilter {
private int curTermLength;
private int curGramSize;
private int curPos;
+private int tokStart;
private TermAttribute termAtt;
private OffsetAttribute offsetAtt;
@@ -82,13 +83,14 @@ public final class NGramTokenFilter extends TokenFilter {
curTermLength = termAtt.termLength();
curGramSize = minGram;
curPos = 0;
+tokStart = offsetAtt.startOffset();
}
}
while (curGramSize <= maxGram) {
while (curPos+curGramSize <= curTermLength) { // while there is input
clearAttributes();
termAtt.setTermBuffer(curTermBuffer, curPos, curGramSize);
-offsetAtt.setOffset(curPos, curPos+curGramSize);
+offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
curPos++;
return true;
}
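
The same correction applies to the sliding window here: each gram's offsets are shifted by the start offset of the token it came from. An illustrative sketch with made-up parameters (minGram=1, maxGram=2) and a hypothetical class name, mirroring the loop structure in the hunk above:

public class NGramOffsetSketch {
  public static void main(String[] args) {
    // Token "fgh" at input offset 7.
    int tokStart = 7, curTermLength = 3;
    for (int curGramSize = 1; curGramSize <= 2; curGramSize++) {
      for (int curPos = 0; curPos + curGramSize <= curTermLength; curPos++) {
        // Before the fix these started at curPos; with the fix they start at tokStart + curPos.
        System.out.println("size " + curGramSize + " gram: ["
            + (tokStart + curPos) + "," + (tokStart + curPos + curGramSize) + ")");
      }
    }
  }
}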


@@ -94,7 +94,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
public void testSmallTokenInStream() throws Exception {
input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abc de fgh"));
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
-assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,0}, new int[]{3,3});
+assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
}
public void testReset() throws Exception {


@@ -83,7 +83,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
public void testSmallTokenInStream() throws Exception {
input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abc de fgh"));
NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
-assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,0}, new int[]{3,3});
+assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
}
public void testReset() throws Exception {