mirror of https://github.com/apache/lucene.git
LUCENE-2266: Fixed offset calculations in NGramTokenFilter and EdgeNGramTokenFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@910078 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
23aacd101f
commit
5abaff61fa
|
@@ -153,6 +153,9 @@ Bug Fixes
   CJKTokenizer, ChineseTokenizer, SmartChinese SentenceTokenizer,
   and WikipediaTokenizer.  (Koji Sekiguchi, Robert Muir)
 
+* LUCENE-2266: Fixed offset calculations in NGramTokenFilter and
+  EdgeNGramTokenFilter. (Joe Calderon, Robert Muir via Uwe Schindler)
+
 API Changes
 
 * LUCENE-2108: Add SpellChecker.close, to close the underlying
@@ -70,6 +70,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
   private char[] curTermBuffer;
   private int curTermLength;
   private int curGramSize;
+  private int tokStart;
 
   private final TermAttribute termAtt;
   private final OffsetAttribute offsetAtt;
@@ -126,6 +127,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
         curTermBuffer = (char[]) termAtt.termBuffer().clone();
         curTermLength = termAtt.termLength();
         curGramSize = minGram;
+        tokStart = offsetAtt.startOffset();
       }
     }
     if (curGramSize <= maxGram) {
@@ -135,7 +137,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
       int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
       int end = start + curGramSize;
       clearAttributes();
-      offsetAtt.setOffset(start, end);
+      offsetAtt.setOffset(tokStart + start, tokStart + end);
       termAtt.setTermBuffer(curTermBuffer, start, curGramSize);
       curGramSize++;
       return true;
@@ -37,6 +37,7 @@ public final class NGramTokenFilter extends TokenFilter {
   private int curTermLength;
   private int curGramSize;
   private int curPos;
+  private int tokStart;
 
   private TermAttribute termAtt;
   private OffsetAttribute offsetAtt;
@@ -82,13 +83,14 @@ public final class NGramTokenFilter extends TokenFilter {
         curTermLength = termAtt.termLength();
         curGramSize = minGram;
         curPos = 0;
+        tokStart = offsetAtt.startOffset();
       }
     }
     while (curGramSize <= maxGram) {
       while (curPos+curGramSize <= curTermLength) {     // while there is input
         clearAttributes();
         termAtt.setTermBuffer(curTermBuffer, curPos, curGramSize);
-        offsetAtt.setOffset(curPos, curPos+curGramSize);
+        offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
         curPos++;
         return true;
       }
@@ -94,7 +94,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
   public void testSmallTokenInStream() throws Exception {
     input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abc de fgh"));
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
-    assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,0}, new int[]{3,3});
+    assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
   }
 
   public void testReset() throws Exception {
@@ -83,7 +83,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
   public void testSmallTokenInStream() throws Exception {
     input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abc de fgh"));
     NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
-    assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,0}, new int[]{3,3});
+    assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
  }
 
  public void testReset() throws Exception {
Loading…
Reference in New Issue