From 5abaff61fa4eac85448e2840c0a3d494105af70d Mon Sep 17 00:00:00 2001
From: Uwe Schindler
Date: Sun, 14 Feb 2010 21:33:12 +0000
Subject: [PATCH] LUCENE-2266: Fixed offset calculations in NGramTokenFilter and EdgeNGramTokenFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@910078 13f79535-47bb-0310-9956-ffa450edef68
---
 contrib/CHANGES.txt                                          | 3 +++
 .../apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java   | 4 +++-
 .../org/apache/lucene/analysis/ngram/NGramTokenFilter.java   | 4 +++-
 .../lucene/analysis/ngram/EdgeNGramTokenFilterTest.java      | 2 +-
 .../apache/lucene/analysis/ngram/NGramTokenFilterTest.java   | 2 +-
 5 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/contrib/CHANGES.txt b/contrib/CHANGES.txt
index 134288139d9..cd41a289240 100644
--- a/contrib/CHANGES.txt
+++ b/contrib/CHANGES.txt
@@ -153,6 +153,9 @@ Bug Fixes
    CJKTokenizer, ChineseTokenizer, SmartChinese SentenceTokenizer,
    and WikipediaTokenizer. (Koji Sekiguchi, Robert Muir)
 
+ * LUCENE-2266: Fixed offset calculations in NGramTokenFilter and
+   EdgeNGramTokenFilter. (Joe Calderon, Robert Muir via Uwe Schindler)
+
 API Changes
 
  * LUCENE-2108: Add SpellChecker.close, to close the underlying
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
index ff6c901402b..fd4c65d3d97 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
@@ -70,6 +70,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
   private char[] curTermBuffer;
   private int curTermLength;
   private int curGramSize;
+  private int tokStart;
 
   private final TermAttribute termAtt;
   private final OffsetAttribute offsetAtt;
@@ -126,6 +127,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
           curTermBuffer = (char[]) termAtt.termBuffer().clone();
           curTermLength = termAtt.termLength();
           curGramSize = minGram;
+          tokStart = offsetAtt.startOffset();
         }
       }
       if (curGramSize <= maxGram) {
@@ -135,7 +137,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
           int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
           int end = start + curGramSize;
           clearAttributes();
-          offsetAtt.setOffset(start, end);
+          offsetAtt.setOffset(tokStart + start, tokStart + end);
           termAtt.setTermBuffer(curTermBuffer, start, curGramSize);
           curGramSize++;
           return true;
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
index a3946e86c0b..6fd4b7c09d5 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
@@ -37,6 +37,7 @@ public final class NGramTokenFilter extends TokenFilter {
   private int curTermLength;
   private int curGramSize;
   private int curPos;
+  private int tokStart;
 
   private TermAttribute termAtt;
   private OffsetAttribute offsetAtt;
@@ -82,13 +83,14 @@ public final class NGramTokenFilter extends TokenFilter {
         curTermLength = termAtt.termLength();
         curGramSize = minGram;
         curPos = 0;
+        tokStart = offsetAtt.startOffset();
       }
     }
     while (curGramSize <= maxGram) {
       while (curPos+curGramSize <= curTermLength) {     // while there is input
         clearAttributes();
         termAtt.setTermBuffer(curTermBuffer, curPos, curGramSize);
-        offsetAtt.setOffset(curPos, curPos+curGramSize);
+        offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
         curPos++;
         return true;
       }
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
index 518f7ee84b7..481fe7a6208 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
@@ -94,7 +94,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
   public void testSmallTokenInStream() throws Exception {
     input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abc de fgh"));
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
-    assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,0}, new int[]{3,3});
+    assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
   }
 
   public void testReset() throws Exception {
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
index ff8022a4283..0a6fa47b5ae 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
@@ -83,7 +83,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
   public void testSmallTokenInStream() throws Exception {
     input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abc de fgh"));
     NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
-    assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,0}, new int[]{3,3});
+    assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
   }
 
   public void testReset() throws Exception {
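
Background on the bug: before this patch, both filters set each gram's offsets relative to the start of the current term buffer, so every gram of every token was reported as if it began at position 0 of the input. The fix records the underlying token's start offset (tokStart) and adds it to each gram's start and end. Below is a minimal sketch of the corrected behavior, not part of the patch itself; it assumes the Lucene 3.x-era API that the tests above use, and the class name NGramOffsetDemo is made up for this illustration:

    import java.io.StringReader;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.ngram.NGramTokenFilter;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;

    public class NGramOffsetDemo {
      public static void main(String[] args) throws Exception {
        // Tokenize "abc de fgh" on whitespace, then emit 3-grams of each token.
        // "de" is shorter than the minimum gram size, so it produces no grams.
        WhitespaceTokenizer input = new WhitespaceTokenizer(
            Version.LUCENE_CURRENT, new StringReader("abc de fgh"));
        NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
        TermAttribute termAtt = filter.addAttribute(TermAttribute.class);
        OffsetAttribute offsetAtt = filter.addAttribute(OffsetAttribute.class);
        while (filter.incrementToken()) {
          // With the fix this prints "abc [0,3)" and "fgh [7,10)";
          // before the fix, "fgh" was also reported at [0,3).
          System.out.println(termAtt.term() + " [" + offsetAtt.startOffset()
              + "," + offsetAtt.endOffset() + ")");
        }
        filter.close();
      }
    }

The test changes in the patch assert exactly these corrected values: start offsets {0,7} and end offsets {3,10} for the grams "abc" and "fgh".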