diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 1503975086a..5c0bd2b629a 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -814,10 +814,10 @@ Bug fixes
 * LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
   to clones/reopened readers.  (Uwe Schindler)
 
-* LUCENE-3642, SOLR-2891: Fixed bugs in CharTokenizer, n-gram filters,
-  compound token filters, and smart chinese where they would create invalid
-  offsets in some situations, leading to problems in highlighting.
-  (Max Beutel, Edwin Steiner via Robert Muir)
+* LUCENE-3642, SOLR-2891, LUCENE-3717: Fixed bugs in CharTokenizer, n-gram filters,
+  compound token filters, thai word filter, icutokenizer, and smart chinese
+  where they would create invalid offsets in some situations, leading to problems
+  in highlighting. (Max Beutel, Edwin Steiner via Robert Muir)
 
 * LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
   Float.MIN_VALUE when it should be Float.NaN, when there were 0
diff --git a/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
index 91ce7848d5e..b415b61aab0 100644
--- a/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
  * limitations under the License.
  */
 
+import java.io.Reader;
 import java.io.StringReader;
 import java.io.IOException;
 import java.util.ArrayList;
@@ -289,8 +290,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       }
     }
   };
-  
+
   public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
+    checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean());
+  }
+
+  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter) throws IOException {
     for (int i = 0; i < iterations; i++) {
       String text;
       switch(_TestUtil.nextInt(random, 0, 4)) {
@@ -311,7 +316,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
       }
 
-      TokenStream ts = a.tokenStream("dummy", new StringReader(text));
+      int remainder = random.nextInt(10);
+      Reader reader = new StringReader(text);
+      TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
       assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
       CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
       OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
@@ -339,30 +346,38 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       if (VERBOSE) {
         System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
       }
+      reader = new StringReader(text);
+      ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
       if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
         // offset + pos + type
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]),
           toIntArray(startOffsets),
           toIntArray(endOffsets),
           types.toArray(new String[types.size()]),
-          toIntArray(positions));
+          toIntArray(positions),
+          text.length());
       } else if (posIncAtt != null && offsetAtt != null) {
         // offset + pos
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]),
           toIntArray(startOffsets),
           toIntArray(endOffsets),
-          toIntArray(positions));
+          null,
+          toIntArray(positions),
+          text.length());
       } else if (offsetAtt != null) {
         // offset
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]),
           toIntArray(startOffsets),
-          toIntArray(endOffsets));
+          toIntArray(endOffsets),
+          null,
+          null,
+          text.length());
       } else {
         // terms only
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]));
       }
     }
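
[Annotation: illustrative only, not part of the patch] With this change, checkRandomData randomly wraps the input in a MockCharFilter, so any analyzer exercised through it also gets its offset math validated against a CharFilter. A test that wants the wrapping deterministically can call the new five-argument overload directly. A minimal sketch, assuming a placeholder MyAnalyzer and the static `random` field that LuceneTestCase provided at this point in trunk:

public class TestMyAnalyzer extends BaseTokenStreamTestCase {
  public void testRandomStrings() throws Exception {
    // the 4-arg overload flips a coin per run; passing true forces the
    // MockCharFilter path so offset correction is always exercised
    checkRandomData(random, new MyAnalyzer(), 10000, 20, true);
  }
}
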
diff --git a/lucene/src/test-framework/java/org/apache/lucene/analysis/MockCharFilter.java b/lucene/src/test-framework/java/org/apache/lucene/analysis/MockCharFilter.java
new file mode 100644
index 00000000000..e1e6e88c78c
--- /dev/null
+++ b/lucene/src/test-framework/java/org/apache/lucene/analysis/MockCharFilter.java
@@ -0,0 +1,100 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+// the purpose of this charfilter is to send offsets out of bounds
+// if the analyzer doesn't use correctOffset or does incorrect offset math.
+class MockCharFilter extends CharStream {
+  final Reader in;
+  final int remainder;
+
+  // for testing only
+  public MockCharFilter(Reader in, int remainder) {
+    this.in = in;
+    this.remainder = remainder;
+    assert remainder >= 0 && remainder < 10 : "invalid parameter";
+  }
+
+  @Override
+  public void close() throws IOException {
+    in.close();
+  }
+
+  int currentOffset = -1;
+  int delta = 0;
+  int bufferedCh = -1;
+
+  @Override
+  public int read() throws IOException {
+    // we have a buffered character, add an offset correction and return it
+    if (bufferedCh >= 0) {
+      int ch = bufferedCh;
+      bufferedCh = -1;
+      currentOffset++;
+
+      addOffCorrectMap(currentOffset+delta, delta-1);
+      delta--;
+      return ch;
+    }
+
+    // otherwise actually read one
+    int ch = in.read();
+    if (ch < 0)
+      return ch;
+
+    currentOffset++;
+    if ((ch % 10) != remainder || Character.isHighSurrogate((char)ch) || Character.isLowSurrogate((char)ch)) {
+      return ch;
+    }
+
+    // we will double this character, so buffer it.
+    bufferedCh = ch;
+    return ch;
+  }
+
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    int numRead = 0;
+    for (int i = off; i < off + len; i++) {
+      int c = read();
+      if (c == -1) break;
+      cbuf[i] = (char) c;
+      numRead++;
+    }
+    return numRead == 0 ? -1 : numRead;
+  }
+
+  @Override
+  public int correctOffset(int currentOff) {
+    SortedMap<Integer,Integer> subMap = corrections.subMap(0, currentOff+1);
+    int ret = subMap.isEmpty() ? currentOff : currentOff + subMap.get(subMap.lastKey());
+    assert ret >= 0 : "currentOff=" + currentOff + ",diff=" + (ret-currentOff);
+    return ret;
+  }
+
+  protected void addOffCorrectMap(int off, int cumulativeDiff) {
+    corrections.put(off, cumulativeDiff);
+  }
+
+  TreeMap<Integer,Integer> corrections = new TreeMap<Integer,Integer>();
+}
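
[Annotation: illustrative only, not part of the patch] A quick way to see the doubling and the correction map in action; since the class is package-private, this throwaway demo has to sit in org.apache.lucene.analysis. The printed values follow directly from the code above:

package org.apache.lucene.analysis;

import java.io.StringReader;

public class MockCharFilterDemo {
  public static void main(String[] args) throws Exception {
    // remainder=7 doubles every char whose codepoint ends in 7, e.g. 'a' (97)
    MockCharFilter cf = new MockCharFilter(new StringReader("ab"), 7);
    StringBuilder out = new StringBuilder();
    for (int ch = cf.read(); ch != -1; ch = cf.read()) {
      out.append((char) ch);
    }
    System.out.println(out);                  // aab: one char longer than the input
    System.out.println(cf.correctOffset(1));  // 0: the duplicate maps back to the original 'a'
    System.out.println(cf.correctOffset(3));  // 2: the output end maps back to the input end
  }
}
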
diff --git a/lucene/src/test/org/apache/lucene/analysis/TestMockCharFilter.java b/lucene/src/test/org/apache/lucene/analysis/TestMockCharFilter.java
new file mode 100644
index 00000000000..70662ba2381
--- /dev/null
+++ b/lucene/src/test/org/apache/lucene/analysis/TestMockCharFilter.java
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestMockCharFilter extends BaseTokenStreamTestCase {
+
+  public void test() throws IOException {
+    Analyzer analyzer = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new MockCharFilter(CharReader.get(reader), 7);
+      }
+    };
+
+    assertAnalyzesTo(analyzer, "ab",
+      new String[] { "aab" },
+      new int[] { 0 },
+      new int[] { 2 }
+    );
+
+    assertAnalyzesTo(analyzer, "aba",
+      new String[] { "aabaa" },
+      new int[] { 0 },
+      new int[] { 3 }
+    );
+
+    assertAnalyzesTo(analyzer, "abcdefga",
+      new String[] { "aabcdefgaa" },
+      new int[] { 0 },
+      new int[] { 8 }
+    );
+  }
+}
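
[Annotation: illustrative only, not part of the patch] The expected offsets above are exactly what an uncorrected tokenizer gets wrong, so the third assertion is worth tracing once:

// input  "abcdefga" (8 chars); with remainder 7 each 'a' (97 % 10 == 7) doubles,
// so the whitespace tokenizer sees "aabcdefgaa" (10 chars) and the filter records:
//   {1 -> -1}   after the first duplicated 'a'  (cumulative diff -1)
//   {8 -> -2}   after the second duplicated 'a' (cumulative diff -2)
// the single token spans output offsets [0,10), therefore:
//   correctOffset(0)  == 0               (no correction at or before offset 0)
//   correctOffset(10) == 10 + (-2) == 8  (the original input length)
// which is why the expected start/end offsets are { 0 } and { 8 }.
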
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
index 67397512e1e..b2bc64ffdda 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
@@ -68,6 +68,7 @@ public final class ThaiWordFilter extends TokenFilter {
   private CharTermAttribute clonedTermAtt = null;
   private OffsetAttribute clonedOffsetAtt = null;
   private boolean hasMoreTokensInClone = false;
+  private boolean hasIllegalOffsets = false; // only if the length changed before this filter
 
   /** Creates a new ThaiWordFilter with the specified match version. */
   public ThaiWordFilter(Version matchVersion, TokenStream input) {
@@ -86,7 +87,11 @@ public final class ThaiWordFilter extends TokenFilter {
       if (end != BreakIterator.DONE) {
         clonedToken.copyTo(this);
         termAtt.copyBuffer(clonedTermAtt.buffer(), start, end - start);
-        offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
+        if (hasIllegalOffsets) {
+          offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
+        } else {
+          offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
+        }
         if (handlePosIncr) posAtt.setPositionIncrement(1);
         return true;
       }
@@ -102,6 +107,10 @@ public final class ThaiWordFilter extends TokenFilter {
     }
     
     hasMoreTokensInClone = true;
+    
+    // if length by start + end offsets doesn't match the term text then assume
+    // this is a synonym and don't adjust the offsets.
+    hasIllegalOffsets = offsetAtt.endOffset() - offsetAtt.startOffset() != termAtt.length();
 
     // we lazy init the cloned token, as in ctor not all attributes may be added
     if (clonedToken == null) {
@@ -118,7 +127,11 @@ public final class ThaiWordFilter extends TokenFilter {
       int end = breaker.next();
       if (end != BreakIterator.DONE) {
         termAtt.setLength(end);
-        offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
+        if (hasIllegalOffsets) {
+          offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
+        } else {
+          offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
+        }
         // position increment keeps as it is for first token
         return true;
       }
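
[Annotation: illustrative only, not part of the patch] The hasIllegalOffsets guard generalizes to any filter that splits tokens: if an upstream stage already changed the term's length relative to its offsets, arithmetic on sub-token offsets is meaningless and can go out of bounds. A self-contained sketch of the same pattern in a made-up filter (position increments omitted for brevity):

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

// splits each token into two halves, guarding offsets like ThaiWordFilter does
final class HalfSplitFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private char[] pending;
  private int pendingStart = -1, pendingEnd, origStart, origEnd;
  private boolean hasIllegalOffsets;

  HalfSplitFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (pendingStart >= 0) { // emit the buffered second half
      termAtt.copyBuffer(pending, pendingStart, pendingEnd - pendingStart);
      if (hasIllegalOffsets) {
        offsetAtt.setOffset(origStart, origEnd); // keep the parent token's span
      } else {
        offsetAtt.setOffset(origStart + pendingStart, origStart + pendingEnd);
      }
      pendingStart = -1;
      return true;
    }
    if (!input.incrementToken()) {
      return false;
    }
    final int len = termAtt.length();
    origStart = offsetAtt.startOffset();
    origEnd = offsetAtt.endOffset();
    // same check as in the patch: offsets that don't match the term length mean
    // some earlier stage (synonyms, a charfilter, ...) already rewrote the text
    hasIllegalOffsets = origEnd - origStart != len;
    if (len >= 2) {
      pending = termAtt.buffer().clone();
      pendingStart = len / 2;
      pendingEnd = len;
      termAtt.setLength(len / 2);
      if (!hasIllegalOffsets) {
        offsetAtt.setOffset(origStart, origStart + len / 2);
      }
    }
    return true;
  }
}
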
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
index 0be05d52ea6..9411ab38644 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
@@ -503,7 +503,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
       @Override
       protected Reader initReader(Reader reader) {
-        return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
+        return new HTMLStripCharFilter(CharReader.get(reader));
       }
     };
 
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java
index d5a1c68bac9..f3a28a3f620 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java
@@ -160,7 +160,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
         hasSentence = false;
         clearAttributes();
         termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd-sentenceStart);
-        offsetAtt.setOffset(offset+sentenceStart, offset+sentenceEnd);
+        offsetAtt.setOffset(correctOffset(offset+sentenceStart), correctOffset(offset+sentenceEnd));
         return true;
       } else {
         return false;
@@ -215,7 +215,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
       
       clearAttributes();
       termAtt.copyBuffer(buffer, wordStart, wordEnd-wordStart);
-      offsetAtt.setOffset(offset+wordStart, offset+wordEnd);
+      offsetAtt.setOffset(correctOffset(offset+wordStart), correctOffset(offset+wordEnd));
       posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
       posBoost = 0;
       return true;
diff --git a/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java b/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
index b2022bdad6a..c823bcda992 100644
--- a/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
+++ b/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
@@ -111,7 +111,7 @@ public final class ICUTokenizer extends Tokenizer {
   @Override
   public void end() throws IOException {
     final int finalOffset = (length < 0) ? offset : offset + length;
-    offsetAtt.setOffset(finalOffset, finalOffset);
+    offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
   }
 
   /*
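
[Annotation: illustrative only, not part of the patch] The last three hunks all enforce the same Tokenizer contract: every offset that leaves a Tokenizer, including the final offset set in end(), must pass through correctOffset() so that a wrapping CharStream (HTMLStripCharFilter, MockCharFilter, ...) can map it back into the original input. A minimal sketch of a tokenizer honoring that contract, assuming the pre-4.0 Tokenizer API used throughout this patch:

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

// emits the whole input as one token; note correctOffset() on every offset
final class WholeInputTokenizer extends Tokenizer {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private int length = 0;
  private boolean done = false;

  WholeInputTokenizer(Reader input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (done) {
      return false;
    }
    done = true;
    clearAttributes();
    int ch;
    while ((ch = input.read()) != -1) {
      termAtt.append((char) ch);
      length++;
    }
    if (length == 0) {
      return false;
    }
    offsetAtt.setOffset(correctOffset(0), correctOffset(length));
    return true;
  }

  @Override
  public void end() {
    // without correctOffset here, a doubling charfilter like MockCharFilter
    // would push the final offset past the end of the original text
    final int finalOffset = correctOffset(length);
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    done = false;
    length = 0;
  }
}
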