From 2a5421ca685b7341bbead24975cd218cbcaf34f7 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Tue, 20 Aug 2013 17:13:06 +0000 Subject: [PATCH] LUCENE-3849: end() now sets position increment, so any trailing holes are counted git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1515887 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 5 ++ .../analysis/core/KeywordTokenizer.java | 3 +- .../lucene/analysis/ngram/NGramTokenizer.java | 4 +- .../analysis/path/PathHierarchyTokenizer.java | 3 +- .../path/ReversePathHierarchyTokenizer.java | 3 +- .../analysis/pattern/PatternTokenizer.java | 3 +- .../analysis/standard/ClassicTokenizer.java | 14 +++-- .../analysis/standard/StandardTokenizer.java | 14 +++-- .../standard/UAX29URLEmailTokenizer.java | 14 +++-- .../lucene/analysis/util/CharTokenizer.java | 3 +- .../analysis/util/FilteringTokenFilter.java | 9 ++- .../wikipedia/WikipediaTokenizer.java | 3 +- .../lucene/analysis/core/TestStopFilter.java | 16 +++++ .../icu/segmentation/ICUTokenizer.java | 3 +- .../lucene/analysis/ja/JapaneseTokenizer.java | 3 +- .../analysis/cn/smart/SentenceTokenizer.java | 3 +- .../analysis/uima/BaseUIMATokenizer.java | 5 -- .../uima/UIMAAnnotationsTokenizer.java | 2 +- .../UIMATypeAwareAnnotationsTokenizer.java | 2 +- .../apache/lucene/analysis/TokenStream.java | 10 ++- .../lucene/index/DocInverterPerField.java | 4 +- .../lucene/index/BinaryTokenStream.java | 5 +- .../apache/lucene/index/TestIndexWriter.java | 63 +++++++++++++++++++ .../directory/DirectoryTaxonomyWriter.java | 16 ++++- .../vectorhighlight/AbstractTestCase.java | 3 +- .../lucene/index/sorter/SorterTestBase.java | 3 +- .../suggest/analyzing/SuggestStopFilter.java | 15 ++--- .../analysis/BaseTokenStreamTestCase.java | 30 ++++++++- .../lucene/analysis/MockTokenFilter.java | 17 ++++- .../apache/lucene/analysis/MockTokenizer.java | 1 + 30 files changed, 226 insertions(+), 53 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 
05be1fe587b..019a147ed53 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -126,6 +126,11 @@ Bug Fixes the default one) have their own limits (David Smiley, Robert Muir, Mike McCandless) +* LUCENE-3849: TokenStreams now set the position increment in end(), + so we can handle trailing holes. If you have a custom TokenStream + implementing end() then be sure it calls super.end(). (Robert Muir, + Mike McCandless) + API Changes * LUCENE-5094: Add ramBytesUsed() to MultiDocValues.OrdinalMap. diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java index 14d103e9817..29239fe327b 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java @@ -80,7 +80,8 @@ public final class KeywordTokenizer extends Tokenizer { } @Override - public final void end() { + public final void end() throws IOException { + super.end(); // set final offset offsetAtt.setOffset(finalOffset, finalOffset); } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java index 646b5e6f839..771e7039ccb 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java @@ -217,13 +217,15 @@ public class NGramTokenizer extends Tokenizer { } @Override - public final void end() { + public final void end() throws IOException { + super.end(); assert bufferStart <= bufferEnd; int endOffset = offset; for (int i = bufferStart; i < bufferEnd; ++i) { endOffset += Character.charCount(buffer[i]); } endOffset = correctOffset(endOffset); + // set final offset offsetAtt.setOffset(endOffset, endOffset); } diff 
--git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java index 2b4239e78fc..da11df10d9d 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java @@ -191,7 +191,8 @@ public class PathHierarchyTokenizer extends Tokenizer { } @Override - public final void end() { + public final void end() throws IOException { + super.end(); // set final offset int finalOffset = correctOffset(charsRead); offsetAtt.setOffset(finalOffset, finalOffset); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java index 236105b9806..5b38a74d2e3 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java @@ -176,7 +176,8 @@ public class ReversePathHierarchyTokenizer extends Tokenizer { } @Override - public final void end() { + public final void end() throws IOException { + super.end(); // set final offset offsetAtt.setOffset(finalOffset, finalOffset); } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java index 14b9e4b1e7b..08f463003d2 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java @@ -130,7 +130,8 @@ public final class PatternTokenizer extends Tokenizer { } @Override - public void end() { + public void end() throws 
IOException { + super.end(); final int ofs = correctOffset(str.length()); offsetAtt.setOffset(ofs, ofs); } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizer.java index 76187dd3849..fe1baa30203 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizer.java @@ -76,6 +76,8 @@ public final class ClassicTokenizer extends Tokenizer { "", "" }; + + private int skippedPositions; private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; @@ -130,7 +132,7 @@ public final class ClassicTokenizer extends Tokenizer { @Override public final boolean incrementToken() throws IOException { clearAttributes(); - int posIncr = 1; + skippedPositions = 0; while(true) { int tokenType = scanner.getNextToken(); @@ -140,7 +142,7 @@ public final class ClassicTokenizer extends Tokenizer { } if (scanner.yylength() <= maxTokenLength) { - posIncrAtt.setPositionIncrement(posIncr); + posIncrAtt.setPositionIncrement(skippedPositions+1); scanner.getText(termAtt); final int start = scanner.yychar(); offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length())); @@ -155,19 +157,23 @@ public final class ClassicTokenizer extends Tokenizer { } else // When we skip a too-long term, we still increment the // position increment - posIncr++; + skippedPositions++; } } @Override - public final void end() { + public final void end() throws IOException { + super.end(); // set final offset int finalOffset = correctOffset(scanner.yychar() + scanner.yylength()); offsetAtt.setOffset(finalOffset, finalOffset); + // adjust any skipped tokens + posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions); } @Override public void reset() throws IOException { scanner.yyreset(input); + 
skippedPositions = 0; } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java index f1e194939e3..e30fa2450fc 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java @@ -90,6 +90,8 @@ public final class StandardTokenizer extends Tokenizer { "", "" }; + + private int skippedPositions; private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; @@ -144,7 +146,7 @@ public final class StandardTokenizer extends Tokenizer { @Override public final boolean incrementToken() throws IOException { clearAttributes(); - int posIncr = 1; + skippedPositions = 0; while(true) { int tokenType = scanner.getNextToken(); @@ -154,7 +156,7 @@ public final class StandardTokenizer extends Tokenizer { } if (scanner.yylength() <= maxTokenLength) { - posIncrAtt.setPositionIncrement(posIncr); + posIncrAtt.setPositionIncrement(skippedPositions+1); scanner.getText(termAtt); final int start = scanner.yychar(); offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length())); @@ -163,19 +165,23 @@ public final class StandardTokenizer extends Tokenizer { } else // When we skip a too-long term, we still increment the // position increment - posIncr++; + skippedPositions++; } } @Override - public final void end() { + public final void end() throws IOException { + super.end(); // set final offset int finalOffset = correctOffset(scanner.yychar() + scanner.yylength()); offsetAtt.setOffset(finalOffset, finalOffset); + // adjust any skipped tokens + posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions); } @Override public void reset() throws IOException { scanner.yyreset(input); + skippedPositions = 0; } } diff --git 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java index 9992a5309da..9e1b23cb569 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java @@ -76,6 +76,8 @@ public final class UAX29URLEmailTokenizer extends Tokenizer { "", "", }; + + private int skippedPositions; private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; @@ -123,7 +125,7 @@ public final class UAX29URLEmailTokenizer extends Tokenizer { @Override public final boolean incrementToken() throws IOException { clearAttributes(); - int posIncr = 1; + skippedPositions = 0; while(true) { int tokenType = scanner.getNextToken(); @@ -133,7 +135,7 @@ public final class UAX29URLEmailTokenizer extends Tokenizer { } if (scanner.yylength() <= maxTokenLength) { - posIncrAtt.setPositionIncrement(posIncr); + posIncrAtt.setPositionIncrement(skippedPositions+1); scanner.getText(termAtt); final int start = scanner.yychar(); offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length())); @@ -142,19 +144,23 @@ public final class UAX29URLEmailTokenizer extends Tokenizer { } else // When we skip a too-long term, we still increment the // position increment - posIncr++; + skippedPositions++; } } @Override - public final void end() { + public final void end() throws IOException { + super.end(); // set final offset int finalOffset = correctOffset(scanner.yychar() + scanner.yylength()); offsetAtt.setOffset(finalOffset, finalOffset); + // adjust any skipped tokens + posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions); } @Override public void reset() throws IOException { scanner.yyreset(input); + skippedPositions = 0; } } diff --git 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java index 38b6e626e86..d19760f2a1c 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java @@ -142,7 +142,8 @@ public abstract class CharTokenizer extends Tokenizer { } @Override - public final void end() { + public final void end() throws IOException { + super.end(); // set final offset offsetAtt.setOffset(finalOffset, finalOffset); } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java index 731647c168c..1f5071a5382 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java @@ -34,6 +34,7 @@ public abstract class FilteringTokenFilter extends TokenFilter { protected final Version version; private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private int skippedPositions; /** * Create a new {@link FilteringTokenFilter}. 
@@ -50,7 +51,7 @@ public abstract class FilteringTokenFilter extends TokenFilter { @Override public final boolean incrementToken() throws IOException { - int skippedPositions = 0; + skippedPositions = 0; while (input.incrementToken()) { if (accept()) { if (skippedPositions != 0) { @@ -68,6 +69,12 @@ public abstract class FilteringTokenFilter extends TokenFilter { @Override public void reset() throws IOException { super.reset(); + skippedPositions = 0; } + @Override + public void end() throws IOException { + super.end(); + posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); + } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java index 984c4de3bde..8d61852156c 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java @@ -309,7 +309,8 @@ public final class WikipediaTokenizer extends Tokenizer { } @Override - public void end() { + public void end() throws IOException { + super.end(); // set final offset final int finalOffset = correctOffset(scanner.yychar() + scanner.yylength()); this.offsetAtt.setOffset(finalOffset, finalOffset); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java index 79d63051338..383fd7066d4 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java @@ -90,6 +90,22 @@ public class TestStopFilter extends BaseTokenStreamTestCase { StopFilter stpf01 = new StopFilter(TEST_VERSION_CURRENT, stpf0, stopSet1); // two stop filters concatenated! 
doTestStopPositons(stpf01); } + + // LUCENE-3849: make sure after .end() we see the "ending" posInc + public void testEndStopword() throws Exception { + CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "of"); + StopFilter stpf = new StopFilter(Version.LUCENE_40, new MockTokenizer(new StringReader("test of"), MockTokenizer.WHITESPACE, false), stopSet); + assertTokenStreamContents(stpf, new String[] { "test" }, + new int[] {0}, + new int[] {4}, + null, + new int[] {1}, + null, + 7, + 1, + null, + true); + } private void doTestStopPositons(StopFilter stpf) throws IOException { CharTermAttribute termAtt = stpf.getAttribute(CharTermAttribute.class); diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java index 00b90cb0859..176ee9ba8b2 100644 --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java @@ -120,7 +120,8 @@ public final class ICUTokenizer extends Tokenizer { } @Override - public void end() { + public void end() throws IOException { + super.end(); final int finalOffset = (length < 0) ? 
offset : offset + length; offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset)); } diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java index f25c67cfb48..3568d56859d 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java @@ -280,7 +280,8 @@ public final class JapaneseTokenizer extends Tokenizer { } @Override - public void end() { + public void end() throws IOException { + super.end(); // Set final offset int finalOffset = correctOffset(pos); offsetAtt.setOffset(finalOffset, finalOffset); diff --git a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java index e25ed6cc5d5..1ff8b88f096 100644 --- a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java +++ b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java @@ -112,7 +112,8 @@ public final class SentenceTokenizer extends Tokenizer { } @Override - public void end() { + public void end() throws IOException { + super.end(); // set final offset final int finalOffset = correctOffset(tokenEnd); offsetAtt.setOffset(finalOffset, finalOffset); diff --git a/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java b/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java index 8453492375f..ead3bf9b576 100644 --- a/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java +++ b/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java @@ -91,9 +91,4 @@ public abstract class BaseUIMATokenizer extends Tokenizer { public 
void reset() throws IOException { iterator = null; } - - @Override - public void end() throws IOException { - iterator = null; - } } diff --git a/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java b/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java index 90c31e969a5..ba35230083a 100644 --- a/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java +++ b/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java @@ -86,7 +86,7 @@ public final class UIMAAnnotationsTokenizer extends BaseUIMATokenizer { @Override public void end() throws IOException { - offsetAttr.setOffset(finalOffset, finalOffset); super.end(); + offsetAttr.setOffset(finalOffset, finalOffset); } } diff --git a/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java b/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java index a6b1d159148..3b337aaaae4 100644 --- a/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java +++ b/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java @@ -107,8 +107,8 @@ public final class UIMATypeAwareAnnotationsTokenizer extends BaseUIMATokenizer { @Override public void end() throws IOException { - offsetAttr.setOffset(finalOffset, finalOffset); super.end(); + offsetAttr.setOffset(finalOffset, finalOffset); } diff --git a/lucene/core/src/java/org/apache/lucene/analysis/TokenStream.java b/lucene/core/src/java/org/apache/lucene/analysis/TokenStream.java index fc8832594d2..6dde1816a41 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/TokenStream.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/TokenStream.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.io.Closeable; import java.lang.reflect.Modifier; +import 
org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; @@ -159,11 +160,18 @@ public abstract class TokenStream extends AttributeSource implements Closeable { * setting the final offset of a stream. The final offset of a stream might * differ from the offset of the last token eg in case one or more whitespaces * followed after the last token, but a WhitespaceTokenizer was used. + *

+ * Additionally, any skipped positions (such as those removed by a stopfilter) + * can be applied to the position increment, or any adjustment of other + * attributes where the end-of-stream value may be important. * * @throws IOException If an I/O error occurs */ public void end() throws IOException { - // do nothing by default + clearAttributes(); // LUCENE-3849: don't consume dirty atts + if (hasAttribute(PositionIncrementAttribute.class)) { + getAttribute(PositionIncrementAttribute.class).setPositionIncrement(0); + } } /** diff --git a/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java b/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java index e682e4f21c4..0e966698deb 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java @@ -175,7 +175,9 @@ final class DocInverterPerField extends DocFieldConsumerPerField { } // trigger streams to perform end-of-stream operations stream.end(); - + // TODO: maybe add some safety? then again, it's already checked + // when we come back around to the field... 
+ fieldState.position += posIncrAttribute.getPositionIncrement(); fieldState.offset += offsetAttribute.endOffset(); success2 = true; } finally { diff --git a/lucene/core/src/test/org/apache/lucene/index/BinaryTokenStream.java b/lucene/core/src/test/org/apache/lucene/index/BinaryTokenStream.java index 09f33ad2902..1b52b7a573c 100644 --- a/lucene/core/src/test/org/apache/lucene/index/BinaryTokenStream.java +++ b/lucene/core/src/test/org/apache/lucene/index/BinaryTokenStream.java @@ -31,16 +31,19 @@ import org.apache.lucene.analysis.CannedBinaryTokenStream; // javadocs */ public final class BinaryTokenStream extends TokenStream { private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class); + private final BytesRef bytes; private boolean available = true; public BinaryTokenStream(BytesRef bytes) { - bytesAtt.setBytesRef(bytes); + this.bytes = bytes; } @Override public boolean incrementToken() { if (available) { + clearAttributes(); available = false; + bytesAtt.setBytesRef(bytes); return true; } return false; diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 5fe04427966..86d639aca00 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -50,6 +50,7 @@ import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.AlreadyClosedException; @@ -72,6 +73,9 @@ import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.SetOnce; import org.apache.lucene.util.ThreadInterruptedException; import org.apache.lucene.util._TestUtil; +import 
org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.packed.PackedInts; import org.junit.Test; @@ -1899,6 +1903,65 @@ public class TestIndexWriter extends LuceneTestCase { } } + // LUCENE-3849 + public void testStopwordsPosIncHole() throws Exception { + Directory dir = newDirectory(); + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader); + TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET); + return new TokenStreamComponents(tokenizer, stream); + } + }; + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a); + Document doc = new Document(); + doc.add(new TextField("body", "just a", Field.Store.NO)); + doc.add(new TextField("body", "test of gaps", Field.Store.NO)); + iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + IndexSearcher is = newSearcher(ir); + PhraseQuery pq = new PhraseQuery(); + pq.add(new Term("body", "just"), 0); + pq.add(new Term("body", "test"), 2); + // body:"just ? 
test" + assertEquals(1, is.search(pq, 5).totalHits); + ir.close(); + dir.close(); + } + + // LUCENE-3849 + public void testStopwordsPosIncHole2() throws Exception { + // use two stopfilters for testing here + Directory dir = newDirectory(); + final Automaton secondSet = BasicAutomata.makeString("foobar"); + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader); + TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET); + stream = new MockTokenFilter(stream, new CharacterRunAutomaton(secondSet)); + return new TokenStreamComponents(tokenizer, stream); + } + }; + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a); + Document doc = new Document(); + doc.add(new TextField("body", "just a foobar", Field.Store.NO)); + doc.add(new TextField("body", "test of gaps", Field.Store.NO)); + iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + IndexSearcher is = newSearcher(ir); + PhraseQuery pq = new PhraseQuery(); + pq.add(new Term("body", "just"), 0); + pq.add(new Term("body", "test"), 3); + // body:"just ? ? test" + assertEquals(1, is.search(pq, 5).totalHits); + ir.close(); + dir.close(); + } + // here we do better, there is no current segments file, so we don't delete anything. // however, if you actually go and make a commit, the next time you run indexwriter // this file will be gone. 
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java index 1e106cf7586..88cc95c2603 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java @@ -555,12 +555,16 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter { private CharTermAttribute termAtt; private PositionIncrementAttribute posIncrAtt; private boolean returned; + private int val; + private final String word; + public SinglePositionTokenStream(String word) { termAtt = addAttribute(CharTermAttribute.class); posIncrAtt = addAttribute(PositionIncrementAttribute.class); - termAtt.setEmpty().append(word); + this.word = word; returned = true; } + /** * Set the value we want to keep, as the position increment. * Note that when TermPositions.nextPosition() is later used to @@ -574,15 +578,21 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter { * This change is described in Lucene's JIRA: LUCENE-1542. 
*/ public void set(int val) { - posIncrAtt.setPositionIncrement(val); + this.val = val; returned = false; } + @Override public boolean incrementToken() throws IOException { if (returned) { return false; } - return returned = true; + clearAttributes(); + posIncrAtt.setPositionIncrement(val); + termAtt.setEmpty(); + termAtt.append(word); + returned = true; + return true; } } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java index 43ea21f5004..ea00fb3341d 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java @@ -264,7 +264,8 @@ public abstract class AbstractTestCase extends LuceneTestCase { } @Override - public final void end(){ + public final void end() throws IOException { + super.end(); offsetAtt.setOffset(getFinalOffset(),getFinalOffset()); } diff --git a/lucene/misc/src/test/org/apache/lucene/index/sorter/SorterTestBase.java b/lucene/misc/src/test/org/apache/lucene/index/sorter/SorterTestBase.java index 38456a588e4..e12f5286420 100644 --- a/lucene/misc/src/test/org/apache/lucene/index/sorter/SorterTestBase.java +++ b/lucene/misc/src/test/org/apache/lucene/index/sorter/SorterTestBase.java @@ -114,7 +114,6 @@ public abstract class SorterTestBase extends LuceneTestCase { public PositionsTokenStream() { term = addAttribute(CharTermAttribute.class); - term.append(DOC_POSITIONS_TERM); payload = addAttribute(PayloadAttribute.class); offset = addAttribute(OffsetAttribute.class); } @@ -125,6 +124,8 @@ public abstract class SorterTestBase extends LuceneTestCase { return false; } + clearAttributes(); + term.append(DOC_POSITIONS_TERM); payload.setPayload(new BytesRef(Integer.toString(pos))); offset.setOffset(off, off); --pos; diff --git 
a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilter.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilter.java index 290650a9a31..71cb4b98efd 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilter.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilter.java @@ -50,7 +50,6 @@ public final class SuggestStopFilter extends TokenFilter { private final CharArraySet stopWords; private State endState; - private boolean ended; /** Sole constructor. */ public SuggestStopFilter(TokenStream input, CharArraySet stopWords) { @@ -61,28 +60,24 @@ public final class SuggestStopFilter extends TokenFilter { @Override public void reset() throws IOException { super.reset(); - ended = false; endState = null; } @Override public void end() throws IOException { - if (!ended) { + if (endState == null) { super.end(); } else { // NOTE: we already called .end() from our .next() when // the stream was complete, so we do not call // super.end() here - - if (endState != null) { - restoreState(endState); - } + restoreState(endState); } } @Override public boolean incrementToken() throws IOException { - if (ended) { + if (endState != null) { return false; } @@ -101,8 +96,9 @@ public final class SuggestStopFilter extends TokenFilter { // It was a stopword; skip it skippedPositions += posInc; } else { + clearAttributes(); input.end(); - ended = true; + endState = captureState(); int finalEndOffset = offsetAtt.endOffset(); assert finalEndOffset >= endOffset; if (finalEndOffset > endOffset) { @@ -112,7 +108,6 @@ public final class SuggestStopFilter extends TokenFilter { } else { // No token separator after final token that // looked like a stop-word; don't filter it: - endState = captureState(); restoreState(sav); posIncAtt.setPositionIncrement(skippedPositions + posIncAtt.getPositionIncrement()); keywordAtt.setKeyword(true); diff --git 
a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java index f5de8ebde74..bcf51528477 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java @@ -112,7 +112,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { // - offsets only move forwards (startOffset >= // lastStartOffset) public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], - int posLengths[], Integer finalOffset, boolean[] keywordAtts, + int posLengths[], Integer finalOffset, Integer finalPosInc, boolean[] keywordAtts, boolean offsetsAreCorrect) throws IOException { assertNotNull(output); CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class); @@ -136,7 +136,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { } PositionIncrementAttribute posIncrAtt = null; - if (posIncrements != null) { + if (posIncrements != null || finalPosInc != null) { assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class)); posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class); } @@ -255,19 +255,43 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1); } } + if (ts.incrementToken()) { fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token=" + termAtt.toString()); } + + // repeat our extra safety checks for end() + ts.clearAttributes(); + if (termAtt != null) termAtt.setEmpty().append("bogusTerm"); + if (offsetAtt != null) offsetAtt.setOffset(14584724,24683243); + if (typeAtt != null) 
typeAtt.setType("bogusType"); + if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657); + if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653); + + checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before + ts.end(); + assertTrue("super.end()/clearAttributes() was not called correctly in end()", checkClearAtt.getAndResetClearCalled()); + if (finalOffset != null) { - assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset()); + assertEquals("finalOffset", finalOffset.intValue(), offsetAtt.endOffset()); } if (offsetAtt != null) { assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0); } + if (finalPosInc != null) { + assertEquals("finalPosInc", finalPosInc.intValue(), posIncrAtt.getPositionIncrement()); + } + ts.close(); } + public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], + int posLengths[], Integer finalOffset, boolean[] keywordAtts, + boolean offsetsAreCorrect) throws IOException { + assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, null, offsetsAreCorrect); + } + public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean offsetsAreCorrect) throws IOException { assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, offsetsAreCorrect); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java index 579d60d1037..0aea06c0d77 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java +++ 
b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java @@ -58,7 +58,8 @@ public final class MockTokenFilter extends TokenFilter { private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); - + private int skippedPositions; + /** * Create a new MockTokenFilter. * @@ -76,7 +77,7 @@ public final class MockTokenFilter extends TokenFilter { // initial token with posInc=0 ever // return the first non-stop word found - int skippedPositions = 0; + skippedPositions = 0; while (input.incrementToken()) { if (!filter.run(termAtt.buffer(), 0, termAtt.length())) { posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); @@ -87,4 +88,16 @@ public final class MockTokenFilter extends TokenFilter { // reached EOS -- return false return false; } + + @Override + public void end() throws IOException { + super.end(); + posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); + } + + @Override + public void reset() throws IOException { + super.reset(); + skippedPositions = 0; + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java index 707c591d31e..cf70ec11863 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java @@ -244,6 +244,7 @@ public class MockTokenizer extends Tokenizer { @Override public void end() throws IOException { + super.end(); int finalOffset = correctOffset(off); offsetAtt.setOffset(finalOffset, finalOffset); // some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false.