LUCENE-3849: end() now sets position increment, so any trailing holes are counted

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1515887 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2013-08-20 17:13:06 +00:00
parent fbbdba2614
commit 2a5421ca68
30 changed files with 226 additions and 53 deletions

View File

@ -126,6 +126,11 @@ Bug Fixes
the default one) have their own limits (David Smiley, Robert Muir, the default one) have their own limits (David Smiley, Robert Muir,
Mike McCandless) Mike McCandless)
* LUCENE-3849: TokenStreams now set the position increment in end(),
so we can handle trailing holes. If you have a custom TokenStream
implementing end() then be sure it calls super.end(). (Robert Muir,
Mike McCandless)
API Changes API Changes
* LUCENE-5094: Add ramBytesUsed() to MultiDocValues.OrdinalMap. * LUCENE-5094: Add ramBytesUsed() to MultiDocValues.OrdinalMap.

View File

@ -80,7 +80,8 @@ public final class KeywordTokenizer extends Tokenizer {
} }
@Override @Override
public final void end() { public final void end() throws IOException {
super.end();
// set final offset // set final offset
offsetAtt.setOffset(finalOffset, finalOffset); offsetAtt.setOffset(finalOffset, finalOffset);
} }

View File

@ -217,13 +217,15 @@ public class NGramTokenizer extends Tokenizer {
} }
@Override @Override
public final void end() { public final void end() throws IOException {
super.end();
assert bufferStart <= bufferEnd; assert bufferStart <= bufferEnd;
int endOffset = offset; int endOffset = offset;
for (int i = bufferStart; i < bufferEnd; ++i) { for (int i = bufferStart; i < bufferEnd; ++i) {
endOffset += Character.charCount(buffer[i]); endOffset += Character.charCount(buffer[i]);
} }
endOffset = correctOffset(endOffset); endOffset = correctOffset(endOffset);
// set final offset
offsetAtt.setOffset(endOffset, endOffset); offsetAtt.setOffset(endOffset, endOffset);
} }

View File

@ -191,7 +191,8 @@ public class PathHierarchyTokenizer extends Tokenizer {
} }
@Override @Override
public final void end() { public final void end() throws IOException {
super.end();
// set final offset // set final offset
int finalOffset = correctOffset(charsRead); int finalOffset = correctOffset(charsRead);
offsetAtt.setOffset(finalOffset, finalOffset); offsetAtt.setOffset(finalOffset, finalOffset);

View File

@ -176,7 +176,8 @@ public class ReversePathHierarchyTokenizer extends Tokenizer {
} }
@Override @Override
public final void end() { public final void end() throws IOException {
super.end();
// set final offset // set final offset
offsetAtt.setOffset(finalOffset, finalOffset); offsetAtt.setOffset(finalOffset, finalOffset);
} }

View File

@ -130,7 +130,8 @@ public final class PatternTokenizer extends Tokenizer {
} }
@Override @Override
public void end() { public void end() throws IOException {
super.end();
final int ofs = correctOffset(str.length()); final int ofs = correctOffset(str.length());
offsetAtt.setOffset(ofs, ofs); offsetAtt.setOffset(ofs, ofs);
} }

View File

@ -76,6 +76,8 @@ public final class ClassicTokenizer extends Tokenizer {
"<CJ>", "<CJ>",
"<ACRONYM_DEP>" "<ACRONYM_DEP>"
}; };
private int skippedPositions;
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
@ -130,7 +132,7 @@ public final class ClassicTokenizer extends Tokenizer {
@Override @Override
public final boolean incrementToken() throws IOException { public final boolean incrementToken() throws IOException {
clearAttributes(); clearAttributes();
int posIncr = 1; skippedPositions = 0;
while(true) { while(true) {
int tokenType = scanner.getNextToken(); int tokenType = scanner.getNextToken();
@ -140,7 +142,7 @@ public final class ClassicTokenizer extends Tokenizer {
} }
if (scanner.yylength() <= maxTokenLength) { if (scanner.yylength() <= maxTokenLength) {
posIncrAtt.setPositionIncrement(posIncr); posIncrAtt.setPositionIncrement(skippedPositions+1);
scanner.getText(termAtt); scanner.getText(termAtt);
final int start = scanner.yychar(); final int start = scanner.yychar();
offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length())); offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
@ -155,19 +157,23 @@ public final class ClassicTokenizer extends Tokenizer {
} else } else
// When we skip a too-long term, we still increment the // When we skip a too-long term, we still increment the
// position increment // position increment
posIncr++; skippedPositions++;
} }
} }
@Override @Override
public final void end() { public final void end() throws IOException {
super.end();
// set final offset // set final offset
int finalOffset = correctOffset(scanner.yychar() + scanner.yylength()); int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
offsetAtt.setOffset(finalOffset, finalOffset); offsetAtt.setOffset(finalOffset, finalOffset);
// count any trailing skipped (too-long) tokens in the final position increment
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions);
} }
@Override @Override
public void reset() throws IOException { public void reset() throws IOException {
scanner.yyreset(input); scanner.yyreset(input);
skippedPositions = 0;
} }
} }

View File

@ -90,6 +90,8 @@ public final class StandardTokenizer extends Tokenizer {
"<KATAKANA>", "<KATAKANA>",
"<HANGUL>" "<HANGUL>"
}; };
private int skippedPositions;
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
@ -144,7 +146,7 @@ public final class StandardTokenizer extends Tokenizer {
@Override @Override
public final boolean incrementToken() throws IOException { public final boolean incrementToken() throws IOException {
clearAttributes(); clearAttributes();
int posIncr = 1; skippedPositions = 0;
while(true) { while(true) {
int tokenType = scanner.getNextToken(); int tokenType = scanner.getNextToken();
@ -154,7 +156,7 @@ public final class StandardTokenizer extends Tokenizer {
} }
if (scanner.yylength() <= maxTokenLength) { if (scanner.yylength() <= maxTokenLength) {
posIncrAtt.setPositionIncrement(posIncr); posIncrAtt.setPositionIncrement(skippedPositions+1);
scanner.getText(termAtt); scanner.getText(termAtt);
final int start = scanner.yychar(); final int start = scanner.yychar();
offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length())); offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
@ -163,19 +165,23 @@ public final class StandardTokenizer extends Tokenizer {
} else } else
// When we skip a too-long term, we still increment the // When we skip a too-long term, we still increment the
// position increment // position increment
posIncr++; skippedPositions++;
} }
} }
@Override @Override
public final void end() { public final void end() throws IOException {
super.end();
// set final offset // set final offset
int finalOffset = correctOffset(scanner.yychar() + scanner.yylength()); int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
offsetAtt.setOffset(finalOffset, finalOffset); offsetAtt.setOffset(finalOffset, finalOffset);
// count any trailing skipped (too-long) tokens in the final position increment
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions);
} }
@Override @Override
public void reset() throws IOException { public void reset() throws IOException {
scanner.yyreset(input); scanner.yyreset(input);
skippedPositions = 0;
} }
} }

View File

@ -76,6 +76,8 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
"<URL>", "<URL>",
"<EMAIL>", "<EMAIL>",
}; };
private int skippedPositions;
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
@ -123,7 +125,7 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
@Override @Override
public final boolean incrementToken() throws IOException { public final boolean incrementToken() throws IOException {
clearAttributes(); clearAttributes();
int posIncr = 1; skippedPositions = 0;
while(true) { while(true) {
int tokenType = scanner.getNextToken(); int tokenType = scanner.getNextToken();
@ -133,7 +135,7 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
} }
if (scanner.yylength() <= maxTokenLength) { if (scanner.yylength() <= maxTokenLength) {
posIncrAtt.setPositionIncrement(posIncr); posIncrAtt.setPositionIncrement(skippedPositions+1);
scanner.getText(termAtt); scanner.getText(termAtt);
final int start = scanner.yychar(); final int start = scanner.yychar();
offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length())); offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
@ -142,19 +144,23 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
} else } else
// When we skip a too-long term, we still increment the // When we skip a too-long term, we still increment the
// position increment // position increment
posIncr++; skippedPositions++;
} }
} }
@Override @Override
public final void end() { public final void end() throws IOException {
super.end();
// set final offset // set final offset
int finalOffset = correctOffset(scanner.yychar() + scanner.yylength()); int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
offsetAtt.setOffset(finalOffset, finalOffset); offsetAtt.setOffset(finalOffset, finalOffset);
// count any trailing skipped (too-long) tokens in the final position increment
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions);
} }
@Override @Override
public void reset() throws IOException { public void reset() throws IOException {
scanner.yyreset(input); scanner.yyreset(input);
skippedPositions = 0;
} }
} }

View File

@ -142,7 +142,8 @@ public abstract class CharTokenizer extends Tokenizer {
} }
@Override @Override
public final void end() { public final void end() throws IOException {
super.end();
// set final offset // set final offset
offsetAtt.setOffset(finalOffset, finalOffset); offsetAtt.setOffset(finalOffset, finalOffset);
} }

View File

@ -34,6 +34,7 @@ public abstract class FilteringTokenFilter extends TokenFilter {
protected final Version version; protected final Version version;
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private int skippedPositions;
/** /**
* Create a new {@link FilteringTokenFilter}. * Create a new {@link FilteringTokenFilter}.
@ -50,7 +51,7 @@ public abstract class FilteringTokenFilter extends TokenFilter {
@Override @Override
public final boolean incrementToken() throws IOException { public final boolean incrementToken() throws IOException {
int skippedPositions = 0; skippedPositions = 0;
while (input.incrementToken()) { while (input.incrementToken()) {
if (accept()) { if (accept()) {
if (skippedPositions != 0) { if (skippedPositions != 0) {
@ -68,6 +69,12 @@ public abstract class FilteringTokenFilter extends TokenFilter {
@Override @Override
public void reset() throws IOException { public void reset() throws IOException {
super.reset(); super.reset();
skippedPositions = 0;
} }
@Override
public void end() throws IOException {
super.end();
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
}
} }

View File

@ -309,7 +309,8 @@ public final class WikipediaTokenizer extends Tokenizer {
} }
@Override @Override
public void end() { public void end() throws IOException {
super.end();
// set final offset // set final offset
final int finalOffset = correctOffset(scanner.yychar() + scanner.yylength()); final int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
this.offsetAtt.setOffset(finalOffset, finalOffset); this.offsetAtt.setOffset(finalOffset, finalOffset);

View File

@ -90,6 +90,22 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
StopFilter stpf01 = new StopFilter(TEST_VERSION_CURRENT, stpf0, stopSet1); // two stop filters concatenated! StopFilter stpf01 = new StopFilter(TEST_VERSION_CURRENT, stpf0, stopSet1); // two stop filters concatenated!
doTestStopPositons(stpf01); doTestStopPositons(stpf01);
} }
// LUCENE-3849: make sure after .end() we see the "ending" posInc
public void testEndStopword() throws Exception {
CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "of");
StopFilter stpf = new StopFilter(Version.LUCENE_40, new MockTokenizer(new StringReader("test of"), MockTokenizer.WHITESPACE, false), stopSet);
assertTokenStreamContents(stpf, new String[] { "test" },
new int[] {0},
new int[] {4},
null,
new int[] {1},
null,
7,
1,
null,
true);
}
private void doTestStopPositons(StopFilter stpf) throws IOException { private void doTestStopPositons(StopFilter stpf) throws IOException {
CharTermAttribute termAtt = stpf.getAttribute(CharTermAttribute.class); CharTermAttribute termAtt = stpf.getAttribute(CharTermAttribute.class);

View File

@ -120,7 +120,8 @@ public final class ICUTokenizer extends Tokenizer {
} }
@Override @Override
public void end() { public void end() throws IOException {
super.end();
final int finalOffset = (length < 0) ? offset : offset + length; final int finalOffset = (length < 0) ? offset : offset + length;
offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset)); offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
} }

View File

@ -280,7 +280,8 @@ public final class JapaneseTokenizer extends Tokenizer {
} }
@Override @Override
public void end() { public void end() throws IOException {
super.end();
// Set final offset // Set final offset
int finalOffset = correctOffset(pos); int finalOffset = correctOffset(pos);
offsetAtt.setOffset(finalOffset, finalOffset); offsetAtt.setOffset(finalOffset, finalOffset);

View File

@ -112,7 +112,8 @@ public final class SentenceTokenizer extends Tokenizer {
} }
@Override @Override
public void end() { public void end() throws IOException {
super.end();
// set final offset // set final offset
final int finalOffset = correctOffset(tokenEnd); final int finalOffset = correctOffset(tokenEnd);
offsetAtt.setOffset(finalOffset, finalOffset); offsetAtt.setOffset(finalOffset, finalOffset);

View File

@ -91,9 +91,4 @@ public abstract class BaseUIMATokenizer extends Tokenizer {
public void reset() throws IOException { public void reset() throws IOException {
iterator = null; iterator = null;
} }
@Override
public void end() throws IOException {
iterator = null;
}
} }

View File

@ -86,7 +86,7 @@ public final class UIMAAnnotationsTokenizer extends BaseUIMATokenizer {
@Override @Override
public void end() throws IOException { public void end() throws IOException {
offsetAttr.setOffset(finalOffset, finalOffset);
super.end(); super.end();
offsetAttr.setOffset(finalOffset, finalOffset);
} }
} }

View File

@ -107,8 +107,8 @@ public final class UIMATypeAwareAnnotationsTokenizer extends BaseUIMATokenizer {
@Override @Override
public void end() throws IOException { public void end() throws IOException {
offsetAttr.setOffset(finalOffset, finalOffset);
super.end(); super.end();
offsetAttr.setOffset(finalOffset, finalOffset);
} }

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import java.io.Closeable; import java.io.Closeable;
import java.lang.reflect.Modifier; import java.lang.reflect.Modifier;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
@ -159,11 +160,18 @@ public abstract class TokenStream extends AttributeSource implements Closeable {
* setting the final offset of a stream. The final offset of a stream might * setting the final offset of a stream. The final offset of a stream might
* differ from the offset of the last token eg in case one or more whitespaces * differ from the offset of the last token eg in case one or more whitespaces
* followed after the last token, but a WhitespaceTokenizer was used. * followed after the last token, but a WhitespaceTokenizer was used.
* <p>
* Additionally, any skipped positions (such as those removed by a stop filter)
* can be applied to the position increment here, and any other attribute whose
* end-of-stream value may be important can be adjusted as well.
* *
* @throws IOException If an I/O error occurs * @throws IOException If an I/O error occurs
*/ */
public void end() throws IOException { public void end() throws IOException {
// do nothing by default clearAttributes(); // LUCENE-3849: don't consume dirty atts
if (hasAttribute(PositionIncrementAttribute.class)) {
getAttribute(PositionIncrementAttribute.class).setPositionIncrement(0);
}
} }
/** /**

View File

@ -175,7 +175,9 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
} }
// trigger streams to perform end-of-stream operations // trigger streams to perform end-of-stream operations
stream.end(); stream.end();
// TODO: maybe add some safety? then again, it's already checked
// when we come back around to the field...
fieldState.position += posIncrAttribute.getPositionIncrement();
fieldState.offset += offsetAttribute.endOffset(); fieldState.offset += offsetAttribute.endOffset();
success2 = true; success2 = true;
} finally { } finally {

View File

@ -31,16 +31,19 @@ import org.apache.lucene.analysis.CannedBinaryTokenStream; // javadocs
*/ */
public final class BinaryTokenStream extends TokenStream { public final class BinaryTokenStream extends TokenStream {
private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class); private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);
private final BytesRef bytes;
private boolean available = true; private boolean available = true;
public BinaryTokenStream(BytesRef bytes) { public BinaryTokenStream(BytesRef bytes) {
bytesAtt.setBytesRef(bytes); this.bytes = bytes;
} }
@Override @Override
public boolean incrementToken() { public boolean incrementToken() {
if (available) { if (available) {
clearAttributes();
available = false; available = false;
bytesAtt.setBytesRef(bytes);
return true; return true;
} }
return false; return false;

View File

@ -50,6 +50,7 @@ import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.AlreadyClosedException;
@ -72,6 +73,9 @@ import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.SetOnce; import org.apache.lucene.util.SetOnce;
import org.apache.lucene.util.ThreadInterruptedException; import org.apache.lucene.util.ThreadInterruptedException;
import org.apache.lucene.util._TestUtil; import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PackedInts;
import org.junit.Test; import org.junit.Test;
@ -1899,6 +1903,65 @@ public class TestIndexWriter extends LuceneTestCase {
} }
} }
// LUCENE-3849
public void testStopwordsPosIncHole() throws Exception {
Directory dir = newDirectory();
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader);
TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
return new TokenStreamComponents(tokenizer, stream);
}
};
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a);
Document doc = new Document();
doc.add(new TextField("body", "just a", Field.Store.NO));
doc.add(new TextField("body", "test of gaps", Field.Store.NO));
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher is = newSearcher(ir);
PhraseQuery pq = new PhraseQuery();
pq.add(new Term("body", "just"), 0);
pq.add(new Term("body", "test"), 2);
// body:"just ? test"
assertEquals(1, is.search(pq, 5).totalHits);
ir.close();
dir.close();
}
// LUCENE-3849
public void testStopwordsPosIncHole2() throws Exception {
// use two stopfilters for testing here
Directory dir = newDirectory();
final Automaton secondSet = BasicAutomata.makeString("foobar");
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader);
TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
stream = new MockTokenFilter(stream, new CharacterRunAutomaton(secondSet));
return new TokenStreamComponents(tokenizer, stream);
}
};
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a);
Document doc = new Document();
doc.add(new TextField("body", "just a foobar", Field.Store.NO));
doc.add(new TextField("body", "test of gaps", Field.Store.NO));
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher is = newSearcher(ir);
PhraseQuery pq = new PhraseQuery();
pq.add(new Term("body", "just"), 0);
pq.add(new Term("body", "test"), 3);
// body:"just ? ? test"
assertEquals(1, is.search(pq, 5).totalHits);
ir.close();
dir.close();
}
// here we do better, there is no current segments file, so we don't delete anything. // here we do better, there is no current segments file, so we don't delete anything.
// however, if you actually go and make a commit, the next time you run indexwriter // however, if you actually go and make a commit, the next time you run indexwriter
// this file will be gone. // this file will be gone.

View File

@ -555,12 +555,16 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
private CharTermAttribute termAtt; private CharTermAttribute termAtt;
private PositionIncrementAttribute posIncrAtt; private PositionIncrementAttribute posIncrAtt;
private boolean returned; private boolean returned;
private int val;
private final String word;
public SinglePositionTokenStream(String word) { public SinglePositionTokenStream(String word) {
termAtt = addAttribute(CharTermAttribute.class); termAtt = addAttribute(CharTermAttribute.class);
posIncrAtt = addAttribute(PositionIncrementAttribute.class); posIncrAtt = addAttribute(PositionIncrementAttribute.class);
termAtt.setEmpty().append(word); this.word = word;
returned = true; returned = true;
} }
/** /**
* Set the value we want to keep, as the position increment. * Set the value we want to keep, as the position increment.
* Note that when TermPositions.nextPosition() is later used to * Note that when TermPositions.nextPosition() is later used to
@ -574,15 +578,21 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
* This change is described in Lucene's JIRA: LUCENE-1542. * This change is described in Lucene's JIRA: LUCENE-1542.
*/ */
public void set(int val) { public void set(int val) {
posIncrAtt.setPositionIncrement(val); this.val = val;
returned = false; returned = false;
} }
@Override @Override
public boolean incrementToken() throws IOException { public boolean incrementToken() throws IOException {
if (returned) { if (returned) {
return false; return false;
} }
return returned = true; clearAttributes();
posIncrAtt.setPositionIncrement(val);
termAtt.setEmpty();
termAtt.append(word);
returned = true;
return true;
} }
} }

View File

@ -264,7 +264,8 @@ public abstract class AbstractTestCase extends LuceneTestCase {
} }
@Override @Override
public final void end(){ public final void end() throws IOException {
super.end();
offsetAtt.setOffset(getFinalOffset(),getFinalOffset()); offsetAtt.setOffset(getFinalOffset(),getFinalOffset());
} }

View File

@ -114,7 +114,6 @@ public abstract class SorterTestBase extends LuceneTestCase {
public PositionsTokenStream() { public PositionsTokenStream() {
term = addAttribute(CharTermAttribute.class); term = addAttribute(CharTermAttribute.class);
term.append(DOC_POSITIONS_TERM);
payload = addAttribute(PayloadAttribute.class); payload = addAttribute(PayloadAttribute.class);
offset = addAttribute(OffsetAttribute.class); offset = addAttribute(OffsetAttribute.class);
} }
@ -125,6 +124,8 @@ public abstract class SorterTestBase extends LuceneTestCase {
return false; return false;
} }
clearAttributes();
term.append(DOC_POSITIONS_TERM);
payload.setPayload(new BytesRef(Integer.toString(pos))); payload.setPayload(new BytesRef(Integer.toString(pos)));
offset.setOffset(off, off); offset.setOffset(off, off);
--pos; --pos;

View File

@ -50,7 +50,6 @@ public final class SuggestStopFilter extends TokenFilter {
private final CharArraySet stopWords; private final CharArraySet stopWords;
private State endState; private State endState;
private boolean ended;
/** Sole constructor. */ /** Sole constructor. */
public SuggestStopFilter(TokenStream input, CharArraySet stopWords) { public SuggestStopFilter(TokenStream input, CharArraySet stopWords) {
@ -61,28 +60,24 @@ public final class SuggestStopFilter extends TokenFilter {
@Override @Override
public void reset() throws IOException { public void reset() throws IOException {
super.reset(); super.reset();
ended = false;
endState = null; endState = null;
} }
@Override @Override
public void end() throws IOException { public void end() throws IOException {
if (!ended) { if (endState == null) {
super.end(); super.end();
} else { } else {
// NOTE: we already called .end() from our .next() when // NOTE: we already called .end() from our .next() when
// the stream was complete, so we do not call // the stream was complete, so we do not call
// super.end() here // super.end() here
restoreState(endState);
if (endState != null) {
restoreState(endState);
}
} }
} }
@Override @Override
public boolean incrementToken() throws IOException { public boolean incrementToken() throws IOException {
if (ended) { if (endState != null) {
return false; return false;
} }
@ -101,8 +96,9 @@ public final class SuggestStopFilter extends TokenFilter {
// It was a stopword; skip it // It was a stopword; skip it
skippedPositions += posInc; skippedPositions += posInc;
} else { } else {
clearAttributes();
input.end(); input.end();
ended = true; endState = captureState();
int finalEndOffset = offsetAtt.endOffset(); int finalEndOffset = offsetAtt.endOffset();
assert finalEndOffset >= endOffset; assert finalEndOffset >= endOffset;
if (finalEndOffset > endOffset) { if (finalEndOffset > endOffset) {
@ -112,7 +108,6 @@ public final class SuggestStopFilter extends TokenFilter {
} else { } else {
// No token separator after final token that // No token separator after final token that
// looked like a stop-word; don't filter it: // looked like a stop-word; don't filter it:
endState = captureState();
restoreState(sav); restoreState(sav);
posIncAtt.setPositionIncrement(skippedPositions + posIncAtt.getPositionIncrement()); posIncAtt.setPositionIncrement(skippedPositions + posIncAtt.getPositionIncrement());
keywordAtt.setKeyword(true); keywordAtt.setKeyword(true);

View File

@ -112,7 +112,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
// - offsets only move forwards (startOffset >= // - offsets only move forwards (startOffset >=
// lastStartOffset) // lastStartOffset)
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
int posLengths[], Integer finalOffset, boolean[] keywordAtts, int posLengths[], Integer finalOffset, Integer finalPosInc, boolean[] keywordAtts,
boolean offsetsAreCorrect) throws IOException { boolean offsetsAreCorrect) throws IOException {
assertNotNull(output); assertNotNull(output);
CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class); CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
@ -136,7 +136,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
} }
PositionIncrementAttribute posIncrAtt = null; PositionIncrementAttribute posIncrAtt = null;
if (posIncrements != null) { if (posIncrements != null || finalPosInc != null) {
assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class)); assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class); posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
} }
@ -255,19 +255,43 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1); assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
} }
} }
if (ts.incrementToken()) { if (ts.incrementToken()) {
fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token=" + termAtt.toString()); fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token=" + termAtt.toString());
} }
// repeat our extra safety checks for end()
ts.clearAttributes();
if (termAtt != null) termAtt.setEmpty().append("bogusTerm");
if (offsetAtt != null) offsetAtt.setOffset(14584724,24683243);
if (typeAtt != null) typeAtt.setType("bogusType");
if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
ts.end(); ts.end();
assertTrue("super.end()/clearAttributes() was not called correctly in end()", checkClearAtt.getAndResetClearCalled());
if (finalOffset != null) { if (finalOffset != null) {
assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset()); assertEquals("finalOffset", finalOffset.intValue(), offsetAtt.endOffset());
} }
if (offsetAtt != null) { if (offsetAtt != null) {
assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0); assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
} }
if (finalPosInc != null) {
assertEquals("finalPosInc", finalPosInc.intValue(), posIncrAtt.getPositionIncrement());
}
ts.close(); ts.close();
} }
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
int posLengths[], Integer finalOffset, boolean[] keywordAtts,
boolean offsetsAreCorrect) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, null, offsetsAreCorrect);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean offsetsAreCorrect) throws IOException { public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean offsetsAreCorrect) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, offsetsAreCorrect); assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, offsetsAreCorrect);
} }

View File

@ -58,7 +58,8 @@ public final class MockTokenFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private int skippedPositions;
/** /**
* Create a new MockTokenFilter. * Create a new MockTokenFilter.
* *
@ -76,7 +77,7 @@ public final class MockTokenFilter extends TokenFilter {
// initial token with posInc=0 ever // initial token with posInc=0 ever
// return the first non-stop word found // return the first non-stop word found
int skippedPositions = 0; skippedPositions = 0;
while (input.incrementToken()) { while (input.incrementToken()) {
if (!filter.run(termAtt.buffer(), 0, termAtt.length())) { if (!filter.run(termAtt.buffer(), 0, termAtt.length())) {
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
@ -87,4 +88,16 @@ public final class MockTokenFilter extends TokenFilter {
// reached EOS -- return false // reached EOS -- return false
return false; return false;
} }
@Override
public void end() throws IOException {
super.end();
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
}
@Override
public void reset() throws IOException {
super.reset();
skippedPositions = 0;
}
} }

View File

@ -244,6 +244,7 @@ public class MockTokenizer extends Tokenizer {
@Override @Override
public void end() throws IOException { public void end() throws IOException {
super.end();
int finalOffset = correctOffset(off); int finalOffset = correctOffset(off);
offsetAtt.setOffset(finalOffset, finalOffset); offsetAtt.setOffset(finalOffset, finalOffset);
// some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false. // some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false.