LUCENE-4343: clear up more Tokenizer.setReader/TokenStream.reset issues

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1379036 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-08-30 17:50:18 +00:00
parent 06fdfe6f32
commit d7120c7e43
28 changed files with 73 additions and 116 deletions

View File

@@ -77,6 +77,10 @@ API Changes
   fields in a stored document, has been replaced with the simpler
   StoredFieldVisitor API. (Mike McCandless)
 
+* LUCENE-4343: Made Tokenizer.setReader final. This is a setter that should
+  not be overridden by subclasses: per-stream initialization should happen
+  in reset(). (Robert Muir)
+
 Bug Fixes
 
 * LUCENE-4297: BooleanScorer2 would multiply the coord() factor
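Illustrative sketch (not part of this commit; the class is hypothetical): under the new contract, a Tokenizer leaves setReader() alone and performs all per-stream initialization in reset(), so the same instance can be reused across readers.

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public final class SingleChunkTokenizer extends Tokenizer {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private boolean done = true; // pessimistic until reset() is called

  public SingleChunkTokenizer(Reader input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (done) {
      return false;
    }
    done = true;
    clearAttributes();
    final char[] buf = new char[256];
    final int len = input.read(buf);
    if (len <= 0) {
      return false;
    }
    termAtt.copyBuffer(buf, 0, len);
    return true;
  }

  @Override
  public void reset() throws IOException {
    done = false; // per-stream initialization lives here, not in setReader()
  }
}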

View File

@@ -94,8 +94,7 @@ public final class KeywordTokenizer extends Tokenizer {
   }
 
   @Override
-  public void setReader(Reader input) throws IOException {
-    super.setReader(input);
+  public void reset() throws IOException {
     this.done = false;
   }
 }

View File

@@ -78,9 +78,6 @@ public final class PatternTokenizer extends Tokenizer {
     if (group >= 0 && group > matcher.groupCount()) {
       throw new IllegalArgumentException("invalid group specified: pattern only has: " + matcher.groupCount() + " capturing groups");
     }
-    fillBuffer(str, input);
-    matcher.reset(str);
-    index = 0;
   }
 
   @Override
@@ -136,8 +133,7 @@ public final class PatternTokenizer extends Tokenizer {
   }
 
   @Override
-  public void setReader(Reader input) throws IOException {
-    super.setReader(input);
+  public void reset() throws IOException {
     fillBuffer(str, input);
     matcher.reset(str);
     index = 0;

View File

@@ -175,8 +175,7 @@ public final class ClassicTokenizer extends Tokenizer {
   }
 
   @Override
-  public void setReader(Reader reader) throws IOException {
-    super.setReader(reader);
-    scanner.yyreset(reader);
+  public void reset() throws IOException {
+    scanner.yyreset(input);
   }
 }
}

View File

@@ -183,8 +183,7 @@ public final class StandardTokenizer extends Tokenizer {
   }
 
   @Override
-  public void setReader(Reader reader) throws IOException {
-    super.setReader(reader);
-    scanner.yyreset(reader);
+  public void reset() throws IOException {
+    scanner.yyreset(input);
   }
 }
}

View File

@@ -162,8 +162,7 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
   }
 
   @Override
-  public void setReader(Reader reader) throws IOException {
-    super.setReader(reader);
-    scanner.yyreset(reader);
+  public void reset() throws IOException {
+    scanner.yyreset(input);
   }
 }
}

View File

@@ -78,7 +78,8 @@ public abstract class CharTokenizer extends Tokenizer {
     charUtils = CharacterUtils.getInstance(matchVersion);
   }
 
-  private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
+  // note: bufferIndex is -1 here as a best effort to trigger an AIOOBE in consumers that don't call reset()
+  private int offset = 0, bufferIndex = -1, dataLen = 0, finalOffset = 0;
   private static final int MAX_WORD_LEN = 255;
   private static final int IO_BUFFER_SIZE = 4096;
@@ -162,8 +163,7 @@ public abstract class CharTokenizer extends Tokenizer {
   }
 
   @Override
-  public void setReader(Reader input) throws IOException {
-    super.setReader(input);
+  public void reset() throws IOException {
     bufferIndex = 0;
     offset = 0;
     dataLen = 0;

View File

@@ -318,18 +318,12 @@ public final class WikipediaTokenizer extends Tokenizer {
    */
   @Override
   public void reset() throws IOException {
-    super.reset();
+    scanner.yyreset(input);
     tokens = null;
     scanner.reset();
     first = true;
   }
 
-  @Override
-  public void setReader(Reader reader) throws IOException {
-    super.setReader(reader);
-    scanner.yyreset(input);
-  }
-
   @Override
   public void end() {
     // set final offset

View File

@@ -39,6 +39,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
     CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
 
     CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
+    cgf.reset();
     assertTrue(cgf.incrementToken());
     assertEquals("How", term.toString());
     assertTrue(cgf.incrementToken());
@@ -61,6 +62,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
     CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);
 
     CharTermAttribute term = wt.addAttribute(CharTermAttribute.class);
+    nsf.reset();
     assertTrue(nsf.incrementToken());
     assertEquals("How_the", term.toString());
     assertTrue(nsf.incrementToken());

View File

@@ -235,6 +235,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
         CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
 
     CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
+    tf.reset();
     assertTrue(tf.incrementToken());
     assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
     assertTrue(tf.incrementToken());
@@ -256,6 +257,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
     MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
+    stream.reset();
     while (stream.incrementToken()) {
       assertTrue("Custom attribute value was lost", retAtt.getRetain());
     }

View File

@@ -80,6 +80,7 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
   void verifyPayload(TokenStream ts) throws IOException {
     PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
+    ts.reset();
     for(byte b=1;;b++) {
       boolean hasNext = ts.incrementToken();
       if (!hasNext) break;

View File

@@ -66,6 +66,7 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase {
     assertNotNull(stream);
     CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
+    stream.reset();
     while (stream.incrementToken()) {
       String text = termAtt.toString();
       assertFalse(stopWordsSet.contains(text));
@@ -83,6 +84,7 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase {
     CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
     PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
+    stream.reset();
     while (stream.incrementToken()) {
       String text = termAtt.toString();
       assertFalse(stopWordsSet.contains(text));

View File

@@ -111,6 +111,7 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
     // assign bogus values
     in.clearAttributes();
     termAtt.setEmpty().append("bogusTerm");
+    in.reset();
     while (in.incrementToken()) {
       if (out.length() > 0)
         out.append(' ');

View File

@@ -45,7 +45,8 @@ public final class ICUTokenizer extends Tokenizer {
   /** true length of text in the buffer */
   private int length = 0;
   /** length in buffer that can be evaluated safely, up to a safe end point */
-  private int usableLength = 0;
+  // note: usableLength is -1 here as a best effort to trigger an AIOOBE in consumers that don't call reset()
+  private int usableLength = -1;
   /** accumulated offset of previous buffers for this reader, for offsetAtt */
   private int offset = 0;
@@ -101,12 +102,6 @@ public final class ICUTokenizer extends Tokenizer {
     breaker.setText(buffer, 0, 0);
     length = usableLength = offset = 0;
   }
 
-  @Override
-  public void setReader(Reader input) throws IOException {
-    super.setReader(input);
-    reset();
-  }
-
   @Override
   public void end() {

View File

@@ -244,15 +244,9 @@ public final class JapaneseTokenizer extends Tokenizer {
     this.dotOut = dotOut;
   }
 
-  @Override
-  public void setReader(Reader input) throws IOException {
-    super.setReader(input);
-    buffer.reset(input);
-  }
-
   @Override
   public void reset() throws IOException {
-    super.reset();
+    buffer.reset(input);
     resetState();
   }

View File

@@ -112,16 +112,9 @@ public final class SentenceTokenizer extends Tokenizer {
   @Override
   public void reset() throws IOException {
-    super.reset();
     tokenStart = tokenEnd = 0;
   }
 
-  @Override
-  public void setReader(Reader input) throws IOException {
-    super.setReader(input);
-    reset();
-  }
-
   @Override
   public void end() {
     // set final offset

View File

@@ -80,8 +80,7 @@ public abstract class BaseUIMATokenizer extends Tokenizer {
   }
 
   @Override
-  public void setReader(Reader input) throws IOException {
-    super.setReader(input);
+  public void reset() throws IOException {
     iterator = null;
   }

View File

@@ -170,12 +170,8 @@ public abstract class TokenStream extends AttributeSource implements Closeable {
    * This method is called by a consumer before it begins consumption using
    * {@link #incrementToken()}.
    * <p/>
-   * Resets this stream to the beginning. As all TokenStreams must be reusable,
-   * any implementations which have state that needs to be reset between usages
-   * of the TokenStream, must implement this method. Note that if your TokenStream
-   * caches tokens and feeds them back again after a reset, it is imperative
-   * that you clone the tokens when you store them away (on the first pass) as
-   * well as when you return them (on future passes after {@link #reset()}).
+   * Resets this stream to a clean state. Stateful implementations must implement
+   * this method so that they can be reused, just as if they had been created fresh.
    */
   public void reset() throws IOException {}
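For context (an editorial sketch, not part of the diff): the consumer workflow this javadoc assumes calls reset() before the first incrementToken(), end() after the last token, and then close().

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class ConsumeExample {
  // Prints each term the analyzer produces for the given text.
  static void dumpTokens(Analyzer analyzer, String text) throws IOException {
    TokenStream ts = analyzer.tokenStream("field", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                    // bring the stream to a clean state first
    while (ts.incrementToken()) {  // consume token by token
      System.out.println(termAtt.toString());
    }
    ts.end();                      // set final offset state
    ts.close();                    // release the underlying reader
  }
}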

View File

@@ -82,12 +82,18 @@ public abstract class Tokenizer extends TokenStream {
     return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(currentOff) : currentOff;
   }
 
-  /** Expert: Reset the tokenizer to a new reader. Typically, an
+  /** Expert: Set a new reader on the Tokenizer. Typically, an
    * analyzer (in its tokenStream method) will use
    * this to re-use a previously created tokenizer. */
-  public void setReader(Reader input) throws IOException {
+  public final void setReader(Reader input) throws IOException {
     assert input != null: "input must not be null";
     this.input = input;
+    assert setReaderTestPoint();
   }
+
+  // only used by assert, for testing
+  boolean setReaderTestPoint() {
+    return true;
+  }
 }
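The package-private setReaderTestPoint() hook exists so test infrastructure can still observe setReader() calls now that the method is final; MockTokenizer's override of it appears further down in this commit.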

View File

@@ -1545,7 +1545,7 @@ public class TestIndexWriter extends LuceneTestCase {
       }
 
       @Override
-      public void setReader(Reader input) throws IOException {
+      public void reset() throws IOException {
         this.upto = 0;
         final StringBuilder b = new StringBuilder();
         final char[] buffer = new char[1024];

View File

@@ -227,8 +227,7 @@ public class TestTermRangeQuery extends LuceneTestCase {
     }
 
     @Override
-    public final void setReader(Reader reader) throws IOException {
-      super.setReader(reader);
+    public void reset() throws IOException {
      done = false;
    }
  }

View File

@@ -176,6 +176,8 @@ public abstract class AbstractTestCase extends LuceneTestCase {
     BytesRef bytesRef = termAttribute.getBytesRef();
 
+    tokenStream.reset();
+
     while (tokenStream.incrementToken()) {
       termAttribute.fillBytesRef();
       bytesRefs.add(BytesRef.deepCopyOf(bytesRef));
@@ -316,12 +318,6 @@ public abstract class AbstractTestCase extends LuceneTestCase {
       return delimiters.indexOf( c ) >= 0;
     }
 
-    @Override
-    public void setReader( Reader input ) throws IOException {
-      super.setReader( input );
-      reset();
-    }
-
     @Override
     public void reset() {
       startTerm = 0;

View File

@@ -81,8 +81,7 @@ public class TestMultiPhraseQueryParsing extends LuceneTestCase {
     }
 
     @Override
-    public void setReader(Reader reader) throws IOException {
-      super.setReader(reader);
+    public void reset() throws IOException {
       this.upto = 0;
       this.lastPos = 0;
     }

View File

@@ -76,14 +76,4 @@ class PrefixCellsTokenizer extends Tokenizer {
     termAtt.setLength(length);
     return length > 0; // should only happen at the end
   }
 
-  @Override
-  public final void end() {
-  }
-
-  @Override
-  public void setReader(Reader input) throws IOException {
-    super.setReader(input);
-  }
 }

View File

@@ -227,10 +227,10 @@ public class MockTokenizer extends Tokenizer {
   }
 
   @Override
-  public void setReader(Reader input) throws IOException {
-    super.setReader(input);
+  boolean setReaderTestPoint() {
     assert !enableChecks || streamState == State.CLOSE : "setReader() called in wrong state: " + streamState;
     streamState = State.SETREADER;
+    return true;
   }
 
   @Override
@Override

View File

@@ -72,15 +72,11 @@ final class TrieTokenizer extends Tokenizer {
     this.type = type;
     this.precisionStep = precisionStep;
     this.ts = ts;
-    setReader(input);
   }
 
   @Override
-  public void setReader(Reader input) {
+  public void reset() {
     try {
-      super.setReader(input);
-      input = super.input;
       char[] buf = new char[32];
       int len = input.read(buf);
       this.startOfs = correctOffset(0);
@@ -113,6 +109,7 @@ final class TrieTokenizer extends Tokenizer {
     } catch (IOException e) {
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to create TrieIndexTokenizer", e);
     }
+    ts.reset();
   }
@@ -120,12 +117,6 @@ final class TrieTokenizer extends Tokenizer {
     super.close();
     ts.close();
   }
 
-  @Override
-  public void reset() throws IOException {
-    super.reset();
-    ts.reset();
-  }
-
   @Override
   public boolean incrementToken() {

View File

@@ -71,9 +71,8 @@ public class BoolField extends PrimitiveFieldType {
       boolean done = false;
 
       @Override
-      public void setReader(Reader input) throws IOException {
+      public void reset() throws IOException {
         done = false;
-        super.setReader(input);
       }
 
       @Override

View File

@@ -81,13 +81,8 @@ public class PreAnalyzedField extends FieldType {
     return new SolrAnalyzer() {
       @Override
-      protected TokenStreamComponents createComponents(String fieldName,
-          Reader reader) {
-        try {
-          return new TokenStreamComponents(new PreAnalyzedTokenizer(reader, parser));
-        } catch (IOException e) {
-          return null;
-        }
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        return new TokenStreamComponents(new PreAnalyzedTokenizer(reader, parser));
       }
     };
@@ -169,6 +164,7 @@ public class PreAnalyzedField extends FieldType {
       return null;
     }
     PreAnalyzedTokenizer parse = new PreAnalyzedTokenizer(new StringReader(val), parser);
+    parse.reset(); // consume
     Field f = (Field)super.createField(field, val, boost);
     if (parse.getStringValue() != null) {
       f.setStringValue(parse.getStringValue());
@@ -195,11 +191,11 @@ public class PreAnalyzedField extends FieldType {
     private String stringValue = null;
     private byte[] binaryValue = null;
     private PreAnalyzedParser parser;
+    private Reader lastReader;
 
-    public PreAnalyzedTokenizer(Reader reader, PreAnalyzedParser parser) throws IOException {
+    public PreAnalyzedTokenizer(Reader reader, PreAnalyzedParser parser) {
       super(reader);
       this.parser = parser;
-      setReader(reader);
     }
 
     public boolean hasTokenStream() {
@@ -229,24 +225,30 @@ public class PreAnalyzedField extends FieldType {
       return true;
     }
 
-    public final void reset() {
+    @Override
+    public final void reset() throws IOException {
+      // NOTE: this acts like rewind if you call it again
+      if (input != lastReader) {
+        lastReader = input;
+        cachedStates.clear();
+        stringValue = null;
+        binaryValue = null;
+        ParseResult res = parser.parse(input, this);
+        if (res != null) {
+          stringValue = res.str;
+          binaryValue = res.bin;
+          if (res.states != null) {
+            cachedStates.addAll(res.states);
+          }
+        }
+      }
       it = cachedStates.iterator();
    }

    @Override
-    public void setReader(Reader input) throws IOException {
-      super.setReader(input);
-      cachedStates.clear();
-      stringValue = null;
-      binaryValue = null;
-      ParseResult res = parser.parse(input, this);
-      if (res != null) {
-        stringValue = res.str;
-        binaryValue = res.bin;
-        if (res.states != null) {
-          cachedStates.addAll(res.states);
-        }
-      }
+    public void close() throws IOException {
+      super.close();
+      lastReader = null; // just a ref, null for gc
    }
  }
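A consequence of the new lastReader guard (editorial usage sketch; the variables val and parser are hypothetical): reset() re-parses only when the reader has actually changed, so calling it again on the same reader just rewinds over the cached token states.

// val is a pre-analyzed field value, parser a PreAnalyzedParser
PreAnalyzedTokenizer t = new PreAnalyzedTokenizer(new StringReader(val), parser);
t.reset();                        // first call: parses and caches the token states
while (t.incrementToken()) { }    // first pass over the tokens
t.reset();                        // same reader: no re-parse, the iterator is rewound
while (t.incrementToken()) { }    // replays the cached states
t.close();                        // also drops the lastReader reference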
}