mirror of https://github.com/apache/lucene.git
LUCENE-4343: clear up more Tokenizer.setReader/TokenStream.reset issues
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1379036 13f79535-47bb-0310-9956-ffa450edef68
parent 06fdfe6f32
commit d7120c7e43
@@ -77,6 +77,10 @@ API Changes
   fields in a stored document, has been replaced with the simpler
   StoredFieldVisitor API.  (Mike McCandless)

+* LUCENE-4343: Made Tokenizer.setReader final. This is a setter that should
+  not be overridden by subclasses: per-stream initialization should happen
+  in reset(). (Robert Muir)
+
 Bug Fixes

 * LUCENE-4297: BooleanScorer2 would multiply the coord() factor
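Editor's note: a minimal sketch of the contract this entry describes, assuming the Lucene trunk API at this revision. SingleCharTokenizer is a hypothetical example class, not part of this commit: it leaves the now-final setReader() alone and re-initializes its per-stream state in reset().

    import java.io.IOException;
    import java.io.Reader;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    final class SingleCharTokenizer extends Tokenizer {
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      private boolean done; // per-stream state

      SingleCharTokenizer(Reader input) {
        super(input);
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (done) {
          return false;
        }
        clearAttributes();
        int c = input.read(); // 'input' is swapped by the (final) setReader()
        if (c == -1) {
          return false;
        }
        termAtt.setEmpty().append((char) c);
        done = true;
        return true;
      }

      @Override
      public void reset() throws IOException {
        super.reset();
        done = false; // per-stream initialization lives here, not in setReader()
      }
    }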
@@ -94,8 +94,7 @@ public final class KeywordTokenizer extends Tokenizer {
   }

   @Override
-  public void setReader(Reader input) throws IOException {
-    super.setReader(input);
+  public void reset() throws IOException {
     this.done = false;
   }
 }

@@ -78,9 +78,6 @@ public final class PatternTokenizer extends Tokenizer {
     if (group >= 0 && group > matcher.groupCount()) {
       throw new IllegalArgumentException("invalid group specified: pattern only has: " + matcher.groupCount() + " capturing groups");
     }
-    fillBuffer(str, input);
-    matcher.reset(str);
-    index = 0;
   }

   @Override
@@ -136,8 +133,7 @@ public final class PatternTokenizer extends Tokenizer {
   }

   @Override
-  public void setReader(Reader input) throws IOException {
-    super.setReader(input);
+  public void reset() throws IOException {
     fillBuffer(str, input);
     matcher.reset(str);
     index = 0;

@@ -175,8 +175,7 @@ public final class ClassicTokenizer extends Tokenizer {
   }

   @Override
-  public void setReader(Reader reader) throws IOException {
-    super.setReader(reader);
-    scanner.yyreset(reader);
+  public void reset() throws IOException {
+    scanner.yyreset(input);
   }
 }

@@ -183,8 +183,7 @@ public final class StandardTokenizer extends Tokenizer {
   }

   @Override
-  public void setReader(Reader reader) throws IOException {
-    super.setReader(reader);
-    scanner.yyreset(reader);
+  public void reset() throws IOException {
+    scanner.yyreset(input);
   }
 }

@@ -162,8 +162,7 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
   }

   @Override
-  public void setReader(Reader reader) throws IOException {
-    super.setReader(reader);
-    scanner.yyreset(reader);
+  public void reset() throws IOException {
+    scanner.yyreset(input);
   }
 }

@@ -78,7 +78,8 @@ public abstract class CharTokenizer extends Tokenizer {
     charUtils = CharacterUtils.getInstance(matchVersion);
   }

-  private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
+  // note: bufferIndex is -1 here to best-effort AIOOBE consumers that don't call reset()
+  private int offset = 0, bufferIndex = -1, dataLen = 0, finalOffset = 0;
   private static final int MAX_WORD_LEN = 255;
   private static final int IO_BUFFER_SIZE = 4096;

@@ -162,8 +163,7 @@ public abstract class CharTokenizer extends Tokenizer {
   }

   @Override
-  public void setReader(Reader input) throws IOException {
-    super.setReader(input);
+  public void reset() throws IOException {
     bufferIndex = 0;
     offset = 0;
     dataLen = 0;

@@ -318,18 +318,12 @@ public final class WikipediaTokenizer extends Tokenizer {
   */
  @Override
  public void reset() throws IOException {
+    super.reset();
+    scanner.yyreset(input);
    tokens = null;
    scanner.reset();
    first = true;
  }

-  @Override
-  public void setReader(Reader reader) throws IOException {
-    super.setReader(reader);
-    scanner.yyreset(input);
-  }
-
  @Override
  public void end() {
    // set final offset

@@ -39,6 +39,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
     CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);

     CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
+    cgf.reset();
     assertTrue(cgf.incrementToken());
     assertEquals("How", term.toString());
     assertTrue(cgf.incrementToken());
@@ -61,6 +62,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
     CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);

     CharTermAttribute term = wt.addAttribute(CharTermAttribute.class);
+    nsf.reset();
     assertTrue(nsf.incrementToken());
     assertEquals("How_the", term.toString());
     assertTrue(nsf.incrementToken());

@@ -235,6 +235,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
       CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

     CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
+    tf.reset();
     assertTrue(tf.incrementToken());
     assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
     assertTrue(tf.incrementToken());
@@ -256,6 +257,7 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
       CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
       CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
     MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
+    stream.reset();
     while (stream.incrementToken()) {
       assertTrue("Custom attribute value was lost", retAtt.getRetain());
     }

@@ -80,6 +80,7 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {

   void verifyPayload(TokenStream ts) throws IOException {
     PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
+    ts.reset();
     for(byte b=1;;b++) {
       boolean hasNext = ts.incrementToken();
       if (!hasNext) break;

@@ -66,6 +66,7 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase {
     assertNotNull(stream);
     CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);

+    stream.reset();
     while (stream.incrementToken()) {
       String text = termAtt.toString();
       assertFalse(stopWordsSet.contains(text));
@@ -83,6 +84,7 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase {
     CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
     PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);

+    stream.reset();
     while (stream.incrementToken()) {
       String text = termAtt.toString();
       assertFalse(stopWordsSet.contains(text));

@@ -111,6 +111,7 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
     // assign bogus values
     in.clearAttributes();
     termAtt.setEmpty().append("bogusTerm");
+    in.reset();
     while (in.incrementToken()) {
       if (out.length() > 0)
         out.append(' ');

@@ -45,7 +45,8 @@ public final class ICUTokenizer extends Tokenizer {
   /** true length of text in the buffer */
   private int length = 0;
   /** length in buffer that can be evaluated safely, up to a safe end point */
-  private int usableLength = 0;
+  // note: usableLength is -1 here to best-effort AIOOBE consumers that don't call reset()
+  private int usableLength = -1;
   /** accumulated offset of previous buffers for this reader, for offsetAtt */
   private int offset = 0;

@@ -101,12 +102,6 @@ public final class ICUTokenizer extends Tokenizer {
     breaker.setText(buffer, 0, 0);
     length = usableLength = offset = 0;
   }

-  @Override
-  public void setReader(Reader input) throws IOException {
-    super.setReader(input);
-    reset();
-  }
-
   @Override
   public void end() {

@@ -244,15 +244,9 @@ public final class JapaneseTokenizer extends Tokenizer {
     this.dotOut = dotOut;
   }

-  @Override
-  public void setReader(Reader input) throws IOException {
-    super.setReader(input);
-    buffer.reset(input);
-  }
-
   @Override
   public void reset() throws IOException {
     super.reset();
+    buffer.reset(input);
     resetState();
   }

@@ -112,16 +112,9 @@ public final class SentenceTokenizer extends Tokenizer {

   @Override
   public void reset() throws IOException {
+    super.reset();
     tokenStart = tokenEnd = 0;
   }

-  @Override
-  public void setReader(Reader input) throws IOException {
-    super.setReader(input);
-    reset();
-  }
-
   @Override
   public void end() {
     // set final offset

@@ -80,8 +80,7 @@ public abstract class BaseUIMATokenizer extends Tokenizer {
   }

   @Override
-  public void setReader(Reader input) throws IOException {
-    super.setReader(input);
+  public void reset() throws IOException {
     iterator = null;
   }

@@ -170,12 +170,8 @@ public abstract class TokenStream extends AttributeSource implements Closeable {
    * This method is called by a consumer before it begins consumption using
    * {@link #incrementToken()}.
    * <p/>
-   * Resets this stream to the beginning. As all TokenStreams must be reusable,
-   * any implementations which have state that needs to be reset between usages
-   * of the TokenStream, must implement this method. Note that if your TokenStream
-   * caches tokens and feeds them back again after a reset, it is imperative
-   * that you clone the tokens when you store them away (on the first pass) as
-   * well as when you return them (on future passes after {@link #reset()}).
+   * Resets this stream to a clean state. Stateful implementations must implement
+   * this method so that they can be reused, just as if they had been created fresh.
    */
   public void reset() throws IOException {}

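Editor's note: the rewritten javadoc implies the standard consumer workflow sketched below, assuming the trunk-era Analyzer.tokenStream(String, Reader) API; the analyzer, field name, and text are placeholders.

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    static void consume(Analyzer analyzer, String text) throws IOException {
      TokenStream ts = analyzer.tokenStream("field", new StringReader(text));
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();                  // must be called before the first incrementToken()
      while (ts.incrementToken()) {
        System.out.println(termAtt.toString());
      }
      ts.end();                    // records final offset state
      ts.close();
    }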
@@ -82,12 +82,18 @@ public abstract class Tokenizer extends TokenStream {
     return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(currentOff) : currentOff;
   }

-  /** Expert: Reset the tokenizer to a new reader. Typically, an
+  /** Expert: Set a new reader on the Tokenizer. Typically, an
    *  analyzer (in its tokenStream method) will use
    *  this to re-use a previously created tokenizer. */
-  public void setReader(Reader input) throws IOException {
+  public final void setReader(Reader input) throws IOException {
+    assert input != null: "input must not be null";
     this.input = input;
+    assert setReaderTestPoint();
   }

+  // only used by assert, for testing
+  boolean setReaderTestPoint() {
+    return true;
+  }
 }

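Editor's note: with setReader() now final, tokenizer reuse follows the lifecycle that MockTokenizer's setReaderTestPoint() asserts later in this commit: close the stream, set a new reader, then reset. A hedged sketch of that cycle, using WhitespaceTokenizer purely as illustration:

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    static void tokenizeAll(String... docs) throws IOException {
      Tokenizer tok = new WhitespaceTokenizer(Version.LUCENE_40, new StringReader(""));
      CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
      for (String doc : docs) {
        tok.setReader(new StringReader(doc)); // final: only swaps the underlying reader
        tok.reset();                          // per-stream initialization happens here
        while (tok.incrementToken()) {
          System.out.println(term.toString());
        }
        tok.end();
        tok.close();                          // stream is closed before the next setReader()
      }
    }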
@@ -1545,7 +1545,7 @@ public class TestIndexWriter extends LuceneTestCase {
     }

     @Override
-    public void setReader(Reader input) throws IOException {
+    public void reset() throws IOException {
       this.upto = 0;
       final StringBuilder b = new StringBuilder();
       final char[] buffer = new char[1024];

@@ -227,8 +227,7 @@ public class TestTermRangeQuery extends LuceneTestCase {
     }

     @Override
-    public final void setReader(Reader reader) throws IOException {
-      super.setReader(reader);
+    public void reset() throws IOException {
       done = false;
     }
   }

@@ -176,6 +176,8 @@ public abstract class AbstractTestCase extends LuceneTestCase {

     BytesRef bytesRef = termAttribute.getBytesRef();

+    tokenStream.reset();
+
     while (tokenStream.incrementToken()) {
       termAttribute.fillBytesRef();
       bytesRefs.add(BytesRef.deepCopyOf(bytesRef));
@@ -316,12 +318,6 @@ public abstract class AbstractTestCase extends LuceneTestCase {
       return delimiters.indexOf( c ) >= 0;
     }

-    @Override
-    public void setReader( Reader input ) throws IOException {
-      super.setReader( input );
-      reset();
-    }
-
     @Override
     public void reset() {
       startTerm = 0;

@@ -81,8 +81,7 @@ public class TestMultiPhraseQueryParsing extends LuceneTestCase {
     }

     @Override
-    public void setReader(Reader reader) throws IOException {
-      super.setReader(reader);
+    public void reset() throws IOException {
       this.upto = 0;
       this.lastPos = 0;
     }

@@ -76,14 +76,4 @@ class PrefixCellsTokenizer extends Tokenizer {
     termAtt.setLength(length);
     return length > 0; // should only happen at the end
   }
-
-  @Override
-  public final void end() {
-
-  }
-
-  @Override
-  public void setReader(Reader input) throws IOException {
-    super.setReader(input);
-  }
 }

@@ -227,10 +227,10 @@ public class MockTokenizer extends Tokenizer {
   }

   @Override
-  public void setReader(Reader input) throws IOException {
-    super.setReader(input);
+  boolean setReaderTestPoint() {
     assert !enableChecks || streamState == State.CLOSE : "setReader() called in wrong state: " + streamState;
     streamState = State.SETREADER;
+    return true;
   }

   @Override

@@ -72,15 +72,11 @@ final class TrieTokenizer extends Tokenizer {
     this.type = type;
     this.precisionStep = precisionStep;
     this.ts = ts;
-
-    setReader(input);
   }

   @Override
-  public void setReader(Reader input) {
+  public void reset() {
     try {
-      super.setReader(input);
+      input = super.input;
       char[] buf = new char[32];
       int len = input.read(buf);
       this.startOfs = correctOffset(0);
@@ -113,6 +109,7 @@ final class TrieTokenizer extends Tokenizer {
     } catch (IOException e) {
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to create TrieIndexTokenizer", e);
     }
+    ts.reset();
   }

   @Override
@@ -120,12 +117,6 @@ final class TrieTokenizer extends Tokenizer {
     super.close();
     ts.close();
   }

-  @Override
-  public void reset() throws IOException {
-    super.reset();
-    ts.reset();
-  }
-
   @Override
   public boolean incrementToken() {

@@ -71,9 +71,8 @@ public class BoolField extends PrimitiveFieldType {
       boolean done = false;

       @Override
-      public void setReader(Reader input) throws IOException {
+      public void reset() throws IOException {
         done = false;
-        super.setReader(input);
       }

       @Override

@@ -81,13 +81,8 @@ public class PreAnalyzedField extends FieldType {
     return new SolrAnalyzer() {

       @Override
-      protected TokenStreamComponents createComponents(String fieldName,
-          Reader reader) {
-        try {
-          return new TokenStreamComponents(new PreAnalyzedTokenizer(reader, parser));
-        } catch (IOException e) {
-          return null;
-        }
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        return new TokenStreamComponents(new PreAnalyzedTokenizer(reader, parser));
       }

     };
@@ -169,6 +164,7 @@ public class PreAnalyzedField extends FieldType {
       return null;
     }
     PreAnalyzedTokenizer parse = new PreAnalyzedTokenizer(new StringReader(val), parser);
+    parse.reset(); // consume
     Field f = (Field)super.createField(field, val, boost);
     if (parse.getStringValue() != null) {
       f.setStringValue(parse.getStringValue());
@@ -195,11 +191,11 @@ public class PreAnalyzedField extends FieldType {
     private String stringValue = null;
     private byte[] binaryValue = null;
     private PreAnalyzedParser parser;
+    private Reader lastReader;

-    public PreAnalyzedTokenizer(Reader reader, PreAnalyzedParser parser) throws IOException {
+    public PreAnalyzedTokenizer(Reader reader, PreAnalyzedParser parser) {
       super(reader);
       this.parser = parser;
-      setReader(reader);
     }

     public boolean hasTokenStream() {
@@ -229,24 +225,30 @@ public class PreAnalyzedField extends FieldType {
       return true;
     }

-    public final void reset() {
+    @Override
+    public final void reset() throws IOException {
+      // NOTE: this acts like rewind if you call it again
+      if (input != lastReader) {
+        lastReader = input;
+        cachedStates.clear();
+        stringValue = null;
+        binaryValue = null;
+        ParseResult res = parser.parse(input, this);
+        if (res != null) {
+          stringValue = res.str;
+          binaryValue = res.bin;
+          if (res.states != null) {
+            cachedStates.addAll(res.states);
+          }
+        }
+      }
       it = cachedStates.iterator();
     }

     @Override
-    public void setReader(Reader input) throws IOException {
-      super.setReader(input);
-      cachedStates.clear();
-      stringValue = null;
-      binaryValue = null;
-      ParseResult res = parser.parse(input, this);
-      if (res != null) {
-        stringValue = res.str;
-        binaryValue = res.bin;
-        if (res.states != null) {
-          cachedStates.addAll(res.states);
-        }
-      }
+    public void close() throws IOException {
+      super.close();
+      lastReader = null; // just a ref, null for gc
     }
   }
