LUCENE-3968: factor out LookaheadTokenFilter from MockGraphTokenFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1311043 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2012-04-08 16:56:54 +00:00
parent 72cc92c903
commit c567961eca
4 changed files with 482 additions and 258 deletions
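The refactor pulls the position-indexed token buffering out of MockGraphTokenFilter into the new LookaheadTokenFilter base class: subclasses supply their per-position state via newPosition(), look ahead with peekToken(), and hand tokens back with nextToken(). As a minimal sketch of that contract (a hypothetical class, not part of this commit, placed in org.apache.lucene.analysis so it can use the protected Position constructor; it mirrors the NeverPeeksLookaheadTokenFilter test filter below):

package org.apache.lucene.analysis;

import java.io.IOException;

// Hypothetical example, not part of this commit: the smallest useful
// LookaheadTokenFilter subclass.  It records no extra per-position state
// and never peeks ahead, so it behaves as a pass-through filter.
final class PassThroughLookaheadFilter extends LookaheadTokenFilter<LookaheadTokenFilter.Position> {

  PassThroughLookaheadFilter(TokenStream input) {
    super(input);
  }

  @Override
  protected Position newPosition() {
    // A real subclass would return a Position subclass carrying whatever
    // extra state it needs to remember at each position.
    return new Position();
  }

  @Override
  public boolean incrementToken() throws IOException {
    // nextToken() first serves any tokens buffered at the current output
    // position, then pulls from the input; a subclass that looks ahead
    // would call peekToken() before this.
    return nextToken();
  }
}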

TestLookaheadTokenFilter.java (new file)

@@ -0,0 +1,66 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
public class TestLookaheadTokenFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, random.nextBoolean());
TokenStream output = new MockRandomLookaheadTokenFilter(random, tokenizer);
return new TokenStreamComponents(tokenizer, output);
}
};
checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
}
private static class NeverPeeksLookaheadTokenFilter extends LookaheadTokenFilter<LookaheadTokenFilter.Position> {
public NeverPeeksLookaheadTokenFilter(TokenStream input) {
super(input);
}
@Override
public Position newPosition() {
return new Position();
}
@Override
public boolean incrementToken() throws IOException {
return nextToken();
}
}
public void testNeverCallingPeek() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, random.nextBoolean());
TokenStream output = new NeverPeeksLookaheadTokenFilter(tokenizer);
return new TokenStreamComponents(tokenizer, output);
}
};
checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
}
}

LookaheadTokenFilter.java (new file)

@@ -0,0 +1,268 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.RollingBuffer;
// TODO: cut SynFilter over to this
// TODO: somehow add "nuke this input token" capability...
/** An abstract TokenFilter to make it easier to build graph
* token filters requiring some lookahead. This class handles
* the details of buffering up tokens, recording them by
* position, restoring them, providing access to them, etc. */
public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Position> extends TokenFilter {
private final static boolean DEBUG = false;
protected final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
protected final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
// Position of last read input token:
protected int inputPos;
// Position of next possible output token to return:
protected int outputPos;
// True if we hit end from our input:
protected boolean end;
private boolean tokenPending;
private boolean insertPending;
/** Holds all state for a single position; subclass this
* to record other state at each position. */
protected static class Position implements RollingBuffer.Resettable {
// Buffered input tokens at this position:
public final List<AttributeSource.State> inputTokens = new ArrayList<AttributeSource.State>();
// Next buffered token to be returned to consumer:
public int nextRead;
// Any token leaving from this position should have this startOffset:
public int startOffset = -1;
// Any token arriving to this position should have this endOffset:
public int endOffset = -1;
@Override
public void reset() {
inputTokens.clear();
nextRead = 0;
startOffset = -1;
endOffset = -1;
}
public void add(AttributeSource.State state) {
inputTokens.add(state);
}
public AttributeSource.State nextState() {
assert nextRead < inputTokens.size();
return inputTokens.get(nextRead++);
}
}
protected LookaheadTokenFilter(TokenStream input) {
super(input);
}
/** Call this only from within afterPosition, to insert a new
 * token. After calling this you should set whatever
 * attributes the new token needs. */
protected void insertToken() throws IOException {
if (tokenPending) {
positions.get(inputPos).add(captureState());
tokenPending = false;
}
assert !insertPending;
insertPending = true;
}
/** This is called when all input tokens leaving a given
* position have been returned. Override this and
* call insertToken and then set whichever token's
* attributes you want, if you want to inject
* a token starting from this position. */
protected void afterPosition() throws IOException {
}
protected abstract T newPosition();
protected final RollingBuffer<T> positions = new RollingBuffer<T>() {
@Override
protected T newInstance() {
return newPosition();
}
};
/** Returns true if there is a new token. */
protected boolean peekToken() throws IOException {
if (DEBUG) {
System.out.println("LTF.peekToken inputPos=" + inputPos + " outputPos=" + outputPos + " tokenPending=" + tokenPending);
}
assert !end;
assert inputPos == -1 || outputPos <= inputPos;
if (tokenPending) {
positions.get(inputPos).add(captureState());
tokenPending = false;
}
final boolean gotToken = input.incrementToken();
if (DEBUG) {
System.out.println(" input.incrToken() returned " + gotToken);
}
if (gotToken) {
inputPos += posIncAtt.getPositionIncrement();
assert inputPos >= 0;
if (DEBUG) {
System.out.println(" now inputPos=" + inputPos);
}
final Position startPosData = positions.get(inputPos);
final Position endPosData = positions.get(inputPos + posLenAtt.getPositionLength());
final int startOffset = offsetAtt.startOffset();
if (startPosData.startOffset == -1) {
startPosData.startOffset = startOffset;
} else {
// Make sure our input isn't messing up offsets:
assert startPosData.startOffset == startOffset;
}
final int endOffset = offsetAtt.endOffset();
if (endPosData.endOffset == -1) {
endPosData.endOffset = endOffset;
} else {
// Make sure our input isn't messing up offsets:
assert endPosData.endOffset == endOffset;
}
tokenPending = true;
} else {
end = true;
}
return gotToken;
}
/** Call this when you are done looking ahead; it will set
 * the next token to return and report whether one was
 * available. Return that boolean back to the caller. */
protected boolean nextToken() throws IOException {
//System.out.println(" nextToken: tokenPending=" + tokenPending);
if (DEBUG) {
System.out.println("LTF.nextToken inputPos=" + inputPos + " outputPos=" + outputPos + " tokenPending=" + tokenPending);
}
Position posData = positions.get(outputPos);
// While loop here in case we have to
// skip over a hole from the input:
while (true) {
//System.out.println(" check buffer @ outputPos=" +
//outputPos + " inputPos=" + inputPos + " nextRead=" +
//posData.nextRead + " vs size=" +
//posData.inputTokens.size());
// See if we have a previously buffered token to
// return at the current position:
if (posData.nextRead < posData.inputTokens.size()) {
if (DEBUG) {
System.out.println(" return previously buffered token");
}
// This position has buffered tokens to serve up:
if (tokenPending) {
positions.get(inputPos).add(captureState());
tokenPending = false;
}
restoreState(positions.get(outputPos).nextState());
//System.out.println(" return!");
return true;
}
if (inputPos == -1 || outputPos == inputPos) {
// No more buffered tokens:
// We may still get input tokens at this position
//System.out.println(" break buffer");
if (tokenPending) {
// Fast path: just return token we had just incr'd,
// without having captured/restored its state:
if (DEBUG) {
System.out.println(" pass-through: return pending token");
}
tokenPending = false;
return true;
} else if (end || !peekToken()) {
if (DEBUG) {
System.out.println(" END");
}
return false;
}
} else {
if (posData.startOffset != -1) {
// This position had at least one token leaving
if (DEBUG) {
System.out.println(" call afterPosition");
}
afterPosition();
if (insertPending) {
// Subclass inserted a token at this same
// position:
if (DEBUG) {
System.out.println(" return inserted token");
}
insertPending = false;
return true;
}
}
// Done with this position; move on:
outputPos++;
if (DEBUG) {
System.out.println(" next position: outputPos=" + outputPos);
}
positions.freeBefore(outputPos);
posData = positions.get(outputPos);
}
}
}
// TODO: end()?
// TODO: close()?
@Override
public void reset() throws IOException {
super.reset();
positions.reset();
inputPos = -1;
outputPos = 0;
tokenPending = false;
end = false;
}
}

MockGraphTokenFilter.java

@@ -18,16 +18,9 @@ package org.apache.lucene.analysis;
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.RollingBuffer;
import org.apache.lucene.util._TestUtil;
// TODO: sometimes remove tokens too...?
@@ -36,283 +29,86 @@ import org.apache.lucene.util._TestUtil;
* posLength sometimes > 1. The chain must have
* an OffsetAttribute. */
public final class MockGraphTokenFilter extends TokenFilter {
public final class MockGraphTokenFilter extends LookaheadTokenFilter<LookaheadTokenFilter.Position> {
private static boolean DEBUG = false;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final long seed;
private Random random;
// Don't init to -1 (caller must first call reset):
private int inputPos;
private int outputPos;
// Don't init to -1 (caller must first call reset):
private int lastOutputPos;
private boolean end;
private final class Position implements RollingBuffer.Resettable {
final List<AttributeSource.State> states = new ArrayList<AttributeSource.State>();
int nextRead;
// Any token leaving from this position should have this startOffset:
int startOffset = -1;
// Any token arriving to this position should have this endOffset:
int endOffset = -1;
@Override
public void reset() {
states.clear();
nextRead = 0;
startOffset = -1;
endOffset = -1;
}
public void captureState() throws IOException {
assert startOffset == offsetAtt.startOffset();
states.add(MockGraphTokenFilter.this.captureState());
}
}
private final RollingBuffer<Position> positions = new RollingBuffer<Position>() {
@Override
protected Position newInstance() {
return new Position();
}
};
public MockGraphTokenFilter(Random random, TokenStream input) {
super(input);
seed = random.nextLong();
}
@Override
protected Position newPosition() {
return new Position();
}
@Override
protected void afterPosition() throws IOException {
if (DEBUG) {
System.out.println("MockGraphTF.afterPos");
}
if (random.nextInt(7) == 5) {
final int posLength = _TestUtil.nextInt(random, 1, 5);
if (DEBUG) {
System.out.println(" do insert! posLen=" + posLength);
}
final Position posEndData = positions.get(outputPos + posLength);
// Look ahead as needed until we figure out the right
// endOffset:
while(!end && posEndData.endOffset == -1 && inputPos <= (outputPos + posLength)) {
if (!peekToken()) {
break;
}
}
if (posEndData.endOffset != -1) {
// Notify super class that we are injecting a token:
insertToken();
clearAttributes();
posLenAtt.setPositionLength(posLength);
termAtt.append(_TestUtil.randomUnicodeString(random));
posIncAtt.setPositionIncrement(0);
offsetAtt.setOffset(positions.get(outputPos).startOffset,
posEndData.endOffset);
if (DEBUG) {
System.out.println(" inject: outputPos=" + outputPos + " startOffset=" + offsetAtt.startOffset() +
" endOffset=" + offsetAtt.endOffset() +
" posLength=" + posLenAtt.getPositionLength());
}
// TODO: set TypeAtt too?
} else {
// Either 1) the tokens ended before our posLength,
// or 2) our posLength ended inside a hole from the
// input. In each case we just skip the inserted
// token.
}
}
}
@Override
public void reset() throws IOException {
super.reset();
end = false;
positions.reset();
// NOTE: must be "deterministically random" because
// BaseTokenStreamTestCase pulls tokens twice on the
// same input and asserts they are the same:
this.random = new Random(seed);
inputPos = -1;
outputPos = 0;
lastOutputPos = -1;
}
private enum TOKEN_POS {SAME, NEXT, END};
private TOKEN_POS nextInputToken() throws IOException {
assert !end;
if (DEBUG) {
System.out.println(" call input.incr");
}
final boolean result = input.incrementToken();
if (result) {
final int posInc = posIncAtt.getPositionIncrement();
final int posLength = posLengthAtt.getPositionLength();
// NOTE: when posLength > 1, we have a hole... we
// don't allow injected tokens to start or end
// "inside" a hole, so we don't need to make up
// offsets inside it
assert inputPos != -1 || posInc > 0;
inputPos += posInc;
if (DEBUG) {
System.out.println(" got token term=" + termAtt + " posLength=" + posLength + " posInc=" + posInc + " inputPos=" + inputPos);
}
final Position posData = positions.get(inputPos);
if (posInc == 0) {
assert posData.startOffset == offsetAtt.startOffset();
} else {
assert posData.startOffset == -1;
posData.startOffset = offsetAtt.startOffset();
if (DEBUG) {
System.out.println(" record startOffset[" + inputPos + "]=" + posData.startOffset);
}
}
final Position posEndData = positions.get(inputPos + posLength);
if (posEndData.endOffset == -1) {
// First time we are seeing a token that
// arrives to this position: record the
// endOffset
posEndData.endOffset = offsetAtt.endOffset();
if (DEBUG) {
System.out.println(" record endOffset[" + (inputPos+posLength) + "]=" + posEndData.endOffset);
}
} else {
// We've already seen a token arriving there;
// make sure its endOffset is the same (NOTE:
// some tokenizers, eg WDF, will fail
// this...):
assert posEndData.endOffset == offsetAtt.endOffset(): "posEndData.endOffset=" + posEndData.endOffset + " vs offsetAtt.endOffset()=" + offsetAtt.endOffset();
}
if (posInc == 0) {
return TOKEN_POS.SAME;
} else {
return TOKEN_POS.NEXT;
}
} else {
if (DEBUG) {
System.out.println(" got END");
}
return TOKEN_POS.END;
}
}
private void pushOutputPos() {
posIncAtt.setPositionIncrement(outputPos - lastOutputPos);
if (DEBUG) {
System.out.println(" pushOutputPos: set posInc=" + posIncAtt.getPositionIncrement());
}
lastOutputPos = outputPos;
positions.freeBefore(outputPos);
}
@Override
public boolean incrementToken() throws IOException {
if (DEBUG) {
System.out.println("MockGraphTF.incr inputPos=" + inputPos + " outputPos=" + outputPos);
}
while (true) {
final Position posData = positions.get(outputPos);
if (posData.nextRead < posData.states.size()) {
// Serve up all buffered tokens from this position:
if (DEBUG) {
System.out.println(" restore buffered nextRead=" + posData.nextRead + " vs " + posData.states.size());
}
restoreState(posData.states.get(posData.nextRead++));
if (DEBUG) {
System.out.println(" term=" + termAtt + " outputPos=" + outputPos);
}
pushOutputPos();
return true;
}
boolean tokenPending = false;
final int prevInputPos = inputPos;
if (inputPos == -1 || inputPos == outputPos) {
// We've used up the buffered tokens; pull the next
// input token:
if (end) {
return false;
}
final TOKEN_POS result = nextInputToken();
if (result == TOKEN_POS.SAME) {
return true;
} else if (result == TOKEN_POS.NEXT) {
tokenPending = true;
} else {
// NOTE: we don't set end=true here... because we
// are immediately passing through "the end" to
// caller (return false), and caller must not call
// us again:
return false;
}
} else {
assert inputPos > outputPos;
if (DEBUG) {
System.out.println(" done @ outputPos=" + outputPos);
}
}
// We're done (above) serving up all tokens leaving
// from the same position; now maybe insert a token.
// Note that we may insert more than one token leaving
// from this position. We only inject tokens at
// positions where we've seen at least one input token
// (ie, we cannot inject inside holes):
if (prevInputPos != -1 && positions.get(outputPos).startOffset != -1 && random.nextInt(7) == 5) {
if (DEBUG) {
System.out.println(" inject @ outputPos=" + outputPos);
}
if (tokenPending) {
positions.get(inputPos).captureState();
}
final int posLength = _TestUtil.nextInt(random, 1, 5);
final Position posEndData = positions.get(outputPos + posLength);
// Pull enough tokens until we discover what our
// endOffset should be:
while (!end && posEndData.endOffset == -1 && inputPos <= (outputPos + posLength)) {
if (DEBUG) {
System.out.println(" lookahead [endPos=" + (outputPos + posLength) + "]...");
}
final TOKEN_POS result = nextInputToken();
if (result != TOKEN_POS.END) {
positions.get(inputPos).captureState();
} else {
end = true;
if (DEBUG) {
System.out.println(" force end lookahead");
}
break;
}
}
// TODO: really, here, on hitting end-of-tokens,
// we'd like to know the ending "posInc", and allow
// our token to extend up until that. But: a
// TokenFilter is not allowed to call end() from
// within its incrementToken, so we can't do that.
// It may have been better if the ending
// posInc/offsets were set when incrementToken
// returned false (ie, without having to call the
// special end method):
if (posEndData.endOffset != -1) {
assert posEndData.endOffset != -1;
clearAttributes();
posLengthAtt.setPositionLength(posLength);
termAtt.append(_TestUtil.randomUnicodeString(random));
pushOutputPos();
offsetAtt.setOffset(positions.get(outputPos).startOffset,
positions.get(outputPos + posLength).endOffset);
if (DEBUG) {
System.out.println(" inject: outputPos=" + outputPos + " startOffset=" + offsetAtt.startOffset() +
" endOffset=" + offsetAtt.endOffset() +
" posLength=" + posLengthAtt.getPositionLength());
}
// TODO: set TypeAtt too?
return true;
} else {
// Either, we hit the end of the tokens (ie, our
// attempted posLength is too long because it
// hangs out over the end), or, our attempted
// posLength ended in the middle of a hole; just
// skip injecting in these cases. We will still
// test these cases by having a StopFilter after
// MockGraphTokenFilter...
}
} else if (tokenPending) {
outputPos = inputPos;
if (DEBUG) {
System.out.println(" pass-through");
}
pushOutputPos();
return true;
} else {
// We are skipping over a hole (posInc > 1) from our input:
outputPos++;
if (DEBUG) {
System.out.println(" incr outputPos=" + outputPos);
}
}
}
return nextToken();
}
}

MockRandomLookaheadTokenFilter.java (new file)

@@ -0,0 +1,94 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Random;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* Uses {@link LookaheadTokenFilter} to randomly peek at future tokens.
*/
public final class MockRandomLookaheadTokenFilter extends LookaheadTokenFilter<LookaheadTokenFilter.Position> {
private final static boolean DEBUG = false;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final Random random;
public MockRandomLookaheadTokenFilter(Random random, TokenStream in) {
super(in);
this.random = random;
}
@Override
public Position newPosition() {
return new Position();
}
@Override
protected void afterPosition() throws IOException {
if (!end && random.nextInt(4) == 2) {
peekToken();
}
}
@Override
public boolean incrementToken() throws IOException {
if (DEBUG) {
System.out.println("\n" + Thread.currentThread().getName() + ": MRLTF.incrToken");
}
if (!end) {
while (true) {
// We can use un-re-seeded random, because how far
// ahead we peek should never alter the resulting
// tokens as seen by the consumer:
if (random.nextInt(3) == 1) {
if (!peekToken()) {
if (DEBUG) {
System.out.println(" peek; inputPos=" + inputPos + " END");
}
break;
}
if (DEBUG) {
System.out.println(" peek; inputPos=" + inputPos + " token=" + termAtt);
}
} else {
if (DEBUG) {
System.out.println(" done peek");
}
break;
}
}
}
final boolean result = nextToken();
if (result) {
if (DEBUG) {
System.out.println(" return nextToken token=" + termAtt);
}
} else {
if (DEBUG) {
System.out.println(" return nextToken END");
}
}
return result;
}
}
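
The injection side of the new API is what the converted MockGraphTokenFilter above exercises: from afterPosition(), peek ahead until the end offset of the spanned positions is known, then call insertToken() and fill in the attributes. A stripped-down, hypothetical sketch of that pattern (again not part of this commit, and again assumed to live in org.apache.lucene.analysis), injecting a fixed token over two positions instead of MockGraphTokenFilter's random terms and lengths:

package org.apache.lucene.analysis;

import java.io.IOException;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hypothetical example, not part of this commit: injects a synthetic token
// spanning two positions using the afterPosition()/insertToken() hooks.
final class InjectingLookaheadFilter extends LookaheadTokenFilter<LookaheadTokenFilter.Position> {

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  InjectingLookaheadFilter(TokenStream input) {
    super(input);
  }

  @Override
  protected Position newPosition() {
    return new Position();
  }

  @Override
  protected void afterPosition() throws IOException {
    final int posLength = 2;
    final Position endPosData = positions.get(outputPos + posLength);
    // Look ahead until we know the end offset of the position the injected
    // token will arrive at, or until the input runs out:
    while (!end && endPosData.endOffset == -1 && inputPos <= outputPos + posLength) {
      if (!peekToken()) {
        break;
      }
    }
    if (endPosData.endOffset != -1) {
      // Tell the base class we are injecting a token, then set its attributes:
      insertToken();
      clearAttributes();
      termAtt.append("INJECTED");
      posIncAtt.setPositionIncrement(0);
      posLenAtt.setPositionLength(posLength);
      offsetAtt.setOffset(positions.get(outputPos).startOffset, endPosData.endOffset);
    }
    // If endOffset is still -1, the tokens ended (or a hole intervened)
    // before posLength positions; skip the injection, as MockGraphTokenFilter does.
  }

  @Override
  public boolean incrementToken() throws IOException {
    return nextToken();
  }
}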