mirror of https://github.com/apache/lucene.git
LUCENE-3968: factor out LookaheadTokenFilter from MockGraphTokenFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1311043 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
72cc92c903
commit
c567961eca
|
@ -0,0 +1,66 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
public class TestLookaheadTokenFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, random.nextBoolean());
|
||||
TokenStream output = new MockRandomLookaheadTokenFilter(random, tokenizer);
|
||||
return new TokenStreamComponents(tokenizer, output);
|
||||
}
|
||||
};
|
||||
checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
|
||||
}
|
||||
|
||||
private static class NeverPeeksLookaheadTokenFilter extends LookaheadTokenFilter<LookaheadTokenFilter.Position> {
|
||||
public NeverPeeksLookaheadTokenFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Position newPosition() {
|
||||
return new Position();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
return nextToken();
|
||||
}
|
||||
}
|
||||
|
||||
public void testNeverCallingPeek() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, random.nextBoolean());
|
||||
TokenStream output = new NeverPeeksLookaheadTokenFilter(tokenizer);
|
||||
return new TokenStreamComponents(tokenizer, output);
|
||||
}
|
||||
};
|
||||
checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,268 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.RollingBuffer;
|
||||
|
||||
// TODO: cut SynFilter over to this
|
||||
// TODO: somehow add "nuke this input token" capability...
|
||||
|
||||
/** An abstract TokenFilter to make it easier to build graph
|
||||
* token filters requiring some lookahead. This class handles
|
||||
* the details of buffering up tokens, recording them by
|
||||
* position, restoring them, providing access to them, etc. */
|
||||
|
||||
public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Position> extends TokenFilter {
|
||||
|
||||
private final static boolean DEBUG = false;
|
||||
|
||||
protected final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
protected final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
|
||||
protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
// Position of last read input token:
|
||||
protected int inputPos;
|
||||
|
||||
// Position of next possible output token to return:
|
||||
protected int outputPos;
|
||||
|
||||
// True if we hit end from our input:
|
||||
protected boolean end;
|
||||
|
||||
private boolean tokenPending;
|
||||
private boolean insertPending;
|
||||
|
||||
/** Holds all state for a single position; subclass this
|
||||
* to record other state at each position. */
|
||||
protected static class Position implements RollingBuffer.Resettable {
|
||||
// Buffered input tokens at this position:
|
||||
public final List<AttributeSource.State> inputTokens = new ArrayList<AttributeSource.State>();
|
||||
|
||||
// Next buffered token to be returned to consumer:
|
||||
public int nextRead;
|
||||
|
||||
// Any token leaving from this position should have this startOffset:
|
||||
public int startOffset = -1;
|
||||
|
||||
// Any token arriving to this position should have this endOffset:
|
||||
public int endOffset = -1;
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
inputTokens.clear();
|
||||
nextRead = 0;
|
||||
startOffset = -1;
|
||||
endOffset = -1;
|
||||
}
|
||||
|
||||
public void add(AttributeSource.State state) {
|
||||
inputTokens.add(state);
|
||||
}
|
||||
|
||||
public AttributeSource.State nextState() {
|
||||
assert nextRead < inputTokens.size();
|
||||
return inputTokens.get(nextRead++);
|
||||
}
|
||||
}
|
||||
|
||||
protected LookaheadTokenFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
/** Call this only from within afterPosition, to insert a new
|
||||
* token. After calling this you should set any
|
||||
* necessary token you need. */
|
||||
protected void insertToken() throws IOException {
|
||||
if (tokenPending) {
|
||||
positions.get(inputPos).add(captureState());
|
||||
tokenPending = false;
|
||||
}
|
||||
assert !insertPending;
|
||||
insertPending = true;
|
||||
}
|
||||
|
||||
/** This is called when all input tokens leaving a given
|
||||
* position have been returned. Override this and
|
||||
* call createToken and then set whichever token's
|
||||
* attributes you want, if you want to inject
|
||||
* a token starting from this position. */
|
||||
protected void afterPosition() throws IOException {
|
||||
}
|
||||
|
||||
protected abstract T newPosition();
|
||||
|
||||
protected final RollingBuffer<T> positions = new RollingBuffer<T>() {
|
||||
@Override
|
||||
protected T newInstance() {
|
||||
return newPosition();
|
||||
}
|
||||
};
|
||||
|
||||
/** Returns true if there is a new token. */
|
||||
protected boolean peekToken() throws IOException {
|
||||
if (DEBUG) {
|
||||
System.out.println("LTF.peekToken inputPos=" + inputPos + " outputPos=" + outputPos + " tokenPending=" + tokenPending);
|
||||
}
|
||||
assert !end;
|
||||
assert inputPos == -1 || outputPos <= inputPos;
|
||||
if (tokenPending) {
|
||||
positions.get(inputPos).add(captureState());
|
||||
tokenPending = false;
|
||||
}
|
||||
final boolean gotToken = input.incrementToken();
|
||||
if (DEBUG) {
|
||||
System.out.println(" input.incrToken() returned " + gotToken);
|
||||
}
|
||||
if (gotToken) {
|
||||
inputPos += posIncAtt.getPositionIncrement();
|
||||
assert inputPos >= 0;
|
||||
if (DEBUG) {
|
||||
System.out.println(" now inputPos=" + inputPos);
|
||||
}
|
||||
|
||||
final Position startPosData = positions.get(inputPos);
|
||||
final Position endPosData = positions.get(inputPos + posLenAtt.getPositionLength());
|
||||
|
||||
final int startOffset = offsetAtt.startOffset();
|
||||
if (startPosData.startOffset == -1) {
|
||||
startPosData.startOffset = startOffset;
|
||||
} else {
|
||||
// Make sure our input isn't messing up offsets:
|
||||
assert startPosData.startOffset == startOffset;
|
||||
}
|
||||
|
||||
final int endOffset = offsetAtt.endOffset();
|
||||
if (endPosData.endOffset == -1) {
|
||||
endPosData.endOffset = endOffset;
|
||||
} else {
|
||||
// Make sure our input isn't messing up offsets:
|
||||
assert endPosData.endOffset == endOffset;
|
||||
}
|
||||
|
||||
tokenPending = true;
|
||||
} else {
|
||||
end = true;
|
||||
}
|
||||
|
||||
return gotToken;
|
||||
}
|
||||
|
||||
/** Call this when you are done looking ahead; it will set
|
||||
* the next token to return. Return the boolean back to
|
||||
* the caller. */
|
||||
protected boolean nextToken() throws IOException {
|
||||
//System.out.println(" nextToken: tokenPending=" + tokenPending);
|
||||
if (DEBUG) {
|
||||
System.out.println("LTF.nextToken inputPos=" + inputPos + " outputPos=" + outputPos + " tokenPending=" + tokenPending);
|
||||
}
|
||||
|
||||
Position posData = positions.get(outputPos);
|
||||
|
||||
// While loop here in case we have to
|
||||
// skip over a hole from the input:
|
||||
while (true) {
|
||||
|
||||
//System.out.println(" check buffer @ outputPos=" +
|
||||
//outputPos + " inputPos=" + inputPos + " nextRead=" +
|
||||
//posData.nextRead + " vs size=" +
|
||||
//posData.inputTokens.size());
|
||||
|
||||
// See if we have a previously buffered token to
|
||||
// return at the current position:
|
||||
if (posData.nextRead < posData.inputTokens.size()) {
|
||||
if (DEBUG) {
|
||||
System.out.println(" return previously buffered token");
|
||||
}
|
||||
// This position has buffered tokens to serve up:
|
||||
if (tokenPending) {
|
||||
positions.get(inputPos).add(captureState());
|
||||
tokenPending = false;
|
||||
}
|
||||
restoreState(positions.get(outputPos).nextState());
|
||||
//System.out.println(" return!");
|
||||
return true;
|
||||
}
|
||||
|
||||
if (inputPos == -1 || outputPos == inputPos) {
|
||||
// No more buffered tokens:
|
||||
// We may still get input tokens at this position
|
||||
//System.out.println(" break buffer");
|
||||
if (tokenPending) {
|
||||
// Fast path: just return token we had just incr'd,
|
||||
// without having captured/restored its state:
|
||||
if (DEBUG) {
|
||||
System.out.println(" pass-through: return pending token");
|
||||
}
|
||||
tokenPending = false;
|
||||
return true;
|
||||
} else if (end || !peekToken()) {
|
||||
if (DEBUG) {
|
||||
System.out.println(" END");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (posData.startOffset != -1) {
|
||||
// This position had at least one token leaving
|
||||
if (DEBUG) {
|
||||
System.out.println(" call afterPosition");
|
||||
}
|
||||
afterPosition();
|
||||
if (insertPending) {
|
||||
// Subclass inserted a token at this same
|
||||
// position:
|
||||
if (DEBUG) {
|
||||
System.out.println(" return inserted token");
|
||||
}
|
||||
insertPending = false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Done with this position; move on:
|
||||
outputPos++;
|
||||
if (DEBUG) {
|
||||
System.out.println(" next position: outputPos=" + outputPos);
|
||||
}
|
||||
positions.freeBefore(outputPos);
|
||||
posData = positions.get(outputPos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: end()?
|
||||
// TODO: close()?
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
positions.reset();
|
||||
inputPos = -1;
|
||||
outputPos = 0;
|
||||
tokenPending = false;
|
||||
end = false;
|
||||
}
|
||||
}
|
|
@ -18,16 +18,9 @@ package org.apache.lucene.analysis;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.RollingBuffer;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
// TODO: sometimes remove tokens too...?
|
||||
|
@ -36,283 +29,86 @@ import org.apache.lucene.util._TestUtil;
|
|||
* posLength sometimes > 1. The chain must have
|
||||
* an OffsetAttribute. */
|
||||
|
||||
public final class MockGraphTokenFilter extends TokenFilter {
|
||||
public final class MockGraphTokenFilter extends LookaheadTokenFilter<LookaheadTokenFilter.Position> {
|
||||
|
||||
private static boolean DEBUG = false;
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
private final long seed;
|
||||
private Random random;
|
||||
|
||||
// Don't init to -1 (caller must first call reset):
|
||||
private int inputPos;
|
||||
private int outputPos;
|
||||
// Don't init to -1 (caller must first call reset):
|
||||
private int lastOutputPos;
|
||||
private boolean end;
|
||||
|
||||
private final class Position implements RollingBuffer.Resettable {
|
||||
final List<AttributeSource.State> states = new ArrayList<AttributeSource.State>();
|
||||
int nextRead;
|
||||
|
||||
// Any token leaving from this position should have this startOffset:
|
||||
int startOffset = -1;
|
||||
|
||||
// Any token arriving to this positoin should have this endOffset:
|
||||
int endOffset = -1;
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
states.clear();
|
||||
nextRead = 0;
|
||||
startOffset = -1;
|
||||
endOffset = -1;
|
||||
}
|
||||
|
||||
public void captureState() throws IOException {
|
||||
assert startOffset == offsetAtt.startOffset();
|
||||
states.add(MockGraphTokenFilter.this.captureState());
|
||||
}
|
||||
}
|
||||
|
||||
private final RollingBuffer<Position> positions = new RollingBuffer<Position>() {
|
||||
@Override
|
||||
protected Position newInstance() {
|
||||
return new Position();
|
||||
}
|
||||
};
|
||||
|
||||
public MockGraphTokenFilter(Random random, TokenStream input) {
|
||||
super(input);
|
||||
seed = random.nextLong();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Position newPosition() {
|
||||
return new Position();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void afterPosition() throws IOException {
|
||||
if (DEBUG) {
|
||||
System.out.println("MockGraphTF.afterPos");
|
||||
}
|
||||
if (random.nextInt(7) == 5) {
|
||||
|
||||
final int posLength = _TestUtil.nextInt(random, 1, 5);
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println(" do insert! posLen=" + posLength);
|
||||
}
|
||||
|
||||
final Position posEndData = positions.get(outputPos + posLength);
|
||||
|
||||
// Look ahead as needed until we figure out the right
|
||||
// endOffset:
|
||||
while(!end && posEndData.endOffset == -1 && inputPos <= (outputPos + posLength)) {
|
||||
if (!peekToken()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (posEndData.endOffset != -1) {
|
||||
// Notify super class that we are injecting a token:
|
||||
insertToken();
|
||||
clearAttributes();
|
||||
posLenAtt.setPositionLength(posLength);
|
||||
termAtt.append(_TestUtil.randomUnicodeString(random));
|
||||
posIncAtt.setPositionIncrement(0);
|
||||
offsetAtt.setOffset(positions.get(outputPos).startOffset,
|
||||
posEndData.endOffset);
|
||||
if (DEBUG) {
|
||||
System.out.println(" inject: outputPos=" + outputPos + " startOffset=" + offsetAtt.startOffset() +
|
||||
" endOffset=" + offsetAtt.endOffset() +
|
||||
" posLength=" + posLenAtt.getPositionLength());
|
||||
}
|
||||
// TODO: set TypeAtt too?
|
||||
} else {
|
||||
// Either 1) the tokens ended before our posLength,
|
||||
// or 2) our posLength ended inside a hole from the
|
||||
// input. In each case we just skip the inserted
|
||||
// token.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
end = false;
|
||||
positions.reset();
|
||||
// NOTE: must be "deterministically random" because
|
||||
// BaseTokenStreamTestCase pulls tokens twice on the
|
||||
// same input and asserts they are the same:
|
||||
this.random = new Random(seed);
|
||||
inputPos = -1;
|
||||
outputPos = 0;
|
||||
lastOutputPos = -1;
|
||||
}
|
||||
|
||||
private enum TOKEN_POS {SAME, NEXT, END};
|
||||
|
||||
private TOKEN_POS nextInputToken() throws IOException {
|
||||
assert !end;
|
||||
if (DEBUG) {
|
||||
System.out.println(" call input.incr");
|
||||
}
|
||||
final boolean result = input.incrementToken();
|
||||
if (result) {
|
||||
final int posInc = posIncAtt.getPositionIncrement();
|
||||
final int posLength = posLengthAtt.getPositionLength();
|
||||
|
||||
// NOTE: when posLength > 1, we have a hole... we
|
||||
// don't allow injected tokens to start or end
|
||||
// "inside" a hole, so we don't need to make up
|
||||
// offsets inside it
|
||||
|
||||
assert inputPos != -1 || posInc > 0;
|
||||
inputPos += posInc;
|
||||
if (DEBUG) {
|
||||
System.out.println(" got token term=" + termAtt + " posLength=" + posLength + " posInc=" + posInc + " inputPos=" + inputPos);
|
||||
}
|
||||
final Position posData = positions.get(inputPos);
|
||||
if (posInc == 0) {
|
||||
assert posData.startOffset == offsetAtt.startOffset();
|
||||
} else {
|
||||
assert posData.startOffset == -1;
|
||||
posData.startOffset = offsetAtt.startOffset();
|
||||
if (DEBUG) {
|
||||
System.out.println(" record startOffset[" + inputPos + "]=" + posData.startOffset);
|
||||
}
|
||||
}
|
||||
|
||||
final Position posEndData = positions.get(inputPos + posLength);
|
||||
if (posEndData.endOffset == -1) {
|
||||
// First time we are seeing a token that
|
||||
// arrives to this position: record the
|
||||
// endOffset
|
||||
posEndData.endOffset = offsetAtt.endOffset();
|
||||
if (DEBUG) {
|
||||
System.out.println(" record endOffset[" + (inputPos+posLength) + "]=" + posEndData.endOffset);
|
||||
}
|
||||
} else {
|
||||
// We've already seen a token arriving there;
|
||||
// make sure its endOffset is the same (NOTE:
|
||||
// some tokenizers, eg WDF, will fail
|
||||
// this...):
|
||||
assert posEndData.endOffset == offsetAtt.endOffset(): "posEndData.endOffset=" + posEndData.endOffset + " vs offsetAtt.endOffset()=" + offsetAtt.endOffset();
|
||||
}
|
||||
if (posInc == 0) {
|
||||
return TOKEN_POS.SAME;
|
||||
} else {
|
||||
return TOKEN_POS.NEXT;
|
||||
}
|
||||
} else {
|
||||
if (DEBUG) {
|
||||
System.out.println(" got END");
|
||||
}
|
||||
return TOKEN_POS.END;
|
||||
}
|
||||
}
|
||||
|
||||
private void pushOutputPos() {
|
||||
posIncAtt.setPositionIncrement(outputPos - lastOutputPos);
|
||||
if (DEBUG) {
|
||||
System.out.println(" pushOutputPos: set posInc=" + posIncAtt.getPositionIncrement());
|
||||
}
|
||||
lastOutputPos = outputPos;
|
||||
positions.freeBefore(outputPos);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("MockGraphTF.incr inputPos=" + inputPos + " outputPos=" + outputPos);
|
||||
}
|
||||
|
||||
while (true) {
|
||||
final Position posData = positions.get(outputPos);
|
||||
if (posData.nextRead < posData.states.size()) {
|
||||
// Serve up all buffered tokens from this position:
|
||||
if (DEBUG) {
|
||||
System.out.println(" restore buffered nextRead=" + posData.nextRead + " vs " + posData.states.size());
|
||||
}
|
||||
restoreState(posData.states.get(posData.nextRead++));
|
||||
if (DEBUG) {
|
||||
System.out.println(" term=" + termAtt + " outputPos=" + outputPos);
|
||||
}
|
||||
pushOutputPos();
|
||||
return true;
|
||||
}
|
||||
|
||||
boolean tokenPending = false;
|
||||
|
||||
final int prevInputPos = inputPos;
|
||||
|
||||
if (inputPos == -1 || inputPos == outputPos) {
|
||||
// We've used up the buffered tokens; pull the next
|
||||
// input token:
|
||||
if (end) {
|
||||
return false;
|
||||
}
|
||||
final TOKEN_POS result = nextInputToken();
|
||||
if (result == TOKEN_POS.SAME) {
|
||||
return true;
|
||||
} else if (result == TOKEN_POS.NEXT) {
|
||||
tokenPending = true;
|
||||
} else {
|
||||
// NOTE: we don't set end=true here... because we
|
||||
// are immediately passing through "the end" to
|
||||
// caller (return false), and caller must not call
|
||||
// us again:
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
assert inputPos > outputPos;
|
||||
if (DEBUG) {
|
||||
System.out.println(" done @ outputPos=" + outputPos);
|
||||
}
|
||||
}
|
||||
|
||||
// We're done (above) serving up all tokens leaving
|
||||
// from the same position; now maybe insert a token.
|
||||
// Note that we may insert more than one token leaving
|
||||
// from this position. We only inject tokens at
|
||||
// positions where we've seen at least one input token
|
||||
// (ie, we cannot inject inside holes):
|
||||
|
||||
if (prevInputPos != -1 && positions.get(outputPos).startOffset != -1 && random.nextInt(7) == 5) {
|
||||
if (DEBUG) {
|
||||
System.out.println(" inject @ outputPos=" + outputPos);
|
||||
}
|
||||
|
||||
if (tokenPending) {
|
||||
positions.get(inputPos).captureState();
|
||||
}
|
||||
final int posLength = _TestUtil.nextInt(random, 1, 5);
|
||||
final Position posEndData = positions.get(outputPos + posLength);
|
||||
|
||||
// Pull enough tokens until we discover what our
|
||||
// endOffset should be:
|
||||
while (!end && posEndData.endOffset == -1 && inputPos <= (outputPos + posLength)) {
|
||||
if (DEBUG) {
|
||||
System.out.println(" lookahead [endPos=" + (outputPos + posLength) + "]...");
|
||||
}
|
||||
final TOKEN_POS result = nextInputToken();
|
||||
if (result != TOKEN_POS.END) {
|
||||
positions.get(inputPos).captureState();
|
||||
} else {
|
||||
end = true;
|
||||
if (DEBUG) {
|
||||
System.out.println(" force end lookahead");
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: really, here, on hitting end-of-tokens,
|
||||
// we'd like to know the ending "posInc", and allow
|
||||
// our token to extend up until that. But: a
|
||||
// TokenFilter is not allowed to call end() from
|
||||
// within its incrementToken, so we can't do that.
|
||||
// It may have been better if the ending
|
||||
// posInc/offsets were set when incrementToken
|
||||
// returned false (ie, without having to call the
|
||||
// special end method):
|
||||
|
||||
if (posEndData.endOffset != -1) {
|
||||
assert posEndData.endOffset != -1;
|
||||
clearAttributes();
|
||||
posLengthAtt.setPositionLength(posLength);
|
||||
termAtt.append(_TestUtil.randomUnicodeString(random));
|
||||
pushOutputPos();
|
||||
offsetAtt.setOffset(positions.get(outputPos).startOffset,
|
||||
positions.get(outputPos + posLength).endOffset);
|
||||
if (DEBUG) {
|
||||
System.out.println(" inject: outputPos=" + outputPos + " startOffset=" + offsetAtt.startOffset() +
|
||||
" endOffset=" + offsetAtt.endOffset() +
|
||||
" posLength=" + posLengthAtt.getPositionLength());
|
||||
}
|
||||
// TODO: set TypeAtt too?
|
||||
return true;
|
||||
|
||||
} else {
|
||||
// Either, we hit the end of the tokens (ie, our
|
||||
// attempted posLength is too long because it
|
||||
// hangs out over the end), or, our attempted
|
||||
// posLength ended in the middle of a hole; just
|
||||
// skip injecting in these cases. We will still
|
||||
// test these cases by having a StopFilter after
|
||||
// MockGraphTokenFilter...
|
||||
}
|
||||
|
||||
} else if (tokenPending) {
|
||||
outputPos = inputPos;
|
||||
if (DEBUG) {
|
||||
System.out.println(" pass-through");
|
||||
}
|
||||
pushOutputPos();
|
||||
return true;
|
||||
} else {
|
||||
// We are skipping over a hole (posInc > 1) from our input:
|
||||
outputPos++;
|
||||
if (DEBUG) {
|
||||
System.out.println(" incr outputPos=" + outputPos);
|
||||
}
|
||||
}
|
||||
}
|
||||
return nextToken();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,94 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
/**
|
||||
* Uses {@link LookaheadTokenFilter} to randomly peek at future tokens.
|
||||
*/
|
||||
|
||||
public final class MockRandomLookaheadTokenFilter extends LookaheadTokenFilter<LookaheadTokenFilter.Position> {
|
||||
private final static boolean DEBUG = false;
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final Random random;
|
||||
|
||||
public MockRandomLookaheadTokenFilter(Random random, TokenStream in) {
|
||||
super(in);
|
||||
this.random = random;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Position newPosition() {
|
||||
return new Position();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void afterPosition() throws IOException {
|
||||
if (!end && random.nextInt(4) == 2) {
|
||||
peekToken();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (DEBUG) {
|
||||
System.out.println("\n" + Thread.currentThread().getName() + ": MRLTF.incrToken");
|
||||
}
|
||||
|
||||
if (!end) {
|
||||
while (true) {
|
||||
// We can use un-re-seeded random, because how far
|
||||
// ahead we peek should never alter the resulting
|
||||
// tokens as seen by the consumer:
|
||||
if (random.nextInt(3) == 1) {
|
||||
if (!peekToken()) {
|
||||
if (DEBUG) {
|
||||
System.out.println(" peek; inputPos=" + inputPos + " END");
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (DEBUG) {
|
||||
System.out.println(" peek; inputPos=" + inputPos + " token=" + termAtt);
|
||||
}
|
||||
} else {
|
||||
if (DEBUG) {
|
||||
System.out.println(" done peek");
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final boolean result = nextToken();
|
||||
if (result) {
|
||||
if (DEBUG) {
|
||||
System.out.println(" return nextToken token=" + termAtt);
|
||||
}
|
||||
} else {
|
||||
if (DEBUG) {
|
||||
System.out.println(" return nextToken END");
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue