LUCENE-8564: Add GraphTokenFilter

This commit is contained in:
Alan Woodward 2018-12-03 12:17:58 +00:00
parent 6c11161111
commit f5867a1413
7 changed files with 594 additions and 233 deletions

View File

@ -155,6 +155,10 @@ New Features
* LUCENE-8216: Added a new BM25FQuery in sandbox to blend statistics across several fields
using the BM25F formula. (Adrien Grand, Jim Ferenczi)
* LUCENE-8564: GraphTokenFilter is an abstract class useful for token filters that need
to read-ahead in the token stream and take into account graph structures. This
also changes FixedShingleFilter to extend GraphTokenFilter (Alan Woodward)
Improvements
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.

View File

@ -18,16 +18,14 @@
package org.apache.lucene.analysis.shingle;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Deque;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.GraphTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* A FixedShingleFilter constructs shingles (token n-grams) from a token stream.
@ -47,27 +45,20 @@ import org.apache.lucene.util.AttributeSource;
*
* @lucene.experimental
*/
// NOTE(review): this region is scraped from a unified diff without +/- markers —
// the old declaration (extends TokenFilter) and the new one (extends
// GraphTokenFilter) are interleaved below, so this text is not compilable
// as-is. Comments only flag which implementation each line appears to belong to.
public final class FixedShingleFilter extends TokenFilter {
public final class FixedShingleFilter extends GraphTokenFilter {
// (old impl) pool of reusable Token wrappers for the read-ahead cache
private final Deque<Token> tokenPool = new ArrayDeque<>();
// (old impl) cap on shingle permutations per root position
private static final int MAX_SHINGLE_STACK_SIZE = 1000;
private static final int MAX_SHINGLE_SIZE = 4;
private final int shingleSize;
private final String tokenSeparator;
// (old impl) sentinel tokens for position gaps and end-of-stream
private final Token gapToken = new Token(new AttributeSource());
private final Token endToken = new Token(new AttributeSource());
// (new impl) stores the filler text directly instead of a gap sentinel token
private final String fillerToken;
private final PositionIncrementAttribute incAtt = addAttribute(PositionIncrementAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
// (old impl) shingle traversal state
private Token[] currentShingleTokens;
private int currentShingleStackSize;
private boolean inputStreamExhausted = false;
// (new impl) scratch buffer the shingle text is assembled into
private final CharTermAttribute buffer = new CharTermAttributeImpl();
/**
* Creates a FixedShingleFilter over an input token stream
@ -89,228 +80,82 @@ public final class FixedShingleFilter extends TokenFilter {
*/
public FixedShingleFilter(TokenStream input, int shingleSize, String tokenSeparator, String fillerToken) {
super(input);
// only fixed sizes in [2, MAX_SHINGLE_SIZE] are supported
if (shingleSize <= 1 || shingleSize > MAX_SHINGLE_SIZE) {
throw new IllegalArgumentException("Shingle size must be between 2 and " + MAX_SHINGLE_SIZE + ", got " + shingleSize);
}
this.shingleSize = shingleSize;
this.tokenSeparator = tokenSeparator;
// (old impl) seeds the gap sentinel with the filler text; (new impl) keeps the raw string
this.gapToken.termAtt.setEmpty().append(fillerToken);
this.currentShingleTokens = new Token[shingleSize];
this.fillerToken = fillerToken;
}
// NOTE(review): scraped diff region — the old TokenFilter-based incrementToken()
// and the new GraphTokenFilter-based version are interleaved here without +/-
// markers; braces do not balance and this text is not compilable as-is.
@Override
public boolean incrementToken() throws IOException {
int posInc = 0;
// (old impl) try the next permutation at the current root, else advance the root
if (nextShingle() == false) {
Token nextRoot = nextTokenInStream(currentShingleTokens[0]);
if (nextRoot == endToken)
return false;
recycleToken(currentShingleTokens[0]);
if (resetShingleRoot(nextRoot) == false) {
int shinglePosInc;
// (new impl) try the next route through the graph, else advance the base token
if (incrementGraph() == false) {
if (incrementBaseToken() == false) {
return false;
}
posInc = currentShingleTokens[0].posInc();
// starting a shingle at a new base position, use base position increment
shinglePosInc = incAtt.getPositionIncrement();
}
// (old impl) publish the assembled shingle on this filter's attributes
clearAttributes();
incAtt.setPositionIncrement(posInc);
offsetAtt.setOffset(currentShingleTokens[0].startOffset(), lastTokenInShingle().endOffset());
termAtt.setEmpty();
termAtt.append(currentShingleTokens[0].term());
typeAtt.setType("shingle");
else {
// starting a new shingle at the same base with a different graph, use a 0
// position increment
shinglePosInc = 0;
}
final int startOffset = offsetAtt.startOffset();
int endOffset = offsetAtt.endOffset();
this.buffer.setEmpty();
this.buffer.append(termAtt);
// build the shingle by iterating over the current graph, adding
// filler tokens if we encounter gaps
for (int i = 1; i < shingleSize; i++) {
termAtt.append(tokenSeparator).append(currentShingleTokens[i].term());
}
return true;
}
@Override
public void reset() throws IOException {
super.reset();
// discard all cached read-ahead state from the previous run
this.tokenPool.clear();
this.currentShingleTokens[0] = null;
this.inputStreamExhausted = false;
this.currentShingleStackSize = 0;
}
@Override
public void end() throws IOException {
// make sure the input's end state has been captured into endToken first
if (inputStreamExhausted == false) {
finishInnerStream();
}
clearAttributes();
// endToken carries the input's final offsets and trailing position increment
this.offsetAtt.setOffset(0, endToken.endOffset());
this.incAtt.setPositionIncrement(endToken.posInc());
}
// Capture the wrapped stream's end state (final offsets and any trailing
// position increment) into the endToken sentinel, once per run.
private void finishInnerStream() throws IOException {
input.end();
inputStreamExhausted = true;
// check for gaps at the end of the tokenstream
endToken.posIncAtt.setPositionIncrement(this.incAtt.getPositionIncrement());
OffsetAttribute inputOffsets = input.getAttribute(OffsetAttribute.class);
endToken.offsetAtt.setOffset(inputOffsets.startOffset(), inputOffsets.endOffset());
}
// Returns the last real (non-gap) token of the current shingle; used to
// compute the shingle's end offset.
private Token lastTokenInShingle() {
int lastTokenIndex = shingleSize - 1;
while (currentShingleTokens[lastTokenIndex] == gapToken) {
lastTokenIndex--;
}
return currentShingleTokens[lastTokenIndex];
}
// NOTE(review): scraped diff region — the old resetShingleRoot() logic and the
// new GraphTokenFilter-based shingle assembly are interleaved without +/-
// markers; control flow below does not balance and this text is not
// compilable as-is.
private boolean resetShingleRoot(Token token) throws IOException {
this.currentShingleTokens[0] = token;
for (int i = 1; i < shingleSize; i++) {
Token current = nextTokenInGraph(this.currentShingleTokens[i - 1]);
if (current == endToken) {
if (endToken.posInc() + i >= shingleSize) {
// end tokens are a special case, because their posIncs are always
// due to stopwords. Therefore, we can happily append gap tokens
// to the end of the current shingle
for (int j = i; j < shingleSize; j++) {
this.currentShingleTokens[i] = gapToken;
// (new impl) walk the graph, appending filler text for trailing gaps
if (incrementGraphToken() == false) {
// we've reached the end of the token stream, check for trailing
// positions and add fillers if necessary
int trailingPositions = getTrailingPositions();
if (i + trailingPositions < shingleSize) {
// not enough trailing positions to make a full shingle
return false;
}
while (i < shingleSize) {
this.buffer.append(tokenSeparator).append(fillerToken);
i++;
}
break;
}
int posInc = incAtt.getPositionIncrement();
if (posInc > 1) {
// if we have a posInc > 1, we need to fill in the gaps
if (i + posInc > shingleSize) {
// if the posInc is greater than the shingle size, we need to add fillers
// up to the shingle size but no further
while (i < shingleSize) {
this.buffer.append(tokenSeparator).append(fillerToken);
i++;
}
return true;
break;
}
return false;
}
if (current.posInc() > 1) {
// insert gaps into the shingle list
for (int j = 1; j < current.posInc(); j++) {
this.currentShingleTokens[i] = gapToken;
// otherwise just add them in as far as we need
while (posInc > 1) {
this.buffer.append(tokenSeparator).append(fillerToken);
posInc--;
i++;
if (i >= shingleSize)
return true;
}
}
this.currentShingleTokens[i] = current;
this.buffer.append(tokenSeparator).append(termAtt);
endOffset = offsetAtt.endOffset();
}
// (new impl) publish the assembled shingle on this filter's attributes
clearAttributes();
this.offsetAtt.setOffset(startOffset, endOffset);
this.incAtt.setPositionIncrement(shinglePosInc);
this.termAtt.setEmpty().append(buffer);
this.typeAtt.setType("shingle");
return true;
}
// A next shingle exists only once a root is set and another permutation remains
private boolean nextShingle() throws IOException {
return currentShingleTokens[0] != null && advanceStack();
}
// check if the next token in the tokenstream is at the same position as this one
private boolean lastInStack(Token token) throws IOException {
Token next = nextTokenInStream(token);
return next == endToken || next.posInc() != 0;
}
// Advance to the next permutation of stacked (same-position) tokens, trying
// the deepest shingle slot first; returns false when all permutations at the
// current root have been emitted.
private boolean advanceStack() throws IOException {
for (int i = shingleSize - 1; i >= 1; i--) {
if (currentShingleTokens[i] != gapToken && lastInStack(currentShingleTokens[i]) == false) {
currentShingleTokens[i] = nextTokenInStream(currentShingleTokens[i]);
// rebuild the tail of the shingle from the newly-chosen branch
for (int j = i + 1; j < shingleSize; j++) {
currentShingleTokens[j] = nextTokenInGraph(currentShingleTokens[j - 1]);
}
// guard against a combinatorial explosion of stacked tokens
if (currentShingleStackSize++ > MAX_SHINGLE_STACK_SIZE) {
throw new IllegalStateException("Too many shingles (> " + MAX_SHINGLE_STACK_SIZE + ") at term [" + currentShingleTokens[0].term() + "]");
}
return true;
}
}
currentShingleStackSize = 0;
return false;
}
// Obtain a Token wrapper holding the current attribute state, reusing a
// pooled wrapper when one is available.
private Token newToken() {
Token token = tokenPool.size() == 0 ? new Token(this.cloneAttributes()) : tokenPool.removeFirst();
token.reset(this);
return token;
}
// Return a wrapper to the pool once it can no longer be reached
private void recycleToken(Token token) {
if (token == null)
return;
token.nextToken = null;
tokenPool.add(token);
}
// for testing
int instantiatedTokenCount() {
int tokenCount = tokenPool.size() + 1;
if (currentShingleTokens[0] == endToken || currentShingleTokens[0] == null)
return tokenCount;
// count the cached tokens still linked from the current shingle root
for (Token t = currentShingleTokens[0]; t != endToken && t != null; t = t.nextToken) {
tokenCount++;
}
return tokenCount;
}
// Advance to the first token at a strictly later position than {@code token},
// skipping over any stacked tokens (posInc == 0) sharing its position.
private Token nextTokenInGraph(Token token) throws IOException {
  while (true) {
    token = nextTokenInStream(token);
    if (token == endToken) {
      return endToken;
    }
    if (token.posInc() != 0) {
      return token;
    }
  }
}
// Return the token following {@code token} in the wrapped stream, reading a
// new token from the input only when it has not already been cached. At
// end-of-stream, records the input's end state and returns the end sentinel.
private Token nextTokenInStream(Token token) throws IOException {
  // serve from the cached linked list when possible
  if (token != null && token.nextToken != null) {
    return token.nextToken;
  }
  if (input.incrementToken()) {
    Token fresh = newToken();
    if (token != null) {
      token.nextToken = fresh;
    }
    return fresh;
  }
  // input exhausted: capture its end state and cap the list with the sentinel
  finishInnerStream();
  if (token != null) {
    token.nextToken = endToken;
  }
  return endToken;
}
/**
 * A cached snapshot of one token from the wrapped stream, linked to the token
 * that follows it so the stream can be replayed while building shingles.
 */
private static class Token {
  final AttributeSource attSource;
  final PositionIncrementAttribute posIncAtt;
  final CharTermAttribute termAtt;
  final OffsetAttribute offsetAtt;
  // next cached token in stream order; null when not yet read
  Token nextToken;

  Token(AttributeSource source) {
    this.attSource = source;
    this.posIncAtt = source.addAttribute(PositionIncrementAttribute.class);
    this.termAtt = source.addAttribute(CharTermAttribute.class);
    this.offsetAtt = source.addAttribute(OffsetAttribute.class);
  }

  // refresh this wrapper from {@code source}'s current state and unlink it
  void reset(AttributeSource source) {
    source.copyTo(this.attSource);
    this.nextToken = null;
  }

  CharSequence term() {
    return this.termAtt;
  }

  int posInc() {
    return this.posIncAtt.getPositionIncrement();
  }

  int startOffset() {
    return this.offsetAtt.startOffset();
  }

  int endOffset() {
    return this.offsetAtt.endOffset();
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append(term()).append('(').append(startOffset()).append(',').append(endOffset()).append(") ").append(posInc());
    return sb.toString();
  }
}
}

View File

@ -163,6 +163,8 @@ public class FixedShingleFilterTest extends BaseTokenStreamTestCase {
public void testIncomingGraphs() throws IOException {
// b/a c b/a d
TokenStream ts = new CannedTokenStream(
new Token("b", 0, 1),
new Token("a", 0, 0, 1),
@ -208,21 +210,4 @@ public class FixedShingleFilterTest extends BaseTokenStreamTestCase {
assertEquals("Shingle size must be between 2 and 4, got 5", e2.getMessage());
}
// Verifies that FixedShingleFilter fails fast when a single position carries
// so many stacked tokens that the shingle permutation count exceeds the limit.
public void testShingleCountLimits() {
Token[] tokens = new Token[5000];
tokens[0] = new Token("term", 1, 0, 1);
tokens[1] = new Token("term1", 1, 2, 3);
// 4998 tokens stacked at the second position (posInc == 0)
for (int i = 2; i < 5000; i++) {
tokens[i] = new Token("term" + i, 0, 2, 3);
}
Exception e = expectThrows(IllegalStateException.class, () -> {
TokenStream ts = new FixedShingleFilter(new CannedTokenStream(tokens), 2);
ts.reset();
while (ts.incrementToken()) {}
});
assertEquals("Too many shingles (> 1000) at term [term]", e.getMessage());
}
}

View File

@ -0,0 +1,284 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.List;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* An abstract TokenFilter that exposes its input stream as a graph
*
* Call {@link #incrementBaseToken()} to move the root of the graph to the next
* position in the TokenStream, {@link #incrementGraphToken()} to move along
* the current graph, and {@link #incrementGraph()} to reset to the next graph
* based at the current root.
*
* For example, given the stream 'a b/c:2 d e', then with the base token at
* 'a', incrementGraphToken() will produce the stream 'a b d e', and then
* after calling incrementGraph() will produce the stream 'a c e'.
*/
public abstract class GraphTokenFilter extends TokenFilter {
// recycled Token wrappers; refilled by recycleToken() and drained by newToken()
private final Deque<Token> tokenPool = new ArrayDeque<>();
// tokens along the route currently being traversed; index 0 is the base token
private final List<Token> currentGraph = new ArrayList<>();
/**
* The maximum permitted number of routes through a graph
*/
public static final int MAX_GRAPH_STACK_SIZE = 1000;
/**
* The maximum permitted read-ahead in the token stream
*/
public static final int MAX_TOKEN_CACHE_SIZE = 100;
// root of the current graph; null before the first incrementBaseToken() call
// and once the stream is exhausted
private Token baseToken;
// index of the deepest token cached in currentGraph for the current route
private int graphDepth;
// current position along the route; trails graphDepth while replaying a cached prefix
private int graphPos;
// position increment reported by the input's end(); -1 until the input is exhausted
private int trailingPositions = -1;
// end offset reported by the input's end(); -1 until the input is exhausted
private int finalOffsets = -1;
// number of routes taken through the current graph, capped at MAX_GRAPH_STACK_SIZE
private int stackSize;
// number of Token wrappers created since reset(), capped at MAX_TOKEN_CACHE_SIZE
private int cacheSize;
private final PositionIncrementAttribute posIncAtt;
private final OffsetAttribute offsetAtt;
/**
* Create a new GraphTokenFilter
*/
public GraphTokenFilter(TokenStream input) {
super(input);
this.posIncAtt = input.addAttribute(PositionIncrementAttribute.class);
this.offsetAtt = input.addAttribute(OffsetAttribute.class);
}
/**
* Move the root of the graph to the next token in the wrapped TokenStream
*
* @return {@code false} if the underlying stream is exhausted
*/
protected final boolean incrementBaseToken() throws IOException {
// moving the base invalidates all per-graph traversal state
stackSize = 0;
graphDepth = 0;
graphPos = 0;
Token oldBase = baseToken;
baseToken = nextTokenInStream(baseToken);
if (baseToken == null) {
return false;
}
currentGraph.clear();
currentGraph.add(baseToken);
// expose the new base token's attributes on this filter
baseToken.attSource.copyTo(this);
// the previous base can no longer be reached, so its wrapper may be reused
recycleToken(oldBase);
return true;
}
/**
* Move to the next token in the current route through the graph
*
* @return {@code false} if there are not more tokens in the current graph
*/
protected final boolean incrementGraphToken() throws IOException {
if (graphPos < graphDepth) {
// replaying a prefix of the route that is already cached
graphPos++;
currentGraph.get(graphPos).attSource.copyTo(this);
return true;
}
// at the route's frontier: pull the next token on this route from the stream
Token token = nextTokenInGraph(currentGraph.get(graphDepth));
if (token == null) {
return false;
}
graphDepth++;
graphPos++;
currentGraph.add(graphDepth, token);
token.attSource.copyTo(this);
return true;
}
/**
* Reset to the root token again, and move down the next route through the graph
*
* @return false if there are no more routes through the graph
*/
protected final boolean incrementGraph() throws IOException {
if (baseToken == null) {
return false;
}
graphPos = 0;
// starting from the deepest point of the route, find a position that still
// has an untried stacked token and branch there
for (int i = graphDepth; i >= 1; i--) {
if (lastInStack(currentGraph.get(i)) == false) {
currentGraph.set(i, nextTokenInStream(currentGraph.get(i)));
// NOTE(review): this advances from get(j) rather than get(j - 1); since
// graphDepth is truncated to i below, slots past i are never read again
// (all lookups are bounded by graphDepth), so this looks like extra
// read-ahead only — confirm intent
for (int j = i + 1; j < graphDepth; j++) {
currentGraph.set(j, nextTokenInGraph(currentGraph.get(j)));
}
if (stackSize++ > MAX_GRAPH_STACK_SIZE) {
throw new IllegalStateException("Too many graph paths (> " + MAX_GRAPH_STACK_SIZE + ")");
}
// re-expose the base token's attributes at the start of the new route
currentGraph.get(0).attSource.copyTo(this);
graphDepth = i;
return true;
}
}
return false;
}
/**
* Return the number of trailing positions at the end of the graph
*
* NB this should only be called after {@link #incrementGraphToken()} has returned {@code false}
*/
public int getTrailingPositions() {
return trailingPositions;
}
@Override
public void end() throws IOException {
if (trailingPositions == -1) {
// the input was never exhausted by read-ahead: end it now and capture
// its trailing position increment and final offset directly
input.end();
trailingPositions = posIncAtt.getPositionIncrement();
finalOffsets = offsetAtt.endOffset();
}
else {
// the input already ended during read-ahead; replay the recorded end state
endAttributes();
this.posIncAtt.setPositionIncrement(trailingPositions);
this.offsetAtt.setOffset(finalOffsets, finalOffsets);
}
}
@Override
public void reset() throws IOException {
input.reset();
// new attributes can be added between reset() calls, so we can't reuse
// token objects from a previous run
tokenPool.clear();
cacheSize = 0;
graphDepth = 0;
trailingPositions = -1;
finalOffsets = -1;
baseToken = null;
}
// number of Token wrappers created since the last reset() (test hook)
int cachedTokenCount() {
return cacheSize;
}
// Obtain a Token wrapper snapshotting the current attribute state, preferring
// a pooled wrapper over a fresh allocation.
private Token newToken() {
if (tokenPool.size() == 0) {
cacheSize++;
// guard against unbounded read-ahead on pathological streams
if (cacheSize > MAX_TOKEN_CACHE_SIZE) {
throw new IllegalStateException("Too many cached tokens (> " + MAX_TOKEN_CACHE_SIZE + ")");
}
return new Token(this.cloneAttributes());
}
Token token = tokenPool.removeFirst();
token.reset(input);
return token;
}
// return a wrapper to the pool once it is no longer reachable
private void recycleToken(Token token) {
if (token == null)
return;
token.nextToken = null;
tokenPool.add(token);
}
// Find the next token along the current route: consume as many position
// increments as the current token's position length, so that multi-position
// tokens jump over the tokens they span.
private Token nextTokenInGraph(Token token) throws IOException {
int remaining = token.length();
do {
token = nextTokenInStream(token);
if (token == null) {
return null;
}
remaining -= token.posInc();
} while (remaining > 0);
return token;
}
// check if the next token in the tokenstream is at the same position as this one
private boolean lastInStack(Token token) throws IOException {
Token next = nextTokenInStream(token);
return next == null || next.posInc() != 0;
}
// Return the token following {@code token}, reading from the input only when
// it has not already been cached; returns null at end-of-stream.
private Token nextTokenInStream(Token token) throws IOException {
if (token != null && token.nextToken != null) {
return token.nextToken;
}
if (this.trailingPositions != -1) {
// already hit the end
return null;
}
if (input.incrementToken() == false) {
// capture the end state now so end() can replay it later
input.end();
trailingPositions = posIncAtt.getPositionIncrement();
finalOffsets = offsetAtt.endOffset();
return null;
}
if (token == null) {
return newToken();
}
token.nextToken = newToken();
return token.nextToken;
}
// A cached token from the input stream, linked to its successor so that the
// stream can be replayed while exploring different routes through the graph.
private static class Token {
final AttributeSource attSource;
final PositionIncrementAttribute posIncAtt;
// null when the stream exposes no PositionLengthAttribute (length defaults to 1)
final PositionLengthAttribute lengthAtt;
Token nextToken;
Token(AttributeSource attSource) {
this.attSource = attSource;
this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class);
boolean hasLengthAtt = attSource.hasAttribute(PositionLengthAttribute.class);
this.lengthAtt = hasLengthAtt ? attSource.addAttribute(PositionLengthAttribute.class) : null;
}
int posInc() {
return this.posIncAtt.getPositionIncrement();
}
int length() {
if (this.lengthAtt == null) {
return 1;
}
return this.lengthAtt.getPositionLength();
}
void reset(AttributeSource attSource) {
attSource.copyTo(this.attSource);
this.nextToken = null;
}
@Override
public String toString() {
return attSource.toString();
}
}
}

View File

@ -0,0 +1,236 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
* Tests for GraphTokenFilter, stepping the base/graph/graph-token cursors by
* hand over streams containing stacked tokens and multi-position tokens.
*/
public class TestGraphTokenFilter extends BaseTokenStreamTestCase {
// minimal concrete subclass: incrementToken() simply advances the base position
static class TestFilter extends GraphTokenFilter {
public TestFilter(TokenStream input) {
super(input);
}
@Override
public final boolean incrementToken() throws IOException {
return incrementBaseToken();
}
}
public void testGraphTokenStream() throws IOException {
TestGraphTokenizers.GraphTokenizer tok = new TestGraphTokenizers.GraphTokenizer();
GraphTokenFilter graph = new TestFilter(tok);
CharTermAttribute termAtt = graph.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncAtt = graph.addAttribute(PositionIncrementAttribute.class);
// 'b'/'c' are stacked, 'f' spans three positions, 'g'/'h' are stacked
tok.setReader(new StringReader("a b/c d e/f:3 g/h i j k"));
tok.reset();
// no route exists before the first base token has been read
assertFalse(graph.incrementGraph());
assertEquals(0, graph.cachedTokenCount());
assertTrue(graph.incrementBaseToken());
assertEquals("a", termAtt.toString());
assertEquals(1, posIncAtt.getPositionIncrement());
// first route from 'a': a b d
assertTrue(graph.incrementGraphToken());
assertEquals("b", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("d", termAtt.toString());
// second route from 'a': a c d
assertTrue(graph.incrementGraph());
assertEquals("a", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("c", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("d", termAtt.toString());
assertFalse(graph.incrementGraph());
assertEquals(5, graph.cachedTokenCount());
// routes from 'b': b d e, then b d f
assertTrue(graph.incrementBaseToken());
assertEquals("b", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("d", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("e", termAtt.toString());
assertTrue(graph.incrementGraph());
assertEquals("b", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("d", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("f", termAtt.toString());
assertFalse(graph.incrementGraph());
assertEquals(6, graph.cachedTokenCount());
// 'c' is stacked on 'b', so its position increment is 0
assertTrue(graph.incrementBaseToken());
assertEquals("c", termAtt.toString());
assertEquals(0, posIncAtt.getPositionIncrement());
assertTrue(graph.incrementGraphToken());
assertEquals("d", termAtt.toString());
assertFalse(graph.incrementGraph());
assertEquals(6, graph.cachedTokenCount());
// routes from 'd': d e g, d e h, then d f j ('f' jumps over three positions)
assertTrue(graph.incrementBaseToken());
assertEquals("d", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("e", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("g", termAtt.toString());
assertTrue(graph.incrementGraph());
assertEquals("d", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("e", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("h", termAtt.toString());
assertTrue(graph.incrementGraph());
assertEquals("d", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("f", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("j", termAtt.toString());
assertFalse(graph.incrementGraph());
assertEquals(8, graph.cachedTokenCount());
// routes from 'e': e g i j, then e h
assertTrue(graph.incrementBaseToken());
assertEquals("e", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("g", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("i", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("j", termAtt.toString());
assertTrue(graph.incrementGraph());
assertEquals("e", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("h", termAtt.toString());
assertFalse(graph.incrementGraph());
assertEquals(8, graph.cachedTokenCount());
// 'f' spans to 'j', so its route continues j k
assertTrue(graph.incrementBaseToken());
assertEquals("f", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("j", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("k", termAtt.toString());
assertFalse(graph.incrementGraphToken());
assertFalse(graph.incrementGraph());
assertEquals(8, graph.cachedTokenCount());
assertTrue(graph.incrementBaseToken());
assertEquals("g", termAtt.toString());
assertTrue(graph.incrementGraphToken());
assertEquals("i", termAtt.toString());
assertFalse(graph.incrementGraph());
assertEquals(8, graph.cachedTokenCount());
assertTrue(graph.incrementBaseToken());
assertEquals("h", termAtt.toString());
assertFalse(graph.incrementGraph());
assertEquals(8, graph.cachedTokenCount());
// advance through i, j to the final token 'k'
assertTrue(graph.incrementBaseToken());
assertTrue(graph.incrementBaseToken());
assertTrue(graph.incrementBaseToken());
assertEquals("k", termAtt.toString());
assertFalse(graph.incrementGraphToken());
// no trailing holes after 'k'
assertEquals(0, graph.getTrailingPositions());
assertFalse(graph.incrementGraph());
assertFalse(graph.incrementBaseToken());
assertEquals(8, graph.cachedTokenCount());
}
public void testTrailingPositions() throws IOException {
// a/b:2 c _  ('b' spans two positions; one trailing position after 'c')
CannedTokenStream cts = new CannedTokenStream(1, 5,
new Token("a", 0, 1),
new Token("b", 0, 0, 1, 2),
new Token("c", 1, 2, 3)
);
GraphTokenFilter gts = new TestFilter(cts);
assertFalse(gts.incrementGraph());
assertTrue(gts.incrementBaseToken());
assertTrue(gts.incrementGraphToken());
assertFalse(gts.incrementGraphToken());
assertEquals(1, gts.getTrailingPositions());
assertFalse(gts.incrementGraph());
assertTrue(gts.incrementBaseToken());
assertFalse(gts.incrementGraphToken());
assertEquals(1, gts.getTrailingPositions());
assertFalse(gts.incrementGraph());
}
public void testMaximumGraphCacheSize() throws IOException {
// one more token than the read-ahead cache permits
Token[] tokens = new Token[GraphTokenFilter.MAX_TOKEN_CACHE_SIZE + 5];
for (int i = 0; i < GraphTokenFilter.MAX_TOKEN_CACHE_SIZE + 5; i++) {
tokens[i] = new Token("a", 1, i * 2, i * 2 + 1);
}
GraphTokenFilter gts = new TestFilter(new CannedTokenStream(tokens));
Exception e = expectThrows(IllegalStateException.class, () -> {
gts.reset();
gts.incrementBaseToken();
while (true) {
gts.incrementGraphToken();
}
});
assertEquals("Too many cached tokens (> 100)", e.getMessage());
gts.reset();
// after reset, the cache should be cleared and so we can read ahead once more
gts.incrementBaseToken();
gts.incrementGraphToken();
}
public void testGraphPathCountLimits() {
// alternate stacked/non-stacked tokens to create an explosion of routes
Token[] tokens = new Token[50];
tokens[0] = new Token("term", 1, 0, 1);
tokens[1] = new Token("term1", 1, 2, 3);
for (int i = 2; i < 50; i++) {
tokens[i] = new Token("term" + i, i % 2, 2, 3);
}
Exception e = expectThrows(IllegalStateException.class, () -> {
GraphTokenFilter graph = new TestFilter(new CannedTokenStream(tokens));
graph.reset();
graph.incrementBaseToken();
for (int i = 0; i < 10; i++) {
graph.incrementGraphToken();
}
while (graph.incrementGraph()) {
for (int i = 0; i < 10; i++) {
graph.incrementGraphToken();
}
}
});
assertEquals("Too many graph paths (> 1000)", e.getMessage());
}
}

View File

@ -56,7 +56,7 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
// you cannot turn on MockCharFilter when random
// testing...
private static class GraphTokenizer extends Tokenizer {
public static final class GraphTokenizer extends Tokenizer {
private List<Token> tokens;
private int upto;
private int inputLength;

View File

@ -82,6 +82,13 @@ public class Token extends PackedTokenAttributeImpl implements FlagsAttribute, P
setPositionIncrement(posInc);
}
/**
* Creates a Token with the given term text, position increment, offsets and
* position length.
*
* @param text the token's term text
* @param posInc the position increment relative to the preceding token
* @param start the start offset
* @param end the end offset
* @param posLength the number of positions this token spans
*/
public Token(CharSequence text, int posInc, int start, int end, int posLength) {
append(text);
setOffset(start, end);
setPositionIncrement(posInc);
setPositionLength(posLength);
}
/**
* {@inheritDoc}
* @see FlagsAttribute