mirror of https://github.com/apache/lucene.git
LUCENE-8564: Add GraphTokenFilter
This commit is contained in:
parent
6c11161111
commit
f5867a1413
|
@ -155,6 +155,10 @@ New Features
|
|||
* LUCENE-8216: Added a new BM25FQuery in sandbox to blend statistics across several fields
|
||||
using the BM25F formula. (Adrien Grand, Jim Ferenczi)
|
||||
|
||||
* LUCENE-8564: GraphTokenFilter is an abstract class useful for token filters that need
|
||||
to read-ahead in the token stream and take into account graph structures. This
|
||||
also changes FixedShingleFilter to extend GraphTokenFilter (Alan Woodward)
|
||||
|
||||
Improvements
|
||||
|
||||
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
|
||||
|
|
|
@ -18,16 +18,14 @@
|
|||
package org.apache.lucene.analysis.shingle;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.Deque;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.GraphTokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
/**
|
||||
* A FixedShingleFilter constructs shingles (token n-grams) from a token stream.
|
||||
|
@ -47,27 +45,20 @@ import org.apache.lucene.util.AttributeSource;
|
|||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class FixedShingleFilter extends TokenFilter {
|
||||
public final class FixedShingleFilter extends GraphTokenFilter {
|
||||
|
||||
private final Deque<Token> tokenPool = new ArrayDeque<>();
|
||||
|
||||
private static final int MAX_SHINGLE_STACK_SIZE = 1000;
|
||||
private static final int MAX_SHINGLE_SIZE = 4;
|
||||
|
||||
private final int shingleSize;
|
||||
private final String tokenSeparator;
|
||||
|
||||
private final Token gapToken = new Token(new AttributeSource());
|
||||
private final Token endToken = new Token(new AttributeSource());
|
||||
private final String fillerToken;
|
||||
|
||||
private final PositionIncrementAttribute incAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
|
||||
private Token[] currentShingleTokens;
|
||||
private int currentShingleStackSize;
|
||||
private boolean inputStreamExhausted = false;
|
||||
private final CharTermAttribute buffer = new CharTermAttributeImpl();
|
||||
|
||||
/**
|
||||
* Creates a FixedShingleFilter over an input token stream
|
||||
|
@ -89,228 +80,82 @@ public final class FixedShingleFilter extends TokenFilter {
|
|||
*/
|
||||
public FixedShingleFilter(TokenStream input, int shingleSize, String tokenSeparator, String fillerToken) {
|
||||
super(input);
|
||||
|
||||
if (shingleSize <= 1 || shingleSize > MAX_SHINGLE_SIZE) {
|
||||
throw new IllegalArgumentException("Shingle size must be between 2 and " + MAX_SHINGLE_SIZE + ", got " + shingleSize);
|
||||
}
|
||||
this.shingleSize = shingleSize;
|
||||
this.tokenSeparator = tokenSeparator;
|
||||
this.gapToken.termAtt.setEmpty().append(fillerToken);
|
||||
this.currentShingleTokens = new Token[shingleSize];
|
||||
this.fillerToken = fillerToken;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
int posInc = 0;
|
||||
if (nextShingle() == false) {
|
||||
Token nextRoot = nextTokenInStream(currentShingleTokens[0]);
|
||||
if (nextRoot == endToken)
|
||||
return false;
|
||||
recycleToken(currentShingleTokens[0]);
|
||||
if (resetShingleRoot(nextRoot) == false) {
|
||||
|
||||
int shinglePosInc;
|
||||
if (incrementGraph() == false) {
|
||||
if (incrementBaseToken() == false) {
|
||||
return false;
|
||||
}
|
||||
posInc = currentShingleTokens[0].posInc();
|
||||
// starting a shingle at a new base position, use base position increment
|
||||
shinglePosInc = incAtt.getPositionIncrement();
|
||||
}
|
||||
clearAttributes();
|
||||
incAtt.setPositionIncrement(posInc);
|
||||
offsetAtt.setOffset(currentShingleTokens[0].startOffset(), lastTokenInShingle().endOffset());
|
||||
termAtt.setEmpty();
|
||||
termAtt.append(currentShingleTokens[0].term());
|
||||
typeAtt.setType("shingle");
|
||||
else {
|
||||
// starting a new shingle at the same base with a different graph, use a 0
|
||||
// position increment
|
||||
shinglePosInc = 0;
|
||||
}
|
||||
|
||||
final int startOffset = offsetAtt.startOffset();
|
||||
int endOffset = offsetAtt.endOffset();
|
||||
this.buffer.setEmpty();
|
||||
this.buffer.append(termAtt);
|
||||
|
||||
// build the shingle by iterating over the current graph, adding
|
||||
// filler tokens if we encounter gaps
|
||||
for (int i = 1; i < shingleSize; i++) {
|
||||
termAtt.append(tokenSeparator).append(currentShingleTokens[i].term());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
this.tokenPool.clear();
|
||||
this.currentShingleTokens[0] = null;
|
||||
this.inputStreamExhausted = false;
|
||||
this.currentShingleStackSize = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void end() throws IOException {
|
||||
if (inputStreamExhausted == false) {
|
||||
finishInnerStream();
|
||||
}
|
||||
clearAttributes();
|
||||
this.offsetAtt.setOffset(0, endToken.endOffset());
|
||||
this.incAtt.setPositionIncrement(endToken.posInc());
|
||||
}
|
||||
|
||||
private void finishInnerStream() throws IOException {
|
||||
input.end();
|
||||
inputStreamExhausted = true;
|
||||
// check for gaps at the end of the tokenstream
|
||||
endToken.posIncAtt.setPositionIncrement(this.incAtt.getPositionIncrement());
|
||||
OffsetAttribute inputOffsets = input.getAttribute(OffsetAttribute.class);
|
||||
endToken.offsetAtt.setOffset(inputOffsets.startOffset(), inputOffsets.endOffset());
|
||||
}
|
||||
|
||||
private Token lastTokenInShingle() {
|
||||
int lastTokenIndex = shingleSize - 1;
|
||||
while (currentShingleTokens[lastTokenIndex] == gapToken) {
|
||||
lastTokenIndex--;
|
||||
}
|
||||
return currentShingleTokens[lastTokenIndex];
|
||||
}
|
||||
|
||||
private boolean resetShingleRoot(Token token) throws IOException {
|
||||
this.currentShingleTokens[0] = token;
|
||||
for (int i = 1; i < shingleSize; i++) {
|
||||
Token current = nextTokenInGraph(this.currentShingleTokens[i - 1]);
|
||||
if (current == endToken) {
|
||||
if (endToken.posInc() + i >= shingleSize) {
|
||||
// end tokens are a special case, because their posIncs are always
|
||||
// due to stopwords. Therefore, we can happily append gap tokens
|
||||
// to the end of the current shingle
|
||||
for (int j = i; j < shingleSize; j++) {
|
||||
this.currentShingleTokens[i] = gapToken;
|
||||
if (incrementGraphToken() == false) {
|
||||
// we've reached the end of the token stream, check for trailing
|
||||
// positions and add fillers if necessary
|
||||
int trailingPositions = getTrailingPositions();
|
||||
if (i + trailingPositions < shingleSize) {
|
||||
// not enough trailing positions to make a full shingle
|
||||
return false;
|
||||
}
|
||||
while (i < shingleSize) {
|
||||
this.buffer.append(tokenSeparator).append(fillerToken);
|
||||
i++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
int posInc = incAtt.getPositionIncrement();
|
||||
if (posInc > 1) {
|
||||
// if we have a posInc > 1, we need to fill in the gaps
|
||||
if (i + posInc > shingleSize) {
|
||||
// if the posInc is greater than the shingle size, we need to add fillers
|
||||
// up to the shingle size but no further
|
||||
while (i < shingleSize) {
|
||||
this.buffer.append(tokenSeparator).append(fillerToken);
|
||||
i++;
|
||||
}
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
if (current.posInc() > 1) {
|
||||
// insert gaps into the shingle list
|
||||
for (int j = 1; j < current.posInc(); j++) {
|
||||
this.currentShingleTokens[i] = gapToken;
|
||||
// otherwise just add them in as far as we need
|
||||
while (posInc > 1) {
|
||||
this.buffer.append(tokenSeparator).append(fillerToken);
|
||||
posInc--;
|
||||
i++;
|
||||
if (i >= shingleSize)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
this.currentShingleTokens[i] = current;
|
||||
this.buffer.append(tokenSeparator).append(termAtt);
|
||||
endOffset = offsetAtt.endOffset();
|
||||
}
|
||||
clearAttributes();
|
||||
this.offsetAtt.setOffset(startOffset, endOffset);
|
||||
this.incAtt.setPositionIncrement(shinglePosInc);
|
||||
this.termAtt.setEmpty().append(buffer);
|
||||
this.typeAtt.setType("shingle");
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean nextShingle() throws IOException {
|
||||
return currentShingleTokens[0] != null && advanceStack();
|
||||
}
|
||||
|
||||
// check if the next token in the tokenstream is at the same position as this one
|
||||
private boolean lastInStack(Token token) throws IOException {
|
||||
Token next = nextTokenInStream(token);
|
||||
return next == endToken || next.posInc() != 0;
|
||||
}
|
||||
|
||||
private boolean advanceStack() throws IOException {
|
||||
for (int i = shingleSize - 1; i >= 1; i--) {
|
||||
if (currentShingleTokens[i] != gapToken && lastInStack(currentShingleTokens[i]) == false) {
|
||||
currentShingleTokens[i] = nextTokenInStream(currentShingleTokens[i]);
|
||||
for (int j = i + 1; j < shingleSize; j++) {
|
||||
currentShingleTokens[j] = nextTokenInGraph(currentShingleTokens[j - 1]);
|
||||
}
|
||||
if (currentShingleStackSize++ > MAX_SHINGLE_STACK_SIZE) {
|
||||
throw new IllegalStateException("Too many shingles (> " + MAX_SHINGLE_STACK_SIZE + ") at term [" + currentShingleTokens[0].term() + "]");
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
currentShingleStackSize = 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
private Token newToken() {
|
||||
Token token = tokenPool.size() == 0 ? new Token(this.cloneAttributes()) : tokenPool.removeFirst();
|
||||
token.reset(this);
|
||||
return token;
|
||||
}
|
||||
|
||||
private void recycleToken(Token token) {
|
||||
if (token == null)
|
||||
return;
|
||||
token.nextToken = null;
|
||||
tokenPool.add(token);
|
||||
}
|
||||
|
||||
// for testing
|
||||
int instantiatedTokenCount() {
|
||||
int tokenCount = tokenPool.size() + 1;
|
||||
if (currentShingleTokens[0] == endToken || currentShingleTokens[0] == null)
|
||||
return tokenCount;
|
||||
for (Token t = currentShingleTokens[0]; t != endToken && t != null; t = t.nextToken) {
|
||||
tokenCount++;
|
||||
}
|
||||
return tokenCount;
|
||||
}
|
||||
|
||||
private Token nextTokenInGraph(Token token) throws IOException {
|
||||
do {
|
||||
token = nextTokenInStream(token);
|
||||
if (token == endToken) {
|
||||
return endToken;
|
||||
}
|
||||
} while (token.posInc() == 0);
|
||||
return token;
|
||||
}
|
||||
|
||||
private Token nextTokenInStream(Token token) throws IOException {
|
||||
if (token != null && token.nextToken != null) {
|
||||
return token.nextToken;
|
||||
}
|
||||
if (input.incrementToken() == false) {
|
||||
finishInnerStream();
|
||||
if (token == null) {
|
||||
return endToken;
|
||||
} else {
|
||||
token.nextToken = endToken;
|
||||
return endToken;
|
||||
}
|
||||
}
|
||||
if (token == null) {
|
||||
return newToken();
|
||||
}
|
||||
token.nextToken = newToken();
|
||||
return token.nextToken;
|
||||
}
|
||||
|
||||
private static class Token {
|
||||
final AttributeSource attSource;
|
||||
final PositionIncrementAttribute posIncAtt;
|
||||
final CharTermAttribute termAtt;
|
||||
final OffsetAttribute offsetAtt;
|
||||
|
||||
Token nextToken;
|
||||
|
||||
Token(AttributeSource attSource) {
|
||||
this.attSource = attSource;
|
||||
this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class);
|
||||
this.termAtt = attSource.addAttribute(CharTermAttribute.class);
|
||||
this.offsetAtt = attSource.addAttribute(OffsetAttribute.class);
|
||||
}
|
||||
|
||||
int posInc() {
|
||||
return this.posIncAtt.getPositionIncrement();
|
||||
}
|
||||
|
||||
CharSequence term() {
|
||||
return this.termAtt;
|
||||
}
|
||||
|
||||
int startOffset() {
|
||||
return this.offsetAtt.startOffset();
|
||||
}
|
||||
|
||||
int endOffset() {
|
||||
return this.offsetAtt.endOffset();
|
||||
}
|
||||
|
||||
void reset(AttributeSource attSource) {
|
||||
attSource.copyTo(this.attSource);
|
||||
this.nextToken = null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return term() + "(" + startOffset() + "," + endOffset() + ") " + posInc();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -163,6 +163,8 @@ public class FixedShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
|
||||
public void testIncomingGraphs() throws IOException {
|
||||
|
||||
// b/a c b/a d
|
||||
|
||||
TokenStream ts = new CannedTokenStream(
|
||||
new Token("b", 0, 1),
|
||||
new Token("a", 0, 0, 1),
|
||||
|
@ -208,21 +210,4 @@ public class FixedShingleFilterTest extends BaseTokenStreamTestCase {
|
|||
assertEquals("Shingle size must be between 2 and 4, got 5", e2.getMessage());
|
||||
}
|
||||
|
||||
public void testShingleCountLimits() {
|
||||
|
||||
Token[] tokens = new Token[5000];
|
||||
tokens[0] = new Token("term", 1, 0, 1);
|
||||
tokens[1] = new Token("term1", 1, 2, 3);
|
||||
for (int i = 2; i < 5000; i++) {
|
||||
tokens[i] = new Token("term" + i, 0, 2, 3);
|
||||
}
|
||||
|
||||
Exception e = expectThrows(IllegalStateException.class, () -> {
|
||||
TokenStream ts = new FixedShingleFilter(new CannedTokenStream(tokens), 2);
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {}
|
||||
});
|
||||
assertEquals("Too many shingles (> 1000) at term [term]", e.getMessage());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,284 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Deque;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
/**
|
||||
* An abstract TokenFilter that exposes its input stream as a graph
|
||||
*
|
||||
* Call {@link #incrementBaseToken()} to move the root of the graph to the next
|
||||
* position in the TokenStream, {@link #incrementGraphToken()} to move along
|
||||
* the current graph, and {@link #incrementGraph()} to reset to the next graph
|
||||
* based at the current root.
|
||||
*
|
||||
* For example, given the stream 'a b/c:2 d e`, then with the base token at
|
||||
* 'a', incrementGraphToken() will produce the stream 'a b d e', and then
|
||||
* after calling incrementGraph() will produce the stream 'a c e'.
|
||||
*/
|
||||
public abstract class GraphTokenFilter extends TokenFilter {
|
||||
|
||||
private final Deque<Token> tokenPool = new ArrayDeque<>();
|
||||
private final List<Token> currentGraph = new ArrayList<>();
|
||||
|
||||
/**
|
||||
* The maximum permitted number of routes through a graph
|
||||
*/
|
||||
public static final int MAX_GRAPH_STACK_SIZE = 1000;
|
||||
|
||||
/**
|
||||
* The maximum permitted read-ahead in the token stream
|
||||
*/
|
||||
public static final int MAX_TOKEN_CACHE_SIZE = 100;
|
||||
|
||||
private Token baseToken;
|
||||
private int graphDepth;
|
||||
private int graphPos;
|
||||
private int trailingPositions = -1;
|
||||
private int finalOffsets = -1;
|
||||
|
||||
private int stackSize;
|
||||
private int cacheSize;
|
||||
|
||||
private final PositionIncrementAttribute posIncAtt;
|
||||
private final OffsetAttribute offsetAtt;
|
||||
|
||||
/**
|
||||
* Create a new GraphTokenFilter
|
||||
*/
|
||||
public GraphTokenFilter(TokenStream input) {
|
||||
super(input);
|
||||
this.posIncAtt = input.addAttribute(PositionIncrementAttribute.class);
|
||||
this.offsetAtt = input.addAttribute(OffsetAttribute.class);
|
||||
}
|
||||
|
||||
/**
|
||||
* Move the root of the graph to the next token in the wrapped TokenStream
|
||||
*
|
||||
* @return {@code false} if the underlying stream is exhausted
|
||||
*/
|
||||
protected final boolean incrementBaseToken() throws IOException {
|
||||
stackSize = 0;
|
||||
graphDepth = 0;
|
||||
graphPos = 0;
|
||||
Token oldBase = baseToken;
|
||||
baseToken = nextTokenInStream(baseToken);
|
||||
if (baseToken == null) {
|
||||
return false;
|
||||
}
|
||||
currentGraph.clear();
|
||||
currentGraph.add(baseToken);
|
||||
baseToken.attSource.copyTo(this);
|
||||
recycleToken(oldBase);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Move to the next token in the current route through the graph
|
||||
*
|
||||
* @return {@code false} if there are not more tokens in the current graph
|
||||
*/
|
||||
protected final boolean incrementGraphToken() throws IOException {
|
||||
if (graphPos < graphDepth) {
|
||||
graphPos++;
|
||||
currentGraph.get(graphPos).attSource.copyTo(this);
|
||||
return true;
|
||||
}
|
||||
Token token = nextTokenInGraph(currentGraph.get(graphDepth));
|
||||
if (token == null) {
|
||||
return false;
|
||||
}
|
||||
graphDepth++;
|
||||
graphPos++;
|
||||
currentGraph.add(graphDepth, token);
|
||||
token.attSource.copyTo(this);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset to the root token again, and move down the next route through the graph
|
||||
*
|
||||
* @return false if there are no more routes through the graph
|
||||
*/
|
||||
protected final boolean incrementGraph() throws IOException {
|
||||
if (baseToken == null) {
|
||||
return false;
|
||||
}
|
||||
graphPos = 0;
|
||||
for (int i = graphDepth; i >= 1; i--) {
|
||||
if (lastInStack(currentGraph.get(i)) == false) {
|
||||
currentGraph.set(i, nextTokenInStream(currentGraph.get(i)));
|
||||
for (int j = i + 1; j < graphDepth; j++) {
|
||||
currentGraph.set(j, nextTokenInGraph(currentGraph.get(j)));
|
||||
}
|
||||
if (stackSize++ > MAX_GRAPH_STACK_SIZE) {
|
||||
throw new IllegalStateException("Too many graph paths (> " + MAX_GRAPH_STACK_SIZE + ")");
|
||||
}
|
||||
currentGraph.get(0).attSource.copyTo(this);
|
||||
graphDepth = i;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of trailing positions at the end of the graph
|
||||
*
|
||||
* NB this should only be called after {@link #incrementGraphToken()} has returned {@code false}
|
||||
*/
|
||||
public int getTrailingPositions() {
|
||||
return trailingPositions;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void end() throws IOException {
|
||||
if (trailingPositions == -1) {
|
||||
input.end();
|
||||
trailingPositions = posIncAtt.getPositionIncrement();
|
||||
finalOffsets = offsetAtt.endOffset();
|
||||
}
|
||||
else {
|
||||
endAttributes();
|
||||
this.posIncAtt.setPositionIncrement(trailingPositions);
|
||||
this.offsetAtt.setOffset(finalOffsets, finalOffsets);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
input.reset();
|
||||
// new attributes can be added between reset() calls, so we can't reuse
|
||||
// token objects from a previous run
|
||||
tokenPool.clear();
|
||||
cacheSize = 0;
|
||||
graphDepth = 0;
|
||||
trailingPositions = -1;
|
||||
finalOffsets = -1;
|
||||
baseToken = null;
|
||||
}
|
||||
|
||||
int cachedTokenCount() {
|
||||
return cacheSize;
|
||||
}
|
||||
|
||||
private Token newToken() {
|
||||
if (tokenPool.size() == 0) {
|
||||
cacheSize++;
|
||||
if (cacheSize > MAX_TOKEN_CACHE_SIZE) {
|
||||
throw new IllegalStateException("Too many cached tokens (> " + MAX_TOKEN_CACHE_SIZE + ")");
|
||||
}
|
||||
return new Token(this.cloneAttributes());
|
||||
}
|
||||
Token token = tokenPool.removeFirst();
|
||||
token.reset(input);
|
||||
return token;
|
||||
}
|
||||
|
||||
private void recycleToken(Token token) {
|
||||
if (token == null)
|
||||
return;
|
||||
token.nextToken = null;
|
||||
tokenPool.add(token);
|
||||
}
|
||||
|
||||
private Token nextTokenInGraph(Token token) throws IOException {
|
||||
int remaining = token.length();
|
||||
do {
|
||||
token = nextTokenInStream(token);
|
||||
if (token == null) {
|
||||
return null;
|
||||
}
|
||||
remaining -= token.posInc();
|
||||
} while (remaining > 0);
|
||||
return token;
|
||||
}
|
||||
|
||||
// check if the next token in the tokenstream is at the same position as this one
|
||||
private boolean lastInStack(Token token) throws IOException {
|
||||
Token next = nextTokenInStream(token);
|
||||
return next == null || next.posInc() != 0;
|
||||
}
|
||||
|
||||
private Token nextTokenInStream(Token token) throws IOException {
|
||||
if (token != null && token.nextToken != null) {
|
||||
return token.nextToken;
|
||||
}
|
||||
if (this.trailingPositions != -1) {
|
||||
// already hit the end
|
||||
return null;
|
||||
}
|
||||
if (input.incrementToken() == false) {
|
||||
input.end();
|
||||
trailingPositions = posIncAtt.getPositionIncrement();
|
||||
finalOffsets = offsetAtt.endOffset();
|
||||
return null;
|
||||
}
|
||||
if (token == null) {
|
||||
return newToken();
|
||||
}
|
||||
token.nextToken = newToken();
|
||||
return token.nextToken;
|
||||
}
|
||||
|
||||
private static class Token {
|
||||
|
||||
final AttributeSource attSource;
|
||||
final PositionIncrementAttribute posIncAtt;
|
||||
final PositionLengthAttribute lengthAtt;
|
||||
Token nextToken;
|
||||
|
||||
Token(AttributeSource attSource) {
|
||||
this.attSource = attSource;
|
||||
this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class);
|
||||
boolean hasLengthAtt = attSource.hasAttribute(PositionLengthAttribute.class);
|
||||
this.lengthAtt = hasLengthAtt ? attSource.addAttribute(PositionLengthAttribute.class) : null;
|
||||
}
|
||||
|
||||
int posInc() {
|
||||
return this.posIncAtt.getPositionIncrement();
|
||||
}
|
||||
|
||||
int length() {
|
||||
if (this.lengthAtt == null) {
|
||||
return 1;
|
||||
}
|
||||
return this.lengthAtt.getPositionLength();
|
||||
}
|
||||
|
||||
void reset(AttributeSource attSource) {
|
||||
attSource.copyTo(this.attSource);
|
||||
this.nextToken = null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return attSource.toString();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,236 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
|
||||
public class TestGraphTokenFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
static class TestFilter extends GraphTokenFilter {
|
||||
|
||||
public TestFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
return incrementBaseToken();
|
||||
}
|
||||
}
|
||||
|
||||
public void testGraphTokenStream() throws IOException {
|
||||
|
||||
TestGraphTokenizers.GraphTokenizer tok = new TestGraphTokenizers.GraphTokenizer();
|
||||
GraphTokenFilter graph = new TestFilter(tok);
|
||||
|
||||
CharTermAttribute termAtt = graph.addAttribute(CharTermAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = graph.addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
tok.setReader(new StringReader("a b/c d e/f:3 g/h i j k"));
|
||||
tok.reset();
|
||||
|
||||
assertFalse(graph.incrementGraph());
|
||||
assertEquals(0, graph.cachedTokenCount());
|
||||
|
||||
assertTrue(graph.incrementBaseToken());
|
||||
assertEquals("a", termAtt.toString());
|
||||
assertEquals(1, posIncAtt.getPositionIncrement());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("b", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("d", termAtt.toString());
|
||||
assertTrue(graph.incrementGraph());
|
||||
assertEquals("a", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("c", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("d", termAtt.toString());
|
||||
assertFalse(graph.incrementGraph());
|
||||
assertEquals(5, graph.cachedTokenCount());
|
||||
|
||||
assertTrue(graph.incrementBaseToken());
|
||||
assertEquals("b", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("d", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("e", termAtt.toString());
|
||||
assertTrue(graph.incrementGraph());
|
||||
assertEquals("b", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("d", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("f", termAtt.toString());
|
||||
assertFalse(graph.incrementGraph());
|
||||
assertEquals(6, graph.cachedTokenCount());
|
||||
|
||||
assertTrue(graph.incrementBaseToken());
|
||||
assertEquals("c", termAtt.toString());
|
||||
assertEquals(0, posIncAtt.getPositionIncrement());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("d", termAtt.toString());
|
||||
assertFalse(graph.incrementGraph());
|
||||
assertEquals(6, graph.cachedTokenCount());
|
||||
|
||||
assertTrue(graph.incrementBaseToken());
|
||||
assertEquals("d", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("e", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("g", termAtt.toString());
|
||||
assertTrue(graph.incrementGraph());
|
||||
assertEquals("d", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("e", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("h", termAtt.toString());
|
||||
assertTrue(graph.incrementGraph());
|
||||
assertEquals("d", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("f", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("j", termAtt.toString());
|
||||
assertFalse(graph.incrementGraph());
|
||||
assertEquals(8, graph.cachedTokenCount());
|
||||
|
||||
//tok.setReader(new StringReader("a b/c d e/f:3 g/h i j k"));
|
||||
|
||||
assertTrue(graph.incrementBaseToken());
|
||||
assertEquals("e", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("g", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("i", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("j", termAtt.toString());
|
||||
assertTrue(graph.incrementGraph());
|
||||
assertEquals("e", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("h", termAtt.toString());
|
||||
assertFalse(graph.incrementGraph());
|
||||
assertEquals(8, graph.cachedTokenCount());
|
||||
|
||||
assertTrue(graph.incrementBaseToken());
|
||||
assertEquals("f", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("j", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("k", termAtt.toString());
|
||||
assertFalse(graph.incrementGraphToken());
|
||||
assertFalse(graph.incrementGraph());
|
||||
assertEquals(8, graph.cachedTokenCount());
|
||||
|
||||
assertTrue(graph.incrementBaseToken());
|
||||
assertEquals("g", termAtt.toString());
|
||||
assertTrue(graph.incrementGraphToken());
|
||||
assertEquals("i", termAtt.toString());
|
||||
assertFalse(graph.incrementGraph());
|
||||
assertEquals(8, graph.cachedTokenCount());
|
||||
|
||||
assertTrue(graph.incrementBaseToken());
|
||||
assertEquals("h", termAtt.toString());
|
||||
assertFalse(graph.incrementGraph());
|
||||
assertEquals(8, graph.cachedTokenCount());
|
||||
|
||||
assertTrue(graph.incrementBaseToken());
|
||||
assertTrue(graph.incrementBaseToken());
|
||||
assertTrue(graph.incrementBaseToken());
|
||||
assertEquals("k", termAtt.toString());
|
||||
assertFalse(graph.incrementGraphToken());
|
||||
assertEquals(0, graph.getTrailingPositions());
|
||||
assertFalse(graph.incrementGraph());
|
||||
assertFalse(graph.incrementBaseToken());
|
||||
assertEquals(8, graph.cachedTokenCount());
|
||||
|
||||
}
|
||||
|
||||
public void testTrailingPositions() throws IOException {
|
||||
|
||||
// a/b:2 c _
|
||||
CannedTokenStream cts = new CannedTokenStream(1, 5,
|
||||
new Token("a", 0, 1),
|
||||
new Token("b", 0, 0, 1, 2),
|
||||
new Token("c", 1, 2, 3)
|
||||
);
|
||||
|
||||
GraphTokenFilter gts = new TestFilter(cts);
|
||||
assertFalse(gts.incrementGraph());
|
||||
assertTrue(gts.incrementBaseToken());
|
||||
assertTrue(gts.incrementGraphToken());
|
||||
assertFalse(gts.incrementGraphToken());
|
||||
assertEquals(1, gts.getTrailingPositions());
|
||||
assertFalse(gts.incrementGraph());
|
||||
assertTrue(gts.incrementBaseToken());
|
||||
assertFalse(gts.incrementGraphToken());
|
||||
assertEquals(1, gts.getTrailingPositions());
|
||||
assertFalse(gts.incrementGraph());
|
||||
}
|
||||
|
||||
public void testMaximumGraphCacheSize() throws IOException {
|
||||
|
||||
Token[] tokens = new Token[GraphTokenFilter.MAX_TOKEN_CACHE_SIZE + 5];
|
||||
for (int i = 0; i < GraphTokenFilter.MAX_TOKEN_CACHE_SIZE + 5; i++) {
|
||||
tokens[i] = new Token("a", 1, i * 2, i * 2 + 1);
|
||||
}
|
||||
|
||||
GraphTokenFilter gts = new TestFilter(new CannedTokenStream(tokens));
|
||||
Exception e = expectThrows(IllegalStateException.class, () -> {
|
||||
gts.reset();
|
||||
gts.incrementBaseToken();
|
||||
while (true) {
|
||||
gts.incrementGraphToken();
|
||||
}
|
||||
});
|
||||
assertEquals("Too many cached tokens (> 100)", e.getMessage());
|
||||
|
||||
gts.reset();
|
||||
// after reset, the cache should be cleared and so we can read ahead once more
|
||||
gts.incrementBaseToken();
|
||||
gts.incrementGraphToken();
|
||||
|
||||
}
|
||||
|
||||
public void testGraphPathCountLimits() {
|
||||
|
||||
Token[] tokens = new Token[50];
|
||||
tokens[0] = new Token("term", 1, 0, 1);
|
||||
tokens[1] = new Token("term1", 1, 2, 3);
|
||||
for (int i = 2; i < 50; i++) {
|
||||
tokens[i] = new Token("term" + i, i % 2, 2, 3);
|
||||
}
|
||||
|
||||
Exception e = expectThrows(IllegalStateException.class, () -> {
|
||||
GraphTokenFilter graph = new TestFilter(new CannedTokenStream(tokens));
|
||||
graph.reset();
|
||||
graph.incrementBaseToken();
|
||||
for (int i = 0; i < 10; i++) {
|
||||
graph.incrementGraphToken();
|
||||
}
|
||||
while (graph.incrementGraph()) {
|
||||
for (int i = 0; i < 10; i++) {
|
||||
graph.incrementGraphToken();
|
||||
}
|
||||
}
|
||||
});
|
||||
assertEquals("Too many graph paths (> 1000)", e.getMessage());
|
||||
}
|
||||
|
||||
}
|
|
@ -56,7 +56,7 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
|
|||
// you cannot turn on MockCharFilter when random
|
||||
// testing...
|
||||
|
||||
private static class GraphTokenizer extends Tokenizer {
|
||||
public static final class GraphTokenizer extends Tokenizer {
|
||||
private List<Token> tokens;
|
||||
private int upto;
|
||||
private int inputLength;
|
||||
|
|
|
@ -82,6 +82,13 @@ public class Token extends PackedTokenAttributeImpl implements FlagsAttribute, P
|
|||
setPositionIncrement(posInc);
|
||||
}
|
||||
|
||||
/**
 * Constructs a Token with the given term text, position increment, offsets
 * and position length.
 *
 * @param text the token's term text
 * @param posInc the position increment relative to the previous token
 * @param start the start offset
 * @param end the end offset
 * @param posLength the number of positions this token spans
 */
public Token(CharSequence text, int posInc, int start, int end, int posLength) {
  append(text);
  setOffset(start, end);
  setPositionIncrement(posInc);
  setPositionLength(posLength);
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
* @see FlagsAttribute
|
||||
|
|
Loading…
Reference in New Issue