mirror of
https://github.com/apache/lucene.git
synced 2025-02-09 11:35:14 +00:00
LUCENE-8202: Add FixedShingleFilter
This commit is contained in:
parent
d4e69c5cd8
commit
fac84c01c8
@ -106,6 +106,9 @@ New Features
|
|||||||
* LUCENE-8197: A new FeatureField makes it easy and efficient to integrate
|
* LUCENE-8197: A new FeatureField makes it easy and efficient to integrate
|
||||||
static relevance signals into the final score. (Adrien Grand, Robert Muir)
|
static relevance signals into the final score. (Adrien Grand, Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-8202: Add a FixedShingleFilter (Alan Woodward, Adrien Grand, Jim
|
||||||
|
Ferenczi)
|
||||||
|
|
||||||
Other
|
Other
|
||||||
|
|
||||||
* LUCENE-8214: Improve selection of testPoint for GeoComplexPolygon.
|
* LUCENE-8214: Improve selection of testPoint for GeoComplexPolygon.
|
||||||
|
@ -0,0 +1,294 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis.shingle;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayDeque;
|
||||||
|
import java.util.Deque;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||||
|
import org.apache.lucene.util.AttributeSource;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A FixedShingleFilter constructs shingles (token n-grams) from a token stream.
|
||||||
|
* In other words, it creates combinations of tokens as a single token.
|
||||||
|
*
|
||||||
|
* Unlike the {@link ShingleFilter}, FixedShingleFilter only emits shingles of a
|
||||||
|
* fixed size, and never emits unigrams, even at the end of a TokenStream. In
|
||||||
|
* addition, if the filter encounters stacked tokens (eg synonyms), then it will
|
||||||
|
* output stacked shingles
|
||||||
|
*
|
||||||
|
* For example, the sentence "please divide this sentence into shingles"
|
||||||
|
* might be tokenized into shingles "please divide", "divide this",
|
||||||
|
* "this sentence", "sentence into", and "into shingles".
|
||||||
|
*
|
||||||
|
* This filter handles position increments > 1 by inserting filler tokens
|
||||||
|
* (tokens with termtext "_").
|
||||||
|
*
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
public final class FixedShingleFilter extends TokenFilter {
|
||||||
|
|
||||||
|
private final Deque<Token> tokenPool = new ArrayDeque<>();
|
||||||
|
|
||||||
|
private final int shingleSize;
|
||||||
|
private final String tokenSeparator;
|
||||||
|
private final Token gapToken = new Token(new AttributeSource());
|
||||||
|
private final Token endToken = new Token(new AttributeSource());
|
||||||
|
|
||||||
|
private final PositionIncrementAttribute incAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
|
||||||
|
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||||
|
|
||||||
|
private Token[] currentShingleTokens;
|
||||||
|
private boolean inputStreamExhausted = false;
|
||||||
|
|
||||||
|
public FixedShingleFilter(TokenStream input, int shingleSize) {
|
||||||
|
this(input, shingleSize, " ", "_");
|
||||||
|
}
|
||||||
|
|
||||||
|
public FixedShingleFilter(TokenStream input, int shingleSize, String tokenSeparator, String fillerToken) {
|
||||||
|
super(input);
|
||||||
|
this.shingleSize = shingleSize;
|
||||||
|
this.tokenSeparator = tokenSeparator;
|
||||||
|
|
||||||
|
this.gapToken.termAtt.setEmpty().append(fillerToken);
|
||||||
|
|
||||||
|
this.currentShingleTokens = new Token[shingleSize];
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
int posInc = 0;
|
||||||
|
if (nextShingle() == false) {
|
||||||
|
Token nextRoot = nextTokenInStream(currentShingleTokens[0]);
|
||||||
|
if (nextRoot == endToken)
|
||||||
|
return false;
|
||||||
|
recycleToken(currentShingleTokens[0]);
|
||||||
|
if (resetShingleRoot(nextRoot) == false) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
posInc = currentShingleTokens[0].posInc();
|
||||||
|
}
|
||||||
|
clearAttributes();
|
||||||
|
incAtt.setPositionIncrement(posInc);
|
||||||
|
offsetAtt.setOffset(currentShingleTokens[0].startOffset(), lastTokenInShingle().endOffset());
|
||||||
|
termAtt.setEmpty();
|
||||||
|
termAtt.append(currentShingleTokens[0].term());
|
||||||
|
typeAtt.setType("shingle");
|
||||||
|
posLenAtt.setPositionLength(shingleSize);
|
||||||
|
for (int i = 1; i < shingleSize; i++) {
|
||||||
|
termAtt.append(tokenSeparator).append(currentShingleTokens[i].term());
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void reset() throws IOException {
|
||||||
|
super.reset();
|
||||||
|
this.tokenPool.clear();
|
||||||
|
this.currentShingleTokens[0] = null;
|
||||||
|
this.inputStreamExhausted = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void end() throws IOException {
|
||||||
|
if (inputStreamExhausted == false) {
|
||||||
|
finishInnerStream();
|
||||||
|
}
|
||||||
|
clearAttributes();
|
||||||
|
this.offsetAtt.setOffset(0, endToken.endOffset());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void finishInnerStream() throws IOException {
|
||||||
|
input.end();
|
||||||
|
inputStreamExhausted = true;
|
||||||
|
// check for gaps at the end of the tokenstream
|
||||||
|
endToken.posIncAtt.setPositionIncrement(this.incAtt.getPositionIncrement());
|
||||||
|
OffsetAttribute inputOffsets = input.getAttribute(OffsetAttribute.class);
|
||||||
|
endToken.offsetAtt.setOffset(inputOffsets.startOffset(), inputOffsets.endOffset());
|
||||||
|
}
|
||||||
|
|
||||||
|
private Token lastTokenInShingle() {
|
||||||
|
int lastTokenIndex = shingleSize - 1;
|
||||||
|
while (currentShingleTokens[lastTokenIndex] == gapToken) {
|
||||||
|
lastTokenIndex--;
|
||||||
|
}
|
||||||
|
return currentShingleTokens[lastTokenIndex];
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean resetShingleRoot(Token token) throws IOException {
|
||||||
|
this.currentShingleTokens[0] = token;
|
||||||
|
for (int i = 1; i < shingleSize; i++) {
|
||||||
|
Token current = nextTokenInGraph(this.currentShingleTokens[i - 1]);
|
||||||
|
if (current == endToken) {
|
||||||
|
if (endToken.posInc() + i >= shingleSize) {
|
||||||
|
// end tokens are a special case, because their posIncs are always
|
||||||
|
// due to stopwords. Therefore, we can happily append gap tokens
|
||||||
|
// to the end of the current shingle
|
||||||
|
for (int j = i; j < shingleSize; j++) {
|
||||||
|
this.currentShingleTokens[i] = gapToken;
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (current.posInc() > 1) {
|
||||||
|
// insert gaps into the shingle list
|
||||||
|
for (int j = 1; j < current.posInc(); j++) {
|
||||||
|
this.currentShingleTokens[i] = gapToken;
|
||||||
|
i++;
|
||||||
|
if (i >= shingleSize)
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
this.currentShingleTokens[i] = current;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean nextShingle() throws IOException {
|
||||||
|
return currentShingleTokens[0] != null && advanceStack();
|
||||||
|
}
|
||||||
|
|
||||||
|
// check if the next token in the tokenstream is at the same position as this one
|
||||||
|
private boolean lastInStack(Token token) throws IOException {
|
||||||
|
Token next = nextTokenInStream(token);
|
||||||
|
return next == endToken || next.posInc() != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean advanceStack() throws IOException {
|
||||||
|
for (int i = shingleSize - 1; i >= 1; i--) {
|
||||||
|
if (currentShingleTokens[i] != gapToken && lastInStack(currentShingleTokens[i]) == false) {
|
||||||
|
currentShingleTokens[i] = nextTokenInStream(currentShingleTokens[i]);
|
||||||
|
for (int j = i + 1; j < shingleSize; j++) {
|
||||||
|
currentShingleTokens[j] = nextTokenInGraph(currentShingleTokens[j - 1]);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Token newToken() {
|
||||||
|
Token token = tokenPool.size() == 0 ? new Token(this.cloneAttributes()) : tokenPool.removeFirst();
|
||||||
|
token.reset(this);
|
||||||
|
return token;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void recycleToken(Token token) {
|
||||||
|
if (token == null)
|
||||||
|
return;
|
||||||
|
token.nextToken = null;
|
||||||
|
tokenPool.add(token);
|
||||||
|
}
|
||||||
|
|
||||||
|
// for testing
|
||||||
|
int instantiatedTokenCount() {
|
||||||
|
int tokenCount = tokenPool.size() + 1;
|
||||||
|
if (currentShingleTokens[0] == endToken || currentShingleTokens[0] == null)
|
||||||
|
return tokenCount;
|
||||||
|
for (Token t = currentShingleTokens[0]; t != endToken && t != null; t = t.nextToken) {
|
||||||
|
tokenCount++;
|
||||||
|
}
|
||||||
|
return tokenCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Token nextTokenInGraph(Token token) throws IOException {
|
||||||
|
do {
|
||||||
|
token = nextTokenInStream(token);
|
||||||
|
if (token == endToken) {
|
||||||
|
return endToken;
|
||||||
|
}
|
||||||
|
} while (token.posInc() == 0);
|
||||||
|
return token;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Token nextTokenInStream(Token token) throws IOException {
|
||||||
|
if (token != null && token.nextToken != null) {
|
||||||
|
return token.nextToken;
|
||||||
|
}
|
||||||
|
if (input.incrementToken() == false) {
|
||||||
|
finishInnerStream();
|
||||||
|
if (token == null) {
|
||||||
|
return endToken;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
token.nextToken = endToken;
|
||||||
|
return endToken;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (token == null) {
|
||||||
|
return newToken();
|
||||||
|
}
|
||||||
|
token.nextToken = newToken();
|
||||||
|
return token.nextToken;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class Token {
|
||||||
|
final AttributeSource attSource;
|
||||||
|
final PositionIncrementAttribute posIncAtt;
|
||||||
|
final CharTermAttribute termAtt;
|
||||||
|
final OffsetAttribute offsetAtt;
|
||||||
|
|
||||||
|
Token nextToken;
|
||||||
|
|
||||||
|
Token(AttributeSource attSource) {
|
||||||
|
this.attSource = attSource;
|
||||||
|
this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class);
|
||||||
|
this.termAtt = attSource.addAttribute(CharTermAttribute.class);
|
||||||
|
this.offsetAtt = attSource.addAttribute(OffsetAttribute.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
int posInc() {
|
||||||
|
return this.posIncAtt.getPositionIncrement();
|
||||||
|
}
|
||||||
|
|
||||||
|
CharSequence term() {
|
||||||
|
return this.termAtt;
|
||||||
|
}
|
||||||
|
|
||||||
|
int startOffset() {
|
||||||
|
return this.offsetAtt.startOffset();
|
||||||
|
}
|
||||||
|
|
||||||
|
int endOffset() {
|
||||||
|
return this.offsetAtt.endOffset();
|
||||||
|
}
|
||||||
|
|
||||||
|
void reset(AttributeSource attSource) {
|
||||||
|
attSource.copyTo(this.attSource);
|
||||||
|
this.nextToken = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return term() + "(" + startOffset() + "," + endOffset() + ") " + posInc();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,52 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis.shingle;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link FixedShingleFilter}
|
||||||
|
*
|
||||||
|
* Parameters are:
|
||||||
|
* <ul>
|
||||||
|
* <li>shingleSize - how many tokens should be combined into each shingle (default: 2)
|
||||||
|
* <li>tokenSeparator - how tokens should be joined together in the shingle (default: space)
|
||||||
|
* <li>fillerToken - what should be added in place of stop words (default: _ )
|
||||||
|
* </ul>
|
||||||
|
*/
|
||||||
|
public class FixedShingleFilterFactory extends TokenFilterFactory {
|
||||||
|
|
||||||
|
private final int shingleSize;
|
||||||
|
private final String tokenSeparator;
|
||||||
|
private final String fillerToken;
|
||||||
|
|
||||||
|
public FixedShingleFilterFactory(Map<String, String> args) {
|
||||||
|
super(args);
|
||||||
|
this.shingleSize = getInt(args, "shingleSize", 2);
|
||||||
|
this.tokenSeparator = get(args, "tokenSeparator", " ");
|
||||||
|
this.fillerToken = get(args, "fillerToken", "_");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new FixedShingleFilter(input, shingleSize, tokenSeparator, fillerToken);
|
||||||
|
}
|
||||||
|
}
|
@ -101,6 +101,7 @@ org.apache.lucene.analysis.pt.PortugueseStemFilterFactory
|
|||||||
org.apache.lucene.analysis.reverse.ReverseStringFilterFactory
|
org.apache.lucene.analysis.reverse.ReverseStringFilterFactory
|
||||||
org.apache.lucene.analysis.ru.RussianLightStemFilterFactory
|
org.apache.lucene.analysis.ru.RussianLightStemFilterFactory
|
||||||
org.apache.lucene.analysis.shingle.ShingleFilterFactory
|
org.apache.lucene.analysis.shingle.ShingleFilterFactory
|
||||||
|
org.apache.lucene.analysis.shingle.FixedShingleFilterFactory
|
||||||
org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
|
org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
|
||||||
org.apache.lucene.analysis.sr.SerbianNormalizationFilterFactory
|
org.apache.lucene.analysis.sr.SerbianNormalizationFilterFactory
|
||||||
org.apache.lucene.analysis.standard.ClassicFilterFactory
|
org.apache.lucene.analysis.standard.ClassicFilterFactory
|
||||||
|
@ -0,0 +1,200 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis.shingle;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.CannedTokenStream;
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
|
||||||
|
public class FixedShingleFilterTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
public void testBiGramFilter() throws IOException {
|
||||||
|
|
||||||
|
TokenStream ts = new CannedTokenStream(
|
||||||
|
new Token("please", 0, 6),
|
||||||
|
new Token("divide", 7, 13),
|
||||||
|
new Token("this", 14, 18),
|
||||||
|
new Token("sentence", 19, 27),
|
||||||
|
new Token("into", 28, 32),
|
||||||
|
new Token("shingles", 33, 41)
|
||||||
|
);
|
||||||
|
|
||||||
|
assertTokenStreamContents(new FixedShingleFilter(ts, 2),
|
||||||
|
new String[]{"please divide", "divide this", "this sentence", "sentence into", "into shingles"},
|
||||||
|
new int[]{0, 7, 14, 19, 28,},
|
||||||
|
new int[]{13, 18, 27, 32, 41,},
|
||||||
|
new String[]{"shingle", "shingle", "shingle", "shingle", "shingle",},
|
||||||
|
new int[]{1, 1, 1, 1, 1,},
|
||||||
|
new int[]{2, 2, 2, 2, 2});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBiGramFilterWithAltSeparator() throws IOException {
|
||||||
|
|
||||||
|
TokenStream ts = new CannedTokenStream(
|
||||||
|
new Token("please", 0, 6),
|
||||||
|
new Token("divide", 7, 13),
|
||||||
|
new Token("this", 14, 18),
|
||||||
|
new Token("sentence", 19, 27),
|
||||||
|
new Token("into", 28, 32),
|
||||||
|
new Token("shingles", 33, 41)
|
||||||
|
);
|
||||||
|
|
||||||
|
assertTokenStreamContents(new FixedShingleFilter(ts, 2, "<SEP>", "_"),
|
||||||
|
new String[]{"please<SEP>divide", "divide<SEP>this", "this<SEP>sentence", "sentence<SEP>into", "into<SEP>shingles"},
|
||||||
|
new int[]{0, 7, 14, 19, 28},
|
||||||
|
new int[]{13, 18, 27, 32, 41},
|
||||||
|
new String[]{"shingle", "shingle", "shingle", "shingle", "shingle"},
|
||||||
|
new int[]{1, 1, 1, 1, 1});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testTriGramFilter() throws IOException {
|
||||||
|
|
||||||
|
TokenStream ts = new CannedTokenStream(
|
||||||
|
new Token("please", 0, 6),
|
||||||
|
new Token("divide", 7, 13),
|
||||||
|
new Token("this", 14, 18),
|
||||||
|
new Token("sentence", 19, 27),
|
||||||
|
new Token("into", 28, 32),
|
||||||
|
new Token("shingles", 33, 41)
|
||||||
|
);
|
||||||
|
|
||||||
|
assertTokenStreamContents(new FixedShingleFilter(ts, 3),
|
||||||
|
new String[]{"please divide this", "divide this sentence", "this sentence into", "sentence into shingles"});
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testShingleSizeGreaterThanTokenstreamLength() throws IOException {
|
||||||
|
|
||||||
|
TokenStream ts = new FixedShingleFilter(new CannedTokenStream(
|
||||||
|
new Token("please", 0, 6),
|
||||||
|
new Token("divide", 7, 13)
|
||||||
|
), 3);
|
||||||
|
|
||||||
|
ts.reset();
|
||||||
|
assertFalse(ts.incrementToken());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testWithStopwords() throws IOException {
|
||||||
|
|
||||||
|
TokenStream ts = new CannedTokenStream(
|
||||||
|
new Token("please", 0, 6),
|
||||||
|
new Token("divide", 7, 13),
|
||||||
|
new Token("sentence", 2, 19, 27),
|
||||||
|
new Token("shingles", 2, 33, 41)
|
||||||
|
);
|
||||||
|
|
||||||
|
assertTokenStreamContents(new FixedShingleFilter(ts, 3),
|
||||||
|
new String[]{"please divide _", "divide _ sentence", "sentence _ shingles"},
|
||||||
|
new int[]{0, 7, 19,},
|
||||||
|
new int[]{13, 27, 41,},
|
||||||
|
new String[]{"shingle", "shingle", "shingle",},
|
||||||
|
new int[]{1, 1, 2,});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testConsecutiveStopwords() throws IOException {
|
||||||
|
|
||||||
|
TokenStream ts = new CannedTokenStream(
|
||||||
|
new Token("b", 2, 2, 3),
|
||||||
|
new Token("c", 4, 5),
|
||||||
|
new Token("d", 6, 7),
|
||||||
|
new Token("b", 3, 12, 13),
|
||||||
|
new Token("c", 14, 15)
|
||||||
|
);
|
||||||
|
|
||||||
|
assertTokenStreamContents(new FixedShingleFilter(ts, 4),
|
||||||
|
new String[]{"b c d _", "c d _ _", "d _ _ b"},
|
||||||
|
new int[]{2, 4, 6,},
|
||||||
|
new int[]{7, 7, 13,},
|
||||||
|
new int[]{2, 1, 1,});
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testTrailingStopwords() throws IOException {
|
||||||
|
|
||||||
|
TokenStream ts = new CannedTokenStream(1, 7,
|
||||||
|
new Token("b", 0, 1),
|
||||||
|
new Token("c", 2, 3),
|
||||||
|
new Token("d", 4, 5)
|
||||||
|
);
|
||||||
|
|
||||||
|
assertTokenStreamContents(new FixedShingleFilter(ts, 3),
|
||||||
|
new String[] { "b c d", "c d _" },
|
||||||
|
new int[] { 0, 2, },
|
||||||
|
new int[] { 5, 5, },
|
||||||
|
new int[] { 1, 1, });
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testMultipleTrailingStopwords() throws IOException {
|
||||||
|
|
||||||
|
TokenStream ts = new CannedTokenStream(2, 9,
|
||||||
|
new Token("b", 0, 1),
|
||||||
|
new Token("c", 2, 3),
|
||||||
|
new Token("d", 4, 5)
|
||||||
|
);
|
||||||
|
|
||||||
|
assertTokenStreamContents(new FixedShingleFilter(ts, 3),
|
||||||
|
new String[] { "b c d", "c d _", "d _ _" },
|
||||||
|
new int[] { 0, 2, 4 },
|
||||||
|
new int[] { 5, 5, 5 },
|
||||||
|
new int[] { 1, 1, 1 });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testIncomingGraphs() throws IOException {
|
||||||
|
|
||||||
|
TokenStream ts = new CannedTokenStream(
|
||||||
|
new Token("b", 0, 1),
|
||||||
|
new Token("a", 0, 0, 1),
|
||||||
|
new Token("c", 2, 3),
|
||||||
|
new Token("b", 4, 5),
|
||||||
|
new Token("a", 0, 4, 5),
|
||||||
|
new Token("d", 6, 7)
|
||||||
|
);
|
||||||
|
|
||||||
|
assertTokenStreamContents(new FixedShingleFilter(ts, 2),
|
||||||
|
new String[] { "b c", "a c", "c b", "c a", "b d", "a d" },
|
||||||
|
new int[] { 0, 0, 2, 2, 4, 4 },
|
||||||
|
new int[] { 3, 3, 5, 5, 7, 7 },
|
||||||
|
new int[] { 1, 0, 1, 0, 1, 0 });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testShinglesSpanningGraphs() throws IOException {
|
||||||
|
|
||||||
|
TokenStream ts = new CannedTokenStream(
|
||||||
|
new Token("b", 0, 1),
|
||||||
|
new Token("a", 0, 0, 1),
|
||||||
|
new Token("c", 2, 3),
|
||||||
|
new Token("b", 4, 5),
|
||||||
|
new Token("a", 0, 4, 5),
|
||||||
|
new Token("d", 6, 7)
|
||||||
|
);
|
||||||
|
|
||||||
|
assertTokenStreamContents(new FixedShingleFilter(ts, 3),
|
||||||
|
new String[] { "b c b", "b c a", "a c b", "a c a", "c b d", "c a d" },
|
||||||
|
new int[] { 0, 0, 0, 0, 2, 2, },
|
||||||
|
new int[] { 5, 5, 5, 5, 7, 7, },
|
||||||
|
new int[] { 1, 0, 0, 0, 1, 0, });
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -341,6 +341,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||||||
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
|
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int[] posLengths) throws IOException {
|
||||||
|
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, null);
|
||||||
|
}
|
||||||
|
|
||||||
public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
|
public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
|
||||||
assertTokenStreamContents(ts, output, null, null, null, null, null, null);
|
assertTokenStreamContents(ts, output, null, null, null, null, null, null);
|
||||||
}
|
}
|
||||||
|
@ -75,6 +75,13 @@ public class Token extends PackedTokenAttributeImpl implements FlagsAttribute, P
|
|||||||
setOffset(start, end);
|
setOffset(start, end);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Constructs a Token with the given term text, position increment, start and end offsets.
 *
 * @param text term text
 * @param posInc position increment
 * @param start start offset
 * @param end end offset
 */
public Token(CharSequence text, int posInc, int start, int end) {
  append(text);
  setOffset(start, end);
  setPositionIncrement(posInc);
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* {@inheritDoc}
|
* {@inheritDoc}
|
||||||
* @see FlagsAttribute
|
* @see FlagsAttribute
|
||||||
|
Loading…
x
Reference in New Issue
Block a user