LUCENE-8202: Add FixedShingleFilter

This commit is contained in:
Alan Woodward 2018-03-21 10:35:28 +00:00
parent d4e69c5cd8
commit fac84c01c8
7 changed files with 561 additions and 0 deletions

View File

@ -106,6 +106,9 @@ New Features
* LUCENE-8197: A new FeatureField makes it easy and efficient to integrate
static relevance signals into the final score. (Adrien Grand, Robert Muir)
* LUCENE-8202: Add a FixedShingleFilter (Alan Woodward, Adrien Grand, Jim
Ferenczi)
Other
* LUCENE-8214: Improve selection of testPoint for GeoComplexPolygon.

View File

@ -0,0 +1,294 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.shingle;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Deque;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
 * A FixedShingleFilter constructs shingles (token n-grams) from a token stream.
 * In other words, it creates combinations of tokens as a single token.
 *
 * Unlike the {@link ShingleFilter}, FixedShingleFilter only emits shingles of a
 * fixed size, and never emits unigrams, even at the end of a TokenStream. In
 * addition, if the filter encounters stacked tokens (eg synonyms), then it will
 * output stacked shingles.
 *
 * For example, the sentence "please divide this sentence into shingles"
 * might be tokenized into shingles "please divide", "divide this",
 * "this sentence", "sentence into", and "into shingles".
 *
 * This filter handles position increments > 1 by inserting filler tokens
 * (tokens with termtext "_").
 *
 * @lucene.experimental
 */
public final class FixedShingleFilter extends TokenFilter {

  // Pool of recycled Token instances, so that consuming the stream does not
  // allocate a new Token per input token (see newToken()/recycleToken()).
  private final Deque<Token> tokenPool = new ArrayDeque<>();

  // Number of input tokens combined into each emitted shingle.
  private final int shingleSize;
  // Text inserted between the terms of a shingle (default " ").
  private final String tokenSeparator;

  // Sentinel token standing in for a stopword gap (posInc > 1); its term text
  // is the filler token supplied at construction.
  private final Token gapToken = new Token(new AttributeSource());
  // Sentinel token marking exhaustion of the wrapped stream; also records the
  // stream's trailing position increment and final offsets (finishInnerStream()).
  private final Token endToken = new Token(new AttributeSource());

  private final PositionIncrementAttribute incAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  // The shingleSize tokens making up the shingle currently being emitted;
  // currentShingleTokens[0] is the shingle root. null root => not yet started.
  private Token[] currentShingleTokens;
  private boolean inputStreamExhausted = false;

  /**
   * Creates a FixedShingleFilter with a space separator and "_" filler token.
   *
   * @param input       the underlying TokenStream
   * @param shingleSize the number of tokens per shingle
   */
  public FixedShingleFilter(TokenStream input, int shingleSize) {
    this(input, shingleSize, " ", "_");
  }

  /**
   * Creates a FixedShingleFilter.
   *
   * @param input          the underlying TokenStream
   * @param shingleSize    the number of tokens per shingle
   * @param tokenSeparator text joining the tokens of a shingle
   * @param fillerToken    term text emitted in place of stopword gaps
   */
  public FixedShingleFilter(TokenStream input, int shingleSize, String tokenSeparator, String fillerToken) {
    super(input);
    this.shingleSize = shingleSize;
    this.tokenSeparator = tokenSeparator;
    this.gapToken.termAtt.setEmpty().append(fillerToken);
    this.currentShingleTokens = new Token[shingleSize];
  }

  @Override
  public boolean incrementToken() throws IOException {
    int posInc = 0;
    // First try to emit another stacked shingle at the current position;
    // if none remain, advance the shingle root to the next stream position.
    if (nextShingle() == false) {
      Token nextRoot = nextTokenInStream(currentShingleTokens[0]);
      if (nextRoot == endToken)
        return false;
      recycleToken(currentShingleTokens[0]);
      if (resetShingleRoot(nextRoot) == false) {
        return false;
      }
      posInc = currentShingleTokens[0].posInc();
    }
    // Build the output token: concatenated terms, offsets spanning root to the
    // last real (non-gap) token, posLen covering the whole shingle.
    clearAttributes();
    incAtt.setPositionIncrement(posInc);
    offsetAtt.setOffset(currentShingleTokens[0].startOffset(), lastTokenInShingle().endOffset());
    termAtt.setEmpty();
    termAtt.append(currentShingleTokens[0].term());
    typeAtt.setType("shingle");
    posLenAtt.setPositionLength(shingleSize);
    for (int i = 1; i < shingleSize; i++) {
      termAtt.append(tokenSeparator).append(currentShingleTokens[i].term());
    }
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    this.tokenPool.clear();
    this.currentShingleTokens[0] = null;
    this.inputStreamExhausted = false;
  }

  @Override
  public void end() throws IOException {
    // If incrementToken() never reached the end of the input (e.g. the stream
    // was shorter than shingleSize), the inner stream still needs ending.
    if (inputStreamExhausted == false) {
      finishInnerStream();
    }
    clearAttributes();
    this.offsetAtt.setOffset(0, endToken.endOffset());
  }

  /**
   * Ends the wrapped stream and captures its trailing state (final position
   * increment and offsets) on the endToken sentinel.
   */
  private void finishInnerStream() throws IOException {
    input.end();
    inputStreamExhausted = true;
    // check for gaps at the end of the tokenstream
    endToken.posIncAtt.setPositionIncrement(this.incAtt.getPositionIncrement());
    OffsetAttribute inputOffsets = input.getAttribute(OffsetAttribute.class);
    endToken.offsetAtt.setOffset(inputOffsets.startOffset(), inputOffsets.endOffset());
  }

  /**
   * Returns the last non-gap token of the current shingle, so that the output
   * end offset never comes from a filler token (which has no real offsets).
   */
  private Token lastTokenInShingle() {
    int lastTokenIndex = shingleSize - 1;
    while (currentShingleTokens[lastTokenIndex] == gapToken) {
      lastTokenIndex--;
    }
    return currentShingleTokens[lastTokenIndex];
  }

  /**
   * Rebuilds currentShingleTokens starting from the given root token, pulling
   * shingleSize - 1 follow-on tokens from the graph and inserting gapToken for
   * position increments greater than 1.
   *
   * @return false if the stream ends before a full shingle can be built and
   *         the trailing gap is too small to pad it out with filler tokens
   */
  private boolean resetShingleRoot(Token token) throws IOException {
    this.currentShingleTokens[0] = token;
    for (int i = 1; i < shingleSize; i++) {
      Token current = nextTokenInGraph(this.currentShingleTokens[i - 1]);
      if (current == endToken) {
        if (endToken.posInc() + i >= shingleSize) {
          // end tokens are a special case, because their posIncs are always
          // due to stopwords. Therefore, we can happily append gap tokens
          // to the end of the current shingle
          // NOTE: i and j advance in lockstep here; j only bounds the loop
          // while i remains the write index into currentShingleTokens.
          for (int j = i; j < shingleSize; j++) {
            this.currentShingleTokens[i] = gapToken;
            i++;
          }
          return true;
        }
        return false;
      }
      if (current.posInc() > 1) {
        // insert gaps into the shingle list
        for (int j = 1; j < current.posInc(); j++) {
          this.currentShingleTokens[i] = gapToken;
          i++;
          if (i >= shingleSize)
            return true;
        }
      }
      this.currentShingleTokens[i] = current;
    }
    return true;
  }

  /**
   * Advances to the next stacked shingle at the current root position,
   * returning false if the shingle has not started yet or no stack remains.
   */
  private boolean nextShingle() throws IOException {
    return currentShingleTokens[0] != null && advanceStack();
  }

  // check if the next token in the tokenstream is at the same position as this one
  private boolean lastInStack(Token token) throws IOException {
    Token next = nextTokenInStream(token);
    return next == endToken || next.posInc() != 0;
  }

  /**
   * Moves the right-most advanceable (non-gap, still-stacked) slot of the
   * shingle to its next stacked token, then re-derives every slot after it.
   *
   * @return true if another stacked shingle was produced
   */
  private boolean advanceStack() throws IOException {
    for (int i = shingleSize - 1; i >= 1; i--) {
      if (currentShingleTokens[i] != gapToken && lastInStack(currentShingleTokens[i]) == false) {
        currentShingleTokens[i] = nextTokenInStream(currentShingleTokens[i]);
        for (int j = i + 1; j < shingleSize; j++) {
          currentShingleTokens[j] = nextTokenInGraph(currentShingleTokens[j - 1]);
        }
        return true;
      }
    }
    return false;
  }

  /**
   * Obtains a Token holding the current input attributes, reusing a pooled
   * instance when available.
   */
  private Token newToken() {
    Token token = tokenPool.size() == 0 ? new Token(this.cloneAttributes()) : tokenPool.removeFirst();
    token.reset(this);
    return token;
  }

  /** Returns a token no longer referenced by the shingle window to the pool. */
  private void recycleToken(Token token) {
    if (token == null)
      return;
    token.nextToken = null;
    tokenPool.add(token);
  }

  // for testing
  int instantiatedTokenCount() {
    int tokenCount = tokenPool.size() + 1;
    if (currentShingleTokens[0] == endToken || currentShingleTokens[0] == null)
      return tokenCount;
    for (Token t = currentShingleTokens[0]; t != endToken && t != null; t = t.nextToken) {
      tokenCount++;
    }
    return tokenCount;
  }

  /**
   * Returns the next token at a strictly later position than the given one,
   * skipping over stacked tokens (posInc == 0).
   */
  private Token nextTokenInGraph(Token token) throws IOException {
    do {
      token = nextTokenInStream(token);
      if (token == endToken) {
        return endToken;
      }
    } while (token.posInc() == 0);
    return token;
  }

  /**
   * Returns the token following the given one in the stream, reading from the
   * cached linked list first and only pulling from the input when needed.
   * Passing null returns the first token of the stream.
   */
  private Token nextTokenInStream(Token token) throws IOException {
    if (token != null && token.nextToken != null) {
      return token.nextToken;
    }
    if (input.incrementToken() == false) {
      finishInnerStream();
      if (token == null) {
        return endToken;
      }
      else {
        token.nextToken = endToken;
        return endToken;
      }
    }
    if (token == null) {
      return newToken();
    }
    token.nextToken = newToken();
    return token.nextToken;
  }

  /**
   * A captured input token: a private AttributeSource snapshot plus a link to
   * the token that follows it in the stream (filled in lazily).
   */
  private static class Token {
    final AttributeSource attSource;
    final PositionIncrementAttribute posIncAtt;
    final CharTermAttribute termAtt;
    final OffsetAttribute offsetAtt;
    // Next token in stream order; null if not yet read, endToken at EOS.
    Token nextToken;

    Token(AttributeSource attSource) {
      this.attSource = attSource;
      this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class);
      this.termAtt = attSource.addAttribute(CharTermAttribute.class);
      this.offsetAtt = attSource.addAttribute(OffsetAttribute.class);
    }

    int posInc() {
      return this.posIncAtt.getPositionIncrement();
    }

    CharSequence term() {
      return this.termAtt;
    }

    int startOffset() {
      return this.offsetAtt.startOffset();
    }

    int endOffset() {
      return this.offsetAtt.endOffset();
    }

    /** Re-populates this token from the given source and clears the link. */
    void reset(AttributeSource attSource) {
      attSource.copyTo(this.attSource);
      this.nextToken = null;
    }

    @Override
    public String toString() {
      return term() + "(" + startOffset() + "," + endOffset() + ") " + posInc();
    }
  }
}

View File

@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.shingle;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
 * Factory for {@link FixedShingleFilter}
 *
 * Parameters are:
 * <ul>
 * <li>shingleSize - how many tokens should be combined into each shingle (default: 2)
 * <li>tokenSeparator - how tokens should be joined together in the shingle (default: space)
 * <li>fillerToken - what should be added in place of stop words (default: _ )
 * </ul>
 */
public class FixedShingleFilterFactory extends TokenFilterFactory {

  private final int shingleSize;
  private final String tokenSeparator;
  private final String fillerToken;

  /**
   * Creates a new FixedShingleFilterFactory.
   *
   * @param args factory configuration; recognised keys are consumed
   * @throws IllegalArgumentException if unrecognised parameters remain
   */
  public FixedShingleFilterFactory(Map<String, String> args) {
    super(args);
    this.shingleSize = getInt(args, "shingleSize", 2);
    this.tokenSeparator = get(args, "tokenSeparator", " ");
    this.fillerToken = get(args, "fillerToken", "_");
    // Standard TokenFilterFactory convention: fail fast on misspelled or
    // unsupported parameters instead of silently ignoring them.
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new FixedShingleFilter(input, shingleSize, tokenSeparator, fillerToken);
  }
}

View File

@ -101,6 +101,7 @@ org.apache.lucene.analysis.pt.PortugueseStemFilterFactory
org.apache.lucene.analysis.reverse.ReverseStringFilterFactory
org.apache.lucene.analysis.ru.RussianLightStemFilterFactory
org.apache.lucene.analysis.shingle.ShingleFilterFactory
org.apache.lucene.analysis.shingle.FixedShingleFilterFactory
org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
org.apache.lucene.analysis.sr.SerbianNormalizationFilterFactory
org.apache.lucene.analysis.standard.ClassicFilterFactory

View File

@ -0,0 +1,200 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.shingle;
import java.io.IOException;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
 * Tests for {@link FixedShingleFilter} covering plain n-grams, custom
 * separators, stopword gaps (filler tokens), trailing gaps, and token graphs
 * (stacked tokens such as synonyms).
 */
public class FixedShingleFilterTest extends BaseTokenStreamTestCase {

  // Basic bigram shingling: terms, offsets, types, posIncs and posLengths.
  public void testBiGramFilter() throws IOException {
    TokenStream ts = new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13),
        new Token("this", 14, 18),
        new Token("sentence", 19, 27),
        new Token("into", 28, 32),
        new Token("shingles", 33, 41)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 2),
        new String[]{"please divide", "divide this", "this sentence", "sentence into", "into shingles"},
        new int[]{0, 7, 14, 19, 28,},
        new int[]{13, 18, 27, 32, 41,},
        new String[]{"shingle", "shingle", "shingle", "shingle", "shingle",},
        new int[]{1, 1, 1, 1, 1,},
        new int[]{2, 2, 2, 2, 2});
  }

  // The tokenSeparator parameter replaces the default single space.
  public void testBiGramFilterWithAltSeparator() throws IOException {
    TokenStream ts = new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13),
        new Token("this", 14, 18),
        new Token("sentence", 19, 27),
        new Token("into", 28, 32),
        new Token("shingles", 33, 41)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 2, "<SEP>", "_"),
        new String[]{"please<SEP>divide", "divide<SEP>this", "this<SEP>sentence", "sentence<SEP>into", "into<SEP>shingles"},
        new int[]{0, 7, 14, 19, 28},
        new int[]{13, 18, 27, 32, 41},
        new String[]{"shingle", "shingle", "shingle", "shingle", "shingle"},
        new int[]{1, 1, 1, 1, 1});
  }

  public void testTriGramFilter() throws IOException {
    TokenStream ts = new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13),
        new Token("this", 14, 18),
        new Token("sentence", 19, 27),
        new Token("into", 28, 32),
        new Token("shingles", 33, 41)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
        new String[]{"please divide this", "divide this sentence", "this sentence into", "sentence into shingles"});
  }

  // Unlike ShingleFilter, no unigrams or shorter shingles are emitted when the
  // stream is shorter than the shingle size: the output is empty.
  public void testShingleSizeGreaterThanTokenstreamLength() throws IOException {
    TokenStream ts = new FixedShingleFilter(new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13)
    ), 3);
    ts.reset();
    assertFalse(ts.incrementToken());
  }

  // posInc == 2 marks a removed stopword; the filter fills the hole with "_".
  public void testWithStopwords() throws IOException {
    TokenStream ts = new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13),
        new Token("sentence", 2, 19, 27),
        new Token("shingles", 2, 33, 41)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
        new String[]{"please divide _", "divide _ sentence", "sentence _ shingles"},
        new int[]{0, 7, 19,},
        new int[]{13, 27, 41,},
        new String[]{"shingle", "shingle", "shingle",},
        new int[]{1, 1, 2,});
  }

  // A posInc of 3 yields two consecutive filler tokens inside the shingles.
  public void testConsecutiveStopwords() throws IOException {
    TokenStream ts = new CannedTokenStream(
        new Token("b", 2, 2, 3),
        new Token("c", 4, 5),
        new Token("d", 6, 7),
        new Token("b", 3, 12, 13),
        new Token("c", 14, 15)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 4),
        new String[]{"b c d _", "c d _ _", "d _ _ b"},
        new int[]{2, 4, 6,},
        new int[]{7, 7, 13,},
        new int[]{2, 1, 1,});
  }

  // CannedTokenStream(finalPosInc, finalOffset, ...) models a stopword removed
  // at the very end of the stream; the shingle is padded with a filler token.
  public void testTrailingStopwords() throws IOException {
    TokenStream ts = new CannedTokenStream(1, 7,
        new Token("b", 0, 1),
        new Token("c", 2, 3),
        new Token("d", 4, 5)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
        new String[] { "b c d", "c d _" },
        new int[] { 0, 2, },
        new int[] { 5, 5, },
        new int[] { 1, 1, });
  }

  // Two trailing stopwords allow two filler-padded shingles past the last token.
  public void testMultipleTrailingStopwords() throws IOException {
    TokenStream ts = new CannedTokenStream(2, 9,
        new Token("b", 0, 1),
        new Token("c", 2, 3),
        new Token("d", 4, 5)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
        new String[] { "b c d", "c d _", "d _ _" },
        new int[] { 0, 2, 4 },
        new int[] { 5, 5, 5 },
        new int[] { 1, 1, 1 });
  }

  // Stacked tokens (posInc == 0, e.g. synonyms) produce stacked shingles:
  // one shingle per path through the graph, stacked via posInc == 0.
  public void testIncomingGraphs() throws IOException {
    TokenStream ts = new CannedTokenStream(
        new Token("b", 0, 1),
        new Token("a", 0, 0, 1),
        new Token("c", 2, 3),
        new Token("b", 4, 5),
        new Token("a", 0, 4, 5),
        new Token("d", 6, 7)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 2),
        new String[] { "b c", "a c", "c b", "c a", "b d", "a d" },
        new int[] { 0, 0, 2, 2, 4, 4 },
        new int[] { 3, 3, 5, 5, 7, 7 },
        new int[] { 1, 0, 1, 0, 1, 0 });
  }

  // Trigram shingles spanning a stacked position enumerate every combination.
  public void testShinglesSpanningGraphs() throws IOException {
    TokenStream ts = new CannedTokenStream(
        new Token("b", 0, 1),
        new Token("a", 0, 0, 1),
        new Token("c", 2, 3),
        new Token("b", 4, 5),
        new Token("a", 0, 4, 5),
        new Token("d", 6, 7)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
        new String[] { "b c b", "b c a", "a c b", "a c a", "c b d", "c a d" },
        new int[] { 0, 0, 0, 0, 2, 2, },
        new int[] { 5, 5, 5, 5, 7, 7, },
        new int[] { 1, 0, 0, 0, 1, 0, });
  }
}

View File

@ -341,6 +341,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
}
/**
 * Asserts that the stream produces the expected terms, offsets, types,
 * position increments and position lengths, delegating to the most general
 * overload with no expected final offset.
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] startOffsets, int[] endOffsets, String[] types, int[] posIncrements, int[] posLengths) throws IOException {
  assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, null);
}
/**
 * Asserts that the stream produces the expected terms, checking term text only
 * (offsets, types, increments and lengths are not verified).
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
  assertTokenStreamContents(ts, output, null, null, null, null, null, null);
}

View File

@ -75,6 +75,13 @@ public class Token extends PackedTokenAttributeImpl implements FlagsAttribute, P
setOffset(start, end);
}
/**
 * Constructs a Token with the given term text, position increment, start and end offsets.
 *
 * @param text   term text, appended into this token's term buffer
 * @param posInc position increment relative to the preceding token
 *               (0 = stacked at the same position, &gt;1 = gap)
 * @param start  start offset
 * @param end    end offset
 */
public Token(CharSequence text, int posInc, int start, int end) {
  append(text);
  setOffset(start, end);
  setPositionIncrement(posInc);
}
/**
* {@inheritDoc}
* @see FlagsAttribute