LUCENE-8202: Add FixedShingleFilter

This commit is contained in:
Alan Woodward 2018-03-21 10:35:28 +00:00
parent d4e69c5cd8
commit fac84c01c8
7 changed files with 561 additions and 0 deletions

View File

@ -106,6 +106,9 @@ New Features
* LUCENE-8197: A new FeatureField makes it easy and efficient to integrate
  static relevance signals into the final score. (Adrien Grand, Robert Muir)
* LUCENE-8202: Add a FixedShingleFilter (Alan Woodward, Adrien Grand, Jim
  Ferenczi)
Other
* LUCENE-8214: Improve selection of testPoint for GeoComplexPolygon.

View File

@ -0,0 +1,294 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.shingle;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Deque;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
 * A FixedShingleFilter constructs shingles (token n-grams) from a token stream.
 * In other words, it creates combinations of tokens as a single token.
 *
 * <p>Unlike the {@link ShingleFilter}, FixedShingleFilter only emits shingles of a
 * fixed size, and never emits unigrams, even at the end of a TokenStream. In
 * addition, if the filter encounters stacked tokens (eg synonyms), then it will
 * output stacked shingles.
 *
 * <p>For example, the sentence "please divide this sentence into shingles"
 * might be tokenized into shingles "please divide", "divide this",
 * "this sentence", "sentence into", and "into shingles".
 *
 * <p>This filter handles position increments &gt; 1 by inserting filler tokens
 * (tokens with termtext "_").
 *
 * @lucene.experimental
 */
public final class FixedShingleFilter extends TokenFilter {

  // Pool of recycled Token objects, so we don't allocate a new capture
  // of the attribute source for every input token
  private final Deque<Token> tokenPool = new ArrayDeque<>();

  private final int shingleSize;
  private final String tokenSeparator;

  // Sentinel token inserted in place of position gaps (eg removed stopwords)
  private final Token gapToken = new Token(new AttributeSource());
  // Sentinel token marking the end of the wrapped stream
  private final Token endToken = new Token(new AttributeSource());

  private final PositionIncrementAttribute incAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  // The tokens making up the shingle currently being emitted; index 0 is the root
  private Token[] currentShingleTokens;
  private boolean inputStreamExhausted = false;

  /**
   * Creates a FixedShingleFilter over an input token stream, using " " as the
   * token separator and "_" as the filler token.
   *
   * @param input       the input stream
   * @param shingleSize the shingle size; must be at least 2
   * @throws IllegalArgumentException if shingleSize is less than 2
   */
  public FixedShingleFilter(TokenStream input, int shingleSize) {
    this(input, shingleSize, " ", "_");
  }

  /**
   * Creates a FixedShingleFilter over an input token stream.
   *
   * @param input          the input stream
   * @param shingleSize    the shingle size; must be at least 2
   * @param tokenSeparator a String to use as a token separator
   * @param fillerToken    a String to represent gaps in the input stream (due to eg stopwords)
   * @throws IllegalArgumentException if shingleSize is less than 2
   */
  public FixedShingleFilter(TokenStream input, int shingleSize, String tokenSeparator, String fillerToken) {
    super(input);
    // A shingle of size < 2 is either meaningless (the contract of this filter
    // is to never emit unigrams) or would break indexing into currentShingleTokens
    if (shingleSize < 2) {
      throw new IllegalArgumentException("Shingle size must be at least 2, got " + shingleSize);
    }
    this.shingleSize = shingleSize;
    this.tokenSeparator = tokenSeparator;
    this.gapToken.termAtt.setEmpty().append(fillerToken);
    this.currentShingleTokens = new Token[shingleSize];
  }

  @Override
  public boolean incrementToken() throws IOException {
    // A posInc of 0 means we are emitting another shingle stacked on the same
    // root position (eg because of synonyms in the input)
    int posInc = 0;
    if (nextShingle() == false) {
      // No more stacked shingles at this position; advance to the next root
      Token nextRoot = nextTokenInStream(currentShingleTokens[0]);
      if (nextRoot == endToken) {
        return false;
      }
      recycleToken(currentShingleTokens[0]);
      if (resetShingleRoot(nextRoot) == false) {
        return false;
      }
      posInc = currentShingleTokens[0].posInc();
    }
    clearAttributes();
    incAtt.setPositionIncrement(posInc);
    // End offset comes from the last real (non-gap) token in the shingle
    offsetAtt.setOffset(currentShingleTokens[0].startOffset(), lastTokenInShingle().endOffset());
    termAtt.setEmpty();
    termAtt.append(currentShingleTokens[0].term());
    typeAtt.setType("shingle");
    posLenAtt.setPositionLength(shingleSize);
    for (int i = 1; i < shingleSize; i++) {
      termAtt.append(tokenSeparator).append(currentShingleTokens[i].term());
    }
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    this.tokenPool.clear();
    this.currentShingleTokens[0] = null;
    this.inputStreamExhausted = false;
  }

  @Override
  public void end() throws IOException {
    if (inputStreamExhausted == false) {
      finishInnerStream();
    }
    clearAttributes();
    // endToken's offsets were captured from the wrapped stream in finishInnerStream()
    this.offsetAtt.setOffset(0, endToken.endOffset());
  }

  private void finishInnerStream() throws IOException {
    input.end();
    inputStreamExhausted = true;
    // check for gaps at the end of the tokenstream
    endToken.posIncAtt.setPositionIncrement(this.incAtt.getPositionIncrement());
    OffsetAttribute inputOffsets = input.getAttribute(OffsetAttribute.class);
    endToken.offsetAtt.setOffset(inputOffsets.startOffset(), inputOffsets.endOffset());
  }

  /** Returns the last non-gap token of the current shingle */
  private Token lastTokenInShingle() {
    int lastTokenIndex = shingleSize - 1;
    while (currentShingleTokens[lastTokenIndex] == gapToken) {
      lastTokenIndex--;
    }
    return currentShingleTokens[lastTokenIndex];
  }

  /**
   * Rebuilds the shingle starting at the given root token, pulling subsequent
   * tokens from the graph and inserting gap tokens for position holes.
   *
   * @return false if the stream ends before a full shingle can be built
   */
  private boolean resetShingleRoot(Token token) throws IOException {
    this.currentShingleTokens[0] = token;
    for (int i = 1; i < shingleSize; i++) {
      Token current = nextTokenInGraph(this.currentShingleTokens[i - 1]);
      if (current == endToken) {
        if (endToken.posInc() + i >= shingleSize) {
          // end tokens are a special case, because their posIncs are always
          // due to stopwords.  Therefore, we can happily append gap tokens
          // to the end of the current shingle
          for (int j = i; j < shingleSize; j++) {
            this.currentShingleTokens[j] = gapToken;
          }
          return true;
        }
        return false;
      }
      if (current.posInc() > 1) {
        // insert gaps into the shingle list
        for (int j = 1; j < current.posInc(); j++) {
          this.currentShingleTokens[i] = gapToken;
          i++;
          if (i >= shingleSize) {
            return true;
          }
        }
      }
      this.currentShingleTokens[i] = current;
    }
    return true;
  }

  private boolean nextShingle() throws IOException {
    return currentShingleTokens[0] != null && advanceStack();
  }

  // check if the next token in the tokenstream is at the same position as this one
  private boolean lastInStack(Token token) throws IOException {
    Token next = nextTokenInStream(token);
    return next == endToken || next.posInc() != 0;
  }

  /**
   * Advances through stacked (posInc == 0) tokens to produce the next shingle
   * rooted at the current position, if any.
   */
  private boolean advanceStack() throws IOException {
    for (int i = shingleSize - 1; i >= 1; i--) {
      if (currentShingleTokens[i] != gapToken && lastInStack(currentShingleTokens[i]) == false) {
        currentShingleTokens[i] = nextTokenInStream(currentShingleTokens[i]);
        for (int j = i + 1; j < shingleSize; j++) {
          currentShingleTokens[j] = nextTokenInGraph(currentShingleTokens[j - 1]);
        }
        return true;
      }
    }
    return false;
  }

  /** Returns a pooled Token if available, otherwise captures a new one */
  private Token newToken() {
    Token token = tokenPool.size() == 0 ? new Token(this.cloneAttributes()) : tokenPool.removeFirst();
    token.reset(this);
    return token;
  }

  private void recycleToken(Token token) {
    if (token == null) {
      return;
    }
    token.nextToken = null;
    tokenPool.add(token);
  }

  // for testing
  int instantiatedTokenCount() {
    int tokenCount = tokenPool.size() + 1;
    if (currentShingleTokens[0] == endToken || currentShingleTokens[0] == null) {
      return tokenCount;
    }
    for (Token t = currentShingleTokens[0]; t != endToken && t != null; t = t.nextToken) {
      tokenCount++;
    }
    return tokenCount;
  }

  /** Returns the next token at a strictly later position, skipping stacked tokens */
  private Token nextTokenInGraph(Token token) throws IOException {
    do {
      token = nextTokenInStream(token);
      if (token == endToken) {
        return endToken;
      }
    } while (token.posInc() == 0);
    return token;
  }

  /**
   * Returns the token following the given one in the stream, reading ahead from
   * the wrapped input if necessary.  Tokens already read are linked via
   * {@link Token#nextToken} so they can be revisited when emitting stacked shingles.
   */
  private Token nextTokenInStream(Token token) throws IOException {
    if (token != null && token.nextToken != null) {
      return token.nextToken;
    }
    if (input.incrementToken() == false) {
      finishInnerStream();
      if (token == null) {
        return endToken;
      }
      else {
        token.nextToken = endToken;
        return endToken;
      }
    }
    if (token == null) {
      return newToken();
    }
    token.nextToken = newToken();
    return token.nextToken;
  }

  /** A captured snapshot of an input token's attributes, linkable into a lookahead list */
  private static class Token {
    final AttributeSource attSource;
    final PositionIncrementAttribute posIncAtt;
    final CharTermAttribute termAtt;
    final OffsetAttribute offsetAtt;
    // The token following this one in the input stream, or null if not yet read
    Token nextToken;

    Token(AttributeSource attSource) {
      this.attSource = attSource;
      this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class);
      this.termAtt = attSource.addAttribute(CharTermAttribute.class);
      this.offsetAtt = attSource.addAttribute(OffsetAttribute.class);
    }

    int posInc() {
      return this.posIncAtt.getPositionIncrement();
    }

    CharSequence term() {
      return this.termAtt;
    }

    int startOffset() {
      return this.offsetAtt.startOffset();
    }

    int endOffset() {
      return this.offsetAtt.endOffset();
    }

    /** Re-captures this Token's attribute values from the given source */
    void reset(AttributeSource attSource) {
      attSource.copyTo(this.attSource);
      this.nextToken = null;
    }

    @Override
    public String toString() {
      return term() + "(" + startOffset() + "," + endOffset() + ") " + posInc();
    }
  }
}

View File

@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.shingle;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
 * Factory for {@link FixedShingleFilter}.
 *
 * <p>Supported parameters:
 * <ul>
 * <li>shingleSize - how many tokens should be combined into each shingle (default: 2)
 * <li>tokenSeparator - how tokens should be joined together in the shingle (default: space)
 * <li>fillerToken - what should be added in place of stop words (default: _ )
 * </ul>
 */
public class FixedShingleFilterFactory extends TokenFilterFactory {

  private final int size;
  private final String separator;
  private final String filler;

  /** Creates a new FixedShingleFilterFactory from the given argument map */
  public FixedShingleFilterFactory(Map<String, String> args) {
    super(args);
    this.size = getInt(args, "shingleSize", 2);
    this.separator = get(args, "tokenSeparator", " ");
    this.filler = get(args, "fillerToken", "_");
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new FixedShingleFilter(input, size, separator, filler);
  }
}

View File

@ -101,6 +101,7 @@ org.apache.lucene.analysis.pt.PortugueseStemFilterFactory
org.apache.lucene.analysis.reverse.ReverseStringFilterFactory
org.apache.lucene.analysis.ru.RussianLightStemFilterFactory
org.apache.lucene.analysis.shingle.ShingleFilterFactory
org.apache.lucene.analysis.shingle.FixedShingleFilterFactory
org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
org.apache.lucene.analysis.sr.SerbianNormalizationFilterFactory
org.apache.lucene.analysis.standard.ClassicFilterFactory

View File

@ -0,0 +1,200 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.shingle;
import java.io.IOException;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
public class FixedShingleFilterTest extends BaseTokenStreamTestCase {

  /** Returns a fresh canned stream for "please divide this sentence into shingles" */
  private static TokenStream sentenceStream() {
    return new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13),
        new Token("this", 14, 18),
        new Token("sentence", 19, 27),
        new Token("into", 28, 32),
        new Token("shingles", 33, 41));
  }

  /** Returns a fresh canned stream for the graph "(b|a) c (b|a) d" with stacked synonyms */
  private static TokenStream graphStream() {
    return new CannedTokenStream(
        new Token("b", 0, 1),
        new Token("a", 0, 0, 1),
        new Token("c", 2, 3),
        new Token("b", 4, 5),
        new Token("a", 0, 4, 5),
        new Token("d", 6, 7));
  }

  public void testBiGramFilter() throws IOException {
    assertTokenStreamContents(new FixedShingleFilter(sentenceStream(), 2),
        new String[]{"please divide", "divide this", "this sentence", "sentence into", "into shingles"},
        new int[]{0, 7, 14, 19, 28},
        new int[]{13, 18, 27, 32, 41},
        new String[]{"shingle", "shingle", "shingle", "shingle", "shingle"},
        new int[]{1, 1, 1, 1, 1},
        new int[]{2, 2, 2, 2, 2});
  }

  public void testBiGramFilterWithAltSeparator() throws IOException {
    assertTokenStreamContents(new FixedShingleFilter(sentenceStream(), 2, "<SEP>", "_"),
        new String[]{"please<SEP>divide", "divide<SEP>this", "this<SEP>sentence", "sentence<SEP>into", "into<SEP>shingles"},
        new int[]{0, 7, 14, 19, 28},
        new int[]{13, 18, 27, 32, 41},
        new String[]{"shingle", "shingle", "shingle", "shingle", "shingle"},
        new int[]{1, 1, 1, 1, 1});
  }

  public void testTriGramFilter() throws IOException {
    assertTokenStreamContents(new FixedShingleFilter(sentenceStream(), 3),
        new String[]{"please divide this", "divide this sentence", "this sentence into", "sentence into shingles"});
  }

  public void testShingleSizeGreaterThanTokenstreamLength() throws IOException {
    TokenStream shingles = new FixedShingleFilter(new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13)), 3);
    shingles.reset();
    assertFalse(shingles.incrementToken());
  }

  public void testWithStopwords() throws IOException {
    TokenStream stream = new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13),
        new Token("sentence", 2, 19, 27),
        new Token("shingles", 2, 33, 41));
    assertTokenStreamContents(new FixedShingleFilter(stream, 3),
        new String[]{"please divide _", "divide _ sentence", "sentence _ shingles"},
        new int[]{0, 7, 19},
        new int[]{13, 27, 41},
        new String[]{"shingle", "shingle", "shingle"},
        new int[]{1, 1, 2});
  }

  public void testConsecutiveStopwords() throws IOException {
    TokenStream stream = new CannedTokenStream(
        new Token("b", 2, 2, 3),
        new Token("c", 4, 5),
        new Token("d", 6, 7),
        new Token("b", 3, 12, 13),
        new Token("c", 14, 15));
    assertTokenStreamContents(new FixedShingleFilter(stream, 4),
        new String[]{"b c d _", "c d _ _", "d _ _ b"},
        new int[]{2, 4, 6},
        new int[]{7, 7, 13},
        new int[]{2, 1, 1});
  }

  public void testTrailingStopwords() throws IOException {
    // finalPosInc of 1 and finalOffset of 7 simulate a single trailing stopword
    TokenStream stream = new CannedTokenStream(1, 7,
        new Token("b", 0, 1),
        new Token("c", 2, 3),
        new Token("d", 4, 5));
    assertTokenStreamContents(new FixedShingleFilter(stream, 3),
        new String[]{"b c d", "c d _"},
        new int[]{0, 2},
        new int[]{5, 5},
        new int[]{1, 1});
  }

  public void testMultipleTrailingStopwords() throws IOException {
    // finalPosInc of 2 and finalOffset of 9 simulate two trailing stopwords
    TokenStream stream = new CannedTokenStream(2, 9,
        new Token("b", 0, 1),
        new Token("c", 2, 3),
        new Token("d", 4, 5));
    assertTokenStreamContents(new FixedShingleFilter(stream, 3),
        new String[]{"b c d", "c d _", "d _ _"},
        new int[]{0, 2, 4},
        new int[]{5, 5, 5},
        new int[]{1, 1, 1});
  }

  public void testIncomingGraphs() throws IOException {
    assertTokenStreamContents(new FixedShingleFilter(graphStream(), 2),
        new String[]{"b c", "a c", "c b", "c a", "b d", "a d"},
        new int[]{0, 0, 2, 2, 4, 4},
        new int[]{3, 3, 5, 5, 7, 7},
        new int[]{1, 0, 1, 0, 1, 0});
  }

  public void testShinglesSpanningGraphs() throws IOException {
    assertTokenStreamContents(new FixedShingleFilter(graphStream(), 3),
        new String[]{"b c b", "b c a", "a c b", "a c a", "c b d", "c a d"},
        new int[]{0, 0, 0, 0, 2, 2},
        new int[]{5, 5, 5, 5, 7, 7},
        new int[]{1, 0, 0, 0, 1, 0});
  }
}

View File

@ -341,6 +341,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null); assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
} }
/**
 * Asserts that the given TokenStream produces the expected terms, offsets, types,
 * position increments and position lengths.  Delegates to the 8-argument overload,
 * passing null for the remaining argument.
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int[] posLengths) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException { public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
assertTokenStreamContents(ts, output, null, null, null, null, null, null); assertTokenStreamContents(ts, output, null, null, null, null, null, null);
} }

View File

@ -75,6 +75,13 @@ public class Token extends PackedTokenAttributeImpl implements FlagsAttribute, P
setOffset(start, end); setOffset(start, end);
} }
/**
 * Constructs a Token holding the given term text, position increment, and
 * start and end offsets.
 *
 * @param text   the term text to append to this token
 * @param posInc the position increment
 * @param start  the start offset
 * @param end    the end offset
 */
public Token(CharSequence text, int posInc, int start, int end) {
  setPositionIncrement(posInc);
  append(text);
  setOffset(start, end);
}
/**
 * {@inheritDoc}
 * @see FlagsAttribute