mirror of https://github.com/apache/lucene.git
LUCENE-1380: Add PositionFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@725691 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f92d3cc82f
commit
2225462178
|
@ -1,4 +1,4 @@
|
||||||
Lucene Change Log
|
Lucene Change Log
|
||||||
$Id$
|
$Id$
|
||||||
|
|
||||||
======================= Trunk (not yet released) =======================
|
======================= Trunk (not yet released) =======================
|
||||||
|
@ -879,6 +879,8 @@ New features
|
||||||
the query parser ignores position increments).
|
the query parser ignores position increments).
|
||||||
(Doron Cohen)
|
(Doron Cohen)
|
||||||
|
|
||||||
|
13. LUCENE-1380: Added PositionFilter, a TokenFilter that sets the position increment of every token except the first to a fixed value (zero by default), for special cases related to the ShingleFilter (Mck SembWever, Steve Rowe, Karl Wettin via Grant Ingersoll)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
|
@ -0,0 +1,79 @@
|
||||||
|
package org.apache.lucene.analysis.position;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
|
||||||
|
/** Set the positionIncrement of all tokens to the "positionIncrement",
|
||||||
|
* except the first return token which retains its original positionIncrement value.
|
||||||
|
* The default positionIncrement value is zero.
|
||||||
|
*/
|
||||||
|
public class PositionFilter extends TokenFilter {
|
||||||
|
|
||||||
|
/** Position increment to assign to all but the first token - default = 0 */
|
||||||
|
private int positionIncrement = 0;
|
||||||
|
|
||||||
|
/** The first token must have non-zero positionIncrement **/
|
||||||
|
private boolean firstTokenPositioned = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructs a PositionFilter that assigns a position increment of zero to
|
||||||
|
* all but the first token from the given input stream.
|
||||||
|
*
|
||||||
|
* @param input the input stream
|
||||||
|
*/
|
||||||
|
public PositionFilter(final TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructs a PositionFilter that assigns the given position increment to
|
||||||
|
* all but the first token from the given input stream.
|
||||||
|
*
|
||||||
|
* @param input the input stream
|
||||||
|
* @param positionIncrement position increment to assign to all but the first
|
||||||
|
* token from the input stream
|
||||||
|
*/
|
||||||
|
public PositionFilter(final TokenStream input, final int positionIncrement) {
|
||||||
|
this(input);
|
||||||
|
this.positionIncrement = positionIncrement;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Token next(Token reusableToken) throws IOException {
|
||||||
|
|
||||||
|
assert reusableToken != null;
|
||||||
|
reusableToken = input.next(reusableToken);
|
||||||
|
if (null != reusableToken) {
|
||||||
|
if (firstTokenPositioned) {
|
||||||
|
reusableToken.setPositionIncrement(positionIncrement);
|
||||||
|
} else {
|
||||||
|
firstTokenPositioned = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return reusableToken;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void reset() throws IOException {
|
||||||
|
super.reset();
|
||||||
|
firstTokenPositioned = false;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,174 @@
|
||||||
|
package org.apache.lucene.analysis.position;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.shingle.ShingleFilter;
|
||||||
|
|
||||||
|
public class PositionFilterTest extends TestCase {
|
||||||
|
|
||||||
|
public class TestTokenStream extends TokenStream {
|
||||||
|
|
||||||
|
protected int index = 0;
|
||||||
|
protected Token[] testToken;
|
||||||
|
|
||||||
|
public TestTokenStream(Token[] testToken) {
|
||||||
|
super();
|
||||||
|
this.testToken = testToken;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Token next(final Token reusableToken) throws IOException {
|
||||||
|
assert reusableToken != null;
|
||||||
|
if (index < testToken.length) {
|
||||||
|
return testToken[index++];
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
public void reset() {
|
||||||
|
index = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
junit.textui.TestRunner.run(PositionFilterTest.class);
|
||||||
|
}
|
||||||
|
public static final Token[] TEST_TOKEN = new Token[]{
|
||||||
|
createToken("please"),
|
||||||
|
createToken("divide"),
|
||||||
|
createToken("this"),
|
||||||
|
createToken("sentence"),
|
||||||
|
createToken("into"),
|
||||||
|
createToken("shingles"),
|
||||||
|
};
|
||||||
|
public static final int[] TEST_TOKEN_POSITION_INCREMENTS = new int[]{
|
||||||
|
1, 0, 0, 0, 0, 0
|
||||||
|
};
|
||||||
|
public static final int[] TEST_TOKEN_NON_ZERO_POSITION_INCREMENTS = new int[]{
|
||||||
|
1, 5, 5, 5, 5, 5
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final Token[] SIX_GRAM_NO_POSITIONS_TOKENS = new Token[]{
|
||||||
|
createToken("please"),
|
||||||
|
createToken("please divide"),
|
||||||
|
createToken("please divide this"),
|
||||||
|
createToken("please divide this sentence"),
|
||||||
|
createToken("please divide this sentence into"),
|
||||||
|
createToken("please divide this sentence into shingles"),
|
||||||
|
createToken("divide"),
|
||||||
|
createToken("divide this"),
|
||||||
|
createToken("divide this sentence"),
|
||||||
|
createToken("divide this sentence into"),
|
||||||
|
createToken("divide this sentence into shingles"),
|
||||||
|
createToken("this"),
|
||||||
|
createToken("this sentence"),
|
||||||
|
createToken("this sentence into"),
|
||||||
|
createToken("this sentence into shingles"),
|
||||||
|
createToken("sentence"),
|
||||||
|
createToken("sentence into"),
|
||||||
|
createToken("sentence into shingles"),
|
||||||
|
createToken("into"),
|
||||||
|
createToken("into shingles"),
|
||||||
|
createToken("shingles"),
|
||||||
|
};
|
||||||
|
public static final int[] SIX_GRAM_NO_POSITIONS_INCREMENTS = new int[]{
|
||||||
|
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||||
|
};
|
||||||
|
public static final String[] SIX_GRAM_NO_POSITIONS_TYPES = new String[]{
|
||||||
|
"word", "shingle", "shingle", "shingle", "shingle", "shingle",
|
||||||
|
"word", "shingle", "shingle", "shingle", "shingle",
|
||||||
|
"word", "shingle", "shingle", "shingle",
|
||||||
|
"word", "shingle", "shingle",
|
||||||
|
"word", "shingle",
|
||||||
|
"word"
|
||||||
|
};
|
||||||
|
|
||||||
|
public void testFilter() throws IOException {
|
||||||
|
|
||||||
|
filterTest(new PositionFilter(new TestTokenStream(TEST_TOKEN)),
|
||||||
|
TEST_TOKEN,
|
||||||
|
TEST_TOKEN_POSITION_INCREMENTS);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNonZeroPositionIncrement() throws IOException {
|
||||||
|
|
||||||
|
filterTest(new PositionFilter(new TestTokenStream(TEST_TOKEN), 5),
|
||||||
|
TEST_TOKEN,
|
||||||
|
TEST_TOKEN_NON_ZERO_POSITION_INCREMENTS);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testReset() throws IOException {
|
||||||
|
|
||||||
|
PositionFilter filter = new PositionFilter(new TestTokenStream(TEST_TOKEN));
|
||||||
|
filterTest(filter, TEST_TOKEN, TEST_TOKEN_POSITION_INCREMENTS);
|
||||||
|
filter.reset();
|
||||||
|
// Make sure that the reset filter provides correct position increments
|
||||||
|
filterTest(filter, TEST_TOKEN, TEST_TOKEN_POSITION_INCREMENTS);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Tests ShingleFilter up to six shingles against six terms.
|
||||||
|
* Tests PositionFilter setting all but the first positionIncrement to zero.
|
||||||
|
* @throws java.io.IOException @see Token#next(Token)
|
||||||
|
*/
|
||||||
|
public void test6GramFilterNoPositions() throws IOException {
|
||||||
|
|
||||||
|
ShingleFilter filter = new ShingleFilter(new TestTokenStream(TEST_TOKEN), 6);
|
||||||
|
filterTest(new PositionFilter(filter),
|
||||||
|
SIX_GRAM_NO_POSITIONS_TOKENS,
|
||||||
|
SIX_GRAM_NO_POSITIONS_INCREMENTS);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected TokenStream filterTest(final TokenStream filter,
|
||||||
|
final Token[] tokensToCompare,
|
||||||
|
final int[] positionIncrements)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
|
int i = 0;
|
||||||
|
final Token reusableToken = new Token();
|
||||||
|
|
||||||
|
for (Token nextToken = filter.next(reusableToken)
|
||||||
|
; i < tokensToCompare.length
|
||||||
|
; nextToken = filter.next(reusableToken)) {
|
||||||
|
|
||||||
|
if (null != nextToken) {
|
||||||
|
final String termText = nextToken.term();
|
||||||
|
final String goldText = tokensToCompare[i].term();
|
||||||
|
|
||||||
|
assertEquals("Wrong termText", goldText, termText);
|
||||||
|
assertEquals("Wrong positionIncrement for token \"" + termText + "\"",
|
||||||
|
positionIncrements[i], nextToken.getPositionIncrement());
|
||||||
|
}else{
|
||||||
|
assertNull(tokensToCompare[i]);
|
||||||
|
}
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
return filter;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Token createToken(String term) {
|
||||||
|
final Token token = new Token();
|
||||||
|
if (null != term) {
|
||||||
|
token.setTermBuffer(term);
|
||||||
|
}
|
||||||
|
return token;
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue