mirror of https://github.com/apache/lucene.git
LUCENE-7877: Add ConcatenatingTokenStream, remove PrefixAwareTokenFilter
This commit is contained in:
parent
ad2cb7784e
commit
a948e17146
|
@ -83,6 +83,9 @@ API Changes
|
|||
* LUCENE-7868: IndexWriterConfig.setMaxBufferedDeleteTerms is
|
||||
removed. (Simon Willnauer, Mike McCandless)
|
||||
|
||||
* LUCENE-7877: PrefixAwareTokenFilter is replaced with ConcatenatingTokenStream
|
||||
(Alan Woodward, Uwe Schindler, Adrien Grand)
|
||||
|
||||
Bug Fixes
|
||||
|
||||
* LUCENE-7626: IndexWriter will no longer accept broken token offsets
|
||||
|
|
|
@ -131,3 +131,9 @@ they might have more than 2B matches in total. However TopDocs instances
|
|||
returned by IndexSearcher will still have a total number of hits which is less
|
||||
than 2B since Lucene indexes are still bound to at most 2B documents, so it
|
||||
can safely be cast to an int in that case.
|
||||
|
||||
## PrefixAwareTokenFilter and PrefixAndSuffixAwareTokenFilter removed
|
||||
(LUCENE-7877)
|
||||
|
||||
Instead use ConcatenatingTokenStream, which will allow for the use of custom
|
||||
attributes.
|
||||
|
|
|
@ -0,0 +1,121 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.util.Attribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/**
|
||||
* A TokenStream that takes an array of input TokenStreams as sources, and
|
||||
* concatenates them together.
|
||||
*
|
||||
* Offsets from the second and subsequent sources are incremented to behave
|
||||
* as if all the inputs were from a single source.
|
||||
*
|
||||
* All of the input TokenStreams must have the same attribute implementations
|
||||
*/
|
||||
public final class ConcatenatingTokenStream extends TokenStream {
|
||||
|
||||
private final TokenStream[] sources;
|
||||
private final OffsetAttribute[] sourceOffsets;
|
||||
private final OffsetAttribute offsetAtt;
|
||||
|
||||
private int currentSource;
|
||||
private int offsetIncrement;
|
||||
|
||||
/**
|
||||
* Create a new ConcatenatingTokenStream from a set of inputs
|
||||
* @param sources an array of TokenStream inputs to concatenate
|
||||
*/
|
||||
public ConcatenatingTokenStream(TokenStream... sources) {
|
||||
super(combineSources(sources));
|
||||
this.sources = sources;
|
||||
this.offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
this.sourceOffsets = new OffsetAttribute[sources.length];
|
||||
for (int i = 0; i < sources.length; i++) {
|
||||
this.sourceOffsets[i] = sources[i].addAttribute(OffsetAttribute.class);
|
||||
}
|
||||
}
|
||||
|
||||
private static AttributeSource combineSources(TokenStream... sources) {
|
||||
AttributeSource base = sources[0].cloneAttributes();
|
||||
try {
|
||||
for (int i = 1; i < sources.length; i++) {
|
||||
Iterator<Class<? extends Attribute>> it = sources[i].getAttributeClassesIterator();
|
||||
while (it.hasNext()) {
|
||||
base.addAttribute(it.next());
|
||||
}
|
||||
// check attributes can be captured
|
||||
sources[i].copyTo(base);
|
||||
}
|
||||
return base;
|
||||
}
|
||||
catch (IllegalArgumentException e) {
|
||||
throw new IllegalArgumentException("Attempted to concatenate TokenStreams with different attribute types", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
while (sources[currentSource].incrementToken() == false) {
|
||||
if (currentSource >= sources.length - 1)
|
||||
return false;
|
||||
sources[currentSource].end();
|
||||
OffsetAttribute att = sourceOffsets[currentSource];
|
||||
if (att != null)
|
||||
offsetIncrement += att.endOffset();
|
||||
currentSource++;
|
||||
}
|
||||
|
||||
clearAttributes();
|
||||
sources[currentSource].copyTo(this);
|
||||
offsetAtt.setOffset(offsetAtt.startOffset() + offsetIncrement, offsetAtt.endOffset() + offsetIncrement);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void end() throws IOException {
|
||||
sources[currentSource].end();
|
||||
super.end();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
for (TokenStream source : sources) {
|
||||
source.reset();
|
||||
}
|
||||
super.reset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
try {
|
||||
IOUtils.close(sources);
|
||||
}
|
||||
finally {
|
||||
super.close();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,84 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Links two {@link PrefixAwareTokenFilter}.
|
||||
* <p>
|
||||
* <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
|
||||
* the ones located in org.apache.lucene.analysis.tokenattributes.
|
||||
*/
|
||||
public class PrefixAndSuffixAwareTokenFilter extends TokenStream {
|
||||
|
||||
private PrefixAwareTokenFilter suffix;
|
||||
|
||||
public PrefixAndSuffixAwareTokenFilter(TokenStream prefix, TokenStream input, TokenStream suffix) {
|
||||
super(suffix);
|
||||
prefix = new PrefixAwareTokenFilter(prefix, input) {
|
||||
@Override
|
||||
public Token updateSuffixToken(Token suffixToken, Token lastInputToken) {
|
||||
return PrefixAndSuffixAwareTokenFilter.this.updateInputToken(suffixToken, lastInputToken);
|
||||
}
|
||||
};
|
||||
this.suffix = new PrefixAwareTokenFilter(prefix, suffix) {
|
||||
@Override
|
||||
public Token updateSuffixToken(Token suffixToken, Token lastInputToken) {
|
||||
return PrefixAndSuffixAwareTokenFilter.this.updateSuffixToken(suffixToken, lastInputToken);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
public Token updateInputToken(Token inputToken, Token lastPrefixToken) {
|
||||
inputToken.setOffset(lastPrefixToken.endOffset() + inputToken.startOffset(),
|
||||
lastPrefixToken.endOffset() + inputToken.endOffset());
|
||||
return inputToken;
|
||||
}
|
||||
|
||||
public Token updateSuffixToken(Token suffixToken, Token lastInputToken) {
|
||||
suffixToken.setOffset(lastInputToken.endOffset() + suffixToken.startOffset(),
|
||||
lastInputToken.endOffset() + suffixToken.endOffset());
|
||||
return suffixToken;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
return suffix.incrementToken();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
suffix.reset();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
suffix.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void end() throws IOException {
|
||||
suffix.end();
|
||||
}
|
||||
}
|
|
@ -1,202 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
/**
|
||||
* Joins two token streams and leaves the last token of the first stream available
|
||||
* to be used when updating the token values in the second stream based on that token.
|
||||
*
|
||||
* The default implementation adds last prefix token end offset to the suffix token start and end offsets.
|
||||
* <p>
|
||||
* <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
|
||||
* the ones located in org.apache.lucene.analysis.tokenattributes.
|
||||
*/
|
||||
public class PrefixAwareTokenFilter extends TokenStream {
|
||||
|
||||
private TokenStream prefix;
|
||||
private TokenStream suffix;
|
||||
|
||||
private CharTermAttribute termAtt;
|
||||
private PositionIncrementAttribute posIncrAtt;
|
||||
private PayloadAttribute payloadAtt;
|
||||
private OffsetAttribute offsetAtt;
|
||||
private TypeAttribute typeAtt;
|
||||
private FlagsAttribute flagsAtt;
|
||||
|
||||
private CharTermAttribute p_termAtt;
|
||||
private PositionIncrementAttribute p_posIncrAtt;
|
||||
private PayloadAttribute p_payloadAtt;
|
||||
private OffsetAttribute p_offsetAtt;
|
||||
private TypeAttribute p_typeAtt;
|
||||
private FlagsAttribute p_flagsAtt;
|
||||
|
||||
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) {
|
||||
super(suffix);
|
||||
this.suffix = suffix;
|
||||
this.prefix = prefix;
|
||||
prefixExhausted = false;
|
||||
|
||||
termAtt = addAttribute(CharTermAttribute.class);
|
||||
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
payloadAtt = addAttribute(PayloadAttribute.class);
|
||||
offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
typeAtt = addAttribute(TypeAttribute.class);
|
||||
flagsAtt = addAttribute(FlagsAttribute.class);
|
||||
|
||||
p_termAtt = prefix.addAttribute(CharTermAttribute.class);
|
||||
p_posIncrAtt = prefix.addAttribute(PositionIncrementAttribute.class);
|
||||
p_payloadAtt = prefix.addAttribute(PayloadAttribute.class);
|
||||
p_offsetAtt = prefix.addAttribute(OffsetAttribute.class);
|
||||
p_typeAtt = prefix.addAttribute(TypeAttribute.class);
|
||||
p_flagsAtt = prefix.addAttribute(FlagsAttribute.class);
|
||||
}
|
||||
|
||||
private Token previousPrefixToken = new Token();
|
||||
private Token reusableToken = new Token();
|
||||
|
||||
private boolean prefixExhausted;
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (!prefixExhausted) {
|
||||
Token nextToken = getNextPrefixInputToken(reusableToken);
|
||||
if (nextToken == null) {
|
||||
prefixExhausted = true;
|
||||
} else {
|
||||
previousPrefixToken.reinit(nextToken);
|
||||
// Make it a deep copy
|
||||
BytesRef p = previousPrefixToken.getPayload();
|
||||
if (p != null) {
|
||||
previousPrefixToken.setPayload(p.clone());
|
||||
}
|
||||
setCurrentToken(nextToken);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
Token nextToken = getNextSuffixInputToken(reusableToken);
|
||||
if (nextToken == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
nextToken = updateSuffixToken(nextToken, previousPrefixToken);
|
||||
setCurrentToken(nextToken);
|
||||
return true;
|
||||
}
|
||||
|
||||
private void setCurrentToken(Token token) {
|
||||
if (token == null) return;
|
||||
clearAttributes();
|
||||
termAtt.copyBuffer(token.buffer(), 0, token.length());
|
||||
posIncrAtt.setPositionIncrement(token.getPositionIncrement());
|
||||
flagsAtt.setFlags(token.getFlags());
|
||||
offsetAtt.setOffset(token.startOffset(), token.endOffset());
|
||||
typeAtt.setType(token.type());
|
||||
payloadAtt.setPayload(token.getPayload());
|
||||
}
|
||||
|
||||
private Token getNextPrefixInputToken(Token token) throws IOException {
|
||||
if (!prefix.incrementToken()) return null;
|
||||
token.copyBuffer(p_termAtt.buffer(), 0, p_termAtt.length());
|
||||
token.setPositionIncrement(p_posIncrAtt.getPositionIncrement());
|
||||
token.setFlags(p_flagsAtt.getFlags());
|
||||
token.setOffset(p_offsetAtt.startOffset(), p_offsetAtt.endOffset());
|
||||
token.setType(p_typeAtt.type());
|
||||
token.setPayload(p_payloadAtt.getPayload());
|
||||
return token;
|
||||
}
|
||||
|
||||
private Token getNextSuffixInputToken(Token token) throws IOException {
|
||||
if (!suffix.incrementToken()) return null;
|
||||
token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
|
||||
token.setPositionIncrement(posIncrAtt.getPositionIncrement());
|
||||
token.setFlags(flagsAtt.getFlags());
|
||||
token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
|
||||
token.setType(typeAtt.type());
|
||||
token.setPayload(payloadAtt.getPayload());
|
||||
return token;
|
||||
}
|
||||
|
||||
/**
|
||||
* The default implementation adds last prefix token end offset to the suffix token start and end offsets.
|
||||
*
|
||||
* @param suffixToken a token from the suffix stream
|
||||
* @param lastPrefixToken the last token from the prefix stream
|
||||
* @return consumer token
|
||||
*/
|
||||
public Token updateSuffixToken(Token suffixToken, Token lastPrefixToken) {
|
||||
suffixToken.setOffset(lastPrefixToken.endOffset() + suffixToken.startOffset(),
|
||||
lastPrefixToken.endOffset() + suffixToken.endOffset());
|
||||
return suffixToken;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void end() throws IOException {
|
||||
prefix.end();
|
||||
suffix.end();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
prefix.close();
|
||||
suffix.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
if (prefix != null) {
|
||||
prefixExhausted = false;
|
||||
prefix.reset();
|
||||
}
|
||||
if (suffix != null) {
|
||||
suffix.reset();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
public TokenStream getPrefix() {
|
||||
return prefix;
|
||||
}
|
||||
|
||||
public void setPrefix(TokenStream prefix) {
|
||||
this.prefix = prefix;
|
||||
}
|
||||
|
||||
public TokenStream getSuffix() {
|
||||
return suffix;
|
||||
}
|
||||
|
||||
public void setSuffix(TokenStream suffix) {
|
||||
this.suffix = suffix;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.util.AttributeFactory;
|
||||
|
||||
public class TestConcatenatingTokenStream extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testBasic() throws IOException {
|
||||
|
||||
AttributeFactory factory = newAttributeFactory();
|
||||
|
||||
final MockTokenizer first = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
|
||||
first.setReader(new StringReader("first words "));
|
||||
final MockTokenizer second = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
|
||||
second.setReader(new StringReader("second words"));
|
||||
final MockTokenizer third = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
|
||||
third.setReader(new StringReader(" third words"));
|
||||
|
||||
TokenStream ts = new ConcatenatingTokenStream(first, second, new EmptyTokenStream(), third);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "first", "words", "second", "words", "third", "words" },
|
||||
new int[]{ 0, 6, 12, 19, 25, 31 },
|
||||
new int[]{ 5, 11, 18, 24, 30, 36 });
|
||||
|
||||
}
|
||||
|
||||
public void testInconsistentAttributes() throws IOException {
|
||||
|
||||
AttributeFactory factory = newAttributeFactory();
|
||||
|
||||
final MockTokenizer first = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
|
||||
first.setReader(new StringReader("first words "));
|
||||
first.addAttribute(PayloadAttribute.class);
|
||||
final MockTokenizer second = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
|
||||
second.setReader(new StringReader("second words"));
|
||||
second.addAttribute(FlagsAttribute.class);
|
||||
|
||||
TokenStream ts = new ConcatenatingTokenStream(first, second);
|
||||
assertTrue(ts.hasAttribute(FlagsAttribute.class));
|
||||
assertTrue(ts.hasAttribute(PayloadAttribute.class));
|
||||
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "first", "words", "second", "words" },
|
||||
new int[]{ 0, 6, 12, 19, },
|
||||
new int[]{ 5, 11, 18, 24, });
|
||||
|
||||
}
|
||||
|
||||
public void testInconsistentAttributeFactories() throws IOException {
|
||||
|
||||
final MockTokenizer first = new MockTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, true);
|
||||
final MockTokenizer second = new MockTokenizer(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, true);
|
||||
|
||||
expectThrows(IllegalArgumentException.class, () -> new ConcatenatingTokenStream(first, second));
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,49 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CannedTokenStream;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
public class TestPrefixAndSuffixAwareTokenFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
public void test() throws IOException {
|
||||
|
||||
final MockTokenizer input = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
input.setReader(new StringReader("hello world"));
|
||||
PrefixAndSuffixAwareTokenFilter ts = new PrefixAndSuffixAwareTokenFilter(
|
||||
new CannedTokenStream(createToken("^", 0, 0)),
|
||||
input,
|
||||
new CannedTokenStream(createToken("$", 0, 0)));
|
||||
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "^", "hello", "world", "$" },
|
||||
new int[] { 0, 0, 6, 11 },
|
||||
new int[] { 0, 5, 11, 11 });
|
||||
}
|
||||
|
||||
private static Token createToken(String term, int start, int offset)
|
||||
{
|
||||
return new Token(term, start, offset);
|
||||
}
|
||||
}
|
|
@ -1,60 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CannedTokenStream;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
public class TestPrefixAwareTokenFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
public void test() throws IOException {
|
||||
|
||||
PrefixAwareTokenFilter ts;
|
||||
|
||||
ts = new PrefixAwareTokenFilter(
|
||||
new CannedTokenStream(createToken("a", 0, 1)),
|
||||
new CannedTokenStream(createToken("b", 0, 1)));
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "a", "b" },
|
||||
new int[] { 0, 1 },
|
||||
new int[] { 1, 2 });
|
||||
|
||||
// prefix and suffix using 2x prefix
|
||||
|
||||
final MockTokenizer suffix = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
suffix.setReader(new StringReader("hello world"));
|
||||
ts = new PrefixAwareTokenFilter(new CannedTokenStream(createToken("^", 0, 0)),
|
||||
suffix);
|
||||
ts = new PrefixAwareTokenFilter(ts, new CannedTokenStream(createToken("$", 0, 0)));
|
||||
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "^", "hello", "world", "$" },
|
||||
new int[] { 0, 0, 6, 11 },
|
||||
new int[] { 0, 5, 11, 11 });
|
||||
}
|
||||
|
||||
private static Token createToken(String term, int start, int offset)
|
||||
{
|
||||
return new Token(term, start, offset);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue