mirror of https://github.com/apache/lucene.git
LUCENE-5165: add SuggestStopFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1513940 13f79535-47bb-0310-9956-ffa450edef68
commit c74dda1495
parent cec43e7d92
@@ -83,6 +83,13 @@ New features
  FacetsAggregator.createOrdinalValueResolver. This gives better options for
  resolving an ordinal's value by FacetAggregators. (Shai Erera)

* LUCENE-5165: Add SuggestStopFilter, to be used with analyzing
  suggesters, so that a stop word at the very end of the lookup query,
  and without any trailing token characters, will be preserved. This
  enables query "a" to suggest apple; see
  http://blog.mikemccandless.com/2013/08/suggeststopfilter-carefully-removes.html
  for details.

Bug Fixes

* LUCENE-5116: IndexWriter.addIndexes(IndexReader...) should drop empty (or all
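The wiring the changelog entry describes, reduced to a minimal sketch: keep the ordinary StopFilter in the index analyzer and swap SuggestStopFilter into the query analyzer. This mirrors the testSuggestStopFilter test added later in this commit; MockTokenizer and TEST_VERSION_CURRENT are Lucene test-framework helpers used here only for illustration, and the stopword "a" is the changelog's example.

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter;

final CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "a");

// Query-side analyzer: a trailing "a" with no separator typed yet
// survives (marked KEYWORD), so the suggester can still propose "apple".
Analyzer queryAnalyzer = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    MockTokenizer tokens = new MockTokenizer(reader);
    return new TokenStreamComponents(tokens,
                                     new SuggestStopFilter(tokens, stopWords));
  }
};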
@@ -34,6 +34,15 @@
    <path refid="base.classpath"/>
  </path>

  <target name="javadocs" depends="javadocs-queries,compile-core">
    <invoke-module-javadoc>
      <links>
        <link href="../analyzers-common"/>
      </links>
    </invoke-module-javadoc>
  </target>

  <target name="compile-core" depends="jar-misc, jar-analyzers-common, common.compile-core" />

</project>
@@ -0,0 +1,129 @@
package org.apache.lucene.search.suggest.analyzing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.AttributeSource.State;

/** Like {@link StopFilter} except it will not remove the
 *  last token if that token was not followed by some token
 *  separator.  For example, a query 'find the' would
 *  preserve the 'the' since it was not followed by a space or
 *  punctuation or something, and mark it KEYWORD so future
 *  stemmers won't touch it either, while a query like 'find
 *  the popsicle' would remove 'the' as a stopword.
 *
 *  <p>Normally you'd use the ordinary {@link StopFilter}
 *  in your indexAnalyzer and then this class in your
 *  queryAnalyzer, when using one of the analyzing suggesters. */

public final class SuggestStopFilter extends TokenFilter {

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final CharArraySet stopWords;

  private State endState;
  private boolean ended;

  /** Sole constructor. */
  public SuggestStopFilter(TokenStream input, CharArraySet stopWords) {
    super(input);
    this.stopWords = stopWords;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    ended = false;
    endState = null;
  }

  @Override
  public void end() throws IOException {
    if (!ended) {
      super.end();
    } else {
      // NOTE: we already called .end() from our .incrementToken() when
      // the stream was complete, so we do not call
      // super.end() here

      if (endState != null) {
        restoreState(endState);
      }
    }
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (ended) {
      return false;
    }

    if (!input.incrementToken()) {
      return false;
    }

    int skippedPositions = 0;
    while (true) {
      if (stopWords.contains(termAtt.buffer(), 0, termAtt.length())) {
        int posInc = posIncAtt.getPositionIncrement();
        int endOffset = offsetAtt.endOffset();
        // This token may be a stopword, if it's not the last token:
        State sav = captureState();
        if (input.incrementToken()) {
          // It was a stopword; skip it
          skippedPositions += posInc;
        } else {
          input.end();
          ended = true;
          int finalEndOffset = offsetAtt.endOffset();
          assert finalEndOffset >= endOffset;
          if (finalEndOffset > endOffset) {
            // OK there was a token separator after the
            // stopword, so it was a stopword
            return false;
          } else {
            // No token separator after final token that
            // looked like a stop-word; don't filter it:
            endState = captureState();
            restoreState(sav);
            posIncAtt.setPositionIncrement(skippedPositions + posIncAtt.getPositionIncrement());
            keywordAtt.setKeyword(true);
            return true;
          }
        }
      } else {
        // Not a stopword; return the current token:
        posIncAtt.setPositionIncrement(skippedPositions + posIncAtt.getPositionIncrement());
        return true;
      }
    }
  }
}
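To see the end-of-stream special case in isolation, the filter can also be consumed directly. A minimal sketch, reusing the same test-framework helpers (MockTokenizer, StopFilter.makeStopSet, TEST_VERSION_CURRENT) that this commit's own tests use; the expected output stated in the comments is inferred from the TestSuggestStopFilter expectations below.

import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to");

// "go to": no separator follows "to", so it is kept and marked KEYWORD.
// With input "go to " (trailing space) the final "to" would be dropped.
TokenStream ts = new SuggestStopFilter(
    new MockTokenizer(new StringReader("go to")), stopWords);
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
KeywordAttribute keywordAtt = ts.getAttribute(KeywordAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  // Prints "go keyword=false", then "to keyword=true":
  System.out.println(termAtt + " keyword=" + keywordAtt.isKeyword());
}
ts.end();
ts.close();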
@@ -25,11 +25,8 @@ import java.util.Locale;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.PrefixQuery;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.search.suggest.Lookup.LookupResult;
 import org.apache.lucene.search.suggest.TermFreqPayload;
 import org.apache.lucene.search.suggest.TermFreqPayloadArrayIterator;
@@ -294,16 +291,32 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase {
     suggester.close();
   }
 
-  public void testForkLastToken() throws Exception {
-    Analyzer a = new Analyzer() {
+  public void testSuggestStopFilter() throws Exception {
+    final CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "a");
+    Analyzer indexAnalyzer = new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
         MockTokenizer tokens = new MockTokenizer(reader);
-        // ForkLastTokenFilter is a bit evil:
-        tokens.setEnableChecks(false);
         return new TokenStreamComponents(tokens,
-                                         new StopKeywordFilter(TEST_VERSION_CURRENT,
-                                                               new ForkLastTokenFilter(tokens), StopKeywordFilter.makeStopSet(TEST_VERSION_CURRENT, "a")));
+                                         new StopFilter(TEST_VERSION_CURRENT, tokens, stopWords));
       }
     };
+
+    Analyzer queryAnalyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        MockTokenizer tokens = new MockTokenizer(reader);
+        return new TokenStreamComponents(tokens,
+                                         new SuggestStopFilter(tokens, stopWords));
+      }
+    };
+
+    File tempDir = _TestUtil.getTempDir("AnalyzingInfixSuggesterTest");
+
+    AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, indexAnalyzer, queryAnalyzer, 3) {
+      @Override
+      protected Directory getDirectory(File path) {
+        return newDirectory();
+      }
+    };
 
@@ -311,47 +324,6 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase {
       new TermFreqPayload("a bob for apples", 10, new BytesRef("foobaz")),
     };
 
-    File tempDir = _TestUtil.getTempDir("AnalyzingInfixSuggesterTest");
-
-    AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, a, a, 3) {
-      @Override
-      protected Query finishQuery(BooleanQuery in, boolean allTermsRequired) {
-        List<BooleanClause> clauses = in.clauses();
-        if (clauses.size() >= 2 && allTermsRequired) {
-          String t1 = getTerm(clauses.get(clauses.size()-2).getQuery());
-          String t2 = getTerm(clauses.get(clauses.size()-1).getQuery());
-          if (t1.equals(t2)) {
-            // The last 2 tokens came from
-            // ForkLastTokenFilter; we remove them and
-            // replace them with a MUST BooleanQuery that
-            // SHOULDs the two of them together:
-            BooleanQuery sub = new BooleanQuery();
-            BooleanClause other = clauses.get(clauses.size()-2);
-            sub.add(new BooleanClause(clauses.get(clauses.size()-2).getQuery(), BooleanClause.Occur.SHOULD));
-            sub.add(new BooleanClause(clauses.get(clauses.size()-1).getQuery(), BooleanClause.Occur.SHOULD));
-            clauses.subList(clauses.size()-2, clauses.size()).clear();
-            clauses.add(new BooleanClause(sub, BooleanClause.Occur.MUST));
-          }
-        }
-        return in;
-      }
-
-      private String getTerm(Query query) {
-        if (query instanceof TermQuery) {
-          return ((TermQuery) query).getTerm().text();
-        } else if (query instanceof PrefixQuery) {
-          return ((PrefixQuery) query).getPrefix().text();
-        } else {
-          return null;
-        }
-      }
-
-      @Override
-      protected Directory getDirectory(File path) {
-        return newDirectory();
-      }
-    };
-
     suggester.build(new TermFreqPayloadArrayIterator(keys));
     List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("a", random()), 10, true, true);
     assertEquals(1, results.size());
@@ -1,89 +0,0 @@
package org.apache.lucene.search.suggest.analyzing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource.State;

/** Repeats the last token, if the endOffset indicates that
 *  the token didn't have any characters after it (i.e. it
 *  is not "done").  This is useful in analyzing
 *  suggesters along with StopKeywordFilter: imagine the
 *  user has typed 'a', but your stop filter would normally
 *  remove that.  This token filter will repeat that last 'a'
 *  token, setting {@link KeywordAttribute}, so that the
 *  {@link StopKeywordFilter} won't remove it, and then
 *  suggestions starting with 'a' will be shown. */

final class ForkLastTokenFilter extends TokenFilter {

  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

  State lastToken;
  int maxEndOffset;
  boolean stop = false;

  public ForkLastTokenFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (stop) {
      return false;
    } else if (input.incrementToken()) {
      lastToken = captureState();
      maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
      return true;
    } else if (lastToken == null) {
      return false;
    } else {

      // TODO: this is iffy!!!  maybe somehow instead caller
      // could tell us endOffset up front?
      input.end();

      if (offsetAtt.endOffset() == maxEndOffset) {
        // Text did not see end of token char:
        restoreState(lastToken);
        keywordAtt.setKeyword(true);
        posIncAtt.setPositionIncrement(0);
        lastToken = null;
        stop = true;
        return true;
      } else {
        return false;
      }
    }
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    lastToken = null;
    maxEndOffset = -1;
    stop = false;
  }
}
@@ -1,131 +0,0 @@
package org.apache.lucene.search.suggest.analyzing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Arrays;
import java.util.List;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.util.Version;

/**
 * Removes stop words from a token stream; if the
 * {@link KeywordAttribute} is set, the word is not removed.
 *
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating StopFilter:
 * <ul>
 *   <li> As of 3.1, StopFilter correctly handles Unicode 4.0
 *        supplementary characters in stopwords, and position
 *        increments are preserved
 * </ul>
 */
final class StopKeywordFilter extends FilteringTokenFilter {

  private final CharArraySet stopWords;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);

  /**
   * Constructs a filter which removes words from the input TokenStream that are
   * named in the Set.
   *
   * @param matchVersion
   *          Lucene version to enable correct Unicode 4.0 behavior in the stop
   *          set if Version > 3.0.  See <a href="#version">above</a> for details.
   * @param in
   *          Input stream
   * @param stopWords
   *          A {@link CharArraySet} representing the stopwords.
   * @see #makeStopSet(Version, java.lang.String...)
   */
  public StopKeywordFilter(Version matchVersion, TokenStream in, CharArraySet stopWords) {
    super(matchVersion, in);
    this.stopWords = stopWords;
  }

  /**
   * Builds a Set from an array of stop words,
   * appropriate for passing into the StopFilter constructor.
   * This permits this stopWords construction to be cached once when
   * an Analyzer is constructed.
   *
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords An array of stopwords
   * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
   */
  public static CharArraySet makeStopSet(Version matchVersion, String... stopWords) {
    return makeStopSet(matchVersion, stopWords, false);
  }

  /**
   * Builds a Set from an array of stop words,
   * appropriate for passing into the StopFilter constructor.
   * This permits this stopWords construction to be cached once when
   * an Analyzer is constructed.
   *
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
   * @return A Set ({@link CharArraySet}) containing the words
   * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
   */
  public static CharArraySet makeStopSet(Version matchVersion, List<?> stopWords) {
    return makeStopSet(matchVersion, stopWords, false);
  }

  /**
   * Creates a stopword set from the given stopword array.
   *
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords An array of stopwords
   * @param ignoreCase If true, all words are lower cased first.
   * @return a Set containing the words
   */
  public static CharArraySet makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) {
    CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.length, ignoreCase);
    stopSet.addAll(Arrays.asList(stopWords));
    return stopSet;
  }

  /**
   * Creates a stopword set from the given stopword list.
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
   * @param ignoreCase if true, all words are lower cased first
   * @return A Set ({@link CharArraySet}) containing the words
   */
  public static CharArraySet makeStopSet(Version matchVersion, List<?> stopWords, boolean ignoreCase) {
    CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.size(), ignoreCase);
    stopSet.addAll(stopWords);
    return stopSet;
  }

  /**
   * Returns the next input Token whose term() is not a stop word.
   */
  @Override
  protected boolean accept() {
    return keywordAtt.isKeyword() || !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
  }
}
@@ -0,0 +1,140 @@
package org.apache.lucene.search.suggest.analyzing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;

public class TestSuggestStopFilter extends BaseTokenStreamTestCase {

  public void testEndNotStopWord() throws Exception {
    CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to");
    TokenStream stream = new MockTokenizer(new StringReader("go to"));
    TokenStream filter = new SuggestStopFilter(stream, stopWords);
    // No separator after the trailing "to", so it is kept and marked keyword.
    // Args: terms, startOffsets, endOffsets, types, posIncrements,
    // posLengths, finalOffset, keywordAtts, offsetsAreCorrect.
    assertTokenStreamContents(filter,
                              new String[] {"go", "to"},
                              new int[] {0, 3},
                              new int[] {2, 5},
                              null,
                              new int[] {1, 1},
                              null,
                              5,
                              new boolean[] {false, true},
                              true);
  }

  public void testEndIsStopWord() throws Exception {
    CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to");
    TokenStream stream = new MockTokenizer(new StringReader("go to "));
    TokenStream filter = new SuggestStopFilter(stream, stopWords);
    // The trailing space means "to" is a completed stopword: it is removed.
    assertTokenStreamContents(filter,
                              new String[] {"go"},
                              new int[] {0},
                              new int[] {2},
                              null,
                              new int[] {1},
                              null,
                              6,
                              new boolean[] {false},
                              true);
  }

  public void testMidStopWord() throws Exception {
    CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to");
    TokenStream stream = new MockTokenizer(new StringReader("go to school"));
    TokenStream filter = new SuggestStopFilter(stream, stopWords);
    // A mid-stream stopword is always removed; "school" gets posInc 2.
    assertTokenStreamContents(filter,
                              new String[] {"go", "school"},
                              new int[] {0, 6},
                              new int[] {2, 12},
                              null,
                              new int[] {1, 2},
                              null,
                              12,
                              new boolean[] {false, false},
                              true);
  }

  public void testMultipleStopWords() throws Exception {
    CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to", "the", "a");
    TokenStream stream = new MockTokenizer(new StringReader("go to a the school"));
    TokenStream filter = new SuggestStopFilter(stream, stopWords);
    assertTokenStreamContents(filter,
                              new String[] {"go", "school"},
                              new int[] {0, 12},
                              new int[] {2, 18},
                              null,
                              new int[] {1, 4},
                              null,
                              18,
                              new boolean[] {false, false},
                              true);
  }

  public void testMultipleStopWordsEnd() throws Exception {
    CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to", "the", "a");
    TokenStream stream = new MockTokenizer(new StringReader("go to a the"));
    TokenStream filter = new SuggestStopFilter(stream, stopWords);
    // The final "the" has no separator after it, so it survives as a keyword;
    // posInc 3 accounts for the dropped "to a".
    assertTokenStreamContents(filter,
                              new String[] {"go", "the"},
                              new int[] {0, 8},
                              new int[] {2, 11},
                              null,
                              new int[] {1, 3},
                              null,
                              11,
                              new boolean[] {false, true},
                              true);
  }

  public void testMultipleStopWordsEnd2() throws Exception {
    CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to", "the", "a");
    TokenStream stream = new MockTokenizer(new StringReader("go to a the "));
    TokenStream filter = new SuggestStopFilter(stream, stopWords);
    // Trailing space: even the final "the" is a completed stopword and is removed.
    assertTokenStreamContents(filter,
                              new String[] {"go"},
                              new int[] {0},
                              new int[] {2},
                              null,
                              new int[] {1},
                              null,
                              12,
                              new boolean[] {false},
                              true);
  }
}
@@ -111,7 +111,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
   //   arriving to pos Y have the same endOffset)
   // - offsets only move forwards (startOffset >=
   //   lastStartOffset)
-  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset,
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
+                                               int posLengths[], Integer finalOffset, boolean[] keywordAtts,
                                                boolean offsetsAreCorrect) throws IOException {
     assertNotNull(output);
     CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
@@ -145,6 +146,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
       posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
     }
+
+    KeywordAttribute keywordAtt = null;
+    if (keywordAtts != null) {
+      assertTrue("has no KeywordAttribute", ts.hasAttribute(KeywordAttribute.class));
+      keywordAtt = ts.getAttribute(KeywordAttribute.class);
+    }
 
     // Maps position to the start/end offset:
     final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
@@ -161,22 +168,31 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       if (typeAtt != null) typeAtt.setType("bogusType");
       if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
       if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
+      if (keywordAtt != null) keywordAtt.setKeyword((i&1) == 0);
 
       checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
       assertTrue("token "+i+" does not exist", ts.incrementToken());
       assertTrue("clearAttributes() was not called correctly in TokenStream chain", checkClearAtt.getAndResetClearCalled());
 
       assertEquals("term "+i, output[i], termAtt.toString());
-      if (startOffsets != null)
+      if (startOffsets != null) {
         assertEquals("startOffset "+i, startOffsets[i], offsetAtt.startOffset());
-      if (endOffsets != null)
+      }
+      if (endOffsets != null) {
         assertEquals("endOffset "+i, endOffsets[i], offsetAtt.endOffset());
-      if (types != null)
+      }
+      if (types != null) {
         assertEquals("type "+i, types[i], typeAtt.type());
-      if (posIncrements != null)
+      }
+      if (posIncrements != null) {
         assertEquals("posIncrement "+i, posIncrements[i], posIncrAtt.getPositionIncrement());
-      if (posLengths != null)
+      }
+      if (posLengths != null) {
         assertEquals("posLength "+i, posLengths[i], posLengthAtt.getPositionLength());
+      }
+      if (keywordAtts != null) {
+        assertEquals("keywordAtt " + i, keywordAtts[i], keywordAtt.isKeyword());
+      }
 
       // we can enforce some basic things about a few attributes even if the caller doesn't check:
       if (offsetAtt != null) {
@@ -239,7 +255,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
       }
     }
-    assertFalse("TokenStream has more tokens than expected (expected count=" + output.length + ")", ts.incrementToken());
+    if (ts.incrementToken()) {
+      fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token=" + termAtt.toString());
+    }
     ts.end();
     if (finalOffset != null) {
       assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
@@ -250,6 +268,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     ts.close();
   }
 
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean offsetsAreCorrect) throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, offsetsAreCorrect);
+  }
+
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
     assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, true);
   }