mirror of https://github.com/apache/lucene.git

LUCENE-3431: Removed deprecated addStopWords methods in QueryAutoStopWordAnalyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1170424 13f79535-47bb-0310-9956-ffa450edef68

parent d6ab323d4e
commit 94028fe11a
@@ -34,6 +34,10 @@ API Changes
 
 * LUCENE-3400: Removed DutchAnalyzer.setStemDictionary (Chris Male)
 
+* LUCENE-3431: Removed QueryAutoStopWordAnalyzer.addStopWords* deprecated methods
+  since they prevented reuse. Stopwords are now generated at instantiation through
+  the Analyzer's constructors. (Chris Male)
+
 New Features
 
 * LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer
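A minimal before/after sketch of the API change described in the entry above, assuming an open IndexReader named reader and a delegate Analyzer (the variable names are illustrative, not taken from this commit):

    // Before (deprecated; the mutable stopword set prevented analyzer reuse):
    //   QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(matchVersion, delegate);
    //   a.addStopWords(reader, QueryAutoStopWordAnalyzer.defaultMaxDocFreqPercent);
    // After LUCENE-3431 (stopwords generated once, at instantiation):
    QueryAutoStopWordAnalyzer a =
        new QueryAutoStopWordAnalyzer(matchVersion, delegate, reader);  // uses defaultMaxDocFreqPercent (0.4f)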
@@ -41,131 +41,130 @@ import java.util.*;
  * a 38 million doc index which had a term in around 50% of docs and was causing TermQueries for
  * this term to take 2 seconds.
  * </p>
- * <p>
- * Use the various "addStopWords" methods in this class to automate the identification and addition of
- * stop words found in an already existing index.
- * </p>
  */
 public final class QueryAutoStopWordAnalyzer extends Analyzer {
-  Analyzer delegate;
-  HashMap<String,HashSet<String>> stopWordsPerField = new HashMap<String,HashSet<String>>();
+
+  private final Analyzer delegate;
+  private final Map<String, Set<String>> stopWordsPerField = new HashMap<String, Set<String>>();
   //The default maximum percentage (40%) of index documents which
   //can contain a term, after which the term is considered to be a stop word.
   public static final float defaultMaxDocFreqPercent = 0.4f;
   private final Version matchVersion;
 
   /**
-   * Initializes this analyzer with the Analyzer object that actually produces the tokens
+   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
+   * indexed fields from terms with a document frequency percentage greater than
+   * {@link #defaultMaxDocFreqPercent}
    *
-   * @param delegate The choice of {@link Analyzer} that is used to produce the token stream which needs filtering
+   * @param matchVersion Version to be used in {@link StopFilter}
+   * @param delegate Analyzer whose TokenStream will be filtered
+   * @param indexReader IndexReader to identify the stopwords from
+   * @throws IOException Can be thrown while reading from the IndexReader
    */
-  public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer delegate) {
-    this.delegate = delegate;
+  public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
+      Analyzer delegate,
+      IndexReader indexReader) throws IOException {
+    this(matchVersion, delegate, indexReader, defaultMaxDocFreqPercent);
+  }
+
+  /**
+   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
+   * indexed fields from terms with a document frequency greater than the given
+   * maxDocFreq
+   *
+   * @param matchVersion Version to be used in {@link StopFilter}
+   * @param delegate Analyzer whose TokenStream will be filtered
+   * @param indexReader IndexReader to identify the stopwords from
+   * @param maxDocFreq Document frequency terms should be above in order to be stopwords
+   * @throws IOException Can be thrown while reading from the IndexReader
+   */
+  public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
+      Analyzer delegate,
+      IndexReader indexReader,
+      int maxDocFreq) throws IOException {
+    this(matchVersion, delegate, indexReader, indexReader.getFieldNames(IndexReader.FieldOption.INDEXED), maxDocFreq);
+  }
+
+  /**
+   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
+   * indexed fields from terms with a document frequency percentage greater than
+   * the given maxPercentDocs
+   *
+   * @param matchVersion Version to be used in {@link StopFilter}
+   * @param delegate Analyzer whose TokenStream will be filtered
+   * @param indexReader IndexReader to identify the stopwords from
+   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
+   *                       contain a term, after which the word is considered to be a stop word
+   * @throws IOException Can be thrown while reading from the IndexReader
+   */
+  public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
+      Analyzer delegate,
+      IndexReader indexReader,
+      float maxPercentDocs) throws IOException {
+    this(matchVersion, delegate, indexReader, indexReader.getFieldNames(IndexReader.FieldOption.INDEXED), maxPercentDocs);
+  }
+
+  /**
+   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
+   * given selection of fields from terms with a document frequency percentage
+   * greater than the given maxPercentDocs
+   *
+   * @param matchVersion Version to be used in {@link StopFilter}
+   * @param delegate Analyzer whose TokenStream will be filtered
+   * @param indexReader IndexReader to identify the stopwords from
+   * @param fields Selection of fields to calculate stopwords for
+   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
+   *                       contain a term, after which the word is considered to be a stop word
+   * @throws IOException Can be thrown while reading from the IndexReader
+   */
+  public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
+      Analyzer delegate,
+      IndexReader indexReader,
+      Collection<String> fields,
+      float maxPercentDocs) throws IOException {
+    this(matchVersion, delegate, indexReader, fields, (int) (indexReader.numDocs() * maxPercentDocs));
+  }
+
+  /**
+   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
+   * given selection of fields from terms with a document frequency greater than
+   * the given maxDocFreq
+   *
+   * @param matchVersion Version to be used in {@link StopFilter}
+   * @param delegate Analyzer whose TokenStream will be filtered
+   * @param indexReader IndexReader to identify the stopwords from
+   * @param fields Selection of fields to calculate stopwords for
+   * @param maxDocFreq Document frequency terms should be above in order to be stopwords
+   * @throws IOException Can be thrown while reading from the IndexReader
+   */
+  public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
+      Analyzer delegate,
+      IndexReader indexReader,
+      Collection<String> fields,
+      int maxDocFreq) throws IOException {
     this.matchVersion = matchVersion;
-  }
-
-  /**
-   * Automatically adds stop words for all fields with terms exceeding the defaultMaxDocFreqPercent
-   *
-   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
-   *               exceed the required document frequency
-   * @return The number of stop words identified.
-   * @throws IOException
-   */
-  public int addStopWords(IndexReader reader) throws IOException {
-    return addStopWords(reader, defaultMaxDocFreqPercent);
-  }
-
-  /**
-   * Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
-   *
-   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
-   *               exceed the required document frequency
-   * @param maxDocFreq The maximum number of index documents which can contain a term, after which
-   *                   the term is considered to be a stop word
-   * @return The number of stop words identified.
-   * @throws IOException
-   */
-  public int addStopWords(IndexReader reader, int maxDocFreq) throws IOException {
-    int numStopWords = 0;
-    Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
-    for (Iterator<String> iter = fieldNames.iterator(); iter.hasNext();) {
-      String fieldName = iter.next();
-      numStopWords += addStopWords(reader, fieldName, maxDocFreq);
-    }
-    return numStopWords;
-  }
-
-  /**
-   * Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
-   *
-   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
-   *               exceed the required document frequency
-   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
-   *                       contain a term, after which the word is considered to be a stop word.
-   * @return The number of stop words identified.
-   * @throws IOException
-   */
-  public int addStopWords(IndexReader reader, float maxPercentDocs) throws IOException {
-    int numStopWords = 0;
-    Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
-    for (Iterator<String> iter = fieldNames.iterator(); iter.hasNext();) {
-      String fieldName = iter.next();
-      numStopWords += addStopWords(reader, fieldName, maxPercentDocs);
-    }
-    return numStopWords;
-  }
-
-  /**
-   * Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
-   *
-   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
-   *               exceed the required document frequency
-   * @param fieldName The field for which stopwords will be added
-   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
-   *                       contain a term, after which the word is considered to be a stop word.
-   * @return The number of stop words identified.
-   * @throws IOException
-   */
-  public int addStopWords(IndexReader reader, String fieldName, float maxPercentDocs) throws IOException {
-    return addStopWords(reader, fieldName, (int) (reader.numDocs() * maxPercentDocs));
-  }
-
-  /**
-   * Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
-   *
-   * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
-   *               exceed the required document frequency
-   * @param fieldName The field for which stopwords will be added
-   * @param maxDocFreq The maximum number of index documents which
-   *                   can contain a term, after which the term is considered to be a stop word.
-   * @return The number of stop words identified.
-   * @throws IOException
-   */
-  public int addStopWords(IndexReader reader, String fieldName, int maxDocFreq) throws IOException {
-    HashSet<String> stopWords = new HashSet<String>();
-    final Terms terms = MultiFields.getTerms(reader, fieldName);
-    final CharsRef spare = new CharsRef();
-    if (terms != null) {
-      final TermsEnum te = terms.iterator();
-      BytesRef text;
-      while ((text = te.next()) != null) {
-        if (te.docFreq() > maxDocFreq) {
-          stopWords.add(text.utf8ToChars(spare).toString());
-        }
-      }
-    }
-    stopWordsPerField.put(fieldName, stopWords);
-
-    /* if the stopwords for a field are changed,
-     * then saved streams for that field are erased.
-     */
-    @SuppressWarnings("unchecked")
-    Map<String,SavedStreams> streamMap = (Map<String,SavedStreams>) getPreviousTokenStream();
-    if (streamMap != null)
-      streamMap.remove(fieldName);
-
-    return stopWords.size();
+    this.delegate = delegate;
+
+    for (String field : fields) {
+      Set<String> stopWords = new HashSet<String>();
+      Terms terms = MultiFields.getTerms(indexReader, field);
+      CharsRef spare = new CharsRef();
+      if (terms != null) {
+        TermsEnum te = terms.iterator();
+        BytesRef text;
+        while ((text = te.next()) != null) {
+          if (te.docFreq() > maxDocFreq) {
+            stopWords.add(text.utf8ToChars(spare).toString());
+          }
+        }
+      }
+      stopWordsPerField.put(field, stopWords);
+    }
   }
 
   @Override
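The percentage-based constructors above reduce to the absolute-frequency form via (int) (indexReader.numDocs() * maxPercentDocs). A worked example with illustrative numbers, not values from this commit:

    // Suppose the index holds 1000 documents and maxPercentDocs = 0.4f:
    int maxDocFreq = (int) (1000 * 0.4f);  // = 400
    // In the constructor body above, a term becomes a stop word when
    // te.docFreq() > maxDocFreq, i.e. when it occurs in more than 400
    // documents (more than 40% of the index).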
@@ -176,7 +175,7 @@ public final class QueryAutoStopWordAnalyzer extends Analyzer {
     } catch (IOException e) {
       result = delegate.tokenStream(fieldName, reader);
     }
-    HashSet<String> stopWords = stopWordsPerField.get(fieldName);
+    Set<String> stopWords = stopWordsPerField.get(fieldName);
     if (stopWords != null) {
       result = new StopFilter(matchVersion, result, stopWords);
     }
@@ -193,12 +192,11 @@ public final class QueryAutoStopWordAnalyzer extends Analyzer {
    */
     TokenStream withStopFilter;
   }
 
-  @SuppressWarnings("unchecked")
   @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
+  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
     /* map of SavedStreams for each field */
+    @SuppressWarnings("unchecked")
     Map<String,SavedStreams> streamMap = (Map<String,SavedStreams>) getPreviousTokenStream();
     if (streamMap == null) {
       streamMap = new HashMap<String, SavedStreams>();
@@ -213,31 +211,32 @@ public final class QueryAutoStopWordAnalyzer extends Analyzer {
       streams.wrapped = delegate.reusableTokenStream(fieldName, reader);
 
       /* if there are any stopwords for the field, save the stopfilter */
-      HashSet<String> stopWords = stopWordsPerField.get(fieldName);
-      if (stopWords != null)
+      Set<String> stopWords = stopWordsPerField.get(fieldName);
+      if (stopWords != null) {
         streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
-      else
+      } else {
         streams.withStopFilter = streams.wrapped;
+      }
 
     } else {
       /*
-      * an entry for this field exists, verify the wrapped stream has not
-      * changed. if it has not, reuse it, otherwise wrap the new stream.
-      */
+       * an entry for this field exists, verify the wrapped stream has not
+       * changed. if it has not, reuse it, otherwise wrap the new stream.
+       */
       TokenStream result = delegate.reusableTokenStream(fieldName, reader);
       if (result == streams.wrapped) {
         /* the wrapped analyzer reused the stream */
       } else {
         /*
-        * the wrapped analyzer did not. if there are any stopwords for the
-        * field, create a new StopFilter around the new stream
-        */
+         * the wrapped analyzer did not. if there are any stopwords for the
+         * field, create a new StopFilter around the new stream
+         */
         streams.wrapped = result;
-        HashSet<String> stopWords = stopWordsPerField.get(fieldName);
-        if (stopWords != null)
+        Set<String> stopWords = stopWordsPerField.get(fieldName);
+        if (stopWords != null) {
           streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
-        else
+        } else {
           streams.withStopFilter = streams.wrapped;
+        }
       }
     }
 
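The hunk above is the reuse machinery that the removed addStopWords methods used to invalidate: reusableTokenStream caches one SavedStreams pair per field. A sketch of the caller-visible behaviour, assuming an analyzer built as in the tests below (illustrative, not part of the diff):

    TokenStream first = analyzer.reusableTokenStream("repetitiveField", new StringReader("boring"));
    TokenStream second = analyzer.reusableTokenStream("repetitiveField", new StringReader("quick"));
    // "second" is normally the same cached instance as "first", reset to the
    // new Reader. Because stopwords are now fixed at construction time, the
    // cache can no longer be invalidated by a later addStopWords call.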
@@ -252,14 +251,8 @@ public final class QueryAutoStopWordAnalyzer extends Analyzer {
    * @return the stop words identified for a field
    */
   public String[] getStopWords(String fieldName) {
-    String[] result;
-    HashSet<String> stopWords = stopWordsPerField.get(fieldName);
-    if (stopWords != null) {
-      result = stopWords.toArray(new String[stopWords.size()]);
-    } else {
-      result = new String[0];
-    }
-    return result;
+    Set<String> stopWords = stopWordsPerField.get(fieldName);
+    return stopWords != null ? stopWords.toArray(new String[stopWords.size()]) : new String[0];
   }
 
   /**
@@ -268,12 +261,10 @@ public final class QueryAutoStopWordAnalyzer extends Analyzer {
    * @return the stop words (as terms)
    */
   public Term[] getStopWords() {
-    ArrayList<Term> allStopWords = new ArrayList<Term>();
-    for (Iterator<String> iter = stopWordsPerField.keySet().iterator(); iter.hasNext();) {
-      String fieldName = iter.next();
-      HashSet<String> stopWords = stopWordsPerField.get(fieldName);
-      for (Iterator<String> iterator = stopWords.iterator(); iterator.hasNext();) {
-        String text = iterator.next();
+    List<Term> allStopWords = new ArrayList<Term>();
+    for (String fieldName : stopWordsPerField.keySet()) {
+      Set<String> stopWords = stopWordsPerField.get(fieldName);
+      for (String text : stopWords) {
         allStopWords.add(new Term(fieldName, text));
       }
     }
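Taken together, the reworked class is used as below; a hedged sketch in which the Directory, field name, and threshold are assumptions rather than values from this commit:

    IndexReader reader = IndexReader.open(directory, true);  // "directory" assumed to exist
    Analyzer delegate = new StandardAnalyzer(matchVersion);  // any delegate analyzer
    Analyzer analyzer = new QueryAutoStopWordAnalyzer(
        matchVersion, delegate, reader, Arrays.asList("body"), 0.3f);
    // Terms appearing in more than 30% of the documents in "body" are dropped
    // from the token streams this analyzer produces, e.g. at query-parsing time:
    TokenStream ts = analyzer.tokenStream("body", new StringReader("the rare term"));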
@@ -16,23 +16,19 @@ package org.apache.lucene.analysis.query;
  * limitations under the License.
  */
 
-import java.io.Reader;
-import java.io.StringReader;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.*;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.store.RAMDirectory;
 
+import java.io.StringReader;
+import java.util.Arrays;
+import java.util.Collections;
+
 public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
   String variedFieldValues[] = {"the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "boring", "dog"};
   String repetitiveFieldValues[] = {"boring", "boring", "vaguelyboring"};
@@ -58,7 +54,6 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
     }
     writer.close();
     reader = IndexReader.open(dir, true);
-    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer);
   }
 
   @Override
@@ -67,9 +62,9 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
     super.tearDown();
   }
 
-  public void testUninitializedAnalyzer() throws Exception {
-    // Note: no calls to "addStopWord"
-    // query = "variedField:quick repetitiveField:boring";
+  public void testNoStopwords() throws Exception {
+    // Note: an empty list of fields passed in
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Collections.EMPTY_LIST, 1);
     TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("variedField", new StringReader("quick"));
     assertTokenStreamContents(protectedTokenStream, new String[]{"quick"});
 
@@ -77,21 +72,14 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
     assertTokenStreamContents(protectedTokenStream, new String[]{"boring"});
   }
 
-  /*
-   * Test method for 'org.apache.lucene.analysis.QueryAutoStopWordAnalyzer.addStopWords(IndexReader)'
-   */
-  public void testDefaultAddStopWordsIndexReader() throws Exception {
-    protectedAnalyzer.addStopWords(reader);
+  public void testDefaultStopwordsAllFields() throws Exception {
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader);
     TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring"));
-
     assertTokenStreamContents(protectedTokenStream, new String[0]); // Default stop word filtering will remove boring
   }
 
-  /*
-   * Test method for 'org.apache.lucene.analysis.QueryAutoStopWordAnalyzer.addStopWords(IndexReader, int)'
-   */
-  public void testAddStopWordsIndexReaderInt() throws Exception {
-    protectedAnalyzer.addStopWords(reader, 1f / 2f);
+  public void testStopwordsAllFieldsMaxPercentDocs() throws Exception {
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, 1f / 2f);
+
     TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring"));
     // A filter on terms in > one half of docs remove boring
|
@ -101,39 +89,36 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
|
|||
// A filter on terms in > half of docs should not remove vaguelyBoring
|
||||
assertTokenStreamContents(protectedTokenStream, new String[]{"vaguelyboring"});
|
||||
|
||||
protectedAnalyzer.addStopWords(reader, 1f / 4f);
|
||||
protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, 1f / 4f);
|
||||
protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("vaguelyboring"));
|
||||
// A filter on terms in > quarter of docs should remove vaguelyBoring
|
||||
assertTokenStreamContents(protectedTokenStream, new String[0]);
|
||||
}
|
||||
|
||||
public void testAddStopWordsIndexReaderStringFloat() throws Exception {
|
||||
protectedAnalyzer.addStopWords(reader, "variedField", 1f / 2f);
|
||||
public void testStopwordsPerFieldMaxPercentDocs() throws Exception {
|
||||
protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("variedField"), 1f / 2f);
|
||||
TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring"));
|
||||
// A filter on one Field should not affect queries on another
|
||||
assertTokenStreamContents(protectedTokenStream, new String[]{"boring"});
|
||||
|
||||
protectedAnalyzer.addStopWords(reader, "repetitiveField", 1f / 2f);
|
||||
protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("variedField", "repetitiveField"), 1f / 2f);
|
||||
protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring"));
|
||||
// A filter on the right Field should affect queries on it
|
||||
assertTokenStreamContents(protectedTokenStream, new String[0]);
|
||||
}
|
||||
|
||||
public void testAddStopWordsIndexReaderStringInt() throws Exception {
|
||||
int numStopWords = protectedAnalyzer.addStopWords(reader, "repetitiveField", 10);
|
||||
public void testStopwordsPerFieldMaxDocFreq() throws Exception {
|
||||
protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("repetitiveField"), 10);
|
||||
int numStopWords = protectedAnalyzer.getStopWords("repetitiveField").length;
|
||||
assertTrue("Should have identified stop words", numStopWords > 0);
|
||||
|
||||
Term[] t = protectedAnalyzer.getStopWords();
|
||||
assertEquals("num terms should = num stopwords returned", t.length, numStopWords);
|
||||
|
||||
int numNewStopWords = protectedAnalyzer.addStopWords(reader, "variedField", 10);
|
||||
assertTrue("Should have identified more stop words", numNewStopWords > 0);
|
||||
t = protectedAnalyzer.getStopWords();
|
||||
assertEquals("num terms should = num stopwords returned", t.length, numStopWords + numNewStopWords);
|
||||
protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("repetitiveField", "variedField"), 10);
|
||||
int numNewStopWords = protectedAnalyzer.getStopWords("repetitiveField").length + protectedAnalyzer.getStopWords("variedField").length;
|
||||
assertTrue("Should have identified more stop words", numNewStopWords > numStopWords);
|
||||
}
|
||||
|
||||
public void testNoFieldNamePollution() throws Exception {
|
||||
protectedAnalyzer.addStopWords(reader, "repetitiveField", 10);
|
||||
protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.asList("repetitiveField"), 10);
|
||||
|
||||
TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring"));
|
||||
// Check filter set up OK
|
||||
|
@@ -145,8 +130,9 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
   }
 
   public void testTokenStream() throws Exception {
-    QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
-    a.addStopWords(reader, 10);
+    QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(
+        TEST_VERSION_CURRENT,
+        new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), reader, 10);
     TokenStream ts = a.tokenStream("repetitiveField", new StringReader("this boring"));
     assertTokenStreamContents(ts, new String[] { "this" });
   }