mirror of https://github.com/apache/lucene.git
LUCENE-1688: Deprecate static final String stop word array in and StopAnalzyer and replace it with an immutable implementation of CharArraySet.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@794078 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
6bf4d35ce8
commit
ea7e4ad344
10
CHANGES.txt
10
CHANGES.txt
|
@ -309,6 +309,11 @@ API Changes
|
|||
all synchronization in TermInfosReader, which previously could
|
||||
cause threads to pile up in certain cases. (Dan Rosher via Mike
|
||||
McCandless)
|
||||
|
||||
30. LUCENE-1688: Deprecate static final String stop word array in and
|
||||
StopAnalzyer and replace it with an immutable implementation of
|
||||
CharArraySet. (Simon Willnauer via Mark Miller)
|
||||
|
||||
|
||||
Bug fixes
|
||||
|
||||
|
@ -604,6 +609,11 @@ Optimizations
|
|||
9. LUCENE-1653: Avoid creating a Calendar in every call to
|
||||
DateTools#dateToString, DateTools#timeToString and
|
||||
DateTools#round. (Shai Erera via Mark Miller)
|
||||
|
||||
10. LUCENE-1688: Deprecate static final String stop word array and
|
||||
replace it with an immutable implementation of CharArraySet.
|
||||
Removes conversions between Set and array.
|
||||
(Simon Willnauer via Mark Miller)
|
||||
|
||||
Documentation
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ public class ThaiAnalyzer extends Analyzer {
|
|||
TokenStream ts = new StandardTokenizer(reader);
|
||||
ts = new StandardFilter(ts);
|
||||
ts = new ThaiWordFilter(ts);
|
||||
ts = new StopFilter(ts, StopAnalyzer.ENGLISH_STOP_WORDS);
|
||||
ts = new StopFilter(ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
return ts;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,9 +23,11 @@ import java.io.Reader;
|
|||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
import javax.xml.parsers.DocumentBuilder;
|
||||
|
@ -982,7 +984,8 @@ public class HighlighterTest extends TestCase implements Formatter {
|
|||
|
||||
public void run() throws Exception {
|
||||
String goodWord = "goodtoken";
|
||||
String stopWords[] = { "stoppedtoken" };
|
||||
Set stopWords = new HashSet(1);
|
||||
stopWords.add("stoppedtoken");
|
||||
|
||||
TermQuery query = new TermQuery(new Term("data", goodWord));
|
||||
|
||||
|
@ -991,7 +994,8 @@ public class HighlighterTest extends TestCase implements Formatter {
|
|||
sb.append(goodWord);
|
||||
for (int i = 0; i < 10000; i++) {
|
||||
sb.append(" ");
|
||||
sb.append(stopWords[0]);
|
||||
// only one stopword
|
||||
sb.append(stopWords.iterator().next());
|
||||
}
|
||||
SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
|
||||
Highlighter hg = getHighlighter(query, "data", new StandardAnalyzer(stopWords).tokenStream(
|
||||
|
@ -1024,7 +1028,9 @@ public class HighlighterTest extends TestCase implements Formatter {
|
|||
public void testMaxSizeEndHighlight() throws Exception {
|
||||
TestHighlightRunner helper = new TestHighlightRunner() {
|
||||
public void run() throws Exception {
|
||||
String stopWords[] = { "in", "it" };
|
||||
Set stopWords = new HashSet();
|
||||
stopWords.add("in");
|
||||
stopWords.add("it");
|
||||
TermQuery query = new TermQuery(new Term("text", "searchterm"));
|
||||
|
||||
String text = "this is a text with searchterm in it";
|
||||
|
|
|
@ -70,55 +70,60 @@ public class PatternAnalyzer extends Analyzer {
|
|||
/** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */
|
||||
public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
|
||||
|
||||
private static final Set EXTENDED_ENGLISH_STOP_WORDS = makeStopSet(new String[] {
|
||||
"a", "about", "above", "across", "adj", "after", "afterwards",
|
||||
"again", "against", "albeit", "all", "almost", "alone", "along",
|
||||
"already", "also", "although", "always", "among", "amongst", "an",
|
||||
"and", "another", "any", "anyhow", "anyone", "anything",
|
||||
"anywhere", "are", "around", "as", "at", "be", "became", "because",
|
||||
"become", "becomes", "becoming", "been", "before", "beforehand",
|
||||
"behind", "being", "below", "beside", "besides", "between",
|
||||
"beyond", "both", "but", "by", "can", "cannot", "co", "could",
|
||||
"down", "during", "each", "eg", "either", "else", "elsewhere",
|
||||
"enough", "etc", "even", "ever", "every", "everyone", "everything",
|
||||
"everywhere", "except", "few", "first", "for", "former",
|
||||
"formerly", "from", "further", "had", "has", "have", "he", "hence",
|
||||
"her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
|
||||
"herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
|
||||
"in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
|
||||
"latter", "latterly", "least", "less", "ltd", "many", "may", "me",
|
||||
"meanwhile", "might", "more", "moreover", "most", "mostly", "much",
|
||||
"must", "my", "myself", "namely", "neither", "never",
|
||||
"nevertheless", "next", "no", "nobody", "none", "noone", "nor",
|
||||
"not", "nothing", "now", "nowhere", "of", "off", "often", "on",
|
||||
"once one", "only", "onto", "or", "other", "others", "otherwise",
|
||||
"our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
|
||||
"rather", "s", "same", "seem", "seemed", "seeming", "seems",
|
||||
"several", "she", "should", "since", "so", "some", "somehow",
|
||||
"someone", "something", "sometime", "sometimes", "somewhere",
|
||||
"still", "such", "t", "than", "that", "the", "their", "them",
|
||||
"themselves", "then", "thence", "there", "thereafter", "thereby",
|
||||
"therefor", "therein", "thereupon", "these", "they", "this",
|
||||
"those", "though", "through", "throughout", "thru", "thus", "to",
|
||||
"together", "too", "toward", "towards", "under", "until", "up",
|
||||
"upon", "us", "very", "via", "was", "we", "well", "were", "what",
|
||||
"whatever", "whatsoever", "when", "whence", "whenever",
|
||||
"whensoever", "where", "whereafter", "whereas", "whereat",
|
||||
"whereby", "wherefrom", "wherein", "whereinto", "whereof",
|
||||
"whereon", "whereto", "whereunto", "whereupon", "wherever",
|
||||
"wherewith", "whether", "which", "whichever", "whichsoever",
|
||||
"while", "whilst", "whither", "who", "whoever", "whole", "whom",
|
||||
"whomever", "whomsoever", "whose", "whosoever", "why", "will",
|
||||
"with", "within", "without", "would", "xsubj", "xcal", "xauthor",
|
||||
"xother ", "xnote", "yet", "you", "your", "yours", "yourself",
|
||||
"yourselves"});
|
||||
private static final Set EXTENDED_ENGLISH_STOP_WORDS;
|
||||
static {
|
||||
EXTENDED_ENGLISH_STOP_WORDS = new HashSet();
|
||||
|
||||
EXTENDED_ENGLISH_STOP_WORDS.addAll(Arrays.asList(new String[] {
|
||||
"a", "about", "above", "across", "adj", "after", "afterwards",
|
||||
"again", "against", "albeit", "all", "almost", "alone", "along",
|
||||
"already", "also", "although", "always", "among", "amongst", "an",
|
||||
"and", "another", "any", "anyhow", "anyone", "anything",
|
||||
"anywhere", "are", "around", "as", "at", "be", "became", "because",
|
||||
"become", "becomes", "becoming", "been", "before", "beforehand",
|
||||
"behind", "being", "below", "beside", "besides", "between",
|
||||
"beyond", "both", "but", "by", "can", "cannot", "co", "could",
|
||||
"down", "during", "each", "eg", "either", "else", "elsewhere",
|
||||
"enough", "etc", "even", "ever", "every", "everyone", "everything",
|
||||
"everywhere", "except", "few", "first", "for", "former",
|
||||
"formerly", "from", "further", "had", "has", "have", "he", "hence",
|
||||
"her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
|
||||
"herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
|
||||
"in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
|
||||
"latter", "latterly", "least", "less", "ltd", "many", "may", "me",
|
||||
"meanwhile", "might", "more", "moreover", "most", "mostly", "much",
|
||||
"must", "my", "myself", "namely", "neither", "never",
|
||||
"nevertheless", "next", "no", "nobody", "none", "noone", "nor",
|
||||
"not", "nothing", "now", "nowhere", "of", "off", "often", "on",
|
||||
"once one", "only", "onto", "or", "other", "others", "otherwise",
|
||||
"our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
|
||||
"rather", "s", "same", "seem", "seemed", "seeming", "seems",
|
||||
"several", "she", "should", "since", "so", "some", "somehow",
|
||||
"someone", "something", "sometime", "sometimes", "somewhere",
|
||||
"still", "such", "t", "than", "that", "the", "their", "them",
|
||||
"themselves", "then", "thence", "there", "thereafter", "thereby",
|
||||
"therefor", "therein", "thereupon", "these", "they", "this",
|
||||
"those", "though", "through", "throughout", "thru", "thus", "to",
|
||||
"together", "too", "toward", "towards", "under", "until", "up",
|
||||
"upon", "us", "very", "via", "was", "we", "well", "were", "what",
|
||||
"whatever", "whatsoever", "when", "whence", "whenever",
|
||||
"whensoever", "where", "whereafter", "whereas", "whereat",
|
||||
"whereby", "wherefrom", "wherein", "whereinto", "whereof",
|
||||
"whereon", "whereto", "whereunto", "whereupon", "wherever",
|
||||
"wherewith", "whether", "which", "whichever", "whichsoever",
|
||||
"while", "whilst", "whither", "who", "whoever", "whole", "whom",
|
||||
"whomever", "whomsoever", "whose", "whosoever", "why", "will",
|
||||
"with", "within", "without", "would", "xsubj", "xcal", "xauthor",
|
||||
"xother ", "xnote", "yet", "you", "your", "yours", "yourself",
|
||||
"yourselves"}));
|
||||
}
|
||||
|
||||
/**
|
||||
* A lower-casing word analyzer with English stop words (can be shared
|
||||
* freely across threads without harm); global per class loader.
|
||||
*/
|
||||
public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
|
||||
NON_WORD_PATTERN, true, makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));
|
||||
NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
|
||||
/**
|
||||
* A lower-casing word analyzer with <b>extended </b> English stop words
|
||||
|
@ -191,7 +196,7 @@ public class PatternAnalyzer extends Analyzer {
|
|||
}
|
||||
else {
|
||||
stream = new PatternTokenizer(text, pattern, toLowerCase);
|
||||
if (stopWords != null) stream = new StopFilter(stream, stopWords);
|
||||
if (stopWords != null) stream = new StopFilter(false, stream, stopWords);
|
||||
}
|
||||
|
||||
return stream;
|
||||
|
@ -304,9 +309,9 @@ public class PatternAnalyzer extends Analyzer {
|
|||
}
|
||||
|
||||
/** somewhat oversized to minimize hash collisions */
|
||||
private static Set makeStopSet(String[] stopWords) {
|
||||
Set stops = new HashSet(stopWords.length * 2, 0.3f);
|
||||
stops.addAll(Arrays.asList(stopWords));
|
||||
private static Set makeStopSet(Set stopWords) {
|
||||
Set stops = new HashSet(stopWords.size() * 2, 0.3f);
|
||||
stops.addAll(stopWords);
|
||||
return stops;
|
||||
// return Collections.unmodifiableSet(stops);
|
||||
}
|
||||
|
|
|
@ -271,7 +271,7 @@ public class MemoryIndexTest extends TestCase {
|
|||
boolean toLowerCase = true;
|
||||
// boolean toLowerCase = false;
|
||||
// Set stopWords = null;
|
||||
Set stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
|
||||
Set stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
||||
|
||||
Analyzer[] analyzers = new Analyzer[] {
|
||||
new SimpleAnalyzer(),
|
||||
|
|
|
@ -135,7 +135,7 @@ public class PatternAnalyzerTest extends TestCase {
|
|||
|
||||
for (int stops=0; stops < maxStops; stops++) {
|
||||
Set stopWords = null;
|
||||
if (stops != 0) stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
|
||||
if (stops != 0) stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
||||
|
||||
for (int toLower=0; toLower < maxToLower; toLower++) {
|
||||
boolean toLowerCase = toLower != 0;
|
||||
|
|
|
@ -2,6 +2,7 @@ package org.apache.lucene.analysis;
|
|||
|
||||
import java.util.AbstractSet;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
|
@ -53,6 +54,12 @@ public class CharArraySet extends AbstractSet {
|
|||
this(c.size(), ignoreCase);
|
||||
addAll(c);
|
||||
}
|
||||
/** Create set from entries */
|
||||
private CharArraySet(char[][] entries, boolean ignoreCase, int count){
|
||||
this.entries = entries;
|
||||
this.ignoreCase = ignoreCase;
|
||||
this.count = count;
|
||||
}
|
||||
|
||||
/** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
|
||||
* are in the set */
|
||||
|
@ -100,7 +107,7 @@ public class CharArraySet extends AbstractSet {
|
|||
public boolean add(CharSequence text) {
|
||||
return add(text.toString()); // could be more efficient
|
||||
}
|
||||
|
||||
|
||||
/** Add this String into the set */
|
||||
public boolean add(String text) {
|
||||
return add(text.toCharArray());
|
||||
|
@ -228,6 +235,26 @@ public class CharArraySet extends AbstractSet {
|
|||
}
|
||||
return add(o.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable {@link CharArraySet}. This allows to provide
|
||||
* unmodifiable views of internal sets for "read-only" use.
|
||||
*
|
||||
* @param set
|
||||
* a set for which the unmodifiable set is returned.
|
||||
* @return an new unmodifiable {@link CharArraySet}.
|
||||
* @throws NullPointerException
|
||||
* if the given set is <code>null</code>.
|
||||
*/
|
||||
public static CharArraySet unmodifiableSet(CharArraySet set) {
|
||||
if (set == null)
|
||||
throw new NullPointerException("Given set is null");
|
||||
/*
|
||||
* Instead of delegating calls to the given set copy the low-level values to
|
||||
* the unmodifiable Subclass
|
||||
*/
|
||||
return new UnmodifiableCharArraySet(set.entries, set.ignoreCase, set.count);
|
||||
}
|
||||
|
||||
/** The Iterator<String> for this set. Strings are constructed on the fly, so
|
||||
* use <code>nextCharArray</code> for more efficient access. */
|
||||
|
@ -270,5 +297,40 @@ public class CharArraySet extends AbstractSet {
|
|||
public Iterator iterator() {
|
||||
return new CharArraySetIterator();
|
||||
}
|
||||
|
||||
/**
|
||||
* Efficient unmodifiable {@link CharArraySet}. This implementation does not
|
||||
* delegate calls to a give {@link CharArraySet} like
|
||||
* {@link Collections#unmodifiableSet(java.util.Set)} does. Instead is passes
|
||||
* the internal representation of a {@link CharArraySet} to a super
|
||||
* constructor and overrides all mutators.
|
||||
*/
|
||||
private static final class UnmodifiableCharArraySet extends CharArraySet {
|
||||
|
||||
private UnmodifiableCharArraySet(char[][] entries, boolean ignoreCase,
|
||||
int count) {
|
||||
super(entries, ignoreCase, count);
|
||||
}
|
||||
|
||||
public boolean add(Object o){
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
public boolean addAll(Collection coll) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
public boolean add(char[] text) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
public boolean add(CharSequence text) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
public boolean add(String text) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -20,18 +20,20 @@ package org.apache.lucene.analysis;
|
|||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
|
||||
/** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */
|
||||
|
||||
public final class StopAnalyzer extends Analyzer {
|
||||
private Set stopWords;
|
||||
private final Set/*<String>*/ stopWords;
|
||||
// @deprecated
|
||||
private boolean useDefaultStopPositionIncrement;
|
||||
private boolean enablePositionIncrements;
|
||||
private final boolean useDefaultStopPositionIncrement;
|
||||
private final boolean enablePositionIncrements;
|
||||
|
||||
/** An array containing some common English words that are not usually useful
|
||||
for searching. */
|
||||
for searching.
|
||||
@deprecated Use {@link #ENGLISH_STOP_WORDS_SET} instead */
|
||||
public static final String[] ENGLISH_STOP_WORDS = {
|
||||
"a", "an", "and", "are", "as", "at", "be", "but", "by",
|
||||
"for", "if", "in", "into", "is", "it",
|
||||
|
@ -39,13 +41,31 @@ public final class StopAnalyzer extends Analyzer {
|
|||
"that", "the", "their", "then", "there", "these",
|
||||
"they", "this", "to", "was", "will", "with"
|
||||
};
|
||||
|
||||
|
||||
/** An unmodifiable set containing some common English words that are not usually useful
|
||||
for searching.*/
|
||||
public static final Set/*<String>*/ ENGLISH_STOP_WORDS_SET;
|
||||
|
||||
static {
|
||||
final String[] stopWords = new String[]{
|
||||
"a", "an", "and", "are", "as", "at", "be", "but", "by",
|
||||
"for", "if", "in", "into", "is", "it",
|
||||
"no", "not", "of", "on", "or", "such",
|
||||
"that", "the", "their", "then", "there", "these",
|
||||
"they", "this", "to", "was", "will", "with"
|
||||
};
|
||||
final CharArraySet stopSet = new CharArraySet(stopWords.length, false);
|
||||
stopSet.addAll(Arrays.asList(stopWords));
|
||||
ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
|
||||
}
|
||||
|
||||
/** Builds an analyzer which removes words in
|
||||
* ENGLISH_STOP_WORDS.
|
||||
* @deprecated Use {@link #StopAnalyzer(boolean)} instead */
|
||||
public StopAnalyzer() {
|
||||
stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS);
|
||||
stopWords = ENGLISH_STOP_WORDS_SET;
|
||||
useDefaultStopPositionIncrement = true;
|
||||
enablePositionIncrements = false;
|
||||
}
|
||||
|
||||
/** Builds an analyzer which removes words in
|
||||
|
@ -53,8 +73,9 @@ public final class StopAnalyzer extends Analyzer {
|
|||
* @param enablePositionIncrements See {@link
|
||||
* StopFilter#setEnablePositionIncrements} */
|
||||
public StopAnalyzer(boolean enablePositionIncrements) {
|
||||
stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS);
|
||||
stopWords = ENGLISH_STOP_WORDS_SET;
|
||||
this.enablePositionIncrements = enablePositionIncrements;
|
||||
useDefaultStopPositionIncrement = false;
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the stop words from the given set.
|
||||
|
@ -62,6 +83,7 @@ public final class StopAnalyzer extends Analyzer {
|
|||
public StopAnalyzer(Set stopWords) {
|
||||
this.stopWords = stopWords;
|
||||
useDefaultStopPositionIncrement = true;
|
||||
enablePositionIncrements = false;
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the stop words from the given set.
|
||||
|
@ -71,22 +93,26 @@ public final class StopAnalyzer extends Analyzer {
|
|||
public StopAnalyzer(Set stopWords, boolean enablePositionIncrements) {
|
||||
this.stopWords = stopWords;
|
||||
this.enablePositionIncrements = enablePositionIncrements;
|
||||
useDefaultStopPositionIncrement = false;
|
||||
}
|
||||
|
||||
/** Builds an analyzer which removes words in the provided array.
|
||||
* @deprecated Use {@link #StopAnalyzer(String[], boolean)} instead */
|
||||
* @deprecated Use {@link #StopAnalyzer(Set, boolean)} instead */
|
||||
public StopAnalyzer(String[] stopWords) {
|
||||
this.stopWords = StopFilter.makeStopSet(stopWords);
|
||||
useDefaultStopPositionIncrement = true;
|
||||
enablePositionIncrements = false;
|
||||
}
|
||||
|
||||
/** Builds an analyzer which removes words in the provided array.
|
||||
* @param stopWords Array of stop words
|
||||
* @param enablePositionIncrements See {@link
|
||||
* StopFilter#setEnablePositionIncrements} */
|
||||
* StopFilter#setEnablePositionIncrements}
|
||||
* @deprecated Use {@link #StopAnalyzer(Set, boolean) instead*/
|
||||
public StopAnalyzer(String[] stopWords, boolean enablePositionIncrements) {
|
||||
this.stopWords = StopFilter.makeStopSet(stopWords);
|
||||
this.enablePositionIncrements = enablePositionIncrements;
|
||||
useDefaultStopPositionIncrement = false;
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the stop words from the given file.
|
||||
|
@ -95,6 +121,7 @@ public final class StopAnalyzer extends Analyzer {
|
|||
public StopAnalyzer(File stopwordsFile) throws IOException {
|
||||
stopWords = WordlistLoader.getWordSet(stopwordsFile);
|
||||
useDefaultStopPositionIncrement = true;
|
||||
enablePositionIncrements = false;
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the stop words from the given file.
|
||||
|
@ -105,6 +132,7 @@ public final class StopAnalyzer extends Analyzer {
|
|||
public StopAnalyzer(File stopwordsFile, boolean enablePositionIncrements) throws IOException {
|
||||
stopWords = WordlistLoader.getWordSet(stopwordsFile);
|
||||
this.enablePositionIncrements = enablePositionIncrements;
|
||||
useDefaultStopPositionIncrement = false;
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the stop words from the given reader.
|
||||
|
@ -114,6 +142,7 @@ public final class StopAnalyzer extends Analyzer {
|
|||
public StopAnalyzer(Reader stopwords) throws IOException {
|
||||
stopWords = WordlistLoader.getWordSet(stopwords);
|
||||
useDefaultStopPositionIncrement = true;
|
||||
enablePositionIncrements = false;
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the stop words from the given reader.
|
||||
|
@ -124,6 +153,7 @@ public final class StopAnalyzer extends Analyzer {
|
|||
public StopAnalyzer(Reader stopwords, boolean enablePositionIncrements) throws IOException {
|
||||
stopWords = WordlistLoader.getWordSet(stopwords);
|
||||
this.enablePositionIncrements = enablePositionIncrements;
|
||||
useDefaultStopPositionIncrement = false;
|
||||
}
|
||||
|
||||
/** Filters LowerCaseTokenizer with StopFilter. */
|
||||
|
|
|
@ -55,6 +55,7 @@ public final class StopFilter extends TokenFilter {
|
|||
* @param enablePositionIncrements true if token positions should record the removed stop words
|
||||
* @param input input TokenStream
|
||||
* @param stopWords array of stop words
|
||||
* @deprecated Use {@link #StopFilter(boolean, TokenStream, Set)} instead.
|
||||
*/
|
||||
public StopFilter(boolean enablePositionIncrements, TokenStream input, String [] stopWords)
|
||||
{
|
||||
|
@ -77,6 +78,7 @@ public final class StopFilter extends TokenFilter {
|
|||
* @param in input TokenStream
|
||||
* @param stopWords array of stop words
|
||||
* @param ignoreCase true if case is ignored
|
||||
* @deprecated Use {@link #StopFilter(boolean, TokenStream, Set, boolean)} instead.
|
||||
*/
|
||||
public StopFilter(boolean enablePositionIncrements, TokenStream in, String[] stopWords, boolean ignoreCase) {
|
||||
super(in);
|
||||
|
|
|
@ -101,15 +101,19 @@ public class StandardAnalyzer extends Analyzer {
|
|||
|
||||
|
||||
/** An array containing some common English words that are usually not
|
||||
useful for searching. */
|
||||
useful for searching.
|
||||
@deprecated Use {@link #STOP_WORDS_SET} instead */
|
||||
public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
|
||||
|
||||
/** An unmodifiable set containing some common English words that are usually not
|
||||
useful for searching. */
|
||||
public static final Set/*<String>*/ STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
||||
|
||||
/** Builds an analyzer with the default stop words ({@link
|
||||
* #STOP_WORDS}).
|
||||
* @deprecated Use {@link #StandardAnalyzer(Version)},
|
||||
* instead. */
|
||||
* #STOP_WORDS_SET}).
|
||||
* @deprecated Use {@link #StandardAnalyzer(Version)} instead. */
|
||||
public StandardAnalyzer() {
|
||||
this(Version.LUCENE_24, STOP_WORDS);
|
||||
this(Version.LUCENE_24, STOP_WORDS_SET);
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the default stop words ({@link
|
||||
|
@ -118,7 +122,7 @@ public class StandardAnalyzer extends Analyzer {
|
|||
* <a href="#version">above</a>}
|
||||
*/
|
||||
public StandardAnalyzer(Version matchVersion) {
|
||||
this(matchVersion, STOP_WORDS);
|
||||
this(matchVersion, STOP_WORDS_SET);
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the given stop words.
|
||||
|
@ -138,22 +142,9 @@ public class StandardAnalyzer extends Analyzer {
|
|||
}
|
||||
|
||||
/** Builds an analyzer with the given stop words.
|
||||
* @deprecated Use {@link #StandardAnalyzer(Version,
|
||||
* String[])} instead */
|
||||
* @deprecated Use {@link #StandardAnalyzer(Version, Set)} instead */
|
||||
public StandardAnalyzer(String[] stopWords) {
|
||||
this(Version.LUCENE_24, stopWords);
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the given stop words.
|
||||
* @param matchVersion Lucene version to match See {@link
|
||||
* <a href="#version">above</a>}
|
||||
* @param stopWords Array of stop words */
|
||||
public StandardAnalyzer(Version matchVersion, String[] stopWords) {
|
||||
if (stopWords == null) {
|
||||
stopWords = STOP_WORDS;
|
||||
}
|
||||
stopSet = StopFilter.makeStopSet(stopWords);
|
||||
init(matchVersion);
|
||||
this(Version.LUCENE_24, StopFilter.makeStopSet(stopWords));
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the stop words from the given file.
|
||||
|
@ -203,8 +194,9 @@ public class StandardAnalyzer extends Analyzer {
|
|||
* @deprecated Remove in 3.X and make true the only valid value
|
||||
*/
|
||||
public StandardAnalyzer(boolean replaceInvalidAcronym) {
|
||||
this(Version.LUCENE_24, STOP_WORDS);
|
||||
this(Version.LUCENE_24, STOP_WORDS_SET);
|
||||
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
||||
useDefaultStopPositionIncrements = true;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -243,7 +235,7 @@ public class StandardAnalyzer extends Analyzer {
|
|||
* @deprecated Remove in 3.X and make true the only valid value
|
||||
*/
|
||||
public StandardAnalyzer(String [] stopwords, boolean replaceInvalidAcronym) throws IOException{
|
||||
this(Version.LUCENE_24, stopwords);
|
||||
this(Version.LUCENE_24, StopFilter.makeStopSet(stopwords));
|
||||
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
||||
}
|
||||
|
||||
|
|
|
@ -23,13 +23,22 @@ import org.apache.lucene.util.LuceneTestCase;
|
|||
|
||||
public class TestCharArraySet extends LuceneTestCase {
|
||||
|
||||
static final String[] TEST_STOP_WORDS = {
|
||||
"a", "an", "and", "are", "as", "at", "be", "but", "by",
|
||||
"for", "if", "in", "into", "is", "it",
|
||||
"no", "not", "of", "on", "or", "such",
|
||||
"that", "the", "their", "then", "there", "these",
|
||||
"they", "this", "to", "was", "will", "with"
|
||||
};
|
||||
|
||||
|
||||
public void testRehash() throws Exception {
|
||||
CharArraySet cas = new CharArraySet(0, true);
|
||||
for(int i=0;i<StopAnalyzer.ENGLISH_STOP_WORDS.length;i++)
|
||||
cas.add(StopAnalyzer.ENGLISH_STOP_WORDS[i]);
|
||||
assertEquals(StopAnalyzer.ENGLISH_STOP_WORDS.length, cas.size());
|
||||
for(int i=0;i<StopAnalyzer.ENGLISH_STOP_WORDS.length;i++)
|
||||
assertTrue(cas.contains(StopAnalyzer.ENGLISH_STOP_WORDS[i]));
|
||||
for(int i=0;i<TEST_STOP_WORDS.length;i++)
|
||||
cas.add(TEST_STOP_WORDS[i]);
|
||||
assertEquals(TEST_STOP_WORDS.length, cas.size());
|
||||
for(int i=0;i<TEST_STOP_WORDS.length;i++)
|
||||
assertTrue(cas.contains(TEST_STOP_WORDS[i]));
|
||||
}
|
||||
|
||||
public void testNonZeroOffset() {
|
||||
|
@ -39,6 +48,11 @@ public class TestCharArraySet extends LuceneTestCase {
|
|||
set.addAll(Arrays.asList(words));
|
||||
assertTrue(set.contains(findme, 1, 4));
|
||||
assertTrue(set.contains(new String(findme,1,4)));
|
||||
|
||||
// test unmodifiable
|
||||
set = CharArraySet.unmodifiableSet(set);
|
||||
assertTrue(set.contains(findme, 1, 4));
|
||||
assertTrue(set.contains(new String(findme,1,4)));
|
||||
}
|
||||
|
||||
public void testObjectContains() {
|
||||
|
@ -47,5 +61,118 @@ public class TestCharArraySet extends LuceneTestCase {
|
|||
set.add(val);
|
||||
assertTrue(set.contains(val));
|
||||
assertTrue(set.contains(new Integer(1)));
|
||||
// test unmodifiable
|
||||
set = CharArraySet.unmodifiableSet(set);
|
||||
assertTrue(set.contains(val));
|
||||
assertTrue(set.contains(new Integer(1)));
|
||||
}
|
||||
|
||||
public void testClear(){
|
||||
CharArraySet set=new CharArraySet(10,true);
|
||||
set.addAll(Arrays.asList(TEST_STOP_WORDS));
|
||||
assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
|
||||
try{
|
||||
set.clear();
|
||||
fail("remove is not supported");
|
||||
}catch (UnsupportedOperationException e) {
|
||||
// expected
|
||||
assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
|
||||
}
|
||||
}
|
||||
|
||||
public void testModifyOnUnmodifiable(){
|
||||
CharArraySet set=new CharArraySet(10,true);
|
||||
set.addAll(Arrays.asList(TEST_STOP_WORDS));
|
||||
final int size = set.size();
|
||||
set = CharArraySet.unmodifiableSet(set);
|
||||
assertEquals("Set size changed due to unmodifiableSet call" , size, set.size());
|
||||
String NOT_IN_SET = "SirGallahad";
|
||||
assertFalse("Test String already exists in set", set.contains(NOT_IN_SET));
|
||||
|
||||
try{
|
||||
set.add(NOT_IN_SET.toCharArray());
|
||||
fail("Modified unmodifiable set");
|
||||
}catch (UnsupportedOperationException e) {
|
||||
// expected
|
||||
assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
|
||||
assertEquals("Size of unmodifiable set has changed", size, set.size());
|
||||
}
|
||||
|
||||
try{
|
||||
set.add(NOT_IN_SET);
|
||||
fail("Modified unmodifiable set");
|
||||
}catch (UnsupportedOperationException e) {
|
||||
// expected
|
||||
assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
|
||||
assertEquals("Size of unmodifiable set has changed", size, set.size());
|
||||
}
|
||||
|
||||
try{
|
||||
set.add(new StringBuffer(NOT_IN_SET));
|
||||
fail("Modified unmodifiable set");
|
||||
}catch (UnsupportedOperationException e) {
|
||||
// expected
|
||||
assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
|
||||
assertEquals("Size of unmodifiable set has changed", size, set.size());
|
||||
}
|
||||
|
||||
try{
|
||||
set.clear();
|
||||
fail("Modified unmodifiable set");
|
||||
}catch (UnsupportedOperationException e) {
|
||||
// expected
|
||||
assertFalse("Changed unmodifiable set", set.contains(NOT_IN_SET));
|
||||
assertEquals("Size of unmodifiable set has changed", size, set.size());
|
||||
}
|
||||
try{
|
||||
set.add((Object) NOT_IN_SET);
|
||||
fail("Modified unmodifiable set");
|
||||
}catch (UnsupportedOperationException e) {
|
||||
// expected
|
||||
assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
|
||||
assertEquals("Size of unmodifiable set has changed", size, set.size());
|
||||
}
|
||||
try{
|
||||
set.removeAll(Arrays.asList(TEST_STOP_WORDS));
|
||||
fail("Modified unmodifiable set");
|
||||
}catch (UnsupportedOperationException e) {
|
||||
// expected
|
||||
assertEquals("Size of unmodifiable set has changed", size, set.size());
|
||||
}
|
||||
|
||||
try{
|
||||
set.retainAll(Arrays.asList(new String[]{NOT_IN_SET}));
|
||||
fail("Modified unmodifiable set");
|
||||
}catch (UnsupportedOperationException e) {
|
||||
// expected
|
||||
assertEquals("Size of unmodifiable set has changed", size, set.size());
|
||||
}
|
||||
|
||||
try{
|
||||
set.addAll(Arrays.asList(new String[]{NOT_IN_SET}));
|
||||
fail("Modified unmodifiable set");
|
||||
}catch (UnsupportedOperationException e) {
|
||||
// expected
|
||||
assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
|
||||
}
|
||||
|
||||
for (int i = 0; i < TEST_STOP_WORDS.length; i++) {
|
||||
assertTrue(set.contains(TEST_STOP_WORDS[i]));
|
||||
}
|
||||
}
|
||||
|
||||
public void testUnmodifiableSet(){
|
||||
CharArraySet set=new CharArraySet(10,true);
|
||||
set.addAll(Arrays.asList(TEST_STOP_WORDS));
|
||||
final int size = set.size();
|
||||
set = CharArraySet.unmodifiableSet(set);
|
||||
assertEquals("Set size changed due to unmodifiableSet call" , size, set.size());
|
||||
|
||||
try{
|
||||
CharArraySet.unmodifiableSet(null);
|
||||
fail("can not make null unmodifiable");
|
||||
}catch (NullPointerException e) {
|
||||
// expected
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,12 +23,13 @@ import org.apache.lucene.util.LuceneTestCase;
|
|||
|
||||
import java.io.StringReader;
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
|
||||
public class TestStopAnalyzer extends LuceneTestCase {
|
||||
|
||||
private StopAnalyzer stop = new StopAnalyzer();
|
||||
private StopAnalyzer stop = new StopAnalyzer(false);
|
||||
private Set inValidTokens = new HashSet();
|
||||
|
||||
public TestStopAnalyzer(String s) {
|
||||
|
@ -37,8 +38,10 @@ public class TestStopAnalyzer extends LuceneTestCase {
|
|||
|
||||
protected void setUp() throws Exception {
|
||||
super.setUp();
|
||||
for (int i = 0; i < StopAnalyzer.ENGLISH_STOP_WORDS.length; i++) {
|
||||
inValidTokens.add(StopAnalyzer.ENGLISH_STOP_WORDS[i]);
|
||||
|
||||
Iterator it = StopAnalyzer.ENGLISH_STOP_WORDS_SET.iterator();
|
||||
while(it.hasNext()) {
|
||||
inValidTokens.add(it.next());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -23,7 +23,9 @@ import java.text.Collator;
|
|||
import java.text.DateFormat;
|
||||
import java.util.Calendar;
|
||||
import java.util.Date;
|
||||
import java.util.HashSet;
|
||||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.KeywordAnalyzer;
|
||||
|
@ -768,7 +770,9 @@ public class TestQueryParser extends LuceneTestCase {
|
|||
|
||||
public void testBoost()
|
||||
throws Exception {
|
||||
StandardAnalyzer oneStopAnalyzer = new StandardAnalyzer(new String[]{"on"});
|
||||
Set stopWords = new HashSet(1);
|
||||
stopWords.add("on");
|
||||
StandardAnalyzer oneStopAnalyzer = new StandardAnalyzer(stopWords);
|
||||
QueryParser qp = new QueryParser("field", oneStopAnalyzer);
|
||||
Query q = qp.parse("on^1.0");
|
||||
assertNotNull(q);
|
||||
|
|
|
@ -31,6 +31,7 @@ import org.apache.lucene.document.Field;
|
|||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
|
||||
/**
|
||||
|
@ -169,7 +170,7 @@ public class TestMultiPhraseQuery extends LuceneTestCase
|
|||
|
||||
public void testPhrasePrefixWithBooleanQuery() throws IOException {
|
||||
RAMDirectory indexStore = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(indexStore, new StandardAnalyzer(new String[]{}), true, IndexWriter.MaxFieldLength.LIMITED);
|
||||
IndexWriter writer = new IndexWriter(indexStore, new StandardAnalyzer(new HashSet(0)), true, IndexWriter.MaxFieldLength.LIMITED);
|
||||
add("This is a test", "object", writer);
|
||||
add("a note", "note", writer);
|
||||
writer.close();
|
||||
|
|
|
@ -39,6 +39,7 @@ import org.apache.lucene.document.Field;
|
|||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
|
||||
public class TestSpans extends LuceneTestCase {
|
||||
private IndexSearcher searcher;
|
||||
|
@ -448,7 +449,7 @@ public class TestSpans extends LuceneTestCase {
|
|||
// LUCENE-1404
|
||||
public void testNPESpanQuery() throws Throwable {
|
||||
final Directory dir = new MockRAMDirectory();
|
||||
final IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(new String[0]), IndexWriter.MaxFieldLength.LIMITED);
|
||||
final IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(new HashSet(0)), IndexWriter.MaxFieldLength.LIMITED);
|
||||
|
||||
// Add documents
|
||||
addDoc(writer, "1", "the big dogs went running to the market");
|
||||
|
|
Loading…
Reference in New Issue