LUCENE-1688: Deprecate static final String stop word array in StopAnalyzer and replace it with an immutable implementation of CharArraySet.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@794078 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2009-07-14 21:39:22 +00:00
parent 6bf4d35ce8
commit ea7e4ad344
15 changed files with 340 additions and 97 deletions

View File

@ -309,6 +309,11 @@ API Changes
all synchronization in TermInfosReader, which previously could
cause threads to pile up in certain cases. (Dan Rosher via Mike
McCandless)
30. LUCENE-1688: Deprecate static final String stop word array in
StopAnalyzer and replace it with an immutable implementation of
CharArraySet. (Simon Willnauer via Mark Miller)
Bug fixes
@ -604,6 +609,11 @@ Optimizations
9. LUCENE-1653: Avoid creating a Calendar in every call to
DateTools#dateToString, DateTools#timeToString and
DateTools#round. (Shai Erera via Mark Miller)
10. LUCENE-1688: Deprecate static final String stop word array and
replace it with an immutable implementation of CharArraySet.
Removes conversions between Set and array.
(Simon Willnauer via Mark Miller)
Documentation

View File

@ -33,7 +33,7 @@ public class ThaiAnalyzer extends Analyzer {
TokenStream ts = new StandardTokenizer(reader);
ts = new StandardFilter(ts);
ts = new ThaiWordFilter(ts);
ts = new StopFilter(ts, StopAnalyzer.ENGLISH_STOP_WORDS);
ts = new StopFilter(ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
return ts;
}
}

View File

@ -23,9 +23,11 @@ import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import javax.xml.parsers.DocumentBuilder;
@ -982,7 +984,8 @@ public class HighlighterTest extends TestCase implements Formatter {
public void run() throws Exception {
String goodWord = "goodtoken";
String stopWords[] = { "stoppedtoken" };
Set stopWords = new HashSet(1);
stopWords.add("stoppedtoken");
TermQuery query = new TermQuery(new Term("data", goodWord));
@ -991,7 +994,8 @@ public class HighlighterTest extends TestCase implements Formatter {
sb.append(goodWord);
for (int i = 0; i < 10000; i++) {
sb.append(" ");
sb.append(stopWords[0]);
// only one stopword
sb.append(stopWords.iterator().next());
}
SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
Highlighter hg = getHighlighter(query, "data", new StandardAnalyzer(stopWords).tokenStream(
@ -1024,7 +1028,9 @@ public class HighlighterTest extends TestCase implements Formatter {
public void testMaxSizeEndHighlight() throws Exception {
TestHighlightRunner helper = new TestHighlightRunner() {
public void run() throws Exception {
String stopWords[] = { "in", "it" };
Set stopWords = new HashSet();
stopWords.add("in");
stopWords.add("it");
TermQuery query = new TermQuery(new Term("text", "searchterm"));
String text = "this is a text with searchterm in it";

View File

@ -70,55 +70,60 @@ public class PatternAnalyzer extends Analyzer {
/** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */
public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
private static final Set EXTENDED_ENGLISH_STOP_WORDS = makeStopSet(new String[] {
"a", "about", "above", "across", "adj", "after", "afterwards",
"again", "against", "albeit", "all", "almost", "alone", "along",
"already", "also", "although", "always", "among", "amongst", "an",
"and", "another", "any", "anyhow", "anyone", "anything",
"anywhere", "are", "around", "as", "at", "be", "became", "because",
"become", "becomes", "becoming", "been", "before", "beforehand",
"behind", "being", "below", "beside", "besides", "between",
"beyond", "both", "but", "by", "can", "cannot", "co", "could",
"down", "during", "each", "eg", "either", "else", "elsewhere",
"enough", "etc", "even", "ever", "every", "everyone", "everything",
"everywhere", "except", "few", "first", "for", "former",
"formerly", "from", "further", "had", "has", "have", "he", "hence",
"her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
"herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
"in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
"latter", "latterly", "least", "less", "ltd", "many", "may", "me",
"meanwhile", "might", "more", "moreover", "most", "mostly", "much",
"must", "my", "myself", "namely", "neither", "never",
"nevertheless", "next", "no", "nobody", "none", "noone", "nor",
"not", "nothing", "now", "nowhere", "of", "off", "often", "on",
"once one", "only", "onto", "or", "other", "others", "otherwise",
"our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
"rather", "s", "same", "seem", "seemed", "seeming", "seems",
"several", "she", "should", "since", "so", "some", "somehow",
"someone", "something", "sometime", "sometimes", "somewhere",
"still", "such", "t", "than", "that", "the", "their", "them",
"themselves", "then", "thence", "there", "thereafter", "thereby",
"therefor", "therein", "thereupon", "these", "they", "this",
"those", "though", "through", "throughout", "thru", "thus", "to",
"together", "too", "toward", "towards", "under", "until", "up",
"upon", "us", "very", "via", "was", "we", "well", "were", "what",
"whatever", "whatsoever", "when", "whence", "whenever",
"whensoever", "where", "whereafter", "whereas", "whereat",
"whereby", "wherefrom", "wherein", "whereinto", "whereof",
"whereon", "whereto", "whereunto", "whereupon", "wherever",
"wherewith", "whether", "which", "whichever", "whichsoever",
"while", "whilst", "whither", "who", "whoever", "whole", "whom",
"whomever", "whomsoever", "whose", "whosoever", "why", "will",
"with", "within", "without", "would", "xsubj", "xcal", "xauthor",
"xother ", "xnote", "yet", "you", "your", "yours", "yourself",
"yourselves"});
/**
 * Extended list of English stop words. Every entry is a single word without
 * surrounding whitespace.
 */
private static final Set EXTENDED_ENGLISH_STOP_WORDS;
static {
  EXTENDED_ENGLISH_STOP_WORDS = new HashSet();
  // NOTE: "once" and "one" used to be fused into the single entry "once one",
  // and "xother" carried a trailing blank ("xother "); both are fixed here so
  // that each entry is one whitespace-free word.
  EXTENDED_ENGLISH_STOP_WORDS.addAll(Arrays.asList(new String[] {
    "a", "about", "above", "across", "adj", "after", "afterwards",
    "again", "against", "albeit", "all", "almost", "alone", "along",
    "already", "also", "although", "always", "among", "amongst", "an",
    "and", "another", "any", "anyhow", "anyone", "anything",
    "anywhere", "are", "around", "as", "at", "be", "became", "because",
    "become", "becomes", "becoming", "been", "before", "beforehand",
    "behind", "being", "below", "beside", "besides", "between",
    "beyond", "both", "but", "by", "can", "cannot", "co", "could",
    "down", "during", "each", "eg", "either", "else", "elsewhere",
    "enough", "etc", "even", "ever", "every", "everyone", "everything",
    "everywhere", "except", "few", "first", "for", "former",
    "formerly", "from", "further", "had", "has", "have", "he", "hence",
    "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
    "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
    "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
    "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
    "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
    "must", "my", "myself", "namely", "neither", "never",
    "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
    "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise",
    "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
    "rather", "s", "same", "seem", "seemed", "seeming", "seems",
    "several", "she", "should", "since", "so", "some", "somehow",
    "someone", "something", "sometime", "sometimes", "somewhere",
    "still", "such", "t", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefor", "therein", "thereupon", "these", "they", "this",
    "those", "though", "through", "throughout", "thru", "thus", "to",
    "together", "too", "toward", "towards", "under", "until", "up",
    "upon", "us", "very", "via", "was", "we", "well", "were", "what",
    "whatever", "whatsoever", "when", "whence", "whenever",
    "whensoever", "where", "whereafter", "whereas", "whereat",
    "whereby", "wherefrom", "wherein", "whereinto", "whereof",
    "whereon", "whereto", "whereunto", "whereupon", "wherever",
    "wherewith", "whether", "which", "whichever", "whichsoever",
    "while", "whilst", "whither", "who", "whoever", "whole", "whom",
    "whomever", "whomsoever", "whose", "whosoever", "why", "will",
    "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
    "xother", "xnote", "yet", "you", "your", "yours", "yourself",
    "yourselves"}));
}
/**
* A lower-casing word analyzer with English stop words (can be shared
* freely across threads without harm); global per class loader.
*/
public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
NON_WORD_PATTERN, true, makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));
NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
/**
* A lower-casing word analyzer with <b>extended </b> English stop words
@ -191,7 +196,7 @@ public class PatternAnalyzer extends Analyzer {
}
else {
stream = new PatternTokenizer(text, pattern, toLowerCase);
if (stopWords != null) stream = new StopFilter(stream, stopWords);
if (stopWords != null) stream = new StopFilter(false, stream, stopWords);
}
return stream;
@ -304,9 +309,9 @@ public class PatternAnalyzer extends Analyzer {
}
/** somewhat oversized to minimize hash collisions */
private static Set makeStopSet(String[] stopWords) {
Set stops = new HashSet(stopWords.length * 2, 0.3f);
stops.addAll(Arrays.asList(stopWords));
private static Set makeStopSet(Set stopWords) {
Set stops = new HashSet(stopWords.size() * 2, 0.3f);
stops.addAll(stopWords);
return stops;
// return Collections.unmodifiableSet(stops);
}

View File

@ -271,7 +271,7 @@ public class MemoryIndexTest extends TestCase {
boolean toLowerCase = true;
// boolean toLowerCase = false;
// Set stopWords = null;
Set stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
Set stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
Analyzer[] analyzers = new Analyzer[] {
new SimpleAnalyzer(),

View File

@ -135,7 +135,7 @@ public class PatternAnalyzerTest extends TestCase {
for (int stops=0; stops < maxStops; stops++) {
Set stopWords = null;
if (stops != 0) stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
if (stops != 0) stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
for (int toLower=0; toLower < maxToLower; toLower++) {
boolean toLowerCase = toLower != 0;

View File

@ -2,6 +2,7 @@ package org.apache.lucene.analysis;
import java.util.AbstractSet;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
/**
@ -53,6 +54,12 @@ public class CharArraySet extends AbstractSet {
this(c.size(), ignoreCase);
addAll(c);
}
/**
 * Creates a set directly on top of an existing internal representation.
 * The given array is stored as-is (it is not copied), so the new instance
 * shares it with whoever supplied it.
 *
 * @param entries    backing entry array to adopt
 * @param ignoreCase value for the <code>ignoreCase</code> flag of the new set
 * @param count      value for the <code>count</code> field of the new set
 */
private CharArraySet(char[][] entries, boolean ignoreCase, int count){
  this.count = count;
  this.ignoreCase = ignoreCase;
  this.entries = entries;
}
/** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
* are in the set */
@ -100,7 +107,7 @@ public class CharArraySet extends AbstractSet {
public boolean add(CharSequence text) {
return add(text.toString()); // could be more efficient
}
/** Add this String into the set */
public boolean add(String text) {
return add(text.toCharArray());
@ -228,6 +235,26 @@ public class CharArraySet extends AbstractSet {
}
return add(o.toString());
}
/**
 * Returns an unmodifiable {@link CharArraySet} built from the internal
 * representation of the given set. This makes it possible to hand out
 * internal sets for "read-only" use.
 * <p>
 * If the given set is already unmodifiable it is returned as-is instead of
 * being wrapped a second time.
 * <p>
 * NOTE: the returned set does not delegate to <code>set</code>; it shares
 * the entry array and copies the entry count as they are at the time of the
 * call.
 *
 * @param set
 *          a set for which the unmodifiable set is returned.
 * @return an unmodifiable {@link CharArraySet}; a new instance unless
 *         <code>set</code> is already unmodifiable.
 * @throws NullPointerException
 *           if the given set is <code>null</code>.
 */
public static CharArraySet unmodifiableSet(CharArraySet set) {
  if (set == null)
    throw new NullPointerException("Given set is null");
  // Already read-only: wrapping again would only allocate an identical object.
  if (set instanceof UnmodifiableCharArraySet)
    return set;
  /*
   * Instead of delegating calls to the given set copy the low-level values to
   * the unmodifiable Subclass
   */
  return new UnmodifiableCharArraySet(set.entries, set.ignoreCase, set.count);
}
/** The Iterator<String> for this set. Strings are constructed on the fly, so
* use <code>nextCharArray</code> for more efficient access. */
@ -270,5 +297,40 @@ public class CharArraySet extends AbstractSet {
public Iterator iterator() {
return new CharArraySetIterator();
}
/**
 * Read-only flavour of {@link CharArraySet}. Rather than wrapping and
 * delegating to a given set the way
 * {@link Collections#unmodifiableSet(java.util.Set)} does, it passes the
 * internal representation of an existing {@link CharArraySet} straight to
 * the super constructor and overrides the <code>add</code> /
 * <code>addAll</code> mutators so that they always throw
 * {@link UnsupportedOperationException}.
 */
private static final class UnmodifiableCharArraySet extends CharArraySet {

  /** Adopts the given internal representation without copying it here. */
  private UnmodifiableCharArraySet(char[][] entries, boolean ignoreCase,
      int count) {
    super(entries, ignoreCase, count);
  }

  /** Always rejected: this set is read-only. */
  public boolean add(char[] text) {
    throw new UnsupportedOperationException();
  }

  /** Always rejected: this set is read-only. */
  public boolean add(String text) {
    throw new UnsupportedOperationException();
  }

  /** Always rejected: this set is read-only. */
  public boolean add(CharSequence text) {
    throw new UnsupportedOperationException();
  }

  /** Always rejected: this set is read-only. */
  public boolean add(Object o){
    throw new UnsupportedOperationException();
  }

  /** Always rejected: this set is read-only. */
  public boolean addAll(Collection coll) {
    throw new UnsupportedOperationException();
  }
}
}

View File

@ -20,18 +20,20 @@ package org.apache.lucene.analysis;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Set;
/** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */
public final class StopAnalyzer extends Analyzer {
private Set stopWords;
private final Set/*<String>*/ stopWords;
// @deprecated
private boolean useDefaultStopPositionIncrement;
private boolean enablePositionIncrements;
private final boolean useDefaultStopPositionIncrement;
private final boolean enablePositionIncrements;
/** An array containing some common English words that are not usually useful
for searching. */
for searching.
@deprecated Use {@link #ENGLISH_STOP_WORDS_SET} instead */
public static final String[] ENGLISH_STOP_WORDS = {
"a", "an", "and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
@ -39,13 +41,31 @@ public final class StopAnalyzer extends Analyzer {
"that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
};
/** An unmodifiable set containing some common English words that are not
 *  usually useful for searching. */
public static final Set/*<String>*/ ENGLISH_STOP_WORDS_SET;

static {
  // Fill a regular CharArraySet (ignoreCase == false) first, then publish
  // only its unmodifiable counterpart.
  final String[] words = {
    "a", "an", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "such",
    "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
  };
  final CharArraySet mutable = new CharArraySet(words.length, false);
  mutable.addAll(Arrays.asList(words));
  ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(mutable);
}
/** Builds an analyzer which removes words in
* ENGLISH_STOP_WORDS.
* @deprecated Use {@link #StopAnalyzer(boolean)} instead */
public StopAnalyzer() {
stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS);
stopWords = ENGLISH_STOP_WORDS_SET;
useDefaultStopPositionIncrement = true;
enablePositionIncrements = false;
}
/** Builds an analyzer which removes words in
@ -53,8 +73,9 @@ public final class StopAnalyzer extends Analyzer {
* @param enablePositionIncrements See {@link
* StopFilter#setEnablePositionIncrements} */
public StopAnalyzer(boolean enablePositionIncrements) {
stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS);
stopWords = ENGLISH_STOP_WORDS_SET;
this.enablePositionIncrements = enablePositionIncrements;
useDefaultStopPositionIncrement = false;
}
/** Builds an analyzer with the stop words from the given set.
@ -62,6 +83,7 @@ public final class StopAnalyzer extends Analyzer {
public StopAnalyzer(Set stopWords) {
  // Both flags are final fields, so each constructor assigns them explicitly.
  enablePositionIncrements = false;
  useDefaultStopPositionIncrement = true;
  // The given set is stored as-is (no copy).
  this.stopWords = stopWords;
}
/** Builds an analyzer with the stop words from the given set.
@ -71,22 +93,26 @@ public final class StopAnalyzer extends Analyzer {
public StopAnalyzer(Set stopWords, boolean enablePositionIncrements) {
  // The caller chose the position-increment behaviour explicitly.
  useDefaultStopPositionIncrement = false;
  this.enablePositionIncrements = enablePositionIncrements;
  // The given set is stored as-is (no copy).
  this.stopWords = stopWords;
}
/** Builds an analyzer which removes words in the provided array.
* @deprecated Use {@link #StopAnalyzer(String[], boolean)} instead */
* @deprecated Use {@link #StopAnalyzer(Set, boolean)} instead */
public StopAnalyzer(String[] stopWords) {
  // Both flags are final fields, so each constructor assigns them explicitly.
  enablePositionIncrements = false;
  useDefaultStopPositionIncrement = true;
  // Convert the array into a Set once, up front.
  this.stopWords = StopFilter.makeStopSet(stopWords);
}
/** Builds an analyzer which removes words in the provided array.
* @param stopWords Array of stop words
* @param enablePositionIncrements See {@link
* StopFilter#setEnablePositionIncrements} */
* StopFilter#setEnablePositionIncrements}
* @deprecated Use {@link #StopAnalyzer(Set, boolean)} instead */
public StopAnalyzer(String[] stopWords, boolean enablePositionIncrements) {
  // The caller chose the position-increment behaviour explicitly.
  useDefaultStopPositionIncrement = false;
  this.enablePositionIncrements = enablePositionIncrements;
  // Convert the array into a Set once, up front.
  this.stopWords = StopFilter.makeStopSet(stopWords);
}
/** Builds an analyzer with the stop words from the given file.
@ -95,6 +121,7 @@ public final class StopAnalyzer extends Analyzer {
public StopAnalyzer(File stopwordsFile) throws IOException {
  // Both flags are final fields, so each constructor assigns them explicitly.
  enablePositionIncrements = false;
  useDefaultStopPositionIncrement = true;
  // Stop words come from the word set loaded from the given file.
  stopWords = WordlistLoader.getWordSet(stopwordsFile);
}
/** Builds an analyzer with the stop words from the given file.
@ -105,6 +132,7 @@ public final class StopAnalyzer extends Analyzer {
public StopAnalyzer(File stopwordsFile, boolean enablePositionIncrements) throws IOException {
  // The caller chose the position-increment behaviour explicitly.
  useDefaultStopPositionIncrement = false;
  this.enablePositionIncrements = enablePositionIncrements;
  // Stop words come from the word set loaded from the given file.
  stopWords = WordlistLoader.getWordSet(stopwordsFile);
}
/** Builds an analyzer with the stop words from the given reader.
@ -114,6 +142,7 @@ public final class StopAnalyzer extends Analyzer {
public StopAnalyzer(Reader stopwords) throws IOException {
  // Both flags are final fields, so each constructor assigns them explicitly.
  enablePositionIncrements = false;
  useDefaultStopPositionIncrement = true;
  // Stop words come from the word set loaded from the given reader.
  stopWords = WordlistLoader.getWordSet(stopwords);
}
/** Builds an analyzer with the stop words from the given reader.
@ -124,6 +153,7 @@ public final class StopAnalyzer extends Analyzer {
public StopAnalyzer(Reader stopwords, boolean enablePositionIncrements) throws IOException {
  // The caller chose the position-increment behaviour explicitly.
  useDefaultStopPositionIncrement = false;
  this.enablePositionIncrements = enablePositionIncrements;
  // Stop words come from the word set loaded from the given reader.
  stopWords = WordlistLoader.getWordSet(stopwords);
}
/** Filters LowerCaseTokenizer with StopFilter. */

View File

@ -55,6 +55,7 @@ public final class StopFilter extends TokenFilter {
* @param enablePositionIncrements true if token positions should record the removed stop words
* @param input input TokenStream
* @param stopWords array of stop words
* @deprecated Use {@link #StopFilter(boolean, TokenStream, Set)} instead.
*/
public StopFilter(boolean enablePositionIncrements, TokenStream input, String [] stopWords)
{
@ -77,6 +78,7 @@ public final class StopFilter extends TokenFilter {
* @param in input TokenStream
* @param stopWords array of stop words
* @param ignoreCase true if case is ignored
* @deprecated Use {@link #StopFilter(boolean, TokenStream, Set, boolean)} instead.
*/
public StopFilter(boolean enablePositionIncrements, TokenStream in, String[] stopWords, boolean ignoreCase) {
super(in);

View File

@ -101,15 +101,19 @@ public class StandardAnalyzer extends Analyzer {
/** An array containing some common English words that are usually not
useful for searching. */
useful for searching.
@deprecated Use {@link #STOP_WORDS_SET} instead */
public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
/** An unmodifiable set containing some common English words that are usually not
useful for searching. */
public static final Set/*<String>*/ STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
/** Builds an analyzer with the default stop words ({@link
* #STOP_WORDS}).
* @deprecated Use {@link #StandardAnalyzer(Version)},
* instead. */
* #STOP_WORDS_SET}).
* @deprecated Use {@link #StandardAnalyzer(Version)} instead. */
public StandardAnalyzer() {
this(Version.LUCENE_24, STOP_WORDS);
this(Version.LUCENE_24, STOP_WORDS_SET);
}
/** Builds an analyzer with the default stop words ({@link
@ -118,7 +122,7 @@ public class StandardAnalyzer extends Analyzer {
* <a href="#version">above</a>}
*/
public StandardAnalyzer(Version matchVersion) {
this(matchVersion, STOP_WORDS);
this(matchVersion, STOP_WORDS_SET);
}
/** Builds an analyzer with the given stop words.
@ -138,22 +142,9 @@ public class StandardAnalyzer extends Analyzer {
}
/** Builds an analyzer with the given stop words.
* @deprecated Use {@link #StandardAnalyzer(Version,
* String[])} instead */
* @deprecated Use {@link #StandardAnalyzer(Version, Set)} instead */
public StandardAnalyzer(String[] stopWords) {
this(Version.LUCENE_24, stopWords);
}
/** Builds an analyzer with the given stop words.
* @param matchVersion Lucene version to match See {@link
* <a href="#version">above</a>}
* @param stopWords Array of stop words */
public StandardAnalyzer(Version matchVersion, String[] stopWords) {
if (stopWords == null) {
stopWords = STOP_WORDS;
}
stopSet = StopFilter.makeStopSet(stopWords);
init(matchVersion);
this(Version.LUCENE_24, StopFilter.makeStopSet(stopWords));
}
/** Builds an analyzer with the stop words from the given file.
@ -203,8 +194,9 @@ public class StandardAnalyzer extends Analyzer {
* @deprecated Remove in 3.X and make true the only valid value
*/
public StandardAnalyzer(boolean replaceInvalidAcronym) {
this(Version.LUCENE_24, STOP_WORDS);
this(Version.LUCENE_24, STOP_WORDS_SET);
this.replaceInvalidAcronym = replaceInvalidAcronym;
useDefaultStopPositionIncrements = true;
}
/**
@ -243,7 +235,7 @@ public class StandardAnalyzer extends Analyzer {
* @deprecated Remove in 3.X and make true the only valid value
*/
public StandardAnalyzer(String [] stopwords, boolean replaceInvalidAcronym) throws IOException{
this(Version.LUCENE_24, stopwords);
this(Version.LUCENE_24, StopFilter.makeStopSet(stopwords));
this.replaceInvalidAcronym = replaceInvalidAcronym;
}

View File

@ -23,13 +23,22 @@ import org.apache.lucene.util.LuceneTestCase;
public class TestCharArraySet extends LuceneTestCase {
/** Fixed list of words used as fixture data by the tests in this class. */
static final String[] TEST_STOP_WORDS = {
"a", "an", "and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
"no", "not", "of", "on", "or", "such",
"that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
};
public void testRehash() throws Exception {
CharArraySet cas = new CharArraySet(0, true);
for(int i=0;i<StopAnalyzer.ENGLISH_STOP_WORDS.length;i++)
cas.add(StopAnalyzer.ENGLISH_STOP_WORDS[i]);
assertEquals(StopAnalyzer.ENGLISH_STOP_WORDS.length, cas.size());
for(int i=0;i<StopAnalyzer.ENGLISH_STOP_WORDS.length;i++)
assertTrue(cas.contains(StopAnalyzer.ENGLISH_STOP_WORDS[i]));
for(int i=0;i<TEST_STOP_WORDS.length;i++)
cas.add(TEST_STOP_WORDS[i]);
assertEquals(TEST_STOP_WORDS.length, cas.size());
for(int i=0;i<TEST_STOP_WORDS.length;i++)
assertTrue(cas.contains(TEST_STOP_WORDS[i]));
}
public void testNonZeroOffset() {
@ -39,6 +48,11 @@ public class TestCharArraySet extends LuceneTestCase {
set.addAll(Arrays.asList(words));
assertTrue(set.contains(findme, 1, 4));
assertTrue(set.contains(new String(findme,1,4)));
// test unmodifiable
set = CharArraySet.unmodifiableSet(set);
assertTrue(set.contains(findme, 1, 4));
assertTrue(set.contains(new String(findme,1,4)));
}
public void testObjectContains() {
@ -47,5 +61,118 @@ public class TestCharArraySet extends LuceneTestCase {
set.add(val);
assertTrue(set.contains(val));
assertTrue(set.contains(new Integer(1)));
// test unmodifiable
set = CharArraySet.unmodifiableSet(set);
assertTrue(set.contains(val));
assertTrue(set.contains(new Integer(1)));
}
/**
 * Fills a regular (modifiable) set and checks that <code>clear()</code> is
 * rejected with an {@link UnsupportedOperationException} and leaves the
 * content untouched.
 */
public void testClear(){
  final CharArraySet words = new CharArraySet(10, true);
  words.addAll(Arrays.asList(TEST_STOP_WORDS));
  assertEquals("Not all words added", TEST_STOP_WORDS.length, words.size());
  try {
    words.clear();
  } catch (UnsupportedOperationException expected) {
    // clear() was rejected; nothing may have been removed
    assertEquals("Not all words added", TEST_STOP_WORDS.length, words.size());
    return;
  }
  // only reached when clear() returned normally
  fail("remove is not supported");
}
/**
 * Wraps a filled set with {@link CharArraySet#unmodifiableSet} and then tries
 * each mutator in turn. Every attempt must throw
 * {@link UnsupportedOperationException}; after each one the test re-checks
 * that the size and/or membership of the set is unchanged.
 */
public void testModifyOnUnmodifiable(){
CharArraySet set=new CharArraySet(10,true);
set.addAll(Arrays.asList(TEST_STOP_WORDS));
final int size = set.size();
// from here on "set" refers to the unmodifiable version
set = CharArraySet.unmodifiableSet(set);
assertEquals("Set size changed due to unmodifiableSet call" , size, set.size());
String NOT_IN_SET = "SirGallahad";
assertFalse("Test String already exists in set", set.contains(NOT_IN_SET));
// add(char[])
try{
set.add(NOT_IN_SET.toCharArray());
fail("Modified unmodifiable set");
}catch (UnsupportedOperationException e) {
// expected
assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
assertEquals("Size of unmodifiable set has changed", size, set.size());
}
// add(String)
try{
set.add(NOT_IN_SET);
fail("Modified unmodifiable set");
}catch (UnsupportedOperationException e) {
// expected
assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
assertEquals("Size of unmodifiable set has changed", size, set.size());
}
// add(CharSequence)
try{
set.add(new StringBuffer(NOT_IN_SET));
fail("Modified unmodifiable set");
}catch (UnsupportedOperationException e) {
// expected
assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
assertEquals("Size of unmodifiable set has changed", size, set.size());
}
// clear()
try{
set.clear();
fail("Modified unmodifiable set");
}catch (UnsupportedOperationException e) {
// expected
assertFalse("Changed unmodifiable set", set.contains(NOT_IN_SET));
assertEquals("Size of unmodifiable set has changed", size, set.size());
}
// add(Object) -- the cast selects the Object overload
try{
set.add((Object) NOT_IN_SET);
fail("Modified unmodifiable set");
}catch (UnsupportedOperationException e) {
// expected
assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
assertEquals("Size of unmodifiable set has changed", size, set.size());
}
// removeAll(Collection)
try{
set.removeAll(Arrays.asList(TEST_STOP_WORDS));
fail("Modified unmodifiable set");
}catch (UnsupportedOperationException e) {
// expected
assertEquals("Size of unmodifiable set has changed", size, set.size());
}
// retainAll(Collection)
try{
set.retainAll(Arrays.asList(new String[]{NOT_IN_SET}));
fail("Modified unmodifiable set");
}catch (UnsupportedOperationException e) {
// expected
assertEquals("Size of unmodifiable set has changed", size, set.size());
}
// addAll(Collection)
try{
set.addAll(Arrays.asList(new String[]{NOT_IN_SET}));
fail("Modified unmodifiable set");
}catch (UnsupportedOperationException e) {
// expected
assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
}
// finally, every original word must still be present
for (int i = 0; i < TEST_STOP_WORDS.length; i++) {
assertTrue(set.contains(TEST_STOP_WORDS[i]));
}
}
/**
 * Checks that {@link CharArraySet#unmodifiableSet} keeps the size of the
 * source set and rejects <code>null</code> with a
 * {@link NullPointerException}.
 */
public void testUnmodifiableSet(){
  final CharArraySet source = new CharArraySet(10, true);
  source.addAll(Arrays.asList(TEST_STOP_WORDS));
  final int expectedSize = source.size();
  final CharArraySet readOnly = CharArraySet.unmodifiableSet(source);
  assertEquals("Set size changed due to unmodifiableSet call" , expectedSize, readOnly.size());
  try {
    CharArraySet.unmodifiableSet(null);
  } catch (NullPointerException expected) {
    // null was rejected as required
    return;
  }
  // only reached when the null argument was accepted
  fail("can not make null unmodifiable");
}
}

View File

@ -23,12 +23,13 @@ import org.apache.lucene.util.LuceneTestCase;
import java.io.StringReader;
import java.io.IOException;
import java.util.Iterator;
import java.util.Set;
import java.util.HashSet;
public class TestStopAnalyzer extends LuceneTestCase {
private StopAnalyzer stop = new StopAnalyzer();
private StopAnalyzer stop = new StopAnalyzer(false);
private Set inValidTokens = new HashSet();
public TestStopAnalyzer(String s) {
@ -37,8 +38,10 @@ public class TestStopAnalyzer extends LuceneTestCase {
protected void setUp() throws Exception {
super.setUp();
for (int i = 0; i < StopAnalyzer.ENGLISH_STOP_WORDS.length; i++) {
inValidTokens.add(StopAnalyzer.ENGLISH_STOP_WORDS[i]);
Iterator it = StopAnalyzer.ENGLISH_STOP_WORDS_SET.iterator();
while(it.hasNext()) {
inValidTokens.add(it.next());
}
}

View File

@ -23,7 +23,9 @@ import java.text.Collator;
import java.text.DateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
@ -768,7 +770,9 @@ public class TestQueryParser extends LuceneTestCase {
public void testBoost()
throws Exception {
StandardAnalyzer oneStopAnalyzer = new StandardAnalyzer(new String[]{"on"});
Set stopWords = new HashSet(1);
stopWords.add("on");
StandardAnalyzer oneStopAnalyzer = new StandardAnalyzer(stopWords);
QueryParser qp = new QueryParser("field", oneStopAnalyzer);
Query q = qp.parse("on^1.0");
assertNotNull(q);

View File

@ -31,6 +31,7 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
import java.util.HashSet;
import java.util.LinkedList;
/**
@ -169,7 +170,7 @@ public class TestMultiPhraseQuery extends LuceneTestCase
public void testPhrasePrefixWithBooleanQuery() throws IOException {
RAMDirectory indexStore = new RAMDirectory();
IndexWriter writer = new IndexWriter(indexStore, new StandardAnalyzer(new String[]{}), true, IndexWriter.MaxFieldLength.LIMITED);
IndexWriter writer = new IndexWriter(indexStore, new StandardAnalyzer(new HashSet(0)), true, IndexWriter.MaxFieldLength.LIMITED);
add("This is a test", "object", writer);
add("a note", "note", writer);
writer.close();

View File

@ -39,6 +39,7 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
import java.util.HashSet;
public class TestSpans extends LuceneTestCase {
private IndexSearcher searcher;
@ -448,7 +449,7 @@ public class TestSpans extends LuceneTestCase {
// LUCENE-1404
public void testNPESpanQuery() throws Throwable {
final Directory dir = new MockRAMDirectory();
final IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(new String[0]), IndexWriter.MaxFieldLength.LIMITED);
final IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(new HashSet(0)), IndexWriter.MaxFieldLength.LIMITED);
// Add documents
addDoc(writer, "1", "the big dogs went running to the market");