LUCENE-1040: new CharArraySet, make StopFilter directly use it

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@596484 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2007-11-19 23:23:04 +00:00
parent 49e85ee76d
commit e395be3faf
4 changed files with 246 additions and 97 deletions

View File

@ -1,5 +1,4 @@
Lucene Change Log Lucene Change Log
$Id$ $Id$
======================= Trunk (not yet released) ======================= ======================= Trunk (not yet released) =======================
@ -204,6 +203,10 @@ New features
index changes. SegmentReader, MultiSegmentReader, MultiReader, index changes. SegmentReader, MultiSegmentReader, MultiReader,
and ParallelReader implement reopen(). (Michael Busch) and ParallelReader implement reopen(). (Michael Busch)
10. LUCENE-1040: CharArraySet useful for efficiently checking
set membership of text specified by char[]. (yonik)
Optimizations Optimizations
1. LUCENE-937: CachingTokenFilter now uses an iterator to access the 1. LUCENE-937: CachingTokenFilter now uses an iterator to access the

View File

@ -1,5 +1,9 @@
package org.apache.lucene.analysis; package org.apache.lucene.analysis;
import java.util.AbstractSet;
import java.util.Collection;
import java.util.Iterator;
/** /**
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -19,90 +23,138 @@ package org.apache.lucene.analysis;
/** /**
* A simple class that can store & retrieve char[]'s in a * A simple class that stores Strings as char[]'s in a
* hash table. Note that this is not a general purpose * hash table. Note that this is not a general purpose
* class. For example, it cannot remove char[]'s from the * class. For example, it cannot remove items from the
* set, nor does it resize its hash table to be smaller, * set, nor does it resize its hash table to be smaller,
* etc. It is designed for use with StopFilter to enable * etc. It is designed to be quick to test if a char[]
* quick filtering based on the char[] termBuffer in a * is in the set without the necessity of converting it
* Token. * to a String first.
*/ */
final class CharArraySet { public class CharArraySet extends AbstractSet {
private final static int INIT_SIZE = 8; private final static int INIT_SIZE = 8;
private final static double MAX_LOAD_FACTOR = 0.75;
private int mask;
private char[][] entries; private char[][] entries;
private int count; private int count;
private boolean ignoreCase; private final boolean ignoreCase;
/** Create set with enough capacity to hold startSize /** Create set with enough capacity to hold startSize
* terms */ * terms */
public CharArraySet(int startSize, boolean ignoreCase) { public CharArraySet(int startSize, boolean ignoreCase) {
this.ignoreCase = ignoreCase; this.ignoreCase = ignoreCase;
int size = INIT_SIZE; int size = INIT_SIZE;
while(((double) startSize)/size >= MAX_LOAD_FACTOR) while(startSize + (startSize>>2) > size)
size *= 2; size <<= 1;
mask = size-1;
entries = new char[size][]; entries = new char[size][];
} }
/** Returns true if the characters in text up to length /** Create set from a Collection of char[] or String */
* len is present in the set. */ public CharArraySet(Collection c, boolean ignoreCase) {
public boolean contains(char[] text, int len) { this(c.size(), ignoreCase);
addAll(c);
}
/** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
* are in the set */
public boolean contains(char[] text, int off, int len) {
return entries[getSlot(text, off, len)] != null;
}
/** true if the <code>CharSequence</code> is in the set */
public boolean contains(CharSequence cs) {
return entries[getSlot(cs)] != null;
}
private int getSlot(char[] text, int off, int len) {
int code = getHashCode(text, len); int code = getHashCode(text, len);
int pos = code & mask; int pos = code & (entries.length-1);
char[] text2 = entries[pos]; char[] text2 = entries[pos];
if (text2 != null && !equals(text, len, text2)) { if (text2 != null && !equals(text, off, len, text2)) {
final int inc = ((code>>8)+code)|1; final int inc = ((code>>8)+code)|1;
do { do {
code += inc; code += inc;
pos = code & mask; pos = code & (entries.length-1);
text2 = entries[pos]; text2 = entries[pos];
} while (text2 != null && !equals(text, len, text2)); } while (text2 != null && !equals(text, off, len, text2));
} }
return text2 != null; return pos;
}
/** Returns true if the String is in the set */
private int getSlot(CharSequence text) {
int code = getHashCode(text);
int pos = code & (entries.length-1);
char[] text2 = entries[pos];
if (text2 != null && !equals(text, text2)) {
final int inc = ((code>>8)+code)|1;
do {
code += inc;
pos = code & (entries.length-1);
text2 = entries[pos];
} while (text2 != null && !equals(text, text2));
}
return pos;
}
/** Add this CharSequence into the set */
public boolean add(CharSequence text) {
return add(text.toString()); // could be more efficient
} }
/** Add this String into the set */ /** Add this String into the set */
public void add(String text) { public boolean add(String text) {
add(text.toCharArray()); return add(text.toCharArray());
} }
/** Add this text into the set */ /** Add this char[] directly to the set.
public void add(char[] text) { * If ignoreCase is true for this Set, the text array will be directly modified.
* The user should never modify this text array after calling this method.
*/
public boolean add(char[] text) {
if (ignoreCase) if (ignoreCase)
for(int i=0;i<text.length;i++) for(int i=0;i<text.length;i++)
text[i] = Character.toLowerCase(text[i]); text[i] = Character.toLowerCase(text[i]);
int code = getHashCode(text, text.length); int slot = getSlot(text, 0, text.length);
int pos = code & mask; if (entries[slot] != null) return false;
char[] text2 = entries[pos]; entries[slot] = text;
if (text2 != null) {
final int inc = ((code>>8)+code)|1;
do {
code += inc;
pos = code & mask;
text2 = entries[pos];
} while (text2 != null);
}
entries[pos] = text;
count++; count++;
if (((double) count)/entries.length > MAX_LOAD_FACTOR) { if (count > entries.length + (entries.length>>2) ) {
rehash(); rehash();
} }
return true;
} }
private boolean equals(char[] text1, int len, char[] text2) { private boolean equals(char[] text1, int off, int len, char[] text2) {
if (len != text2.length) if (len != text2.length)
return false; return false;
for(int i=0;i<len;i++) {
if (ignoreCase) { if (ignoreCase) {
if (Character.toLowerCase(text1[i]) != text2[i]) for(int i=0;i<len;i++) {
if (Character.toLowerCase(text1[off+i]) != text2[i])
return false; return false;
}
} else { } else {
if (text1[i] != text2[i]) for(int i=0;i<len;i++) {
if (text1[off+i] != text2[i])
return false;
}
}
return true;
}
private boolean equals(CharSequence text1, char[] text2) {
int len = text1.length();
if (len != text2.length)
return false;
if (ignoreCase) {
for(int i=0;i<len;i++) {
if (Character.toLowerCase(text1.charAt(i)) != text2[i])
return false;
}
} else {
for(int i=0;i<len;i++) {
if (text1.charAt(i) != text2[i])
return false; return false;
} }
} }
@ -111,39 +163,125 @@ final class CharArraySet {
private void rehash() { private void rehash() {
final int newSize = 2*count; final int newSize = 2*count;
mask = newSize-1; char[][] oldEntries = entries;
char[][] entries = new char[newSize][];
char[][] newEntries = new char[newSize][]; for(int i=0;i<oldEntries.length;i++) {
for(int i=0;i<entries.length;i++) { char[] text = oldEntries[i];
char[] text = entries[i];
if (text != null) { if (text != null) {
int code = getHashCode(text, text.length); // todo: could be faster... no need to compare strings on collision
int pos = code & mask; entries[ getSlot(text,0,text.length) ] = text;
if (newEntries[pos] != null) {
final int inc = ((code>>8)+code)|1;
do {
code += inc;
pos = code & mask;
} while (newEntries[pos] != null);
}
newEntries[pos] = text;
} }
} }
entries = newEntries;
} }
private int getHashCode(char[] text, int len) { private int getHashCode(char[] text, int len) {
int downto = len;
int code = 0; int code = 0;
while (downto > 0) { if (ignoreCase) {
final char c; for (int i=0; i<len; i++) {
if (ignoreCase) code = code*31 + Character.toLowerCase(text[i]);
c = Character.toLowerCase(text[--downto]); }
else } else {
c = text[--downto]; for (int i=0; i<len; i++) {
code = (code*31) + c; code = code*31 + text[i];
}
} }
return code; return code;
} }
private int getHashCode(CharSequence text) {
int code;
if (ignoreCase) {
code = 0;
int len = text.length();
for (int i=0; i<len; i++) {
code = code*31 + Character.toLowerCase(text.charAt(i));
}
} else {
if (false && text instanceof String) {
code = text.hashCode();
} else {
code = 0;
int len = text.length();
for (int i=0; i<len; i++) {
code = code*31 + text.charAt(i);
}
}
}
return code;
}
public int size() {
return count;
}
public boolean isEmpty() {
return count==0;
}
public boolean contains(Object o) {
if (o instanceof char[]) {
char[] text = (char[])o;
return contains(text, 0, text.length);
} else if (o instanceof CharSequence) {
return contains((CharSequence)o);
}
return false;
}
public boolean add(Object o) {
if (o instanceof char[]) {
return add((char[])o);
} else if (o instanceof String) {
return add((String)o);
} else if (o instanceof CharSequence) {
return add((CharSequence)o);
} else {
return add(o.toString());
}
}
/** The Iterator<String> for this set. Strings are constructed on the fly, so
* use <code>nextCharArray</code> for more efficient access. */
public class CharArraySetIterator implements Iterator {
int pos=-1;
char[] next;
CharArraySetIterator() {
goNext();
}
private void goNext() {
next = null;
pos++;
while (pos < entries.length && (next=entries[pos]) == null) pos++;
}
public boolean hasNext() {
return next != null;
}
/** do not modify the returned char[] */
public char[] nextCharArray() {
char[] ret = next;
goNext();
return ret;
}
/** Returns the next String, as a Set<String> would...
* use nextCharArray() for better efficiency. */
public Object next() {
return new String(nextCharArray());
}
public void remove() {
throw new UnsupportedOperationException();
}
}
public Iterator iterator() {
return new CharArraySetIterator();
}
} }

View File

@ -18,7 +18,7 @@ package org.apache.lucene.analysis;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashSet; import java.util.Arrays;
import java.util.Iterator; import java.util.Iterator;
import java.util.Set; import java.util.Set;
@ -29,7 +29,6 @@ import java.util.Set;
public final class StopFilter extends TokenFilter { public final class StopFilter extends TokenFilter {
private final CharArraySet stopWords; private final CharArraySet stopWords;
private final boolean ignoreCase;
/** /**
* Construct a token stream filtering the given input. * Construct a token stream filtering the given input.
@ -45,32 +44,39 @@ public final class StopFilter extends TokenFilter {
*/ */
public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) { public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
super(in); super(in);
this.ignoreCase = ignoreCase; this.stopWords = (CharArraySet)makeStopSet(stopWords, ignoreCase);
this.stopWords = makeStopCharArraySet(stopWords, ignoreCase);
} }
/** /**
* Construct a token stream filtering the given input. * Construct a token stream filtering the given input.
* If <code>stopWords</code> is an instance of {@link CharArraySet} (true if
* <code>makeStopSet()</code> was used to construct the set) it will be directly used
* and <code>ignoreCase</code> will be ignored since <code>CharArraySet</code>
* directly controls case sensitivity.
* <p/>
* If <code>stopWords</code> is not an instance of {@link CharArraySet},
* a new CharArraySet will be constructed and <code>ignoreCase</code> will be
* used to specify the case sensitivity of that set.
*
* @param input * @param input
* @param stopWords The set of Stop Words, as Strings. If ignoreCase is true, all strings should be lower cased * @param stopWords The set of Stop Words.
* @param ignoreCase -Ignore case when stopping. The stopWords set must be setup to contain only lower case words * @param ignoreCase -Ignore case when stopping.
*/ */
public StopFilter(TokenStream input, Set stopWords, boolean ignoreCase) public StopFilter(TokenStream input, Set stopWords, boolean ignoreCase)
{ {
super(input); super(input);
this.ignoreCase = ignoreCase; if (stopWords instanceof CharArraySet) {
this.stopWords = (CharArraySet)stopWords;
} else {
this.stopWords = new CharArraySet(stopWords.size(), ignoreCase); this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
Iterator it = stopWords.iterator(); this.stopWords.addAll(stopWords);
while(it.hasNext()) }
this.stopWords.add((String) it.next());
} }
/** /**
* Constructs a filter which removes words from the input * Constructs a filter which removes words from the input
* TokenStream that are named in the Set. * TokenStream that are named in the Set.
* It is crucial that an efficient Set implementation is used
* for maximum performance.
* *
* @see #makeStopSet(java.lang.String[]) * @see #makeStopSet(java.lang.String[])
*/ */
@ -97,18 +103,9 @@ public final class StopFilter extends TokenFilter {
* @return a Set containing the words * @return a Set containing the words
*/ */
public static final Set makeStopSet(String[] stopWords, boolean ignoreCase) { public static final Set makeStopSet(String[] stopWords, boolean ignoreCase) {
HashSet stopTable = new HashSet(stopWords.length);
for (int i = 0; i < stopWords.length; i++)
stopTable.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]);
return stopTable;
}
private static final CharArraySet makeStopCharArraySet(String[] stopWords, boolean ignoreCase) {
CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase); CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
for (int i = 0; i < stopWords.length; i++) stopSet.addAll(Arrays.asList(stopWords));
stopSet.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]); return stopSet; }
return stopSet;
}
/** /**
* Returns the next input Token whose termText() is not a stop word. * Returns the next input Token whose termText() is not a stop word.
@ -116,7 +113,7 @@ public final class StopFilter extends TokenFilter {
public final Token next(Token result) throws IOException { public final Token next(Token result) throws IOException {
// return the first non-stop word found // return the first non-stop word found
while((result = input.next(result)) != null) { while((result = input.next(result)) != null) {
if (!stopWords.contains(result.termBuffer(), result.termLength)) if (!stopWords.contains(result.termBuffer(), 0, result.termLength))
return result; return result;
} }
// reached EOS -- return null // reached EOS -- return null

View File

@ -16,10 +16,11 @@ package org.apache.lucene.analysis;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException; import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
import java.util.Set;
import org.apache.lucene.util.LuceneTestCase;
/** /**
* @author yonik * @author yonik
@ -45,4 +46,14 @@ public class TestStopFilter extends LuceneTestCase {
assertEquals(null,stream.next()); assertEquals(null,stream.next());
} }
public void testStopFilt() throws IOException {
StringReader reader = new StringReader("Now is The Time");
String[] stopWords = new String[] { "is", "the", "Time" };
Set stopSet = StopFilter.makeStopSet(stopWords);
TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopSet);
assertEquals("Now", stream.next().termText());
assertEquals("The", stream.next().termText());
assertEquals(null, stream.next());
}
} }