LUCENE-1040: new CharArraySet, make StopFilter directly use it

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@596484 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2007-11-19 23:23:04 +00:00
parent 49e85ee76d
commit e395be3faf
4 changed files with 246 additions and 97 deletions

View File

@ -1,5 +1,4 @@
Lucene Change Log
Lucene Change Log
$Id$
======================= Trunk (not yet released) =======================
@ -204,6 +203,10 @@ New features
index changes. SegmentReader, MultiSegmentReader, MultiReader,
and ParallelReader implement reopen(). (Michael Busch)
10. LUCENE-1040: CharArraySet useful for efficiently checking
set membership of text specified by char[]. (yonik)
Optimizations
1. LUCENE-937: CachingTokenFilter now uses an iterator to access the

View File

@ -1,5 +1,9 @@
package org.apache.lucene.analysis;
import java.util.AbstractSet;
import java.util.Collection;
import java.util.Iterator;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -19,90 +23,138 @@ package org.apache.lucene.analysis;
/**
* A simple class that can store & retrieve char[]'s in a
* A simple class that stores Strings as char[]'s in a
* hash table. Note that this is not a general purpose
* class. For example, it cannot remove char[]'s from the
* class. For example, it cannot remove items from the
* set, nor does it resize its hash table to be smaller,
* etc. It is designed for use with StopFilter to enable
* quick filtering based on the char[] termBuffer in a
* Token.
* etc. It is designed to be quick to test if a char[]
* is in the set without the necessity of converting it
* to a String first.
*/
final class CharArraySet {
public class CharArraySet extends AbstractSet {
private final static int INIT_SIZE = 8;
private final static double MAX_LOAD_FACTOR = 0.75;
private int mask;
private char[][] entries;
private int count;
private boolean ignoreCase;
private final boolean ignoreCase;
/** Create set with enough capacity to hold startSize
* terms */
public CharArraySet(int startSize, boolean ignoreCase) {
this.ignoreCase = ignoreCase;
int size = INIT_SIZE;
while(((double) startSize)/size >= MAX_LOAD_FACTOR)
size *= 2;
mask = size-1;
while(startSize + (startSize>>2) > size)
size <<= 1;
entries = new char[size][];
}
/** Returns true if the characters in text up to length
* len is present in the set. */
public boolean contains(char[] text, int len) {
/** Create set from a Collection of char[] or String */
public CharArraySet(Collection c, boolean ignoreCase) {
this(c.size(), ignoreCase);
addAll(c);
}
/** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
* are in the set */
public boolean contains(char[] text, int off, int len) {
return entries[getSlot(text, off, len)] != null;
}
/** true if the <code>CharSequence</code> is in the set */
public boolean contains(CharSequence cs) {
return entries[getSlot(cs)] != null;
}
private int getSlot(char[] text, int off, int len) {
int code = getHashCode(text, len);
int pos = code & mask;
int pos = code & (entries.length-1);
char[] text2 = entries[pos];
if (text2 != null && !equals(text, len, text2)) {
if (text2 != null && !equals(text, off, len, text2)) {
final int inc = ((code>>8)+code)|1;
do {
code += inc;
pos = code & mask;
pos = code & (entries.length-1);
text2 = entries[pos];
} while (text2 != null && !equals(text, len, text2));
} while (text2 != null && !equals(text, off, len, text2));
}
return text2 != null;
return pos;
}
/** Returns true if the String is in the set */
private int getSlot(CharSequence text) {
int code = getHashCode(text);
int pos = code & (entries.length-1);
char[] text2 = entries[pos];
if (text2 != null && !equals(text, text2)) {
final int inc = ((code>>8)+code)|1;
do {
code += inc;
pos = code & (entries.length-1);
text2 = entries[pos];
} while (text2 != null && !equals(text, text2));
}
return pos;
}
/** Add this CharSequence into the set */
public boolean add(CharSequence text) {
return add(text.toString()); // could be more efficient
}
/** Add this String into the set */
public void add(String text) {
add(text.toCharArray());
public boolean add(String text) {
return add(text.toCharArray());
}
/** Add this text into the set */
public void add(char[] text) {
/** Add this char[] directly to the set.
* If ignoreCase is true for this Set, the text array will be directly modified.
* The user should never modify this text array after calling this method.
*/
public boolean add(char[] text) {
if (ignoreCase)
for(int i=0;i<text.length;i++)
text[i] = Character.toLowerCase(text[i]);
int code = getHashCode(text, text.length);
int pos = code & mask;
char[] text2 = entries[pos];
if (text2 != null) {
final int inc = ((code>>8)+code)|1;
do {
code += inc;
pos = code & mask;
text2 = entries[pos];
} while (text2 != null);
}
entries[pos] = text;
int slot = getSlot(text, 0, text.length);
if (entries[slot] != null) return false;
entries[slot] = text;
count++;
if (((double) count)/entries.length > MAX_LOAD_FACTOR) {
if (count > entries.length + (entries.length>>2) ) {
rehash();
}
return true;
}
private boolean equals(char[] text1, int len, char[] text2) {
private boolean equals(char[] text1, int off, int len, char[] text2) {
if (len != text2.length)
return false;
for(int i=0;i<len;i++) {
if (ignoreCase) {
if (Character.toLowerCase(text1[i]) != text2[i])
for(int i=0;i<len;i++) {
if (Character.toLowerCase(text1[off+i]) != text2[i])
return false;
}
} else {
if (text1[i] != text2[i])
for(int i=0;i<len;i++) {
if (text1[off+i] != text2[i])
return false;
}
}
return true;
}
private boolean equals(CharSequence text1, char[] text2) {
int len = text1.length();
if (len != text2.length)
return false;
if (ignoreCase) {
for(int i=0;i<len;i++) {
if (Character.toLowerCase(text1.charAt(i)) != text2[i])
return false;
}
} else {
for(int i=0;i<len;i++) {
if (text1.charAt(i) != text2[i])
return false;
}
}
@ -111,39 +163,125 @@ final class CharArraySet {
private void rehash() {
final int newSize = 2*count;
mask = newSize-1;
char[][] oldEntries = entries;
char[][] entries = new char[newSize][];
char[][] newEntries = new char[newSize][];
for(int i=0;i<entries.length;i++) {
char[] text = entries[i];
for(int i=0;i<oldEntries.length;i++) {
char[] text = oldEntries[i];
if (text != null) {
int code = getHashCode(text, text.length);
int pos = code & mask;
if (newEntries[pos] != null) {
final int inc = ((code>>8)+code)|1;
do {
code += inc;
pos = code & mask;
} while (newEntries[pos] != null);
}
newEntries[pos] = text;
// todo: could be faster... no need to compare strings on collision
entries[ getSlot(text,0,text.length) ] = text;
}
}
entries = newEntries;
}
private int getHashCode(char[] text, int len) {
int downto = len;
int code = 0;
while (downto > 0) {
final char c;
if (ignoreCase)
c = Character.toLowerCase(text[--downto]);
else
c = text[--downto];
code = (code*31) + c;
if (ignoreCase) {
for (int i=0; i<len; i++) {
code = code*31 + Character.toLowerCase(text[i]);
}
} else {
for (int i=0; i<len; i++) {
code = code*31 + text[i];
}
}
return code;
}
private int getHashCode(CharSequence text) {
int code;
if (ignoreCase) {
code = 0;
int len = text.length();
for (int i=0; i<len; i++) {
code = code*31 + Character.toLowerCase(text.charAt(i));
}
} else {
if (false && text instanceof String) {
code = text.hashCode();
} else {
code = 0;
int len = text.length();
for (int i=0; i<len; i++) {
code = code*31 + text.charAt(i);
}
}
}
return code;
}
public int size() {
return count;
}
public boolean isEmpty() {
return count==0;
}
public boolean contains(Object o) {
if (o instanceof char[]) {
char[] text = (char[])o;
return contains(text, 0, text.length);
} else if (o instanceof CharSequence) {
return contains((CharSequence)o);
}
return false;
}
public boolean add(Object o) {
if (o instanceof char[]) {
return add((char[])o);
} else if (o instanceof String) {
return add((String)o);
} else if (o instanceof CharSequence) {
return add((CharSequence)o);
} else {
return add(o.toString());
}
}
/** The Iterator<String> for this set. Strings are constructed on the fly, so
* use <code>nextCharArray</code> for more efficient access. */
public class CharArraySetIterator implements Iterator {
int pos=-1;
char[] next;
CharArraySetIterator() {
goNext();
}
private void goNext() {
next = null;
pos++;
while (pos < entries.length && (next=entries[pos]) == null) pos++;
}
public boolean hasNext() {
return next != null;
}
/** do not modify the returned char[] */
public char[] nextCharArray() {
char[] ret = next;
goNext();
return ret;
}
/** Returns the next String, as a Set<String> would...
* use nextCharArray() for better efficiency. */
public Object next() {
return new String(nextCharArray());
}
public void remove() {
throw new UnsupportedOperationException();
}
}
public Iterator iterator() {
return new CharArraySetIterator();
}
}

View File

@ -18,7 +18,7 @@ package org.apache.lucene.analysis;
*/
import java.io.IOException;
import java.util.HashSet;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Set;
@ -29,7 +29,6 @@ import java.util.Set;
public final class StopFilter extends TokenFilter {
private final CharArraySet stopWords;
private final boolean ignoreCase;
/**
* Construct a token stream filtering the given input.
@ -45,32 +44,39 @@ public final class StopFilter extends TokenFilter {
*/
public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
super(in);
this.ignoreCase = ignoreCase;
this.stopWords = makeStopCharArraySet(stopWords, ignoreCase);
this.stopWords = (CharArraySet)makeStopSet(stopWords, ignoreCase);
}
/**
* Construct a token stream filtering the given input.
* If <code>stopWords</code> is an instance of {@link CharArraySet} (true if
* <code>makeStopSet()</code> was used to construct the set) it will be directly used
* and <code>ignoreCase</code> will be ignored since <code>CharArraySet</code>
* directly controls case sensitivity.
* <p/>
* If <code>stopWords</code> is not an instance of {@link CharArraySet},
* a new CharArraySet will be constructed and <code>ignoreCase</code> will be
* used to specify the case sensitivity of that set.
*
* @param input
* @param stopWords The set of Stop Words, as Strings. If ignoreCase is true, all strings should be lower cased
* @param ignoreCase -Ignore case when stopping. The stopWords set must be setup to contain only lower case words
* @param stopWords The set of Stop Words.
* @param ignoreCase -Ignore case when stopping.
*/
public StopFilter(TokenStream input, Set stopWords, boolean ignoreCase)
{
super(input);
this.ignoreCase = ignoreCase;
if (stopWords instanceof CharArraySet) {
this.stopWords = (CharArraySet)stopWords;
} else {
this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
Iterator it = stopWords.iterator();
while(it.hasNext())
this.stopWords.add((String) it.next());
this.stopWords.addAll(stopWords);
}
}
/**
* Constructs a filter which removes words from the input
* TokenStream that are named in the Set.
* It is crucial that an efficient Set implementation is used
* for maximum performance.
*
* @see #makeStopSet(java.lang.String[])
*/
@ -97,18 +103,9 @@ public final class StopFilter extends TokenFilter {
* @return a Set containing the words
*/
public static final Set makeStopSet(String[] stopWords, boolean ignoreCase) {
HashSet stopTable = new HashSet(stopWords.length);
for (int i = 0; i < stopWords.length; i++)
stopTable.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]);
return stopTable;
}
private static final CharArraySet makeStopCharArraySet(String[] stopWords, boolean ignoreCase) {
CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
for (int i = 0; i < stopWords.length; i++)
stopSet.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]);
return stopSet;
}
stopSet.addAll(Arrays.asList(stopWords));
return stopSet; }
/**
* Returns the next input Token whose termText() is not a stop word.
@ -116,7 +113,7 @@ public final class StopFilter extends TokenFilter {
public final Token next(Token result) throws IOException {
// return the first non-stop word found
while((result = input.next(result)) != null) {
if (!stopWords.contains(result.termBuffer(), result.termLength))
if (!stopWords.contains(result.termBuffer(), 0, result.termLength))
return result;
}
// reached EOS -- return null

View File

@ -16,10 +16,11 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.util.LuceneTestCase;
import java.util.Set;
/**
* @author yonik
@ -45,4 +46,14 @@ public class TestStopFilter extends LuceneTestCase {
assertEquals(null,stream.next());
}
public void testStopFilt() throws IOException {
StringReader reader = new StringReader("Now is The Time");
String[] stopWords = new String[] { "is", "the", "Time" };
Set stopSet = StopFilter.makeStopSet(stopWords);
TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopSet);
assertEquals("Now", stream.next().termText());
assertEquals("The", stream.next().termText());
assertEquals(null, stream.next());
}
}