mirror of https://github.com/apache/lucene.git
LUCENE-1040: new CharArraySet, make StopFilter directly use it
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@596484 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
49e85ee76d
commit
e395be3faf
|
@ -1,5 +1,4 @@
|
|||
Lucene Change Log
|
||||
|
||||
Lucene Change Log
|
||||
$Id$
|
||||
|
||||
======================= Trunk (not yet released) =======================
|
||||
|
@ -204,6 +203,10 @@ New features
|
|||
index changes. SegmentReader, MultiSegmentReader, MultiReader,
|
||||
and ParallelReader implement reopen(). (Michael Busch)
|
||||
|
||||
10. LUCENE-1040: CharArraySet useful for efficiently checking
|
||||
set membership of text specified by char[]. (yonik)
|
||||
|
||||
|
||||
Optimizations
|
||||
|
||||
1. LUCENE-937: CachingTokenFilter now uses an iterator to access the
|
||||
|
|
|
@ -1,5 +1,9 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
import java.util.AbstractSet;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -19,90 +23,138 @@ package org.apache.lucene.analysis;
|
|||
|
||||
|
||||
/**
|
||||
* A simple class that can store & retrieve char[]'s in a
|
||||
* A simple class that stores Strings as char[]'s in a
|
||||
* hash table. Note that this is not a general purpose
|
||||
* class. For example, it cannot remove char[]'s from the
|
||||
* class. For example, it cannot remove items from the
|
||||
* set, nor does it resize its hash table to be smaller,
|
||||
* etc. It is designed for use with StopFilter to enable
|
||||
* quick filtering based on the char[] termBuffer in a
|
||||
* Token.
|
||||
* etc. It is designed to be quick to test if a char[]
|
||||
* is in the set without the necessity of converting it
|
||||
* to a String first.
|
||||
*/
|
||||
|
||||
final class CharArraySet {
|
||||
|
||||
public class CharArraySet extends AbstractSet {
|
||||
private final static int INIT_SIZE = 8;
|
||||
private final static double MAX_LOAD_FACTOR = 0.75;
|
||||
private int mask;
|
||||
private char[][] entries;
|
||||
private int count;
|
||||
private boolean ignoreCase;
|
||||
private final boolean ignoreCase;
|
||||
|
||||
/** Create set with enough capacity to hold startSize
|
||||
* terms */
|
||||
public CharArraySet(int startSize, boolean ignoreCase) {
|
||||
this.ignoreCase = ignoreCase;
|
||||
int size = INIT_SIZE;
|
||||
while(((double) startSize)/size >= MAX_LOAD_FACTOR)
|
||||
size *= 2;
|
||||
mask = size-1;
|
||||
while(startSize + (startSize>>2) > size)
|
||||
size <<= 1;
|
||||
entries = new char[size][];
|
||||
}
|
||||
|
||||
/** Returns true if the characters in text up to length
|
||||
* len is present in the set. */
|
||||
public boolean contains(char[] text, int len) {
|
||||
/** Create set from a Collection of char[] or String */
|
||||
public CharArraySet(Collection c, boolean ignoreCase) {
|
||||
this(c.size(), ignoreCase);
|
||||
addAll(c);
|
||||
}
|
||||
|
||||
/** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
|
||||
* are in the set */
|
||||
public boolean contains(char[] text, int off, int len) {
|
||||
return entries[getSlot(text, off, len)] != null;
|
||||
}
|
||||
|
||||
/** true if the <code>CharSequence</code> is in the set */
|
||||
public boolean contains(CharSequence cs) {
|
||||
return entries[getSlot(cs)] != null;
|
||||
}
|
||||
|
||||
private int getSlot(char[] text, int off, int len) {
|
||||
int code = getHashCode(text, len);
|
||||
int pos = code & mask;
|
||||
int pos = code & (entries.length-1);
|
||||
char[] text2 = entries[pos];
|
||||
if (text2 != null && !equals(text, len, text2)) {
|
||||
if (text2 != null && !equals(text, off, len, text2)) {
|
||||
final int inc = ((code>>8)+code)|1;
|
||||
do {
|
||||
code += inc;
|
||||
pos = code & mask;
|
||||
pos = code & (entries.length-1);
|
||||
text2 = entries[pos];
|
||||
} while (text2 != null && !equals(text, len, text2));
|
||||
} while (text2 != null && !equals(text, off, len, text2));
|
||||
}
|
||||
return text2 != null;
|
||||
return pos;
|
||||
}
|
||||
|
||||
/** Returns true if the String is in the set */
|
||||
private int getSlot(CharSequence text) {
|
||||
int code = getHashCode(text);
|
||||
int pos = code & (entries.length-1);
|
||||
char[] text2 = entries[pos];
|
||||
if (text2 != null && !equals(text, text2)) {
|
||||
final int inc = ((code>>8)+code)|1;
|
||||
do {
|
||||
code += inc;
|
||||
pos = code & (entries.length-1);
|
||||
text2 = entries[pos];
|
||||
} while (text2 != null && !equals(text, text2));
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
/** Add this CharSequence into the set */
|
||||
public boolean add(CharSequence text) {
|
||||
return add(text.toString()); // could be more efficient
|
||||
}
|
||||
|
||||
/** Add this String into the set */
|
||||
public void add(String text) {
|
||||
add(text.toCharArray());
|
||||
public boolean add(String text) {
|
||||
return add(text.toCharArray());
|
||||
}
|
||||
|
||||
/** Add this text into the set */
|
||||
public void add(char[] text) {
|
||||
/** Add this char[] directly to the set.
|
||||
* If ignoreCase is true for this Set, the text array will be directly modified.
|
||||
* The user should never modify this text array after calling this method.
|
||||
*/
|
||||
public boolean add(char[] text) {
|
||||
if (ignoreCase)
|
||||
for(int i=0;i<text.length;i++)
|
||||
text[i] = Character.toLowerCase(text[i]);
|
||||
int code = getHashCode(text, text.length);
|
||||
int pos = code & mask;
|
||||
char[] text2 = entries[pos];
|
||||
if (text2 != null) {
|
||||
final int inc = ((code>>8)+code)|1;
|
||||
do {
|
||||
code += inc;
|
||||
pos = code & mask;
|
||||
text2 = entries[pos];
|
||||
} while (text2 != null);
|
||||
}
|
||||
entries[pos] = text;
|
||||
int slot = getSlot(text, 0, text.length);
|
||||
if (entries[slot] != null) return false;
|
||||
entries[slot] = text;
|
||||
count++;
|
||||
|
||||
if (((double) count)/entries.length > MAX_LOAD_FACTOR) {
|
||||
if (count > entries.length + (entries.length>>2) ) {
|
||||
rehash();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean equals(char[] text1, int len, char[] text2) {
|
||||
private boolean equals(char[] text1, int off, int len, char[] text2) {
|
||||
if (len != text2.length)
|
||||
return false;
|
||||
for(int i=0;i<len;i++) {
|
||||
if (ignoreCase) {
|
||||
if (Character.toLowerCase(text1[i]) != text2[i])
|
||||
for(int i=0;i<len;i++) {
|
||||
if (Character.toLowerCase(text1[off+i]) != text2[i])
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (text1[i] != text2[i])
|
||||
for(int i=0;i<len;i++) {
|
||||
if (text1[off+i] != text2[i])
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean equals(CharSequence text1, char[] text2) {
|
||||
int len = text1.length();
|
||||
if (len != text2.length)
|
||||
return false;
|
||||
if (ignoreCase) {
|
||||
for(int i=0;i<len;i++) {
|
||||
if (Character.toLowerCase(text1.charAt(i)) != text2[i])
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
for(int i=0;i<len;i++) {
|
||||
if (text1.charAt(i) != text2[i])
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -111,39 +163,125 @@ final class CharArraySet {
|
|||
|
||||
private void rehash() {
|
||||
final int newSize = 2*count;
|
||||
mask = newSize-1;
|
||||
char[][] oldEntries = entries;
|
||||
char[][] entries = new char[newSize][];
|
||||
|
||||
char[][] newEntries = new char[newSize][];
|
||||
for(int i=0;i<entries.length;i++) {
|
||||
char[] text = entries[i];
|
||||
for(int i=0;i<oldEntries.length;i++) {
|
||||
char[] text = oldEntries[i];
|
||||
if (text != null) {
|
||||
int code = getHashCode(text, text.length);
|
||||
int pos = code & mask;
|
||||
if (newEntries[pos] != null) {
|
||||
final int inc = ((code>>8)+code)|1;
|
||||
do {
|
||||
code += inc;
|
||||
pos = code & mask;
|
||||
} while (newEntries[pos] != null);
|
||||
}
|
||||
newEntries[pos] = text;
|
||||
// todo: could be faster... no need to compare strings on collision
|
||||
entries[ getSlot(text,0,text.length) ] = text;
|
||||
}
|
||||
}
|
||||
|
||||
entries = newEntries;
|
||||
}
|
||||
|
||||
private int getHashCode(char[] text, int len) {
|
||||
int downto = len;
|
||||
int code = 0;
|
||||
while (downto > 0) {
|
||||
final char c;
|
||||
if (ignoreCase)
|
||||
c = Character.toLowerCase(text[--downto]);
|
||||
else
|
||||
c = text[--downto];
|
||||
code = (code*31) + c;
|
||||
if (ignoreCase) {
|
||||
for (int i=0; i<len; i++) {
|
||||
code = code*31 + Character.toLowerCase(text[i]);
|
||||
}
|
||||
} else {
|
||||
for (int i=0; i<len; i++) {
|
||||
code = code*31 + text[i];
|
||||
}
|
||||
}
|
||||
return code;
|
||||
}
|
||||
|
||||
private int getHashCode(CharSequence text) {
|
||||
int code;
|
||||
if (ignoreCase) {
|
||||
code = 0;
|
||||
int len = text.length();
|
||||
for (int i=0; i<len; i++) {
|
||||
code = code*31 + Character.toLowerCase(text.charAt(i));
|
||||
}
|
||||
} else {
|
||||
if (false && text instanceof String) {
|
||||
code = text.hashCode();
|
||||
} else {
|
||||
code = 0;
|
||||
int len = text.length();
|
||||
for (int i=0; i<len; i++) {
|
||||
code = code*31 + text.charAt(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
return code;
|
||||
}
|
||||
|
||||
|
||||
public int size() {
|
||||
return count;
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return count==0;
|
||||
}
|
||||
|
||||
public boolean contains(Object o) {
|
||||
if (o instanceof char[]) {
|
||||
char[] text = (char[])o;
|
||||
return contains(text, 0, text.length);
|
||||
} else if (o instanceof CharSequence) {
|
||||
return contains((CharSequence)o);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean add(Object o) {
|
||||
if (o instanceof char[]) {
|
||||
return add((char[])o);
|
||||
} else if (o instanceof String) {
|
||||
return add((String)o);
|
||||
} else if (o instanceof CharSequence) {
|
||||
return add((CharSequence)o);
|
||||
} else {
|
||||
return add(o.toString());
|
||||
}
|
||||
}
|
||||
|
||||
/** The Iterator<String> for this set. Strings are constructed on the fly, so
|
||||
* use <code>nextCharArray</code> for more efficient access. */
|
||||
public class CharArraySetIterator implements Iterator {
|
||||
int pos=-1;
|
||||
char[] next;
|
||||
CharArraySetIterator() {
|
||||
goNext();
|
||||
}
|
||||
|
||||
private void goNext() {
|
||||
next = null;
|
||||
pos++;
|
||||
while (pos < entries.length && (next=entries[pos]) == null) pos++;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return next != null;
|
||||
}
|
||||
|
||||
/** do not modify the returned char[] */
|
||||
public char[] nextCharArray() {
|
||||
char[] ret = next;
|
||||
goNext();
|
||||
return ret;
|
||||
}
|
||||
|
||||
/** Returns the next String, as a Set<String> would...
|
||||
* use nextCharArray() for better efficiency. */
|
||||
public Object next() {
|
||||
return new String(nextCharArray());
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public Iterator iterator() {
|
||||
return new CharArraySetIterator();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -18,7 +18,7 @@ package org.apache.lucene.analysis;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
|
||||
|
@ -29,7 +29,6 @@ import java.util.Set;
|
|||
public final class StopFilter extends TokenFilter {
|
||||
|
||||
private final CharArraySet stopWords;
|
||||
private final boolean ignoreCase;
|
||||
|
||||
/**
|
||||
* Construct a token stream filtering the given input.
|
||||
|
@ -45,32 +44,39 @@ public final class StopFilter extends TokenFilter {
|
|||
*/
|
||||
public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
|
||||
super(in);
|
||||
this.ignoreCase = ignoreCase;
|
||||
this.stopWords = makeStopCharArraySet(stopWords, ignoreCase);
|
||||
this.stopWords = (CharArraySet)makeStopSet(stopWords, ignoreCase);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Construct a token stream filtering the given input.
|
||||
* If <code>stopWords</code> is an instance of {@link CharArraySet} (true if
|
||||
* <code>makeStopSet()</code> was used to construct the set) it will be directly used
|
||||
* and <code>ignoreCase</code> will be ignored since <code>CharArraySet</code>
|
||||
* directly controls case sensitivity.
|
||||
* <p/>
|
||||
* If <code>stopWords</code> is not an instance of {@link CharArraySet},
|
||||
* a new CharArraySet will be constructed and <code>ignoreCase</code> will be
|
||||
* used to specify the case sensitivity of that set.
|
||||
*
|
||||
* @param input
|
||||
* @param stopWords The set of Stop Words, as Strings. If ignoreCase is true, all strings should be lower cased
|
||||
* @param ignoreCase -Ignore case when stopping. The stopWords set must be setup to contain only lower case words
|
||||
* @param stopWords The set of Stop Words.
|
||||
* @param ignoreCase -Ignore case when stopping.
|
||||
*/
|
||||
public StopFilter(TokenStream input, Set stopWords, boolean ignoreCase)
|
||||
{
|
||||
super(input);
|
||||
this.ignoreCase = ignoreCase;
|
||||
if (stopWords instanceof CharArraySet) {
|
||||
this.stopWords = (CharArraySet)stopWords;
|
||||
} else {
|
||||
this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
|
||||
Iterator it = stopWords.iterator();
|
||||
while(it.hasNext())
|
||||
this.stopWords.add((String) it.next());
|
||||
this.stopWords.addAll(stopWords);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a filter which removes words from the input
|
||||
* TokenStream that are named in the Set.
|
||||
* It is crucial that an efficient Set implementation is used
|
||||
* for maximum performance.
|
||||
*
|
||||
* @see #makeStopSet(java.lang.String[])
|
||||
*/
|
||||
|
@ -97,18 +103,9 @@ public final class StopFilter extends TokenFilter {
|
|||
* @return a Set containing the words
|
||||
*/
|
||||
public static final Set makeStopSet(String[] stopWords, boolean ignoreCase) {
|
||||
HashSet stopTable = new HashSet(stopWords.length);
|
||||
for (int i = 0; i < stopWords.length; i++)
|
||||
stopTable.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]);
|
||||
return stopTable;
|
||||
}
|
||||
|
||||
private static final CharArraySet makeStopCharArraySet(String[] stopWords, boolean ignoreCase) {
|
||||
CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
|
||||
for (int i = 0; i < stopWords.length; i++)
|
||||
stopSet.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]);
|
||||
return stopSet;
|
||||
}
|
||||
stopSet.addAll(Arrays.asList(stopWords));
|
||||
return stopSet; }
|
||||
|
||||
/**
|
||||
* Returns the next input Token whose termText() is not a stop word.
|
||||
|
@ -116,7 +113,7 @@ public final class StopFilter extends TokenFilter {
|
|||
public final Token next(Token result) throws IOException {
|
||||
// return the first non-stop word found
|
||||
while((result = input.next(result)) != null) {
|
||||
if (!stopWords.contains(result.termBuffer(), result.termLength))
|
||||
if (!stopWords.contains(result.termBuffer(), 0, result.termLength))
|
||||
return result;
|
||||
}
|
||||
// reached EOS -- return null
|
||||
|
|
|
@ -16,10 +16,11 @@ package org.apache.lucene.analysis;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* @author yonik
|
||||
|
@ -45,4 +46,14 @@ public class TestStopFilter extends LuceneTestCase {
|
|||
assertEquals(null,stream.next());
|
||||
}
|
||||
|
||||
public void testStopFilt() throws IOException {
|
||||
StringReader reader = new StringReader("Now is The Time");
|
||||
String[] stopWords = new String[] { "is", "the", "Time" };
|
||||
Set stopSet = StopFilter.makeStopSet(stopWords);
|
||||
TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopSet);
|
||||
assertEquals("Now", stream.next().termText());
|
||||
assertEquals("The", stream.next().termText());
|
||||
assertEquals(null, stream.next());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue