mirror of https://github.com/apache/lucene.git
LUCENE-1040: new CharArraySet, make StopFilter directly use it
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@596484 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
49e85ee76d
commit
e395be3faf
|
@ -1,5 +1,4 @@
|
||||||
Lucene Change Log
|
Lucene Change Log
|
||||||
|
|
||||||
$Id$
|
$Id$
|
||||||
|
|
||||||
======================= Trunk (not yet released) =======================
|
======================= Trunk (not yet released) =======================
|
||||||
|
@ -204,6 +203,10 @@ New features
|
||||||
index changes. SegmentReader, MultiSegmentReader, MultiReader,
|
index changes. SegmentReader, MultiSegmentReader, MultiReader,
|
||||||
and ParallelReader implement reopen(). (Michael Busch)
|
and ParallelReader implement reopen(). (Michael Busch)
|
||||||
|
|
||||||
|
10. LUCENE-1040: CharArraySet useful for efficiently checking
|
||||||
|
set membership of text specified by char[]. (yonik)
|
||||||
|
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
1. LUCENE-937: CachingTokenFilter now uses an iterator to access the
|
1. LUCENE-937: CachingTokenFilter now uses an iterator to access the
|
||||||
|
|
|
@ -1,5 +1,9 @@
|
||||||
package org.apache.lucene.analysis;
|
package org.apache.lucene.analysis;
|
||||||
|
|
||||||
|
import java.util.AbstractSet;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
@ -19,90 +23,138 @@ package org.apache.lucene.analysis;
|
||||||
|
|
||||||
|
|
||||||
/**
 * A simple class that stores Strings as char[]'s in a
 * hash table.  Note that this is not a general purpose
 * class.  For example, it cannot remove items from the
 * set, nor does it resize its hash table to be smaller,
 * etc.  It is designed to be quick to test if a char[]
 * is in the set without the necessity of converting it
 * to a String first.
 * <p>
 * The table uses open addressing.  Its length is always a power of
 * two, so a hash code is reduced to a slot with a bit mask, and the
 * table is grown before it fills up, so every probe sequence is
 * guaranteed to reach either the requested entry or a free slot.
 */

public class CharArraySet extends AbstractSet {
  /** Smallest table length; must be a power of two. */
  private final static int INIT_SIZE = 8;

  /** The hash table; <code>null</code> marks a free slot. */
  private char[][] entries;

  /** Number of occupied slots in <code>entries</code>. */
  private int count;

  /** When true, text is lower-cased both on insertion and on lookup. */
  private final boolean ignoreCase;

  /** Create set with enough capacity to hold startSize
   *  terms */
  public CharArraySet(int startSize, boolean ignoreCase) {
    this.ignoreCase = ignoreCase;
    int size = INIT_SIZE;
    // leave 25% headroom over startSize, keeping the length a power of two
    while (startSize + (startSize>>2) > size) {
      size <<= 1;
    }
    entries = new char[size][];
  }

  /** Create set from a Collection of char[] or String */
  public CharArraySet(Collection c, boolean ignoreCase) {
    this(c.size(), ignoreCase);
    addAll(c);
  }

  /** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
   * are in the set */
  public boolean contains(char[] text, int off, int len) {
    return entries[getSlot(text, off, len)] != null;
  }

  /** true if the <code>CharSequence</code> is in the set */
  public boolean contains(CharSequence cs) {
    return entries[getSlot(cs)] != null;
  }

  /**
   * Returns the index of the slot that holds the <code>len</code> chars of
   * <code>text</code> starting at <code>off</code>, or, if they are not in
   * the set, the index of the free slot where they would be stored.
   */
  private int getSlot(char[] text, int off, int len) {
    // hash exactly the region being looked up, not text[0..len)
    int code = getHashCode(text, off, len);
    int pos = code & (entries.length-1);
    char[] text2 = entries[pos];
    if (text2 != null && !equals(text, off, len, text2)) {
      // an odd increment visits every slot of a power-of-two sized table
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        pos = code & (entries.length-1);
        text2 = entries[pos];
      } while (text2 != null && !equals(text, off, len, text2));
    }
    return pos;
  }

  /**
   * Returns the index of the slot that holds <code>text</code>, or, if it is
   * not in the set, the index of the free slot where it would be stored.
   */
  private int getSlot(CharSequence text) {
    int code = getHashCode(text);
    int pos = code & (entries.length-1);
    char[] text2 = entries[pos];
    if (text2 != null && !equals(text, text2)) {
      // an odd increment visits every slot of a power-of-two sized table
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        pos = code & (entries.length-1);
        text2 = entries[pos];
      } while (text2 != null && !equals(text, text2));
    }
    return pos;
  }

  /** Add this CharSequence into the set.
   * @return true if the set did not already contain the text */
  public boolean add(CharSequence text) {
    return add(text.toString()); // could be more efficient
  }

  /** Add this String into the set.
   * @return true if the set did not already contain the text */
  public boolean add(String text) {
    return add(text.toCharArray());
  }

  /** Add this char[] directly to the set.
   * If ignoreCase is true for this Set, the text array will be directly modified.
   * The user should never modify this text array after calling this method.
   * @return true if the set did not already contain the text
   */
  public boolean add(char[] text) {
    if (ignoreCase) {
      for (int i=0; i<text.length; i++) {
        text[i] = Character.toLowerCase(text[i]);
      }
    }
    int slot = getSlot(text, 0, text.length);
    if (entries[slot] != null) {
      return false;
    }
    entries[slot] = text;
    count++;

    // Grow once the table is more than 80% full.  This keeps at least one
    // slot free at all times, which is what lets getSlot() terminate.
    if (count + (count>>2) > entries.length) {
      rehash();
    }

    return true;
  }

  /** true if <code>text1[off..off+len)</code> matches the stored entry
   * <code>text2</code>; <code>text1</code> is lower-cased first when
   * ignoreCase is set (stored entries are already lower case). */
  private boolean equals(char[] text1, int off, int len, char[] text2) {
    if (len != text2.length) {
      return false;
    }
    if (ignoreCase) {
      for (int i=0; i<len; i++) {
        if (Character.toLowerCase(text1[off+i]) != text2[i]) {
          return false;
        }
      }
    } else {
      for (int i=0; i<len; i++) {
        if (text1[off+i] != text2[i]) {
          return false;
        }
      }
    }
    return true;
  }

  /** true if <code>text1</code> matches the stored entry <code>text2</code>;
   * <code>text1</code> is lower-cased first when ignoreCase is set. */
  private boolean equals(CharSequence text1, char[] text2) {
    int len = text1.length();
    if (len != text2.length) {
      return false;
    }
    if (ignoreCase) {
      for (int i=0; i<len; i++) {
        if (Character.toLowerCase(text1.charAt(i)) != text2[i]) {
          return false;
        }
      }
    } else {
      for (int i=0; i<len; i++) {
        if (text1.charAt(i) != text2[i]) {
          return false;
        }
      }
    }
    return true;
  }

  /** Doubles the table (so its length stays a power of two) and re-inserts
   * every entry into the new table. */
  private void rehash() {
    final char[][] oldEntries = entries;
    // assign the field itself: getSlot() below must probe the new table
    entries = new char[oldEntries.length << 1][];

    for (int i=0; i<oldEntries.length; i++) {
      final char[] text = oldEntries[i];
      if (text != null) {
        // todo: could be faster... no need to compare strings on collision
        entries[ getSlot(text,0,text.length) ] = text;
      }
    }
  }

  /** Hash of <code>text[off..off+len)</code>, lower-cased when ignoreCase is
   * set.  Must agree with {@link #getHashCode(CharSequence)} for equal text. */
  private int getHashCode(char[] text, int off, int len) {
    int code = 0;
    final int stop = off + len;
    if (ignoreCase) {
      for (int i=off; i<stop; i++) {
        code = code*31 + Character.toLowerCase(text[i]);
      }
    } else {
      for (int i=off; i<stop; i++) {
        code = code*31 + text[i];
      }
    }
    return code;
  }

  /** Hash of <code>text</code>, lower-cased when ignoreCase is set.  Must
   * agree with {@link #getHashCode(char[], int, int)} for equal text. */
  private int getHashCode(CharSequence text) {
    int code = 0;
    final int len = text.length();
    if (ignoreCase) {
      for (int i=0; i<len; i++) {
        code = code*31 + Character.toLowerCase(text.charAt(i));
      }
    } else {
      for (int i=0; i<len; i++) {
        code = code*31 + text.charAt(i);
      }
    }
    return code;
  }


  /** Number of entries in the set. */
  public int size() {
    return count;
  }

  /** true if the set has no entries. */
  public boolean isEmpty() {
    return count==0;
  }

  /** true if <code>o</code> is a char[] or CharSequence whose text is in the
   * set; any other kind of object (including null) is never contained. */
  public boolean contains(Object o) {
    if (o instanceof char[]) {
      char[] text = (char[])o;
      return contains(text, 0, text.length);
    } else if (o instanceof CharSequence) {
      return contains((CharSequence)o);
    }
    return false;
  }

  /** Adds <code>o</code> to the set: a char[] is stored directly (see
   * {@link #add(char[])}), anything else is added via its String form.
   * @return true if the set did not already contain the text */
  public boolean add(Object o) {
    if (o instanceof char[]) {
      return add((char[])o);
    } else if (o instanceof String) {
      return add((String)o);
    } else if (o instanceof CharSequence) {
      return add((CharSequence)o);
    } else {
      return add(o.toString());
    }
  }

  /** The Iterator of Strings for this set.  Strings are constructed on the fly, so
   * use <code>nextCharArray</code> for more efficient access. */
  public class CharArraySetIterator implements Iterator {
    /** Table index of <code>next</code>, or entries.length once exhausted. */
    int pos=-1;
    /** Entry the next call will return, or null once exhausted. */
    char[] next;

    CharArraySetIterator() {
      goNext();
    }

    /** Advances to the next occupied slot, leaving <code>next</code> null
     * when there is none. */
    private void goNext() {
      next = null;
      pos++;
      while (pos < entries.length && (next=entries[pos]) == null) {
        pos++;
      }
    }

    public boolean hasNext() {
      return next != null;
    }

    /** Returns the next entry, or null when the iteration is exhausted.
     * Do not modify the returned char[]. */
    public char[] nextCharArray() {
      char[] ret = next;
      goNext();
      return ret;
    }

    /** Returns the next String, as a Set of Strings would...
     * use nextCharArray() for better efficiency.
     * @throws java.util.NoSuchElementException if the iteration is exhausted */
    public Object next() {
      if (next == null) {
        throw new java.util.NoSuchElementException();
      }
      return new String(nextCharArray());
    }

    /** Not supported: entries cannot be removed from this set. */
    public void remove() {
      throw new UnsupportedOperationException();
    }
  }


  /** Returns a {@link CharArraySetIterator} over the entries of this set. */
  public Iterator iterator() {
    return new CharArraySetIterator();
  }

}
|
||||||
|
|
|
@ -18,7 +18,7 @@ package org.apache.lucene.analysis;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
import java.util.Arrays;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
@ -29,7 +29,6 @@ import java.util.Set;
|
||||||
public final class StopFilter extends TokenFilter {
|
public final class StopFilter extends TokenFilter {
|
||||||
|
|
||||||
private final CharArraySet stopWords;
|
private final CharArraySet stopWords;
|
||||||
private final boolean ignoreCase;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Construct a token stream filtering the given input.
|
* Construct a token stream filtering the given input.
|
||||||
|
@ -45,32 +44,39 @@ public final class StopFilter extends TokenFilter {
|
||||||
*/
|
*/
|
||||||
public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
|
public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
|
||||||
super(in);
|
super(in);
|
||||||
this.ignoreCase = ignoreCase;
|
this.stopWords = (CharArraySet)makeStopSet(stopWords, ignoreCase);
|
||||||
this.stopWords = makeStopCharArraySet(stopWords, ignoreCase);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Construct a token stream filtering the given input.
|
* Construct a token stream filtering the given input.
|
||||||
|
* If <code>stopWords</code> is an instance of {@link CharArraySet} (true if
|
||||||
|
* <code>makeStopSet()</code> was used to construct the set) it will be directly used
|
||||||
|
* and <code>ignoreCase</code> will be ignored since <code>CharArraySet</code>
|
||||||
|
* directly controls case sensitivity.
|
||||||
|
* <p/>
|
||||||
|
* If <code>stopWords</code> is not an instance of {@link CharArraySet},
|
||||||
|
* a new CharArraySet will be constructed and <code>ignoreCase</code> will be
|
||||||
|
* used to specify the case sensitivity of that set.
|
||||||
|
*
|
||||||
* @param input
|
* @param input
|
||||||
* @param stopWords The set of Stop Words, as Strings. If ignoreCase is true, all strings should be lower cased
|
* @param stopWords The set of Stop Words.
|
||||||
* @param ignoreCase -Ignore case when stopping. The stopWords set must be setup to contain only lower case words
|
* @param ignoreCase -Ignore case when stopping.
|
||||||
*/
|
*/
|
||||||
public StopFilter(TokenStream input, Set stopWords, boolean ignoreCase)
|
public StopFilter(TokenStream input, Set stopWords, boolean ignoreCase)
|
||||||
{
|
{
|
||||||
super(input);
|
super(input);
|
||||||
this.ignoreCase = ignoreCase;
|
if (stopWords instanceof CharArraySet) {
|
||||||
|
this.stopWords = (CharArraySet)stopWords;
|
||||||
|
} else {
|
||||||
this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
|
this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
|
||||||
Iterator it = stopWords.iterator();
|
this.stopWords.addAll(stopWords);
|
||||||
while(it.hasNext())
|
}
|
||||||
this.stopWords.add((String) it.next());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructs a filter which removes words from the input
|
* Constructs a filter which removes words from the input
|
||||||
* TokenStream that are named in the Set.
|
* TokenStream that are named in the Set.
|
||||||
* It is crucial that an efficient Set implementation is used
|
|
||||||
* for maximum performance.
|
|
||||||
*
|
*
|
||||||
* @see #makeStopSet(java.lang.String[])
|
* @see #makeStopSet(java.lang.String[])
|
||||||
*/
|
*/
|
||||||
|
@ -97,18 +103,9 @@ public final class StopFilter extends TokenFilter {
|
||||||
* @return a Set containing the words
|
* @return a Set containing the words
|
||||||
*/
|
*/
|
||||||
public static final Set makeStopSet(String[] stopWords, boolean ignoreCase) {
|
public static final Set makeStopSet(String[] stopWords, boolean ignoreCase) {
|
||||||
HashSet stopTable = new HashSet(stopWords.length);
|
|
||||||
for (int i = 0; i < stopWords.length; i++)
|
|
||||||
stopTable.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]);
|
|
||||||
return stopTable;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static final CharArraySet makeStopCharArraySet(String[] stopWords, boolean ignoreCase) {
|
|
||||||
CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
|
CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
|
||||||
for (int i = 0; i < stopWords.length; i++)
|
stopSet.addAll(Arrays.asList(stopWords));
|
||||||
stopSet.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]);
|
return stopSet; }
|
||||||
return stopSet;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the next input Token whose termText() is not a stop word.
|
* Returns the next input Token whose termText() is not a stop word.
|
||||||
|
@ -116,7 +113,7 @@ public final class StopFilter extends TokenFilter {
|
||||||
public final Token next(Token result) throws IOException {
|
public final Token next(Token result) throws IOException {
|
||||||
// return the first non-stop word found
|
// return the first non-stop word found
|
||||||
while((result = input.next(result)) != null) {
|
while((result = input.next(result)) != null) {
|
||||||
if (!stopWords.contains(result.termBuffer(), result.termLength))
|
if (!stopWords.contains(result.termBuffer(), 0, result.termLength))
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
// reached EOS -- return null
|
// reached EOS -- return null
|
||||||
|
|
|
@ -16,10 +16,11 @@ package org.apache.lucene.analysis;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
import java.util.Set;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author yonik
|
* @author yonik
|
||||||
|
@ -45,4 +46,14 @@ public class TestStopFilter extends LuceneTestCase {
|
||||||
assertEquals(null,stream.next());
|
assertEquals(null,stream.next());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testStopFilt() throws IOException {
|
||||||
|
StringReader reader = new StringReader("Now is The Time");
|
||||||
|
String[] stopWords = new String[] { "is", "the", "Time" };
|
||||||
|
Set stopSet = StopFilter.makeStopSet(stopWords);
|
||||||
|
TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopSet);
|
||||||
|
assertEquals("Now", stream.next().termText());
|
||||||
|
assertEquals("The", stream.next().termText());
|
||||||
|
assertEquals(null, stream.next());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue