LUCENE-1040: new CharArraySet, make StopFilter directly use it

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@596484 13f79535-47bb-0310-9956-ffa450edef68
2007-11-19 23:23:04 +00:00 · 2007-11-19 23:23:04 +00:00 · e395be3faf
parent 49e85ee76d
commit e395be3faf
4 changed files with 246 additions and 97 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -1,5 +1,4 @@
-Lucene Change Log
+Lucene Change Log
 $Id$
 ======================= Trunk (not yet released) =======================
@ -204,6 +203,10 @@ New features
    index changes. SegmentReader, MultiSegmentReader, MultiReader,
    and ParallelReader implement reopen(). (Michael Busch) 
 10. LUCENE-1040: CharArraySet useful for efficiently checking
    set membership of text specified by char[]. (yonik)
 Optimizations
 1. LUCENE-937: CachingTokenFilter now uses an iterator to access the 
--- a/src/java/org/apache/lucene/analysis/CharArraySet.java
+++ b/src/java/org/apache/lucene/analysis/CharArraySet.java
@ -1,5 +1,9 @@
 package org.apache.lucene.analysis;
 import java.util.AbstractSet;
 import java.util.Collection;
 import java.util.Iterator;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
@ -19,90 +23,138 @@ package org.apache.lucene.analysis;
 /**
- * A simple class that can store & retrieve char[]'s in a
+ * A simple class that stores Strings as char[]'s in a
 * hash table.  Note that this is not a general purpose
- * class.  For example, it cannot remove char[]'s from the
+ * class.  For example, it cannot remove items from the
 * set, nor does it resize its hash table to be smaller,
- * etc.  It is designed for use with StopFilter to enable
+ * etc.  It is designed to be quick to test if a char[]
- * quick filtering based on the char[] termBuffer in a
+ * is in the set without the necessity of converting it
- * Token.
+ * to a String first.
 */
-final class CharArraySet {
+public class CharArraySet extends AbstractSet {
  private final static int INIT_SIZE = 8;
  private final static double MAX_LOAD_FACTOR = 0.75;
  private int mask;
  private char[][] entries;
  private int count;
-  private boolean ignoreCase;
+  private final boolean ignoreCase;
  /** Create set with enough capacity to hold startSize
   *  terms */
  public CharArraySet(int startSize, boolean ignoreCase) {
    this.ignoreCase = ignoreCase;
    int size = INIT_SIZE;
-    while(((double) startSize)/size >= MAX_LOAD_FACTOR)
+    while(startSize + (startSize>>2) > size)
-      size *= 2;
+      size <<= 1;
    mask = size-1;
    entries = new char[size][];
  }
-  /** Returns true if the characters in text up to length
+ /** Create set from a Collection of char[] or String */
-   *  len is present in the set. */
+  public CharArraySet(Collection c, boolean ignoreCase) {
-  public boolean contains(char[] text, int len) {
+    this(c.size(), ignoreCase);
    addAll(c);
  }
  /**
   * Tests whether the <code>len</code> chars of <code>text</code> beginning at
   * <code>off</code> are present in the set.
   */
  public boolean contains(char[] text, int off, int len) {
    final int slot = getSlot(text, off, len);
    // getSlot stops probing on an empty slot or on a matching entry,
    // so a non-null occupant means the text was found.
    final char[] occupant = entries[slot];
    return occupant != null;
  }
  /** Tests whether the characters of the given <code>CharSequence</code> are present in the set. */
  public boolean contains(CharSequence cs) {
    final int slot = getSlot(cs);
    // an occupied slot here is a match; an empty one means "not present"
    final char[] occupant = entries[slot];
    return occupant != null;
  }
  private int getSlot(char[] text, int off, int len) {
    int code = getHashCode(text, len);
-    int pos = code & mask;
+    int pos = code & (entries.length-1);
    char[] text2 = entries[pos];
-    if (text2 != null && !equals(text, len, text2)) {
+    if (text2 != null && !equals(text, off, len, text2)) {
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
-        pos = code & mask;
+        pos = code & (entries.length-1);
        text2 = entries[pos];
-      } while (text2 != null && !equals(text, len, text2));
+      } while (text2 != null && !equals(text, off, len, text2));
    }
-    return text2 != null;
+    return pos;
  }
  /**
   * Finds the table slot for <code>text</code>: the index of the entry equal to it,
   * or, when there is no such entry, the index of the first empty slot reached
   * while probing.  Callers tell the two cases apart by checking whether
   * <code>entries[slot]</code> is null.
   */
  private int getSlot(CharSequence text) {
    int hash = getHashCode(text);
    int slot = hash & (entries.length-1);
    char[] candidate = entries[slot];
    if (candidate == null || equals(text, candidate)) {
      // first probe is either free or already the match
      return slot;
    }
    // Collision: keep stepping by an odd increment derived from the hash
    // until an empty slot or an equal entry turns up.
    final int step = ((hash>>8)+hash)|1;
    do {
      hash += step;
      slot = hash & (entries.length-1);
      candidate = entries[slot];
    } while (candidate != null && !equals(text, candidate));
    return slot;
  }
  /**
   * Adds the given CharSequence to the set by way of its String form.
   * Returns whatever <code>add(String)</code> reports for that String.
   */
  public boolean add(CharSequence text) {
    final String asString = text.toString(); // could be more efficient
    return add(asString);
  }
  /** Add this String into the set */
-  public void add(String text) {
+  public boolean add(String text) {
-    add(text.toCharArray());
+    return add(text.toCharArray());
  }
-  /** Add this text into the set */
+  /** Add this char[] directly to the set.
-  public void add(char[] text) {
+   * If ignoreCase is true for this Set, the text array will be directly modified.
   * The user should never modify this text array after calling this method.
   */
  public boolean add(char[] text) {
    if (ignoreCase)
      for(int i=0;i<text.length;i++)
        text[i] = Character.toLowerCase(text[i]);
-    int code = getHashCode(text, text.length);
+    int slot = getSlot(text, 0, text.length);
-    int pos = code & mask;
+    if (entries[slot] != null) return false;
-    char[] text2 = entries[pos];
+    entries[slot] = text;
    if (text2 != null) {
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        pos = code & mask;
        text2 = entries[pos];
      } while (text2 != null);
    }
    entries[pos] = text;
    count++;
-    if (((double) count)/entries.length > MAX_LOAD_FACTOR) {
+    if (count > entries.length + (entries.length>>2) ) {
      rehash();
    }
    return true;
  }
-  private boolean equals(char[] text1, int len, char[] text2) {
+  private boolean equals(char[] text1, int off, int len, char[] text2) {
    if (len != text2.length)
      return false;
-    for(int i=0;i<len;i++) {
+    if (ignoreCase) {
-      if (ignoreCase) {
+      for(int i=0;i<len;i++) {
-        if (Character.toLowerCase(text1[i]) != text2[i])
+        if (Character.toLowerCase(text1[off+i]) != text2[i])
          return false;
-      } else {
+      }
-        if (text1[i] != text2[i])
+    } else {
      for(int i=0;i<len;i++) {
        if (text1[off+i] != text2[i])
          return false;
      }
    }
    return true;
  }
  private boolean equals(CharSequence text1, char[] text2) {
    int len = text1.length();
    if (len != text2.length)
      return false;
    if (ignoreCase) {
      for(int i=0;i<len;i++) {
        if (Character.toLowerCase(text1.charAt(i)) != text2[i])
          return false;
      }
    } else {
      for(int i=0;i<len;i++) {
        if (text1.charAt(i) != text2[i])
          return false;
      }
    }
@ -111,39 +163,125 @@ final class CharArraySet {
  private void rehash() {
    final int newSize = 2*count;
-    mask = newSize-1;
+    char[][] oldEntries = entries;
    char[][] entries = new char[newSize][];
-    char[][] newEntries = new char[newSize][];
+    for(int i=0;i<oldEntries.length;i++) {
-    for(int i=0;i<entries.length;i++) {
+      char[] text = oldEntries[i];
      char[] text = entries[i];
      if (text != null) {
-        int code = getHashCode(text, text.length);
+        // todo: could be faster... no need to compare strings on collision
-        int pos = code & mask;
+        entries[ getSlot(text,0,text.length) ] = text;
        if (newEntries[pos] != null) {
          final int inc = ((code>>8)+code)|1;
          do {
            code += inc;
            pos = code & mask;
          } while (newEntries[pos] != null);
        }
        newEntries[pos] = text;
      }
    }
    entries = newEntries;
  }
  private int getHashCode(char[] text, int len) {
    int downto = len;
    int code = 0;
-    while (downto > 0) {
+    if (ignoreCase) {
-      final char c;
+      for (int i=0; i<len; i++) {
-      if (ignoreCase)
+        code = code*31 + Character.toLowerCase(text[i]);
-        c = Character.toLowerCase(text[--downto]);
+      }
-      else
+    } else {
-        c = text[--downto];
+      for (int i=0; i<len; i++) {
-      code = (code*31) + c;
+        code = code*31 + text[i];
      }
    }
    return code;
  }
  /**
   * Computes the hash code used to place <code>text</code> in the table:
   * <code>code = code*31 + c</code> over the characters in order, starting from 0.
   * When <code>ignoreCase</code> is set, each character is lower-cased before
   * it is folded in, so differently-cased spellings hash alike.
   */
  private int getHashCode(CharSequence text) {
    final int len = text.length();
    int code = 0;
    if (ignoreCase) {
      for (int i=0; i<len; i++) {
        code = code*31 + Character.toLowerCase(text.charAt(i));
      }
    } else {
      // The former "false && text instanceof String" shortcut could never
      // execute, so every CharSequence is hashed by the explicit loop.
      for (int i=0; i<len; i++) {
        code = code*31 + text.charAt(i);
      }
    }
    return code;
  }
  /** Returns the current value of <code>count</code>, the number of entries added to this set. */
  public int size() {
    return count;
  }
  /** Returns true when no entry has been added, i.e. <code>count</code> is zero. */
  public boolean isEmpty() {
    return count==0;
  }
  /**
   * Set-style membership test.  A <code>char[]</code> is looked up over its full
   * length and a <code>CharSequence</code> by its characters; any other argument
   * (including <code>null</code>) is reported as absent.
   */
  public boolean contains(Object o) {
    if (o instanceof CharSequence) {
      return contains((CharSequence)o);
    }
    if (!(o instanceof char[])) {
      return false;
    }
    final char[] chars = (char[])o;
    return contains(chars, 0, chars.length);
  }
  /**
   * Set-style add.  Dispatches on the runtime type of <code>o</code> to the
   * matching typed overload: <code>char[]</code>, then <code>String</code>, then
   * <code>CharSequence</code>.  Any other object is added via its
   * <code>toString()</code> value.
   */
  public boolean add(Object o) {
    if (o instanceof char[]) {
      return add((char[])o);
    }
    if (o instanceof String) {
      return add((String)o);
    }
    if (o instanceof CharSequence) {
      return add((CharSequence)o);
    }
    return add(o.toString());
  }
  /** The <code>Iterator&lt;String&gt;</code> for this set.  Strings are constructed on the fly, so
   * use <code>nextCharArray</code> for more efficient access. */
  public class CharArraySetIterator implements Iterator {
    /** index of the table slot last examined; starts at -1 so the first advance looks at slot 0 */
    int pos=-1;
    /** the entry the following call will hand out, or null once the table is exhausted */
    char[] next;
    CharArraySetIterator() {
      // position on the first occupied slot right away so hasNext() is accurate
      goNext();
    }
    /** Moves <code>pos</code> forward to the next non-empty slot and caches its
     * entry in <code>next</code>; leaves <code>next</code> null if none remains. */
    private void goNext() {
      next = null;
      pos++;
      while (pos < entries.length && (next=entries[pos]) == null) pos++;
    }
    /** true while another entry remains to be returned */
    public boolean hasNext() {
      return next != null;
    }
    /** Returns the next entry as the stored char[], or null when the iteration
     * is exhausted.  Do not modify the returned char[]. */
    public char[] nextCharArray() {
      char[] ret = next;
      goNext();
      return ret;
    }
    /** Returns the next String, as a <code>Set&lt;String&gt;</code> would...
     * use nextCharArray() for better efficiency.
     * @throws java.util.NoSuchElementException if no entry remains */
    public Object next() {
      if (next == null) {
        // Honor the Iterator contract instead of failing with a
        // NullPointerException inside new String(char[]).
        throw new java.util.NoSuchElementException();
      }
      return new String(nextCharArray());
    }
    /** Removal is not supported by this iterator.
     * @throws UnsupportedOperationException always */
    public void remove() {
      throw new UnsupportedOperationException();
    }
  }
  /** Returns a new {@link CharArraySetIterator} over this set; its <code>next()</code>
   * yields Strings, while <code>nextCharArray()</code> yields the stored char[]s. */
  public Iterator iterator() {
    return new CharArraySetIterator();
  }
 }
--- a/src/java/org/apache/lucene/analysis/StopFilter.java
+++ b/src/java/org/apache/lucene/analysis/StopFilter.java
@ -18,7 +18,7 @@ package org.apache.lucene.analysis;
 */
 import java.io.IOException;
-import java.util.HashSet;
+import java.util.Arrays;
 import java.util.Iterator;
 import java.util.Set;
@ -29,7 +29,6 @@ import java.util.Set;
 public final class StopFilter extends TokenFilter {
  private final CharArraySet stopWords;
  private final boolean ignoreCase;
  /**
   * Construct a token stream filtering the given input.
@ -45,32 +44,39 @@ public final class StopFilter extends TokenFilter {
   */
  public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
    super(in);
-    this.ignoreCase = ignoreCase;
+    this.stopWords = (CharArraySet)makeStopSet(stopWords, ignoreCase);
    this.stopWords = makeStopCharArraySet(stopWords, ignoreCase);
  }
  /**
   * Construct a token stream filtering the given input.
   * If <code>stopWords</code> is an instance of {@link CharArraySet} (true if
   * <code>makeStopSet()</code> was used to construct the set) it will be directly used
   * and <code>ignoreCase</code> will be ignored since <code>CharArraySet</code>
   * directly controls case sensitivity.
   * <p/>
   * If <code>stopWords</code> is not an instance of {@link CharArraySet},
   * a new CharArraySet will be constructed and <code>ignoreCase</code> will be
   * used to specify the case sensitivity of that set.
   *
   * @param input
-   * @param stopWords The set of Stop Words, as Strings.  If ignoreCase is true, all strings should be lower cased
+   * @param stopWords The set of Stop Words.
-   * @param ignoreCase -Ignore case when stopping.  The stopWords set must be setup to contain only lower case words 
+   * @param ignoreCase -Ignore case when stopping.
   */
  public StopFilter(TokenStream input, Set stopWords, boolean ignoreCase)
  {
    super(input);
-    this.ignoreCase = ignoreCase;
+    if (stopWords instanceof CharArraySet) {
-    this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
+      this.stopWords = (CharArraySet)stopWords;
-    Iterator it = stopWords.iterator();
+    } else {
-    while(it.hasNext())
+      this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
-      this.stopWords.add((String) it.next());
+      this.stopWords.addAll(stopWords);
    }
  }
  /**
   * Constructs a filter which removes words from the input
   * TokenStream that are named in the Set.
   * It is crucial that an efficient Set implementation is used
   * for maximum performance.
   *
   * @see #makeStopSet(java.lang.String[])
   */
@ -97,18 +103,9 @@ public final class StopFilter extends TokenFilter {
   * @return a Set containing the words
   */    
  public static final Set makeStopSet(String[] stopWords, boolean ignoreCase) {
    // Build a CharArraySet rather than a HashSet: StopFilter casts this result
    // to CharArraySet and can then use the set directly.  Case handling is left
    // to the set itself (it lower-cases entries when ignoreCase is true), so the
    // words are passed through unchanged here.
    CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
    stopSet.addAll(Arrays.asList(stopWords));
    return stopSet;
  }
  private static final CharArraySet makeStopCharArraySet(String[] stopWords, boolean ignoreCase) {
    CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
-    for (int i = 0; i < stopWords.length; i++)
+    stopSet.addAll(Arrays.asList(stopWords));
-      stopSet.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]);
+    return stopSet;  }
    return stopSet;
  }
  /**
   * Returns the next input Token whose termText() is not a stop word.
@ -116,7 +113,7 @@ public final class StopFilter extends TokenFilter {
  public final Token next(Token result) throws IOException {
    // return the first non-stop word found
    while((result = input.next(result)) != null) {
-      if (!stopWords.contains(result.termBuffer(), result.termLength))
+      if (!stopWords.contains(result.termBuffer(), 0, result.termLength))
        return result;
    }
    // reached EOS -- return null
--- a/src/test/org/apache/lucene/analysis/TestStopFilter.java
+++ b/src/test/org/apache/lucene/analysis/TestStopFilter.java
@ -16,10 +16,11 @@ package org.apache.lucene.analysis;
 * limitations under the License.
 */
 import org.apache.lucene.util.LuceneTestCase;
 import java.io.IOException;
 import java.io.StringReader;
-
+import java.util.Set;
 import org.apache.lucene.util.LuceneTestCase;
 /**
 * @author yonik
@ -45,4 +46,14 @@ public class TestStopFilter extends LuceneTestCase {
    assertEquals(null,stream.next());
  }
  /**
   * Runs a StopFilter built from a pre-made stop set over "Now is The Time"
   * and expects only "Now" and "The" to come through: "is" and "Time" match
   * stop words exactly, while "The" differs in case from the stop word "the".
   */
  public void testStopFilt() throws IOException {
    final String[] words = new String[] { "is", "the", "Time" };
    final Set stops = StopFilter.makeStopSet(words);
    final StringReader input = new StringReader("Now is The Time");
    final TokenStream filtered = new StopFilter(new WhitespaceTokenizer(input), stops);
    // surviving tokens, in stream order
    assertEquals("Now", filtered.next().termText());
    assertEquals("The", filtered.next().termText());
    // stream must be exhausted after the two survivors
    assertEquals(null, filtered.next());
  }
 }