LUCENE-4838: Add BytesRefHash.find()

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1457400 13f79535-47bb-0310-9956-ffa450edef68
2013-03-17 08:58:51 +00:00 · 2013-03-17 08:58:51 +00:00 · 0f3af7cec9
parent efaef82383
commit 0f3af7cec9
2 changed files with 147 additions and 79 deletions
--- a/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java
+++ b/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java
@ -30,8 +30,8 @@ import org.apache.lucene.util.ByteBlockPool.DirectAllocator;
 /**
 * {@link BytesRefHash} is a special purpose hash-map like data-structure
 * optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of
- * byte arrays to ordinal (Map&lt;BytesRef,int&gt;) storing the hashed bytes
- * efficiently in continuous storage. The mapping to the ordinal is
+ * byte arrays to ids (Map&lt;BytesRef,int&gt;) storing the hashed bytes
+ * efficiently in continuous storage. The mapping to the id is
 * encapsulated inside {@link BytesRefHash} and is guaranteed to be increased
 * for each added {@link BytesRef}.
 * 
@ -58,7 +58,7 @@ public final class BytesRefHash {
  private int hashMask;
  private int count;
  private int lastCount = -1;
-  private int[] ords;
+  private int[] ids;
  private final BytesStartArray bytesStartArray;
  private Counter bytesUsed;

@ -86,8 +86,8 @@ public final class BytesRefHash {
    hashHalfSize = hashSize >> 1;
    hashMask = hashSize - 1;
    this.pool = pool;
-    ords = new int[hashSize];
-    Arrays.fill(ords, -1);
+    ids = new int[hashSize];
+    Arrays.fill(ids, -1);
    this.bytesStartArray = bytesStartArray;
    bytesStart = bytesStartArray.init();
    bytesUsed = bytesStartArray.bytesUsed() == null? Counter.newCounter() : bytesStartArray.bytesUsed();
@ -104,26 +104,29 @@ public final class BytesRefHash {
  }

  /**
-   * Populates and returns a {@link BytesRef} with the bytes for the given ord.
+   * Populates and returns a {@link BytesRef} with the bytes for the given
+   * bytesID.
   * <p>
-   * Note: the given ord must be a positive integer less that the current size (
-   * {@link #size()})
-   * </p>
+   * Note: the given bytesID must be a positive integer less than the current
+   * size ({@link #size()})
   * 
-   * @param ord the ord
-   * @param ref the {@link BytesRef} to populate
+   * @param bytesID
+   *          the id
+   * @param ref
+   *          the {@link BytesRef} to populate
   * 
-   * @return the given BytesRef instance populated with the bytes for the given ord
+   * @return the given BytesRef instance populated with the bytes for the given
+   *         bytesID
   */
-  public BytesRef get(int ord, BytesRef ref) {
+  public BytesRef get(int bytesID, BytesRef ref) {
    assert bytesStart != null : "bytesStart is null - not initialized";
-    assert ord < bytesStart.length: "ord exceeds byteStart len: " + bytesStart.length;
-    pool.setBytesRef(ref, bytesStart[ord]);
+    assert bytesID < bytesStart.length: "bytesID exceeds byteStart len: " + bytesStart.length;
+    pool.setBytesRef(ref, bytesStart[bytesID]);
    return ref;
  }

  /**
-   * Returns the ords array in arbitrary order. Valid ords start at offset of 0
+   * Returns the ids array in arbitrary order. Valid ids start at offset of 0
   * and end at a limit of {@link #size()} - 1
   * <p>
   * Note: This is a destructive operation. {@link #clear()} must be called in
@ -131,13 +134,13 @@ public final class BytesRefHash {
   * </p>
   */
  int[] compact() {
-    assert bytesStart != null : "Bytesstart is null - not initialized";
+    assert bytesStart != null : "bytesStart is null - not initialized";
    int upto = 0;
    for (int i = 0; i < hashSize; i++) {
-      if (ords[i] != -1) {
+      if (ids[i] != -1) {
        if (upto < i) {
-          ords[upto] = ords[i];
-          ords[i] = -1;
+          ids[upto] = ids[i];
+          ids[i] = -1;
        }
        upto++;
      }
@ -145,7 +148,7 @@ public final class BytesRefHash {

    assert upto == count;
    lastCount = count;
-    return ords;
+    return ids;
  }

  /**
@ -170,25 +173,25 @@ public final class BytesRefHash {
      
      @Override
      protected int compare(int i, int j) {
-        final int ord1 = compact[i], ord2 = compact[j];
-        assert bytesStart.length > ord1 && bytesStart.length > ord2;
-        pool.setBytesRef(scratch1, bytesStart[ord1]);
-        pool.setBytesRef(scratch2, bytesStart[ord2]);
+        final int id1 = compact[i], id2 = compact[j];
+        assert bytesStart.length > id1 && bytesStart.length > id2;
+        pool.setBytesRef(scratch1, bytesStart[id1]);
+        pool.setBytesRef(scratch2, bytesStart[id2]);
        return comp.compare(scratch1, scratch2);
      }

      @Override
      protected void setPivot(int i) {
-        final int ord = compact[i];
-        assert bytesStart.length > ord;
-        pool.setBytesRef(pivot, bytesStart[ord]);
+        final int id = compact[i];
+        assert bytesStart.length > id;
+        pool.setBytesRef(pivot, bytesStart[id]);
      }
  
      @Override
      protected int comparePivot(int j) {
-        final int ord = compact[j];
-        assert bytesStart.length > ord;
-        pool.setBytesRef(scratch2, bytesStart[ord]);
+        final int id = compact[j];
+        assert bytesStart.length > id;
+        pool.setBytesRef(scratch2, bytesStart[id]);
        return comp.compare(pivot, scratch2);
      }
      
@ -198,8 +201,8 @@ public final class BytesRefHash {
    return compact;
  }

-  private boolean equals(int ord, BytesRef b) {
-    pool.setBytesRef(scratch1, bytesStart[ord]);
+  private boolean equals(int id, BytesRef b) {
+    pool.setBytesRef(scratch1, bytesStart[id]);
    return scratch1.bytesEquals(b);
  }

@ -213,8 +216,8 @@ public final class BytesRefHash {
    if (newSize != hashSize) {
      bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -(hashSize - newSize));
      hashSize = newSize;
-      ords = new int[hashSize];
-      Arrays.fill(ords, -1);
+      ids = new int[hashSize];
+      Arrays.fill(ids, -1);
      hashHalfSize = newSize / 2;
      hashMask = newSize - 1;
      return true;
@ -237,7 +240,7 @@ public final class BytesRefHash {
      // shrink clears the hash entries
      return;
    }
-    Arrays.fill(ords, -1);
+    Arrays.fill(ids, -1);
  }

  public void clear() {
@ -249,7 +252,7 @@ public final class BytesRefHash {
   */
  public void close() {
    clear(true);
-    ords = null;
+    ids = null;
    bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -hashSize);
  }

@ -258,8 +261,8 @@ public final class BytesRefHash {
   * 
   * @param bytes
   *          the bytes to hash
-   * @return the ord the given bytes are hashed if there was no mapping for the
-   *         given bytes, otherwise <code>(-(ord)-1)</code>. This guarantees
+   * @return the id the given bytes are hashed if there was no mapping for the
+   *         given bytes, otherwise <code>(-(id)-1)</code>. This guarantees
   *         that the return value will always be &gt;= 0 if the given bytes
   *         haven't been hashed before.
   * 
@ -289,8 +292,8 @@ public final class BytesRefHash {
   * }
   * </pre>
   * 
-   * @return the ord the given bytes are hashed if there was no mapping for the
-   *         given bytes, otherwise <code>(-(ord)-1)</code>. This guarantees
+   * @return the id the given bytes are hashed if there was no mapping for the
+   *         given bytes, otherwise <code>(-(id)-1)</code>. This guarantees
   *         that the return value will always be &gt;= 0 if the given bytes
   *         haven't been hashed before.
   * 
@ -302,18 +305,8 @@ public final class BytesRefHash {
    assert bytesStart != null : "Bytesstart is null - not initialized";
    final int length = bytes.length;
    // final position
-    int hashPos = code & hashMask;
-    int e = ords[hashPos];
-    if (e != -1 && !equals(e, bytes)) {
-      // Conflict: keep searching different locations in
-      // the hash table.
-      final int inc = ((code >> 8) + code) | 1;
-      do {
-        code += inc;
-        hashPos = code & hashMask;
-        e = ords[hashPos];
-      } while (e != -1 && !equals(e, bytes));
-    }
+    final int hashPos = findHash(bytes, code);
+    int e = ids[hashPos];
    
    if (e == -1) {
      // new entry
@ -355,8 +348,8 @@ public final class BytesRefHash {
        System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2,
            length);
      }
-      assert ords[hashPos] == -1;
-      ords[hashPos] = e;
+      assert ids[hashPos] == -1;
+      ids[hashPos] = e;

      if (count == hashHalfSize) {
        rehash(2 * hashSize, true);
@ -366,12 +359,55 @@ public final class BytesRefHash {
    return -(e + 1);
  }
  
+  /**
+   * Returns the id of the given {@link BytesRef}.
+   * 
+   * @see #find(BytesRef, int)
+   */
+  public int find(BytesRef bytes) {
+    return find(bytes, bytes.hashCode());
+  }
+
+  /**
+   * Returns the id of the given {@link BytesRef} with a pre-calculated hash code.
+   * 
+   * @param bytes
+   *          the bytes to look for
+   * @param code
+   *          the bytes hash code
+   * 
+   * @return the id of the given bytes, or {@code -1} if there is no mapping for the
+   *         given bytes.
+   */
+  public int find(BytesRef bytes, int code) {
+    return ids[findHash(bytes, code)];
+  }
+  
+  private final int findHash(BytesRef bytes, int code) {
+    assert bytesStart != null : "bytesStart is null - not initialized";
+    // final position
+    int hashPos = code & hashMask;
+    int e = ids[hashPos];
+    if (e != -1 && !equals(e, bytes)) {
+      // Conflict: keep searching different locations in
+      // the hash table.
+      final int inc = ((code >> 8) + code) | 1;
+      do {
+        code += inc;
+        hashPos = code & hashMask;
+        e = ids[hashPos];
+      } while (e != -1 && !equals(e, bytes));
+    }
+    
+    return hashPos;
+  }
+
  public int addByPoolOffset(int offset) {
    assert bytesStart != null : "Bytesstart is null - not initialized";
    // final position
    int code = offset;
    int hashPos = offset & hashMask;
-    int e = ords[hashPos];
+    int e = ids[hashPos];
    if (e != -1 && bytesStart[e] != offset) {
      // Conflict: keep searching different locations in
      // the hash table.
@ -379,7 +415,7 @@ public final class BytesRefHash {
      do {
        code += inc;
        hashPos = code & hashMask;
-        e = ords[hashPos];
+        e = ids[hashPos];
      } while (e != -1 && bytesStart[e] != offset);
    }
    if (e == -1) {
@ -391,8 +427,8 @@ public final class BytesRefHash {
      }
      e = count++;
      bytesStart[e] = offset;
-      assert ords[hashPos] == -1;
-      ords[hashPos] = e;
+      assert ids[hashPos] == -1;
+      ids[hashPos] = e;

      if (count == hashHalfSize) {
        rehash(2 * hashSize, false);
@ -412,7 +448,7 @@ public final class BytesRefHash {
    final int[] newHash = new int[newSize];
    Arrays.fill(newHash, -1);
    for (int i = 0; i < hashSize; i++) {
-      final int e0 = ords[i];
+      final int e0 = ids[i];
      if (e0 != -1) {
        int code;
        if (hashOnData) {
@ -453,8 +489,8 @@ public final class BytesRefHash {
    }

    hashMask = newMask;
-    bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ords.length));
-    ords = newHash;
+    bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ids.length));
+    ids = newHash;
    hashSize = newSize;
    hashHalfSize = newSize / 2;
  }
@ -469,25 +505,25 @@ public final class BytesRefHash {
      bytesStart = bytesStartArray.init();
    }
    
-    if (ords == null) {
-      ords = new int[hashSize];
+    if (ids == null) {
+      ids = new int[hashSize];
      bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * hashSize);
    }
  }

  /**
   * Returns the bytesStart offset into the internally used
-   * {@link ByteBlockPool} for the given ord
+   * {@link ByteBlockPool} for the given bytesID
   * 
-   * @param ord
-   *          the ord to look up
+   * @param bytesID
+   *          the id to look up
   * @return the bytesStart offset into the internally used
-   *         {@link ByteBlockPool} for the given ord
+   *         {@link ByteBlockPool} for the given id
   */
-  public int byteStart(int ord) {
-    assert bytesStart != null : "Bytesstart is null - not initialized";
-    assert ord >= 0 && ord < count : ord;
-    return bytesStart[ord];
+  public int byteStart(int bytesID) {
+    assert bytesStart != null : "bytesStart is null - not initialized";
+    assert bytesID >= 0 && bytesID < count : bytesID;
+    return bytesStart[bytesID];
  }

  /**
--- a/lucene/core/src/test/org/apache/lucene/util/TestBytesRefHash.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestBytesRefHash.java
@ -30,16 +30,11 @@ import org.apache.lucene.util.BytesRefHash.MaxBytesLengthExceededException;
 import org.junit.Before;
 import org.junit.Test;

-/**
- *
- */
 public class TestBytesRefHash extends LuceneTestCase {

  BytesRefHash hash;
  ByteBlockPool pool;
  
-  /**
-   */
  @Override
  @Before
  public void setUp() throws Exception {
@ -249,6 +244,43 @@ public class TestBytesRefHash extends LuceneTestCase {
    }
  }
  
+  @Test
+  public void testFind() throws Exception {
+    BytesRef ref = new BytesRef();
+    BytesRef scratch = new BytesRef();
+    int num = atLeast(2);
+    for (int j = 0; j < num; j++) {
+      Set<String> strings = new HashSet<String>();
+      int uniqueCount = 0;
+      for (int i = 0; i < 797; i++) {
+        String str;
+        do {
+          str = _TestUtil.randomRealisticUnicodeString(random(), 1000);
+        } while (str.length() == 0);
+        ref.copyChars(str);
+        int count = hash.size();
+        int key = hash.find(ref); //hash.add(ref);
+        if (key >= 0) { // string found in hash
+          assertFalse(strings.add(str));
+          assertTrue(key < count);
+          assertEquals(str, hash.get(key, scratch).utf8ToString());
+          assertEquals(count, hash.size());
+        } else {
+          key = hash.add(ref);
+          assertTrue(strings.add(str));
+          assertEquals(uniqueCount, key);
+          assertEquals(hash.size(), count + 1);
+          uniqueCount++;
+        }
+      }
+      
+      assertAllIn(strings, hash);
+      hash.clear();
+      assertEquals(0, hash.size());
+      hash.reinit();
+    }
+  }
+
  @Test(expected = MaxBytesLengthExceededException.class)
  public void testLargeValue() {
    int[] sizes = new int[] { random().nextInt(5),