LUCENE-4838: Add BytesRefHash.find()

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1457400 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Shai Erera 2013-03-17 08:58:51 +00:00
parent efaef82383
commit 0f3af7cec9
2 changed files with 147 additions and 79 deletions

View File

@ -30,8 +30,8 @@ import org.apache.lucene.util.ByteBlockPool.DirectAllocator;
/** /**
* {@link BytesRefHash} is a special purpose hash-map like data-structure * {@link BytesRefHash} is a special purpose hash-map like data-structure
* optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of * optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of
* byte arrays to ordinal (Map<BytesRef,int>) storing the hashed bytes * byte arrays to ids (Map<BytesRef,int>) storing the hashed bytes
* efficiently in continuous storage. The mapping to the ordinal is * efficiently in continuous storage. The mapping to the id is
* encapsulated inside {@link BytesRefHash} and is guaranteed to be increased * encapsulated inside {@link BytesRefHash} and is guaranteed to be increased
* for each added {@link BytesRef}. * for each added {@link BytesRef}.
* *
@ -58,7 +58,7 @@ public final class BytesRefHash {
private int hashMask; private int hashMask;
private int count; private int count;
private int lastCount = -1; private int lastCount = -1;
private int[] ords; private int[] ids;
private final BytesStartArray bytesStartArray; private final BytesStartArray bytesStartArray;
private Counter bytesUsed; private Counter bytesUsed;
@ -86,8 +86,8 @@ public final class BytesRefHash {
hashHalfSize = hashSize >> 1; hashHalfSize = hashSize >> 1;
hashMask = hashSize - 1; hashMask = hashSize - 1;
this.pool = pool; this.pool = pool;
ords = new int[hashSize]; ids = new int[hashSize];
Arrays.fill(ords, -1); Arrays.fill(ids, -1);
this.bytesStartArray = bytesStartArray; this.bytesStartArray = bytesStartArray;
bytesStart = bytesStartArray.init(); bytesStart = bytesStartArray.init();
bytesUsed = bytesStartArray.bytesUsed() == null? Counter.newCounter() : bytesStartArray.bytesUsed(); bytesUsed = bytesStartArray.bytesUsed() == null? Counter.newCounter() : bytesStartArray.bytesUsed();
@ -104,26 +104,29 @@ public final class BytesRefHash {
} }
/** /**
* Populates and returns a {@link BytesRef} with the bytes for the given ord. * Populates and returns a {@link BytesRef} with the bytes for the given
* bytesID.
* <p> * <p>
* Note: the given ord must be a positive integer less that the current size ( * Note: the given bytesID must be a positive integer less than the current
* {@link #size()}) * size ({@link #size()})
* </p>
*
* @param ord the ord
* @param ref the {@link BytesRef} to populate
* *
* @return the given BytesRef instance populated with the bytes for the given ord * @param bytesID
* the id
* @param ref
* the {@link BytesRef} to populate
*
* @return the given BytesRef instance populated with the bytes for the given
* bytesID
*/ */
public BytesRef get(int ord, BytesRef ref) { public BytesRef get(int bytesID, BytesRef ref) {
assert bytesStart != null : "bytesStart is null - not initialized"; assert bytesStart != null : "bytesStart is null - not initialized";
assert ord < bytesStart.length: "ord exceeds byteStart len: " + bytesStart.length; assert bytesID < bytesStart.length: "bytesID exceeds byteStart len: " + bytesStart.length;
pool.setBytesRef(ref, bytesStart[ord]); pool.setBytesRef(ref, bytesStart[bytesID]);
return ref; return ref;
} }
/** /**
* Returns the ords array in arbitrary order. Valid ords start at offset of 0 * Returns the ids array in arbitrary order. Valid ids start at offset of 0
* and end at a limit of {@link #size()} - 1 * and end at a limit of {@link #size()} - 1
* <p> * <p>
* Note: This is a destructive operation. {@link #clear()} must be called in * Note: This is a destructive operation. {@link #clear()} must be called in
@ -131,13 +134,13 @@ public final class BytesRefHash {
* </p> * </p>
*/ */
int[] compact() { int[] compact() {
assert bytesStart != null : "Bytesstart is null - not initialized"; assert bytesStart != null : "bytesStart is null - not initialized";
int upto = 0; int upto = 0;
for (int i = 0; i < hashSize; i++) { for (int i = 0; i < hashSize; i++) {
if (ords[i] != -1) { if (ids[i] != -1) {
if (upto < i) { if (upto < i) {
ords[upto] = ords[i]; ids[upto] = ids[i];
ords[i] = -1; ids[i] = -1;
} }
upto++; upto++;
} }
@ -145,7 +148,7 @@ public final class BytesRefHash {
assert upto == count; assert upto == count;
lastCount = count; lastCount = count;
return ords; return ids;
} }
/** /**
@ -170,25 +173,25 @@ public final class BytesRefHash {
@Override @Override
protected int compare(int i, int j) { protected int compare(int i, int j) {
final int ord1 = compact[i], ord2 = compact[j]; final int id1 = compact[i], id2 = compact[j];
assert bytesStart.length > ord1 && bytesStart.length > ord2; assert bytesStart.length > id1 && bytesStart.length > id2;
pool.setBytesRef(scratch1, bytesStart[ord1]); pool.setBytesRef(scratch1, bytesStart[id1]);
pool.setBytesRef(scratch2, bytesStart[ord2]); pool.setBytesRef(scratch2, bytesStart[id2]);
return comp.compare(scratch1, scratch2); return comp.compare(scratch1, scratch2);
} }
@Override @Override
protected void setPivot(int i) { protected void setPivot(int i) {
final int ord = compact[i]; final int id = compact[i];
assert bytesStart.length > ord; assert bytesStart.length > id;
pool.setBytesRef(pivot, bytesStart[ord]); pool.setBytesRef(pivot, bytesStart[id]);
} }
@Override @Override
protected int comparePivot(int j) { protected int comparePivot(int j) {
final int ord = compact[j]; final int id = compact[j];
assert bytesStart.length > ord; assert bytesStart.length > id;
pool.setBytesRef(scratch2, bytesStart[ord]); pool.setBytesRef(scratch2, bytesStart[id]);
return comp.compare(pivot, scratch2); return comp.compare(pivot, scratch2);
} }
@ -198,8 +201,8 @@ public final class BytesRefHash {
return compact; return compact;
} }
private boolean equals(int ord, BytesRef b) { private boolean equals(int id, BytesRef b) {
pool.setBytesRef(scratch1, bytesStart[ord]); pool.setBytesRef(scratch1, bytesStart[id]);
return scratch1.bytesEquals(b); return scratch1.bytesEquals(b);
} }
@ -213,8 +216,8 @@ public final class BytesRefHash {
if (newSize != hashSize) { if (newSize != hashSize) {
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -(hashSize - newSize)); bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -(hashSize - newSize));
hashSize = newSize; hashSize = newSize;
ords = new int[hashSize]; ids = new int[hashSize];
Arrays.fill(ords, -1); Arrays.fill(ids, -1);
hashHalfSize = newSize / 2; hashHalfSize = newSize / 2;
hashMask = newSize - 1; hashMask = newSize - 1;
return true; return true;
@ -237,7 +240,7 @@ public final class BytesRefHash {
// shrink clears the hash entries // shrink clears the hash entries
return; return;
} }
Arrays.fill(ords, -1); Arrays.fill(ids, -1);
} }
public void clear() { public void clear() {
@ -249,7 +252,7 @@ public final class BytesRefHash {
*/ */
public void close() { public void close() {
clear(true); clear(true);
ords = null; ids = null;
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -hashSize); bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -hashSize);
} }
@ -258,8 +261,8 @@ public final class BytesRefHash {
* *
* @param bytes * @param bytes
* the bytes to hash * the bytes to hash
* @return the ord the given bytes are hashed if there was no mapping for the * @return the id the given bytes are hashed if there was no mapping for the
* given bytes, otherwise <code>(-(ord)-1)</code>. This guarantees * given bytes, otherwise <code>(-(id)-1)</code>. This guarantees
* that the return value will always be &gt;= 0 if the given bytes * that the return value will always be &gt;= 0 if the given bytes
* haven't been hashed before. * haven't been hashed before.
* *
@ -289,8 +292,8 @@ public final class BytesRefHash {
* } * }
* </pre> * </pre>
* *
* @return the ord the given bytes are hashed if there was no mapping for the * @return the id the given bytes are hashed if there was no mapping for the
* given bytes, otherwise <code>(-(ord)-1)</code>. This guarantees * given bytes, otherwise <code>(-(id)-1)</code>. This guarantees
* that the return value will always be &gt;= 0 if the given bytes * that the return value will always be &gt;= 0 if the given bytes
* haven't been hashed before. * haven't been hashed before.
* *
@ -302,19 +305,9 @@ public final class BytesRefHash {
assert bytesStart != null : "Bytesstart is null - not initialized"; assert bytesStart != null : "Bytesstart is null - not initialized";
final int length = bytes.length; final int length = bytes.length;
// final position // final position
int hashPos = code & hashMask; final int hashPos = findHash(bytes, code);
int e = ords[hashPos]; int e = ids[hashPos];
if (e != -1 && !equals(e, bytes)) {
// Conflict: keep searching different locations in
// the hash table.
final int inc = ((code >> 8) + code) | 1;
do {
code += inc;
hashPos = code & hashMask;
e = ords[hashPos];
} while (e != -1 && !equals(e, bytes));
}
if (e == -1) { if (e == -1) {
// new entry // new entry
final int len2 = 2 + bytes.length; final int len2 = 2 + bytes.length;
@ -355,8 +348,8 @@ public final class BytesRefHash {
System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2, System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2,
length); length);
} }
assert ords[hashPos] == -1; assert ids[hashPos] == -1;
ords[hashPos] = e; ids[hashPos] = e;
if (count == hashHalfSize) { if (count == hashHalfSize) {
rehash(2 * hashSize, true); rehash(2 * hashSize, true);
@ -365,13 +358,56 @@ public final class BytesRefHash {
} }
return -(e + 1); return -(e + 1);
} }
/**
* Returns the id of the given {@link BytesRef}.
*
* @see #find(BytesRef, int)
*/
public int find(BytesRef bytes) {
return find(bytes, bytes.hashCode());
}
/**
* Returns the id of the given {@link BytesRef} with a pre-calculated hash code.
*
* @param bytes
* the bytes to look for
* @param code
* the bytes hash code
*
* @return the id of the given bytes, or {@code -1} if there is no mapping for the
* given bytes.
*/
public int find(BytesRef bytes, int code) {
return ids[findHash(bytes, code)];
}
private final int findHash(BytesRef bytes, int code) {
assert bytesStart != null : "bytesStart is null - not initialized";
// final position
int hashPos = code & hashMask;
int e = ids[hashPos];
if (e != -1 && !equals(e, bytes)) {
// Conflict: keep searching different locations in
// the hash table.
final int inc = ((code >> 8) + code) | 1;
do {
code += inc;
hashPos = code & hashMask;
e = ids[hashPos];
} while (e != -1 && !equals(e, bytes));
}
return hashPos;
}
public int addByPoolOffset(int offset) { public int addByPoolOffset(int offset) {
assert bytesStart != null : "Bytesstart is null - not initialized"; assert bytesStart != null : "Bytesstart is null - not initialized";
// final position // final position
int code = offset; int code = offset;
int hashPos = offset & hashMask; int hashPos = offset & hashMask;
int e = ords[hashPos]; int e = ids[hashPos];
if (e != -1 && bytesStart[e] != offset) { if (e != -1 && bytesStart[e] != offset) {
// Conflict: keep searching different locations in // Conflict: keep searching different locations in
// the hash table. // the hash table.
@ -379,7 +415,7 @@ public final class BytesRefHash {
do { do {
code += inc; code += inc;
hashPos = code & hashMask; hashPos = code & hashMask;
e = ords[hashPos]; e = ids[hashPos];
} while (e != -1 && bytesStart[e] != offset); } while (e != -1 && bytesStart[e] != offset);
} }
if (e == -1) { if (e == -1) {
@ -391,8 +427,8 @@ public final class BytesRefHash {
} }
e = count++; e = count++;
bytesStart[e] = offset; bytesStart[e] = offset;
assert ords[hashPos] == -1; assert ids[hashPos] == -1;
ords[hashPos] = e; ids[hashPos] = e;
if (count == hashHalfSize) { if (count == hashHalfSize) {
rehash(2 * hashSize, false); rehash(2 * hashSize, false);
@ -412,7 +448,7 @@ public final class BytesRefHash {
final int[] newHash = new int[newSize]; final int[] newHash = new int[newSize];
Arrays.fill(newHash, -1); Arrays.fill(newHash, -1);
for (int i = 0; i < hashSize; i++) { for (int i = 0; i < hashSize; i++) {
final int e0 = ords[i]; final int e0 = ids[i];
if (e0 != -1) { if (e0 != -1) {
int code; int code;
if (hashOnData) { if (hashOnData) {
@ -453,8 +489,8 @@ public final class BytesRefHash {
} }
hashMask = newMask; hashMask = newMask;
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ords.length)); bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ids.length));
ords = newHash; ids = newHash;
hashSize = newSize; hashSize = newSize;
hashHalfSize = newSize / 2; hashHalfSize = newSize / 2;
} }
@ -469,25 +505,25 @@ public final class BytesRefHash {
bytesStart = bytesStartArray.init(); bytesStart = bytesStartArray.init();
} }
if (ords == null) { if (ids == null) {
ords = new int[hashSize]; ids = new int[hashSize];
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * hashSize); bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * hashSize);
} }
} }
/** /**
* Returns the bytesStart offset into the internally used * Returns the bytesStart offset into the internally used
* {@link ByteBlockPool} for the given ord * {@link ByteBlockPool} for the given bytesID
* *
* @param ord * @param bytesID
* the ord to look up * the id to look up
* @return the bytesStart offset into the internally used * @return the bytesStart offset into the internally used
* {@link ByteBlockPool} for the given ord * {@link ByteBlockPool} for the given id
*/ */
public int byteStart(int ord) { public int byteStart(int bytesID) {
assert bytesStart != null : "Bytesstart is null - not initialized"; assert bytesStart != null : "bytesStart is null - not initialized";
assert ord >= 0 && ord < count : ord; assert bytesID >= 0 && bytesID < count : bytesID;
return bytesStart[ord]; return bytesStart[bytesID];
} }
/** /**

View File

@ -30,16 +30,11 @@ import org.apache.lucene.util.BytesRefHash.MaxBytesLengthExceededException;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
/**
*
*/
public class TestBytesRefHash extends LuceneTestCase { public class TestBytesRefHash extends LuceneTestCase {
BytesRefHash hash; BytesRefHash hash;
ByteBlockPool pool; ByteBlockPool pool;
/**
*/
@Override @Override
@Before @Before
public void setUp() throws Exception { public void setUp() throws Exception {
@ -248,6 +243,43 @@ public class TestBytesRefHash extends LuceneTestCase {
hash.reinit(); hash.reinit();
} }
} }
@Test
public void testFind() throws Exception {
BytesRef ref = new BytesRef();
BytesRef scratch = new BytesRef();
int num = atLeast(2);
for (int j = 0; j < num; j++) {
Set<String> strings = new HashSet<String>();
int uniqueCount = 0;
for (int i = 0; i < 797; i++) {
String str;
do {
str = _TestUtil.randomRealisticUnicodeString(random(), 1000);
} while (str.length() == 0);
ref.copyChars(str);
int count = hash.size();
int key = hash.find(ref); //hash.add(ref);
if (key >= 0) { // string found in hash
assertFalse(strings.add(str));
assertTrue(key < count);
assertEquals(str, hash.get(key, scratch).utf8ToString());
assertEquals(count, hash.size());
} else {
key = hash.add(ref);
assertTrue(strings.add(str));
assertEquals(uniqueCount, key);
assertEquals(hash.size(), count + 1);
uniqueCount++;
}
}
assertAllIn(strings, hash);
hash.clear();
assertEquals(0, hash.size());
hash.reinit();
}
}
@Test(expected = MaxBytesLengthExceededException.class) @Test(expected = MaxBytesLengthExceededException.class)
public void testLargeValue() { public void testLargeValue() {