LUCENE-4838: Add BytesRefHash.find()

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1457400 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Shai Erera 2013-03-17 08:58:51 +00:00
parent efaef82383
commit 0f3af7cec9
2 changed files with 147 additions and 79 deletions

View File

@ -30,8 +30,8 @@ import org.apache.lucene.util.ByteBlockPool.DirectAllocator;
/**
* {@link BytesRefHash} is a special purpose hash-map like data-structure
* optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of
* byte arrays to ordinal (Map<BytesRef,int>) storing the hashed bytes
* efficiently in continuous storage. The mapping to the ordinal is
* byte arrays to ids (Map<BytesRef,int>) storing the hashed bytes
* efficiently in continuous storage. The mapping to the id is
* encapsulated inside {@link BytesRefHash} and is guaranteed to be increased
* for each added {@link BytesRef}.
*
@ -58,7 +58,7 @@ public final class BytesRefHash {
private int hashMask;
private int count;
private int lastCount = -1;
private int[] ords;
private int[] ids;
private final BytesStartArray bytesStartArray;
private Counter bytesUsed;
@ -86,8 +86,8 @@ public final class BytesRefHash {
hashHalfSize = hashSize >> 1;
hashMask = hashSize - 1;
this.pool = pool;
ords = new int[hashSize];
Arrays.fill(ords, -1);
ids = new int[hashSize];
Arrays.fill(ids, -1);
this.bytesStartArray = bytesStartArray;
bytesStart = bytesStartArray.init();
bytesUsed = bytesStartArray.bytesUsed() == null? Counter.newCounter() : bytesStartArray.bytesUsed();
@ -104,26 +104,29 @@ public final class BytesRefHash {
}
/**
* Populates and returns a {@link BytesRef} with the bytes for the given ord.
* Populates and returns a {@link BytesRef} with the bytes for the given
* bytesID.
* <p>
* Note: the given ord must be a positive integer less that the current size (
* {@link #size()})
* </p>
* Note: the given bytesID must be a positive integer less than the current
* size ({@link #size()})
*
* @param ord the ord
* @param ref the {@link BytesRef} to populate
* @param bytesID
* the id
* @param ref
* the {@link BytesRef} to populate
*
* @return the given BytesRef instance populated with the bytes for the given ord
* @return the given BytesRef instance populated with the bytes for the given
* bytesID
*/
public BytesRef get(int ord, BytesRef ref) {
public BytesRef get(int bytesID, BytesRef ref) {
assert bytesStart != null : "bytesStart is null - not initialized";
assert ord < bytesStart.length: "ord exceeds byteStart len: " + bytesStart.length;
pool.setBytesRef(ref, bytesStart[ord]);
assert bytesID < bytesStart.length: "bytesID exceeds byteStart len: " + bytesStart.length;
pool.setBytesRef(ref, bytesStart[bytesID]);
return ref;
}
/**
* Returns the ords array in arbitrary order. Valid ords start at offset of 0
* Returns the ids array in arbitrary order. Valid ids start at offset of 0
* and end at a limit of {@link #size()} - 1
* <p>
* Note: This is a destructive operation. {@link #clear()} must be called in
@ -131,13 +134,13 @@ public final class BytesRefHash {
* </p>
*/
int[] compact() {
assert bytesStart != null : "Bytesstart is null - not initialized";
assert bytesStart != null : "bytesStart is null - not initialized";
int upto = 0;
for (int i = 0; i < hashSize; i++) {
if (ords[i] != -1) {
if (ids[i] != -1) {
if (upto < i) {
ords[upto] = ords[i];
ords[i] = -1;
ids[upto] = ids[i];
ids[i] = -1;
}
upto++;
}
@ -145,7 +148,7 @@ public final class BytesRefHash {
assert upto == count;
lastCount = count;
return ords;
return ids;
}
/**
@ -170,25 +173,25 @@ public final class BytesRefHash {
@Override
protected int compare(int i, int j) {
final int ord1 = compact[i], ord2 = compact[j];
assert bytesStart.length > ord1 && bytesStart.length > ord2;
pool.setBytesRef(scratch1, bytesStart[ord1]);
pool.setBytesRef(scratch2, bytesStart[ord2]);
final int id1 = compact[i], id2 = compact[j];
assert bytesStart.length > id1 && bytesStart.length > id2;
pool.setBytesRef(scratch1, bytesStart[id1]);
pool.setBytesRef(scratch2, bytesStart[id2]);
return comp.compare(scratch1, scratch2);
}
@Override
protected void setPivot(int i) {
final int ord = compact[i];
assert bytesStart.length > ord;
pool.setBytesRef(pivot, bytesStart[ord]);
final int id = compact[i];
assert bytesStart.length > id;
pool.setBytesRef(pivot, bytesStart[id]);
}
@Override
protected int comparePivot(int j) {
final int ord = compact[j];
assert bytesStart.length > ord;
pool.setBytesRef(scratch2, bytesStart[ord]);
final int id = compact[j];
assert bytesStart.length > id;
pool.setBytesRef(scratch2, bytesStart[id]);
return comp.compare(pivot, scratch2);
}
@ -198,8 +201,8 @@ public final class BytesRefHash {
return compact;
}
private boolean equals(int ord, BytesRef b) {
pool.setBytesRef(scratch1, bytesStart[ord]);
private boolean equals(int id, BytesRef b) {
pool.setBytesRef(scratch1, bytesStart[id]);
return scratch1.bytesEquals(b);
}
@ -213,8 +216,8 @@ public final class BytesRefHash {
if (newSize != hashSize) {
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -(hashSize - newSize));
hashSize = newSize;
ords = new int[hashSize];
Arrays.fill(ords, -1);
ids = new int[hashSize];
Arrays.fill(ids, -1);
hashHalfSize = newSize / 2;
hashMask = newSize - 1;
return true;
@ -237,7 +240,7 @@ public final class BytesRefHash {
// shrink clears the hash entries
return;
}
Arrays.fill(ords, -1);
Arrays.fill(ids, -1);
}
public void clear() {
@ -249,7 +252,7 @@ public final class BytesRefHash {
*/
public void close() {
clear(true);
ords = null;
ids = null;
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -hashSize);
}
@ -258,8 +261,8 @@ public final class BytesRefHash {
*
* @param bytes
* the bytes to hash
* @return the ord the given bytes are hashed if there was no mapping for the
* given bytes, otherwise <code>(-(ord)-1)</code>. This guarantees
* @return the id the given bytes are hashed if there was no mapping for the
* given bytes, otherwise <code>(-(id)-1)</code>. This guarantees
* that the return value will always be &gt;= 0 if the given bytes
* haven't been hashed before.
*
@ -289,8 +292,8 @@ public final class BytesRefHash {
* }
* </pre>
*
* @return the ord the given bytes are hashed if there was no mapping for the
* given bytes, otherwise <code>(-(ord)-1)</code>. This guarantees
* @return the id the given bytes are hashed if there was no mapping for the
* given bytes, otherwise <code>(-(id)-1)</code>. This guarantees
* that the return value will always be &gt;= 0 if the given bytes
* haven't been hashed before.
*
@ -302,18 +305,8 @@ public final class BytesRefHash {
assert bytesStart != null : "Bytesstart is null - not initialized";
final int length = bytes.length;
// final position
int hashPos = code & hashMask;
int e = ords[hashPos];
if (e != -1 && !equals(e, bytes)) {
// Conflict: keep searching different locations in
// the hash table.
final int inc = ((code >> 8) + code) | 1;
do {
code += inc;
hashPos = code & hashMask;
e = ords[hashPos];
} while (e != -1 && !equals(e, bytes));
}
final int hashPos = findHash(bytes, code);
int e = ids[hashPos];
if (e == -1) {
// new entry
@ -355,8 +348,8 @@ public final class BytesRefHash {
System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2,
length);
}
assert ords[hashPos] == -1;
ords[hashPos] = e;
assert ids[hashPos] == -1;
ids[hashPos] = e;
if (count == hashHalfSize) {
rehash(2 * hashSize, true);
@ -366,12 +359,55 @@ public final class BytesRefHash {
return -(e + 1);
}
/**
* Returns the id of the given {@link BytesRef}.
*
* @see #find(BytesRef, int)
*/
public int find(BytesRef bytes) {
return find(bytes, bytes.hashCode());
}
/**
* Returns the id of the given {@link BytesRef} with a pre-calculated hash code.
*
* @param bytes
* the bytes to look for
* @param code
* the bytes hash code
*
* @return the id of the given bytes, or {@code -1} if there is no mapping for the
* given bytes.
*/
public int find(BytesRef bytes, int code) {
return ids[findHash(bytes, code)];
}
private final int findHash(BytesRef bytes, int code) {
assert bytesStart != null : "bytesStart is null - not initialized";
// final position
int hashPos = code & hashMask;
int e = ids[hashPos];
if (e != -1 && !equals(e, bytes)) {
// Conflict: keep searching different locations in
// the hash table.
final int inc = ((code >> 8) + code) | 1;
do {
code += inc;
hashPos = code & hashMask;
e = ids[hashPos];
} while (e != -1 && !equals(e, bytes));
}
return hashPos;
}
public int addByPoolOffset(int offset) {
assert bytesStart != null : "Bytesstart is null - not initialized";
// final position
int code = offset;
int hashPos = offset & hashMask;
int e = ords[hashPos];
int e = ids[hashPos];
if (e != -1 && bytesStart[e] != offset) {
// Conflict: keep searching different locations in
// the hash table.
@ -379,7 +415,7 @@ public final class BytesRefHash {
do {
code += inc;
hashPos = code & hashMask;
e = ords[hashPos];
e = ids[hashPos];
} while (e != -1 && bytesStart[e] != offset);
}
if (e == -1) {
@ -391,8 +427,8 @@ public final class BytesRefHash {
}
e = count++;
bytesStart[e] = offset;
assert ords[hashPos] == -1;
ords[hashPos] = e;
assert ids[hashPos] == -1;
ids[hashPos] = e;
if (count == hashHalfSize) {
rehash(2 * hashSize, false);
@ -412,7 +448,7 @@ public final class BytesRefHash {
final int[] newHash = new int[newSize];
Arrays.fill(newHash, -1);
for (int i = 0; i < hashSize; i++) {
final int e0 = ords[i];
final int e0 = ids[i];
if (e0 != -1) {
int code;
if (hashOnData) {
@ -453,8 +489,8 @@ public final class BytesRefHash {
}
hashMask = newMask;
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ords.length));
ords = newHash;
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ids.length));
ids = newHash;
hashSize = newSize;
hashHalfSize = newSize / 2;
}
@ -469,25 +505,25 @@ public final class BytesRefHash {
bytesStart = bytesStartArray.init();
}
if (ords == null) {
ords = new int[hashSize];
if (ids == null) {
ids = new int[hashSize];
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * hashSize);
}
}
/**
* Returns the bytesStart offset into the internally used
* {@link ByteBlockPool} for the given ord
* {@link ByteBlockPool} for the given bytesID
*
* @param ord
* the ord to look up
* @param bytesID
* the id to look up
* @return the bytesStart offset into the internally used
* {@link ByteBlockPool} for the given ord
* {@link ByteBlockPool} for the given id
*/
public int byteStart(int ord) {
assert bytesStart != null : "Bytesstart is null - not initialized";
assert ord >= 0 && ord < count : ord;
return bytesStart[ord];
public int byteStart(int bytesID) {
assert bytesStart != null : "bytesStart is null - not initialized";
assert bytesID >= 0 && bytesID < count : bytesID;
return bytesStart[bytesID];
}
/**

View File

@ -30,16 +30,11 @@ import org.apache.lucene.util.BytesRefHash.MaxBytesLengthExceededException;
import org.junit.Before;
import org.junit.Test;
/**
*
*/
public class TestBytesRefHash extends LuceneTestCase {
BytesRefHash hash;
ByteBlockPool pool;
/**
*/
@Override
@Before
public void setUp() throws Exception {
@ -249,6 +244,43 @@ public class TestBytesRefHash extends LuceneTestCase {
}
}
@Test
public void testFind() throws Exception {
BytesRef ref = new BytesRef();
BytesRef scratch = new BytesRef();
int num = atLeast(2);
for (int j = 0; j < num; j++) {
Set<String> strings = new HashSet<String>();
int uniqueCount = 0;
for (int i = 0; i < 797; i++) {
String str;
do {
str = _TestUtil.randomRealisticUnicodeString(random(), 1000);
} while (str.length() == 0);
ref.copyChars(str);
int count = hash.size();
int key = hash.find(ref); //hash.add(ref);
if (key >= 0) { // string found in hash
assertFalse(strings.add(str));
assertTrue(key < count);
assertEquals(str, hash.get(key, scratch).utf8ToString());
assertEquals(count, hash.size());
} else {
key = hash.add(ref);
assertTrue(strings.add(str));
assertEquals(uniqueCount, key);
assertEquals(hash.size(), count + 1);
uniqueCount++;
}
}
assertAllIn(strings, hash);
hash.clear();
assertEquals(0, hash.size());
hash.reinit();
}
}
@Test(expected = MaxBytesLengthExceededException.class)
public void testLargeValue() {
int[] sizes = new int[] { random().nextInt(5),