mirror of https://github.com/apache/lucene.git
LUCENE-4838: Add BytesRefHash.find()
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1457400 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
efaef82383
commit
0f3af7cec9
|
@ -30,8 +30,8 @@ import org.apache.lucene.util.ByteBlockPool.DirectAllocator;
|
||||||
/**
|
/**
|
||||||
* {@link BytesRefHash} is a special purpose hash-map like data-structure
|
* {@link BytesRefHash} is a special purpose hash-map like data-structure
|
||||||
* optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of
|
* optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of
|
||||||
* byte arrays to ordinal (Map<BytesRef,int>) storing the hashed bytes
|
* byte arrays to ids (Map<BytesRef,int>) storing the hashed bytes
|
||||||
* efficiently in continuous storage. The mapping to the ordinal is
|
* efficiently in continuous storage. The mapping to the id is
|
||||||
* encapsulated inside {@link BytesRefHash} and is guaranteed to be increased
|
* encapsulated inside {@link BytesRefHash} and is guaranteed to be increased
|
||||||
* for each added {@link BytesRef}.
|
* for each added {@link BytesRef}.
|
||||||
*
|
*
|
||||||
|
@ -58,7 +58,7 @@ public final class BytesRefHash {
|
||||||
private int hashMask;
|
private int hashMask;
|
||||||
private int count;
|
private int count;
|
||||||
private int lastCount = -1;
|
private int lastCount = -1;
|
||||||
private int[] ords;
|
private int[] ids;
|
||||||
private final BytesStartArray bytesStartArray;
|
private final BytesStartArray bytesStartArray;
|
||||||
private Counter bytesUsed;
|
private Counter bytesUsed;
|
||||||
|
|
||||||
|
@ -86,8 +86,8 @@ public final class BytesRefHash {
|
||||||
hashHalfSize = hashSize >> 1;
|
hashHalfSize = hashSize >> 1;
|
||||||
hashMask = hashSize - 1;
|
hashMask = hashSize - 1;
|
||||||
this.pool = pool;
|
this.pool = pool;
|
||||||
ords = new int[hashSize];
|
ids = new int[hashSize];
|
||||||
Arrays.fill(ords, -1);
|
Arrays.fill(ids, -1);
|
||||||
this.bytesStartArray = bytesStartArray;
|
this.bytesStartArray = bytesStartArray;
|
||||||
bytesStart = bytesStartArray.init();
|
bytesStart = bytesStartArray.init();
|
||||||
bytesUsed = bytesStartArray.bytesUsed() == null? Counter.newCounter() : bytesStartArray.bytesUsed();
|
bytesUsed = bytesStartArray.bytesUsed() == null? Counter.newCounter() : bytesStartArray.bytesUsed();
|
||||||
|
@ -104,26 +104,29 @@ public final class BytesRefHash {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Populates and returns a {@link BytesRef} with the bytes for the given ord.
|
* Populates and returns a {@link BytesRef} with the bytes for the given
|
||||||
|
* bytesID.
|
||||||
* <p>
|
* <p>
|
||||||
* Note: the given ord must be a positive integer less that the current size (
|
* Note: the given bytesID must be a non-negative integer less than the current
|
||||||
* {@link #size()})
|
* size ({@link #size()})
|
||||||
* </p>
|
|
||||||
*
|
|
||||||
* @param ord the ord
|
|
||||||
* @param ref the {@link BytesRef} to populate
|
|
||||||
*
|
*
|
||||||
* @return the given BytesRef instance populated with the bytes for the given ord
|
* @param bytesID
|
||||||
|
* the id
|
||||||
|
* @param ref
|
||||||
|
* the {@link BytesRef} to populate
|
||||||
|
*
|
||||||
|
* @return the given BytesRef instance populated with the bytes for the given
|
||||||
|
* bytesID
|
||||||
*/
|
*/
|
||||||
public BytesRef get(int ord, BytesRef ref) {
|
public BytesRef get(int bytesID, BytesRef ref) {
|
||||||
assert bytesStart != null : "bytesStart is null - not initialized";
|
assert bytesStart != null : "bytesStart is null - not initialized";
|
||||||
assert ord < bytesStart.length: "ord exceeds byteStart len: " + bytesStart.length;
|
assert bytesID < bytesStart.length: "bytesID exceeds byteStart len: " + bytesStart.length;
|
||||||
pool.setBytesRef(ref, bytesStart[ord]);
|
pool.setBytesRef(ref, bytesStart[bytesID]);
|
||||||
return ref;
|
return ref;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the ords array in arbitrary order. Valid ords start at offset of 0
|
* Returns the ids array in arbitrary order. Valid ids start at offset of 0
|
||||||
* and end at a limit of {@link #size()} - 1
|
* and end at a limit of {@link #size()} - 1
|
||||||
* <p>
|
* <p>
|
||||||
* Note: This is a destructive operation. {@link #clear()} must be called in
|
* Note: This is a destructive operation. {@link #clear()} must be called in
|
||||||
|
@ -131,13 +134,13 @@ public final class BytesRefHash {
|
||||||
* </p>
|
* </p>
|
||||||
*/
|
*/
|
||||||
int[] compact() {
|
int[] compact() {
|
||||||
assert bytesStart != null : "Bytesstart is null - not initialized";
|
assert bytesStart != null : "bytesStart is null - not initialized";
|
||||||
int upto = 0;
|
int upto = 0;
|
||||||
for (int i = 0; i < hashSize; i++) {
|
for (int i = 0; i < hashSize; i++) {
|
||||||
if (ords[i] != -1) {
|
if (ids[i] != -1) {
|
||||||
if (upto < i) {
|
if (upto < i) {
|
||||||
ords[upto] = ords[i];
|
ids[upto] = ids[i];
|
||||||
ords[i] = -1;
|
ids[i] = -1;
|
||||||
}
|
}
|
||||||
upto++;
|
upto++;
|
||||||
}
|
}
|
||||||
|
@ -145,7 +148,7 @@ public final class BytesRefHash {
|
||||||
|
|
||||||
assert upto == count;
|
assert upto == count;
|
||||||
lastCount = count;
|
lastCount = count;
|
||||||
return ords;
|
return ids;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -170,25 +173,25 @@ public final class BytesRefHash {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected int compare(int i, int j) {
|
protected int compare(int i, int j) {
|
||||||
final int ord1 = compact[i], ord2 = compact[j];
|
final int id1 = compact[i], id2 = compact[j];
|
||||||
assert bytesStart.length > ord1 && bytesStart.length > ord2;
|
assert bytesStart.length > id1 && bytesStart.length > id2;
|
||||||
pool.setBytesRef(scratch1, bytesStart[ord1]);
|
pool.setBytesRef(scratch1, bytesStart[id1]);
|
||||||
pool.setBytesRef(scratch2, bytesStart[ord2]);
|
pool.setBytesRef(scratch2, bytesStart[id2]);
|
||||||
return comp.compare(scratch1, scratch2);
|
return comp.compare(scratch1, scratch2);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void setPivot(int i) {
|
protected void setPivot(int i) {
|
||||||
final int ord = compact[i];
|
final int id = compact[i];
|
||||||
assert bytesStart.length > ord;
|
assert bytesStart.length > id;
|
||||||
pool.setBytesRef(pivot, bytesStart[ord]);
|
pool.setBytesRef(pivot, bytesStart[id]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected int comparePivot(int j) {
|
protected int comparePivot(int j) {
|
||||||
final int ord = compact[j];
|
final int id = compact[j];
|
||||||
assert bytesStart.length > ord;
|
assert bytesStart.length > id;
|
||||||
pool.setBytesRef(scratch2, bytesStart[ord]);
|
pool.setBytesRef(scratch2, bytesStart[id]);
|
||||||
return comp.compare(pivot, scratch2);
|
return comp.compare(pivot, scratch2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -198,8 +201,8 @@ public final class BytesRefHash {
|
||||||
return compact;
|
return compact;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean equals(int ord, BytesRef b) {
|
private boolean equals(int id, BytesRef b) {
|
||||||
pool.setBytesRef(scratch1, bytesStart[ord]);
|
pool.setBytesRef(scratch1, bytesStart[id]);
|
||||||
return scratch1.bytesEquals(b);
|
return scratch1.bytesEquals(b);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -213,8 +216,8 @@ public final class BytesRefHash {
|
||||||
if (newSize != hashSize) {
|
if (newSize != hashSize) {
|
||||||
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -(hashSize - newSize));
|
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -(hashSize - newSize));
|
||||||
hashSize = newSize;
|
hashSize = newSize;
|
||||||
ords = new int[hashSize];
|
ids = new int[hashSize];
|
||||||
Arrays.fill(ords, -1);
|
Arrays.fill(ids, -1);
|
||||||
hashHalfSize = newSize / 2;
|
hashHalfSize = newSize / 2;
|
||||||
hashMask = newSize - 1;
|
hashMask = newSize - 1;
|
||||||
return true;
|
return true;
|
||||||
|
@ -237,7 +240,7 @@ public final class BytesRefHash {
|
||||||
// shrink clears the hash entries
|
// shrink clears the hash entries
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
Arrays.fill(ords, -1);
|
Arrays.fill(ids, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void clear() {
|
public void clear() {
|
||||||
|
@ -249,7 +252,7 @@ public final class BytesRefHash {
|
||||||
*/
|
*/
|
||||||
public void close() {
|
public void close() {
|
||||||
clear(true);
|
clear(true);
|
||||||
ords = null;
|
ids = null;
|
||||||
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -hashSize);
|
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -hashSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -258,8 +261,8 @@ public final class BytesRefHash {
|
||||||
*
|
*
|
||||||
* @param bytes
|
* @param bytes
|
||||||
* the bytes to hash
|
* the bytes to hash
|
||||||
* @return the ord the given bytes are hashed if there was no mapping for the
|
* @return the id the given bytes are hashed to if there was no mapping for the
|
||||||
* given bytes, otherwise <code>(-(ord)-1)</code>. This guarantees
|
* given bytes, otherwise <code>(-(id)-1)</code>. This guarantees
|
||||||
* that the return value will always be >= 0 if the given bytes
|
* that the return value will always be >= 0 if the given bytes
|
||||||
* haven't been hashed before.
|
* haven't been hashed before.
|
||||||
*
|
*
|
||||||
|
@ -289,8 +292,8 @@ public final class BytesRefHash {
|
||||||
* }
|
* }
|
||||||
* </pre>
|
* </pre>
|
||||||
*
|
*
|
||||||
* @return the ord the given bytes are hashed if there was no mapping for the
|
* @return the id the given bytes are hashed to if there was no mapping for the
|
||||||
* given bytes, otherwise <code>(-(ord)-1)</code>. This guarantees
|
* given bytes, otherwise <code>(-(id)-1)</code>. This guarantees
|
||||||
* that the return value will always be >= 0 if the given bytes
|
* that the return value will always be >= 0 if the given bytes
|
||||||
* haven't been hashed before.
|
* haven't been hashed before.
|
||||||
*
|
*
|
||||||
|
@ -302,19 +305,9 @@ public final class BytesRefHash {
|
||||||
assert bytesStart != null : "Bytesstart is null - not initialized";
|
assert bytesStart != null : "Bytesstart is null - not initialized";
|
||||||
final int length = bytes.length;
|
final int length = bytes.length;
|
||||||
// final position
|
// final position
|
||||||
int hashPos = code & hashMask;
|
final int hashPos = findHash(bytes, code);
|
||||||
int e = ords[hashPos];
|
int e = ids[hashPos];
|
||||||
if (e != -1 && !equals(e, bytes)) {
|
|
||||||
// Conflict: keep searching different locations in
|
|
||||||
// the hash table.
|
|
||||||
final int inc = ((code >> 8) + code) | 1;
|
|
||||||
do {
|
|
||||||
code += inc;
|
|
||||||
hashPos = code & hashMask;
|
|
||||||
e = ords[hashPos];
|
|
||||||
} while (e != -1 && !equals(e, bytes));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (e == -1) {
|
if (e == -1) {
|
||||||
// new entry
|
// new entry
|
||||||
final int len2 = 2 + bytes.length;
|
final int len2 = 2 + bytes.length;
|
||||||
|
@ -355,8 +348,8 @@ public final class BytesRefHash {
|
||||||
System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2,
|
System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2,
|
||||||
length);
|
length);
|
||||||
}
|
}
|
||||||
assert ords[hashPos] == -1;
|
assert ids[hashPos] == -1;
|
||||||
ords[hashPos] = e;
|
ids[hashPos] = e;
|
||||||
|
|
||||||
if (count == hashHalfSize) {
|
if (count == hashHalfSize) {
|
||||||
rehash(2 * hashSize, true);
|
rehash(2 * hashSize, true);
|
||||||
|
@ -365,13 +358,56 @@ public final class BytesRefHash {
|
||||||
}
|
}
|
||||||
return -(e + 1);
|
return -(e + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the id of the given {@link BytesRef}.
|
||||||
|
*
|
||||||
|
* @see #find(BytesRef, int)
|
||||||
|
*/
|
||||||
|
public int find(BytesRef bytes) {
|
||||||
|
return find(bytes, bytes.hashCode());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the id of the given {@link BytesRef} with a pre-calculated hash code.
|
||||||
|
*
|
||||||
|
* @param bytes
|
||||||
|
* the bytes to look for
|
||||||
|
* @param code
|
||||||
|
* the bytes hash code
|
||||||
|
*
|
||||||
|
* @return the id of the given bytes, or {@code -1} if there is no mapping for the
|
||||||
|
* given bytes.
|
||||||
|
*/
|
||||||
|
public int find(BytesRef bytes, int code) {
|
||||||
|
return ids[findHash(bytes, code)];
|
||||||
|
}
|
||||||
|
|
||||||
|
private final int findHash(BytesRef bytes, int code) {
|
||||||
|
assert bytesStart != null : "bytesStart is null - not initialized";
|
||||||
|
// final position
|
||||||
|
int hashPos = code & hashMask;
|
||||||
|
int e = ids[hashPos];
|
||||||
|
if (e != -1 && !equals(e, bytes)) {
|
||||||
|
// Conflict: keep searching different locations in
|
||||||
|
// the hash table.
|
||||||
|
final int inc = ((code >> 8) + code) | 1;
|
||||||
|
do {
|
||||||
|
code += inc;
|
||||||
|
hashPos = code & hashMask;
|
||||||
|
e = ids[hashPos];
|
||||||
|
} while (e != -1 && !equals(e, bytes));
|
||||||
|
}
|
||||||
|
|
||||||
|
return hashPos;
|
||||||
|
}
|
||||||
|
|
||||||
public int addByPoolOffset(int offset) {
|
public int addByPoolOffset(int offset) {
|
||||||
assert bytesStart != null : "Bytesstart is null - not initialized";
|
assert bytesStart != null : "Bytesstart is null - not initialized";
|
||||||
// final position
|
// final position
|
||||||
int code = offset;
|
int code = offset;
|
||||||
int hashPos = offset & hashMask;
|
int hashPos = offset & hashMask;
|
||||||
int e = ords[hashPos];
|
int e = ids[hashPos];
|
||||||
if (e != -1 && bytesStart[e] != offset) {
|
if (e != -1 && bytesStart[e] != offset) {
|
||||||
// Conflict: keep searching different locations in
|
// Conflict: keep searching different locations in
|
||||||
// the hash table.
|
// the hash table.
|
||||||
|
@ -379,7 +415,7 @@ public final class BytesRefHash {
|
||||||
do {
|
do {
|
||||||
code += inc;
|
code += inc;
|
||||||
hashPos = code & hashMask;
|
hashPos = code & hashMask;
|
||||||
e = ords[hashPos];
|
e = ids[hashPos];
|
||||||
} while (e != -1 && bytesStart[e] != offset);
|
} while (e != -1 && bytesStart[e] != offset);
|
||||||
}
|
}
|
||||||
if (e == -1) {
|
if (e == -1) {
|
||||||
|
@ -391,8 +427,8 @@ public final class BytesRefHash {
|
||||||
}
|
}
|
||||||
e = count++;
|
e = count++;
|
||||||
bytesStart[e] = offset;
|
bytesStart[e] = offset;
|
||||||
assert ords[hashPos] == -1;
|
assert ids[hashPos] == -1;
|
||||||
ords[hashPos] = e;
|
ids[hashPos] = e;
|
||||||
|
|
||||||
if (count == hashHalfSize) {
|
if (count == hashHalfSize) {
|
||||||
rehash(2 * hashSize, false);
|
rehash(2 * hashSize, false);
|
||||||
|
@ -412,7 +448,7 @@ public final class BytesRefHash {
|
||||||
final int[] newHash = new int[newSize];
|
final int[] newHash = new int[newSize];
|
||||||
Arrays.fill(newHash, -1);
|
Arrays.fill(newHash, -1);
|
||||||
for (int i = 0; i < hashSize; i++) {
|
for (int i = 0; i < hashSize; i++) {
|
||||||
final int e0 = ords[i];
|
final int e0 = ids[i];
|
||||||
if (e0 != -1) {
|
if (e0 != -1) {
|
||||||
int code;
|
int code;
|
||||||
if (hashOnData) {
|
if (hashOnData) {
|
||||||
|
@ -453,8 +489,8 @@ public final class BytesRefHash {
|
||||||
}
|
}
|
||||||
|
|
||||||
hashMask = newMask;
|
hashMask = newMask;
|
||||||
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ords.length));
|
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ids.length));
|
||||||
ords = newHash;
|
ids = newHash;
|
||||||
hashSize = newSize;
|
hashSize = newSize;
|
||||||
hashHalfSize = newSize / 2;
|
hashHalfSize = newSize / 2;
|
||||||
}
|
}
|
||||||
|
@ -469,25 +505,25 @@ public final class BytesRefHash {
|
||||||
bytesStart = bytesStartArray.init();
|
bytesStart = bytesStartArray.init();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ords == null) {
|
if (ids == null) {
|
||||||
ords = new int[hashSize];
|
ids = new int[hashSize];
|
||||||
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * hashSize);
|
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * hashSize);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the bytesStart offset into the internally used
|
* Returns the bytesStart offset into the internally used
|
||||||
* {@link ByteBlockPool} for the given ord
|
* {@link ByteBlockPool} for the given bytesID
|
||||||
*
|
*
|
||||||
* @param ord
|
* @param bytesID
|
||||||
* the ord to look up
|
* the id to look up
|
||||||
* @return the bytesStart offset into the internally used
|
* @return the bytesStart offset into the internally used
|
||||||
* {@link ByteBlockPool} for the given ord
|
* {@link ByteBlockPool} for the given id
|
||||||
*/
|
*/
|
||||||
public int byteStart(int ord) {
|
public int byteStart(int bytesID) {
|
||||||
assert bytesStart != null : "Bytesstart is null - not initialized";
|
assert bytesStart != null : "bytesStart is null - not initialized";
|
||||||
assert ord >= 0 && ord < count : ord;
|
assert bytesID >= 0 && bytesID < count : bytesID;
|
||||||
return bytesStart[ord];
|
return bytesStart[bytesID];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -30,16 +30,11 @@ import org.apache.lucene.util.BytesRefHash.MaxBytesLengthExceededException;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
public class TestBytesRefHash extends LuceneTestCase {
|
public class TestBytesRefHash extends LuceneTestCase {
|
||||||
|
|
||||||
BytesRefHash hash;
|
BytesRefHash hash;
|
||||||
ByteBlockPool pool;
|
ByteBlockPool pool;
|
||||||
|
|
||||||
/**
|
|
||||||
*/
|
|
||||||
@Override
|
@Override
|
||||||
@Before
|
@Before
|
||||||
public void setUp() throws Exception {
|
public void setUp() throws Exception {
|
||||||
|
@ -248,6 +243,43 @@ public class TestBytesRefHash extends LuceneTestCase {
|
||||||
hash.reinit();
|
hash.reinit();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFind() throws Exception {
|
||||||
|
BytesRef ref = new BytesRef();
|
||||||
|
BytesRef scratch = new BytesRef();
|
||||||
|
int num = atLeast(2);
|
||||||
|
for (int j = 0; j < num; j++) {
|
||||||
|
Set<String> strings = new HashSet<String>();
|
||||||
|
int uniqueCount = 0;
|
||||||
|
for (int i = 0; i < 797; i++) {
|
||||||
|
String str;
|
||||||
|
do {
|
||||||
|
str = _TestUtil.randomRealisticUnicodeString(random(), 1000);
|
||||||
|
} while (str.length() == 0);
|
||||||
|
ref.copyChars(str);
|
||||||
|
int count = hash.size();
|
||||||
|
int key = hash.find(ref); //hash.add(ref);
|
||||||
|
if (key >= 0) { // string found in hash
|
||||||
|
assertFalse(strings.add(str));
|
||||||
|
assertTrue(key < count);
|
||||||
|
assertEquals(str, hash.get(key, scratch).utf8ToString());
|
||||||
|
assertEquals(count, hash.size());
|
||||||
|
} else {
|
||||||
|
key = hash.add(ref);
|
||||||
|
assertTrue(strings.add(str));
|
||||||
|
assertEquals(uniqueCount, key);
|
||||||
|
assertEquals(hash.size(), count + 1);
|
||||||
|
uniqueCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
assertAllIn(strings, hash);
|
||||||
|
hash.clear();
|
||||||
|
assertEquals(0, hash.size());
|
||||||
|
hash.reinit();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Test(expected = MaxBytesLengthExceededException.class)
|
@Test(expected = MaxBytesLengthExceededException.class)
|
||||||
public void testLargeValue() {
|
public void testLargeValue() {
|
||||||
|
|
Loading…
Reference in New Issue