mirror of https://github.com/apache/lucene.git
LUCENE-4838: Add BytesRefHash.find()
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1457400 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
efaef82383
commit
0f3af7cec9
|
@ -30,8 +30,8 @@ import org.apache.lucene.util.ByteBlockPool.DirectAllocator;
|
|||
/**
|
||||
* {@link BytesRefHash} is a special purpose hash-map like data-structure
|
||||
* optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of
|
||||
* byte arrays to ordinal (Map<BytesRef,int>) storing the hashed bytes
|
||||
* efficiently in continuous storage. The mapping to the ordinal is
|
||||
* byte arrays to ids (Map<BytesRef,int>) storing the hashed bytes
|
||||
* efficiently in continuous storage. The mapping to the id is
|
||||
* encapsulated inside {@link BytesRefHash} and is guaranteed to be increased
|
||||
* for each added {@link BytesRef}.
|
||||
*
|
||||
|
@ -58,7 +58,7 @@ public final class BytesRefHash {
|
|||
private int hashMask;
|
||||
private int count;
|
||||
private int lastCount = -1;
|
||||
private int[] ords;
|
||||
private int[] ids;
|
||||
private final BytesStartArray bytesStartArray;
|
||||
private Counter bytesUsed;
|
||||
|
||||
|
@ -86,8 +86,8 @@ public final class BytesRefHash {
|
|||
hashHalfSize = hashSize >> 1;
|
||||
hashMask = hashSize - 1;
|
||||
this.pool = pool;
|
||||
ords = new int[hashSize];
|
||||
Arrays.fill(ords, -1);
|
||||
ids = new int[hashSize];
|
||||
Arrays.fill(ids, -1);
|
||||
this.bytesStartArray = bytesStartArray;
|
||||
bytesStart = bytesStartArray.init();
|
||||
bytesUsed = bytesStartArray.bytesUsed() == null? Counter.newCounter() : bytesStartArray.bytesUsed();
|
||||
|
@ -104,26 +104,29 @@ public final class BytesRefHash {
|
|||
}
|
||||
|
||||
/**
|
||||
* Populates and returns a {@link BytesRef} with the bytes for the given ord.
|
||||
* Populates and returns a {@link BytesRef} with the bytes for the given
|
||||
* bytesID.
|
||||
* <p>
|
||||
* Note: the given ord must be a positive integer less that the current size (
|
||||
* {@link #size()})
|
||||
* </p>
|
||||
* Note: the given bytesID must be a non-negative integer less than the current
|
||||
* size ({@link #size()})
|
||||
*
|
||||
* @param ord the ord
|
||||
* @param ref the {@link BytesRef} to populate
|
||||
* @param bytesID
|
||||
* the id
|
||||
* @param ref
|
||||
* the {@link BytesRef} to populate
|
||||
*
|
||||
* @return the given BytesRef instance populated with the bytes for the given ord
|
||||
* @return the given BytesRef instance populated with the bytes for the given
|
||||
* bytesID
|
||||
*/
|
||||
public BytesRef get(int ord, BytesRef ref) {
|
||||
public BytesRef get(int bytesID, BytesRef ref) {
|
||||
assert bytesStart != null : "bytesStart is null - not initialized";
|
||||
assert ord < bytesStart.length: "ord exceeds byteStart len: " + bytesStart.length;
|
||||
pool.setBytesRef(ref, bytesStart[ord]);
|
||||
assert bytesID < bytesStart.length: "bytesID exceeds byteStart len: " + bytesStart.length;
|
||||
pool.setBytesRef(ref, bytesStart[bytesID]);
|
||||
return ref;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the ords array in arbitrary order. Valid ords start at offset of 0
|
||||
* Returns the ids array in arbitrary order. Valid ids start at offset of 0
|
||||
* and end at a limit of {@link #size()} - 1
|
||||
* <p>
|
||||
* Note: This is a destructive operation. {@link #clear()} must be called in
|
||||
|
@ -131,13 +134,13 @@ public final class BytesRefHash {
|
|||
* </p>
|
||||
*/
|
||||
int[] compact() {
|
||||
assert bytesStart != null : "Bytesstart is null - not initialized";
|
||||
assert bytesStart != null : "bytesStart is null - not initialized";
|
||||
int upto = 0;
|
||||
for (int i = 0; i < hashSize; i++) {
|
||||
if (ords[i] != -1) {
|
||||
if (ids[i] != -1) {
|
||||
if (upto < i) {
|
||||
ords[upto] = ords[i];
|
||||
ords[i] = -1;
|
||||
ids[upto] = ids[i];
|
||||
ids[i] = -1;
|
||||
}
|
||||
upto++;
|
||||
}
|
||||
|
@ -145,7 +148,7 @@ public final class BytesRefHash {
|
|||
|
||||
assert upto == count;
|
||||
lastCount = count;
|
||||
return ords;
|
||||
return ids;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -170,25 +173,25 @@ public final class BytesRefHash {
|
|||
|
||||
@Override
|
||||
protected int compare(int i, int j) {
|
||||
final int ord1 = compact[i], ord2 = compact[j];
|
||||
assert bytesStart.length > ord1 && bytesStart.length > ord2;
|
||||
pool.setBytesRef(scratch1, bytesStart[ord1]);
|
||||
pool.setBytesRef(scratch2, bytesStart[ord2]);
|
||||
final int id1 = compact[i], id2 = compact[j];
|
||||
assert bytesStart.length > id1 && bytesStart.length > id2;
|
||||
pool.setBytesRef(scratch1, bytesStart[id1]);
|
||||
pool.setBytesRef(scratch2, bytesStart[id2]);
|
||||
return comp.compare(scratch1, scratch2);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void setPivot(int i) {
|
||||
final int ord = compact[i];
|
||||
assert bytesStart.length > ord;
|
||||
pool.setBytesRef(pivot, bytesStart[ord]);
|
||||
final int id = compact[i];
|
||||
assert bytesStart.length > id;
|
||||
pool.setBytesRef(pivot, bytesStart[id]);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int comparePivot(int j) {
|
||||
final int ord = compact[j];
|
||||
assert bytesStart.length > ord;
|
||||
pool.setBytesRef(scratch2, bytesStart[ord]);
|
||||
final int id = compact[j];
|
||||
assert bytesStart.length > id;
|
||||
pool.setBytesRef(scratch2, bytesStart[id]);
|
||||
return comp.compare(pivot, scratch2);
|
||||
}
|
||||
|
||||
|
@ -198,8 +201,8 @@ public final class BytesRefHash {
|
|||
return compact;
|
||||
}
|
||||
|
||||
private boolean equals(int ord, BytesRef b) {
|
||||
pool.setBytesRef(scratch1, bytesStart[ord]);
|
||||
private boolean equals(int id, BytesRef b) {
|
||||
pool.setBytesRef(scratch1, bytesStart[id]);
|
||||
return scratch1.bytesEquals(b);
|
||||
}
|
||||
|
||||
|
@ -213,8 +216,8 @@ public final class BytesRefHash {
|
|||
if (newSize != hashSize) {
|
||||
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -(hashSize - newSize));
|
||||
hashSize = newSize;
|
||||
ords = new int[hashSize];
|
||||
Arrays.fill(ords, -1);
|
||||
ids = new int[hashSize];
|
||||
Arrays.fill(ids, -1);
|
||||
hashHalfSize = newSize / 2;
|
||||
hashMask = newSize - 1;
|
||||
return true;
|
||||
|
@ -237,7 +240,7 @@ public final class BytesRefHash {
|
|||
// shrink clears the hash entries
|
||||
return;
|
||||
}
|
||||
Arrays.fill(ords, -1);
|
||||
Arrays.fill(ids, -1);
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
|
@ -249,7 +252,7 @@ public final class BytesRefHash {
|
|||
*/
|
||||
public void close() {
|
||||
clear(true);
|
||||
ords = null;
|
||||
ids = null;
|
||||
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -hashSize);
|
||||
}
|
||||
|
||||
|
@ -258,8 +261,8 @@ public final class BytesRefHash {
|
|||
*
|
||||
* @param bytes
|
||||
* the bytes to hash
|
||||
* @return the ord the given bytes are hashed if there was no mapping for the
|
||||
* given bytes, otherwise <code>(-(ord)-1)</code>. This guarantees
|
||||
* @return the id assigned to the given bytes if there was no mapping for
|
||||
* them yet, otherwise <code>(-(id)-1)</code> where id is the id of the
* existing mapping. This guarantees
|
||||
* that the return value will always be >= 0 if the given bytes
|
||||
* haven't been hashed before.
|
||||
*
|
||||
|
@ -289,8 +292,8 @@ public final class BytesRefHash {
|
|||
* }
|
||||
* </pre>
|
||||
*
|
||||
* @return the ord the given bytes are hashed if there was no mapping for the
|
||||
* given bytes, otherwise <code>(-(ord)-1)</code>. This guarantees
|
||||
* @return the id assigned to the given bytes if there was no mapping for
|
||||
* them yet, otherwise <code>(-(id)-1)</code> where id is the id of the
* existing mapping. This guarantees
|
||||
* that the return value will always be >= 0 if the given bytes
|
||||
* haven't been hashed before.
|
||||
*
|
||||
|
@ -302,18 +305,8 @@ public final class BytesRefHash {
|
|||
assert bytesStart != null : "Bytesstart is null - not initialized";
|
||||
final int length = bytes.length;
|
||||
// final position
|
||||
int hashPos = code & hashMask;
|
||||
int e = ords[hashPos];
|
||||
if (e != -1 && !equals(e, bytes)) {
|
||||
// Conflict: keep searching different locations in
|
||||
// the hash table.
|
||||
final int inc = ((code >> 8) + code) | 1;
|
||||
do {
|
||||
code += inc;
|
||||
hashPos = code & hashMask;
|
||||
e = ords[hashPos];
|
||||
} while (e != -1 && !equals(e, bytes));
|
||||
}
|
||||
final int hashPos = findHash(bytes, code);
|
||||
int e = ids[hashPos];
|
||||
|
||||
if (e == -1) {
|
||||
// new entry
|
||||
|
@ -355,8 +348,8 @@ public final class BytesRefHash {
|
|||
System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2,
|
||||
length);
|
||||
}
|
||||
assert ords[hashPos] == -1;
|
||||
ords[hashPos] = e;
|
||||
assert ids[hashPos] == -1;
|
||||
ids[hashPos] = e;
|
||||
|
||||
if (count == hashHalfSize) {
|
||||
rehash(2 * hashSize, true);
|
||||
|
@ -366,12 +359,55 @@ public final class BytesRefHash {
|
|||
return -(e + 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the id of the given {@link BytesRef}, or {@code -1} if the bytes
* have no mapping in this hash. The lookup never inserts the bytes.
|
||||
*
|
||||
* <p>
* The hash code is taken from {@code bytes.hashCode()}; the lookup itself is
* delegated to {@link #find(BytesRef, int)}.
* </p>
*
* @param bytes
*          the bytes to look for
* @return the id of the given bytes, or {@code -1} if there is no mapping for
*         them
* @see #find(BytesRef, int)
|
||||
*/
|
||||
public int find(BytesRef bytes) {
|
||||
return find(bytes, bytes.hashCode());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the id of the given {@link BytesRef} with a pre-calculated hash code.
* Use this variant when the caller already has the hash code at hand;
* {@link #find(BytesRef)} passes {@code bytes.hashCode()} here.
|
||||
*
|
||||
* @param bytes
|
||||
* the bytes to look for
|
||||
* @param code
|
||||
* the bytes hash code; it selects the first slot that is probed, so it
* presumably has to be the same code the bytes were added with
|
||||
*
|
||||
* @return the id of the given bytes, or {@code -1} if there is no mapping for the
|
||||
* given bytes.
|
||||
*/
|
||||
public int find(BytesRef bytes, int code) {
|
||||
// findHash stops either on the slot that holds these bytes or on an empty
// slot; empty slots contain -1, which is exactly the "not found" result.
return ids[findHash(bytes, code)];
|
||||
}
|
||||
|
||||
/**
 * Locates the slot of the {@code ids} table that belongs to the given bytes.
 * <p>
 * Probing starts at {@code code & hashMask} and continues until it reaches
 * either a slot whose id refers to bytes equal to the given ones, or an empty
 * slot (value {@code -1}). Nothing is written to the table; the caller
 * decides what to do with the returned slot.
 * </p>
 * NOTE(review): the loop only terminates if an empty slot or a match is
 * reachable; presumably this is ensured by keeping the table at most half
 * full (rehash at {@code hashHalfSize}) - confirm.
 *
 * @param bytes
 *          the bytes to locate
 * @param code
 *          the hash code of the bytes
 * @return the index into {@code ids} of the matching slot, or of the empty
 *         slot where the probe sequence ended
 */
private final int findHash(BytesRef bytes, int code) {
|
||||
assert bytesStart != null : "bytesStart is null - not initialized";
|
||||
// first candidate slot: the hash code masked down to the table range
|
||||
int hashPos = code & hashMask;
|
||||
int e = ids[hashPos];
|
||||
if (e != -1 && !equals(e, bytes)) {
|
||||
// Conflict: the slot is taken by different bytes, so keep probing other
|
||||
// slots. The step is derived from the hash code and forced odd by "| 1".
|
||||
final int inc = ((code >> 8) + code) | 1;
|
||||
do {
|
||||
code += inc;
|
||||
hashPos = code & hashMask;
|
||||
e = ids[hashPos];
|
||||
} while (e != -1 && !equals(e, bytes));
|
||||
}
|
||||
|
||||
// either the slot of the matching entry or the empty slot that ended the probe
return hashPos;
|
||||
}
|
||||
|
||||
public int addByPoolOffset(int offset) {
|
||||
assert bytesStart != null : "Bytesstart is null - not initialized";
|
||||
// final position
|
||||
int code = offset;
|
||||
int hashPos = offset & hashMask;
|
||||
int e = ords[hashPos];
|
||||
int e = ids[hashPos];
|
||||
if (e != -1 && bytesStart[e] != offset) {
|
||||
// Conflict: keep searching different locations in
|
||||
// the hash table.
|
||||
|
@ -379,7 +415,7 @@ public final class BytesRefHash {
|
|||
do {
|
||||
code += inc;
|
||||
hashPos = code & hashMask;
|
||||
e = ords[hashPos];
|
||||
e = ids[hashPos];
|
||||
} while (e != -1 && bytesStart[e] != offset);
|
||||
}
|
||||
if (e == -1) {
|
||||
|
@ -391,8 +427,8 @@ public final class BytesRefHash {
|
|||
}
|
||||
e = count++;
|
||||
bytesStart[e] = offset;
|
||||
assert ords[hashPos] == -1;
|
||||
ords[hashPos] = e;
|
||||
assert ids[hashPos] == -1;
|
||||
ids[hashPos] = e;
|
||||
|
||||
if (count == hashHalfSize) {
|
||||
rehash(2 * hashSize, false);
|
||||
|
@ -412,7 +448,7 @@ public final class BytesRefHash {
|
|||
final int[] newHash = new int[newSize];
|
||||
Arrays.fill(newHash, -1);
|
||||
for (int i = 0; i < hashSize; i++) {
|
||||
final int e0 = ords[i];
|
||||
final int e0 = ids[i];
|
||||
if (e0 != -1) {
|
||||
int code;
|
||||
if (hashOnData) {
|
||||
|
@ -453,8 +489,8 @@ public final class BytesRefHash {
|
|||
}
|
||||
|
||||
hashMask = newMask;
|
||||
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ords.length));
|
||||
ords = newHash;
|
||||
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ids.length));
|
||||
ids = newHash;
|
||||
hashSize = newSize;
|
||||
hashHalfSize = newSize / 2;
|
||||
}
|
||||
|
@ -469,25 +505,25 @@ public final class BytesRefHash {
|
|||
bytesStart = bytesStartArray.init();
|
||||
}
|
||||
|
||||
if (ords == null) {
|
||||
ords = new int[hashSize];
|
||||
if (ids == null) {
|
||||
ids = new int[hashSize];
|
||||
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * hashSize);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the bytesStart offset into the internally used
|
||||
* {@link ByteBlockPool} for the given ord
|
||||
* {@link ByteBlockPool} for the given bytesID
|
||||
*
|
||||
* @param ord
|
||||
* the ord to look up
|
||||
* @param bytesID
|
||||
* the id to look up
|
||||
* @return the bytesStart offset into the internally used
|
||||
* {@link ByteBlockPool} for the given ord
|
||||
* {@link ByteBlockPool} for the given id
|
||||
*/
|
||||
public int byteStart(int ord) {
|
||||
assert bytesStart != null : "Bytesstart is null - not initialized";
|
||||
assert ord >= 0 && ord < count : ord;
|
||||
return bytesStart[ord];
|
||||
public int byteStart(int bytesID) {
|
||||
assert bytesStart != null : "bytesStart is null - not initialized";
|
||||
assert bytesID >= 0 && bytesID < count : bytesID;
|
||||
return bytesStart[bytesID];
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -30,16 +30,11 @@ import org.apache.lucene.util.BytesRefHash.MaxBytesLengthExceededException;
|
|||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public class TestBytesRefHash extends LuceneTestCase {
|
||||
|
||||
BytesRefHash hash;
|
||||
ByteBlockPool pool;
|
||||
|
||||
/**
|
||||
*/
|
||||
@Override
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
|
@ -249,6 +244,43 @@ public class TestBytesRefHash extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Checks {@code BytesRefHash.find(BytesRef)} against a {@code HashSet} of
 * the strings added so far: a non-negative result must refer to a string
 * that was added before and must leave the hash unchanged, while a negative
 * result must mean the string is new, in which case it is added and must
 * receive the next consecutive id.
 */
@Test
|
||||
public void testFind() throws Exception {
|
||||
BytesRef ref = new BytesRef();
|
||||
BytesRef scratch = new BytesRef();
|
||||
int num = atLeast(2);
|
||||
for (int j = 0; j < num; j++) {
|
||||
// reference model of the strings added to the hash in this round
Set<String> strings = new HashSet<String>();
|
||||
// number of distinct strings added so far == next id expected from add()
int uniqueCount = 0;
|
||||
for (int i = 0; i < 797; i++) {
|
||||
String str;
|
||||
// draw random strings until a non-empty one comes up
do {
|
||||
str = _TestUtil.randomRealisticUnicodeString(random(), 1000);
|
||||
} while (str.length() == 0);
|
||||
ref.copyChars(str);
|
||||
int count = hash.size();
|
||||
int key = hash.find(ref); // lookup only; the add happens below if absent
|
||||
if (key >= 0) { // string found in hash
|
||||
// the reference set must already contain it as well
assertFalse(strings.add(str));
|
||||
assertTrue(key < count);
|
||||
assertEquals(str, hash.get(key, scratch).utf8ToString());
|
||||
// find() must not have changed the number of entries
assertEquals(count, hash.size());
|
||||
} else {
|
||||
// not found: add it now and check it receives the next free id
key = hash.add(ref);
|
||||
assertTrue(strings.add(str));
|
||||
assertEquals(uniqueCount, key);
|
||||
assertEquals(hash.size(), count + 1);
|
||||
uniqueCount++;
|
||||
}
|
||||
}
|
||||
|
||||
assertAllIn(strings, hash);
|
||||
// reset the hash so the next round starts empty
hash.clear();
|
||||
assertEquals(0, hash.size());
|
||||
hash.reinit();
|
||||
}
|
||||
}
|
||||
|
||||
@Test(expected = MaxBytesLengthExceededException.class)
|
||||
public void testLargeValue() {
|
||||
int[] sizes = new int[] { random().nextInt(5),
|
||||
|
|
Loading…
Reference in New Issue