mirror of https://github.com/apache/lucene.git
LUCENE-2438: BytesRef improvements, bytes!=null, kill many cases of oversizing, implement comparable as index order, fix some javadocs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940511 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1d14f8f18f
commit
1c79ee4c70
|
@ -126,10 +126,6 @@ public class CharTermAttributeImpl extends AttributeImpl implements CharTermAttr
|
|||
|
||||
// *** TermToBytesRefAttribute interface ***
|
||||
public final int toBytesRef(BytesRef target) {
|
||||
// TODO: Maybe require that bytes is already initialized? TermsHashPerField ensures this.
|
||||
if (target.bytes == null) {
|
||||
target.bytes = new byte[termLength * 4];
|
||||
}
|
||||
return UnicodeUtil.UTF16toUTF8WithHash(termBuffer, 0, termLength, target);
|
||||
}
|
||||
|
||||
|
|
|
@ -32,6 +32,8 @@ import org.apache.lucene.util.BytesRef;
|
|||
public interface TermToBytesRefAttribute extends Attribute {
|
||||
/** Copies the token's term text into the given {@link BytesRef}.
|
||||
* @param termBytes destination to write the bytes to (UTF-8 for text terms).
|
||||
* The length of the BytesRef's buffer may be not large enough, so you need to grow.
|
||||
* The parameters' {@code bytes} is guaranteed to be not {@code null}.
|
||||
* @return the hashcode as defined by {@link BytesRef#hashCode}:
|
||||
* <pre>
|
||||
* int hash = 0;
|
||||
|
|
|
@ -24,22 +24,33 @@ import java.io.UnsupportedEncodingException;
|
|||
* existing byte[].
|
||||
*
|
||||
* @lucene.experimental */
|
||||
public final class BytesRef {
|
||||
public final class BytesRef implements Comparable<BytesRef> {
|
||||
public static final byte[] EMPTY_BYTES = new byte[0];
|
||||
|
||||
/** The contents of the BytesRef. Should never be {@code null}. */
|
||||
public byte[] bytes;
|
||||
|
||||
/** Offset of first valid byte. */
|
||||
public int offset;
|
||||
|
||||
/** Length of used bytes. */
|
||||
public int length;
|
||||
|
||||
public BytesRef() {
|
||||
bytes = EMPTY_BYTES;
|
||||
}
|
||||
|
||||
/** bytes[] should not be null */
|
||||
public BytesRef(byte[] bytes, int offset, int length) {
|
||||
assert bytes != null;
|
||||
this.bytes = bytes;
|
||||
this.offset = offset;
|
||||
this.length = length;
|
||||
}
|
||||
|
||||
/** bytes[] should not be null */
|
||||
public BytesRef(byte[] bytes) {
|
||||
assert bytes != null;
|
||||
this.bytes = bytes;
|
||||
this.offset = 0;
|
||||
this.length = bytes.length;
|
||||
|
@ -55,10 +66,12 @@ public final class BytesRef {
|
|||
* unicode text, with no unpaired surrogates or U+FFFF.
|
||||
*/
|
||||
public BytesRef(CharSequence text) {
|
||||
this();
|
||||
copy(text);
|
||||
}
|
||||
|
||||
public BytesRef(BytesRef other) {
|
||||
this();
|
||||
copy(other);
|
||||
}
|
||||
|
||||
|
@ -69,13 +82,6 @@ public final class BytesRef {
|
|||
* unpaired surrogates or invalid UTF16 code units.
|
||||
*/
|
||||
public void copy(CharSequence text) {
|
||||
// TODO: new byte[10] is waste of resources; it should
|
||||
// simply allocate text.length()*4 like UnicodeUtil.
|
||||
// Ideally, I would remove this here and add a
|
||||
// null-check in UnicodeUtil. (Uwe)
|
||||
if (bytes == null) {
|
||||
bytes = new byte[10];
|
||||
}
|
||||
UnicodeUtil.UTF16toUTF8(text, 0, text.length(), this);
|
||||
}
|
||||
|
||||
|
@ -178,10 +184,8 @@ public final class BytesRef {
|
|||
}
|
||||
|
||||
public void copy(BytesRef other) {
|
||||
if (bytes == null) {
|
||||
if (bytes.length < other.length) {
|
||||
bytes = new byte[other.length];
|
||||
} else {
|
||||
bytes = ArrayUtil.grow(bytes, other.length);
|
||||
}
|
||||
System.arraycopy(other.bytes, other.offset, bytes, 0, other.length);
|
||||
length = other.length;
|
||||
|
@ -198,6 +202,68 @@ public final class BytesRef {
|
|||
return utf8SortedAsUTF16SortOrder;
|
||||
}
|
||||
|
||||
/** Unsigned byte order comparison */
|
||||
/*
|
||||
public int compareTo(BytesRef other) {
|
||||
if (this == other) return 0;
|
||||
|
||||
final byte[] aBytes = this.bytes;
|
||||
int aUpto = this.offset;
|
||||
final byte[] bBytes = other.bytes;
|
||||
int bUpto = other.offset;
|
||||
|
||||
final int aStop = aUpto + Math.min(this.length, other.length);
|
||||
|
||||
while(aUpto < aStop) {
|
||||
int aByte = aBytes[aUpto++] & 0xff;
|
||||
int bByte = bBytes[bUpto++] & 0xff;
|
||||
int diff = aByte - bByte;
|
||||
if (diff != 0) return diff;
|
||||
}
|
||||
|
||||
// One is a prefix of the other, or, they are equal:
|
||||
return this.length - other.length;
|
||||
}
|
||||
*/
|
||||
|
||||
/** Lucene default index order. Currently the same as String.compareTo() (UTF16) but will change
|
||||
* in the future to unsigned byte comparison. */
|
||||
public int compareTo(BytesRef other) {
|
||||
if (this == other) return 0;
|
||||
|
||||
final byte[] aBytes = this.bytes;
|
||||
int aUpto = this.offset;
|
||||
final byte[] bBytes = other.bytes;
|
||||
int bUpto = other.offset;
|
||||
|
||||
final int aStop = aUpto + Math.min(this.length, other.length);
|
||||
|
||||
while(aUpto < aStop) {
|
||||
int aByte = aBytes[aUpto++] & 0xff;
|
||||
int bByte = bBytes[bUpto++] & 0xff;
|
||||
if (aByte != bByte) {
|
||||
|
||||
// See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
|
||||
|
||||
// We know the terms are not equal, but, we may
|
||||
// have to carefully fixup the bytes at the
|
||||
// difference to match UTF16's sort order:
|
||||
if (aByte >= 0xee && bByte >= 0xee) {
|
||||
if ((aByte & 0xfe) == 0xee) {
|
||||
aByte += 0x10;
|
||||
}
|
||||
if ((bByte&0xfe) == 0xee) {
|
||||
bByte += 0x10;
|
||||
}
|
||||
}
|
||||
return aByte - bByte;
|
||||
}
|
||||
}
|
||||
|
||||
// One is a prefix of the other, or, they are equal:
|
||||
return this.length - other.length;
|
||||
}
|
||||
|
||||
private static class UTF8SortedAsUTF16Comparator implements Comparator<BytesRef> {
|
||||
// Only singleton
|
||||
private UTF8SortedAsUTF16Comparator() {};
|
||||
|
|
|
@ -107,13 +107,11 @@ public final class NumericUtils {
|
|||
public static int longToPrefixCoded(final long val, final int shift, final BytesRef bytes) {
|
||||
if (shift>63 || shift<0)
|
||||
throw new IllegalArgumentException("Illegal shift value, must be 0..63");
|
||||
if (bytes.bytes == null) {
|
||||
bytes.bytes = new byte[NumericUtils.BUF_SIZE_LONG];
|
||||
} else if (bytes.bytes.length < NumericUtils.BUF_SIZE_LONG) {
|
||||
bytes.grow(NumericUtils.BUF_SIZE_LONG);
|
||||
}
|
||||
int hash, nChars = (63-shift)/7 + 1;
|
||||
bytes.length = nChars+1;
|
||||
if (bytes.bytes.length < bytes.length) {
|
||||
bytes.grow(NumericUtils.BUF_SIZE_LONG);
|
||||
}
|
||||
bytes.bytes[0] = (byte) (hash = (SHIFT_START_LONG + shift));
|
||||
long sortableBits = val ^ 0x8000000000000000L;
|
||||
sortableBits >>>= shift;
|
||||
|
@ -167,13 +165,11 @@ public final class NumericUtils {
|
|||
public static int intToPrefixCoded(final int val, final int shift, final BytesRef bytes) {
|
||||
if (shift>31 || shift<0)
|
||||
throw new IllegalArgumentException("Illegal shift value, must be 0..31");
|
||||
if (bytes.bytes == null) {
|
||||
bytes.bytes = new byte[NumericUtils.BUF_SIZE_INT];
|
||||
} else if (bytes.bytes.length < NumericUtils.BUF_SIZE_INT) {
|
||||
bytes.grow(NumericUtils.BUF_SIZE_INT);
|
||||
}
|
||||
int hash, nChars = (31-shift)/7 + 1;
|
||||
bytes.length = nChars+1;
|
||||
if (bytes.bytes.length < bytes.length) {
|
||||
bytes.grow(NumericUtils.BUF_SIZE_INT);
|
||||
}
|
||||
bytes.bytes[0] = (byte) (hash = (SHIFT_START_INT + shift));
|
||||
int sortableBits = val ^ 0x80000000;
|
||||
sortableBits >>>= shift;
|
||||
|
|
|
@ -154,8 +154,8 @@ final public class UnicodeUtil {
|
|||
}
|
||||
|
||||
/** Encode characters from a char[] source, starting at
|
||||
* offset for length chars. Returns the number of bytes
|
||||
* written to bytesOut. */
|
||||
* offset for length chars.
|
||||
*/
|
||||
public static void UTF16toUTF8(final char[] source, final int offset, final int length, BytesRef result) {
|
||||
|
||||
int upto = 0;
|
||||
|
@ -165,7 +165,7 @@ final public class UnicodeUtil {
|
|||
// Pre-allocate for worst case 4-for-1
|
||||
final int maxLen = length * 4;
|
||||
if (out.length < maxLen)
|
||||
out = result.bytes = new byte[ArrayUtil.oversize(maxLen, 1)];
|
||||
out = result.bytes = new byte[maxLen];
|
||||
result.offset = 0;
|
||||
|
||||
while(i < end) {
|
||||
|
@ -209,8 +209,8 @@ final public class UnicodeUtil {
|
|||
}
|
||||
|
||||
/** Encode characters from this String, starting at offset
|
||||
* for length characters. Returns the number of bytes
|
||||
* written to bytesOut. */
|
||||
* for length characters.
|
||||
*/
|
||||
public static void UTF16toUTF8(final CharSequence s, final int offset, final int length, BytesRef result) {
|
||||
final int end = offset + length;
|
||||
|
||||
|
@ -219,7 +219,7 @@ final public class UnicodeUtil {
|
|||
// Pre-allocate for worst case 4-for-1
|
||||
final int maxLen = length * 4;
|
||||
if (out.length < maxLen)
|
||||
out = result.bytes = new byte[ArrayUtil.oversize(maxLen, 1)];
|
||||
out = result.bytes = new byte[maxLen];
|
||||
|
||||
int upto = 0;
|
||||
for(int i=offset;i<end;i++) {
|
||||
|
|
Loading…
Reference in New Issue