LUCENE-2438: BytesRef improvements, bytes!=null, kill many cases of oversizing, implement comparable as index order, fix some javadocs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940511 13f79535-47bb-0310-9956-ffa450edef68
2010-05-03 16:56:47 +00:00 · 2010-05-03 16:56:47 +00:00 · 1c79ee4c70
parent 1d14f8f18f
commit 1c79ee4c70
5 changed files with 91 additions and 31 deletions
--- a/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java
+++ b/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java
@ -126,10 +126,6 @@ public class CharTermAttributeImpl extends AttributeImpl implements CharTermAttr
  
  // *** TermToBytesRefAttribute interface ***
  public final int toBytesRef(BytesRef target) {
-    // TODO: Maybe require that bytes is already initialized? TermsHashPerField ensures this.
-    if (target.bytes == null) {
-      target.bytes = new byte[termLength * 4];
-    }
    return UnicodeUtil.UTF16toUTF8WithHash(termBuffer, 0, termLength, target);
  }
  
--- a/lucene/src/java/org/apache/lucene/analysis/tokenattributes/TermToBytesRefAttribute.java
+++ b/lucene/src/java/org/apache/lucene/analysis/tokenattributes/TermToBytesRefAttribute.java
@ -32,6 +32,8 @@ import org.apache.lucene.util.BytesRef;
 public interface TermToBytesRefAttribute extends Attribute {
  /** Copies the token's term text into the given {@link BytesRef}.
   * @param termBytes destination to write the bytes to (UTF-8 for text terms).
+   * The BytesRef's buffer may not be large enough, so you may need to grow it.
+   * The parameter's {@code bytes} is guaranteed to be non-{@code null}.
   * @return the hashcode as defined by {@link BytesRef#hashCode}:
   * <pre>
   *  int hash = 0;
--- a/lucene/src/java/org/apache/lucene/util/BytesRef.java
+++ b/lucene/src/java/org/apache/lucene/util/BytesRef.java
@ -24,22 +24,33 @@ import java.io.UnsupportedEncodingException;
 *  existing byte[].
 *
 *  @lucene.experimental */
-public final class BytesRef {
+public final class BytesRef implements Comparable<BytesRef> {
+  public static final byte[] EMPTY_BYTES = new byte[0]; 

+  /** The contents of the BytesRef. Should never be {@code null}. */
  public byte[] bytes;
+
+  /** Offset of first valid byte. */
  public int offset;
+
+  /** Length of used bytes. */
  public int length;

  public BytesRef() {
+    bytes = EMPTY_BYTES;
  }

+  /** bytes[] should not be null */
  public BytesRef(byte[] bytes, int offset, int length) {
+    assert bytes != null;
    this.bytes = bytes;
    this.offset = offset;
    this.length = length;
  }

+  /** bytes[] should not be null */
  public BytesRef(byte[] bytes) {
+    assert bytes != null;
    this.bytes = bytes;
    this.offset = 0;
    this.length = bytes.length;
@ -55,10 +66,12 @@ public final class BytesRef {
   * unicode text, with no unpaired surrogates or U+FFFF.
   */
  public BytesRef(CharSequence text) {
+    this();
    copy(text);
  }

  public BytesRef(BytesRef other) {
+    this();
    copy(other);
  }

@ -69,13 +82,6 @@ public final class BytesRef {
   * unpaired surrogates or invalid UTF16 code units.
   */
  public void copy(CharSequence text) {
-    // TODO: new byte[10] is waste of resources; it should
-    // simply allocate text.length()*4 like UnicodeUtil.
-    // Ideally, I would remove this here and add a
-    // null-check in UnicodeUtil. (Uwe)
-    if (bytes == null) {
-      bytes = new byte[10];
-    }
    UnicodeUtil.UTF16toUTF8(text, 0, text.length(), this);
  }

@ -178,10 +184,8 @@ public final class BytesRef {
  }

  public void copy(BytesRef other) {
-    if (bytes == null) {
+    if (bytes.length < other.length) {
      bytes = new byte[other.length];
-    } else {
-      bytes = ArrayUtil.grow(bytes, other.length);
    }
    System.arraycopy(other.bytes, other.offset, bytes, 0, other.length);
    length = other.length;
@ -198,6 +202,68 @@ public final class BytesRef {
    return utf8SortedAsUTF16SortOrder;
  }

+  /** Unsigned byte order comparison */
+  /*
+  public int compareTo(BytesRef other) {
+    if (this == other) return 0;
+
+    final byte[] aBytes = this.bytes;
+    int aUpto = this.offset;
+    final byte[] bBytes = other.bytes;
+    int bUpto = other.offset;
+
+    final int aStop = aUpto + Math.min(this.length, other.length);
+
+    while(aUpto < aStop) {
+      int aByte = aBytes[aUpto++] & 0xff;
+      int bByte = bBytes[bUpto++] & 0xff;
+      int diff = aByte - bByte;
+      if (diff != 0) return diff;
+    }
+
+    // One is a prefix of the other, or, they are equal:
+    return this.length - other.length;
+  }
+  */
+
+  /** Lucene default index order. Currently the same as String.compareTo() (UTF16) but will change
+   * in the future to unsigned byte comparison. */
+  public int compareTo(BytesRef other) {
+    if (this == other) return 0;
+
+    final byte[] aBytes = this.bytes;
+    int aUpto = this.offset;
+    final byte[] bBytes = other.bytes;
+    int bUpto = other.offset;
+
+    final int aStop = aUpto + Math.min(this.length, other.length);
+
+    while(aUpto < aStop) {
+      int aByte = aBytes[aUpto++] & 0xff;
+      int bByte = bBytes[bUpto++] & 0xff;
+      if (aByte != bByte) {
+
+        // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
+
+        // We know the terms are not equal, but, we may
+        // have to carefully fixup the bytes at the
+        // difference to match UTF16's sort order:
+        if (aByte >= 0xee && bByte >= 0xee) {
+          if ((aByte & 0xfe) == 0xee) {
+            aByte += 0x10;
+          }
+          if ((bByte&0xfe) == 0xee) {
+            bByte += 0x10;
+          }
+        }
+        return aByte - bByte;
+      }
+    }
+
+    // One is a prefix of the other, or, they are equal:
+    return this.length - other.length;
+  }
+
  private static class UTF8SortedAsUTF16Comparator implements Comparator<BytesRef> {
    // Only singleton
    private UTF8SortedAsUTF16Comparator() {};
--- a/lucene/src/java/org/apache/lucene/util/NumericUtils.java
+++ b/lucene/src/java/org/apache/lucene/util/NumericUtils.java
@ -107,13 +107,11 @@ public final class NumericUtils {
  public static int longToPrefixCoded(final long val, final int shift, final BytesRef bytes) {
    if (shift>63 || shift<0)
      throw new IllegalArgumentException("Illegal shift value, must be 0..63");
-    if (bytes.bytes == null) {
-      bytes.bytes = new byte[NumericUtils.BUF_SIZE_LONG];
-    } else if (bytes.bytes.length < NumericUtils.BUF_SIZE_LONG) {
-      bytes.grow(NumericUtils.BUF_SIZE_LONG);
-    }
    int hash, nChars = (63-shift)/7 + 1;
    bytes.length = nChars+1;
+    if (bytes.bytes.length < bytes.length) {
+      bytes.grow(NumericUtils.BUF_SIZE_LONG);
+    }
    bytes.bytes[0] = (byte) (hash = (SHIFT_START_LONG + shift));
    long sortableBits = val ^ 0x8000000000000000L;
    sortableBits >>>= shift;
@ -167,13 +165,11 @@ public final class NumericUtils {
  public static int intToPrefixCoded(final int val, final int shift, final BytesRef bytes) {
    if (shift>31 || shift<0)
      throw new IllegalArgumentException("Illegal shift value, must be 0..31");
-    if (bytes.bytes == null) {
-      bytes.bytes = new byte[NumericUtils.BUF_SIZE_INT];
-    } else if (bytes.bytes.length < NumericUtils.BUF_SIZE_INT) {
-      bytes.grow(NumericUtils.BUF_SIZE_INT);
-    }
    int hash, nChars = (31-shift)/7 + 1;
    bytes.length = nChars+1;
+    if (bytes.bytes.length < bytes.length) {
+      bytes.grow(NumericUtils.BUF_SIZE_INT);
+    }
    bytes.bytes[0] = (byte) (hash = (SHIFT_START_INT + shift));
    int sortableBits = val ^ 0x80000000;
    sortableBits >>>= shift;
--- a/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java
+++ b/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java
@ -154,8 +154,8 @@ final public class UnicodeUtil {
  }

  /** Encode characters from a char[] source, starting at
-   *  offset for length chars.  Returns the number of bytes
-   *  written to bytesOut. */
+   *  offset for length chars.
+   */
  public static void UTF16toUTF8(final char[] source, final int offset, final int length, BytesRef result) {

    int upto = 0;
@ -165,7 +165,7 @@ final public class UnicodeUtil {
    // Pre-allocate for worst case 4-for-1
    final int maxLen = length * 4;
    if (out.length < maxLen)
-      out = result.bytes = new byte[ArrayUtil.oversize(maxLen, 1)];
+      out = result.bytes = new byte[maxLen];
    result.offset = 0;

    while(i < end) {
@ -209,8 +209,8 @@ final public class UnicodeUtil {
  }

  /** Encode characters from this String, starting at offset
-   *  for length characters.  Returns the number of bytes
-   *  written to bytesOut. */
+   *  for length characters.
+   */
  public static void UTF16toUTF8(final CharSequence s, final int offset, final int length, BytesRef result) {
    final int end = offset + length;

@ -219,7 +219,7 @@ final public class UnicodeUtil {
    // Pre-allocate for worst case 4-for-1
    final int maxLen = length * 4;
    if (out.length < maxLen)
-      out = result.bytes = new byte[ArrayUtil.oversize(maxLen, 1)];
+      out = result.bytes = new byte[maxLen];

    int upto = 0;
    for(int i=offset;i<end;i++) {