LUCENE-3590: fix copyBytes to respect offset, remove dup'ed compareTo code, add javadocs and TODOs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1206789 13f79535-47bb-0310-9956-ffa450edef68
2011-11-27 18:03:22 +00:00 · 2011-11-27 18:03:22 +00:00 · 1ee685d837
parent 60d9125050
commit 1ee685d837
3 changed files with 92 additions and 32 deletions
--- a/lucene/src/java/org/apache/lucene/util/BytesRef.java
+++ b/lucene/src/java/org/apache/lucene/util/BytesRef.java
@ -25,6 +25,7 @@ import java.util.Comparator;
 *
 *  @lucene.experimental */
 public final class BytesRef implements Comparable<BytesRef>,Cloneable {
+  /** An empty byte array for convenience */
  public static final byte[] EMPTY_BYTES = new byte[0]; 

  /** The contents of the BytesRef. Should never be {@code null}. */
@ -36,8 +37,9 @@ public final class BytesRef implements Comparable<BytesRef>,Cloneable {
  /** Length of used bytes. */
  public int length;

+  /** Create a BytesRef with {@link #EMPTY_BYTES} */
  public BytesRef() {
-    bytes = EMPTY_BYTES;
+    this(EMPTY_BYTES);
  }

  /** This instance will directly reference bytes w/o making a copy.
@ -53,20 +55,23 @@ public final class BytesRef implements Comparable<BytesRef>,Cloneable {
  /** This instance will directly reference bytes w/o making a copy.
   * bytes should not be null */
  public BytesRef(byte[] bytes) {
-    assert bytes != null;
-    this.bytes = bytes;
-    this.offset = 0;
-    this.length = bytes.length;
+    this(bytes, 0, bytes.length);
  }

+  /** 
+   * Create a BytesRef pointing to a new array of size <code>capacity</code>.
+   * Offset and length will both be zero.
+   */
  public BytesRef(int capacity) {
    this.bytes = new byte[capacity];
  }

  /**
-   * @param text Initialize the byte[] from the UTF8 bytes
-   * for the provided String.  This must be well-formed
-   * unicode text, with no unpaired surrogates or U+FFFF.
+   * Initialize the byte[] from the UTF8 bytes
+   * for the provided String.  
+   * 
+   * @param text This must be well-formed
+   * unicode text, with no unpaired surrogates.
   */
  public BytesRef(CharSequence text) {
    this();
@ -79,11 +84,20 @@ public final class BytesRef implements Comparable<BytesRef>,Cloneable {
   * @param text Must be well-formed unicode text, with no
   * unpaired surrogates or invalid UTF16 code units.
   */
+  // TODO broken if offset != 0
  public void copyChars(CharSequence text) {
    UnicodeUtil.UTF16toUTF8(text, 0, text.length(), this);
  }
  
+  /**
+   * Expert: compares the bytes against another BytesRef,
+   * returning true if the bytes are equal.
+   * 
+   * @param other Another BytesRef, should not be null.
+   * @lucene.internal
+   */
  public boolean bytesEquals(BytesRef other) {
+    assert other != null;
    if (length == other.length) {
      int otherUpto = other.offset;
      final byte[] otherBytes = other.bytes;
@ -186,20 +200,24 @@ public final class BytesRef implements Comparable<BytesRef>,Cloneable {
  /**
   * Copies the bytes from the given {@link BytesRef}
   * <p>
-   * NOTE: this method resets the offset to 0 and resizes the reference array
-   * if needed.
+   * NOTE: if this would exceed the array size, this method creates a 
+   * new reference array.
   */
  public void copyBytes(BytesRef other) {
    if (bytes.length < other.length) {
      bytes = new byte[other.length];
+      offset = 0;
    }
-    System.arraycopy(other.bytes, other.offset, bytes, 0, other.length);
+    System.arraycopy(other.bytes, other.offset, bytes, offset, other.length);
    length = other.length;
-    offset = 0;
  }

-
-
+  /**
+   * Appends the bytes from the given {@link BytesRef}
+   * <p>
+   * NOTE: if this would exceed the array size, this method creates a 
+   * new reference array.
+   */
  public void append(BytesRef other) {
    int newLen = length + other.length;
    if (bytes.length < newLen) {
@ -212,30 +230,15 @@ public final class BytesRef implements Comparable<BytesRef>,Cloneable {
    length = newLen;
  }

+  // TODO: stupid if existing offset is non-zero.
+  /** @lucene.internal */
  public void grow(int newLength) {
    bytes = ArrayUtil.grow(bytes, newLength);
  }

  /** Unsigned byte order comparison */
  public int compareTo(BytesRef other) {
-    if (this == other) return 0;
-
-    final byte[] aBytes = this.bytes;
-    int aUpto = this.offset;
-    final byte[] bBytes = other.bytes;
-    int bUpto = other.offset;
-
-    final int aStop = aUpto + Math.min(this.length, other.length);
-
-    while(aUpto < aStop) {
-      int aByte = aBytes[aUpto++] & 0xff;
-      int bByte = bBytes[bUpto++] & 0xff;
-      int diff = aByte - bByte;
-      if (diff != 0) return diff;
-    }
-
-    // One is a prefix of the other, or, they are equal:
-    return this.length - other.length;
+    return utf8SortedAsUnicodeSortOrder.compare(this, other);
  }
  
  private final static Comparator<BytesRef> utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator();
--- a/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java
+++ b/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java
@ -123,6 +123,7 @@ public final class UnicodeUtil {

  /** Encode characters from a char[] source, starting at
   *  offset for length chars.  Returns a hash of the resulting bytes.  After encoding, result.offset will always be 0. */
+  // TODO: broken if incoming result.offset != 0
  public static int UTF16toUTF8WithHash(final char[] source, final int offset, final int length, BytesRef result) {
    int hash = 0;
    int upto = 0;
@ -179,6 +180,7 @@ public final class UnicodeUtil {
  /** Encode characters from a char[] source, starting at
   *  offset for length chars. After encoding, result.offset will always be 0.
   */
+  // TODO: broken if incoming result.offset != 0
  public static void UTF16toUTF8(final char[] source, final int offset, final int length, BytesRef result) {

    int upto = 0;
@ -234,6 +236,7 @@ public final class UnicodeUtil {
  /** Encode characters from this String, starting at offset
   *  for length characters. After encoding, result.offset will always be 0.
   */
+  // TODO: broken if incoming result.offset != 0
  public static void UTF16toUTF8(final CharSequence s, final int offset, final int length, BytesRef result) {
    final int end = offset + length;

@ -427,8 +430,10 @@ public final class UnicodeUtil {
    return codePointCount;
  }

+  // TODO: broken if incoming result.offset != 0
  public static void UTF8toUTF32(final BytesRef utf8, final IntsRef utf32) {
    // pre-alloc for worst case
+    // TODO: ints cannot be null, should be an assert
    if (utf32.ints == null || utf32.ints.length < utf8.length) {
      utf32.ints = new int[utf8.length];
    }
@ -567,6 +572,7 @@ public final class UnicodeUtil {
   * can result in an ArrayOutOfBoundsException if invalid UTF-8 is passed).
   * Explicit checks for valid UTF-8 are not performed. 
   */
+  // TODO: broken if chars.offset != 0
  public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars) {
    int out_offset = chars.offset = 0;
    final char[] out = chars.chars =  ArrayUtil.grow(chars.chars, length);
--- a/lucene/src/test/org/apache/lucene/util/TestBytesRef.java
+++ b/lucene/src/test/org/apache/lucene/util/TestBytesRef.java
@ -0,0 +1,51 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestBytesRef extends LuceneTestCase {
+  public void testEmpty() {
+    BytesRef b = new BytesRef();
+    assertEquals(BytesRef.EMPTY_BYTES, b.bytes);
+    assertEquals(0, b.offset);
+    assertEquals(0, b.length);
+  }
+  
+  public void testFromBytes() {
+    byte bytes[] = new byte[] { (byte)'a', (byte)'b', (byte)'c', (byte)'d' };
+    BytesRef b = new BytesRef(bytes);
+    assertEquals(bytes, b.bytes);
+    assertEquals(0, b.offset);
+    assertEquals(4, b.length);
+    
+    BytesRef b2 = new BytesRef(bytes, 1, 3);
+    assertEquals("bcd", b2.utf8ToString());
+    
+    assertFalse(b.equals(b2));
+  }
+  
+  public void testFromChars() {
+    for (int i = 0; i < 100; i++) {
+      String s = _TestUtil.randomUnicodeString(random);
+      String s2 = new BytesRef(s).utf8ToString();
+      assertEquals(s, s2);
+    }
+    
+    // only for 4.x
+    assertEquals("\uFFFF", new BytesRef("\uFFFF").utf8ToString());
+  }
+}