LUCENE-3590: fix copyBytes to respect offset, remove dup'ed compareTo code, add javadocs and TODOs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1206789 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-11-27 18:03:22 +00:00
parent 60d9125050
commit 1ee685d837
3 changed files with 92 additions and 32 deletions

View File

@ -25,6 +25,7 @@ import java.util.Comparator;
*
* @lucene.experimental */
public final class BytesRef implements Comparable<BytesRef>,Cloneable {
/** An empty byte array for convenience */
public static final byte[] EMPTY_BYTES = new byte[0];
/** The contents of the BytesRef. Should never be {@code null}. */
@ -36,8 +37,9 @@ public final class BytesRef implements Comparable<BytesRef>,Cloneable {
/** Length of used bytes. */
public int length;
/** Create a BytesRef with {@link #EMPTY_BYTES} */
public BytesRef() {
bytes = EMPTY_BYTES;
this(EMPTY_BYTES);
}
/** This instance will directly reference bytes w/o making a copy.
@ -53,20 +55,23 @@ public final class BytesRef implements Comparable<BytesRef>,Cloneable {
/** This instance will directly reference bytes w/o making a copy.
* bytes should not be null */
public BytesRef(byte[] bytes) {
assert bytes != null;
this.bytes = bytes;
this.offset = 0;
this.length = bytes.length;
this(bytes, 0, bytes.length);
}
/**
* Create a BytesRef pointing to a new array of size <code>capacity</code>.
* Offset and length will both be zero.
*/
public BytesRef(int capacity) {
this.bytes = new byte[capacity];
}
/**
* @param text Initialize the byte[] from the UTF8 bytes
* for the provided String. This must be well-formed
* unicode text, with no unpaired surrogates or U+FFFF.
* Initialize the byte[] from the UTF8 bytes
* for the provided String.
*
* @param text This must be well-formed
* unicode text, with no unpaired surrogates.
*/
public BytesRef(CharSequence text) {
this();
@ -79,11 +84,20 @@ public final class BytesRef implements Comparable<BytesRef>,Cloneable {
* @param text Must be well-formed unicode text, with no
* unpaired surrogates or invalid UTF16 code units.
*/
// TODO broken if offset != 0
public void copyChars(CharSequence text) {
UnicodeUtil.UTF16toUTF8(text, 0, text.length(), this);
}
/**
* Expert: compares the bytes against another BytesRef,
* returning true if the bytes are equal.
*
* @param other Another BytesRef, should not be null.
* @lucene.internal
*/
public boolean bytesEquals(BytesRef other) {
assert other != null;
if (length == other.length) {
int otherUpto = other.offset;
final byte[] otherBytes = other.bytes;
@ -186,20 +200,24 @@ public final class BytesRef implements Comparable<BytesRef>,Cloneable {
/**
* Copies the bytes from the given {@link BytesRef}
* <p>
* NOTE: this method resets the offset to 0 and resizes the reference array
* if needed.
* NOTE: if this would exceed the array size, this method creates a
* new reference array.
*/
public void copyBytes(BytesRef other) {
if (bytes.length < other.length) {
bytes = new byte[other.length];
offset = 0;
}
System.arraycopy(other.bytes, other.offset, bytes, 0, other.length);
System.arraycopy(other.bytes, other.offset, bytes, offset, other.length);
length = other.length;
offset = 0;
}
/**
* Appends the bytes from the given {@link BytesRef}
* <p>
* NOTE: if this would exceed the array size, this method creates a
* new reference array.
*/
public void append(BytesRef other) {
int newLen = length + other.length;
if (bytes.length < newLen) {
@ -212,30 +230,15 @@ public final class BytesRef implements Comparable<BytesRef>,Cloneable {
length = newLen;
}
// TODO: stupid if existing offset is non-zero.
/** @lucene.internal */
public void grow(int newLength) {
bytes = ArrayUtil.grow(bytes, newLength);
}
/** Unsigned byte order comparison */
public int compareTo(BytesRef other) {
if (this == other) return 0;
final byte[] aBytes = this.bytes;
int aUpto = this.offset;
final byte[] bBytes = other.bytes;
int bUpto = other.offset;
final int aStop = aUpto + Math.min(this.length, other.length);
while(aUpto < aStop) {
int aByte = aBytes[aUpto++] & 0xff;
int bByte = bBytes[bUpto++] & 0xff;
int diff = aByte - bByte;
if (diff != 0) return diff;
}
// One is a prefix of the other, or, they are equal:
return this.length - other.length;
return utf8SortedAsUnicodeSortOrder.compare(this, other);
}
private final static Comparator<BytesRef> utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator();

View File

@ -123,6 +123,7 @@ public final class UnicodeUtil {
/** Encode characters from a char[] source, starting at
* offset for length chars. Returns a hash of the resulting bytes. After encoding, result.offset will always be 0. */
// TODO: broken if incoming result.offset != 0
public static int UTF16toUTF8WithHash(final char[] source, final int offset, final int length, BytesRef result) {
int hash = 0;
int upto = 0;
@ -179,6 +180,7 @@ public final class UnicodeUtil {
/** Encode characters from a char[] source, starting at
* offset for length chars. After encoding, result.offset will always be 0.
*/
// TODO: broken if incoming result.offset != 0
public static void UTF16toUTF8(final char[] source, final int offset, final int length, BytesRef result) {
int upto = 0;
@ -234,6 +236,7 @@ public final class UnicodeUtil {
/** Encode characters from this String, starting at offset
* for length characters. After encoding, result.offset will always be 0.
*/
// TODO: broken if incoming result.offset != 0
public static void UTF16toUTF8(final CharSequence s, final int offset, final int length, BytesRef result) {
final int end = offset + length;
@ -427,8 +430,10 @@ public final class UnicodeUtil {
return codePointCount;
}
// TODO: broken if incoming result.offset != 0
public static void UTF8toUTF32(final BytesRef utf8, final IntsRef utf32) {
// pre-alloc for worst case
// TODO: ints cannot be null, should be an assert
if (utf32.ints == null || utf32.ints.length < utf8.length) {
utf32.ints = new int[utf8.length];
}
@ -567,6 +572,7 @@ public final class UnicodeUtil {
* can result in an ArrayOutOfBoundsException if invalid UTF-8 is passed).
* Explicit checks for valid UTF-8 are not performed.
*/
// TODO: broken if chars.offset != 0
public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars) {
int out_offset = chars.offset = 0;
final char[] out = chars.chars = ArrayUtil.grow(chars.chars, length);

View File

@ -0,0 +1,51 @@
package org.apache.lucene.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TestBytesRef extends LuceneTestCase {
public void testEmpty() {
BytesRef b = new BytesRef();
assertEquals(BytesRef.EMPTY_BYTES, b.bytes);
assertEquals(0, b.offset);
assertEquals(0, b.length);
}
public void testFromBytes() {
byte bytes[] = new byte[] { (byte)'a', (byte)'b', (byte)'c', (byte)'d' };
BytesRef b = new BytesRef(bytes);
assertEquals(bytes, b.bytes);
assertEquals(0, b.offset);
assertEquals(4, b.length);
BytesRef b2 = new BytesRef(bytes, 1, 3);
assertEquals("bcd", b2.utf8ToString());
assertFalse(b.equals(b2));
}
public void testFromChars() {
for (int i = 0; i < 100; i++) {
String s = _TestUtil.randomUnicodeString(random);
String s2 = new BytesRef(s).utf8ToString();
assertEquals(s, s2);
}
// only for 4.x
assertEquals("\uFFFF", new BytesRef("\uFFFF").utf8ToString());
}
}