From cdac1f711339a6614fffb085f204fc9ef5ba5b39 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Sun, 3 Jan 2010 09:22:40 +0000 Subject: [PATCH] LUCENE-2084: remove Byte/CharBuffer wrapping for collation key generation git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@895341 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 5 + contrib/CHANGES.txt | 5 + .../collation/ICUCollationKeyFilter.java | 12 +- .../lucene/collation/CollationKeyFilter.java | 11 +- .../util/IndexableBinaryStringTools.java | 381 +++++++++++------- .../util/TestIndexableBinaryStringTools.java | 236 +++++++++-- 6 files changed, 459 insertions(+), 191 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index a93a8c4ba0e..471b00e7428 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -153,6 +153,11 @@ Optimizations * LUCENE-2169: Improved CharArraySet.copy(), if source set is also a CharArraySet. (Simon Willnauer via Uwe Schindler) +* LUCENE-2084: Change IndexableBinaryStringTools to work on byte[] and char[] + directly, instead of Byte/CharBuffers, and modify CollationKeyFilter to + take advantage of this for faster performance. + (Steven Rowe, Uwe Schindler, Robert Muir) + Build * LUCENE-2124: Moved the JDK-based collation support from contrib/collation diff --git a/contrib/CHANGES.txt b/contrib/CHANGES.txt index 628e27a9308..735e8751928 100644 --- a/contrib/CHANGES.txt +++ b/contrib/CHANGES.txt @@ -73,6 +73,11 @@ Optimizations * LUCENE-2157: DelimitedPayloadTokenFilter no longer copies the buffer over itsself. Instead it sets only the length. This patch also optimizes the logic of the filter and uses NIO for IdentityEncoder. (Uwe Schindler) + + * LUCENE-2084: Change IndexableBinaryStringTools to work on byte[] and char[] + directly, instead of Byte/CharBuffers, and modify ICUCollationKeyFilter to + take advantage of this for faster performance. + (Steven Rowe, Uwe Schindler, Robert Muir) Test Cases diff --git a/contrib/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java b/contrib/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java index a008814e149..6309b2e4163 100644 --- a/contrib/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java +++ b/contrib/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java @@ -23,13 +23,10 @@ import com.ibm.icu.text.RawCollationKey; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.IndexableBinaryStringTools; import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; /** @@ -92,15 +89,14 @@ public final class ICUCollationKeyFilter extends TokenFilter { char[] termBuffer = termAtt.termBuffer(); String termText = new String(termBuffer, 0, termAtt.termLength()); collator.getRawCollationKey(termText, reusableKey); - ByteBuffer collationKeyBuf = ByteBuffer.wrap(reusableKey.bytes, 0, reusableKey.size); - int encodedLength - = IndexableBinaryStringTools.getEncodedLength(collationKeyBuf); + int encodedLength = IndexableBinaryStringTools.getEncodedLength( + reusableKey.bytes, 0, reusableKey.size); if (encodedLength > termBuffer.length) { termAtt.resizeTermBuffer(encodedLength); } termAtt.setTermLength(encodedLength); - CharBuffer wrappedTermBuffer = CharBuffer.wrap(termAtt.termBuffer()); - IndexableBinaryStringTools.encode(collationKeyBuf, wrappedTermBuffer); + IndexableBinaryStringTools.encode(reusableKey.bytes, 0, reusableKey.size, + termAtt.termBuffer(), 0, encodedLength); return true; } else { return false; diff --git a/src/java/org/apache/lucene/collation/CollationKeyFilter.java b/src/java/org/apache/lucene/collation/CollationKeyFilter.java index 6f0ea0578d2..71085c3ae1a 100644 --- a/src/java/org/apache/lucene/collation/CollationKeyFilter.java +++ b/src/java/org/apache/lucene/collation/CollationKeyFilter.java @@ -24,8 +24,6 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.IndexableBinaryStringTools; import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; import java.text.Collator; @@ -94,15 +92,14 @@ public final class CollationKeyFilter extends TokenFilter { char[] termBuffer = termAtt.termBuffer(); String termText = new String(termBuffer, 0, termAtt.termLength()); byte[] collationKey = collator.getCollationKey(termText).toByteArray(); - ByteBuffer collationKeyBuf = ByteBuffer.wrap(collationKey); - int encodedLength - = IndexableBinaryStringTools.getEncodedLength(collationKeyBuf); + int encodedLength = IndexableBinaryStringTools.getEncodedLength( + collationKey, 0, collationKey.length); if (encodedLength > termBuffer.length) { termAtt.resizeTermBuffer(encodedLength); } termAtt.setTermLength(encodedLength); - CharBuffer wrappedTermBuffer = CharBuffer.wrap(termAtt.termBuffer()); - IndexableBinaryStringTools.encode(collationKeyBuf, wrappedTermBuffer); + IndexableBinaryStringTools.encode(collationKey, 0, collationKey.length, + termAtt.termBuffer(), 0, encodedLength); return true; } else { return false; diff --git a/src/java/org/apache/lucene/util/IndexableBinaryStringTools.java b/src/java/org/apache/lucene/util/IndexableBinaryStringTools.java index 05dd903451f..a4761087eaf 100644 --- a/src/java/org/apache/lucene/util/IndexableBinaryStringTools.java +++ b/src/java/org/apache/lucene/util/IndexableBinaryStringTools.java @@ -23,29 +23,33 @@ import java.nio.ByteBuffer; /** * Provides support for converting byte sequences to Strings and back again. * The resulting Strings preserve the original byte sequences' sort order. - * + *

* The Strings are constructed using a Base 8000h encoding of the original * binary data - each char of an encoded String represents a 15-bit chunk * from the byte sequence. Base 8000h was chosen because it allows for all * lower 15 bits of char to be used without restriction; the surrogate range * [U+D8000-U+DFFF] does not represent valid chars, and would require * complicated handling to avoid them and allow use of char's high bit. - * + *

* Although unset bits are used as padding in the final char, the original * byte sequence could contain trailing bytes with no set bits (null bytes): * padding is indistinguishable from valid information. To overcome this * problem, a char is appended, indicating the number of encoded bytes in the * final content char. - * - * This class's operations are defined over CharBuffers and ByteBuffers, to - * allow for wrapped arrays to be reused, reducing memory allocation costs for - * repeated operations. Note that this class calls array() and arrayOffset() + *

+ * Some methods in this class are defined over CharBuffers and ByteBuffers, but + * these are deprecated in favor of methods that operate directly on byte[] and + * char[] arrays. Note that this class calls array() and arrayOffset() * on the CharBuffers and ByteBuffers it uses, so only wrapped arrays may be - * used. This class interprets the arrayOffset() and limit() values returned by - * its input buffers as beginning and end+1 positions on the wrapped array, + * used. This class interprets the arrayOffset() and limit() values returned + * by its input buffers as beginning and end+1 positions on the wrapped array, * respectively; similarly, on the output buffer, arrayOffset() is the first * position written to, and limit() is set to one past the final output array * position. + *

+ * WARNING: This means that the deprecated Buffer-based methods + * only work correctly with buffers that have an offset of 0. For example, they + * will not correctly interpret buffers returned by {@link ByteBuffer#slice}. */ public class IndexableBinaryStringTools { @@ -68,204 +72,276 @@ public class IndexableBinaryStringTools { /** * Returns the number of chars required to encode the given byte sequence. * - * @param original The byte sequence to be encoded. Must be backed by an array. + * @param original The byte sequence to be encoded. Must be backed by an + * array. * @return The number of chars required to encode the given byte sequence - * @throws IllegalArgumentException If the given ByteBuffer is not backed by an array + * @throws IllegalArgumentException If the given ByteBuffer is not backed by + * an array + * @deprecated Use {@link #getEncodedLength(byte[], int, int)} instead. This + * method will be removed in Lucene 4.0 */ - public static int getEncodedLength(ByteBuffer original) + @Deprecated + public static int getEncodedLength(ByteBuffer original) throws IllegalArgumentException { if (original.hasArray()) { - // Use long for intermediaries to protect against overflow - long length = (long)(original.limit() - original.arrayOffset()); - return (int)((length * 8L + 14L) / 15L) + 1; + return getEncodedLength(original.array(), original.arrayOffset(), + original.limit() - original.arrayOffset()); } else { throw new IllegalArgumentException("original argument must have a backing array"); } } + + /** + * Returns the number of chars required to encode the given bytes. + * + * @param inputArray byte sequence to be encoded + * @param inputOffset initial offset into inputArray + * @param inputLength number of bytes in inputArray + * @return The number of chars required to encode the number of bytes. + */ + public static int getEncodedLength(byte[] inputArray, int inputOffset, + int inputLength) { + // Use long for intermediaries to protect against overflow + return (int)(((long)inputLength * 8L + 14L) / 15L) + 1; + } + /** * Returns the number of bytes required to decode the given char sequence. * - * @param encoded The char sequence to be encoded. Must be backed by an array. + * @param encoded The char sequence to be decoded. Must be backed by an array. * @return The number of bytes required to decode the given char sequence - * @throws IllegalArgumentException If the given CharBuffer is not backed by an array + * @throws IllegalArgumentException If the given CharBuffer is not backed by + * an array + * @deprecated Use {@link #getDecodedLength(char[], int, int)} instead. This + * method will be removed in Lucene 4.0 */ + @Deprecated public static int getDecodedLength(CharBuffer encoded) throws IllegalArgumentException { if (encoded.hasArray()) { - int numChars = encoded.limit() - encoded.arrayOffset() - 1; - if (numChars <= 0) { - return 0; - } else { - int numFullBytesInFinalChar = encoded.charAt(encoded.limit() - 1); - int numEncodedChars = numChars - 1; - return (numEncodedChars * 15 + 7) / 8 + numFullBytesInFinalChar; - } + return getDecodedLength(encoded.array(), encoded.arrayOffset(), + encoded.limit() - encoded.arrayOffset()); } else { throw new IllegalArgumentException("encoded argument must have a backing array"); } } + + /** + * Returns the number of bytes required to decode the given char sequence. + * + * @param encoded char sequence to be decoded + * @param offset initial offset + * @param length number of characters + * @return The number of bytes required to decode the given char sequence + */ + public static int getDecodedLength(char[] encoded, int offset, int length) { + final int numChars = length - 1; + if (numChars <= 0) { + return 0; + } else { + // Use long for intermediaries to protect against overflow + final long numFullBytesInFinalChar = encoded[offset + length - 1]; + final long numEncodedChars = numChars - 1; + return (int)((numEncodedChars * 15L + 7L) / 8L + numFullBytesInFinalChar); + } + } /** - * Encodes the input byte sequence into the output char sequence. Before + * Encodes the input byte sequence into the output char sequence. Before * calling this method, ensure that the output CharBuffer has sufficient * capacity by calling {@link #getEncodedLength(java.nio.ByteBuffer)}. * * @param input The byte sequence to encode - * @param output Where the char sequence encoding result will go. The limit - * is set to one past the position of the final char. + * @param output Where the char sequence encoding result will go. The limit is + * set to one past the position of the final char. * @throws IllegalArgumentException If either the input or the output buffer - * is not backed by an array + * is not backed by an array + * @deprecated Use {@link #encode(byte[], int, int, char[], int, int)} + * instead. This method will be removed in Lucene 4.0 */ + @Deprecated public static void encode(ByteBuffer input, CharBuffer output) { if (input.hasArray() && output.hasArray()) { - byte[] inputArray = input.array(); - int inputOffset = input.arrayOffset(); - int inputLength = input.limit() - inputOffset; - char[] outputArray = output.array(); - int outputOffset = output.arrayOffset(); - int outputLength = getEncodedLength(input); - output.limit(outputOffset + outputLength); // Set output final pos + 1 + final int inputOffset = input.arrayOffset(); + final int inputLength = input.limit() - inputOffset; + final int outputOffset = output.arrayOffset(); + final int outputLength = getEncodedLength(input.array(), inputOffset, + inputLength); + output.limit(outputLength + outputOffset); output.position(0); - if (inputLength > 0) { - int inputByteNum = inputOffset; - int caseNum = 0; - int outputCharNum = outputOffset; - CodingCase codingCase; - for ( ; inputByteNum + CODING_CASES[caseNum].numBytes <= inputLength ; - ++outputCharNum ) { - codingCase = CODING_CASES[caseNum]; - if (2 == codingCase.numBytes) { - outputArray[outputCharNum] - = (char)(((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) - + (((inputArray[inputByteNum + 1] & 0xFF) >>> codingCase.finalShift) - & codingCase.finalMask) - & (short)0x7FFF); - } else { // numBytes is 3 - outputArray[outputCharNum] - = (char)(((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) - + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift) - + (((inputArray[inputByteNum + 2] & 0xFF) >>> codingCase.finalShift) - & codingCase.finalMask) - & (short)0x7FFF); - } - inputByteNum += codingCase.advanceBytes; - if (++caseNum == CODING_CASES.length) { - caseNum = 0; - } - } - // Produce final char (if any) and trailing count chars. + encode(input.array(), inputOffset, inputLength, output.array(), + outputOffset, outputLength); + } else { + throw new IllegalArgumentException("Arguments must have backing arrays"); + } + } + + /** + * Encodes the input byte sequence into the output char sequence. Before + * calling this method, ensure that the output array has sufficient + * capacity by calling {@link #getEncodedLength(byte[], int, int)}. + * + * @param inputArray byte sequence to be encoded + * @param inputOffset initial offset into inputArray + * @param inputLength number of bytes in inputArray + * @param outputArray char sequence to store encoded result + * @param outputOffset initial offset into outputArray + * @param outputLength length of output, must be getEncodedLength + */ + public static void encode(byte[] inputArray, int inputOffset, + int inputLength, char[] outputArray, int outputOffset, int outputLength) { + assert (outputLength == getEncodedLength(inputArray, inputOffset, + inputLength)); + if (inputLength > 0) { + int inputByteNum = inputOffset; + int caseNum = 0; + int outputCharNum = outputOffset; + CodingCase codingCase; + for (; inputByteNum + CODING_CASES[caseNum].numBytes <= inputLength; ++outputCharNum) { codingCase = CODING_CASES[caseNum]; - - if (inputByteNum + 1 < inputLength) { // codingCase.numBytes must be 3 - outputArray[outputCharNum++] - = (char)((((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) - + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)) - & (short)0x7FFF); - // Add trailing char containing the number of full bytes in final char - outputArray[outputCharNum++] = (char)1; - } else if (inputByteNum < inputLength) { - outputArray[outputCharNum++] - = (char)(((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) - & (short)0x7FFF); - // Add trailing char containing the number of full bytes in final char - outputArray[outputCharNum++] = caseNum == 0 ? (char)1 : (char)0; - } else { // No left over bits - last char is completely filled. - // Add trailing char containing the number of full bytes in final char - outputArray[outputCharNum++] = (char)1; + if (2 == codingCase.numBytes) { + outputArray[outputCharNum] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + + (((inputArray[inputByteNum + 1] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short) 0x7FFF); + } else { // numBytes is 3 + outputArray[outputCharNum] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift) + + (((inputArray[inputByteNum + 2] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short) 0x7FFF); + } + inputByteNum += codingCase.advanceBytes; + if (++caseNum == CODING_CASES.length) { + caseNum = 0; } } + // Produce final char (if any) and trailing count chars. + codingCase = CODING_CASES[caseNum]; + + if (inputByteNum + 1 < inputLength) { // codingCase.numBytes must be 3 + outputArray[outputCharNum++] = (char) ((((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)) & (short) 0x7FFF); + // Add trailing char containing the number of full bytes in final char + outputArray[outputCharNum++] = (char) 1; + } else if (inputByteNum < inputLength) { + outputArray[outputCharNum++] = (char) (((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) & (short) 0x7FFF); + // Add trailing char containing the number of full bytes in final char + outputArray[outputCharNum++] = caseNum == 0 ? (char) 1 : (char) 0; + } else { // No left over bits - last char is completely filled. + // Add trailing char containing the number of full bytes in final char + outputArray[outputCharNum++] = (char) 1; + } + } + } + + /** + * Decodes the input char sequence into the output byte sequence. Before + * calling this method, ensure that the output ByteBuffer has sufficient + * capacity by calling {@link #getDecodedLength(java.nio.CharBuffer)}. + * + * @param input The char sequence to decode + * @param output Where the byte sequence decoding result will go. The limit is + * set to one past the position of the final char. + * @throws IllegalArgumentException If either the input or the output buffer + * is not backed by an array + * @deprecated Use {@link #decode(char[], int, int, byte[], int, int)} + * instead. This method will be removed in Lucene 4.0 + */ + @Deprecated + public static void decode(CharBuffer input, ByteBuffer output) { + if (input.hasArray() && output.hasArray()) { + final int inputOffset = input.arrayOffset(); + final int inputLength = input.limit() - inputOffset; + final int outputOffset = output.arrayOffset(); + final int outputLength = getDecodedLength(input.array(), inputOffset, + inputLength); + output.limit(outputLength + outputOffset); + output.position(0); + decode(input.array(), inputOffset, inputLength, output.array(), + outputOffset, outputLength); } else { throw new IllegalArgumentException("Arguments must have backing arrays"); } } /** - * Decodes the input char sequence into the output byte sequence. Before - * calling this method, ensure that the output ByteBuffer has sufficient - * capacity by calling {@link #getDecodedLength(java.nio.CharBuffer)}. + * Decodes the input char sequence into the output byte sequence. Before + * calling this method, ensure that the output array has sufficient capacity + * by calling {@link #getDecodedLength(char[], int, int)}. * - * @param input The char sequence to decode - * @param output Where the byte sequence decoding result will go. The limit - * is set to one past the position of the final char. - * @throws IllegalArgumentException If either the input or the output buffer - * is not backed by an array + * @param inputArray char sequence to be decoded + * @param inputOffset initial offset into inputArray + * @param inputLength number of chars in inputArray + * @param outputArray byte sequence to store encoded result + * @param outputOffset initial offset into outputArray + * @param outputLength length of output, must be + * getDecodedLength(inputArray, inputOffset, inputLength) */ - public static void decode(CharBuffer input, ByteBuffer output) { - if (input.hasArray() && output.hasArray()) { - int numInputChars = input.limit() - input.arrayOffset() - 1; - int numOutputBytes = getDecodedLength(input); - output.limit(numOutputBytes + output.arrayOffset()); // Set output final pos + 1 - output.position(0); - byte[] outputArray = output.array(); - char[] inputArray = input.array(); - if (numOutputBytes > 0) { - int caseNum = 0; - int outputByteNum = output.arrayOffset(); - int inputCharNum = input.arrayOffset(); - short inputChar; - CodingCase codingCase; - for ( ; inputCharNum < numInputChars - 1 ; ++inputCharNum) { - codingCase = CODING_CASES[caseNum]; - inputChar = (short)inputArray[inputCharNum]; - if (2 == codingCase.numBytes) { - if (0 == caseNum) { - outputArray[outputByteNum] = (byte)(inputChar >>> codingCase.initialShift); - } else { - outputArray[outputByteNum] += (byte)(inputChar >>> codingCase.initialShift); - } - outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.finalMask) - << codingCase.finalShift); - } else { // numBytes is 3 - outputArray[outputByteNum] += (byte)(inputChar >>> codingCase.initialShift); - outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.middleMask) - >>> codingCase.middleShift); - outputArray[outputByteNum + 2] = (byte)((inputChar & codingCase.finalMask) - << codingCase.finalShift); - } - outputByteNum += codingCase.advanceBytes; - if (++caseNum == CODING_CASES.length) { - caseNum = 0; - } - } - // Handle final char - inputChar = (short)inputArray[inputCharNum]; + public static void decode(char[] inputArray, int inputOffset, + int inputLength, byte[] outputArray, int outputOffset, int outputLength) { + assert (outputLength == getDecodedLength(inputArray, inputOffset, + inputLength)); + final int numInputChars = inputLength - 1; + final int numOutputBytes = outputLength; + + if (numOutputBytes > 0) { + int caseNum = 0; + int outputByteNum = outputOffset; + int inputCharNum = inputOffset; + short inputChar; + CodingCase codingCase; + for (; inputCharNum < numInputChars - 1; ++inputCharNum) { codingCase = CODING_CASES[caseNum]; - if (0 == caseNum) { - outputArray[outputByteNum] = 0; + inputChar = (short) inputArray[inputCharNum]; + if (2 == codingCase.numBytes) { + if (0 == caseNum) { + outputArray[outputByteNum] = (byte) (inputChar >>> codingCase.initialShift); + } else { + outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift); + } + outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift); + } else { // numBytes is 3 + outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift); + outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift); + outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift); } - outputArray[outputByteNum] += (byte)(inputChar >>> codingCase.initialShift); - int bytesLeft = numOutputBytes - outputByteNum; - if (bytesLeft > 1) { - if (2 == codingCase.numBytes) { - outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.finalMask) - >>> codingCase.finalShift); - } else { // numBytes is 3 - outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.middleMask) - >>> codingCase.middleShift); - if (bytesLeft > 2) { - outputArray[outputByteNum + 2] = (byte)((inputChar & codingCase.finalMask) - << codingCase.finalShift); - } + outputByteNum += codingCase.advanceBytes; + if (++caseNum == CODING_CASES.length) { + caseNum = 0; + } + } + // Handle final char + inputChar = (short) inputArray[inputCharNum]; + codingCase = CODING_CASES[caseNum]; + if (0 == caseNum) { + outputArray[outputByteNum] = 0; + } + outputArray[outputByteNum] += (byte) (inputChar >>> codingCase.initialShift); + final int bytesLeft = numOutputBytes - outputByteNum; + if (bytesLeft > 1) { + if (2 == codingCase.numBytes) { + outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) >>> codingCase.finalShift); + } else { // numBytes is 3 + outputArray[outputByteNum + 1] = (byte) ((inputChar & codingCase.middleMask) >>> codingCase.middleShift); + if (bytesLeft > 2) { + outputArray[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift); } } } - } else { - throw new IllegalArgumentException("Arguments must have backing arrays"); } } /** * Decodes the given char sequence, which must have been encoded by - * {@link #encode(java.nio.ByteBuffer)} or + * {@link #encode(java.nio.ByteBuffer)} or * {@link #encode(java.nio.ByteBuffer, java.nio.CharBuffer)}. * * @param input The char sequence to decode - * @return A byte sequence containing the decoding result. The limit - * is set to one past the position of the final char. + * @return A byte sequence containing the decoding result. The limit is set to + * one past the position of the final char. * @throws IllegalArgumentException If the input buffer is not backed by an - * array + * array + * @deprecated Use {@link #decode(char[], int, int, byte[], int, int)} + * instead. This method will be removed in Lucene 4.0 */ + @Deprecated public static ByteBuffer decode(CharBuffer input) { byte[] outputArray = new byte[getDecodedLength(input)]; ByteBuffer output = ByteBuffer.wrap(outputArray); @@ -277,11 +353,14 @@ public class IndexableBinaryStringTools { * Encodes the input byte sequence. * * @param input The byte sequence to encode - * @return A char sequence containing the encoding result. The limit is set - * to one past the position of the final char. + * @return A char sequence containing the encoding result. The limit is set to + * one past the position of the final char. * @throws IllegalArgumentException If the input buffer is not backed by an - * array + * array + * @deprecated Use {@link #encode(byte[], int, int, char[], int, int)} + * instead. This method will be removed in Lucene 4.0 */ + @Deprecated public static CharBuffer encode(ByteBuffer input) { char[] outputArray = new char[getEncodedLength(input)]; CharBuffer output = CharBuffer.wrap(outputArray); diff --git a/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java b/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java index 36fdf9b2dd0..e1eaed2e3ff 100644 --- a/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java +++ b/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java @@ -25,7 +25,9 @@ public class TestIndexableBinaryStringTools extends LuceneTestCase { private static final int NUM_RANDOM_TESTS = 2000; private static final int MAX_RANDOM_BINARY_LENGTH = 300; - public void testSingleBinaryRoundTrip() { + /** @deprecated remove this test for Lucene 4.0 */ + @Deprecated + public void testSingleBinaryRoundTripNIO() { byte[] binary = new byte[] { (byte)0x23, (byte)0x98, (byte)0x13, (byte)0xE4, (byte)0x76, (byte)0x41, (byte)0xB2, (byte)0xC9, (byte)0x7F, (byte)0x0A, (byte)0xA6, (byte)0xD8 }; @@ -35,15 +37,44 @@ public class TestIndexableBinaryStringTools extends LuceneTestCase { ByteBuffer decoded = IndexableBinaryStringTools.decode(encoded); assertEquals("Round trip decode/decode returned different results:" + System.getProperty("line.separator") - + "original: " + binaryDump(binaryBuf) + + "original: " + binaryDumpNIO(binaryBuf) + System.getProperty("line.separator") - + " encoded: " + charArrayDump(encoded) + + " encoded: " + charArrayDumpNIO(encoded) + System.getProperty("line.separator") - + " decoded: " + binaryDump(decoded), + + " decoded: " + binaryDumpNIO(decoded), binaryBuf, decoded); } - public void testEncodedSortability() { + public void testSingleBinaryRoundTrip() { + byte[] binary = new byte[] { (byte) 0x23, (byte) 0x98, (byte) 0x13, + (byte) 0xE4, (byte) 0x76, (byte) 0x41, (byte) 0xB2, (byte) 0xC9, + (byte) 0x7F, (byte) 0x0A, (byte) 0xA6, (byte) 0xD8 }; + + int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0, + binary.length); + char encoded[] = new char[encodedLen]; + IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0, + encoded.length); + + int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0, + encoded.length); + byte decoded[] = new byte[decodedLen]; + IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0, + decoded.length); + + assertEquals("Round trip decode/decode returned different results:" + + System.getProperty("line.separator") + "original: " + + binaryDump(binary, binary.length) + + System.getProperty("line.separator") + " encoded: " + + charArrayDump(encoded, encoded.length) + + System.getProperty("line.separator") + " decoded: " + + binaryDump(decoded, decoded.length), + binaryDump(binary, binary.length), binaryDump(decoded, decoded.length)); + } + + /** @deprecated remove this test for Lucene 4.0 */ + @Deprecated + public void testEncodedSortabilityNIO() { Random random = newRandom(); byte[] originalArray1 = new byte[MAX_RANDOM_BINARY_LENGTH]; ByteBuffer originalBuf1 = ByteBuffer.wrap(originalArray1); @@ -88,19 +119,85 @@ public class TestIndexableBinaryStringTools extends LuceneTestCase { assertEquals("Test #" + (testNum + 1) + ": Original bytes and encoded chars compare differently:" + System.getProperty("line.separator") - + " binary 1: " + binaryDump(originalBuf1) + + " binary 1: " + binaryDumpNIO(originalBuf1) + System.getProperty("line.separator") - + " binary 2: " + binaryDump(originalBuf2) + + " binary 2: " + binaryDumpNIO(originalBuf2) + System.getProperty("line.separator") - + "encoded 1: " + charArrayDump(encodedBuf1) + + "encoded 1: " + charArrayDumpNIO(encodedBuf1) + System.getProperty("line.separator") - + "encoded 2: " + charArrayDump(encodedBuf2) + + "encoded 2: " + charArrayDumpNIO(encodedBuf2) + System.getProperty("line.separator"), originalComparison, encodedComparison); } } - - public void testEmptyInput() { + + public void testEncodedSortability() { + Random random = newRandom(); + byte[] originalArray1 = new byte[MAX_RANDOM_BINARY_LENGTH]; + char[] originalString1 = new char[MAX_RANDOM_BINARY_LENGTH]; + char[] encoded1 = new char[MAX_RANDOM_BINARY_LENGTH * 10]; + byte[] original2 = new byte[MAX_RANDOM_BINARY_LENGTH]; + char[] originalString2 = new char[MAX_RANDOM_BINARY_LENGTH]; + char[] encoded2 = new char[MAX_RANDOM_BINARY_LENGTH * 10]; + + for (int testNum = 0; testNum < NUM_RANDOM_TESTS; ++testNum) { + int numBytes1 = random.nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1 + + for (int byteNum = 0; byteNum < numBytes1; ++byteNum) { + int randomInt = random.nextInt(0x100); + originalArray1[byteNum] = (byte) randomInt; + originalString1[byteNum] = (char) randomInt; + } + + int numBytes2 = random.nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1 + + for (int byteNum = 0; byteNum < numBytes2; ++byteNum) { + int randomInt = random.nextInt(0x100); + original2[byteNum] = (byte) randomInt; + originalString2[byteNum] = (char) randomInt; + } + int originalComparison = new String(originalString1, 0, numBytes1) + .compareTo(new String(originalString2, 0, numBytes2)); + originalComparison = originalComparison < 0 ? -1 + : originalComparison > 0 ? 1 : 0; + + int encodedLen1 = IndexableBinaryStringTools.getEncodedLength( + originalArray1, 0, numBytes1); + if (encodedLen1 > encoded1.length) + encoded1 = new char[ArrayUtil.getNextSize(encodedLen1)]; + IndexableBinaryStringTools.encode(originalArray1, 0, numBytes1, encoded1, + 0, encodedLen1); + + int encodedLen2 = IndexableBinaryStringTools.getEncodedLength(original2, + 0, numBytes2); + if (encodedLen2 > encoded2.length) + encoded2 = new char[ArrayUtil.getNextSize(encodedLen2)]; + IndexableBinaryStringTools.encode(original2, 0, numBytes2, encoded2, 0, + encodedLen2); + + int encodedComparison = new String(encoded1, 0, encodedLen1) + .compareTo(new String(encoded2, 0, encodedLen2)); + encodedComparison = encodedComparison < 0 ? -1 + : encodedComparison > 0 ? 1 : 0; + + assertEquals("Test #" + (testNum + 1) + + ": Original bytes and encoded chars compare differently:" + + System.getProperty("line.separator") + " binary 1: " + + binaryDump(originalArray1, numBytes1) + + System.getProperty("line.separator") + " binary 2: " + + binaryDump(original2, numBytes2) + + System.getProperty("line.separator") + "encoded 1: " + + charArrayDump(encoded1, encodedLen1) + + System.getProperty("line.separator") + "encoded 2: " + + charArrayDump(encoded2, encodedLen2) + + System.getProperty("line.separator"), originalComparison, + encodedComparison); + } + } + + /** @deprecated remove this test for Lucene 4.0 */ + @Deprecated + public void testEmptyInputNIO() { byte[] binary = new byte[0]; CharBuffer encoded = IndexableBinaryStringTools.encode(ByteBuffer.wrap(binary)); ByteBuffer decoded = IndexableBinaryStringTools.decode(encoded); @@ -108,7 +205,27 @@ public class TestIndexableBinaryStringTools extends LuceneTestCase { assertEquals("decoded empty input was not empty", decoded.limit(), 0); } - public void testAllNullInput() { + public void testEmptyInput() { + byte[] binary = new byte[0]; + + int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0, + binary.length); + char[] encoded = new char[encodedLen]; + IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0, + encoded.length); + + int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0, + encoded.length); + byte[] decoded = new byte[decodedLen]; + IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0, + decoded.length); + + assertEquals("decoded empty input was not empty", decoded.length, 0); + } + + /** @deprecated remove this test for Lucene 4.0 */ + @Deprecated + public void testAllNullInputNIO() { byte[] binary = new byte[] { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; ByteBuffer binaryBuf = ByteBuffer.wrap(binary); CharBuffer encoded = IndexableBinaryStringTools.encode(binaryBuf); @@ -117,13 +234,38 @@ public class TestIndexableBinaryStringTools extends LuceneTestCase { assertNotNull("decode() returned null", decodedBuf); assertEquals("Round trip decode/decode returned different results:" + System.getProperty("line.separator") - + " original: " + binaryDump(binaryBuf) + + " original: " + binaryDumpNIO(binaryBuf) + System.getProperty("line.separator") - + "decodedBuf: " + binaryDump(decodedBuf), + + "decodedBuf: " + binaryDumpNIO(decodedBuf), binaryBuf, decodedBuf); } - public void testRandomBinaryRoundTrip() { + public void testAllNullInput() { + byte[] binary = new byte[] { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0, + binary.length); + char encoded[] = new char[encodedLen]; + IndexableBinaryStringTools.encode(binary, 0, binary.length, encoded, 0, + encoded.length); + + int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0, + encoded.length); + byte[] decoded = new byte[decodedLen]; + IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0, + decoded.length); + + assertEquals("Round trip decode/decode returned different results:" + + System.getProperty("line.separator") + " original: " + + binaryDump(binary, binary.length) + + System.getProperty("line.separator") + "decodedBuf: " + + binaryDump(decoded, decoded.length), + binaryDump(binary, binary.length), binaryDump(decoded, decoded.length)); + } + + /** @deprecated remove this test for Lucene 4.0 */ + @Deprecated + public void testRandomBinaryRoundTripNIO() { Random random = newRandom(); byte[] binary = new byte[MAX_RANDOM_BINARY_LENGTH]; ByteBuffer binaryBuf = ByteBuffer.wrap(binary); @@ -142,19 +284,59 @@ public class TestIndexableBinaryStringTools extends LuceneTestCase { assertEquals("Test #" + (testNum + 1) + ": Round trip decode/decode returned different results:" + System.getProperty("line.separator") - + " original: " + binaryDump(binaryBuf) + + " original: " + binaryDumpNIO(binaryBuf) + System.getProperty("line.separator") - + "encodedBuf: " + charArrayDump(encodedBuf) + + "encodedBuf: " + charArrayDumpNIO(encodedBuf) + System.getProperty("line.separator") - + "decodedBuf: " + binaryDump(decodedBuf), + + "decodedBuf: " + binaryDumpNIO(decodedBuf), binaryBuf, decodedBuf); } } + + public void testRandomBinaryRoundTrip() { + Random random = newRandom(); + byte[] binary = new byte[MAX_RANDOM_BINARY_LENGTH]; + char[] encoded = new char[MAX_RANDOM_BINARY_LENGTH * 10]; + byte[] decoded = new byte[MAX_RANDOM_BINARY_LENGTH]; + for (int testNum = 0; testNum < NUM_RANDOM_TESTS; ++testNum) { + int numBytes = random.nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1 + + for (int byteNum = 0; byteNum < numBytes; ++byteNum) { + binary[byteNum] = (byte) random.nextInt(0x100); + } + + int encodedLen = IndexableBinaryStringTools.getEncodedLength(binary, 0, + numBytes); + if (encoded.length < encodedLen) + encoded = new char[ArrayUtil.getNextSize(encodedLen)]; + IndexableBinaryStringTools.encode(binary, 0, numBytes, encoded, 0, + encodedLen); + + int decodedLen = IndexableBinaryStringTools.getDecodedLength(encoded, 0, + encodedLen); + IndexableBinaryStringTools.decode(encoded, 0, encodedLen, decoded, 0, + decodedLen); + + assertEquals("Test #" + (testNum + 1) + + ": Round trip decode/decode returned different results:" + + System.getProperty("line.separator") + " original: " + + binaryDump(binary, numBytes) + System.getProperty("line.separator") + + "encodedBuf: " + charArrayDump(encoded, encodedLen) + + System.getProperty("line.separator") + "decodedBuf: " + + binaryDump(decoded, decodedLen), binaryDump(binary, numBytes), + binaryDump(decoded, decodedLen)); + } + } - public String binaryDump(ByteBuffer binaryBuf) { + /** @deprecated remove this method for Lucene 4.0 */ + @Deprecated + public String binaryDumpNIO(ByteBuffer binaryBuf) { + return binaryDump(binaryBuf.array(), + binaryBuf.limit() - binaryBuf.arrayOffset()); + } + + public String binaryDump(byte[] binary, int numBytes) { StringBuilder buf = new StringBuilder(); - int numBytes = binaryBuf.limit() - binaryBuf.arrayOffset(); - byte[] binary = binaryBuf.array(); for (int byteNum = 0 ; byteNum < numBytes ; ++byteNum) { String hex = Integer.toHexString((int)binary[byteNum] & 0xFF); if (hex.length() == 1) { @@ -167,11 +349,15 @@ public class TestIndexableBinaryStringTools extends LuceneTestCase { } return buf.toString(); } - - public String charArrayDump(CharBuffer charBuf) { + /** @deprecated remove this method for Lucene 4.0 */ + @Deprecated + public String charArrayDumpNIO(CharBuffer charBuf) { + return charArrayDump(charBuf.array(), + charBuf.limit() - charBuf.arrayOffset()); + } + + public String charArrayDump(char[] charArray, int numBytes) { StringBuilder buf = new StringBuilder(); - int numBytes = charBuf.limit() - charBuf.arrayOffset(); - char[] charArray = charBuf.array(); for (int charNum = 0 ; charNum < numBytes ; ++charNum) { String hex = Integer.toHexString((int)charArray[charNum]); for (int digit = 0 ; digit < 4 - hex.length() ; ++digit) {