LUCENE-1434: add IndexableBinaryStringTools

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@755742 13f79535-47bb-0310-9956-ffa450edef68
2009-03-18 21:35:51 +00:00 · 2009-03-18 21:35:51 +00:00 · 5d05762169
parent 96863198a5
commit 5d05762169
3 changed files with 507 additions and 0 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -178,6 +178,11 @@ New features
    resources with the original reader.  (Torin Danil via Mike
    McCandless)

+17. LUCENE-1434: Added org.apache.lucene.util.IndexableBinaryStringTools,
+    to encode byte[] as String values that are valid terms, and
+    maintain sort order of the original byte[] when the bytes are
+    interpreted as unsigned.  (Steven Rowe via Mike McCandless)
+
 Optimizations

 1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing
--- a/src/java/org/apache/lucene/util/IndexableBinaryStringTools.java
+++ b/src/java/org/apache/lucene/util/IndexableBinaryStringTools.java
@ -0,0 +1,315 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.nio.CharBuffer;
+import java.nio.ByteBuffer;
+
+/**
+ * Provides support for converting byte sequences to Strings and back again.
+ * The resulting Strings preserve the original byte sequences' sort order.
+ * 
+ * The Strings are constructed using a Base 8000h encoding of the original
+ * binary data - each char of an encoded String represents a 15-bit chunk
+ * from the byte sequence.  Base 8000h was chosen because it allows for all
+ * lower 15 bits of char to be used without restriction; the surrogate range 
+ * [U+D8000-U+DFFF] does not represent valid chars, and would require
+ * complicated handling to avoid them and allow use of char's high bit.
+ * 
+ * Although unset bits are used as padding in the final char, the original
+ * byte sequence could contain trailing bytes with no set bits (null bytes):
+ * padding is indistinguishable from valid information.  To overcome this
+ * problem, a char is appended, indicating the number of encoded bytes in the
+ * final content char.
+ * 
+ * This class's operations are defined over CharBuffers and ByteBuffers, to
+ * allow for wrapped arrays to be reused, reducing memory allocation costs for
+ * repeated operations.  Note that this class calls array() and arrayOffset()
+ * on the CharBuffers and ByteBuffers it uses, so only wrapped arrays may be
+ * used.  This class interprets the arrayOffset() and limit() values returned by
+ * its input buffers as beginning and end+1 positions on the wrapped array,
+ * resprectively; similarly, on the output buffer, arrayOffset() is the first
+ * position written to, and limit() is set to one past the final output array
+ * position.
+ */
+public class IndexableBinaryStringTools {
+
+  private static final CodingCase[] CODING_CASES = {
+    // CodingCase(int initialShift, int finalShift)
+    new CodingCase( 7, 1   ),
+    // CodingCase(int initialShift, int middleShift, int finalShift)
+    new CodingCase(14, 6, 2),
+    new CodingCase(13, 5, 3),
+    new CodingCase(12, 4, 4),
+    new CodingCase(11, 3, 5),
+    new CodingCase(10, 2, 6),
+    new CodingCase( 9, 1, 7),
+    new CodingCase( 8, 0   )
+  };
+
+  // Export only static methods
+  private IndexableBinaryStringTools() {}
+
+  /**
+   * Returns the number of chars required to encode the given byte sequence.
+   * 
+   * @param original The byte sequence to be encoded.  Must be backed by an array.
+   * @return The number of chars required to encode the given byte sequence
+   * @throws IllegalArgumentException If the given ByteBuffer is not backed by an array
+   */
+  public static int getEncodedLength(ByteBuffer original) 
+    throws IllegalArgumentException {
+    if (original.hasArray()) {
+      // Use long for intermediaries to protect against overflow
+      long length = (long)(original.limit() - original.arrayOffset());
+      return (int)((length * 8L + 14L) / 15L) + 1;
+    } else {
+      throw new IllegalArgumentException("original argument must have a backing array");
+    }
+  }
+
+  /**
+   * Returns the number of bytes required to decode the given char sequence.
+   * 
+   * @param encoded The char sequence to be encoded.  Must be backed by an array.
+   * @return The number of bytes required to decode the given char sequence
+   * @throws IllegalArgumentException If the given CharBuffer is not backed by an array
+   */
+  public static int getDecodedLength(CharBuffer encoded) 
+    throws IllegalArgumentException {
+    if (encoded.hasArray()) {
+      int numChars = encoded.limit() - encoded.arrayOffset() - 1;
+      if (numChars <= 0) {
+        return 0;
+      } else {
+        int numFullBytesInFinalChar = encoded.charAt(encoded.limit() - 1);
+        int numEncodedChars = numChars - 1;
+        return (numEncodedChars * 15 + 7) / 8 + numFullBytesInFinalChar;
+      }
+    } else {
+      throw new IllegalArgumentException("encoded argument must have a backing array");
+    }
+  }
+
+  /**
+   * Encodes the input byte sequence into the output char sequence.  Before
+   * calling this method, ensure that the output CharBuffer has sufficient
+   * capacity by calling {@link #getEncodedLength(java.nio.ByteBuffer)}.
+   * 
+   * @param input The byte sequence to encode
+   * @param output Where the char sequence encoding result will go.  The limit
+   *  is set to one past the position of the final char.
+   * @throws IllegalArgumentException If either the input or the output buffer
+   *  is not backed by an array
+   */
+  public static void encode(ByteBuffer input, CharBuffer output) {
+    if (input.hasArray() && output.hasArray()) {
+      byte[] inputArray = input.array();
+      int inputOffset = input.arrayOffset();
+      int inputLength = input.limit() - inputOffset; 
+      char[] outputArray = output.array();
+      int outputOffset = output.arrayOffset();
+      int outputLength = getEncodedLength(input);
+      output.limit(outputOffset + outputLength); // Set output final pos + 1
+      output.position(0);
+      if (inputLength > 0) {
+        int inputByteNum = inputOffset;
+        int caseNum = 0;
+        int outputCharNum = outputOffset;
+        CodingCase codingCase;
+        for ( ; inputByteNum + CODING_CASES[caseNum].numBytes <= inputLength ;
+              ++outputCharNum                                                 ) {
+          codingCase = CODING_CASES[caseNum];
+          if (2 == codingCase.numBytes) {
+            outputArray[outputCharNum]
+              = (char)(((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
+                       + (((inputArray[inputByteNum + 1] & 0xFF) >>> codingCase.finalShift)
+                          & codingCase.finalMask)
+                       & (short)0x7FFF);
+          } else { // numBytes is 3
+            outputArray[outputCharNum] 
+              = (char)(((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
+                       + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)
+                       + (((inputArray[inputByteNum + 2] & 0xFF) >>> codingCase.finalShift) 
+                          & codingCase.finalMask)
+                       & (short)0x7FFF);          
+          }
+          inputByteNum += codingCase.advanceBytes;          
+          if (++caseNum == CODING_CASES.length) {
+            caseNum = 0;
+          }
+        }
+        // Produce final char (if any) and trailing count chars.
+        codingCase = CODING_CASES[caseNum];
+        
+        if (inputByteNum + 1 < inputLength) { // codingCase.numBytes must be 3
+          outputArray[outputCharNum++] 
+            = (char)((((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
+                      + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift))
+                     & (short)0x7FFF);
+          // Add trailing char containing the number of full bytes in final char
+          outputArray[outputCharNum++] = (char)1;
+        } else if (inputByteNum < inputLength) {
+          outputArray[outputCharNum++] 
+            = (char)(((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift)
+                     & (short)0x7FFF);
+          // Add trailing char containing the number of full bytes in final char
+          outputArray[outputCharNum++] = caseNum == 0 ? (char)1 : (char)0;
+        } else { // No left over bits - last char is completely filled.
+          // Add trailing char containing the number of full bytes in final char
+          outputArray[outputCharNum++] = (char)1;
+        }
+      }
+    } else {
+      throw new IllegalArgumentException("Arguments must have backing arrays");
+    }
+  }
+
+  /**
+   * Decodes the input char sequence into the output byte sequence.  Before
+   * calling this method, ensure that the output ByteBuffer has sufficient
+   * capacity by calling {@link #getDecodedLength(java.nio.CharBuffer)}.
+   * 
+   * @param input The char sequence to decode
+   * @param output Where the byte sequence decoding result will go.  The limit
+   *  is set to one past the position of the final char.
+   * @throws IllegalArgumentException If either the input or the output buffer
+   *  is not backed by an array
+   */
+  public static void decode(CharBuffer input, ByteBuffer output) {
+    if (input.hasArray() && output.hasArray()) {
+      int numInputChars = input.limit() - input.arrayOffset() - 1;
+      int numOutputBytes = getDecodedLength(input);
+      output.limit(numOutputBytes + output.arrayOffset()); // Set output final pos + 1
+      output.position(0);
+      byte[] outputArray = output.array();
+      char[] inputArray = input.array();
+      if (numOutputBytes > 0) {
+        int caseNum = 0;
+        int outputByteNum = output.arrayOffset();
+        int inputCharNum = input.arrayOffset();
+        short inputChar;
+        CodingCase codingCase;
+        for ( ; inputCharNum < numInputChars - 1 ; ++inputCharNum) {
+          codingCase = CODING_CASES[caseNum];
+          inputChar = (short)inputArray[inputCharNum];
+          if (2 == codingCase.numBytes) {
+            if (0 == caseNum) {
+              outputArray[outputByteNum] = (byte)(inputChar >>> codingCase.initialShift);
+            } else {
+              outputArray[outputByteNum] += (byte)(inputChar >>> codingCase.initialShift);
+            }
+            outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.finalMask) 
+                                                    << codingCase.finalShift);
+          } else { // numBytes is 3
+            outputArray[outputByteNum] += (byte)(inputChar >>> codingCase.initialShift);
+            outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.middleMask)
+                                                    >>> codingCase.middleShift);
+            outputArray[outputByteNum + 2] = (byte)((inputChar & codingCase.finalMask) 
+                                                    << codingCase.finalShift);
+          }
+          outputByteNum += codingCase.advanceBytes;
+          if (++caseNum == CODING_CASES.length) {
+            caseNum = 0;
+          }
+        }
+        // Handle final char
+        inputChar = (short)inputArray[inputCharNum];
+        codingCase = CODING_CASES[caseNum];
+        if (0 == caseNum) {
+          outputArray[outputByteNum] = 0;
+        }
+        outputArray[outputByteNum] += (byte)(inputChar >>> codingCase.initialShift);
+        int bytesLeft = numOutputBytes - outputByteNum;
+        if (bytesLeft > 1) {
+          if (2 == codingCase.numBytes) {
+            outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.finalMask) 
+                                                    >>> codingCase.finalShift);
+          } else { // numBytes is 3
+            outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.middleMask)
+                                                    >>> codingCase.middleShift);
+            if (bytesLeft > 2) {
+              outputArray[outputByteNum + 2] = (byte)((inputChar & codingCase.finalMask) 
+                                                      << codingCase.finalShift);
+            }
+          }
+        }
+      }
+    } else {
+      throw new IllegalArgumentException("Arguments must have backing arrays");
+    }
+  }
+
+  /**
+   * Decodes the given char sequence, which must have been encoded by
+   * {@link #encode(java.nio.ByteBuffer)} or 
+   * {@link #encode(java.nio.ByteBuffer, java.nio.CharBuffer)}.
+   * 
+   * @param input The char sequence to decode
+   * @return A byte sequence containing the decoding result.  The limit
+   *  is set to one past the position of the final char.
+   * @throws IllegalArgumentException If the input buffer is not backed by an
+   *  array
+   */
+  public static ByteBuffer decode(CharBuffer input) {
+    byte[] outputArray = new byte[getDecodedLength(input)];
+    ByteBuffer output = ByteBuffer.wrap(outputArray);
+    decode(input, output);
+    return output;
+  }
+
+  /**
+   * Encodes the input byte sequence.
+   * 
+   * @param input The byte sequence to encode
+   * @return A char sequence containing the encoding result.  The limit is set
+   *  to one past the position of the final char.
+   * @throws IllegalArgumentException If the input buffer is not backed by an
+   *  array
+   */
+  public static CharBuffer encode(ByteBuffer input) {
+    char[] outputArray = new char[getEncodedLength(input)];
+    CharBuffer output = CharBuffer.wrap(outputArray);
+    encode(input, output);
+    return output;
+  }
+  
+  static class CodingCase {
+    int numBytes, initialShift, middleShift, finalShift, advanceBytes = 2;
+    short middleMask, finalMask;
+
+    CodingCase(int initialShift, int middleShift, int finalShift) {
+      this.numBytes = 3;
+      this.initialShift = initialShift;
+      this.middleShift = middleShift;
+      this.finalShift = finalShift;
+      this.finalMask = (short)((short)0xFF >>> finalShift);
+      this.middleMask = (short)((short)0xFF << middleShift);
+    }
+
+    CodingCase(int initialShift, int finalShift) {
+      this.numBytes = 2;
+      this.initialShift = initialShift;
+      this.finalShift = finalShift;
+      this.finalMask = (short)((short)0xFF >>> finalShift);
+      if (finalShift != 0) {
+        advanceBytes = 1; 
+      }
+    }
+  }
+}
--- a/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java
+++ b/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java
@ -0,0 +1,187 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Random;
+import java.nio.CharBuffer;
+import java.nio.ByteBuffer;
+
+public class TestIndexableBinaryStringTools extends LuceneTestCase {
+  private static final int NUM_RANDOM_TESTS = 20000;
+  private static final int MAX_RANDOM_BINARY_LENGTH = 300;
+  
+  public void testSingleBinaryRoundTrip() {
+    byte[] binary = new byte[] 
+      { (byte)0x23, (byte)0x98, (byte)0x13, (byte)0xE4, (byte)0x76, (byte)0x41,
+        (byte)0xB2, (byte)0xC9, (byte)0x7F, (byte)0x0A, (byte)0xA6, (byte)0xD8 };
+
+    ByteBuffer binaryBuf = ByteBuffer.wrap(binary);
+    CharBuffer encoded = IndexableBinaryStringTools.encode(binaryBuf);
+    ByteBuffer decoded = IndexableBinaryStringTools.decode(encoded);
+    assertEquals("Round trip decode/decode returned different results:"
+                 + System.getProperty("line.separator")
+                 + "original: " + binaryDump(binaryBuf)
+                 + System.getProperty("line.separator")
+                 + " encoded: " + charArrayDump(encoded)
+                 + System.getProperty("line.separator")
+                 + " decoded: " + binaryDump(decoded),
+                 binaryBuf, decoded);
+  }
+  
+  public void testEncodedSortability() {
+    Random random = newRandom();
+    byte[] originalArray1 = new byte[MAX_RANDOM_BINARY_LENGTH];
+    ByteBuffer originalBuf1 = ByteBuffer.wrap(originalArray1);
+    char[] originalString1 = new char[MAX_RANDOM_BINARY_LENGTH];
+    CharBuffer originalStringBuf1 = CharBuffer.wrap(originalString1);
+    char[] encoded1 = new char[IndexableBinaryStringTools.getEncodedLength(originalBuf1)];
+    CharBuffer encodedBuf1 = CharBuffer.wrap(encoded1);
+    byte[] original2 = new byte[MAX_RANDOM_BINARY_LENGTH];
+    ByteBuffer originalBuf2 = ByteBuffer.wrap(original2);
+    char[] originalString2 = new char[MAX_RANDOM_BINARY_LENGTH];
+    CharBuffer originalStringBuf2 = CharBuffer.wrap(originalString2);
+    char[] encoded2 = new char[IndexableBinaryStringTools.getEncodedLength(originalBuf2)];
+    CharBuffer encodedBuf2 = CharBuffer.wrap(encoded2);
+    for (int testNum = 0 ; testNum < NUM_RANDOM_TESTS ; ++testNum) {
+      int numBytes1 = random.nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1
+      originalBuf1.limit(numBytes1);
+      originalStringBuf1.limit(numBytes1);
+      
+      for (int byteNum = 0 ; byteNum < numBytes1 ; ++byteNum) {
+        int randomInt = random.nextInt(0x100);
+        originalArray1[byteNum] = (byte) randomInt;
+        originalString1[byteNum] = (char)randomInt;
+      }
+      
+      int numBytes2 = random.nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1; // Min == 1
+      originalBuf2.limit(numBytes2);
+      originalStringBuf2.limit(numBytes2);
+      for (int byteNum = 0 ; byteNum < numBytes2 ; ++byteNum) {
+        int randomInt = random.nextInt(0x100);
+        original2[byteNum] = (byte)randomInt;
+        originalString2[byteNum] = (char)randomInt;
+      }
+      int originalComparison = originalStringBuf1.compareTo(originalStringBuf2);
+      originalComparison = originalComparison < 0 ? -1 : originalComparison > 0 ? 1 : 0;
+      
+      IndexableBinaryStringTools.encode(originalBuf1, encodedBuf1);
+      IndexableBinaryStringTools.encode(originalBuf2, encodedBuf2);
+      
+      int encodedComparison = encodedBuf1.compareTo(encodedBuf2);
+      encodedComparison = encodedComparison < 0 ? -1 : encodedComparison > 0 ? 1 : 0;
+      
+      assertEquals("Test #" + (testNum + 1) 
+                   + ": Original bytes and encoded chars compare differently:"
+                   + System.getProperty("line.separator")
+                   + " binary 1: " + binaryDump(originalBuf1)
+                   + System.getProperty("line.separator")
+                   + " binary 2: " + binaryDump(originalBuf2)
+                   + System.getProperty("line.separator")
+                   + "encoded 1: " + charArrayDump(encodedBuf1)
+                   + System.getProperty("line.separator")
+                   + "encoded 2: " + charArrayDump(encodedBuf2)
+                   + System.getProperty("line.separator"),
+                   originalComparison, encodedComparison);
+    }
+  }
+  
+  public void testEmptyInput() {
+    byte[] binary = new byte[0];
+    CharBuffer encoded = IndexableBinaryStringTools.encode(ByteBuffer.wrap(binary));
+    ByteBuffer decoded = IndexableBinaryStringTools.decode(encoded);
+    assertNotNull("decode() returned null", decoded);
+    assertEquals("decoded empty input was not empty", decoded.limit(), 0);
+  }
+  
+  public void testAllNullInput() {
+    byte[] binary = new byte[] { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    ByteBuffer binaryBuf = ByteBuffer.wrap(binary);
+    CharBuffer encoded = IndexableBinaryStringTools.encode(binaryBuf);
+    assertNotNull("encode() returned null", encoded);
+    ByteBuffer decodedBuf = IndexableBinaryStringTools.decode(encoded);
+    assertNotNull("decode() returned null", decodedBuf);
+    assertEquals("Round trip decode/decode returned different results:"
+                 + System.getProperty("line.separator")
+                 + "  original: " + binaryDump(binaryBuf)
+                 + System.getProperty("line.separator")
+                 + "decodedBuf: " + binaryDump(decodedBuf),
+                 binaryBuf, decodedBuf);
+  }
+  
+  public void testRandomBinaryRoundTrip() {
+    Random random = newRandom();
+    byte[] binary = new byte[MAX_RANDOM_BINARY_LENGTH];
+    ByteBuffer binaryBuf = ByteBuffer.wrap(binary);
+    char[] encoded = new char[IndexableBinaryStringTools.getEncodedLength(binaryBuf)];
+    CharBuffer encodedBuf = CharBuffer.wrap(encoded);
+    byte[] decoded = new byte[MAX_RANDOM_BINARY_LENGTH];
+    ByteBuffer decodedBuf = ByteBuffer.wrap(decoded);
+    for (int testNum = 0 ; testNum < NUM_RANDOM_TESTS ; ++testNum) {
+      int numBytes = random.nextInt(MAX_RANDOM_BINARY_LENGTH - 1) + 1 ; // Min == 1
+      binaryBuf.limit(numBytes);
+      for (int byteNum = 0 ; byteNum < numBytes ; ++byteNum) {
+        binary[byteNum] = (byte)random.nextInt(0x100);
+      }
+      IndexableBinaryStringTools.encode(binaryBuf, encodedBuf);
+      IndexableBinaryStringTools.decode(encodedBuf, decodedBuf);
+      assertEquals("Test #" + (testNum + 1) 
+                   + ": Round trip decode/decode returned different results:"
+                   + System.getProperty("line.separator")
+                   + "  original: " + binaryDump(binaryBuf)
+                   + System.getProperty("line.separator")
+                   + "encodedBuf: " + charArrayDump(encodedBuf)
+                   + System.getProperty("line.separator")
+                   + "decodedBuf: " + binaryDump(decodedBuf),
+                   binaryBuf, decodedBuf);
+    }
+  }
+  
+  public String binaryDump(ByteBuffer binaryBuf) {
+    StringBuffer buf = new StringBuffer();
+    int numBytes = binaryBuf.limit() - binaryBuf.arrayOffset();
+    byte[] binary = binaryBuf.array();
+    for (int byteNum = 0 ; byteNum < numBytes ; ++byteNum) {
+      String hex = Integer.toHexString((int)binary[byteNum] & 0xFF);
+      if (hex.length() == 1) {
+        buf.append('0');
+      }
+      buf.append(hex.toUpperCase());
+      if (byteNum < numBytes - 1) {
+        buf.append(' ');
+      }
+    }
+    return buf.toString();
+  }
+
+  public String charArrayDump(CharBuffer charBuf) {
+    StringBuffer buf = new StringBuffer();
+    int numBytes = charBuf.limit() - charBuf.arrayOffset();
+    char[] charArray = charBuf.array();
+    for (int charNum = 0 ; charNum < numBytes ; ++charNum) {
+      String hex = Integer.toHexString((int)charArray[charNum]);
+      for (int digit = 0 ; digit < 4 - hex.length() ; ++digit) {
+        buf.append('0');
+      }
+      buf.append(hex.toUpperCase());
+      if (charNum < numBytes - 1) {
+        buf.append(' ');
+      }
+    }
+    return buf.toString();
+  }
+}