SOLR-1900: use bytes instead of strings for bigTerm to find end prefixes

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@997108 13f79535-47bb-0310-9956-ffa450edef68
2010-09-14 21:44:49 +00:00 · 2010-09-14 21:44:49 +00:00 · 2d9eb62343
parent 0e9e44f10d
commit 2d9eb62343
6 changed files with 36 additions and 19 deletions
--- a/lucene/src/java/org/apache/lucene/util/BytesRef.java
+++ b/lucene/src/java/org/apache/lucene/util/BytesRef.java
@ -232,6 +232,18 @@ public final class BytesRef implements Comparable<BytesRef>, Externalizable {
    offset = 0;
  }

+  public void append(BytesRef other) {
+    int newLen = length + other.length;
+    if (bytes.length < newLen) {
+      byte[] newBytes = new byte[newLen];
+      System.arraycopy(bytes, offset, newBytes, 0, length);
+      offset = 0;
+      bytes = newBytes;
+    }
+    System.arraycopy(other.bytes, other.offset, bytes, length+offset, other.length);
+    length = newLen;
+  }
+
  public void grow(int newLength) {
    bytes = ArrayUtil.grow(bytes, newLength);
  }
--- a/solr/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java
+++ b/solr/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java
@ -19,6 +19,7 @@ import org.apache.solr.search.DocSet;
 import org.apache.solr.search.SolrIndexReader;
 import org.apache.solr.search.SolrIndexSearcher;
 import org.apache.solr.util.BoundedTreeSet;
+import org.apache.solr.util.ByteUtils;

 import java.io.IOException;
 import java.util.*;
@ -231,12 +232,12 @@ class PerSegmentSingleValuedFaceting {
      // SolrCore.log.info("reader= " + reader + "  FC=" + System.identityHashCode(si));

      if (prefix!=null) {
-        startTermIndex = si.binarySearchLookup(new BytesRef(prefix), tempBR);
+        BytesRef prefixRef = new BytesRef(prefix);
+        startTermIndex = si.binarySearchLookup(prefixRef, tempBR);
        if (startTermIndex<0) startTermIndex=-startTermIndex-1;
-        // find the end term.  \uffff isn't a legal unicode char, but only compareTo
-        // is used, so it should be fine, and is guaranteed to be bigger than legal chars.
-        // TODO: switch to binarySearch version that takes start/end in Java6
-        endTermIndex = si.binarySearchLookup(new BytesRef(prefix+"\uffff\uffff\uffff\uffff"), tempBR);
+        prefixRef.append(ByteUtils.bigTerm);
+        // TODO: we could constrain the lower endpoint if we had a binarySearch method that allowed passing start/end
+        endTermIndex = si.binarySearchLookup(prefixRef, tempBR);
        assert endTermIndex < 0;
        endTermIndex = -endTermIndex-1;
      } else {
@ -408,4 +409,4 @@ class IndexSortedFacetCollector extends FacetCollector {
  public NamedList getFacetCounts() {
    return res;
  }
-}
+}
--- a/solr/src/java/org/apache/solr/request/SimpleFacets.java
+++ b/solr/src/java/org/apache/solr/request/SimpleFacets.java
@ -432,9 +432,8 @@ public class SimpleFacets {
    if (prefix!=null) {
      startTermIndex = si.binarySearchLookup(prefixRef, br);
      if (startTermIndex<0) startTermIndex=-startTermIndex-1;
-      // find the end term.  \uffff isn't a legal unicode char, but only compareTo
-      // is used, so it should be fine, and is guaranteed to be bigger than legal chars.
-      endTermIndex = si.binarySearchLookup(new BytesRef(prefix+"\uffff\uffff\uffff\uffff"), br);
+      prefixRef.append(ByteUtils.bigTerm);
+      endTermIndex = si.binarySearchLookup(prefixRef, br);
      assert endTermIndex < 0;
      endTermIndex = -endTermIndex-1;
    } else {
--- a/solr/src/java/org/apache/solr/request/UnInvertedField.java
+++ b/solr/src/java/org/apache/solr/request/UnInvertedField.java
@ -37,6 +37,7 @@ import org.apache.solr.core.SolrCore;
 import org.apache.solr.schema.FieldType;
 import org.apache.solr.schema.TrieField;
 import org.apache.solr.search.*;
+import org.apache.solr.util.ByteUtils;
 import org.apache.solr.util.LongPriorityQueue;
 import org.apache.solr.util.PrimUtils;
 import org.apache.solr.util.BoundedTreeSet;
@ -483,9 +484,11 @@ public class UnInvertedField {

      NumberedTermsEnum te = ti.getEnumerator(searcher.getReader());
      if (prefix != null && prefix.length() > 0) {
-        te.skipTo(new BytesRef(prefix));
+        BytesRef prefixBr = new BytesRef(prefix);
+        te.skipTo(prefixBr);
        startTerm = te.getTermNumber();
-        te.skipTo(new BytesRef(prefix + "\uffff\uffff\uffff\uffff"));
+        prefixBr.append(ByteUtils.bigTerm);
+        te.skipTo(prefixBr);
        endTerm = te.getTermNumber();
      }

--- a/solr/src/java/org/apache/solr/search/MissingStringLastComparatorSource.java
+++ b/solr/src/java/org/apache/solr/search/MissingStringLastComparatorSource.java
@ -24,21 +24,18 @@ import org.apache.lucene.util.packed.Direct16;
 import org.apache.lucene.util.packed.Direct32;
 import org.apache.lucene.util.packed.Direct8;
 import org.apache.lucene.util.packed.PackedInts;
+import org.apache.solr.util.ByteUtils;

 import java.io.IOException;


 public class MissingStringLastComparatorSource extends FieldComparatorSource {
-  /** A binary term consisting of a number of 0xff bytes, likely to be bigger than other terms
-   *  one would normally encounter, and definitely bigger than any UTF-8 terms */
-  public static final BytesRef bigTerm = new BytesRef(
-      new byte[] {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}
-  );
+

  private final BytesRef missingValueProxy;

  public MissingStringLastComparatorSource() {
-    this(bigTerm);
+    this(ByteUtils.bigTerm);
  }

  /** Creates a {@link FieldComparatorSource} that sorts null last in a normal ascending sort.
@ -428,4 +425,4 @@ public class MissingStringLastComparatorSource extends FieldComparatorSource {
    public Comparable<?> value(int slot) {
      return values==null ? NULL_VAL : values[slot];
    }
-  }
+  }
--- a/solr/src/java/org/apache/solr/util/ByteUtils.java
+++ b/solr/src/java/org/apache/solr/util/ByteUtils.java
@ -22,7 +22,12 @@ import org.apache.noggit.CharArr;


 public class ByteUtils {
-
+ /** A binary term consisting of a number of 0xff bytes, likely to be bigger than other terms
+   *  one would normally encounter, and definitely bigger than any UTF-8 terms */
+  public static final BytesRef bigTerm = new BytesRef(
+      new byte[] {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}
+  );
+  
  /** Converts utf8 to utf16 and returns the number of 16 bit Java chars written.
   * Full characters are read, even if this reads past the length passed (and can result in
   * an ArrayOutOfBoundsException if invalid UTF8 is passed).  Explicit checks for valid UTF8 are not performed.