SOLR-1900: use bytes instead of strings for bigTerm to find end prefixes

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@997108 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2010-09-14 21:44:49 +00:00
parent 0e9e44f10d
commit 2d9eb62343
6 changed files with 36 additions and 19 deletions

View File

@ -232,6 +232,18 @@ public final class BytesRef implements Comparable<BytesRef>, Externalizable {
offset = 0;
}
/**
 * Appends the bytes referenced by {@code other} to the end of the bytes referenced
 * by this instance.
 *
 * <p>If the backing array does not have room for the combined content starting at
 * the current {@code offset}, a new array of exactly the combined length is
 * allocated, the existing content is moved to its start and {@code offset} is reset
 * to 0. Otherwise the bytes are appended in place.
 *
 * @param other the bytes to append; only read, never modified
 */
public void append(BytesRef other) {
  int newLen = length + other.length;
  // Usable room starts at offset, not at index 0. Comparing bytes.length alone would
  // skip the reallocation when offset > 0 and then write past the end of the array.
  if (bytes.length - offset < newLen) {
    byte[] newBytes = new byte[newLen];
    System.arraycopy(bytes, offset, newBytes, 0, length);
    offset = 0;
    bytes = newBytes;
  }
  System.arraycopy(other.bytes, other.offset, bytes, length+offset, other.length);
  length = newLen;
}
/**
 * Replaces the backing array with whatever {@code ArrayUtil.grow(bytes, newLength)}
 * returns. {@code offset} and {@code length} are left untouched.
 *
 * <p>NOTE(review): presumably the returned array holds at least {@code newLength}
 * bytes and preserves the existing content - confirm against ArrayUtil.
 *
 * @param newLength the size passed on to {@code ArrayUtil.grow}
 */
public void grow(int newLength) {
  final byte[] resized = ArrayUtil.grow(bytes, newLength);
  bytes = resized;
}

View File

@ -19,6 +19,7 @@ import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrIndexReader;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.BoundedTreeSet;
import org.apache.solr.util.ByteUtils;
import java.io.IOException;
import java.util.*;
@ -231,12 +232,12 @@ class PerSegmentSingleValuedFaceting {
// SolrCore.log.info("reader= " + reader + " FC=" + System.identityHashCode(si));
if (prefix!=null) {
startTermIndex = si.binarySearchLookup(new BytesRef(prefix), tempBR);
BytesRef prefixRef = new BytesRef(prefix);
startTermIndex = si.binarySearchLookup(prefixRef, tempBR);
if (startTermIndex<0) startTermIndex=-startTermIndex-1;
// find the end term. \uffff isn't a legal unicode char, but only compareTo
// is used, so it should be fine, and is guaranteed to be bigger than legal chars.
// TODO: switch to binarySearch version that takes start/end in Java6
endTermIndex = si.binarySearchLookup(new BytesRef(prefix+"\uffff\uffff\uffff\uffff"), tempBR);
prefixRef.append(ByteUtils.bigTerm);
// TODO: we could constrain the lower endpoint if we had a binarySearch method that allowed passing start/end
endTermIndex = si.binarySearchLookup(prefixRef, tempBR);
assert endTermIndex < 0;
endTermIndex = -endTermIndex-1;
} else {
@ -408,4 +409,4 @@ class IndexSortedFacetCollector extends FacetCollector {
/**
 * Returns {@code res}, the list of facet counts. This method only hands back the
 * reference; it neither builds nor modifies the list.
 * NOTE(review): presumably {@code res} is populated elsewhere in this collector
 * before this is called - confirm against the rest of the class.
 *
 * @return the {@code res} list
 */
public NamedList getFacetCounts() {
return res;
}
}
}

View File

@ -432,9 +432,8 @@ public class SimpleFacets {
if (prefix!=null) {
startTermIndex = si.binarySearchLookup(prefixRef, br);
if (startTermIndex<0) startTermIndex=-startTermIndex-1;
// find the end term. \uffff isn't a legal unicode char, but only compareTo
// is used, so it should be fine, and is guaranteed to be bigger than legal chars.
endTermIndex = si.binarySearchLookup(new BytesRef(prefix+"\uffff\uffff\uffff\uffff"), br);
prefixRef.append(ByteUtils.bigTerm);
endTermIndex = si.binarySearchLookup(prefixRef, br);
assert endTermIndex < 0;
endTermIndex = -endTermIndex-1;
} else {

View File

@ -37,6 +37,7 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.*;
import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.LongPriorityQueue;
import org.apache.solr.util.PrimUtils;
import org.apache.solr.util.BoundedTreeSet;
@ -483,9 +484,11 @@ public class UnInvertedField {
NumberedTermsEnum te = ti.getEnumerator(searcher.getReader());
if (prefix != null && prefix.length() > 0) {
te.skipTo(new BytesRef(prefix));
BytesRef prefixBr = new BytesRef(prefix);
te.skipTo(prefixBr);
startTerm = te.getTermNumber();
te.skipTo(new BytesRef(prefix + "\uffff\uffff\uffff\uffff"));
prefixBr.append(ByteUtils.bigTerm);
te.skipTo(prefixBr);
endTerm = te.getTermNumber();
}

View File

@ -24,21 +24,18 @@ import org.apache.lucene.util.packed.Direct16;
import org.apache.lucene.util.packed.Direct32;
import org.apache.lucene.util.packed.Direct8;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.solr.util.ByteUtils;
import java.io.IOException;
public class MissingStringLastComparatorSource extends FieldComparatorSource {
/** A binary term consisting of a number of 0xff bytes, likely to be bigger than other terms
* one would normally encounter, and definitely bigger than any UTF-8 terms */
public static final BytesRef bigTerm = new BytesRef(
new byte[] {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}
);
private final BytesRef missingValueProxy;
public MissingStringLastComparatorSource() {
this(bigTerm);
this(ByteUtils.bigTerm);
}
/** Creates a {@link FieldComparatorSource} that sorts null last in a normal ascending sort.
@ -428,4 +425,4 @@ public class MissingStringLastComparatorSource extends FieldComparatorSource {
/**
 * Looks up the value recorded for a slot.
 *
 * @param slot index into {@code values}
 * @return {@code values[slot]}, or {@code NULL_VAL} when the {@code values} array
 *         itself is {@code null}
 */
public Comparable<?> value(int slot) {
  if (values != null) {
    return values[slot];
  }
  // No values array at all: fall back to the null placeholder.
  return NULL_VAL;
}
}
}

View File

@ -22,7 +22,12 @@ import org.apache.noggit.CharArr;
public class ByteUtils {
/** A binary term consisting of ten 0xff bytes (written as -1, the signed byte value),
* likely to be bigger than other terms one would normally encounter, and definitely
* bigger than any UTF-8 term, since the byte 0xff never occurs in valid UTF-8.
* NOTE(review): "bigger" presumably refers to unsigned byte-wise term comparison - confirm
* against BytesRef's comparison.
* This is a single shared instance: pass it as an argument (for example to
* BytesRef.append) but do not modify it. */
public static final BytesRef bigTerm = new BytesRef(
new byte[] {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}
);
/** Converts utf8 to utf16 and returns the number of 16 bit Java chars written.
* Full characters are read, even if this reads past the length passed (and can result in
* an ArrayOutOfBoundsException if invalid UTF8 is passed). Explicit checks for valid UTF8 are not performed.