LUCENE-3518: enable sorting by sorted source doc values

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1201440 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2011-11-13 15:26:36 +00:00
parent 2cdf3fa14d
commit dbd48a72e4
9 changed files with 850 additions and 88 deletions

View File

@ -145,33 +145,33 @@ public class FieldType implements IndexableFieldType {
if (result.length() > 0)
result.append(",");
result.append("indexed");
}
if (tokenized()) {
if (result.length() > 0)
result.append(",");
result.append("tokenized");
}
if (storeTermVectors()) {
if (result.length() > 0)
result.append(",");
result.append("termVector");
}
if (storeTermVectorOffsets()) {
if (result.length() > 0)
result.append(",");
result.append("termVectorOffsets");
}
if (storeTermVectorPositions()) {
if (result.length() > 0)
result.append(",");
result.append("termVectorPosition");
}
if (omitNorms()) {
result.append(",omitNorms");
}
if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
result.append(",indexOptions=");
result.append(indexOptions);
if (tokenized()) {
if (result.length() > 0)
result.append(",");
result.append("tokenized");
}
if (storeTermVectors()) {
if (result.length() > 0)
result.append(",");
result.append("termVector");
}
if (storeTermVectorOffsets()) {
if (result.length() > 0)
result.append(",");
result.append("termVectorOffsets");
}
if (storeTermVectorPositions()) {
if (result.length() > 0)
result.append(",");
result.append("termVectorPosition");
}
if (omitNorms()) {
result.append(",omitNorms");
}
if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
result.append(",indexOptions=");
result.append(indexOptions);
}
}
return result.toString();

View File

@ -32,17 +32,17 @@ import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool.Allocator;
import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash.TrackingDirectBytesStartArray;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.ByteBlockPool.Allocator;
import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator;
import org.apache.lucene.util.BytesRefHash.TrackingDirectBytesStartArray;
import org.apache.lucene.util.packed.PackedInts;
/**
@ -586,7 +586,11 @@ public final class Bytes {
this.idxIn = idxIn;
ordToOffsetIndex = hasOffsets ? PackedInts.getReader(idxIn) : null;
docToOrdIndex = PackedInts.getReader(idxIn);
}
@Override
public PackedInts.Reader getDocToOrd() {
// Exposes the packed docID -> ord map directly so ord-based sort
// comparators can read ordinals without a per-doc virtual call.
return docToOrdIndex;
}
@Override

View File

@ -194,6 +194,11 @@ class FixedSortedBytesImpl {
return (int) docToOrdIndex.get(docID);
}
@Override
public PackedInts.Reader getDocToOrd() {
// Same map that ord(docID) reads from; returned raw so callers can
// specialize on the packed array's native width (byte/short/int).
return docToOrdIndex;
}
@Override
public BytesRef getByOrd(int ord, BytesRef bytesRef) {
try {

View File

@ -26,6 +26,7 @@ import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.codecs.DocValuesFormat;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.packed.PackedInts;
/**
* {@link IndexDocValues} provides a dense per-document typed storage for fast
@ -223,7 +224,7 @@ public abstract class IndexDocValues implements Closeable {
return null;
}
}
/**
* A sorted variant of {@link Source} for <tt>byte[]</tt> values per document.
* <p>
@ -257,6 +258,18 @@ public abstract class IndexDocValues implements Closeable {
/** Returns value for specified ord. */
public abstract BytesRef getByOrd(int ord, BytesRef bytesRef);
/**
* Returns the PackedInts.Reader impl that maps document to ord.
*/
public abstract PackedInts.Reader getDocToOrd();
/**
* Returns the comparator used to order the BytesRefs.
*/
public Comparator<BytesRef> getComparator() {
return comparator;
}
/**
* Performs a lookup by value.
*
@ -304,4 +317,98 @@ public abstract class IndexDocValues implements Closeable {
*/
public abstract int getValueCount();
}
/**
 * Creates a {@link Source} whose every document yields the missing-value
 * default: an empty BytesRef for byte values, 0 for ints and 0.0 for floats.
 * Used when a reader has no docvalues for a sorted-on field.
 */
public static Source getDefaultSource(final ValueType type) {
  return new Source(type) {
    @Override
    public BytesRef getBytes(int docID, BytesRef ref) {
      // Empty value for every document; reuse the caller's ref.
      ref.length = 0;
      return ref;
    }

    @Override
    public long getInt(int docID) {
      return 0;
    }

    @Override
    public double getFloat(int docID) {
      return 0.0;
    }
  };
}
/** Returns a SortedSource that always returns default (missing)
 * values for all documents: a single empty value at ord 0, to which
 * every document maps.  Used when a segment has no docvalues for a
 * field being sorted on. */
public static SortedSource getDefaultSortedSource(final ValueType type, final int size) {
  // doc -> ord map that sends every doc to ord 0 without backing storage.
  final PackedInts.Reader docToOrd = new PackedInts.Reader() {
    @Override
    public long get(int index) {
      return 0;
    }
    @Override
    public int getBitsPerValue() {
      return 0;
    }
    @Override
    public int size() {
      // Must report the segment's maxDoc so iteration bounds hold.
      return size;
    }
    @Override
    public boolean hasArray() {
      // No native array: forces callers onto the generic get() path.
      return false;
    }
    @Override
    public Object getArray() {
      return null;
    }
  };
  return new SortedSource(type, BytesRef.getUTF8SortedAsUnicodeComparator()) {
    @Override
    public BytesRef getBytes(int docID, BytesRef ref) {
      ref.length = 0;
      return ref;
    }
    @Override
    public int ord(int docID) {
      // Every document shares the single (empty) value.
      return 0;
    }
    @Override
    public BytesRef getByOrd(int ord, BytesRef bytesRef) {
      assert ord == 0;
      bytesRef.length = 0;
      return bytesRef;
    }
    @Override
    public PackedInts.Reader getDocToOrd() {
      return docToOrd;
    }
    @Override
    public int getByValue(BytesRef value, BytesRef spare) {
      // Exact match only for the empty value (ord 0); any other value is
      // absent.  NOTE(review): -1 encodes the not-found/insertion-point
      // convention consumed as (-index - 2) by setBottom in
      // TermOrdValDocValuesComparator — confirm the sign convention
      // against the SortedSource.getByValue contract.
      if (value.length == 0) {
        return 0;
      } else {
        return -1;
      }
    }
    @Override
    public int getValueCount() {
      // Only the single empty value exists.
      return 1;
    }
  };
}
}

View File

@ -299,6 +299,11 @@ final class SortedBytesMergeUtils {
return bytesRef;
}
@Override
public PackedInts.Reader getDocToOrd() {
// No doc->ord map is available for this merge-time view.
// NOTE(review): callers such as TermOrdValDocValuesComparator.setNextReader
// dereference getDocToOrd() without a null check — presumably this source
// is only used during merging and never handed to a comparator; confirm.
return null;
}
@Override
public int getValueCount() {
return 1;

View File

@ -214,6 +214,11 @@ final class VarSortedBytesImpl {
return (int) docToOrdIndex.get(docID);
}
@Override
public PackedInts.Reader getDocToOrd() {
// Raw packed docID -> ord map, exposed for ord-based sorting.
return docToOrdIndex;
}
@Override
public BytesRef getByOrd(int ord, BytesRef bytesRef) {
try {

View File

@ -18,10 +18,14 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.values.IndexDocValues.SortedSource;
import org.apache.lucene.index.values.IndexDocValues.Source;
import org.apache.lucene.index.values.IndexDocValues;
import org.apache.lucene.index.values.ValueType;
import org.apache.lucene.search.FieldCache.ByteParser;
import org.apache.lucene.search.FieldCache.DocTerms;
import org.apache.lucene.search.FieldCache.DocTermsIndex;
@ -399,6 +403,8 @@ public abstract class FieldComparator<T> {
final IndexDocValues docValues = context.reader.docValues(field);
if (docValues != null) {
currentReaderValues = docValues.getSource();
} else {
currentReaderValues = IndexDocValues.getDefaultSource(ValueType.FLOAT_64);
}
return this;
}
@ -690,6 +696,8 @@ public abstract class FieldComparator<T> {
IndexDocValues docValues = context.reader.docValues(field);
if (docValues != null) {
currentReaderValues = docValues.getSource();
} else {
currentReaderValues = IndexDocValues.getDefaultSource(ValueType.FIXED_INTS_64);
}
return this;
}
@ -911,30 +919,53 @@ public abstract class FieldComparator<T> {
* than {@link TermValComparator}. For very small
* result sets it may be slower. */
public static final class TermOrdValComparator extends FieldComparator<BytesRef> {
/** @lucene.internal */
/* Ords for each slot.
@lucene.internal */
final int[] ords;
/** @lucene.internal */
/* Values for each slot.
@lucene.internal */
final BytesRef[] values;
/** @lucene.internal */
/* Which reader last copied a value into the slot. When
we compare two slots, we just compare-by-ord if the
readerGen is the same; else we must compare the
values (slower).
@lucene.internal */
final int[] readerGen;
/** @lucene.internal */
/* Gen of current reader we are on.
@lucene.internal */
int currentReaderGen = -1;
private DocTermsIndex termsIndex;
/* Current reader's doc ord/values.
@lucene.internal */
DocTermsIndex termsIndex;
private final String field;
/** @lucene.internal */
/* Bottom slot, or -1 if queue isn't full yet
@lucene.internal */
int bottomSlot = -1;
/** @lucene.internal */
/* Bottom ord (same as ords[bottomSlot] once bottomSlot
is set). Cached for faster compares.
@lucene.internal */
int bottomOrd;
/** @lucene.internal */
/* True if current bottom slot matches the current
reader.
@lucene.internal */
boolean bottomSameReader;
/** @lucene.internal */
/* Bottom value (same as values[bottomSlot] once
bottomSlot is set). Cached for faster compares.
@lucene.internal */
BytesRef bottomValue;
/** @lucene.internal */
final BytesRef tempBR = new BytesRef();
public TermOrdValComparator(int numHits, String field, int sortPos, boolean reversed) {
public TermOrdValComparator(int numHits, String field) {
ords = new int[numHits];
values = new BytesRef[numHits];
readerGen = new int[numHits];
@ -1325,6 +1356,396 @@ public abstract class FieldComparator<T> {
}
}
/** Sorts by field's natural Term sort order, using
 * ordinals; this is just like {@link
 * TermOrdValComparator} except it uses DocValues to
 * retrieve the sort ords saved during indexing.
 * <p>
 * Ords are only comparable within one segment, so each slot also
 * remembers which reader generation filled it; cross-generation
 * comparisons fall back to comparing the actual byte values. */
public static final class TermOrdValDocValuesComparator extends FieldComparator<BytesRef> {
  /* Ords for each slot.
     @lucene.internal */
  final int[] ords;

  /* Values for each slot.
     @lucene.internal */
  final BytesRef[] values;

  /* Which reader last copied a value into the slot.  When
     we compare two slots, we just compare-by-ord if the
     readerGen is the same; else we must compare the
     values (slower).
     @lucene.internal */
  final int[] readerGen;

  /* Gen of current reader we are on.
     @lucene.internal */
  int currentReaderGen = -1;

  /* Current reader's doc ord/values.
     @lucene.internal */
  SortedSource termsIndex;

  /* Comparator for comparing by value.
     @lucene.internal */
  Comparator<BytesRef> comp;

  private final String field;

  /* Bottom slot, or -1 if queue isn't full yet
     @lucene.internal */
  int bottomSlot = -1;

  /* Bottom ord (same as ords[bottomSlot] once bottomSlot
     is set).  Cached for faster compares.
     @lucene.internal */
  int bottomOrd;

  /* True if current bottom slot matches the current
     reader.
     @lucene.internal */
  boolean bottomSameReader;

  /* Bottom value (same as values[bottomSlot] once
     bottomSlot is set).  Cached for faster compares.
     @lucene.internal */
  BytesRef bottomValue;

  /** Scratch ref reused for by-value fallback compares.
   * @lucene.internal */
  final BytesRef tempBR = new BytesRef();

  public TermOrdValDocValuesComparator(int numHits, String field) {
    ords = new int[numHits];
    values = new BytesRef[numHits];
    readerGen = new int[numHits];
    this.field = field;
  }

  @Override
  public int compare(int slot1, int slot2) {
    if (readerGen[slot1] == readerGen[slot2]) {
      // Same segment: ords are directly comparable.
      return ords[slot1] - ords[slot2];
    }
    // Different segments: must compare by value; null sorts first.
    final BytesRef val1 = values[slot1];
    final BytesRef val2 = values[slot2];
    if (val1 == null) {
      if (val2 == null) {
        return 0;
      }
      return -1;
    } else if (val2 == null) {
      return 1;
    }
    return comp.compare(val1, val2);
  }

  @Override
  public int compareBottom(int doc) {
    // Per-doc work is delegated to the per-segment comparator
    // returned from setNextReader; this outer instance is never
    // asked to compareBottom directly.
    throw new UnsupportedOperationException();
  }

  @Override
  public void copy(int slot, int doc) {
    // See compareBottom: only per-segment comparators copy.
    throw new UnsupportedOperationException();
  }

  // TODO: would be nice to share these specialized impls
  // w/ TermOrdValComparator

  /** Base class for specialized (per bit width of the
   * ords) per-segment comparator.  NOTE: this is messy;
   * we do this only because hotspot can't reliably inline
   * the underlying array access when looking up doc->ord
   * @lucene.internal
   */
  abstract class PerSegmentComparator extends FieldComparator<BytesRef> {
    @Override
    public FieldComparator setNextReader(AtomicReaderContext context) throws IOException {
      // All shared state lives on the enclosing instance; delegate.
      return TermOrdValDocValuesComparator.this.setNextReader(context);
    }

    @Override
    public int compare(int slot1, int slot2) {
      return TermOrdValDocValuesComparator.this.compare(slot1, slot2);
    }

    @Override
    public void setBottom(final int bottom) {
      TermOrdValDocValuesComparator.this.setBottom(bottom);
    }

    @Override
    public BytesRef value(int slot) {
      return TermOrdValDocValuesComparator.this.value(slot);
    }

    @Override
    public int compareValues(BytesRef val1, BytesRef val2) {
      assert val1 != null;
      assert val2 != null;
      return comp.compare(val1, val2);
    }
  }

  // Used per-segment when bit width of doc->ord is 8:
  private final class ByteOrdComparator extends PerSegmentComparator {
    private final byte[] readerOrds;
    private final SortedSource termsIndex;
    // Stored but not read by this impl (docs are addressed
    // segment-relative); kept to mirror TermOrdValComparator.
    private final int docBase;

    public ByteOrdComparator(byte[] readerOrds, SortedSource termsIndex, int docBase) {
      this.readerOrds = readerOrds;
      this.termsIndex = termsIndex;
      this.docBase = docBase;
    }

    @Override
    public int compareBottom(int doc) {
      assert bottomSlot != -1;
      if (bottomSameReader) {
        // ord is precisely comparable, even in the equal case
        return bottomOrd - (readerOrds[doc]&0xFF);
      } else {
        // ord is only approx comparable: if they are not
        // equal, we can use that; if they are equal, we
        // must fallback to compare by value
        final int order = readerOrds[doc]&0xFF;
        final int cmp = bottomOrd - order;
        if (cmp != 0) {
          return cmp;
        }
        termsIndex.getByOrd(order, tempBR);
        return comp.compare(bottomValue, tempBR);
      }
    }

    @Override
    public void copy(int slot, int doc) {
      final int ord = readerOrds[doc]&0xFF;
      ords[slot] = ord;
      if (values[slot] == null) {
        values[slot] = new BytesRef();
      }
      termsIndex.getByOrd(ord, values[slot]);
      readerGen[slot] = currentReaderGen;
    }
  }

  // Used per-segment when bit width of doc->ord is 16:
  private final class ShortOrdComparator extends PerSegmentComparator {
    private final short[] readerOrds;
    private final SortedSource termsIndex;
    // Stored but not read by this impl; see ByteOrdComparator.
    private final int docBase;

    public ShortOrdComparator(short[] readerOrds, SortedSource termsIndex, int docBase) {
      this.readerOrds = readerOrds;
      this.termsIndex = termsIndex;
      this.docBase = docBase;
    }

    @Override
    public int compareBottom(int doc) {
      assert bottomSlot != -1;
      if (bottomSameReader) {
        // ord is precisely comparable, even in the equal case
        return bottomOrd - (readerOrds[doc]&0xFFFF);
      } else {
        // ord is only approx comparable: if they are not
        // equal, we can use that; if they are equal, we
        // must fallback to compare by value
        final int order = readerOrds[doc]&0xFFFF;
        final int cmp = bottomOrd - order;
        if (cmp != 0) {
          return cmp;
        }
        termsIndex.getByOrd(order, tempBR);
        return comp.compare(bottomValue, tempBR);
      }
    }

    @Override
    public void copy(int slot, int doc) {
      final int ord = readerOrds[doc]&0xFFFF;
      ords[slot] = ord;
      if (values[slot] == null) {
        values[slot] = new BytesRef();
      }
      termsIndex.getByOrd(ord, values[slot]);
      readerGen[slot] = currentReaderGen;
    }
  }

  // Used per-segment when bit width of doc->ord is 32:
  private final class IntOrdComparator extends PerSegmentComparator {
    private final int[] readerOrds;
    private final SortedSource termsIndex;
    // Stored but not read by this impl; see ByteOrdComparator.
    private final int docBase;

    public IntOrdComparator(int[] readerOrds, SortedSource termsIndex, int docBase) {
      this.readerOrds = readerOrds;
      this.termsIndex = termsIndex;
      this.docBase = docBase;
    }

    @Override
    public int compareBottom(int doc) {
      assert bottomSlot != -1;
      if (bottomSameReader) {
        // ord is precisely comparable, even in the equal case
        return bottomOrd - readerOrds[doc];
      } else {
        // ord is only approx comparable: if they are not
        // equal, we can use that; if they are equal, we
        // must fallback to compare by value
        final int order = readerOrds[doc];
        final int cmp = bottomOrd - order;
        if (cmp != 0) {
          return cmp;
        }
        termsIndex.getByOrd(order, tempBR);
        return comp.compare(bottomValue, tempBR);
      }
    }

    @Override
    public void copy(int slot, int doc) {
      final int ord = readerOrds[doc];
      ords[slot] = ord;
      if (values[slot] == null) {
        values[slot] = new BytesRef();
      }
      termsIndex.getByOrd(ord, values[slot]);
      readerGen[slot] = currentReaderGen;
    }
  }

  // Used per-segment when bit width is not a native array
  // size (8, 16, 32):
  private final class AnyOrdComparator extends PerSegmentComparator {
    private final PackedInts.Reader readerOrds;
    // Stored but not read by this impl; see ByteOrdComparator.
    private final int docBase;

    public AnyOrdComparator(PackedInts.Reader readerOrds, int docBase) {
      this.readerOrds = readerOrds;
      this.docBase = docBase;
    }

    @Override
    public int compareBottom(int doc) {
      assert bottomSlot != -1;
      if (bottomSameReader) {
        // ord is precisely comparable, even in the equal case
        return bottomOrd - (int) readerOrds.get(doc);
      } else {
        // ord is only approx comparable: if they are not
        // equal, we can use that; if they are equal, we
        // must fallback to compare by value
        final int order = (int) readerOrds.get(doc);
        final int cmp = bottomOrd - order;
        if (cmp != 0) {
          return cmp;
        }
        termsIndex.getByOrd(order, tempBR);
        return comp.compare(bottomValue, tempBR);
      }
    }

    @Override
    public void copy(int slot, int doc) {
      final int ord = (int) readerOrds.get(doc);
      ords[slot] = ord;
      if (values[slot] == null) {
        values[slot] = new BytesRef();
      }
      termsIndex.getByOrd(ord, values[slot]);
      readerGen[slot] = currentReaderGen;
    }
  }

  @Override
  public FieldComparator setNextReader(AtomicReaderContext context) throws IOException {
    final int docBase = context.docBase;
    final IndexDocValues dv = context.reader.docValues(field);
    if (dv == null) {
      // Segment has no docvalues for this field: sort it as all-missing.
      termsIndex = IndexDocValues.getDefaultSortedSource(ValueType.BYTES_VAR_SORTED, context.reader.maxDoc());
    } else {
      termsIndex = dv.getSource().asSortedSource();
      if (termsIndex == null) {
        // Field has docvalues but not a sorted source; treat as missing.
        termsIndex = IndexDocValues.getDefaultSortedSource(ValueType.BYTES_VAR_SORTED, context.reader.maxDoc());
      }
    }
    comp = termsIndex.getComparator();
    // Pick the per-segment comparator specialized to the native
    // width of the doc->ord array, if one is exposed.
    FieldComparator perSegComp = null;
    final PackedInts.Reader docToOrd = termsIndex.getDocToOrd();
    if (docToOrd.hasArray()) {
      final Object arr = docToOrd.getArray();
      assert arr != null;
      if (arr instanceof byte[]) {
        // 8 bit packed
        perSegComp = new ByteOrdComparator((byte[]) arr, termsIndex, docBase);
      } else if (arr instanceof short[]) {
        // 16 bit packed
        perSegComp = new ShortOrdComparator((short[]) arr, termsIndex, docBase);
      } else if (arr instanceof int[]) {
        // 32 bit packed
        perSegComp = new IntOrdComparator((int[]) arr, termsIndex, docBase);
      }
    }
    if (perSegComp == null) {
      perSegComp = new AnyOrdComparator(docToOrd, docBase);
    }
    currentReaderGen++;
    if (bottomSlot != -1) {
      // Re-resolve the bottom against the new segment's ord space.
      perSegComp.setBottom(bottomSlot);
    }
    return perSegComp;
  }

  @Override
  public void setBottom(final int bottom) {
    bottomSlot = bottom;
    bottomValue = values[bottomSlot];
    if (currentReaderGen == readerGen[bottomSlot]) {
      // Bottom was filled from this segment: its ord is valid as-is.
      bottomOrd = ords[bottomSlot];
      bottomSameReader = true;
    } else {
      if (bottomValue == null) {
        // 0 ord is null for all segments
        assert ords[bottomSlot] == 0;
        bottomOrd = 0;
        bottomSameReader = true;
        readerGen[bottomSlot] = currentReaderGen;
      } else {
        // Look the bottom value up in this segment's ord space.
        final int index = termsIndex.getByValue(bottomValue, tempBR);
        if (index < 0) {
          // Not present: -index-2 is the nearest lower ord; compares
          // against it are approximate (see compareBottom fallback).
          bottomOrd = -index - 2;
          bottomSameReader = false;
        } else {
          bottomOrd = index;
          // exact value match
          bottomSameReader = true;
          readerGen[bottomSlot] = currentReaderGen;
          ords[bottomSlot] = bottomOrd;
        }
      }
    }
  }

  @Override
  public BytesRef value(int slot) {
    return values[slot];
  }
}
/** Sorts by field's natural Term sort order. All
* comparisons are done using BytesRef.compareTo, which is
* slow for medium to large result sets but possibly
@ -1410,6 +1831,74 @@ public abstract class FieldComparator<T> {
}
}
/** Sorts by field's natural Term sort order, comparing the raw
 * byte values with BytesRef.compareTo.  Slow for medium to large
 * result sets but possibly very fast for very small ones.  The
 * values are obtained using {@link IndexReader#docValues}. */
public static final class TermValDocValuesComparator extends FieldComparator<BytesRef> {
  // Value captured for each competitive slot.
  private BytesRef[] slotValues;
  // Per-segment doc -> bytes source; swapped in setNextReader.
  private Source source;
  private final String field;
  // Cached copy of the queue-bottom slot's value; null until setBottom.
  private BytesRef bottomValue;
  // Scratch ref reused by compareBottom to avoid per-doc allocation.
  private final BytesRef spare = new BytesRef();

  TermValDocValuesComparator(int numHits, String field) {
    slotValues = new BytesRef[numHits];
    this.field = field;
  }

  @Override
  public int compare(int slot1, int slot2) {
    assert slotValues[slot1] != null;
    assert slotValues[slot2] != null;
    return slotValues[slot1].compareTo(slotValues[slot2]);
  }

  @Override
  public int compareBottom(int doc) {
    assert bottomValue != null;
    return bottomValue.compareTo(source.getBytes(doc, spare));
  }

  @Override
  public void copy(int slot, int doc) {
    // Lazily allocate the slot's ref, then fill it from the source.
    if (slotValues[slot] == null) {
      slotValues[slot] = new BytesRef();
    }
    source.getBytes(doc, slotValues[slot]);
  }

  @Override
  public FieldComparator setNextReader(AtomicReaderContext context) throws IOException {
    final IndexDocValues dv = context.reader.docValues(field);
    // Segments lacking docvalues for this field sort as all-missing.
    source = dv != null
        ? dv.getSource()
        : IndexDocValues.getDefaultSource(ValueType.BYTES_VAR_DEREF);
    return this;
  }

  @Override
  public void setBottom(final int bottom) {
    this.bottomValue = slotValues[bottom];
  }

  @Override
  public BytesRef value(int slot) {
    return slotValues[slot];
  }

  @Override
  public int compareValues(BytesRef val1, BytesRef val2) {
    assert val1 != null;
    assert val2 != null;
    return val1.compareTo(val2);
  }
}
// Convenience overload: searches the full ord range of the index.
// Starts at ord 1 — presumably ord 0 is reserved for the missing
// value, as in the ord-based comparators above; confirm against
// DocTermsIndex's ord numbering.
final protected static int binarySearch(BytesRef br, DocTermsIndex a, BytesRef key) {
return binarySearch(br, a, key, 1, a.numOrd()-1);
}

View File

@ -254,6 +254,7 @@ public class SortField {
@Override
public String toString() {
StringBuilder buffer = new StringBuilder();
String dv = useIndexValues ? " [dv]" : "";
switch (type) {
case SCORE:
buffer.append("<score>");
@ -264,11 +265,11 @@ public class SortField {
break;
case STRING:
buffer.append("<string: \"").append(field).append("\">");
buffer.append("<string" + dv + ": \"").append(field).append("\">");
break;
case STRING_VAL:
buffer.append("<string_val: \"").append(field).append("\">");
buffer.append("<string_val" + dv + ": \"").append(field).append("\">");
break;
case BYTE:
@ -280,7 +281,7 @@ public class SortField {
break;
case INT:
buffer.append("<int: \"").append(field).append("\">");
buffer.append("<int" + dv + ": \"").append(field).append("\">");
break;
case LONG:
@ -288,11 +289,11 @@ public class SortField {
break;
case FLOAT:
buffer.append("<float: \"").append(field).append("\">");
buffer.append("<float" + dv + ": \"").append(field).append("\">");
break;
case DOUBLE:
buffer.append("<double: \"").append(field).append("\">");
buffer.append("<double" + dv + ": \"").append(field).append("\">");
break;
case CUSTOM:
@ -415,10 +416,18 @@ public class SortField {
return comparatorSource.newComparator(field, numHits, sortPos, reverse);
case STRING:
return new FieldComparator.TermOrdValComparator(numHits, field, sortPos, reverse);
if (useIndexValues) {
return new FieldComparator.TermOrdValDocValuesComparator(numHits, field);
} else {
return new FieldComparator.TermOrdValComparator(numHits, field);
}
case STRING_VAL:
return new FieldComparator.TermValComparator(numHits, field);
if (useIndexValues) {
return new FieldComparator.TermValDocValuesComparator(numHits, field);
} else {
return new FieldComparator.TermValComparator(numHits, field);
}
case REWRITEABLE:
throw new IllegalStateException("SortField needs to be rewritten through Sort.rewrite(..) and SortField.rewrite(..)");

View File

@ -81,6 +81,7 @@ public class TestSort extends LuceneTestCase {
public static void beforeClass() throws Exception {
NUM_STRINGS = atLeast(6000);
}
// document data:
// the tracer field is used to determine which document was hit
// the contents field is used to search and sort by relevance
@ -111,7 +112,7 @@ public class TestSort extends LuceneTestCase {
{ "c", "m", "5", "5.0", "5", null, null, "5", "5", "5", "5", null},
{ "d", "m", null, null, null, null, null, null, null, null, null, null}
};
// create an index of all the documents, or just the x, or just the y documents
private IndexSearcher getIndex (boolean even, boolean odd)
throws IOException {
@ -119,6 +120,21 @@ public class TestSort extends LuceneTestCase {
dirs.add(indexStore);
RandomIndexWriter writer = new RandomIndexWriter(random, indexStore, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
final ValueType stringDVType;
if (dvStringSorted) {
// Index sorted
stringDVType = random.nextBoolean() ? ValueType.BYTES_VAR_SORTED : ValueType.BYTES_FIXED_SORTED;
} else {
// Index non-sorted
if (random.nextBoolean()) {
// Fixed
stringDVType = random.nextBoolean() ? ValueType.BYTES_FIXED_STRAIGHT : ValueType.BYTES_FIXED_DEREF;
} else {
// Var
stringDVType = random.nextBoolean() ? ValueType.BYTES_VAR_STRAIGHT : ValueType.BYTES_VAR_DEREF;
}
}
FieldType ft1 = new FieldType();
ft1.setStored(true);
FieldType ft2 = new FieldType();
@ -142,7 +158,13 @@ public class TestSort extends LuceneTestCase {
}
doc.add(f);
}
if (data[i][4] != null) doc.add (new StringField ("string", data[i][4]));
if (data[i][4] != null) {
Field f = new StringField ("string", data[i][4]);
if (supportsDocValues) {
f = IndexDocValuesField.build(f, stringDVType);
}
doc.add(f);
}
if (data[i][5] != null) doc.add (new StringField ("custom", data[i][5]));
if (data[i][6] != null) doc.add (new StringField ("i18n", data[i][6]));
if (data[i][7] != null) doc.add (new StringField ("long", data[i][7]));
@ -185,21 +207,52 @@ public class TestSort extends LuceneTestCase {
setMaxBufferedDocs(4).
setMergePolicy(newLogMergePolicy(97))
);
FieldType customType = new FieldType();
customType.setStored(true);
FieldType onlyStored = new FieldType();
onlyStored.setStored(true);
final int fixedLen = getRandomNumber(2, 8);
final int fixedLen2 = getRandomNumber(1, 4);
for (int i=0; i<NUM_STRINGS; i++) {
Document doc = new Document();
String num = getRandomCharString(getRandomNumber(2, 8), 48, 52);
doc.add (new Field ("tracer", num, customType));
//doc.add (new Field ("contents", Integer.toString(i), Field.Store.NO, Field.Index.ANALYZED));
doc.add (new StringField ("string", num));
String num2 = getRandomCharString(getRandomNumber(1, 4), 48, 50);
doc.add (new StringField ("string2", num2));
doc.add (new Field ("tracer2", num2, customType));
for(IndexableField f : doc.getFields()) {
((Field) f).setBoost(2.0f);
}
writer.addDocument (doc);
Document doc = new Document();
String num = getRandomCharString(getRandomNumber(2, 8), 48, 52);
doc.add (new Field ("tracer", num, onlyStored));
//doc.add (new Field ("contents", Integer.toString(i), Field.Store.NO, Field.Index.ANALYZED));
Field f = new StringField("string", num);
if (supportsDocValues) {
f = IndexDocValuesField.build(f, ValueType.BYTES_VAR_SORTED);
}
doc.add (f);
String num2 = getRandomCharString(getRandomNumber(1, 4), 48, 50);
f = new StringField ("string2", num2);
if (supportsDocValues) {
f = IndexDocValuesField.build(f, ValueType.BYTES_VAR_SORTED);
}
doc.add (f);
doc.add (new Field ("tracer2", num2, onlyStored));
for(IndexableField f2 : doc.getFields()) {
((Field) f2).setBoost(2.0f);
}
String numFixed = getRandomCharString(fixedLen, 48, 52);
doc.add (new Field ("fixed_tracer", numFixed, onlyStored));
//doc.add (new Field ("contents", Integer.toString(i), Field.Store.NO, Field.Index.ANALYZED));
f = new StringField("string_fixed", numFixed);
if (supportsDocValues) {
f = IndexDocValuesField.build(f, ValueType.BYTES_FIXED_SORTED);
}
doc.add (f);
String num2Fixed = getRandomCharString(fixedLen2, 48, 52);
f = new StringField ("string2_fixed", num2Fixed);
if (supportsDocValues) {
f = IndexDocValuesField.build(f, ValueType.BYTES_FIXED_SORTED);
}
doc.add (f);
doc.add (new Field ("tracer2_fixed", num2Fixed, onlyStored));
for(IndexableField f2 : doc.getFields()) {
((Field) f2).setBoost(2.0f);
}
writer.addDocument (doc);
}
//writer.forceMerge(1);
//System.out.println(writer.getSegmentCount());
@ -249,10 +302,15 @@ public class TestSort extends LuceneTestCase {
return getIndex (false, false);
}
// Set to true if the DV "string" field is indexed as a
// sorted source:
private boolean dvStringSorted;
@Override
public void setUp() throws Exception {
super.setUp();
dvStringSorted = random.nextBoolean();
full = getFullIndex();
searchX = getXIndex();
searchY = getYIndex();
@ -339,6 +397,20 @@ public class TestSort extends LuceneTestCase {
sort.setSort (useDocValues(new SortField ("double", SortField.Type.DOUBLE)), SortField.FIELD_DOC );
assertMatches (full, queryX, sort, "AGICE");
assertMatches (full, queryY, sort, "DJHBF");
sort.setSort (useDocValues(new SortField ("string", getDVStringSortType())), SortField.FIELD_DOC );
assertMatches (full, queryX, sort, "AIGEC");
assertMatches (full, queryY, sort, "DJHFB");
}
}
/** Picks the SortField type to use for the DV "string" field. */
private SortField.Type getDVStringSortType() {
  if (!dvStringSorted) {
    // Non-sorted docvalues can only be sorted by value.
    return SortField.Type.STRING_VAL;
  }
  // A sorted source supports ord sorting, but sorting by value
  // still works too — randomly exercise both:
  return random.nextBoolean() ? SortField.Type.STRING : SortField.Type.STRING_VAL;
}
@ -405,42 +477,72 @@ public class TestSort extends LuceneTestCase {
/**
* Test String sorting: small queue to many matches, multi field sort, reverse sort
*/
public void testStringSort() throws IOException {
ScoreDoc[] result = null;
IndexSearcher searcher = getFullStrings();
public void testStringSort() throws Exception {
// Normal string field, var length
sort.setSort(
new SortField("string", SortField.Type.STRING),
new SortField("string2", SortField.Type.STRING, true),
SortField.FIELD_DOC);
verifyStringSort(sort);
result = searcher.search(new MatchAllDocsQuery(), null, 500, sort).scoreDocs;
// Normal string field, fixed length
sort.setSort(
new SortField("string_fixed", SortField.Type.STRING),
new SortField("string2_fixed", SortField.Type.STRING, true),
SortField.FIELD_DOC);
verifyStringSort(sort);
// Doc values field, var length
assumeFalse("cannot work with preflex codec",
"Lucene3x".equals(Codec.getDefault().getName()));
sort.setSort(
useDocValues(new SortField("string", getDVStringSortType())),
useDocValues(new SortField("string2", getDVStringSortType(), true)),
SortField.FIELD_DOC);
verifyStringSort(sort);
// Doc values field, fixed length
sort.setSort(
useDocValues(new SortField("string_fixed", getDVStringSortType())),
useDocValues(new SortField("string2_fixed", getDVStringSortType(), true)),
SortField.FIELD_DOC);
verifyStringSort(sort);
}
private void verifyStringSort(Sort sort) throws Exception {
final IndexSearcher searcher = getFullStrings();
final ScoreDoc[] result = searcher.search(new MatchAllDocsQuery(), null, _TestUtil.nextInt(random, 500, searcher.getIndexReader().maxDoc()), sort).scoreDocs;
StringBuilder buff = new StringBuilder();
int n = result.length;
String last = null;
String lastSub = null;
int lastDocId = 0;
boolean fail = false;
final String fieldSuffix = sort.getSort()[0].getField().endsWith("_fixed") ? "_fixed" : "";
for (int x = 0; x < n; ++x) {
Document doc2 = searcher.doc(result[x].doc);
IndexableField[] v = doc2.getFields("tracer");
IndexableField[] v2 = doc2.getFields("tracer2");
IndexableField[] v = doc2.getFields("tracer" + fieldSuffix);
IndexableField[] v2 = doc2.getFields("tracer2" + fieldSuffix);
for (int j = 0; j < v.length; ++j) {
buff.append(v[j] + "(" + v2[j] + ")(" + result[x].doc+")\n");
if (last != null) {
int cmp = v[j].stringValue().compareTo(last);
if (!(cmp >= 0)) { // ensure first field is in order
fail = true;
System.out.println("fail:" + v[j] + " < " + last);
buff.append(" WRONG tracer\n");
}
if (cmp == 0) { // ensure second field is in reverse order
cmp = v2[j].stringValue().compareTo(lastSub);
if (cmp > 0) {
fail = true;
System.out.println("rev field fail:" + v2[j] + " > " + lastSub);
buff.append(" WRONG tracer2\n");
} else if(cmp == 0) { // ensure docid is in order
if (result[x].doc < lastDocId) {
fail = true;
System.out.println("doc fail:" + result[x].doc + " > " + lastDocId);
buff.append(" WRONG docID\n");
}
}
}
@ -448,11 +550,10 @@ public class TestSort extends LuceneTestCase {
last = v[j].stringValue();
lastSub = v2[j].stringValue();
lastDocId = result[x].doc;
buff.append(v[j] + "(" + v2[j] + ")(" + result[x].doc+") ");
}
}
if(fail) {
System.out.println("topn field1(field2)(docID):" + buff);
if (fail) {
System.out.println("topn field1(field2)(docID):\n" + buff);
}
assertFalse("Found sort results out of order", fail);
searcher.close();
@ -549,6 +650,16 @@ public class TestSort extends LuceneTestCase {
sort.setSort (useDocValues(new SortField ("float", SortField.Type.FLOAT)), new SortField ("string", SortField.Type.STRING) );
assertMatches (empty, queryX, sort, "");
sort.setSort (useDocValues(new SortField ("string", getDVStringSortType(), true)), SortField.FIELD_DOC );
assertMatches (empty, queryX, sort, "");
sort.setSort (useDocValues(new SortField ("float", SortField.Type.FLOAT)),
useDocValues(new SortField ("string", getDVStringSortType())) );
assertMatches (empty, queryX, sort, "");
sort.setSort (useDocValues(new SortField ("float", SortField.Type.FLOAT)), useDocValues(new SortField ("string", getDVStringSortType())) );
assertMatches (empty, queryX, sort, "");
}
static class MyFieldComparator extends FieldComparator<Integer> {
@ -642,11 +753,18 @@ public class TestSort extends LuceneTestCase {
sort.setSort (useDocValues(new SortField ("float", SortField.Type.FLOAT, true)) );
assertMatches (full, queryX, sort, "AECIG");
assertMatches (full, queryY, sort, "BFJHD");
sort.setSort (useDocValues(new SortField ("string", getDVStringSortType(), true)) );
assertMatches (full, queryX, sort, "CEGIA");
assertMatches (full, queryY, sort, "BFHJD");
}
}
// test sorting when the sort field is empty (undefined) for some of the documents
public void testEmptyFieldSort() throws Exception {
// NOTE: do not test DocValues fields here, since you
// can't sort when some documents don't have the field
sort.setSort (new SortField ("string", SortField.Type.STRING) );
assertMatches (full, queryF, sort, "ZJI");
@ -662,14 +780,6 @@ public class TestSort extends LuceneTestCase {
sort.setSort (new SortField ("float", SortField.Type.FLOAT) );
assertMatches (full, queryF, sort, "ZJI");
if (supportsDocValues) {
sort.setSort (useDocValues(new SortField ("int", SortField.Type.INT)) );
assertMatches (full, queryF, sort, "IZJ");
sort.setSort (useDocValues(new SortField ("float", SortField.Type.FLOAT)) );
assertMatches (full, queryF, sort, "ZJI");
}
// using a nonexisting field as first sort key shouldn't make a difference:
sort.setSort (new SortField ("nosuchfield", SortField.Type.STRING),
new SortField ("float", SortField.Type.FLOAT) );
@ -679,7 +789,6 @@ public class TestSort extends LuceneTestCase {
assertMatches (full, queryF, sort, "IJZ");
// When a field is null for both documents, the next SortField should be used.
// Works for
sort.setSort (new SortField ("int", SortField.Type.INT),
new SortField ("string", SortField.Type.STRING),
new SortField ("float", SortField.Type.FLOAT) );
@ -688,7 +797,7 @@ public class TestSort extends LuceneTestCase {
// Reverse the last criterium to make sure the test didn't pass by chance
sort.setSort (new SortField ("int", SortField.Type.INT),
new SortField ("string", SortField.Type.STRING),
new SortField ("float", SortField.Type.FLOAT, true) );
new SortField ("float", SortField.Type.FLOAT, true) );
assertMatches (full, queryG, sort, "ZYXW");
// Do the same for a ParallelMultiSearcher
@ -696,13 +805,13 @@ public class TestSort extends LuceneTestCase {
IndexSearcher parallelSearcher=new IndexSearcher (full.getIndexReader(), exec);
sort.setSort (new SortField ("int", SortField.Type.INT),
new SortField ("string", SortField.Type.STRING),
new SortField ("float", SortField.Type.FLOAT) );
new SortField ("string", SortField.Type.STRING),
new SortField ("float", SortField.Type.FLOAT) );
assertMatches (parallelSearcher, queryG, sort, "ZWXY");
sort.setSort (new SortField ("int", SortField.Type.INT),
new SortField ("string", SortField.Type.STRING),
new SortField ("float", SortField.Type.FLOAT, true) );
new SortField ("string", SortField.Type.STRING),
new SortField ("float", SortField.Type.FLOAT, true) );
assertMatches (parallelSearcher, queryG, sort, "ZYXW");
parallelSearcher.close();
exec.shutdown();
@ -719,6 +828,20 @@ public class TestSort extends LuceneTestCase {
sort.setSort (new SortField ("float", SortField.Type.FLOAT), new SortField ("string", SortField.Type.STRING) );
assertMatches (full, queryX, sort, "GICEA");
if (supportsDocValues) {
sort.setSort (useDocValues(new SortField ("int", SortField.Type.INT)),
useDocValues(new SortField ("float", SortField.Type.FLOAT)));
assertMatches (full, queryX, sort, "IGEAC");
sort.setSort (useDocValues(new SortField ("int", SortField.Type.INT, true)),
useDocValues(new SortField (null, SortField.Type.DOC, true)));
assertMatches (full, queryX, sort, "CEAGI");
sort.setSort (useDocValues(new SortField ("float", SortField.Type.FLOAT)),
useDocValues(new SortField ("string", getDVStringSortType())));
assertMatches (full, queryX, sort, "GICEA");
}
}
// test a variety of sorts using a parallel multisearcher
@ -1064,6 +1187,21 @@ public class TestSort extends LuceneTestCase {
sort.setSort(useDocValues(new SortField ("int", SortField.Type.INT, true)));
assertMatches(multi, queryF, sort, "JZI");
sort.setSort(useDocValues(new SortField("string", getDVStringSortType())));
assertMatches(multi, queryA, sort, "DJAIHGFEBC");
sort.setSort(useDocValues(new SortField("string", getDVStringSortType(), true)));
assertMatches(multi, queryA, sort, "CBEFGHIAJD");
sort.setSort(useDocValues(new SortField("float", SortField.Type.FLOAT)),useDocValues(new SortField("string", getDVStringSortType())));
assertMatches(multi, queryA, sort, "GDHJICEFAB");
sort.setSort(useDocValues(new SortField ("string", getDVStringSortType())));
assertMatches(multi, queryF, sort, "ZJI");
sort.setSort(useDocValues(new SortField ("string", getDVStringSortType(), true)));
assertMatches(multi, queryF, sort, "IJZ");
}
// up to this point, all of the searches should have "sane"