From dbd48a72e4570014896ac43bcb9da7ff153d7099 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Sun, 13 Nov 2011 15:26:36 +0000 Subject: [PATCH] LUCENE-3518: enable sorting by sorted source doc values git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1201440 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/lucene/document/FieldType.java | 54 +- .../org/apache/lucene/index/values/Bytes.java | 10 +- .../index/values/FixedSortedBytesImpl.java | 5 + .../lucene/index/values/IndexDocValues.java | 109 +++- .../index/values/SortedBytesMergeUtils.java | 5 + .../index/values/VarSortedBytesImpl.java | 5 + .../apache/lucene/search/FieldComparator.java | 511 +++++++++++++++++- .../org/apache/lucene/search/SortField.java | 23 +- .../org/apache/lucene/search/TestSort.java | 216 ++++++-- 9 files changed, 850 insertions(+), 88 deletions(-) diff --git a/lucene/src/java/org/apache/lucene/document/FieldType.java b/lucene/src/java/org/apache/lucene/document/FieldType.java index d47d2f9d3a9..a5bc9c1d60c 100644 --- a/lucene/src/java/org/apache/lucene/document/FieldType.java +++ b/lucene/src/java/org/apache/lucene/document/FieldType.java @@ -145,33 +145,33 @@ public class FieldType implements IndexableFieldType { if (result.length() > 0) result.append(","); result.append("indexed"); - } - if (tokenized()) { - if (result.length() > 0) - result.append(","); - result.append("tokenized"); - } - if (storeTermVectors()) { - if (result.length() > 0) - result.append(","); - result.append("termVector"); - } - if (storeTermVectorOffsets()) { - if (result.length() > 0) - result.append(","); - result.append("termVectorOffsets"); - } - if (storeTermVectorPositions()) { - if (result.length() > 0) - result.append(","); - result.append("termVectorPosition"); - } - if (omitNorms()) { - result.append(",omitNorms"); - } - if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { - result.append(",indexOptions="); - result.append(indexOptions); + if (tokenized()) { + if (result.length() > 0) + result.append(","); + result.append("tokenized"); + } + if (storeTermVectors()) { + if (result.length() > 0) + result.append(","); + result.append("termVector"); + } + if (storeTermVectorOffsets()) { + if (result.length() > 0) + result.append(","); + result.append("termVectorOffsets"); + } + if (storeTermVectorPositions()) { + if (result.length() > 0) + result.append(","); + result.append("termVectorPosition"); + } + if (omitNorms()) { + result.append(",omitNorms"); + } + if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + result.append(",indexOptions="); + result.append(indexOptions); + } } return result.toString(); diff --git a/lucene/src/java/org/apache/lucene/index/values/Bytes.java b/lucene/src/java/org/apache/lucene/index/values/Bytes.java index aac08af314c..c0b07291e2d 100644 --- a/lucene/src/java/org/apache/lucene/index/values/Bytes.java +++ b/lucene/src/java/org/apache/lucene/index/values/Bytes.java @@ -32,17 +32,17 @@ import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.ByteBlockPool.Allocator; +import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator; import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash.TrackingDirectBytesStartArray; import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.Counter; 
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.PagedBytes; import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.ByteBlockPool.Allocator; -import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator; -import org.apache.lucene.util.BytesRefHash.TrackingDirectBytesStartArray; import org.apache.lucene.util.packed.PackedInts; /** @@ -586,7 +586,11 @@ public final class Bytes { this.idxIn = idxIn; ordToOffsetIndex = hasOffsets ? PackedInts.getReader(idxIn) : null; docToOrdIndex = PackedInts.getReader(idxIn); + } + @Override + public PackedInts.Reader getDocToOrd() { + return docToOrdIndex; } @Override diff --git a/lucene/src/java/org/apache/lucene/index/values/FixedSortedBytesImpl.java b/lucene/src/java/org/apache/lucene/index/values/FixedSortedBytesImpl.java index 0d711369efd..08fdf94fd9f 100644 --- a/lucene/src/java/org/apache/lucene/index/values/FixedSortedBytesImpl.java +++ b/lucene/src/java/org/apache/lucene/index/values/FixedSortedBytesImpl.java @@ -194,6 +194,11 @@ class FixedSortedBytesImpl { return (int) docToOrdIndex.get(docID); } + @Override + public PackedInts.Reader getDocToOrd() { + return docToOrdIndex; + } + @Override public BytesRef getByOrd(int ord, BytesRef bytesRef) { try { diff --git a/lucene/src/java/org/apache/lucene/index/values/IndexDocValues.java b/lucene/src/java/org/apache/lucene/index/values/IndexDocValues.java index 6d00aaa7b12..81896472482 100644 --- a/lucene/src/java/org/apache/lucene/index/values/IndexDocValues.java +++ b/lucene/src/java/org/apache/lucene/index/values/IndexDocValues.java @@ -26,6 +26,7 @@ import org.apache.lucene.index.FieldsEnum; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.codecs.DocValuesFormat; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.packed.PackedInts; /** * {@link IndexDocValues} provides a dense per-document typed storage for fast @@ -223,7 +224,7 @@ public abstract class IndexDocValues implements Closeable { return null; } } - + /** * A sorted variant of {@link Source} for byte[] values per document. *

@@ -257,6 +258,18 @@ public abstract class IndexDocValues implements Closeable { /** Returns value for specified ord. */ public abstract BytesRef getByOrd(int ord, BytesRef bytesRef); + /** + * Returns the PackedInts.Reader impl that maps document to ord. + */ + public abstract PackedInts.Reader getDocToOrd(); + + /** + * Returns the comparator used to order the BytesRefs. + */ + public Comparator getComparator() { + return comparator; + } + /** * Performs a lookup by value. * @@ -304,4 +317,98 @@ public abstract class IndexDocValues implements Closeable { */ public abstract int getValueCount(); } + + /** Returns a Source that always returns default (missing) + * values for all documents. */ + public static Source getDefaultSource(final ValueType type) { + return new Source(type) { + @Override + public long getInt(int docID) { + return 0; + } + + @Override + public double getFloat(int docID) { + return 0.0; + } + + @Override + public BytesRef getBytes(int docID, BytesRef ref) { + ref.length = 0; + return ref; + } + }; + } + + /** Returns a SortedSource that always returns default (missing) + * values for all documents. */ + public static SortedSource getDefaultSortedSource(final ValueType type, final int size) { + + final PackedInts.Reader docToOrd = new PackedInts.Reader() { + @Override + public long get(int index) { + return 0; + } + + @Override + public int getBitsPerValue() { + return 0; + } + + @Override + public int size() { + return size; + } + + @Override + public boolean hasArray() { + return false; + } + + @Override + public Object getArray() { + return null; + } + }; + + return new SortedSource(type, BytesRef.getUTF8SortedAsUnicodeComparator()) { + + @Override + public BytesRef getBytes(int docID, BytesRef ref) { + ref.length = 0; + return ref; + } + + @Override + public int ord(int docID) { + return 0; + } + + @Override + public BytesRef getByOrd(int ord, BytesRef bytesRef) { + assert ord == 0; + bytesRef.length = 0; + return bytesRef; + } + + @Override + public PackedInts.Reader getDocToOrd() { + return docToOrd; + } + + @Override + public int getByValue(BytesRef value, BytesRef spare) { + if (value.length == 0) { + return 0; + } else { + return -1; + } + } + + @Override + public int getValueCount() { + return 1; + } + }; + } } diff --git a/lucene/src/java/org/apache/lucene/index/values/SortedBytesMergeUtils.java b/lucene/src/java/org/apache/lucene/index/values/SortedBytesMergeUtils.java index 6cfc24e583f..fe966677ea8 100644 --- a/lucene/src/java/org/apache/lucene/index/values/SortedBytesMergeUtils.java +++ b/lucene/src/java/org/apache/lucene/index/values/SortedBytesMergeUtils.java @@ -299,6 +299,11 @@ final class SortedBytesMergeUtils { return bytesRef; } + @Override + public PackedInts.Reader getDocToOrd() { + return null; + } + @Override public int getValueCount() { return 1; diff --git a/lucene/src/java/org/apache/lucene/index/values/VarSortedBytesImpl.java b/lucene/src/java/org/apache/lucene/index/values/VarSortedBytesImpl.java index 496b3d82f39..ac58a08b074 100644 --- a/lucene/src/java/org/apache/lucene/index/values/VarSortedBytesImpl.java +++ b/lucene/src/java/org/apache/lucene/index/values/VarSortedBytesImpl.java @@ -214,6 +214,11 @@ final class VarSortedBytesImpl { return (int) docToOrdIndex.get(docID); } + @Override + public PackedInts.Reader getDocToOrd() { + return docToOrdIndex; + } + @Override public BytesRef getByOrd(int ord, BytesRef bytesRef) { try { diff --git a/lucene/src/java/org/apache/lucene/search/FieldComparator.java 
b/lucene/src/java/org/apache/lucene/search/FieldComparator.java index 9f8f09f16c9..9dd0ed4a96e 100644 --- a/lucene/src/java/org/apache/lucene/search/FieldComparator.java +++ b/lucene/src/java/org/apache/lucene/search/FieldComparator.java @@ -18,10 +18,14 @@ package org.apache.lucene.search; */ import java.io.IOException; +import java.util.Comparator; import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.values.IndexDocValues.SortedSource; import org.apache.lucene.index.values.IndexDocValues.Source; import org.apache.lucene.index.values.IndexDocValues; +import org.apache.lucene.index.values.ValueType; import org.apache.lucene.search.FieldCache.ByteParser; import org.apache.lucene.search.FieldCache.DocTerms; import org.apache.lucene.search.FieldCache.DocTermsIndex; @@ -399,6 +403,8 @@ public abstract class FieldComparator { final IndexDocValues docValues = context.reader.docValues(field); if (docValues != null) { currentReaderValues = docValues.getSource(); + } else { + currentReaderValues = IndexDocValues.getDefaultSource(ValueType.FLOAT_64); } return this; } @@ -690,6 +696,8 @@ public abstract class FieldComparator { IndexDocValues docValues = context.reader.docValues(field); if (docValues != null) { currentReaderValues = docValues.getSource(); + } else { + currentReaderValues = IndexDocValues.getDefaultSource(ValueType.FIXED_INTS_64); } return this; } @@ -911,30 +919,53 @@ public abstract class FieldComparator { * than {@link TermValComparator}. For very small * result sets it may be slower. */ public static final class TermOrdValComparator extends FieldComparator { - /** @lucene.internal */ + /* Ords for each slot. + @lucene.internal */ final int[] ords; - /** @lucene.internal */ + + /* Values for each slot. + @lucene.internal */ final BytesRef[] values; - /** @lucene.internal */ + + /* Which reader last copied a value into the slot. When + we compare two slots, we just compare-by-ord if the + readerGen is the same; else we must compare the + values (slower). + @lucene.internal */ final int[] readerGen; - /** @lucene.internal */ + /* Gen of current reader we are on. + @lucene.internal */ int currentReaderGen = -1; - private DocTermsIndex termsIndex; + + /* Current reader's doc ord/values. + @lucene.internal */ + DocTermsIndex termsIndex; + private final String field; - /** @lucene.internal */ + /* Bottom slot, or -1 if queue isn't full yet + @lucene.internal */ int bottomSlot = -1; - /** @lucene.internal */ + + /* Bottom ord (same as ords[bottomSlot] once bottomSlot + is set). Cached for faster compares. + @lucene.internal */ int bottomOrd; - /** @lucene.internal */ + + /* True if current bottom slot matches the current + reader. + @lucene.internal */ boolean bottomSameReader; - /** @lucene.internal */ + + /* Bottom value (same as values[bottomSlot] once + bottomSlot is set). Cached for faster compares. 
+ @lucene.internal */ BytesRef bottomValue; - /** @lucene.internal */ + final BytesRef tempBR = new BytesRef(); - public TermOrdValComparator(int numHits, String field, int sortPos, boolean reversed) { + public TermOrdValComparator(int numHits, String field) { ords = new int[numHits]; values = new BytesRef[numHits]; readerGen = new int[numHits]; @@ -1325,6 +1356,396 @@ public abstract class FieldComparator { } } + /** Sorts by field's natural Term sort order, using + * ordinals; this is just like {@link + * TermOrdValComparator} except it uses DocValues to + * retrieve the sort ords saved during indexing. */ + public static final class TermOrdValDocValuesComparator extends FieldComparator { + /* Ords for each slot. + @lucene.internal */ + final int[] ords; + + /* Values for each slot. + @lucene.internal */ + final BytesRef[] values; + + /* Which reader last copied a value into the slot. When + we compare two slots, we just compare-by-ord if the + readerGen is the same; else we must compare the + values (slower). + @lucene.internal */ + final int[] readerGen; + + /* Gen of current reader we are on. + @lucene.internal */ + int currentReaderGen = -1; + + /* Current reader's doc ord/values. + @lucene.internal */ + SortedSource termsIndex; + + /* Comparator for comparing by value. + @lucene.internal */ + Comparator comp; + + private final String field; + + /* Bottom slot, or -1 if queue isn't full yet + @lucene.internal */ + int bottomSlot = -1; + + /* Bottom ord (same as ords[bottomSlot] once bottomSlot + is set). Cached for faster compares. + @lucene.internal */ + int bottomOrd; + + /* True if current bottom slot matches the current + reader. + @lucene.internal */ + boolean bottomSameReader; + + /* Bottom value (same as values[bottomSlot] once + bottomSlot is set). Cached for faster compares. + @lucene.internal */ + BytesRef bottomValue; + + /** @lucene.internal */ + final BytesRef tempBR = new BytesRef(); + + public TermOrdValDocValuesComparator(int numHits, String field) { + ords = new int[numHits]; + values = new BytesRef[numHits]; + readerGen = new int[numHits]; + this.field = field; + } + + @Override + public int compare(int slot1, int slot2) { + if (readerGen[slot1] == readerGen[slot2]) { + return ords[slot1] - ords[slot2]; + } + + final BytesRef val1 = values[slot1]; + final BytesRef val2 = values[slot2]; + if (val1 == null) { + if (val2 == null) { + return 0; + } + return -1; + } else if (val2 == null) { + return 1; + } + return comp.compare(val1, val2); + } + + @Override + public int compareBottom(int doc) { + throw new UnsupportedOperationException(); + } + + @Override + public void copy(int slot, int doc) { + throw new UnsupportedOperationException(); + } + + // TODO: would be nice to share these specialized impls + // w/ TermOrdValComparator + + /** Base class for specialized (per bit width of the + * ords) per-segment comparator. 
NOTE: this is messy; + * we do this only because hotspot can't reliably inline + * the underlying array access when looking up doc->ord + * @lucene.internal + */ + abstract class PerSegmentComparator extends FieldComparator { + + @Override + public FieldComparator setNextReader(AtomicReaderContext context) throws IOException { + return TermOrdValDocValuesComparator.this.setNextReader(context); + } + + @Override + public int compare(int slot1, int slot2) { + return TermOrdValDocValuesComparator.this.compare(slot1, slot2); + } + + @Override + public void setBottom(final int bottom) { + TermOrdValDocValuesComparator.this.setBottom(bottom); + } + + @Override + public BytesRef value(int slot) { + return TermOrdValDocValuesComparator.this.value(slot); + } + + @Override + public int compareValues(BytesRef val1, BytesRef val2) { + assert val1 != null; + assert val2 != null; + return comp.compare(val1, val2); + } + } + + // Used per-segment when bit width of doc->ord is 8: + private final class ByteOrdComparator extends PerSegmentComparator { + private final byte[] readerOrds; + private final SortedSource termsIndex; + private final int docBase; + + public ByteOrdComparator(byte[] readerOrds, SortedSource termsIndex, int docBase) { + this.readerOrds = readerOrds; + this.termsIndex = termsIndex; + this.docBase = docBase; + } + + @Override + public int compareBottom(int doc) { + assert bottomSlot != -1; + if (bottomSameReader) { + // ord is precisely comparable, even in the equal case + return bottomOrd - (readerOrds[doc]&0xFF); + } else { + // ord is only approx comparable: if they are not + // equal, we can use that; if they are equal, we + // must fallback to compare by value + final int order = readerOrds[doc]&0xFF; + final int cmp = bottomOrd - order; + if (cmp != 0) { + return cmp; + } + + termsIndex.getByOrd(order, tempBR); + return comp.compare(bottomValue, tempBR); + } + } + + @Override + public void copy(int slot, int doc) { + final int ord = readerOrds[doc]&0xFF; + ords[slot] = ord; + if (values[slot] == null) { + values[slot] = new BytesRef(); + } + termsIndex.getByOrd(ord, values[slot]); + readerGen[slot] = currentReaderGen; + } + } + + // Used per-segment when bit width of doc->ord is 16: + private final class ShortOrdComparator extends PerSegmentComparator { + private final short[] readerOrds; + private final SortedSource termsIndex; + private final int docBase; + + public ShortOrdComparator(short[] readerOrds, SortedSource termsIndex, int docBase) { + this.readerOrds = readerOrds; + this.termsIndex = termsIndex; + this.docBase = docBase; + } + + @Override + public int compareBottom(int doc) { + assert bottomSlot != -1; + if (bottomSameReader) { + // ord is precisely comparable, even in the equal case + return bottomOrd - (readerOrds[doc]&0xFFFF); + } else { + // ord is only approx comparable: if they are not + // equal, we can use that; if they are equal, we + // must fallback to compare by value + final int order = readerOrds[doc]&0xFFFF; + final int cmp = bottomOrd - order; + if (cmp != 0) { + return cmp; + } + + termsIndex.getByOrd(order, tempBR); + return comp.compare(bottomValue, tempBR); + } + } + + @Override + public void copy(int slot, int doc) { + final int ord = readerOrds[doc]&0xFFFF; + ords[slot] = ord; + if (values[slot] == null) { + values[slot] = new BytesRef(); + } + termsIndex.getByOrd(ord, values[slot]); + readerGen[slot] = currentReaderGen; + } + } + + // Used per-segment when bit width of doc->ord is 32: + private final class IntOrdComparator extends 
PerSegmentComparator { + private final int[] readerOrds; + private final SortedSource termsIndex; + private final int docBase; + + public IntOrdComparator(int[] readerOrds, SortedSource termsIndex, int docBase) { + this.readerOrds = readerOrds; + this.termsIndex = termsIndex; + this.docBase = docBase; + } + + @Override + public int compareBottom(int doc) { + assert bottomSlot != -1; + if (bottomSameReader) { + // ord is precisely comparable, even in the equal case + return bottomOrd - readerOrds[doc]; + } else { + // ord is only approx comparable: if they are not + // equal, we can use that; if they are equal, we + // must fallback to compare by value + final int order = readerOrds[doc]; + final int cmp = bottomOrd - order; + if (cmp != 0) { + return cmp; + } + termsIndex.getByOrd(order, tempBR); + return comp.compare(bottomValue, tempBR); + } + } + + @Override + public void copy(int slot, int doc) { + final int ord = readerOrds[doc]; + ords[slot] = ord; + if (values[slot] == null) { + values[slot] = new BytesRef(); + } + termsIndex.getByOrd(ord, values[slot]); + readerGen[slot] = currentReaderGen; + } + } + + // Used per-segment when bit width is not a native array + // size (8, 16, 32): + private final class AnyOrdComparator extends PerSegmentComparator { + private final PackedInts.Reader readerOrds; + private final int docBase; + + public AnyOrdComparator(PackedInts.Reader readerOrds, int docBase) { + this.readerOrds = readerOrds; + this.docBase = docBase; + } + + @Override + public int compareBottom(int doc) { + assert bottomSlot != -1; + if (bottomSameReader) { + // ord is precisely comparable, even in the equal case + return bottomOrd - (int) readerOrds.get(doc); + } else { + // ord is only approx comparable: if they are not + // equal, we can use that; if they are equal, we + // must fallback to compare by value + final int order = (int) readerOrds.get(doc); + final int cmp = bottomOrd - order; + if (cmp != 0) { + return cmp; + } + termsIndex.getByOrd(order, tempBR); + return comp.compare(bottomValue, tempBR); + } + } + + @Override + public void copy(int slot, int doc) { + final int ord = (int) readerOrds.get(doc); + ords[slot] = ord; + if (values[slot] == null) { + values[slot] = new BytesRef(); + } + termsIndex.getByOrd(ord, values[slot]); + readerGen[slot] = currentReaderGen; + } + } + + @Override + public FieldComparator setNextReader(AtomicReaderContext context) throws IOException { + final int docBase = context.docBase; + + final IndexDocValues dv = context.reader.docValues(field); + if (dv == null) { + termsIndex = IndexDocValues.getDefaultSortedSource(ValueType.BYTES_VAR_SORTED, context.reader.maxDoc()); + } else { + termsIndex = dv.getSource().asSortedSource(); + if (termsIndex == null) { + termsIndex = IndexDocValues.getDefaultSortedSource(ValueType.BYTES_VAR_SORTED, context.reader.maxDoc()); + } + } + + comp = termsIndex.getComparator(); + + FieldComparator perSegComp = null; + final PackedInts.Reader docToOrd = termsIndex.getDocToOrd(); + if (docToOrd.hasArray()) { + final Object arr = docToOrd.getArray(); + assert arr != null; + if (arr instanceof byte[]) { + // 8 bit packed + perSegComp = new ByteOrdComparator((byte[]) arr, termsIndex, docBase); + } else if (arr instanceof short[]) { + // 16 bit packed + perSegComp = new ShortOrdComparator((short[]) arr, termsIndex, docBase); + } else if (arr instanceof int[]) { + // 32 bit packed + perSegComp = new IntOrdComparator((int[]) arr, termsIndex, docBase); + } + } + + if (perSegComp == null) { + perSegComp = new 
AnyOrdComparator(docToOrd, docBase); + } + + currentReaderGen++; + if (bottomSlot != -1) { + perSegComp.setBottom(bottomSlot); + } + + return perSegComp; + } + + @Override + public void setBottom(final int bottom) { + bottomSlot = bottom; + + bottomValue = values[bottomSlot]; + if (currentReaderGen == readerGen[bottomSlot]) { + bottomOrd = ords[bottomSlot]; + bottomSameReader = true; + } else { + if (bottomValue == null) { + // 0 ord is null for all segments + assert ords[bottomSlot] == 0; + bottomOrd = 0; + bottomSameReader = true; + readerGen[bottomSlot] = currentReaderGen; + } else { + final int index = termsIndex.getByValue(bottomValue, tempBR); + if (index < 0) { + bottomOrd = -index - 2; + bottomSameReader = false; + } else { + bottomOrd = index; + // exact value match + bottomSameReader = true; + readerGen[bottomSlot] = currentReaderGen; + ords[bottomSlot] = bottomOrd; + } + } + } + } + + @Override + public BytesRef value(int slot) { + return values[slot]; + } + } + /** Sorts by field's natural Term sort order. All * comparisons are done using BytesRef.compareTo, which is * slow for medium to large result sets but possibly @@ -1410,6 +1831,74 @@ public abstract class FieldComparator { } } + /** Sorts by field's natural Term sort order. All + * comparisons are done using BytesRef.compareTo, which is + * slow for medium to large result sets but possibly + * very fast for very small results sets. The BytesRef + * values are obtained using {@link IndexReader#docValues}. */ + public static final class TermValDocValuesComparator extends FieldComparator { + + private BytesRef[] values; + private Source docTerms; + private final String field; + private BytesRef bottom; + private final BytesRef tempBR = new BytesRef(); + + TermValDocValuesComparator(int numHits, String field) { + values = new BytesRef[numHits]; + this.field = field; + } + + @Override + public int compare(int slot1, int slot2) { + assert values[slot1] != null; + assert values[slot2] != null; + return values[slot1].compareTo(values[slot2]); + } + + @Override + public int compareBottom(int doc) { + assert bottom != null; + return bottom.compareTo(docTerms.getBytes(doc, tempBR)); + } + + @Override + public void copy(int slot, int doc) { + if (values[slot] == null) { + values[slot] = new BytesRef(); + } + docTerms.getBytes(doc, values[slot]); + } + + @Override + public FieldComparator setNextReader(AtomicReaderContext context) throws IOException { + final IndexDocValues dv = context.reader.docValues(field); + if (dv != null) { + docTerms = dv.getSource(); + } else { + docTerms = IndexDocValues.getDefaultSource(ValueType.BYTES_VAR_DEREF); + } + return this; + } + + @Override + public void setBottom(final int bottom) { + this.bottom = values[bottom]; + } + + @Override + public BytesRef value(int slot) { + return values[slot]; + } + + @Override + public int compareValues(BytesRef val1, BytesRef val2) { + assert val1 != null; + assert val2 != null; + return val1.compareTo(val2); + } + } + final protected static int binarySearch(BytesRef br, DocTermsIndex a, BytesRef key) { return binarySearch(br, a, key, 1, a.numOrd()-1); } diff --git a/lucene/src/java/org/apache/lucene/search/SortField.java b/lucene/src/java/org/apache/lucene/search/SortField.java index 42cf6aaa974..bbd47147a4c 100644 --- a/lucene/src/java/org/apache/lucene/search/SortField.java +++ b/lucene/src/java/org/apache/lucene/search/SortField.java @@ -254,6 +254,7 @@ public class SortField { @Override public String toString() { StringBuilder buffer = new StringBuilder(); + 
String dv = useIndexValues ? " [dv]" : ""; switch (type) { case SCORE: buffer.append(""); @@ -264,11 +265,11 @@ public class SortField { break; case STRING: - buffer.append(""); + buffer.append(""); break; case STRING_VAL: - buffer.append(""); + buffer.append(""); break; case BYTE: @@ -280,7 +281,7 @@ public class SortField { break; case INT: - buffer.append(""); + buffer.append(""); break; case LONG: @@ -288,11 +289,11 @@ public class SortField { break; case FLOAT: - buffer.append(""); + buffer.append(""); break; case DOUBLE: - buffer.append(""); + buffer.append(""); break; case CUSTOM: @@ -415,10 +416,18 @@ public class SortField { return comparatorSource.newComparator(field, numHits, sortPos, reverse); case STRING: - return new FieldComparator.TermOrdValComparator(numHits, field, sortPos, reverse); + if (useIndexValues) { + return new FieldComparator.TermOrdValDocValuesComparator(numHits, field); + } else { + return new FieldComparator.TermOrdValComparator(numHits, field); + } case STRING_VAL: - return new FieldComparator.TermValComparator(numHits, field); + if (useIndexValues) { + return new FieldComparator.TermValDocValuesComparator(numHits, field); + } else { + return new FieldComparator.TermValComparator(numHits, field); + } case REWRITEABLE: throw new IllegalStateException("SortField needs to be rewritten through Sort.rewrite(..) and SortField.rewrite(..)"); diff --git a/lucene/src/test/org/apache/lucene/search/TestSort.java b/lucene/src/test/org/apache/lucene/search/TestSort.java index b3112516329..7ddb2d9c402 100644 --- a/lucene/src/test/org/apache/lucene/search/TestSort.java +++ b/lucene/src/test/org/apache/lucene/search/TestSort.java @@ -81,6 +81,7 @@ public class TestSort extends LuceneTestCase { public static void beforeClass() throws Exception { NUM_STRINGS = atLeast(6000); } + // document data: // the tracer field is used to determine which document was hit // the contents field is used to search and sort by relevance @@ -111,7 +112,7 @@ public class TestSort extends LuceneTestCase { { "c", "m", "5", "5.0", "5", null, null, "5", "5", "5", "5", null}, { "d", "m", null, null, null, null, null, null, null, null, null, null} }; - + // create an index of all the documents, or just the x, or just the y documents private IndexSearcher getIndex (boolean even, boolean odd) throws IOException { @@ -119,6 +120,21 @@ public class TestSort extends LuceneTestCase { dirs.add(indexStore); RandomIndexWriter writer = new RandomIndexWriter(random, indexStore, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())); + final ValueType stringDVType; + if (dvStringSorted) { + // Index sorted + stringDVType = random.nextBoolean() ? ValueType.BYTES_VAR_SORTED : ValueType.BYTES_FIXED_SORTED; + } else { + // Index non-sorted + if (random.nextBoolean()) { + // Fixed + stringDVType = random.nextBoolean() ? ValueType.BYTES_FIXED_STRAIGHT : ValueType.BYTES_FIXED_DEREF; + } else { + // Var + stringDVType = random.nextBoolean() ? 
ValueType.BYTES_VAR_STRAIGHT : ValueType.BYTES_VAR_DEREF; + } + } + FieldType ft1 = new FieldType(); ft1.setStored(true); FieldType ft2 = new FieldType(); @@ -142,7 +158,13 @@ public class TestSort extends LuceneTestCase { } doc.add(f); } - if (data[i][4] != null) doc.add (new StringField ("string", data[i][4])); + if (data[i][4] != null) { + Field f = new StringField ("string", data[i][4]); + if (supportsDocValues) { + f = IndexDocValuesField.build(f, stringDVType); + } + doc.add(f); + } if (data[i][5] != null) doc.add (new StringField ("custom", data[i][5])); if (data[i][6] != null) doc.add (new StringField ("i18n", data[i][6])); if (data[i][7] != null) doc.add (new StringField ("long", data[i][7])); @@ -185,21 +207,52 @@ public class TestSort extends LuceneTestCase { setMaxBufferedDocs(4). setMergePolicy(newLogMergePolicy(97)) ); - FieldType customType = new FieldType(); - customType.setStored(true); + FieldType onlyStored = new FieldType(); + onlyStored.setStored(true); + final int fixedLen = getRandomNumber(2, 8); + final int fixedLen2 = getRandomNumber(1, 4); for (int i=0; i= 0)) { // ensure first field is in order fail = true; System.out.println("fail:" + v[j] + " < " + last); + buff.append(" WRONG tracer\n"); } if (cmp == 0) { // ensure second field is in reverse order cmp = v2[j].stringValue().compareTo(lastSub); if (cmp > 0) { fail = true; System.out.println("rev field fail:" + v2[j] + " > " + lastSub); + buff.append(" WRONG tracer2\n"); } else if(cmp == 0) { // ensure docid is in order if (result[x].doc < lastDocId) { fail = true; System.out.println("doc fail:" + result[x].doc + " > " + lastDocId); + buff.append(" WRONG docID\n"); } } } @@ -448,11 +550,10 @@ public class TestSort extends LuceneTestCase { last = v[j].stringValue(); lastSub = v2[j].stringValue(); lastDocId = result[x].doc; - buff.append(v[j] + "(" + v2[j] + ")(" + result[x].doc+") "); } } - if(fail) { - System.out.println("topn field1(field2)(docID):" + buff); + if (fail) { + System.out.println("topn field1(field2)(docID):\n" + buff); } assertFalse("Found sort results out of order", fail); searcher.close(); @@ -549,6 +650,16 @@ public class TestSort extends LuceneTestCase { sort.setSort (useDocValues(new SortField ("float", SortField.Type.FLOAT)), new SortField ("string", SortField.Type.STRING) ); assertMatches (empty, queryX, sort, ""); + + sort.setSort (useDocValues(new SortField ("string", getDVStringSortType(), true)), SortField.FIELD_DOC ); + assertMatches (empty, queryX, sort, ""); + + sort.setSort (useDocValues(new SortField ("float", SortField.Type.FLOAT)), + useDocValues(new SortField ("string", getDVStringSortType())) ); + assertMatches (empty, queryX, sort, ""); + + sort.setSort (useDocValues(new SortField ("float", SortField.Type.FLOAT)), useDocValues(new SortField ("string", getDVStringSortType())) ); + assertMatches (empty, queryX, sort, ""); } static class MyFieldComparator extends FieldComparator { @@ -642,11 +753,18 @@ public class TestSort extends LuceneTestCase { sort.setSort (useDocValues(new SortField ("float", SortField.Type.FLOAT, true)) ); assertMatches (full, queryX, sort, "AECIG"); assertMatches (full, queryY, sort, "BFJHD"); + + sort.setSort (useDocValues(new SortField ("string", getDVStringSortType(), true)) ); + assertMatches (full, queryX, sort, "CEGIA"); + assertMatches (full, queryY, sort, "BFHJD"); } } // test sorting when the sort field is empty (undefined) for some of the documents public void testEmptyFieldSort() throws Exception { + + // NOTE: do not test DocValues fields here, 
since you + // can't sort when some documents don't have the field sort.setSort (new SortField ("string", SortField.Type.STRING) ); assertMatches (full, queryF, sort, "ZJI"); @@ -662,14 +780,6 @@ public class TestSort extends LuceneTestCase { sort.setSort (new SortField ("float", SortField.Type.FLOAT) ); assertMatches (full, queryF, sort, "ZJI"); - if (supportsDocValues) { - sort.setSort (useDocValues(new SortField ("int", SortField.Type.INT)) ); - assertMatches (full, queryF, sort, "IZJ"); - - sort.setSort (useDocValues(new SortField ("float", SortField.Type.FLOAT)) ); - assertMatches (full, queryF, sort, "ZJI"); - } - // using a nonexisting field as first sort key shouldn't make a difference: sort.setSort (new SortField ("nosuchfield", SortField.Type.STRING), new SortField ("float", SortField.Type.FLOAT) ); @@ -679,7 +789,6 @@ public class TestSort extends LuceneTestCase { assertMatches (full, queryF, sort, "IJZ"); // When a field is null for both documents, the next SortField should be used. - // Works for sort.setSort (new SortField ("int", SortField.Type.INT), new SortField ("string", SortField.Type.STRING), new SortField ("float", SortField.Type.FLOAT) ); @@ -688,7 +797,7 @@ public class TestSort extends LuceneTestCase { // Reverse the last criterium to make sure the test didn't pass by chance sort.setSort (new SortField ("int", SortField.Type.INT), new SortField ("string", SortField.Type.STRING), - new SortField ("float", SortField.Type.FLOAT, true) ); + new SortField ("float", SortField.Type.FLOAT, true) ); assertMatches (full, queryG, sort, "ZYXW"); // Do the same for a ParallelMultiSearcher @@ -696,13 +805,13 @@ public class TestSort extends LuceneTestCase { IndexSearcher parallelSearcher=new IndexSearcher (full.getIndexReader(), exec); sort.setSort (new SortField ("int", SortField.Type.INT), - new SortField ("string", SortField.Type.STRING), - new SortField ("float", SortField.Type.FLOAT) ); + new SortField ("string", SortField.Type.STRING), + new SortField ("float", SortField.Type.FLOAT) ); assertMatches (parallelSearcher, queryG, sort, "ZWXY"); sort.setSort (new SortField ("int", SortField.Type.INT), - new SortField ("string", SortField.Type.STRING), - new SortField ("float", SortField.Type.FLOAT, true) ); + new SortField ("string", SortField.Type.STRING), + new SortField ("float", SortField.Type.FLOAT, true) ); assertMatches (parallelSearcher, queryG, sort, "ZYXW"); parallelSearcher.close(); exec.shutdown(); @@ -719,6 +828,20 @@ public class TestSort extends LuceneTestCase { sort.setSort (new SortField ("float", SortField.Type.FLOAT), new SortField ("string", SortField.Type.STRING) ); assertMatches (full, queryX, sort, "GICEA"); + + if (supportsDocValues) { + sort.setSort (useDocValues(new SortField ("int", SortField.Type.INT)), + useDocValues(new SortField ("float", SortField.Type.FLOAT))); + assertMatches (full, queryX, sort, "IGEAC"); + + sort.setSort (useDocValues(new SortField ("int", SortField.Type.INT, true)), + useDocValues(new SortField (null, SortField.Type.DOC, true))); + assertMatches (full, queryX, sort, "CEAGI"); + + sort.setSort (useDocValues(new SortField ("float", SortField.Type.FLOAT)), + useDocValues(new SortField ("string", getDVStringSortType()))); + assertMatches (full, queryX, sort, "GICEA"); + } } // test a variety of sorts using a parallel multisearcher @@ -1064,6 +1187,21 @@ public class TestSort extends LuceneTestCase { sort.setSort(useDocValues(new SortField ("int", SortField.Type.INT, true))); assertMatches(multi, queryF, sort, "JZI"); + + 
sort.setSort(useDocValues(new SortField("string", getDVStringSortType()))); + assertMatches(multi, queryA, sort, "DJAIHGFEBC"); + + sort.setSort(useDocValues(new SortField("string", getDVStringSortType(), true))); + assertMatches(multi, queryA, sort, "CBEFGHIAJD"); + + sort.setSort(useDocValues(new SortField("float", SortField.Type.FLOAT)),useDocValues(new SortField("string", getDVStringSortType()))); + assertMatches(multi, queryA, sort, "GDHJICEFAB"); + + sort.setSort(useDocValues(new SortField ("string", getDVStringSortType()))); + assertMatches(multi, queryF, sort, "ZJI"); + + sort.setSort(useDocValues(new SortField ("string", getDVStringSortType(), true))); + assertMatches(multi, queryF, sort, "IJZ"); } // up to this point, all of the searches should have "sane"