LUCENE-5835: TermValComparator can sort missing values last.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1612245 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2014-07-21 11:04:25 +00:00
parent ec788948a6
commit 4bd3a57899
7 changed files with 160 additions and 137 deletions

View File

@ -123,6 +123,8 @@ New Features
format, to include term ordinals in the index so the optional
TermsEnum.ord() and TermsEnum.seekExact(long ord) APIs work. (Mike
McCandless)
* LUCENE-5835: TermValComparator can sort missing values last. (Adrien Grand)
API Changes

View File

@ -687,6 +687,7 @@ public abstract class FieldComparator<T> {
/* Values for each slot.
@lucene.internal */
final BytesRef[] values;
private final BytesRef[] tempBRs;
/* Which reader last copied a value into the slot. When
we compare two slots, we just compare-by-ord if the
@ -729,8 +730,6 @@ public abstract class FieldComparator<T> {
boolean topSameReader;
int topOrd;
final BytesRef tempBR = new BytesRef();
/** -1 if missing values are sorted first, 1 if they are
* sorted last */
final int missingSortCmp;
@ -749,6 +748,7 @@ public abstract class FieldComparator<T> {
public TermOrdValComparator(int numHits, String field, boolean sortMissingLast) {
ords = new int[numHits];
values = new BytesRef[numHits];
tempBRs = new BytesRef[numHits];
readerGen = new int[numHits];
this.field = field;
if (sortMissingLast) {
@ -807,9 +807,10 @@ public abstract class FieldComparator<T> {
values[slot] = null;
} else {
assert ord >= 0;
if (values[slot] == null) {
values[slot] = new BytesRef();
if (tempBRs[slot] == null) {
tempBRs[slot] = new BytesRef();
}
values[slot] = tempBRs[slot];
values[slot].copyBytes(termsIndex.lookupOrd(ord));
}
ords[slot] = ord;
@ -934,15 +935,7 @@ public abstract class FieldComparator<T> {
* comparisons are done using BytesRef.compareTo, which is
* slow for medium to large result sets but possibly
* very fast for very small results sets. */
// TODO: should we remove this? who really uses it?
public static final class TermValComparator extends FieldComparator<BytesRef> {
// sentinels, just used internally in this comparator
private static final byte[] MISSING_BYTES = new byte[0];
// TODO: this is seriously not good, we should nuke this comparator, or
// instead we should represent missing as null, or use missingValue from the user...
// but it was always this way...
private final BytesRef MISSING_BYTESREF = new BytesRef(MISSING_BYTES);
private final BytesRef[] values;
private final BytesRef[] tempBRs;
@ -951,14 +944,16 @@ public abstract class FieldComparator<T> {
private final String field;
private BytesRef bottom;
private BytesRef topValue;
private final int missingSortCmp;
// TODO: add missing first/last support here?
/** Sole constructor. */
TermValComparator(int numHits, String field) {
public TermValComparator(int numHits, String field, boolean sortMissingLast) {
values = new BytesRef[numHits];
tempBRs = new BytesRef[numHits];
this.field = field;
missingSortCmp = sortMissingLast ? 1 : -1;
}
@Override
@ -977,8 +972,8 @@ public abstract class FieldComparator<T> {
@Override
public void copy(int slot, int doc) {
final BytesRef comparableBytes = getComparableBytes(doc, docTerms.get(doc));
if (comparableBytes == MISSING_BYTESREF) {
values[slot] = MISSING_BYTESREF;
if (comparableBytes == null) {
values[slot] = null;
} else {
if (tempBRs[slot] == null) {
tempBRs[slot] = new BytesRef();
@ -988,10 +983,32 @@ public abstract class FieldComparator<T> {
}
}
/** Retrieves the BinaryDocValues for the field in this segment */
protected BinaryDocValues getBinaryDocValues(AtomicReaderContext context, String field) throws IOException {
return DocValues.getBinary(context.reader(), field);
}
/** Retrieves the set of documents that have a value in this segment */
protected Bits getDocsWithField(AtomicReaderContext context, String field) throws IOException {
return DocValues.getDocsWithField(context.reader(), field);
}
/** Check whether the given value represents <tt>null</tt>. This can be
* useful if the {@link BinaryDocValues} returned by {@link #getBinaryDocValues}
* use a special value as a sentinel. The default implementation checks
* {@link #getDocsWithField}.
* <p>NOTE: The null value can only be an EMPTY {@link BytesRef}. */
protected boolean isNull(int doc, BytesRef term) {
return docsWithField != null && docsWithField.get(doc) == false;
}
@Override
public FieldComparator<BytesRef> setNextReader(AtomicReaderContext context) throws IOException {
docTerms = DocValues.getBinary(context.reader(), field);
docsWithField = DocValues.getDocsWithField(context.reader(), field);
docTerms = getBinaryDocValues(context, field);
docsWithField = getDocsWithField(context, field);
if (docsWithField instanceof Bits.MatchAllBits) {
docsWithField = null;
}
return this;
}
@ -1002,12 +1019,8 @@ public abstract class FieldComparator<T> {
@Override
public void setTopValue(BytesRef value) {
if (value == null) {
throw new IllegalArgumentException("value cannot be null");
}
if (value.bytes == MISSING_BYTES) {
value = MISSING_BYTESREF;
}
// null is fine: it means the last doc of the prior
// search was missing this value
topValue = value;
}
@ -1019,13 +1032,13 @@ public abstract class FieldComparator<T> {
@Override
public int compareValues(BytesRef val1, BytesRef val2) {
// missing always sorts first:
if (val1 == MISSING_BYTESREF) {
if (val2 == MISSING_BYTESREF) {
if (val1 == null) {
if (val2 == null) {
return 0;
}
return -1;
} else if (val2 == MISSING_BYTESREF) {
return 1;
return missingSortCmp;
} else if (val2 == null) {
return -missingSortCmp;
}
return val1.compareTo(val2);
}
@ -1038,11 +1051,11 @@ public abstract class FieldComparator<T> {
/**
* Given a document and a term, return the term itself if it exists or
* {@link #MISSING_BYTESREF} otherwise.
* <tt>null</tt> otherwise.
*/
private BytesRef getComparableBytes(int doc, BytesRef term) {
if (term.length == 0 && docsWithField.get(doc) == false) {
return MISSING_BYTESREF;
if (term.length == 0 && isNull(doc, term)) {
return null;
}
return term;
}

View File

@ -142,7 +142,7 @@ public class SortField {
};
public void setMissingValue(Object missingValue) {
if (type == Type.STRING) {
if (type == Type.STRING || type == Type.STRING_VAL) {
if (missingValue != STRING_FIRST && missingValue != STRING_LAST) {
throw new IllegalArgumentException("For STRING type, missing value must be either STRING_FIRST or STRING_LAST");
}
@ -351,8 +351,7 @@ public class SortField {
return new FieldComparator.TermOrdValComparator(numHits, field, missingValue == STRING_LAST);
case STRING_VAL:
// TODO: should we remove this? who really uses it?
return new FieldComparator.TermValComparator(numHits, field);
return new FieldComparator.TermValComparator(numHits, field, missingValue == STRING_LAST);
case REWRITEABLE:
throw new IllegalStateException("SortField needs to be rewritten through Sort.rewrite(..) and SortField.rewrite(..)");

View File

@ -97,6 +97,20 @@ public class TestSearchAfter extends LuceneTestCase {
}
}
// Also test missing first / last for the "string_val" sorts:
for(String field : new String[] {"sortedbytesdocvaluesval", "straightbytesdocvalues"}) {
for(int rev=0;rev<2;rev++) {
boolean reversed = rev == 0;
SortField sf = new SortField(field, SortField.Type.STRING_VAL, reversed);
sf.setMissingValue(SortField.STRING_FIRST);
allSortFields.add(sf);
sf = new SortField(field, SortField.Type.STRING_VAL, reversed);
sf.setMissingValue(SortField.STRING_LAST);
allSortFields.add(sf);
}
}
int limit = allSortFields.size();
for(int i=0;i<limit;i++) {
SortField sf = allSortFields.get(i);

View File

@ -48,6 +48,14 @@ import org.apache.lucene.util.TestUtil;
public class TestSortRandom extends LuceneTestCase {
public void testRandomStringSort() throws Exception {
testRandomStringSort(SortField.Type.STRING);
}
public void testRandomStringValSort() throws Exception {
testRandomStringSort(SortField.Type.STRING_VAL);
}
private void testRandomStringSort(SortField.Type type) throws Exception {
Random random = new Random(random().nextLong());
final int NUM_DOCS = atLeast(100);
@ -125,7 +133,7 @@ public class TestSortRandom extends LuceneTestCase {
final SortField sf;
final boolean sortMissingLast;
final boolean missingIsNull;
sf = new SortField("stringdv", SortField.Type.STRING, reverse);
sf = new SortField("stringdv", type, reverse);
// Can only use sort missing if the DVFormat
// supports docsWithField:
sortMissingLast = defaultCodecSupportsDocsWithField() && random().nextBoolean();

View File

@ -55,8 +55,16 @@ import org.apache.lucene.util.LuceneTestCase;
*/
public class TestFieldCacheSort extends LuceneTestCase {
/** Tests sorting on type string */
public void testString() throws IOException {
testString(SortField.Type.STRING);
}
public void testStringVal() throws Exception {
testString(SortField.Type.STRING_VAL);
}
/** Tests sorting on type string */
private void testString(SortField.Type sortType) throws IOException {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
@ -65,12 +73,13 @@ public class TestFieldCacheSort extends LuceneTestCase {
doc = new Document();
doc.add(newStringField("value", "bar", Field.Store.YES));
writer.addDocument(doc);
Type type = sortType == SortField.Type.STRING ? Type.SORTED : Type.BINARY;
IndexReader ir = UninvertingReader.wrap(writer.getReader(),
Collections.singletonMap("value", Type.SORTED));
Collections.singletonMap("value", type));
writer.shutdown();
IndexSearcher searcher = newSearcher(ir);
Sort sort = new Sort(new SortField("value", SortField.Type.STRING));
Sort sort = new Sort(new SortField("value", sortType));
TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort);
assertEquals(2, td.totalHits);
@ -82,8 +91,16 @@ public class TestFieldCacheSort extends LuceneTestCase {
dir.close();
}
/** Tests sorting on type string with a missing value */
public void testStringMissing() throws IOException {
testStringMissing(SortField.Type.STRING);
}
public void testStringValMissing() throws IOException {
testStringMissing(SortField.Type.STRING_VAL);
}
/** Tests sorting on type string with a missing value */
private void testStringMissing(SortField.Type sortType) throws IOException {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
@ -94,12 +111,13 @@ public class TestFieldCacheSort extends LuceneTestCase {
doc = new Document();
doc.add(newStringField("value", "bar", Field.Store.YES));
writer.addDocument(doc);
Type type = sortType == SortField.Type.STRING ? Type.SORTED : Type.BINARY;
IndexReader ir = UninvertingReader.wrap(writer.getReader(),
Collections.singletonMap("value", Type.SORTED));
Collections.singletonMap("value", type));
writer.shutdown();
IndexSearcher searcher = newSearcher(ir);
Sort sort = new Sort(new SortField("value", SortField.Type.STRING));
Sort sort = new Sort(new SortField("value", sortType));
TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort);
assertEquals(3, td.totalHits);
@ -112,8 +130,16 @@ public class TestFieldCacheSort extends LuceneTestCase {
dir.close();
}
/** Tests reverse sorting on type string */
public void testStringReverse() throws IOException {
testStringReverse(SortField.Type.STRING);
}
public void testStringValReverse() throws IOException {
testStringReverse(SortField.Type.STRING_VAL);
}
/** Tests reverse sorting on type string */
private void testStringReverse(SortField.Type sortType) throws IOException {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
@ -122,12 +148,13 @@ public class TestFieldCacheSort extends LuceneTestCase {
doc = new Document();
doc.add(newStringField("value", "foo", Field.Store.YES));
writer.addDocument(doc);
Type type = sortType == SortField.Type.STRING ? Type.SORTED : Type.BINARY;
IndexReader ir = UninvertingReader.wrap(writer.getReader(),
Collections.singletonMap("value", Type.SORTED));
Collections.singletonMap("value", type));
writer.shutdown();
IndexSearcher searcher = newSearcher(ir);
Sort sort = new Sort(new SortField("value", SortField.Type.STRING, true));
Sort sort = new Sort(new SortField("value", sortType, true));
TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort);
assertEquals(2, td.totalHits);
@ -139,66 +166,17 @@ public class TestFieldCacheSort extends LuceneTestCase {
dir.close();
}
/** Tests sorting on type string_val */
public void testStringVal() throws IOException {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(newStringField("value", "foo", Field.Store.YES));
writer.addDocument(doc);
doc = new Document();
doc.add(newStringField("value", "bar", Field.Store.YES));
writer.addDocument(doc);
IndexReader ir = UninvertingReader.wrap(writer.getReader(),
Collections.singletonMap("value", Type.BINARY));
writer.shutdown();
IndexSearcher searcher = newSearcher(ir);
Sort sort = new Sort(new SortField("value", SortField.Type.STRING_VAL));
TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort);
assertEquals(2, td.totalHits);
// 'bar' comes before 'foo'
assertEquals("bar", searcher.doc(td.scoreDocs[0].doc).get("value"));
assertEquals("foo", searcher.doc(td.scoreDocs[1].doc).get("value"));
ir.close();
dir.close();
public void testStringMissingSortedFirst() throws IOException {
testStringMissingSortedFirst(SortField.Type.STRING);
}
/** Tests sorting on type string_val with a missing value */
public void testStringValMissing() throws IOException {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
writer.addDocument(doc);
doc = new Document();
doc.add(newStringField("value", "foo", Field.Store.YES));
writer.addDocument(doc);
doc = new Document();
doc.add(newStringField("value", "bar", Field.Store.YES));
writer.addDocument(doc);
IndexReader ir = UninvertingReader.wrap(writer.getReader(),
Collections.singletonMap("value", Type.BINARY));
writer.shutdown();
IndexSearcher searcher = newSearcher(ir);
Sort sort = new Sort(new SortField("value", SortField.Type.STRING_VAL));
TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort);
assertEquals(3, td.totalHits);
// null comes first
assertNull(searcher.doc(td.scoreDocs[0].doc).get("value"));
assertEquals("bar", searcher.doc(td.scoreDocs[1].doc).get("value"));
assertEquals("foo", searcher.doc(td.scoreDocs[2].doc).get("value"));
ir.close();
dir.close();
public void testStringValMissingSortedFirst() throws IOException {
testStringMissingSortedFirst(SortField.Type.STRING_VAL);
}
/** Tests sorting on type string with a missing
* value sorted first */
public void testStringMissingSortedFirst() throws IOException {
private void testStringMissingSortedFirst(SortField.Type sortType) throws IOException {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
@ -209,12 +187,13 @@ public class TestFieldCacheSort extends LuceneTestCase {
doc = new Document();
doc.add(newStringField("value", "bar", Field.Store.YES));
writer.addDocument(doc);
Type type = sortType == SortField.Type.STRING ? Type.SORTED : Type.BINARY;
IndexReader ir = UninvertingReader.wrap(writer.getReader(),
Collections.singletonMap("value", Type.SORTED));
Collections.singletonMap("value", type));
writer.shutdown();
IndexSearcher searcher = newSearcher(ir);
SortField sf = new SortField("value", SortField.Type.STRING);
SortField sf = new SortField("value", sortType);
Sort sort = new Sort(sf);
TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort);
@ -228,9 +207,17 @@ public class TestFieldCacheSort extends LuceneTestCase {
dir.close();
}
public void testStringMissingSortedFirstReverse() throws IOException {
testStringMissingSortedFirstReverse(SortField.Type.STRING);
}
public void testStringValMissingSortedFirstReverse() throws IOException {
testStringMissingSortedFirstReverse(SortField.Type.STRING_VAL);
}
/** Tests reverse sorting on type string with a missing
* value sorted first */
public void testStringMissingSortedFirstReverse() throws IOException {
private void testStringMissingSortedFirstReverse(SortField.Type sortType) throws IOException {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
@ -241,12 +228,13 @@ public class TestFieldCacheSort extends LuceneTestCase {
doc = new Document();
doc.add(newStringField("value", "bar", Field.Store.YES));
writer.addDocument(doc);
Type type = sortType == SortField.Type.STRING ? Type.SORTED : Type.BINARY;
IndexReader ir = UninvertingReader.wrap(writer.getReader(),
Collections.singletonMap("value", Type.SORTED));
Collections.singletonMap("value", type));
writer.shutdown();
IndexSearcher searcher = newSearcher(ir);
SortField sf = new SortField("value", SortField.Type.STRING, true);
SortField sf = new SortField("value", sortType, true);
Sort sort = new Sort(sf);
TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort);
@ -260,9 +248,17 @@ public class TestFieldCacheSort extends LuceneTestCase {
dir.close();
}
public void testStringMissingSortedLast() throws IOException {
testStringMissingSortedLast(SortField.Type.STRING);
}
public void testStringValMissingSortedLast() throws IOException {
testStringMissingSortedLast(SortField.Type.STRING_VAL);
}
/** Tests sorting on type string with a missing
* value sorted last */
public void testStringValMissingSortedLast() throws IOException {
private void testStringMissingSortedLast(SortField.Type sortType) throws IOException {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
@ -273,12 +269,13 @@ public class TestFieldCacheSort extends LuceneTestCase {
doc = new Document();
doc.add(newStringField("value", "bar", Field.Store.YES));
writer.addDocument(doc);
Type type = sortType == SortField.Type.STRING ? Type.SORTED : Type.BINARY;
IndexReader ir = UninvertingReader.wrap(writer.getReader(),
Collections.singletonMap("value", Type.SORTED));
Collections.singletonMap("value", type));
writer.shutdown();
IndexSearcher searcher = newSearcher(ir);
SortField sf = new SortField("value", SortField.Type.STRING);
SortField sf = new SortField("value", sortType);
sf.setMissingValue(SortField.STRING_LAST);
Sort sort = new Sort(sf);
@ -293,9 +290,17 @@ public class TestFieldCacheSort extends LuceneTestCase {
dir.close();
}
public void testStringMissingSortedLastReverse() throws IOException {
testStringMissingSortedLastReverse(SortField.Type.STRING);
}
public void testStringValMissingSortedLastReverse() throws IOException {
testStringMissingSortedLastReverse(SortField.Type.STRING_VAL);
}
/** Tests reverse sorting on type string with a missing
* value sorted last */
public void testStringValMissingSortedLastReverse() throws IOException {
private void testStringMissingSortedLastReverse(SortField.Type sortType) throws IOException {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
@ -306,12 +311,13 @@ public class TestFieldCacheSort extends LuceneTestCase {
doc = new Document();
doc.add(newStringField("value", "bar", Field.Store.YES));
writer.addDocument(doc);
Type type = sortType == SortField.Type.STRING ? Type.SORTED : Type.BINARY;
IndexReader ir = UninvertingReader.wrap(writer.getReader(),
Collections.singletonMap("value", Type.SORTED));
Collections.singletonMap("value", type));
writer.shutdown();
IndexSearcher searcher = newSearcher(ir);
SortField sf = new SortField("value", SortField.Type.STRING, true);
SortField sf = new SortField("value", sortType, true);
sf.setMissingValue(SortField.STRING_LAST);
Sort sort = new Sort(sf);
@ -326,33 +332,6 @@ public class TestFieldCacheSort extends LuceneTestCase {
dir.close();
}
/** Tests reverse sorting on type string_val */
public void testStringValReverse() throws IOException {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(newStringField("value", "bar", Field.Store.YES));
writer.addDocument(doc);
doc = new Document();
doc.add(newStringField("value", "foo", Field.Store.YES));
writer.addDocument(doc);
IndexReader ir = UninvertingReader.wrap(writer.getReader(),
Collections.singletonMap("value", Type.BINARY));
writer.shutdown();
IndexSearcher searcher = newSearcher(ir);
Sort sort = new Sort(new SortField("value", SortField.Type.STRING_VAL, true));
TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort);
assertEquals(2, td.totalHits);
// 'foo' comes after 'bar' in reverse order
assertEquals("foo", searcher.doc(td.scoreDocs[0].doc).get("value"));
assertEquals("bar", searcher.doc(td.scoreDocs[1].doc).get("value"));
ir.close();
dir.close();
}
/** Tests sorting on internal docid order */
public void testFieldDoc() throws Exception {
Directory dir = newDirectory();

View File

@ -60,6 +60,14 @@ import org.apache.lucene.util.TestUtil;
public class TestFieldCacheSortRandom extends LuceneTestCase {
public void testRandomStringSort() throws Exception {
testRandomStringSort(SortField.Type.STRING);
}
public void testRandomStringValSort() throws Exception {
testRandomStringSort(SortField.Type.STRING_VAL);
}
private void testRandomStringSort(SortField.Type type) throws Exception {
Random random = new Random(random().nextLong());
final int NUM_DOCS = atLeast(100);
@ -138,7 +146,7 @@ public class TestFieldCacheSortRandom extends LuceneTestCase {
final SortField sf;
final boolean sortMissingLast;
final boolean missingIsNull;
sf = new SortField("stringdv", SortField.Type.STRING, reverse);
sf = new SortField("stringdv", type, reverse);
sortMissingLast = random().nextBoolean();
missingIsNull = true;