Clean up handling of missing values when merging shard results on the coordinating node.

Today shards are responsible for producing one sort value per document, which
is later used on the coordinating node to resolve the global top documents.
However, this is problematic on string fields with
`missing: _first, order: desc` or `missing: _last, order: asc` given that there
is no such thing as a string that compares greater than any other string. Today
we use a string containing a single code point which is the maximum allowed code
point but this is a hack: instead we should inform the coordinating node that
the document had no value and let it figure out how it should be sorted
depending on whether missing values should be sorted first or last.

Close #9155
This commit is contained in:
Adrien Grand 2015-07-08 14:32:45 +02:00
parent b970fbacee
commit fbab48e451
8 changed files with 69 additions and 65 deletions

View File

@ -413,7 +413,13 @@ public class Lucene {
if (in.readBoolean()) {
field = in.readString();
}
fields[i] = new SortField(field, readSortType(in), in.readBoolean());
SortField.Type sortType = readSortType(in);
Object missingValue = readMissingValue(in);
boolean reverse = in.readBoolean();
fields[i] = new SortField(field, sortType, reverse);
if (missingValue != null) {
fields[i].setMissingValue(missingValue);
}
}
FieldDoc[] fieldDocs = new FieldDoc[in.readVInt()];
@ -485,9 +491,12 @@ public class Lucene {
out.writeString(sortField.getField());
}
if (sortField.getComparatorSource() != null) {
writeSortType(out, ((IndexFieldData.XFieldComparatorSource) sortField.getComparatorSource()).reducedType());
IndexFieldData.XFieldComparatorSource comparatorSource = (IndexFieldData.XFieldComparatorSource) sortField.getComparatorSource();
writeSortType(out, comparatorSource.reducedType());
writeMissingValue(out, comparatorSource.missingValue(sortField.getReverse()));
} else {
writeSortType(out, sortField.getType());
writeMissingValue(out, sortField.missingValue);
}
out.writeBoolean(sortField.getReverse());
}
@ -508,6 +517,31 @@ public class Lucene {
}
}
private static void writeMissingValue(StreamOutput out, Object missingValue) throws IOException {
if (missingValue == SortField.STRING_FIRST) {
out.writeByte((byte) 1);
} else if (missingValue == SortField.STRING_LAST) {
out.writeByte((byte) 2);
} else {
out.writeByte((byte) 0);
out.writeGenericValue(missingValue);
}
}
private static Object readMissingValue(StreamInput in) throws IOException {
final byte id = in.readByte();
switch (id) {
case 0:
return in.readGenericValue();
case 1:
return SortField.STRING_FIRST;
case 2:
return SortField.STRING_LAST;
default:
throw new IOException("Unknown missing value id: " + id);
}
}
public static void writeFieldDoc(StreamOutput out, FieldDoc fieldDoc) throws IOException {
out.writeVInt(fieldDoc.fields.length);
for (Object field : fieldDoc.fields) {

View File

@ -25,13 +25,11 @@ import org.apache.lucene.search.*;
import org.apache.lucene.search.join.BitDocIdSetFilter;
import org.apache.lucene.util.BitDocIdSet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexComponent;
import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.settings.IndexSettings;
@ -112,16 +110,6 @@ public interface IndexFieldData<FD extends AtomicFieldData> extends IndexCompone
// on another node (we don't have the custom source them...)
public abstract class XFieldComparatorSource extends FieldComparatorSource {
/** UTF-8 term containing a single code point: {@link Character#MAX_CODE_POINT} which will compare greater than all other index terms
* since {@link Character#MAX_CODE_POINT} is a noncharacter and thus shouldn't appear in an index term. */
public static final BytesRef MAX_TERM;
static {
BytesRefBuilder builder = new BytesRefBuilder();
final char[] chars = Character.toChars(Character.MAX_CODE_POINT);
builder.copyChars(chars, 0, chars.length);
MAX_TERM = builder.toBytesRef();
}
/**
* Simple wrapper class around a filter that matches parent documents
* and a filter that matches child documents. For every root document R,
@ -179,7 +167,7 @@ public interface IndexFieldData<FD extends AtomicFieldData> extends IndexCompone
return min ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
case STRING:
case STRING_VAL:
return min ? null : MAX_TERM;
return null;
default:
throw new UnsupportedOperationException("Unsupported reduced type: " + reducedType());
}
@ -225,6 +213,18 @@ public interface IndexFieldData<FD extends AtomicFieldData> extends IndexCompone
}
public abstract SortField.Type reducedType();
/**
* Return a missing value that is understandable by {@link SortField#setMissingValue(Object)}.
* Most implementations return null because they already replace the value at the fielddata level.
* However this can't work in case of strings since there is no such thing as a string which
* compares greater than any other string, so in that case we need to return
* {@link SortField#STRING_FIRST} or {@link SortField#STRING_LAST} so that the coordinating node
* knows how to deal with null values.
*/
public Object missingValue(boolean reversed) {
return null;
}
}
interface Builder {

View File

@ -59,6 +59,19 @@ public class BytesRefFieldComparatorSource extends IndexFieldData.XFieldComparat
return SortField.Type.STRING;
}
@Override
public Object missingValue(boolean reversed) {
if (sortMissingFirst(missingValue) || sortMissingLast(missingValue)) {
if (sortMissingLast(missingValue) ^ reversed) {
return SortField.STRING_LAST;
} else {
return SortField.STRING_FIRST;
}
}
// otherwise we fill missing values ourselves
return null;
}
protected SortedBinaryDocValues getValues(LeafReaderContext context) throws IOException {
return indexFieldData.load(context).getBytesValues();
}
@ -97,29 +110,6 @@ public class BytesRefFieldComparatorSource extends IndexFieldData.XFieldComparat
BytesRefFieldComparatorSource.this.setScorer(scorer);
}
@Override
public BytesRef value(int slot) {
// TODO: When serializing the response to the coordinating node, we lose the information about
// whether the comparator sorts missing docs first or last. We should fix it and let
// TopDocs.merge deal with it (it knows how to)
BytesRef value = super.value(slot);
if (value == null) {
assert sortMissingFirst(missingValue) || sortMissingLast(missingValue);
value = missingBytes;
}
return value;
}
@Override
public void setTopValue(BytesRef topValue) {
// symetric of value(int): if we need to feed the comparator with <tt>null</tt>
// if we overrode the value with MAX_TERM in value(int)
if (topValue == missingBytes && (sortMissingFirst(missingValue) || sortMissingLast(missingValue))) {
topValue = null;
}
super.setTopValue(topValue);
}
};
}
@ -156,15 +146,6 @@ public class BytesRefFieldComparatorSource extends IndexFieldData.XFieldComparat
BytesRefFieldComparatorSource.this.setScorer(scorer);
}
@Override
public BytesRef value(int slot) {
BytesRef value = super.value(slot);
if (value == null) {
value = missingBytes;
}
return value;
}
};
}

View File

@ -64,7 +64,6 @@ import static org.elasticsearch.search.internal.InternalSearchHitField.readSearc
public class InternalSearchHit implements SearchHit {
private static final Object[] EMPTY_SORT_VALUES = new Object[0];
private static final Text MAX_TERM_AS_TEXT = new StringAndBytesText(BytesRefFieldComparatorSource.MAX_TERM.utf8ToString());
private transient int docId;
@ -506,13 +505,7 @@ public class InternalSearchHit implements SearchHit {
if (sortValues != null && sortValues.length > 0) {
builder.startArray(Fields.SORT);
for (Object sortValue : sortValues) {
if (sortValue != null && sortValue.equals(MAX_TERM_AS_TEXT)) {
// We don't display MAX_TERM in JSON responses in case some clients have UTF-8 parsers that wouldn't accept a
// non-character in the response, even though this is valid UTF-8
builder.nullValue();
} else {
builder.value(sortValue);
}
builder.value(sortValue);
}
builder.endArray();
}

View File

@ -247,9 +247,9 @@ public abstract class AbstractFieldDataImplTests extends AbstractFieldDataTests
assertThat(topDocs.scoreDocs[5].doc, equalTo(6));
assertThat(((BytesRef) ((FieldDoc) topDocs.scoreDocs[5]).fields[0]).utf8ToString(), equalTo("08"));
assertThat(topDocs.scoreDocs[6].doc, equalTo(1));
assertThat((BytesRef) ((FieldDoc) topDocs.scoreDocs[6]).fields[0], equalTo(XFieldComparatorSource.MAX_TERM));
assertThat((BytesRef) ((FieldDoc) topDocs.scoreDocs[6]).fields[0], equalTo(null));
assertThat(topDocs.scoreDocs[7].doc, equalTo(5));
assertThat((BytesRef) ((FieldDoc) topDocs.scoreDocs[7]).fields[0], equalTo(XFieldComparatorSource.MAX_TERM));
assertThat((BytesRef) ((FieldDoc) topDocs.scoreDocs[7]).fields[0], equalTo(null));
topDocs = searcher.search(new MatchAllDocsQuery(), 10,
new Sort(new SortField("value", indexFieldData.comparatorSource(null, MultiValueMode.MAX, null), true)));

View File

@ -443,13 +443,11 @@ public abstract class AbstractStringFieldDataTests extends AbstractFieldDataImpl
if (cmpValue == null) {
if ("_first".equals(missingValue)) {
cmpValue = new BytesRef();
} else if ("_last".equals(missingValue)) {
cmpValue = XFieldComparatorSource.MAX_TERM;
} else {
} else if ("_last".equals(missingValue) == false) {
cmpValue = (BytesRef) missingValue;
}
}
if (previous != null) {
if (previous != null && cmpValue != null) {
assertTrue(previous.utf8ToString() + " / " + cmpValue.utf8ToString(), previous.compareTo(cmpValue) <= 0);
}
previous = cmpValue;

View File

@ -185,7 +185,7 @@ public class ParentChildFieldDataTests extends AbstractFieldDataTests {
assertThat(topDocs.scoreDocs[6].doc, equalTo(6));
assertThat(((BytesRef) ((FieldDoc) topDocs.scoreDocs[6]).fields[0]).utf8ToString(), equalTo("2"));
assertThat(topDocs.scoreDocs[7].doc, equalTo(7));
assertThat(((BytesRef) ((FieldDoc) topDocs.scoreDocs[7]).fields[0]), equalTo(XFieldComparatorSource.MAX_TERM));
assertThat(((BytesRef) ((FieldDoc) topDocs.scoreDocs[7]).fields[0]), equalTo(null));
topDocs = searcher.search(new MatchAllDocsQuery(), 10, new Sort(new SortField(ParentFieldMapper.NAME, comparator, true)));
assertThat(topDocs.totalHits, equalTo(8));

View File

@ -37,7 +37,6 @@ import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.unit.DistanceUnit;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.mapper.Uid;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders;
@ -2017,10 +2016,9 @@ public class SimpleSortTests extends ElasticsearchIntegrationTest {
.addSort(fieldSort("str_field").order(SortOrder.ASC).unmappedType("string"))
.addSort(fieldSort("str_field2").order(SortOrder.DESC).unmappedType("string")).get();
final StringAndBytesText maxTerm = new StringAndBytesText(IndexFieldData.XFieldComparatorSource.MAX_TERM.utf8ToString());
assertSortValues(resp,
new Object[] {new StringAndBytesText("bcd"), null},
new Object[] {maxTerm, null});
new Object[] {null, null});
resp = client().prepareSearch("test1", "test2")
.addSort(fieldSort("long_field").order(SortOrder.ASC).unmappedType("long"))