Clean up handling of missing values when merging shard results on the coordinating node.

Today shards are responsible for producing one sort value per document, which is later used on the coordinating node to resolve the global top documents. However, this is problematic on string fields with `missing: _first, order: desc` or `missing: _last, order: asc` given that there is no such thing as a string that compares greater than any other string. Today we use a string containing a single code point which is the maximum allowed code point but this is a hack: instead we should inform the coordinating node that the document had no value and let it figure out how it should be sorted depending on whether missing values should be sorted first or last. Close #9155
2015-07-08 14:32:45 +02:00 · 2015-07-08 14:32:45 +02:00 · fbab48e451
parent b970fbacee
commit fbab48e451
8 changed files with 69 additions and 65 deletions
--- a/core/src/main/java/org/elasticsearch/common/lucene/Lucene.java
+++ b/core/src/main/java/org/elasticsearch/common/lucene/Lucene.java
@ -413,7 +413,13 @@ public class Lucene {
                if (in.readBoolean()) {
                    field = in.readString();
                }
-                fields[i] = new SortField(field, readSortType(in), in.readBoolean());
+                SortField.Type sortType = readSortType(in);
+                Object missingValue = readMissingValue(in);
+                boolean reverse = in.readBoolean();
+                fields[i] = new SortField(field, sortType, reverse);
+                if (missingValue != null) {
+                    fields[i].setMissingValue(missingValue);
+                }
            }

            FieldDoc[] fieldDocs = new FieldDoc[in.readVInt()];
@ -485,9 +491,12 @@ public class Lucene {
                    out.writeString(sortField.getField());
                }
                if (sortField.getComparatorSource() != null) {
-                    writeSortType(out, ((IndexFieldData.XFieldComparatorSource) sortField.getComparatorSource()).reducedType());
+                    IndexFieldData.XFieldComparatorSource comparatorSource = (IndexFieldData.XFieldComparatorSource) sortField.getComparatorSource();
+                    writeSortType(out, comparatorSource.reducedType());
+                    writeMissingValue(out, comparatorSource.missingValue(sortField.getReverse()));
                } else {
                    writeSortType(out, sortField.getType());
+                    writeMissingValue(out, sortField.missingValue);
                }
                out.writeBoolean(sortField.getReverse());
            }
@ -508,6 +517,31 @@ public class Lucene {
        }
    }

+    private static void writeMissingValue(StreamOutput out, Object missingValue) throws IOException {
+        if (missingValue == SortField.STRING_FIRST) {
+            out.writeByte((byte) 1);
+        } else if (missingValue == SortField.STRING_LAST) {
+            out.writeByte((byte) 2);
+        } else {
+            out.writeByte((byte) 0);
+            out.writeGenericValue(missingValue);
+        }
+    }
+
+    private static Object readMissingValue(StreamInput in) throws IOException {
+        final byte id = in.readByte();
+        switch (id) {
+        case 0:
+            return in.readGenericValue();
+        case 1:
+            return SortField.STRING_FIRST;
+        case 2:
+            return SortField.STRING_LAST;
+        default:
+            throw new IOException("Unknown missing value id: " + id);
+        }
+    }
+
    public static void writeFieldDoc(StreamOutput out, FieldDoc fieldDoc) throws IOException {
        out.writeVInt(fieldDoc.fields.length);
        for (Object field : fieldDoc.fields) {
--- a/core/src/main/java/org/elasticsearch/index/fielddata/IndexFieldData.java
+++ b/core/src/main/java/org/elasticsearch/index/fielddata/IndexFieldData.java
@ -25,13 +25,11 @@ import org.apache.lucene.search.*;
 import org.apache.lucene.search.join.BitDocIdSetFilter;
 import org.apache.lucene.util.BitDocIdSet;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefBuilder;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexComponent;
 import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
-import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.MapperService;
 import org.elasticsearch.index.settings.IndexSettings;
@ -112,16 +110,6 @@ public interface IndexFieldData<FD extends AtomicFieldData> extends IndexCompone
    // on another node (we don't have the custom source them...)
    public abstract class XFieldComparatorSource extends FieldComparatorSource {

-        /** UTF-8 term containing a single code point: {@link Character#MAX_CODE_POINT} which will compare greater than all other index terms
-         *  since {@link Character#MAX_CODE_POINT} is a noncharacter and thus shouldn't appear in an index term. */
-        public static final BytesRef MAX_TERM;
-        static {
-            BytesRefBuilder builder = new BytesRefBuilder();
-            final char[] chars = Character.toChars(Character.MAX_CODE_POINT);
-            builder.copyChars(chars, 0, chars.length);
-            MAX_TERM = builder.toBytesRef();
-        }
-
        /**
         * Simple wrapper class around a filter that matches parent documents
         * and a filter that matches child documents. For every root document R,
@ -179,7 +167,7 @@ public interface IndexFieldData<FD extends AtomicFieldData> extends IndexCompone
                    return min ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
                case STRING:
                case STRING_VAL:
-                    return min ? null : MAX_TERM;
+                    return null;
                default:
                    throw new UnsupportedOperationException("Unsupported reduced type: " + reducedType());
                }
@ -225,6 +213,18 @@ public interface IndexFieldData<FD extends AtomicFieldData> extends IndexCompone
        }

        public abstract SortField.Type reducedType();
+
+        /**
+         * Return a missing value that is understandable by {@link SortField#setMissingValue(Object)}.
+         * Most implementations return null because they already replace the value at the fielddata level.
+         * However this can't work in case of strings since there is no such thing as a string which
+         * compares greater than any other string, so in that case we need to return
+         * {@link SortField#STRING_FIRST} or {@link SortField#STRING_LAST} so that the coordinating node
+         * knows how to deal with null values.
+         */
+        public Object missingValue(boolean reversed) {
+            return null;
+        }
    }

    interface Builder {
--- a/core/src/main/java/org/elasticsearch/index/fielddata/fieldcomparator/BytesRefFieldComparatorSource.java
+++ b/core/src/main/java/org/elasticsearch/index/fielddata/fieldcomparator/BytesRefFieldComparatorSource.java
@ -59,6 +59,19 @@ public class BytesRefFieldComparatorSource extends IndexFieldData.XFieldComparat
        return SortField.Type.STRING;
    }

+    @Override
+    public Object missingValue(boolean reversed) {
+        if (sortMissingFirst(missingValue) || sortMissingLast(missingValue)) {
+            if (sortMissingLast(missingValue) ^ reversed) {
+                return SortField.STRING_LAST;
+            } else {
+                return SortField.STRING_FIRST;
+            }
+        }
+        // otherwise we fill missing values ourselves
+        return null;
+    }
+
    protected SortedBinaryDocValues getValues(LeafReaderContext context) throws IOException {
        return indexFieldData.load(context).getBytesValues();
    }
@ -97,29 +110,6 @@ public class BytesRefFieldComparatorSource extends IndexFieldData.XFieldComparat
                    BytesRefFieldComparatorSource.this.setScorer(scorer);
                }

-                @Override
-                public BytesRef value(int slot) {
-                    // TODO: When serializing the response to the coordinating node, we lose the information about
-                    // whether the comparator sorts missing docs first or last. We should fix it and let
-                    // TopDocs.merge deal with it (it knows how to)
-                    BytesRef value = super.value(slot);
-                    if (value == null) {
-                        assert sortMissingFirst(missingValue) || sortMissingLast(missingValue);
-                        value = missingBytes;
-                    }
-                    return value;
-                }
-                
-                @Override
-                public void setTopValue(BytesRef topValue) {
-                    // symetric of value(int): if we need to feed the comparator with <tt>null</tt>
-                    // if we overrode the value with MAX_TERM in value(int)
-                    if (topValue == missingBytes && (sortMissingFirst(missingValue) || sortMissingLast(missingValue))) {
-                        topValue = null;
-                    }
-                    super.setTopValue(topValue);
-                }
-
            };
        }

@ -156,15 +146,6 @@ public class BytesRefFieldComparatorSource extends IndexFieldData.XFieldComparat
                BytesRefFieldComparatorSource.this.setScorer(scorer);
            }

-            @Override
-            public BytesRef value(int slot) {
-                BytesRef value = super.value(slot);
-                if (value == null) {
-                    value = missingBytes;
-                }
-                return value;
-            }
-
        };
    }
    
--- a/core/src/main/java/org/elasticsearch/search/internal/InternalSearchHit.java
+++ b/core/src/main/java/org/elasticsearch/search/internal/InternalSearchHit.java
@ -64,7 +64,6 @@ import static org.elasticsearch.search.internal.InternalSearchHitField.readSearc
 public class InternalSearchHit implements SearchHit {

    private static final Object[] EMPTY_SORT_VALUES = new Object[0];
-    private static final Text MAX_TERM_AS_TEXT = new StringAndBytesText(BytesRefFieldComparatorSource.MAX_TERM.utf8ToString());

    private transient int docId;

@ -506,13 +505,7 @@ public class InternalSearchHit implements SearchHit {
        if (sortValues != null && sortValues.length > 0) {
            builder.startArray(Fields.SORT);
            for (Object sortValue : sortValues) {
-                if (sortValue != null && sortValue.equals(MAX_TERM_AS_TEXT)) {
-                    // We don't display MAX_TERM in JSON responses in case some clients have UTF-8 parsers that wouldn't accept a
-                    // non-character in the response, even though this is valid UTF-8
-                    builder.nullValue();
-                } else {
-                    builder.value(sortValue);
-                }
+                builder.value(sortValue);
            }
            builder.endArray();
        }
--- a/core/src/test/java/org/elasticsearch/index/fielddata/AbstractFieldDataImplTests.java
+++ b/core/src/test/java/org/elasticsearch/index/fielddata/AbstractFieldDataImplTests.java
@ -247,9 +247,9 @@ public abstract class AbstractFieldDataImplTests extends AbstractFieldDataTests
        assertThat(topDocs.scoreDocs[5].doc, equalTo(6));
        assertThat(((BytesRef) ((FieldDoc) topDocs.scoreDocs[5]).fields[0]).utf8ToString(), equalTo("08"));
        assertThat(topDocs.scoreDocs[6].doc, equalTo(1));
-        assertThat((BytesRef) ((FieldDoc) topDocs.scoreDocs[6]).fields[0], equalTo(XFieldComparatorSource.MAX_TERM));
+        assertThat((BytesRef) ((FieldDoc) topDocs.scoreDocs[6]).fields[0], equalTo(null));
        assertThat(topDocs.scoreDocs[7].doc, equalTo(5));
-        assertThat((BytesRef) ((FieldDoc) topDocs.scoreDocs[7]).fields[0], equalTo(XFieldComparatorSource.MAX_TERM));
+        assertThat((BytesRef) ((FieldDoc) topDocs.scoreDocs[7]).fields[0], equalTo(null));

        topDocs = searcher.search(new MatchAllDocsQuery(), 10,
                new Sort(new SortField("value", indexFieldData.comparatorSource(null, MultiValueMode.MAX, null), true)));
--- a/core/src/test/java/org/elasticsearch/index/fielddata/AbstractStringFieldDataTests.java
+++ b/core/src/test/java/org/elasticsearch/index/fielddata/AbstractStringFieldDataTests.java
@ -443,13 +443,11 @@ public abstract class AbstractStringFieldDataTests extends AbstractFieldDataImpl
            if (cmpValue == null) {
                if ("_first".equals(missingValue)) {
                    cmpValue = new BytesRef();
-                } else if ("_last".equals(missingValue)) {
-                    cmpValue = XFieldComparatorSource.MAX_TERM;
-                } else {
+                } else if ("_last".equals(missingValue) == false) {
                    cmpValue = (BytesRef) missingValue;
                }
            }
-            if (previous != null) {
+            if (previous != null && cmpValue != null) {
                assertTrue(previous.utf8ToString() + "   /   " + cmpValue.utf8ToString(), previous.compareTo(cmpValue) <= 0);
            }
            previous = cmpValue;
--- a/core/src/test/java/org/elasticsearch/index/fielddata/ParentChildFieldDataTests.java
+++ b/core/src/test/java/org/elasticsearch/index/fielddata/ParentChildFieldDataTests.java
@ -185,7 +185,7 @@ public class ParentChildFieldDataTests extends AbstractFieldDataTests {
        assertThat(topDocs.scoreDocs[6].doc, equalTo(6));
        assertThat(((BytesRef) ((FieldDoc) topDocs.scoreDocs[6]).fields[0]).utf8ToString(), equalTo("2"));
        assertThat(topDocs.scoreDocs[7].doc, equalTo(7));
-        assertThat(((BytesRef) ((FieldDoc) topDocs.scoreDocs[7]).fields[0]), equalTo(XFieldComparatorSource.MAX_TERM));
+        assertThat(((BytesRef) ((FieldDoc) topDocs.scoreDocs[7]).fields[0]), equalTo(null));

        topDocs = searcher.search(new MatchAllDocsQuery(), 10, new Sort(new SortField(ParentFieldMapper.NAME, comparator, true)));
        assertThat(topDocs.totalHits, equalTo(8));
--- a/core/src/test/java/org/elasticsearch/search/sort/SimpleSortTests.java
+++ b/core/src/test/java/org/elasticsearch/search/sort/SimpleSortTests.java
@ -37,7 +37,6 @@ import org.elasticsearch.common.text.Text;
 import org.elasticsearch.common.unit.DistanceUnit;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentFactory;
-import org.elasticsearch.index.fielddata.IndexFieldData;
 import org.elasticsearch.index.mapper.Uid;
 import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders;
@ -2017,10 +2016,9 @@ public class SimpleSortTests extends ElasticsearchIntegrationTest {
                .addSort(fieldSort("str_field").order(SortOrder.ASC).unmappedType("string"))
                .addSort(fieldSort("str_field2").order(SortOrder.DESC).unmappedType("string")).get();

-        final StringAndBytesText maxTerm = new StringAndBytesText(IndexFieldData.XFieldComparatorSource.MAX_TERM.utf8ToString());
        assertSortValues(resp,
                new Object[] {new StringAndBytesText("bcd"), null},
-                new Object[] {maxTerm, null});
+                new Object[] {null, null});

        resp = client().prepareSearch("test1", "test2")
                .addSort(fieldSort("long_field").order(SortOrder.ASC).unmappedType("long"))