Store _all payloads on 1 byte instead of 4. #16899

This changes the `_all` field to store per-field boosts using a single byte similarly to norms.
2016-03-02 10:41:45 +01:00 · 2016-03-02 10:41:45 +01:00 · 2c3e4840f2
parent 529f7cb42c
commit 2c3e4840f2
26 changed files with 64 additions and 8 deletions
--- a/core/src/main/java/org/elasticsearch/common/lucene/all/AllTermQuery.java
+++ b/core/src/main/java/org/elasticsearch/common/lucene/all/AllTermQuery.java
@ -42,6 +42,7 @@ import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.search.similarities.Similarity.SimScorer;
 import org.apache.lucene.search.similarities.Similarity.SimWeight;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.SmallFloat;
 import org.apache.lucene.util.ToStringUtils;

 import java.io.IOException;
@ -186,9 +187,13 @@ public final class AllTermQuery extends Query {
                    float boost;
                    if (payload == null) {
                        boost = 1;
-                    } else {
-                        assert payload.length == 4;
+                    } else if (payload.length == 1) {
+                        boost = SmallFloat.byte315ToFloat(payload.bytes[payload.offset]);
+                    } else if (payload.length == 4) {
+                        // TODO: for bw compat only, remove this in 6.0
                        boost = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
+                    } else {
+                        throw new IllegalStateException("Payloads are expected to have a length of 1 or 4 but got: " + payload);
                    }
                    payloadBoost += boost;
                }
--- a/core/src/main/java/org/elasticsearch/common/lucene/all/AllTokenStream.java
+++ b/core/src/main/java/org/elasticsearch/common/lucene/all/AllTokenStream.java
@ -25,11 +25,10 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.SmallFloat;

 import java.io.IOException;

-import static org.apache.lucene.analysis.payloads.PayloadHelper.encodeFloat;
-
 /**
 *
 */
@ -39,7 +38,7 @@ public final class AllTokenStream extends TokenFilter {
        return new AllTokenStream(analyzer.tokenStream(allFieldName, allEntries), allEntries);
    }
    
-    private final BytesRef payloadSpare = new BytesRef(new byte[4]);
+    private final BytesRef payloadSpare = new BytesRef(new byte[1]);

    private final AllEntries allEntries;

@ -64,7 +63,7 @@ public final class AllTokenStream extends TokenFilter {
        }
        final float boost = allEntries.boost(offsetAttribute.startOffset());
        if (boost != 1.0f) {
-            encodeFloat(boost, payloadSpare.bytes, payloadSpare.offset);
+            payloadSpare.bytes[0] = SmallFloat.floatToByte315(boost);
            payloadAttribute.setPayload(payloadSpare);
        } else {
            payloadAttribute.setPayload(null);
--- a/core/src/test/java/org/elasticsearch/bwcompat/OldIndexBackwardsCompatibilityIT.java
+++ b/core/src/test/java/org/elasticsearch/bwcompat/OldIndexBackwardsCompatibilityIT.java
@ -20,7 +20,9 @@
 package org.elasticsearch.bwcompat;

 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.search.Explanation;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.SmallFloat;
 import org.apache.lucene.util.TestUtil;
 import org.elasticsearch.Version;
 import org.elasticsearch.action.admin.indices.get.GetIndexResponse;
@ -297,6 +299,7 @@ public class OldIndexBackwardsCompatibilityIT extends ESIntegTestCase {
        importIndex(indexName);
        assertIndexSanity(indexName, version);
        assertBasicSearchWorks(indexName);
+        assertAllSearchWorks(indexName);
        assertBasicAggregationWorks(indexName);
        assertRealtimeGetWorks(indexName);
        assertNewReplicasWork(indexName);
@ -354,6 +357,39 @@ public class OldIndexBackwardsCompatibilityIT extends ESIntegTestCase {
        assertEquals(numDocs, searchRsp.getHits().getTotalHits());
    }

+    boolean findPayloadBoostInExplanation(Explanation expl) { // recursively searches the explanation tree for a "payloadBoost=" node whose value is not 1
+        if (expl.getDescription().startsWith("payloadBoost=") && expl.getValue() != 1f) {
+            return true;
+        } else {
+            boolean found = false;
+            for (Explanation sub : expl.getDetails()) {
+                found |= findPayloadBoostInExplanation(sub); // non-short-circuit OR: remaining children are still visited after a match
+            }
+            return found; // false when this node has no details or none of them matched
+        }
+    }
+
+    void assertAllSearchWorks(String indexName) { // checks that _all queries succeed on the given index and that a non-1 payload boost appears in the score explanation
+        logger.info("--> testing _all search");
+        SearchResponse searchRsp = client().prepareSearch(indexName).get(); // search with no explicit query, used only to pick a document to explain
+        ElasticsearchAssertions.assertNoFailures(searchRsp);
+        assertThat(searchRsp.getHits().getTotalHits(), greaterThanOrEqualTo(1L));
+        SearchHit bestHit = searchRsp.getHits().getAt(0); // first hit of the response
+
+        // Make sure there are payloads and they are taken into account for the score
+        // the 'string' field has a boost of 4 in the mappings so it should get a payload boost
+        String stringValue = (String) bestHit.sourceAsMap().get("string"); // value of the 'string' field, reused below as the _all query text
+        assertNotNull(stringValue);
+        Explanation explanation = client().prepareExplain(indexName, bestHit.getType(), bestHit.getId())
+                .setQuery(QueryBuilders.matchQuery("_all", stringValue)).get().getExplanation();
+        assertTrue("Could not find payload boost in explanation\n" + explanation, findPayloadBoostInExplanation(explanation));
+
+        // Make sure the query can run on the whole index
+        searchRsp = client().prepareSearch(indexName).setQuery(QueryBuilders.matchQuery("_all", stringValue)).setExplain(true).get();
+        ElasticsearchAssertions.assertNoFailures(searchRsp);
+        assertThat(searchRsp.getHits().getTotalHits(), greaterThanOrEqualTo(1L));
+    }
+
    void assertBasicAggregationWorks(String indexName) {
        // histogram on a long
        SearchResponse searchRsp = client().prepareSearch(indexName).addAggregation(AggregationBuilders.histogram("histo").field("long_sort").interval(10)).get();
--- a/core/src/test/java/org/elasticsearch/common/lucene/all/SimpleAllTests.java
+++ b/core/src/test/java/org/elasticsearch/common/lucene/all/SimpleAllTests.java
@ -42,6 +42,7 @@ import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.SmallFloat;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.test.ESTestCase;

@ -89,8 +90,8 @@ public class SimpleAllTests extends ESTestCase {
            if (payload == null || payload.length == 0) {
                assertEquals(boost, 1f, 0.001f);
            } else {
-                assertEquals(4, payload.length);
-                final float b = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
+                assertEquals(1, payload.length);
+                final float b = SmallFloat.byte315ToFloat(payload.bytes[payload.offset]);
                assertEquals(boost, b, 0.001f);
            }
        }
--- a/core/src/test/resources/indices/bwc/index-2.0.0-beta1.zip
+++ b/core/src/test/resources/indices/bwc/index-2.0.0-beta1.zip
--- a/core/src/test/resources/indices/bwc/index-2.0.0-beta2.zip
+++ b/core/src/test/resources/indices/bwc/index-2.0.0-beta2.zip
--- a/core/src/test/resources/indices/bwc/index-2.0.0-rc1.zip
+++ b/core/src/test/resources/indices/bwc/index-2.0.0-rc1.zip
--- a/core/src/test/resources/indices/bwc/index-2.0.0.zip
+++ b/core/src/test/resources/indices/bwc/index-2.0.0.zip
--- a/core/src/test/resources/indices/bwc/index-2.0.1.zip
+++ b/core/src/test/resources/indices/bwc/index-2.0.1.zip
--- a/core/src/test/resources/indices/bwc/index-2.0.2.zip
+++ b/core/src/test/resources/indices/bwc/index-2.0.2.zip
--- a/core/src/test/resources/indices/bwc/index-2.1.0.zip
+++ b/core/src/test/resources/indices/bwc/index-2.1.0.zip
--- a/core/src/test/resources/indices/bwc/index-2.1.1.zip
+++ b/core/src/test/resources/indices/bwc/index-2.1.1.zip
--- a/core/src/test/resources/indices/bwc/index-2.1.2.zip
+++ b/core/src/test/resources/indices/bwc/index-2.1.2.zip
--- a/core/src/test/resources/indices/bwc/index-2.2.0.zip
+++ b/core/src/test/resources/indices/bwc/index-2.2.0.zip
--- a/core/src/test/resources/indices/bwc/repo-2.0.0-beta1.zip
+++ b/core/src/test/resources/indices/bwc/repo-2.0.0-beta1.zip
--- a/core/src/test/resources/indices/bwc/repo-2.0.0-beta2.zip
+++ b/core/src/test/resources/indices/bwc/repo-2.0.0-beta2.zip
--- a/core/src/test/resources/indices/bwc/repo-2.0.0-rc1.zip
+++ b/core/src/test/resources/indices/bwc/repo-2.0.0-rc1.zip
--- a/core/src/test/resources/indices/bwc/repo-2.0.0.zip
+++ b/core/src/test/resources/indices/bwc/repo-2.0.0.zip
--- a/core/src/test/resources/indices/bwc/repo-2.0.1.zip
+++ b/core/src/test/resources/indices/bwc/repo-2.0.1.zip
--- a/core/src/test/resources/indices/bwc/repo-2.0.2.zip
+++ b/core/src/test/resources/indices/bwc/repo-2.0.2.zip
--- a/core/src/test/resources/indices/bwc/repo-2.1.0.zip
+++ b/core/src/test/resources/indices/bwc/repo-2.1.0.zip
--- a/core/src/test/resources/indices/bwc/repo-2.1.1.zip
+++ b/core/src/test/resources/indices/bwc/repo-2.1.1.zip
--- a/core/src/test/resources/indices/bwc/repo-2.1.2.zip
+++ b/core/src/test/resources/indices/bwc/repo-2.1.2.zip
--- a/core/src/test/resources/indices/bwc/repo-2.2.0.zip
+++ b/core/src/test/resources/indices/bwc/repo-2.2.0.zip
--- a/dev-tools/create_bwc_index.py
+++ b/dev-tools/create_bwc_index.py
@ -247,6 +247,15 @@ def generate_index(client, version, index_name):
      }
    }

+  mappings['doc'] = {
+    'properties': {
+      'string': {
+        'type': 'string',
+        'boost': 4
+      }
+    }
+  }
+
  settings = {
    'number_of_shards': 1,
    'number_of_replicas': 0,
--- a/docs/reference/migration/migrate_5_0.asciidoc
+++ b/docs/reference/migration/migrate_5_0.asciidoc
@ -94,6 +94,12 @@ Scroll requests sorted by `_doc` have been optimized to more efficiently resume
 from where the previous request stopped, so this will have the same performance
 characteristics as the former `scan` search type.

+==== Boost accuracy for queries on `_all`
+
+Per-field boosts on the `_all` field are now compressed into a single byte
+instead of the 4 bytes used previously. While this makes the index more
+space-efficient, it also means that the boosts are encoded less accurately.
+
 [[breaking_50_rest_api_changes]]
 === REST API changes