Store _all payloads on 1 byte instead of 4. #16899

This changes the `_all` field to store per-field boosts in a single byte,
similar to how norms are stored.
This commit is contained in:
Adrien Grand 2016-03-02 10:41:45 +01:00
parent 529f7cb42c
commit 2c3e4840f2
26 changed files with 64 additions and 8 deletions

View File

@ -42,6 +42,7 @@ import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.search.similarities.Similarity.SimWeight;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;
@ -186,9 +187,13 @@ public final class AllTermQuery extends Query {
float boost;
if (payload == null) {
boost = 1;
} else {
assert payload.length == 4;
} else if (payload.length == 1) {
boost = SmallFloat.byte315ToFloat(payload.bytes[payload.offset]);
} else if (payload.length == 4) {
// TODO: for bw compat only, remove this in 6.0
boost = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
} else {
throw new IllegalStateException("Payloads are expected to have a length of 1 or 4 but got: " + payload);
}
payloadBoost += boost;
}

View File

@ -25,11 +25,10 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;
import java.io.IOException;
import static org.apache.lucene.analysis.payloads.PayloadHelper.encodeFloat;
/**
*
*/
@ -39,7 +38,7 @@ public final class AllTokenStream extends TokenFilter {
return new AllTokenStream(analyzer.tokenStream(allFieldName, allEntries), allEntries);
}
private final BytesRef payloadSpare = new BytesRef(new byte[4]);
private final BytesRef payloadSpare = new BytesRef(new byte[1]);
private final AllEntries allEntries;
@ -64,7 +63,7 @@ public final class AllTokenStream extends TokenFilter {
}
final float boost = allEntries.boost(offsetAttribute.startOffset());
if (boost != 1.0f) {
encodeFloat(boost, payloadSpare.bytes, payloadSpare.offset);
payloadSpare.bytes[0] = SmallFloat.floatToByte315(boost);
payloadAttribute.setPayload(payloadSpare);
} else {
payloadAttribute.setPayload(null);

View File

@ -20,7 +20,9 @@
package org.elasticsearch.bwcompat;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.SmallFloat;
import org.apache.lucene.util.TestUtil;
import org.elasticsearch.Version;
import org.elasticsearch.action.admin.indices.get.GetIndexResponse;
@ -297,6 +299,7 @@ public class OldIndexBackwardsCompatibilityIT extends ESIntegTestCase {
importIndex(indexName);
assertIndexSanity(indexName, version);
assertBasicSearchWorks(indexName);
assertAllSearchWorks(indexName);
assertBasicAggregationWorks(indexName);
assertRealtimeGetWorks(indexName);
assertNewReplicasWork(indexName);
@ -354,6 +357,39 @@ public class OldIndexBackwardsCompatibilityIT extends ESIntegTestCase {
assertEquals(numDocs, searchRsp.getHits().getTotalHits());
}
/**
 * Walks an explanation tree looking for a payload boost that actually affected the score.
 *
 * @param expl root of the (sub-)tree to inspect
 * @return {@code true} if this node, or any node below it, has a description starting with
 *         {@code "payloadBoost="} and a value different from 1
 */
boolean findPayloadBoostInExplanation(Explanation expl) {
    // Guard clause: this node itself is a non-trivial payload boost.
    if (expl.getDescription().startsWith("payloadBoost=") && expl.getValue() != 1f) {
        return true;
    }
    // Otherwise inspect every child; all children are visited even after a match.
    boolean anyChildMatches = false;
    for (Explanation child : expl.getDetails()) {
        if (findPayloadBoostInExplanation(child)) {
            anyChildMatches = true;
        }
    }
    return anyChildMatches;
}
/**
 * Verifies that queries against the {@code _all} field work on the given index and that
 * payload boosts show up in the score explanation.
 *
 * @param indexName name of the index to search
 */
void assertAllSearchWorks(String indexName) {
    logger.info("--> testing _all search");

    // Plain search with no query: we only need one document to sample from.
    final SearchResponse sampleRsp = client().prepareSearch(indexName).get();
    ElasticsearchAssertions.assertNoFailures(sampleRsp);
    assertThat(sampleRsp.getHits().getTotalHits(), greaterThanOrEqualTo(1L));
    final SearchHit topHit = sampleRsp.getHits().getAt(0);

    // Payloads must exist and must influence the score: the 'string' field is mapped
    // with a boost of 4, so matching on its value through _all should carry a payload boost.
    final String text = (String) topHit.sourceAsMap().get("string");
    assertNotNull(text);
    final Explanation allExplanation = client()
            .prepareExplain(indexName, topHit.getType(), topHit.getId())
            .setQuery(QueryBuilders.matchQuery("_all", text))
            .get()
            .getExplanation();
    assertTrue("Could not find payload boost in explanation\n" + allExplanation,
            findPayloadBoostInExplanation(allExplanation));

    // The same query must also run cleanly across the whole index.
    final SearchResponse allFieldRsp = client()
            .prepareSearch(indexName)
            .setQuery(QueryBuilders.matchQuery("_all", text))
            .setExplain(true)
            .get();
    ElasticsearchAssertions.assertNoFailures(allFieldRsp);
    assertThat(allFieldRsp.getHits().getTotalHits(), greaterThanOrEqualTo(1L));
}
void assertBasicAggregationWorks(String indexName) {
// histogram on a long
SearchResponse searchRsp = client().prepareSearch(indexName).addAggregation(AggregationBuilders.histogram("histo").field("long_sort").interval(10)).get();

View File

@ -42,6 +42,7 @@ import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.test.ESTestCase;
@ -89,8 +90,8 @@ public class SimpleAllTests extends ESTestCase {
if (payload == null || payload.length == 0) {
assertEquals(boost, 1f, 0.001f);
} else {
assertEquals(4, payload.length);
final float b = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
assertEquals(1, payload.length);
final float b = SmallFloat.byte315ToFloat(payload.bytes[payload.offset]);
assertEquals(boost, b, 0.001f);
}
}

View File

@ -247,6 +247,15 @@ def generate_index(client, version, index_name):
}
}
mappings['doc'] = {
'properties': {
'string': {
'type': 'string',
'boost': 4
}
}
}
settings = {
'number_of_shards': 1,
'number_of_replicas': 0,

View File

@ -94,6 +94,12 @@ Scroll requests sorted by `_doc` have been optimized to more efficiently resume
from where the previous request stopped, so this will have the same performance
characteristics as the former `scan` search type.
==== Boost accuracy for queries on `_all`
Per-field boosts on the `_all` field are now compressed into a single byte instead
of the 4 bytes used previously. While this makes the index more space-efficient, it
also means that the boosts are encoded less accurately.
[[breaking_50_rest_api_changes]]
=== REST API changes