Store _all payloads on 1 byte instead of 4. #16899
This changes the `_all` field to store per-field boosts using a single byte similarly to norms.
This commit is contained in:
parent
529f7cb42c
commit
2c3e4840f2
|
@ -42,6 +42,7 @@ import org.apache.lucene.search.similarities.Similarity;
|
|||
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||
import org.apache.lucene.search.similarities.Similarity.SimWeight;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -186,9 +187,13 @@ public final class AllTermQuery extends Query {
|
|||
float boost;
|
||||
if (payload == null) {
|
||||
boost = 1;
|
||||
} else {
|
||||
assert payload.length == 4;
|
||||
} else if (payload.length == 1) {
|
||||
boost = SmallFloat.byte315ToFloat(payload.bytes[payload.offset]);
|
||||
} else if (payload.length == 4) {
|
||||
// TODO: for bw compat only, remove this in 6.0
|
||||
boost = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
|
||||
} else {
|
||||
throw new IllegalStateException("Payloads are expected to have a length of 1 or 4 but got: " + payload);
|
||||
}
|
||||
payloadBoost += boost;
|
||||
}
|
||||
|
|
|
@ -25,11 +25,10 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.apache.lucene.analysis.payloads.PayloadHelper.encodeFloat;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
|
@ -39,7 +38,7 @@ public final class AllTokenStream extends TokenFilter {
|
|||
return new AllTokenStream(analyzer.tokenStream(allFieldName, allEntries), allEntries);
|
||||
}
|
||||
|
||||
private final BytesRef payloadSpare = new BytesRef(new byte[4]);
|
||||
private final BytesRef payloadSpare = new BytesRef(new byte[1]);
|
||||
|
||||
private final AllEntries allEntries;
|
||||
|
||||
|
@ -64,7 +63,7 @@ public final class AllTokenStream extends TokenFilter {
|
|||
}
|
||||
final float boost = allEntries.boost(offsetAttribute.startOffset());
|
||||
if (boost != 1.0f) {
|
||||
encodeFloat(boost, payloadSpare.bytes, payloadSpare.offset);
|
||||
payloadSpare.bytes[0] = SmallFloat.floatToByte315(boost);
|
||||
payloadAttribute.setPayload(payloadSpare);
|
||||
} else {
|
||||
payloadAttribute.setPayload(null);
|
||||
|
|
|
@ -20,7 +20,9 @@
|
|||
package org.elasticsearch.bwcompat;
|
||||
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.action.admin.indices.get.GetIndexResponse;
|
||||
|
@ -297,6 +299,7 @@ public class OldIndexBackwardsCompatibilityIT extends ESIntegTestCase {
|
|||
importIndex(indexName);
|
||||
assertIndexSanity(indexName, version);
|
||||
assertBasicSearchWorks(indexName);
|
||||
assertAllSearchWorks(indexName);
|
||||
assertBasicAggregationWorks(indexName);
|
||||
assertRealtimeGetWorks(indexName);
|
||||
assertNewReplicasWork(indexName);
|
||||
|
@ -354,6 +357,39 @@ public class OldIndexBackwardsCompatibilityIT extends ESIntegTestCase {
|
|||
assertEquals(numDocs, searchRsp.getHits().getTotalHits());
|
||||
}
|
||||
|
||||
boolean findPayloadBoostInExplanation(Explanation expl) {
|
||||
if (expl.getDescription().startsWith("payloadBoost=") && expl.getValue() != 1f) {
|
||||
return true;
|
||||
} else {
|
||||
boolean found = false;
|
||||
for (Explanation sub : expl.getDetails()) {
|
||||
found |= findPayloadBoostInExplanation(sub);
|
||||
}
|
||||
return found;
|
||||
}
|
||||
}
|
||||
|
||||
void assertAllSearchWorks(String indexName) {
|
||||
logger.info("--> testing _all search");
|
||||
SearchResponse searchRsp = client().prepareSearch(indexName).get();
|
||||
ElasticsearchAssertions.assertNoFailures(searchRsp);
|
||||
assertThat(searchRsp.getHits().getTotalHits(), greaterThanOrEqualTo(1L));
|
||||
SearchHit bestHit = searchRsp.getHits().getAt(0);
|
||||
|
||||
// Make sure there are payloads and they are taken into account for the score
|
||||
// the 'string' field has a boost of 4 in the mappings so it should get a payload boost
|
||||
String stringValue = (String) bestHit.sourceAsMap().get("string");
|
||||
assertNotNull(stringValue);
|
||||
Explanation explanation = client().prepareExplain(indexName, bestHit.getType(), bestHit.getId())
|
||||
.setQuery(QueryBuilders.matchQuery("_all", stringValue)).get().getExplanation();
|
||||
assertTrue("Could not find payload boost in explanation\n" + explanation, findPayloadBoostInExplanation(explanation));
|
||||
|
||||
// Make sure the query can run on the whole index
|
||||
searchRsp = client().prepareSearch(indexName).setQuery(QueryBuilders.matchQuery("_all", stringValue)).setExplain(true).get();
|
||||
ElasticsearchAssertions.assertNoFailures(searchRsp);
|
||||
assertThat(searchRsp.getHits().getTotalHits(), greaterThanOrEqualTo(1L));
|
||||
}
|
||||
|
||||
void assertBasicAggregationWorks(String indexName) {
|
||||
// histogram on a long
|
||||
SearchResponse searchRsp = client().prepareSearch(indexName).addAggregation(AggregationBuilders.histogram("histo").field("long_sort").interval(10)).get();
|
||||
|
|
|
@ -42,6 +42,7 @@ import org.apache.lucene.search.TopDocs;
|
|||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
import org.elasticsearch.common.lucene.Lucene;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
|
||||
|
@ -89,8 +90,8 @@ public class SimpleAllTests extends ESTestCase {
|
|||
if (payload == null || payload.length == 0) {
|
||||
assertEquals(boost, 1f, 0.001f);
|
||||
} else {
|
||||
assertEquals(4, payload.length);
|
||||
final float b = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
|
||||
assertEquals(1, payload.length);
|
||||
final float b = SmallFloat.byte315ToFloat(payload.bytes[payload.offset]);
|
||||
assertEquals(boost, b, 0.001f);
|
||||
}
|
||||
}
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -247,6 +247,15 @@ def generate_index(client, version, index_name):
|
|||
}
|
||||
}
|
||||
|
||||
mappings['doc'] = {
|
||||
'properties': {
|
||||
'string': {
|
||||
'type': 'string',
|
||||
'boost': 4
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
settings = {
|
||||
'number_of_shards': 1,
|
||||
'number_of_replicas': 0,
|
||||
|
|
|
@ -94,6 +94,12 @@ Scroll requests sorted by `_doc` have been optimized to more efficiently resume
|
|||
from where the previous request stopped, so this will have the same performance
|
||||
characteristics as the former `scan` search type.
|
||||
|
||||
==== Boost accuracy for queries on `_all`
|
||||
|
||||
Per-field boosts on the `_all` field are now compressed into a single byte instead of
|
||||
the 4 bytes used previously. While this will make the index more space-efficient, this
|
||||
also means that the boosts will be less accurately encoded.
|
||||
|
||||
[[breaking_50_rest_api_changes]]
|
||||
=== REST API changes
|
||||
|
||||
|
|
Loading…
Reference in New Issue