Ensure nested documents have consistent version and seq_ids (#27455)

Today we index dummy values for seq_ids and version on nested documents.
On the one hand this is trappy, since users can request these values via
inner hits; on the other hand it is not necessarily good for compression, since
the dummy value will likely not compress well when seqIDs are lowish.

This change ensures that we share the same field values for all documents in a
nested block. This won't add any overhead; in fact, it might be more efficient since
we even slightly reduce the work needed.
This commit is contained in:
Simon Willnauer 2017-11-20 16:50:08 +01:00 committed by GitHub
parent d3e3bc8656
commit 720e96e288
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 69 additions and 7 deletions

View File

@ -245,15 +245,18 @@ public class SeqNoFieldMapper extends MetadataFieldMapper {
/**
 * Adds the shared sequence-ID fields to every additional document produced
 * while parsing, so that all documents of a nested block carry the same
 * sequence number and primary term instead of dummy placeholder values.
 *
 * @param context parse context supplying the parsed documents and the sequence-ID fields
 * @throws IOException declared by the overridden method; nothing in this body throws it directly
 */
@Override
public void postParse(ParseContext context) throws IOException {
    // Every nested doc still gets the fields so that Lucene doesn't write a
    // Bitset for documents that don't have the field. Rather than filling in
    // dummy values (seqNo=1, primaryTerm=0), we share the parent doc's field
    // instances: the values stay consistent across the whole block and this
    // ensures good compression.
    SequenceIDFields seqID = context.seqID();
    assert seqID != null;
    // Index 0 is skipped -- presumably the root document, which already
    // carries these fields; confirm against ParseContext.
    for (int i = 1; i < context.docs().size(); i++) {
        final Document doc = context.docs().get(i);
        doc.add(seqID.seqNo);
        doc.add(seqID.seqNoDocValue);
        doc.add(seqID.primaryTerm);
    }
}

View File

@ -126,9 +126,11 @@ public class VersionFieldMapper extends MetadataFieldMapper {
/**
 * Adds the shared version field to every additional document produced while
 * parsing, so that all documents of a nested block report the same version.
 *
 * @param context parse context supplying the parsed documents and the version field
 * @throws IOException declared in the method signature; nothing in this body throws it directly
 */
public void postParse(ParseContext context) throws IOException {
    // In the case of nested docs, every nested doc still gets a version field so that Lucene doesn't write a Bitset for
    // documents that don't have the field. Instead of a dummy version=1 we share the parent doc's version field so the
    // value is consistent across the whole nested block.
    Field version = context.version();
    assert version != null;
    // Index 0 is skipped -- presumably the root document, which already carries the field; confirm against ParseContext.
    for (int i = 1; i < context.docs().size(); i++) {
        final Document doc = context.docs().get(i);
        doc.add(version);
    }
}

View File

@ -41,3 +41,60 @@ setup:
- match: { hits.hits.0.inner_hits.nested_field.hits.hits.0._nested.offset: 0 }
- is_false: hits.hits.0.inner_hits.nested_field.hits.hits.0._nested.child
---
"Nested doc version and seqIDs":
    # Indexes the same document twice and checks after each write that the
    # nested inner hit reports the same _version and _seq_no as the top-level hit.
    - skip:
        # fixed in 7.0
        version: " - 6.99.99"
        reason: "version and seq IDs where not accurate in previous versions"

    # First write of document 1.
    - do:
        index:
          index: test
          type: type_1
          id: 1
          body:
            "nested_field" : [ { "foo": "bar" } ]
    - do:
        indices.refresh: {}

    - do:
        search:
          body: { "query" : { "nested" : { "path" : "nested_field", "query" : { "match_all" : {} }, "inner_hits" : { "version": true, "docvalue_fields": ["_seq_no"]} }}, "version": true, "docvalue_fields" : ["_seq_no"] }
    - match: { hits.total: 1 }
    - match: { hits.hits.0._index: "test" }
    - match: { hits.hits.0._type: "type_1" }
    - match: { hits.hits.0._id: "1" }
    - match: { hits.hits.0._version: 1 }
    - match: { hits.hits.0.fields._seq_no: [0] }
    # The inner hit must agree with the top-level hit on both values.
    - match: { hits.hits.0.inner_hits.nested_field.hits.hits.0._version: 1 }
    - match: { hits.hits.0.inner_hits.nested_field.hits.hits.0.fields._seq_no: [0] }

    # Second write of the same id: both values are expected to advance by one.
    - do:
        index:
          index: test
          type: type_1
          id: 1
          body:
            "nested_field" : [ { "foo": "baz" } ]
    - do:
        indices.refresh: {}

    - do:
        search:
          body: { "query" : { "nested" : { "path" : "nested_field", "query" : { "match_all" : {} }, "inner_hits" : { "version": true, "docvalue_fields": ["_seq_no"]} }}, "version": true, "docvalue_fields" : ["_seq_no"] }
    - match: { hits.total: 1 }
    - match: { hits.hits.0._index: "test" }
    - match: { hits.hits.0._type: "type_1" }
    - match: { hits.hits.0._id: "1" }
    - match: { hits.hits.0._version: 2 }
    - match: { hits.hits.0.fields._seq_no: [1] }
    - match: { hits.hits.0.inner_hits.nested_field.hits.hits.0._version: 2 }
    - match: { hits.hits.0.inner_hits.nested_field.hits.hits.0.fields._seq_no: [1] }