LUCENE-9300: Fix field infos update on doc values update (#1394)

Today a doc values update creates a new field infos file that contains the original field infos updated for the new generation as well as the new fields created by the doc values update.

However, existing fields are cloned from the global field infos (shared by the index writer) instead of the local ones (present in the segment).
In practice this is not an issue since field numbers are shared between segments created by the same index writer.
But this assumption doesn't hold for segments created by different writers and added through IndexWriter#addIndexes(Directory).
In this case, the field number of the same field can differ between segments so any doc values update can corrupt the index
by assigning the wrong field number to an existing field in the next generation.

When this happens, queries and merges can access the wrong fields without throwing any error, leading to silent index corruption.
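
To make the failure mode concrete, here is a minimal sketch (not part of this commit) of how the mismatch arises; the field names and the use of ByteBuffersDirectory are illustrative, assuming the Lucene 8.x API:

    // imports: org.apache.lucene.document.*, org.apache.lucene.index.*, org.apache.lucene.store.*

    // Writer 1 sees "other" before "dv": in its segment, "other" = 0 and "dv" = 1.
    Directory dir1 = new ByteBuffersDirectory();
    try (IndexWriter w1 = new IndexWriter(dir1, new IndexWriterConfig())) {
      Document doc = new Document();
      doc.add(new StringField("other", "x", Field.Store.NO));
      doc.add(new NumericDocValuesField("dv", 1L));
      w1.addDocument(doc);
    }

    // Writer 2 sees "dv" first: in its segment, "dv" = 0 and "other" = 1.
    Directory dir2 = new ByteBuffersDirectory();
    try (IndexWriter w2 = new IndexWriter(dir2, new IndexWriterConfig())) {
      Document doc = new Document();
      doc.add(new NumericDocValuesField("dv", 2L));
      doc.add(new StringField("other", "y", Field.Store.NO));
      w2.addDocument(doc);
    }

    // addIndexes(Directory...) copies both segments as-is, local field numbers
    // included, so the same field name now has a different number per segment.
    try (IndexWriter main = new IndexWriter(new ByteBuffersDirectory(), new IndexWriterConfig())) {
      main.addIndexes(dir1, dir2);
      // Before this fix, the field infos written for the new doc values
      // generation could record "dv" under the writer's global number
      // instead of each segment's local one.
      main.updateDocValues(new Term("other", "y"), new NumericDocValuesField("dv", 42L));
    }

Each writer numbers fields in the order it first sees them, so the two segments disagree on the number of "dv", and a doc values update that consults only the writer's global numbering writes the next generation against the wrong field.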

This change ensures that we preserve local field numbers when creating
a new field infos generation.
Jim Ferenczi, 2020-04-03 13:58:05 +02:00 (committed via GitHub)
parent ac2837cfbd
commit b5c5ebe37c
3 changed files with 188 additions and 14 deletions

lucene/CHANGES.txt

@@ -158,6 +158,9 @@ Bug Fixes
 * LUCENE-9133: Fix for potential NPE in TermFilteredPresearcher for empty fields (Marvin Justice via Mike Drob)
 
+* LUCENE-9300: Fix corruption of the new gen field infos when doc values updates are applied on a segment created
+  externally and added to the index with IndexWriter#addIndexes(Directory). (Jim Ferenczi, Adrien Grand)
+
 Other
 ---------------------

lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java

@@ -543,27 +543,37 @@ final class ReadersAndUpdates {
     try {
       // clone FieldInfos so that we can update their dvGen separately from
-      // the reader's infos and write them to a new fieldInfos_gen file
-      FieldInfos.Builder builder = new FieldInfos.Builder(fieldNumbers);
-      // cannot use builder.add(reader.getFieldInfos()) because it does not
-      // clone FI.attributes as well FI.dvGen
+      // the reader's infos and write them to a new fieldInfos_gen file.
+      int maxFieldNumber = -1;
+      Map<String, FieldInfo> byName = new HashMap<>();
       for (FieldInfo fi : reader.getFieldInfos()) {
-        FieldInfo clone = builder.add(fi);
-        // copy the stuff FieldInfos.Builder doesn't copy
-        for (Entry<String,String> e : fi.attributes().entrySet()) {
-          clone.putAttribute(e.getKey(), e.getValue());
-        }
-        clone.setDocValuesGen(fi.getDocValuesGen());
+        // cannot use builder.add(fi) because it does not preserve
+        // the local field number. Field numbers can be different from
+        // the global ones if the segment was created externally (and added to
+        // this index with IndexWriter#addIndexes(Directory)).
+        byName.put(fi.name, cloneFieldInfo(fi, fi.number));
+        maxFieldNumber = Math.max(fi.number, maxFieldNumber);
       }
       // create new fields with the right DV type
+      FieldInfos.Builder builder = new FieldInfos.Builder(fieldNumbers);
       for (List<DocValuesFieldUpdates> updates : pendingDVUpdates.values()) {
         DocValuesFieldUpdates update = updates.get(0);
-        FieldInfo fieldInfo = builder.getOrAdd(update.field);
-        fieldInfo.setDocValuesType(update.type);
-      }
-      fieldInfos = builder.finish();
+        if (byName.containsKey(update.field)) {
+          // the field already exists in this segment
+          FieldInfo fi = byName.get(update.field);
+          fi.setDocValuesType(update.type);
+        } else {
+          // the field is not present in this segment so we clone the global field
+          // (which is guaranteed to exist) and remap its field number locally.
+          assert fieldNumbers.contains(update.field, update.type);
+          FieldInfo fi = cloneFieldInfo(builder.getOrAdd(update.field), ++maxFieldNumber);
+          fi.setDocValuesType(update.type);
+          byName.put(fi.name, fi);
+        }
+      }
+      fieldInfos = new FieldInfos(byName.values().toArray(new FieldInfo[0]));
       final DocValuesFormat docValuesFormat = codec.docValuesFormat();
       handleDVUpdates(fieldInfos, trackingDir, docValuesFormat, reader, newDVFiles, maxDelGen, infoStream);
@@ -644,6 +654,12 @@ final class ReadersAndUpdates {
     return true;
   }
 
+  private FieldInfo cloneFieldInfo(FieldInfo fi, int fieldNumber) {
+    return new FieldInfo(fi.name, fieldNumber, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(),
+        fi.getIndexOptions(), fi.getDocValuesType(), fi.getDocValuesGen(), new HashMap<>(fi.attributes()),
+        fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField());
+  }
+
   private SegmentReader createNewReaderWithLatestLiveDocs(SegmentReader reader) throws IOException {
     assert reader != null;
     assert Thread.holdsLock(this) : Thread.currentThread().getName();
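
Note that cloneFieldInfo deep-copies the attributes map (new HashMap<>(fi.attributes())), so the per-generation clone can be mutated via setDocValuesType or setDocValuesGen without touching the reader's live FieldInfo, and fields newly introduced by an update are remapped above maxFieldNumber so they cannot collide with a field number the segment already uses.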
lucene/core/src/test/org/apache/lucene/index/TestNumericDocValuesUpdates.java

@@ -36,6 +36,7 @@ import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.document.SortedDocValuesField;
 import org.apache.lucene.document.SortedSetDocValuesField;
@@ -1483,6 +1484,160 @@ public class TestNumericDocValuesUpdates extends LuceneTestCase {
     IOUtils.close(dir1, dir2);
   }
 
+  public void testAddNewFieldAfterAddIndexes() throws Exception {
+    Directory dir1 = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(NoMergePolicy.INSTANCE);
+    final int numDocs = atLeast(50);
+    try (IndexWriter writer = new IndexWriter(dir1, conf)) {
+      // create first index
+      for (int i = 0; i < numDocs; i++) {
+        Document doc = new Document();
+        doc.add(new StringField("id", Integer.toString(i), Store.NO));
+        doc.add(new NumericDocValuesField("a1", 0L));
+        doc.add(new NumericDocValuesField("a2", 1L));
+        writer.addDocument(doc);
+      }
+    }
+
+    Directory dir2 = newDirectory();
+    conf = newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(NoMergePolicy.INSTANCE);
+    try (IndexWriter writer = new IndexWriter(dir2, conf)) {
+      // create second index
+      for (int i = 0; i < numDocs; i++) {
+        Document doc = new Document();
+        doc.add(new StringField("id", Integer.toString(i), Store.NO));
+        doc.add(new NumericDocValuesField("i1", 0L));
+        doc.add(new NumericDocValuesField("i2", 1L));
+        doc.add(new NumericDocValuesField("i3", 2L));
+        writer.addDocument(doc);
+      }
+    }
+
+    Directory mainDir = newDirectory();
+    conf = newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(NoMergePolicy.INSTANCE);
+    try (IndexWriter writer = new IndexWriter(mainDir, conf)) {
+      writer.addIndexes(dir1, dir2);
+      List<FieldInfos> originalFieldInfos = new ArrayList<>();
+      try (DirectoryReader reader = DirectoryReader.open(writer)) {
+        for (LeafReaderContext leaf : reader.leaves()) {
+          originalFieldInfos.add(leaf.reader().getFieldInfos());
+        }
+      }
+      assertTrue(originalFieldInfos.size() > 0);
+
+      // update all doc values
+      long value = random().nextInt();
+      for (int i = 0; i < numDocs; i++) {
+        Term term = new Term("id", new BytesRef(Integer.toString(i)));
+        writer.updateDocValues(term, new NumericDocValuesField("ndv", value));
+      }
+
+      try (DirectoryReader reader = DirectoryReader.open(writer)) {
+        for (int i = 0; i < reader.leaves().size(); i++) {
+          LeafReader leafReader = reader.leaves().get(i).reader();
+          FieldInfos origFieldInfos = originalFieldInfos.get(i);
+          FieldInfos newFieldInfos = leafReader.getFieldInfos();
+          ensureConsistentFieldInfos(origFieldInfos, newFieldInfos);
+          assertEquals(DocValuesType.NUMERIC, newFieldInfos.fieldInfo("ndv").getDocValuesType());
+          NumericDocValues ndv = leafReader.getNumericDocValues("ndv");
+          for (int docId = 0; docId < leafReader.maxDoc(); docId++) {
+            assertEquals(docId, ndv.nextDoc());
+            assertEquals(value, ndv.longValue());
+          }
+        }
+      }
+    }
+    IOUtils.close(dir1, dir2, mainDir);
+  }
+
+  public void testUpdatesAfterAddIndexes() throws Exception {
+    Directory dir1 = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(NoMergePolicy.INSTANCE);
+    final int numDocs = atLeast(50);
+    try (IndexWriter writer = new IndexWriter(dir1, conf)) {
+      // create first index
+      for (int i = 0; i < numDocs; i++) {
+        Document doc = new Document();
+        doc.add(new StringField("id", Integer.toString(i), Store.NO));
+        doc.add(new NumericDocValuesField("ndv", 4L));
+        doc.add(new NumericDocValuesField("control", 8L));
+        doc.add(new LongPoint("i1", 4L));
+        writer.addDocument(doc);
+      }
+    }
+
+    Directory dir2 = newDirectory();
+    conf = newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(NoMergePolicy.INSTANCE);
+    try (IndexWriter writer = new IndexWriter(dir2, conf)) {
+      // create second index
+      for (int i = numDocs; i < numDocs * 2; i++) {
+        Document doc = new Document();
+        doc.add(new StringField("id", Integer.toString(i), Store.NO));
+        doc.add(new NumericDocValuesField("ndv", 2L));
+        doc.add(new NumericDocValuesField("control", 4L));
+        doc.add(new LongPoint("i2", 16L));
+        doc.add(new LongPoint("i2", 24L));
+        writer.addDocument(doc);
+      }
+    }
+
+    Directory mainDir = newDirectory();
+    conf = newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(NoMergePolicy.INSTANCE);
+    try (IndexWriter writer = new IndexWriter(mainDir, conf)) {
+      writer.addIndexes(dir1, dir2);
+      List<FieldInfos> originalFieldInfos = new ArrayList<>();
+      try (DirectoryReader reader = DirectoryReader.open(writer)) {
+        for (LeafReaderContext leaf : reader.leaves()) {
+          originalFieldInfos.add(leaf.reader().getFieldInfos());
+        }
+      }
+      assertTrue(originalFieldInfos.size() > 0);
+
+      // update some docs to a random value
+      long value = random().nextInt();
+      Term term = new Term("id", new BytesRef(Integer.toString(random().nextInt(numDocs) * 2)));
+      writer.updateDocValues(term, new NumericDocValuesField("ndv", value),
+          new NumericDocValuesField("control", value * 2));
+
+      try (DirectoryReader reader = DirectoryReader.open(writer)) {
+        for (int i = 0; i < reader.leaves().size(); i++) {
+          LeafReader leafReader = reader.leaves().get(i).reader();
+          FieldInfos origFieldInfos = originalFieldInfos.get(i);
+          FieldInfos newFieldInfos = leafReader.getFieldInfos();
+          ensureConsistentFieldInfos(origFieldInfos, newFieldInfos);
+          assertNotNull(newFieldInfos.fieldInfo("ndv"));
+          assertEquals(DocValuesType.NUMERIC, newFieldInfos.fieldInfo("ndv").getDocValuesType());
+          assertEquals(DocValuesType.NUMERIC, newFieldInfos.fieldInfo("control").getDocValuesType());
+          NumericDocValues ndv = leafReader.getNumericDocValues("ndv");
+          NumericDocValues control = leafReader.getNumericDocValues("control");
+          for (int docId = 0; docId < leafReader.maxDoc(); docId++) {
+            assertEquals(docId, ndv.nextDoc());
+            assertEquals(docId, control.nextDoc());
+            assertEquals(ndv.longValue() * 2, control.longValue());
+          }
+        }
+        IndexSearcher searcher = new IndexSearcher(reader);
+        assertEquals(numDocs, searcher.count(LongPoint.newExactQuery("i1", 4L)));
+        assertEquals(numDocs, searcher.count(LongPoint.newExactQuery("i2", 16L)));
+        assertEquals(numDocs, searcher.count(LongPoint.newExactQuery("i2", 24L)));
+      }
+    }
+    IOUtils.close(dir1, dir2, mainDir);
+  }
+
+  private void ensureConsistentFieldInfos(FieldInfos old, FieldInfos after) {
+    for (FieldInfo fi : old) {
+      assertNotNull(after.fieldInfo(fi.number));
+      assertNotNull(after.fieldInfo(fi.name));
+      assertEquals(fi.number, after.fieldInfo(fi.name).number);
+      assertTrue(fi.getDocValuesGen() <= after.fieldInfo(fi.name).getDocValuesGen());
+    }
+  }
+
   public void testDeleteUnusedUpdatesFiles() throws Exception {
     Directory dir = newDirectory();
     IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));