LUCENE-8505: IndexWriter#addIndices will now fail if the target index is sorted but the candidate is not.

This commit is contained in:
Jim Ferenczi 2018-09-25 09:14:07 +02:00
parent e437b2f1ed
commit 2bad3c4985
4 changed files with 70 additions and 53 deletions

View File

@ -117,6 +117,9 @@ Changes in Runtime Behavior
total hit counts accurately up to 1,000 in order to enable top-hits
optimizations such as block-max WAND (LUCENE-8135). (Adrien Grand)
* LUCENE-8505: IndexWriter#addIndices will now fail if the target index is sorted but
the candidate is not. (Jim Ferenczi)
New Features
* LUCENE-8340: LongPoint#newDistanceQuery may be used to boost scores based on

View File

@ -53,6 +53,7 @@ import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FlushInfo;
@ -934,21 +935,30 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable,
}
/** Confirms that the incoming index sort (if any) matches the existing index sort (if any). */
private void validateIndexSort() throws CorruptIndexException {
private void validateIndexSort() {
Sort indexSort = config.getIndexSort();
if (indexSort != null) {
for(SegmentCommitInfo info : segmentInfos) {
Sort segmentIndexSort = info.info.getIndexSort();
if (segmentIndexSort != null && indexSort.equals(segmentIndexSort) == false) {
if (segmentIndexSort == null || isCongruentSort(indexSort, segmentIndexSort) == false) {
throw new IllegalArgumentException("cannot change previous indexSort=" + segmentIndexSort + " (from segment=" + info + ") to new indexSort=" + indexSort);
} else if (segmentIndexSort == null) {
// Flushed segments are not sorted if they were built with a version prior to 6.5.0
throw new CorruptIndexException("segment not sorted with indexSort=" + segmentIndexSort, info.info.toString());
}
}
}
}
/**
* Returns true if <code>indexSort</code> is a prefix of <code>otherSort</code>.
**/
static boolean isCongruentSort(Sort indexSort, Sort otherSort) {
final SortField[] fields1 = indexSort.getSort();
final SortField[] fields2 = otherSort.getSort();
if (fields1.length > fields2.length) {
return false;
}
return Arrays.asList(fields1).equals(Arrays.asList(fields2).subList(0, fields1.length));
}
// reads latest field infos for the commit
// this is used on IW init and addIndexes(Dir) to create/update the global field map.
// TODO: fix tests abusing this method!
@ -2824,8 +2834,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable,
Sort segmentIndexSort = info.info.getIndexSort();
if (indexSort != null && segmentIndexSort != null && indexSort.equals(segmentIndexSort) == false) {
// TODO: we could make this smarter, e.g. if the incoming indexSort is congruent with our sort ("starts with") then it's OK
if (indexSort != null && (segmentIndexSort == null || isCongruentSort(indexSort, segmentIndexSort) == false)) {
throw new IllegalArgumentException("cannot change index sort from " + segmentIndexSort + " to " + indexSort);
}
@ -2908,8 +2917,8 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable,
}
Sort leafIndexSort = segmentMeta.getSort();
if (config.getIndexSort() != null && leafIndexSort != null
&& config.getIndexSort().equals(leafIndexSort) == false) {
if (config.getIndexSort() != null &&
(leafIndexSort == null || isCongruentSort(config.getIndexSort(), leafIndexSort) == false)) {
throw new IllegalArgumentException("cannot change index sort from " + leafIndexSort + " to " + config.getIndexSort());
}
}

View File

@ -34,6 +34,8 @@ import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;
import static org.apache.lucene.index.IndexWriter.isCongruentSort;
/** Holds common state used during segment merging.
*
* @lucene.experimental */
@ -223,50 +225,14 @@ public class MergeState {
return originalReaders;
}
/** If an incoming reader is not sorted, because it was flushed by IW older than {@link Version.LUCENE_7_0_0}
* or because we add unsorted segments from another index {@link IndexWriter#addIndexes(CodecReader...)} ,
* we sort it here:
*/
final Sorter sorter = new Sorter(indexSort);
List<CodecReader> readers = new ArrayList<>(originalReaders.size());
for (CodecReader leaf : originalReaders) {
Sort segmentSort = leaf.getMetaData().getSort();
if (segmentSort == null) {
// This segment was written by flush, so documents are not yet sorted, so we sort them now:
long t0 = System.nanoTime();
Sorter.DocMap sortDocMap = sorter.sort(leaf);
long t1 = System.nanoTime();
double msec = (t1-t0)/1000000.0;
if (sortDocMap != null) {
if (infoStream.isEnabled("SM")) {
infoStream.message("SM", String.format(Locale.ROOT, "segment %s is not sorted; wrapping for sort %s now (%.2f msec to sort)", leaf, indexSort, msec));
}
needsIndexSort = true;
leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(new MergeReaderWrapper(leaf), sortDocMap));
leafDocMaps[readers.size()] = new DocMap() {
@Override
public int get(int docID) {
return sortDocMap.oldToNew(docID);
}
};
} else {
if (infoStream.isEnabled("SM")) {
infoStream.message("SM", String.format(Locale.ROOT, "segment %s is not sorted, but is already accidentally in sort %s order (%.2f msec to sort)", leaf, indexSort, msec));
}
}
} else {
if (segmentSort.equals(indexSort) == false) {
throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort + " but to-be-merged segment has sort=" + segmentSort);
}
if (infoStream.isEnabled("SM")) {
infoStream.message("SM", "segment " + leaf + " already sorted");
}
if (segmentSort == null || isCongruentSort(indexSort, segmentSort) == false) {
throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort +
" but to-be-merged segment has sort=" + (segmentSort == null ? "null" : segmentSort));
}
readers.add(leaf);
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
@ -1820,19 +1821,52 @@ public class TestIndexSorting extends LuceneTestCase {
dir.close();
}
public void testAddIndexes(boolean withDeletes, boolean useReaders) throws Exception {
public void testBadAddIndexes() throws Exception {
Directory dir = newDirectory();
Sort indexSort = new Sort(new SortField("foo", SortField.Type.LONG));
IndexWriterConfig iwc1 = newIndexWriterConfig();
if (random().nextBoolean()) {
iwc1.setIndexSort(indexSort);
iwc1.setIndexSort(indexSort);
IndexWriter w = new IndexWriter(dir, iwc1);
w.addDocument(new Document());
List<Sort> indexSorts = Arrays.asList(null, new Sort(new SortField("bar", SortField.Type.LONG)));
for (Sort sort : indexSorts) {
Directory dir2 = newDirectory();
IndexWriterConfig iwc2 = newIndexWriterConfig();
if (sort != null) {
iwc2.setIndexSort(sort);
}
IndexWriter w2 = new IndexWriter(dir2, iwc2);
w2.addDocument(new Document());
final IndexReader reader = w2.getReader();
w2.close();
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> w.addIndexes(dir2));
assertThat(expected.getMessage(), containsString("cannot change index sort"));
CodecReader[] codecReaders = new CodecReader[reader.leaves().size()];
for (int i = 0; i < codecReaders.length; ++i) {
codecReaders[i] = (CodecReader) reader.leaves().get(i).reader();
}
expected = expectThrows(IllegalArgumentException.class, () -> w.addIndexes(codecReaders));
assertThat(expected.getMessage(), containsString("cannot change index sort"));
reader.close();
dir2.close();
}
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
w.close();
dir.close();
}
public void testAddIndexes(boolean withDeletes, boolean useReaders) throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc1 = newIndexWriterConfig();
Sort indexSort = new Sort(new SortField("foo", SortField.Type.LONG), new SortField("bar", SortField.Type.LONG));
iwc1.setIndexSort(indexSort);
RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc1);
final int numDocs = atLeast(100);
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
doc.add(new StringField("id", Integer.toString(i), Store.NO));
doc.add(new NumericDocValuesField("foo", random().nextInt(20)));
doc.add(new NumericDocValuesField("bar", random().nextInt(20)));
w.addDocument(doc);
}
if (withDeletes) {
@ -1848,7 +1882,12 @@ public class TestIndexSorting extends LuceneTestCase {
Directory dir2 = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
iwc.setIndexSort(indexSort);
if (indexSort != null && random().nextBoolean()) {
// test congruent index sort
iwc.setIndexSort(new Sort(new SortField("foo", SortField.Type.LONG)));
} else {
iwc.setIndexSort(indexSort);
}
IndexWriter w2 = new IndexWriter(dir2, iwc);
if (useReaders) {