From 679b4aa71d205ac58621f6b2bad64637f6bd7d67 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Wed, 1 Aug 2018 18:28:51 +0200 Subject: [PATCH] LUCENE-8441: IndexWriter now checks doc value type of index sort fields and fails the document if they are not compatible. --- lucene/CHANGES.txt | 3 ++ .../lucene/index/DefaultIndexingChain.java | 49 +++++++++++++++++++ .../index/SortingStoredFieldsConsumer.java | 6 ++- .../index/SortingTermVectorsConsumer.java | 6 ++- .../apache/lucene/index/TestIndexSorting.java | 40 +++++++++++++++ 5 files changed, 100 insertions(+), 4 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index fdc8c98fdb3..b76cc6fefea 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -199,6 +199,9 @@ Bug Fixes: * LUCENE-8429: DaciukMihovAutomatonBuilder is no longer prone to stack overflows by enforcing a maximum term length. (Adrien Grand) +* LUCENE-8441: IndexWriter now checks doc value type for index sort fields + and fails the document if they are not compatible. (Jim Ferenczi, Mike McCandless) + Changes in Runtime Behavior: * LUCENE-7976: TieredMergePolicy now respects maxSegmentSizeMB by default when executing diff --git a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java index e55251696e9..d0a697468fc 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java @@ -39,6 +39,8 @@ import org.apache.lucene.document.FieldType; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; +import org.apache.lucene.search.SortedNumericSortField; +import org.apache.lucene.search.SortedSetSortField; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.IOContext; import org.apache.lucene.util.ArrayUtil; @@ -524,6 +526,48 @@ final class DefaultIndexingChain extends DocConsumer { fp.pointValuesWriter.addPackedValue(docState.docID, field.binaryValue()); } + private void validateIndexSortDVType(Sort indexSort, String fieldName, DocValuesType dvType) { + for (SortField sortField : indexSort.getSort()) { + if (sortField.getField().equals(fieldName)) { + switch (dvType) { + case NUMERIC: + if (sortField.getType().equals(SortField.Type.INT) == false && + sortField.getType().equals(SortField.Type.LONG) == false && + sortField.getType().equals(SortField.Type.FLOAT) == false && + sortField.getType().equals(SortField.Type.DOUBLE) == false) { + throw new IllegalArgumentException("invalid doc value type:" + dvType + " for sortField:" + sortField); + } + break; + + case BINARY: + throw new IllegalArgumentException("invalid doc value type:" + dvType + " for sortField:" + sortField); + + case SORTED: + if (sortField.getType().equals(SortField.Type.STRING) == false) { + throw new IllegalArgumentException("invalid doc value type:" + dvType + " for sortField:" + sortField); + } + break; + + case SORTED_NUMERIC: + if (sortField instanceof SortedNumericSortField == false) { + throw new IllegalArgumentException("invalid doc value type:" + dvType + " for sortField:" + sortField); + } + break; + + case SORTED_SET: + if (sortField instanceof SortedSetSortField == false) { + throw new IllegalArgumentException("invalid doc value type:" + dvType + " for sortField:" + sortField); + } + break; + + default: + throw new IllegalArgumentException("invalid doc value type:" + dvType + " for sortField:" + sortField); + } + break; + } + } + } + /** Called from processDocument to index one field's doc value */ private void indexDocValue(PerField fp, DocValuesType dvType, IndexableField field) throws IOException { @@ -531,7 +575,12 @@ final class DefaultIndexingChain extends DocConsumer { // This is the first time we are seeing this field indexed with doc values, so we // now record the DV type so that any future attempt to (illegally) change // the DV type of this field, will throw an IllegalArgExc: + if (docWriter.getSegmentInfo().getIndexSort() != null) { + final Sort indexSort = docWriter.getSegmentInfo().getIndexSort(); + validateIndexSortDVType(indexSort, fp.fieldInfo.name, dvType); + } fieldInfos.globalFieldNumbers.setDocValuesType(fp.fieldInfo.number, fp.fieldInfo.name, dvType); + } fp.fieldInfo.setDocValuesType(dvType); diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java index e5443b28e3b..97253a5ee35 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java @@ -83,8 +83,10 @@ final class SortingStoredFieldsConsumer extends StoredFieldsConsumer { try { super.abort(); } finally { - IOUtils.deleteFilesIgnoringExceptions(tmpDirectory, - tmpDirectory.getTemporaryFiles().values()); + if (tmpDirectory != null) { + IOUtils.deleteFilesIgnoringExceptions(tmpDirectory, + tmpDirectory.getTemporaryFiles().values()); + } } } diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java index 054ca50aa4e..955dd8aa15b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java @@ -83,8 +83,10 @@ final class SortingTermVectorsConsumer extends TermVectorsConsumer { try { super.abort(); } finally { - IOUtils.deleteFilesIgnoringExceptions(tmpDirectory, - tmpDirectory.getTemporaryFiles().values()); + if (tmpDirectory != null) { + IOUtils.deleteFilesIgnoringExceptions(tmpDirectory, + tmpDirectory.getTemporaryFiles().values()); + } } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 9711a356ce6..674972f9f97 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -84,6 +84,7 @@ import org.apache.lucene.util.NumericUtils; import org.apache.lucene.util.TestUtil; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; +import static org.junit.internal.matchers.StringContains.containsString; public class TestIndexSorting extends LuceneTestCase { static class AssertingNeedsIndexSortCodec extends FilterCodec { @@ -2472,4 +2473,43 @@ public class TestIndexSorting extends LuceneTestCase { IOUtils.close(r, w, dir); } + public void testWrongSortFieldType() throws Exception { + Directory dir = newDirectory(); + List dvs = new ArrayList<>(); + dvs.add(new SortedDocValuesField("field", new BytesRef(""))); + dvs.add(new SortedSetDocValuesField("field", new BytesRef(""))); + dvs.add(new NumericDocValuesField("field", 42)); + dvs.add(new SortedNumericDocValuesField("field", 42)); + + List sortFields = new ArrayList<>(); + sortFields.add(new SortField("field", SortField.Type.STRING)); + sortFields.add(new SortedSetSortField("field", false)); + sortFields.add(new SortField("field", SortField.Type.INT)); + sortFields.add(new SortedNumericSortField("field", SortField.Type.INT)); + + for (int i = 0; i < sortFields.size(); i++) { + for (int j = 0; j < dvs.size(); j++) { + if (i == j) { + continue; + } + Sort indexSort = new Sort(sortFields.get(i)); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(dvs.get(j)); + IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> w.addDocument(doc)); + assertThat(exc.getMessage(), containsString("invalid doc value type")); + doc.clear(); + doc.add(dvs.get(i)); + w.addDocument(doc); + doc.add(dvs.get(j)); + exc = expectThrows(IllegalArgumentException.class, () -> w.addDocument(doc)); + assertThat(exc.getMessage(), containsString("cannot change DocValues type")); + w.rollback(); + IOUtils.close(w); + } + } + IOUtils.close(dir); + } }