From 54fa7dfd271f7fe3bc8da2cfcd165e21e76e1804 Mon Sep 17 00:00:00 2001
From: Mike McCandless
Date: Wed, 4 May 2016 05:37:37 -0400
Subject: [PATCH 01/16] LUCENE-6766: initial patch

---
 .../simpletext/SimpleTextFieldsWriter.java | 2 +
 .../SimpleTextSegmentInfoFormat.java | 144 +++-
 .../SimpleTextStoredFieldsWriter.java | 1 -
 .../lucene/codecs/DocValuesConsumer.java | 512 +++++------
 .../apache/lucene/codecs/PointsWriter.java | 3 +-
 .../lucene/codecs/StoredFieldsWriter.java | 60 +-
 .../lucene/codecs/TermVectorsWriter.java | 78 +-
 .../lucene50/Lucene50SegmentInfoFormat.java | 7 +-
 .../lucene/codecs/lucene60/Lucene60Codec.java | 2 +
 .../codecs/lucene60/Lucene60PointsWriter.java | 4 +-
 .../org/apache/lucene/index/CheckIndex.java | 3 +
 .../org/apache/lucene/index/DocIDMerger.java | 173 ++++
 .../index/DocumentsWriterPerThread.java | 2 +-
 .../lucene/index/FilterCodecReader.java | 6 +
 .../apache/lucene/index/FilterLeafReader.java | 7 +
 .../org/apache/lucene/index/IndexWriter.java | 109 +--
 .../lucene/index/IndexWriterConfig.java | 26 +-
 .../org/apache/lucene/index/LeafReader.java | 4 +
 .../lucene/index/LiveIndexWriterConfig.java | 13 +
 .../index/MappingMultiPostingsEnum.java | 121 ++-
 .../org/apache/lucene/index/MergePolicy.java | 61 +-
 .../lucene/index/MergeReaderWrapper.java | 11 +-
 .../org/apache/lucene/index/MergeState.java | 226 ++---
 .../org/apache/lucene/index/MultiFields.java | 2 +
 .../lucene/index/MultiPostingsEnum.java | 7 +-
 .../org/apache/lucene/index/MultiSorter.java | 221 +++++
 .../lucene/index/ParallelLeafReader.java | 18 +
 .../org/apache/lucene/index/SegmentInfo.java | 25 +-
 .../apache/lucene/index/SegmentReader.java | 6 +
 .../lucene/index/SlowCodecReaderWrapper.java | 11 +
 .../java/org/apache/lucene/index/Sorter.java | 4 +-
 .../lucene/index/SortingLeafReader.java | 46 +-
 .../java/org/apache/lucene/search/Sort.java | 3 +
 .../org/apache/lucene/util/bkd/BKDWriter.java | 16 +-
 .../org/apache/lucene/index/TestCodecs.java | 4 +-
 .../index/TestDemoParallelLeafReader.java | 38 +-
 .../test/org/apache/lucene/index/TestDoc.java | 2 +-
 .../apache/lucene/index/TestDocIDMerger.java | 179 ++++
 .../apache/lucene/index/TestIndexSorting.java | 792 ++++++++++++++++++
 .../apache/lucene/index/TestIndexWriter.java | 3 +
 .../apache/lucene/index/TestSegmentInfos.java | 6 +-
 .../lucene/index/TestSegmentMerger.java | 24 +-
 .../org/apache/lucene/util/bkd/TestBKD.java | 23 +-
 .../apache/lucene/index/IndexSplitter.java | 2 +-
 .../index/SlowCompositeReaderWrapper.java | 11 +
 .../lucene/index/SortingMergePolicy.java | 264 ------
 .../search/BlockJoinComparatorSource.java | 5 +-
 .../EarlyTerminatingSortingCollector.java | 46 +-
 .../apache/lucene/index/IndexSortingTest.java | 89 --
 .../apache/lucene/index/SorterTestBase.java | 405 ---------
 .../lucene/index/SortingLeafReaderTest.java | 73 --
 .../lucene/index/TestSortingMergePolicy.java | 201 -----
 .../TestDiversifiedTopDocsCollector.java | 4 +-
 .../TestEarlyTerminatingSortingCollector.java | 74 +-
 .../index/BaseCompoundFormatTestCase.java | 2 +-
 .../index/BaseFieldInfoFormatTestCase.java | 2 +-
 .../index/BaseIndexFileFormatTestCase.java | 2 +-
 .../index/BaseSegmentInfoFormatTestCase.java | 71 +-
 .../lucene/index/MockRandomMergePolicy.java | 49 +-
 .../lucene/index/RandomPostingsTester.java | 2 +-
 .../org/apache/lucene/search/QueryUtils.java | 5 +
 61 files changed, 2478 insertions(+), 1834 deletions(-)
 create mode 100644 lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java
 rename lucene/{misc => core}/src/java/org/apache/lucene/index/MergeReaderWrapper.java (96%)
 create mode 100644 lucene/core/src/java/org/apache/lucene/index/MultiSorter.java
 rename lucene/{misc => core}/src/java/org/apache/lucene/index/Sorter.java (99%)
 rename lucene/{misc => core}/src/java/org/apache/lucene/index/SortingLeafReader.java (96%)
 create mode 100644 lucene/core/src/test/org/apache/lucene/index/TestDocIDMerger.java
 create mode 100644 lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
 delete mode 100644 lucene/misc/src/java/org/apache/lucene/index/SortingMergePolicy.java
 delete mode 100644 lucene/misc/src/test/org/apache/lucene/index/IndexSortingTest.java
 delete mode 100644 lucene/misc/src/test/org/apache/lucene/index/SorterTestBase.java
 delete mode 100644 lucene/misc/src/test/org/apache/lucene/index/SortingLeafReaderTest.java
 delete mode 100644 lucene/misc/src/test/org/apache/lucene/index/TestSortingMergePolicy.java

diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsWriter.java
index 3b026bedacd..20235528dca 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsWriter.java
@@ -36,6 +36,7 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
   private IndexOutput out;
   private final BytesRefBuilder scratch = new BytesRefBuilder();
   private final SegmentWriteState writeState;
+  final String segment;
 
   final static BytesRef END = new BytesRef("END");
   final static BytesRef FIELD = new BytesRef("field ");
@@ -49,6 +50,7 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
 
   public SimpleTextFieldsWriter(SegmentWriteState writeState) throws IOException {
     final String fileName = SimpleTextPostingsFormat.getPostingsFileName(writeState.segmentInfo.name, writeState.segmentSuffix);
+    segment = writeState.segmentInfo.name;
     out = writeState.directory.createOutput(fileName, writeState.context);
     this.writeState = writeState;
   }

diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java
index 0823a888040..594fccf2472 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java
@@ -31,6 +31,8 @@ import org.apache.lucene.codecs.SegmentInfoFormat;
 import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
 import org.apache.lucene.store.ChecksumIndexInput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
@@ -59,6 +61,11 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
   final static BytesRef SI_NUM_FILES = new BytesRef(" files ");
   final static BytesRef SI_FILE = new BytesRef(" file ");
   final static BytesRef SI_ID = new BytesRef(" id ");
+  final static BytesRef SI_SORT = new BytesRef(" sort ");
+  final static BytesRef SI_SORT_FIELD = new BytesRef(" field ");
+  final static BytesRef SI_SORT_TYPE = new BytesRef(" type ");
+  final static BytesRef SI_SORT_REVERSE = new BytesRef(" reverse ");
+  final static BytesRef SI_SORT_MISSING = new BytesRef(" missing ");
 
   public static final String SI_EXTENSION = "si";
 
@@ -137,10 +144,93 @@
public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { + ", got: " + StringHelper.idToString(id), input); } + SimpleTextUtil.readLine(input, scratch); + assert StringHelper.startsWith(scratch.get(), SI_SORT); + final int numSortFields = Integer.parseInt(readString(SI_SORT.length, scratch)); + SortField[] sortField = new SortField[numSortFields]; + for (int i = 0; i < numSortFields; ++i) { + SimpleTextUtil.readLine(input, scratch); + assert StringHelper.startsWith(scratch.get(), SI_SORT_FIELD); + final String field = readString(SI_SORT_FIELD.length, scratch); + + SimpleTextUtil.readLine(input, scratch); + assert StringHelper.startsWith(scratch.get(), SI_SORT_TYPE); + final String typeAsString = readString(SI_SORT_TYPE.length, scratch); + + final SortField.Type type; + switch (typeAsString) { + case "string": + type = SortField.Type.STRING; + break; + case "long": + type = SortField.Type.LONG; + break; + case "int": + type = SortField.Type.INT; + break; + default: + throw new CorruptIndexException("unable to parse sort type string: " + typeAsString, input); + } + + SimpleTextUtil.readLine(input, scratch); + assert StringHelper.startsWith(scratch.get(), SI_SORT_REVERSE); + final boolean reverse = Boolean.parseBoolean(readString(SI_SORT_REVERSE.length, scratch)); + + SimpleTextUtil.readLine(input, scratch); + assert StringHelper.startsWith(scratch.get(), SI_SORT_MISSING); + final String missingLastAsString = readString(SI_SORT_MISSING.length, scratch); + final Object missingValue; + switch (type) { + case STRING: + switch (missingLastAsString) { + case "null": + missingValue = null; + break; + case "first": + missingValue = SortField.STRING_FIRST; + break; + case "last": + missingValue = SortField.STRING_LAST; + break; + default: + throw new CorruptIndexException("unable to parse missing string: " + typeAsString, input); + } + break; + case LONG: + switch (missingLastAsString) { + case "null": + missingValue = null; + break; + default: + missingValue = Long.parseLong(missingLastAsString); + break; + } + break; + case INT: + switch (missingLastAsString) { + case "null": + missingValue = null; + break; + default: + missingValue = Integer.parseInt(missingLastAsString); + break; + } + break; + default: + throw new AssertionError(); + } + sortField[i] = new SortField(field, type, reverse); + if (missingValue != null) { + sortField[i].setMissingValue(missingValue); + } + } + Sort indexSort = sortField.length == 0 ? null : new Sort(sortField); + SimpleTextUtil.checkFooter(input); SegmentInfo info = new SegmentInfo(directory, version, segmentName, docCount, - isCompoundFile, null, Collections.unmodifiableMap(diagnostics), id, Collections.unmodifiableMap(attributes)); + isCompoundFile, null, Collections.unmodifiableMap(diagnostics), + id, Collections.unmodifiableMap(attributes), indexSort); info.setFiles(files); return info; } @@ -223,6 +313,58 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { SimpleTextUtil.write(output, new BytesRef(si.getId())); SimpleTextUtil.writeNewline(output); + Sort indexSort = si.getIndexSort(); + SimpleTextUtil.write(output, SI_SORT); + final int numSortFields = indexSort == null ? 
0 : indexSort.getSort().length; + SimpleTextUtil.write(output, Integer.toString(numSortFields), scratch); + SimpleTextUtil.writeNewline(output); + for (int i = 0; i < numSortFields; ++i) { + final SortField sortField = indexSort.getSort()[i]; + + SimpleTextUtil.write(output, SI_SORT_FIELD); + SimpleTextUtil.write(output, sortField.getField(), scratch); + SimpleTextUtil.writeNewline(output); + + SimpleTextUtil.write(output, SI_SORT_TYPE); + final String sortType; + switch (sortField.getType()) { + case STRING: + sortType = "string"; + break; + case LONG: + sortType = "long"; + break; + case INT: + sortType = "int"; + break; + default: + throw new IllegalStateException("Unexpected sort type: " + sortField.getType()); + } + SimpleTextUtil.write(output, sortType, scratch); + SimpleTextUtil.writeNewline(output); + + SimpleTextUtil.write(output, SI_SORT_REVERSE); + SimpleTextUtil.write(output, Boolean.toString(sortField.getReverse()), scratch); + SimpleTextUtil.writeNewline(output); + + SimpleTextUtil.write(output, SI_SORT_MISSING); + final Object missingValue = sortField.getMissingValue(); + final String missing; + if (missingValue == null) { + missing = "null"; + } else if (missingValue == SortField.STRING_FIRST) { + missing = "first"; + } else if (missingValue == SortField.STRING_LAST) { + missing = "last"; + } else if (missingValue instanceof Long) { + missing = Long.toString((Long) missingValue); + } else { + throw new IllegalStateException("Unexpected missing sort value: " + missingValue); + } + SimpleTextUtil.write(output, missing, scratch); + SimpleTextUtil.writeNewline(output); + } + SimpleTextUtil.writeChecksum(output, scratch); } } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsWriter.java index 00259b89711..b59114a65e2 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsWriter.java @@ -143,7 +143,6 @@ public class SimpleTextStoredFieldsWriter extends StoredFieldsWriter { } else { write(TYPE_STRING); newLine(); - write(VALUE); write(field.stringValue()); newLine(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java index 90abf2ad44b..52bf9b2f82d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java @@ -16,7 +16,6 @@ */ package org.apache.lucene.codecs; - import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; @@ -25,12 +24,13 @@ import java.util.List; import java.util.NoSuchElementException; import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DocIDMerger; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FilteredTermsEnum; import org.apache.lucene.index.MergeState; -import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.MultiDocValues.OrdinalMap; -import org.apache.lucene.index.DocValues; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SegmentWriteState; // javadocs import org.apache.lucene.index.SortedDocValues; @@ -44,6 +44,8 @@ import org.apache.lucene.util.LongBitSet; import 
org.apache.lucene.util.LongValues; import org.apache.lucene.util.packed.PackedInts; +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + /** * Abstract API that consumes numeric, binary and * sorted docvalues. Concrete implementations of this @@ -240,6 +242,32 @@ public abstract class DocValuesConsumer implements Closeable { } } } + + /** Tracks state of one numeric sub-reader that we are merging */ + private static class NumericDocValuesSub extends DocIDMerger.Sub { + + private final NumericDocValues values; + private final Bits docsWithField; + private int docID = -1; + private final int maxDoc; + + public NumericDocValuesSub(MergeState.DocMap docMap, Bits liveDocs, NumericDocValues values, Bits docsWithField, int maxDoc) { + super(docMap, liveDocs); + this.values = values; + this.docsWithField = docsWithField; + this.maxDoc = maxDoc; + } + + @Override + public int nextDoc() { + docID++; + if (docID == maxDoc) { + return NO_MORE_DOCS; + } else { + return docID; + } + } + } /** * Merges the numeric docvalues from toMerge. @@ -248,20 +276,23 @@ public abstract class DocValuesConsumer implements Closeable { * an Iterable that merges and filters deleted documents on the fly. */ public void mergeNumericField(final FieldInfo fieldInfo, final MergeState mergeState, final List toMerge, final List docsWithField) throws IOException { - addNumericField(fieldInfo, new Iterable() { @Override public Iterator iterator() { + + // We must make a new DocIDMerger for each iterator: + List subs = new ArrayList<>(); + assert mergeState.docMaps.length == toMerge.size(); + for(int i=0;i docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); + return new Iterator() { - int readerUpto = -1; - int docIDUpto; long nextValue; boolean nextHasValue; - int currentMaxDoc; - NumericDocValues currentValues; - Bits currentLiveDocs; - Bits currentDocsWithField; boolean nextIsSet; @Override @@ -276,7 +307,7 @@ public abstract class DocValuesConsumer implements Closeable { @Override public Number next() { - if (!hasNext()) { + if (hasNext() == false) { throw new NoSuchElementException(); } assert nextIsSet; @@ -286,35 +317,14 @@ public abstract class DocValuesConsumer implements Closeable { private boolean setNext() { while (true) { - if (readerUpto == toMerge.size()) { + NumericDocValuesSub sub = docIDMerger.next(); + if (sub == null) { return false; } - - if (docIDUpto == currentMaxDoc) { - readerUpto++; - if (readerUpto < toMerge.size()) { - currentValues = toMerge.get(readerUpto); - currentDocsWithField = docsWithField.get(readerUpto); - currentLiveDocs = mergeState.liveDocs[readerUpto]; - currentMaxDoc = mergeState.maxDocs[readerUpto]; - } - docIDUpto = 0; - continue; - } - - if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) { - nextIsSet = true; - nextValue = currentValues.get(docIDUpto); - if (nextValue == 0 && currentDocsWithField.get(docIDUpto) == false) { - nextHasValue = false; - } else { - nextHasValue = true; - } - docIDUpto++; - return true; - } - - docIDUpto++; + nextIsSet = true; + nextValue = sub.values.get(sub.docID); + nextHasValue = nextValue != 0 || sub.docsWithField.get(sub.docID); + return true; } } }; @@ -322,6 +332,32 @@ public abstract class DocValuesConsumer implements Closeable { }); } + /** Tracks state of one binary sub-reader that we are merging */ + private static class BinaryDocValuesSub extends DocIDMerger.Sub { + + private final BinaryDocValues values; + private final Bits docsWithField; + private int docID = -1; + private 
final int maxDoc; + + public BinaryDocValuesSub(MergeState.DocMap docMap, Bits liveDocs, BinaryDocValues values, Bits docsWithField, int maxDoc) { + super(docMap, liveDocs); + this.values = values; + this.docsWithField = docsWithField; + this.maxDoc = maxDoc; + } + + @Override + public int nextDoc() { + docID++; + if (docID == maxDoc) { + return NO_MORE_DOCS; + } else { + return docID; + } + } + } + /** * Merges the binary docvalues from toMerge. *

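A sketch of the sub-reader pattern used by every per-field merge in this class, assuming only the DocIDMerger API added by this patch; SimpleSub and its fields are illustrative names, not part of the patch:

    import org.apache.lucene.index.DocIDMerger;
    import org.apache.lucene.index.MergeState;
    import org.apache.lucene.util.Bits;

    import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

    // One cursor per segment being merged; DocIDMerger drains these and
    // handles deletions plus the old->new docID mapping itself.
    class SimpleSub extends DocIDMerger.Sub {
      private final int maxDoc;
      int docID = -1;

      SimpleSub(MergeState.DocMap docMap, Bits liveDocs, int maxDoc) {
        super(docMap, liveDocs);
        this.maxDoc = maxDoc;
      }

      @Override
      public int nextDoc() {
        // Visit every segment-local docID in order; the merger filters.
        docID++;
        return docID == maxDoc ? NO_MORE_DOCS : docID;
      }
    }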
@@ -329,20 +365,23 @@ public abstract class DocValuesConsumer implements Closeable { * an Iterable that merges and filters deleted documents on the fly. */ public void mergeBinaryField(FieldInfo fieldInfo, final MergeState mergeState, final List toMerge, final List docsWithField) throws IOException { - addBinaryField(fieldInfo, new Iterable() { @Override public Iterator iterator() { + + // We must make a new DocIDMerger for each iterator: + List subs = new ArrayList<>(); + assert mergeState.docMaps.length == toMerge.size(); + for(int i=0;i docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); + return new Iterator() { - int readerUpto = -1; - int docIDUpto; BytesRef nextValue; BytesRef nextPointer; // points to null if missing, or nextValue - int currentMaxDoc; - BinaryDocValues currentValues; - Bits currentLiveDocs; - Bits currentDocsWithField; boolean nextIsSet; @Override @@ -357,7 +396,7 @@ public abstract class DocValuesConsumer implements Closeable { @Override public BytesRef next() { - if (!hasNext()) { + if (hasNext() == false) { throw new NoSuchElementException(); } assert nextIsSet; @@ -367,42 +406,49 @@ public abstract class DocValuesConsumer implements Closeable { private boolean setNext() { while (true) { - if (readerUpto == toMerge.size()) { - return false; + BinaryDocValuesSub sub = docIDMerger.next(); + if (sub == null) { + return false; + } + nextIsSet = true; + if (sub.docsWithField.get(sub.docID)) { + nextPointer = nextValue = sub.values.get(sub.docID); + } else { + nextPointer = null; + } + return true; } - - if (docIDUpto == currentMaxDoc) { - readerUpto++; - if (readerUpto < toMerge.size()) { - currentValues = toMerge.get(readerUpto); - currentDocsWithField = docsWithField.get(readerUpto); - currentLiveDocs = mergeState.liveDocs[readerUpto]; - currentMaxDoc = mergeState.maxDocs[readerUpto]; - } - docIDUpto = 0; - continue; - } - - if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) { - nextIsSet = true; - if (currentDocsWithField.get(docIDUpto)) { - nextValue = currentValues.get(docIDUpto); - nextPointer = nextValue; - } else { - nextPointer = null; - } - docIDUpto++; - return true; - } - - docIDUpto++; } - } }; } }); } + /** Tracks state of one sorted numeric sub-reader that we are merging */ + private static class SortedNumericDocValuesSub extends DocIDMerger.Sub { + + private final SortedNumericDocValues values; + private int docID = -1; + private final int maxDoc; + + public SortedNumericDocValuesSub(MergeState.DocMap docMap, Bits liveDocs, SortedNumericDocValues values, int maxDoc) { + super(docMap, liveDocs); + this.values = values; + this.maxDoc = maxDoc; + } + + @Override + public int nextDoc() { + docID++; + if (docID == maxDoc) { + return NO_MORE_DOCS; + } else { + values.setDocument(docID); + return docID; + } + } + } + /** * Merges the sorted docvalues from toMerge. *

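The matching consumption loop, sketched with subs, indexSort and consume standing in for the caller's state; this is how each iterator above and below drains its DocIDMerger:

    // Subs come back ordered by mapped docID when an index sort is set
    // (priority queue), or by simple concatenation when it is not.
    DocIDMerger<SimpleSub> merger = new DocIDMerger<>(subs, indexSort != null);
    while (true) {
      SimpleSub sub = merger.next();   // null once all subs are exhausted
      if (sub == null) {
        break;
      }
      // sub.docID is the segment-local doc; sub.mappedDocID is its docID
      // in the merged (possibly re-sorted) segment.
      consume(sub.docID, sub.mappedDocID);
    }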
@@ -410,21 +456,24 @@ public abstract class DocValuesConsumer implements Closeable { * iterables that filter deleted documents. */ public void mergeSortedNumericField(FieldInfo fieldInfo, final MergeState mergeState, List toMerge) throws IOException { - final int numReaders = toMerge.size(); - final SortedNumericDocValues dvs[] = toMerge.toArray(new SortedNumericDocValues[numReaders]); - // step 3: add field addSortedNumericField(fieldInfo, // doc -> value count new Iterable() { @Override public Iterator iterator() { + + // We must make a new DocIDMerger for each iterator: + List subs = new ArrayList<>(); + assert mergeState.docMaps.length == toMerge.size(); + for(int i=0;i docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); + return new Iterator() { - int readerUpto = -1; - int docIDUpto; int nextValue; - int currentMaxDoc; - Bits currentLiveDocs; boolean nextIsSet; @Override @@ -439,7 +488,7 @@ public abstract class DocValuesConsumer implements Closeable { @Override public Number next() { - if (!hasNext()) { + if (hasNext() == false) { throw new NoSuchElementException(); } assert nextIsSet; @@ -449,30 +498,13 @@ public abstract class DocValuesConsumer implements Closeable { private boolean setNext() { while (true) { - if (readerUpto == numReaders) { + SortedNumericDocValuesSub sub = docIDMerger.next(); + if (sub == null) { return false; } - - if (docIDUpto == currentMaxDoc) { - readerUpto++; - if (readerUpto < numReaders) { - currentLiveDocs = mergeState.liveDocs[readerUpto]; - currentMaxDoc = mergeState.maxDocs[readerUpto]; - } - docIDUpto = 0; - continue; - } - - if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) { - nextIsSet = true; - SortedNumericDocValues dv = dvs[readerUpto]; - dv.setDocument(docIDUpto); - nextValue = dv.count(); - docIDUpto++; - return true; - } - - docIDUpto++; + nextIsSet = true; + nextValue = sub.values.count(); + return true; } } }; @@ -482,15 +514,21 @@ public abstract class DocValuesConsumer implements Closeable { new Iterable() { @Override public Iterator iterator() { + // We must make a new DocIDMerger for each iterator: + List subs = new ArrayList<>(); + assert mergeState.docMaps.length == toMerge.size(); + for(int i=0;i docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); + return new Iterator() { - int readerUpto = -1; - int docIDUpto; long nextValue; - int currentMaxDoc; - Bits currentLiveDocs; boolean nextIsSet; int valueUpto; int valueLength; + SortedNumericDocValuesSub current; @Override public boolean hasNext() { @@ -504,7 +542,7 @@ public abstract class DocValuesConsumer implements Closeable { @Override public Number next() { - if (!hasNext()) { + if (hasNext() == false) { throw new NoSuchElementException(); } assert nextIsSet; @@ -514,38 +552,21 @@ public abstract class DocValuesConsumer implements Closeable { private boolean setNext() { while (true) { - if (readerUpto == numReaders) { - return false; - } if (valueUpto < valueLength) { - nextValue = dvs[readerUpto].valueAt(valueUpto); + nextValue = current.values.valueAt(valueUpto); valueUpto++; nextIsSet = true; return true; } - if (docIDUpto == currentMaxDoc) { - readerUpto++; - if (readerUpto < numReaders) { - currentLiveDocs = mergeState.liveDocs[readerUpto]; - currentMaxDoc = mergeState.maxDocs[readerUpto]; - } - docIDUpto = 0; - continue; + current = docIDMerger.next(); + if (current == null) { + return false; } - - if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) { - assert docIDUpto < 
currentMaxDoc; - SortedNumericDocValues dv = dvs[readerUpto]; - dv.setDocument(docIDUpto); - valueUpto = 0; - valueLength = dv.count(); - docIDUpto++; - continue; - } - - docIDUpto++; + valueUpto = 0; + valueLength = current.values.count(); + continue; } } }; @@ -554,6 +575,32 @@ public abstract class DocValuesConsumer implements Closeable { ); } + /** Tracks state of one sorted sub-reader that we are merging */ + private static class SortedDocValuesSub extends DocIDMerger.Sub { + + private final SortedDocValues values; + private int docID = -1; + private final int maxDoc; + private final LongValues map; + + public SortedDocValuesSub(MergeState.DocMap docMap, Bits liveDocs, SortedDocValues values, int maxDoc, LongValues map) { + super(docMap, liveDocs); + this.values = values; + this.maxDoc = maxDoc; + this.map = map; + } + + @Override + public int nextDoc() { + docID++; + if (docID == maxDoc) { + return NO_MORE_DOCS; + } else { + return docID; + } + } + } + /** * Merges the sorted docvalues from toMerge. *

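SortedDocValuesSub above carries the LongValues view from OrdinalMap.getGlobalOrds(segment), so the ords iterator in the next hunk translates segment-local ordinals like this (sub is one such sub-reader):

    int segOrd = sub.values.getOrd(sub.docID);  // ordinal within one segment, -1 if missing
    int globalOrd = segOrd == -1 ? -1 : (int) sub.map.get(segOrd);  // ordinal across the whole merge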
@@ -608,7 +655,7 @@ public abstract class DocValuesConsumer implements Closeable { @Override public BytesRef next() { - if (!hasNext()) { + if (hasNext() == false) { throw new NoSuchElementException(); } int segmentNumber = map.getFirstSegmentNumber(currentOrd); @@ -629,13 +676,17 @@ public abstract class DocValuesConsumer implements Closeable { new Iterable() { @Override public Iterator iterator() { + // We must make a new DocIDMerger for each iterator: + List subs = new ArrayList<>(); + assert mergeState.docMaps.length == toMerge.size(); + for(int i=0;i docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); + return new Iterator() { - int readerUpto = -1; - int docIDUpto; int nextValue; - int currentMaxDoc; - Bits currentLiveDocs; - LongValues currentMap; boolean nextIsSet; @Override @@ -650,7 +701,7 @@ public abstract class DocValuesConsumer implements Closeable { @Override public Number next() { - if (!hasNext()) { + if (hasNext() == false) { throw new NoSuchElementException(); } assert nextIsSet; @@ -661,30 +712,15 @@ public abstract class DocValuesConsumer implements Closeable { private boolean setNext() { while (true) { - if (readerUpto == numReaders) { + SortedDocValuesSub sub = docIDMerger.next(); + if (sub == null) { return false; } - if (docIDUpto == currentMaxDoc) { - readerUpto++; - if (readerUpto < numReaders) { - currentMap = map.getGlobalOrds(readerUpto); - currentLiveDocs = mergeState.liveDocs[readerUpto]; - currentMaxDoc = mergeState.maxDocs[readerUpto]; - } - docIDUpto = 0; - continue; - } - - if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) { - nextIsSet = true; - int segOrd = dvs[readerUpto].getOrd(docIDUpto); - nextValue = segOrd == -1 ? -1 : (int) currentMap.get(segOrd); - docIDUpto++; - return true; - } - - docIDUpto++; + nextIsSet = true; + int segOrd = sub.values.getOrd(sub.docID); + nextValue = segOrd == -1 ? -1 : (int) sub.map.get(segOrd); + return true; } } }; @@ -693,6 +729,37 @@ public abstract class DocValuesConsumer implements Closeable { ); } + /** Tracks state of one sorted set sub-reader that we are merging */ + private static class SortedSetDocValuesSub extends DocIDMerger.Sub { + + private final SortedSetDocValues values; + int docID = -1; + private final int maxDoc; + private final LongValues map; + + public SortedSetDocValuesSub(MergeState.DocMap docMap, Bits liveDocs, SortedSetDocValues values, int maxDoc, LongValues map) { + super(docMap, liveDocs); + this.values = values; + this.maxDoc = maxDoc; + this.map = map; + } + + @Override + public int nextDoc() { + docID++; + if (docID == maxDoc) { + return NO_MORE_DOCS; + } else { + return docID; + } + } + + @Override + public String toString() { + return "SortedSetDocValuesSub(docID=" + docID + " mappedDocID=" + mappedDocID + " values=" + values + ")"; + } + } + /** * Merges the sortedset docvalues from toMerge. *

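SORTED_SET values are consumed one document at a time with setDocument/nextOrd, as the ord-count iterator in the following hunk does; in miniature:

    sub.values.setDocument(sub.docID);  // position on one document
    int ordCount = 0;
    while (sub.values.nextOrd() != SortedSetDocValues.NO_MORE_ORDS) {
      ordCount++;                       // one ord per distinct set member
    }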
@@ -700,14 +767,12 @@ public abstract class DocValuesConsumer implements Closeable { * an Iterable that merges ordinals and values and filters deleted documents . */ public void mergeSortedSetField(FieldInfo fieldInfo, final MergeState mergeState, List toMerge) throws IOException { - final SortedSetDocValues dvs[] = toMerge.toArray(new SortedSetDocValues[toMerge.size()]); - final int numReaders = mergeState.maxDocs.length; // step 1: iterate thru each sub and mark terms still in use - TermsEnum liveTerms[] = new TermsEnum[dvs.length]; + TermsEnum liveTerms[] = new TermsEnum[toMerge.size()]; long[] weights = new long[liveTerms.length]; for (int sub = 0; sub < liveTerms.length; sub++) { - SortedSetDocValues dv = dvs[sub]; + SortedSetDocValues dv = toMerge.get(sub); Bits liveDocs = mergeState.liveDocs[sub]; int maxDoc = mergeState.maxDocs[sub]; if (liveDocs == null) { @@ -748,12 +813,12 @@ public abstract class DocValuesConsumer implements Closeable { @Override public BytesRef next() { - if (!hasNext()) { + if (hasNext() == false) { throw new NoSuchElementException(); } int segmentNumber = map.getFirstSegmentNumber(currentOrd); long segmentOrd = map.getFirstSegmentOrd(currentOrd); - final BytesRef term = dvs[segmentNumber].lookupOrd(segmentOrd); + final BytesRef term = toMerge.get(segmentNumber).lookupOrd(segmentOrd); currentOrd++; return term; } @@ -769,12 +834,18 @@ public abstract class DocValuesConsumer implements Closeable { new Iterable() { @Override public Iterator iterator() { + + // We must make a new DocIDMerger for each iterator: + List subs = new ArrayList<>(); + assert mergeState.docMaps.length == toMerge.size(); + for(int i=0;i docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); + return new Iterator() { - int readerUpto = -1; - int docIDUpto; int nextValue; - int currentMaxDoc; - Bits currentLiveDocs; boolean nextIsSet; @Override @@ -789,7 +860,7 @@ public abstract class DocValuesConsumer implements Closeable { @Override public Number next() { - if (!hasNext()) { + if (hasNext() == false) { throw new NoSuchElementException(); } assert nextIsSet; @@ -800,33 +871,18 @@ public abstract class DocValuesConsumer implements Closeable { private boolean setNext() { while (true) { - if (readerUpto == numReaders) { + SortedSetDocValuesSub sub = docIDMerger.next(); + if (sub == null) { return false; } - - if (docIDUpto == currentMaxDoc) { - readerUpto++; - if (readerUpto < numReaders) { - currentLiveDocs = mergeState.liveDocs[readerUpto]; - currentMaxDoc = mergeState.maxDocs[readerUpto]; - } - docIDUpto = 0; - continue; + sub.values.setDocument(sub.docID); + nextValue = 0; + while (sub.values.nextOrd() != SortedSetDocValues.NO_MORE_ORDS) { + nextValue++; } - - if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) { - nextIsSet = true; - SortedSetDocValues dv = dvs[readerUpto]; - dv.setDocument(docIDUpto); - nextValue = 0; - while (dv.nextOrd() != SortedSetDocValues.NO_MORE_ORDS) { - nextValue++; - } - docIDUpto++; - return true; - } - - docIDUpto++; + //System.out.println(" doc " + sub + " -> ord count = " + nextValue); + nextIsSet = true; + return true; } } }; @@ -836,13 +892,18 @@ public abstract class DocValuesConsumer implements Closeable { new Iterable() { @Override public Iterator iterator() { + + // We must make a new DocIDMerger for each iterator: + List subs = new ArrayList<>(); + assert mergeState.docMaps.length == toMerge.size(); + for(int i=0;i docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); + 
return new Iterator() { - int readerUpto = -1; - int docIDUpto; long nextValue; - int currentMaxDoc; - Bits currentLiveDocs; - LongValues currentMap; boolean nextIsSet; long ords[] = new long[8]; int ordUpto; @@ -860,7 +921,7 @@ public abstract class DocValuesConsumer implements Closeable { @Override public Number next() { - if (!hasNext()) { + if (hasNext() == false) { throw new NoSuchElementException(); } assert nextIsSet; @@ -871,10 +932,6 @@ public abstract class DocValuesConsumer implements Closeable { private boolean setNext() { while (true) { - if (readerUpto == numReaders) { - return false; - } - if (ordUpto < ordLength) { nextValue = ords[ordUpto]; ordUpto++; @@ -882,35 +939,22 @@ public abstract class DocValuesConsumer implements Closeable { return true; } - if (docIDUpto == currentMaxDoc) { - readerUpto++; - if (readerUpto < numReaders) { - currentMap = map.getGlobalOrds(readerUpto); - currentLiveDocs = mergeState.liveDocs[readerUpto]; - currentMaxDoc = mergeState.maxDocs[readerUpto]; - } - docIDUpto = 0; - continue; - } - - if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) { - assert docIDUpto < currentMaxDoc; - SortedSetDocValues dv = dvs[readerUpto]; - dv.setDocument(docIDUpto); - ordUpto = ordLength = 0; - long ord; - while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { - if (ordLength == ords.length) { - ords = ArrayUtil.grow(ords, ordLength+1); - } - ords[ordLength] = currentMap.get(ord); - ordLength++; - } - docIDUpto++; - continue; + SortedSetDocValuesSub sub = docIDMerger.next(); + if (sub == null) { + return false; } + sub.values.setDocument(sub.docID); - docIDUpto++; + ordUpto = ordLength = 0; + long ord; + while ((ord = sub.values.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { + if (ordLength == ords.length) { + ords = ArrayUtil.grow(ords, ordLength+1); + } + ords[ordLength] = sub.map.get(ord); + ordLength++; + } + continue; } } }; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java index 43b4416fedb..05084db6ca1 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java @@ -76,7 +76,6 @@ public abstract class PointsWriter implements Closeable { } MergeState.DocMap docMap = mergeState.docMaps[i]; - int docBase = mergeState.docBase[i]; pointsReader.intersect(fieldInfo.name, new IntersectVisitor() { @Override @@ -90,7 +89,7 @@ public abstract class PointsWriter implements Closeable { int newDocID = docMap.get(docID); if (newDocID != -1) { // Not deleted: - mergedVisitor.visit(docBase + newDocID, packedValue); + mergedVisitor.visit(newDocID, packedValue); } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java index b8cff117e5f..b76247d7a89 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java @@ -20,10 +20,13 @@ import java.io.Closeable; import java.io.IOException; import java.io.Reader; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.StoredField; +import org.apache.lucene.index.DocIDMerger; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import 
org.apache.lucene.index.IndexableField; @@ -33,6 +36,8 @@ import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + /** * Codec API for writing stored fields: *

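The write contract this javadoc describes, and which the new DocIDMerger-based merge below still follows (just in mapped-docID order), is one startDocument/finishDocument pair per document; a hedged sketch, with writer, fieldInfos, docs and numDocs supplied by the caller:

    for (Iterable<IndexableField> doc : docs) {
      writer.startDocument();                        // begin one document
      for (IndexableField field : doc) {
        writer.writeField(fieldInfos.fieldInfo(field.name()), field);
      }
      writer.finishDocument();                       // all fields written
    }
    writer.finish(fieldInfos, numDocs);              // verifies the doc count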
    @@ -73,6 +78,30 @@ public abstract class StoredFieldsWriter implements Closeable { * check that this is the case to detect the JRE bug described * in LUCENE-1282. */ public abstract void finish(FieldInfos fis, int numDocs) throws IOException; + + private static class StoredFieldsMergeSub extends DocIDMerger.Sub { + private final StoredFieldsReader reader; + private final int maxDoc; + private final MergeVisitor visitor; + int docID = -1; + + public StoredFieldsMergeSub(MergeVisitor visitor, MergeState.DocMap docMap, Bits liveDocs, StoredFieldsReader reader, int maxDoc) { + super(docMap, liveDocs); + this.maxDoc = maxDoc; + this.reader = reader; + this.visitor = visitor; + } + + @Override + public int nextDoc() { + docID++; + if (docID == maxDoc) { + return NO_MORE_DOCS; + } else { + return docID; + } + } + } /** Merges in the stored fields from the readers in * mergeState. The default implementation skips @@ -82,23 +111,26 @@ public abstract class StoredFieldsWriter implements Closeable { * Implementations can override this method for more sophisticated * merging (bulk-byte copying, etc). */ public int merge(MergeState mergeState) throws IOException { - int docCount = 0; - for (int i=0;i subs = new ArrayList<>(); + for(int i=0;i docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); + + int docCount = 0; + while (true) { + StoredFieldsMergeSub sub = docIDMerger.next(); + if (sub == null) { + break; } + assert sub.mappedDocID == docCount; + startDocument(); + sub.reader.visitDocument(sub.docID, sub.visitor); + finishDocument(); + docCount++; } finish(mergeState.mergeFieldInfos, docCount); return docCount; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java index 1aff7379d37..6ab115745b1 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java @@ -16,16 +16,18 @@ */ package org.apache.lucene.codecs; - import java.io.Closeable; import java.io.IOException; +import java.util.ArrayList; import java.util.Iterator; +import java.util.List; -import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.DocIDMerger; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Fields; import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.DocIdSetIterator; @@ -34,6 +36,8 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + /** * Codec API for writing term vectors: *
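The rewritten TermVectorsWriter.merge in the next hunk reduces to the loop below; a sketch assuming a DocIDMerger built over the TermVectorsMergeSub class defined there:

    int docCount = 0;
    while (true) {
      TermVectorsMergeSub sub = docIDMerger.next();
      if (sub == null) {
        break;                 // every live doc from every sub was copied
      }
      // reader can be null for segments that indexed no term vectors.
      Fields vectors = sub.reader == null ? null : sub.reader.get(sub.docID);
      addAllDocVectors(vectors, mergeState);
      docCount++;
    }
    finish(mergeState.mergeFieldInfos, docCount);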
      @@ -160,6 +164,28 @@ public abstract class TermVectorsWriter implements Closeable { } } + private static class TermVectorsMergeSub extends DocIDMerger.Sub { + private final TermVectorsReader reader; + private final int maxDoc; + int docID = -1; + + public TermVectorsMergeSub(MergeState.DocMap docMap, Bits liveDocs, TermVectorsReader reader, int maxDoc) { + super(docMap, liveDocs); + this.maxDoc = maxDoc; + this.reader = reader; + } + + @Override + public int nextDoc() { + docID++; + if (docID == maxDoc) { + return NO_MORE_DOCS; + } else { + return docID; + } + } + } + /** Merges in the term vectors from the readers in * mergeState. The default implementation skips * over deleted documents, and uses {@link #startDocument(int)}, @@ -170,32 +196,36 @@ public abstract class TermVectorsWriter implements Closeable { * Implementations can override this method for more sophisticated * merging (bulk-byte copying, etc). */ public int merge(MergeState mergeState) throws IOException { + + List subs = new ArrayList<>(); + for(int i=0;i docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); + int docCount = 0; - int numReaders = mergeState.maxDocs.length; - for (int i = 0; i < numReaders; i++) { - int maxDoc = mergeState.maxDocs[i]; - Bits liveDocs = mergeState.liveDocs[i]; - TermVectorsReader termVectorsReader = mergeState.termVectorsReaders[i]; - if (termVectorsReader != null) { - termVectorsReader.checkIntegrity(); + while (true) { + TermVectorsMergeSub sub = docIDMerger.next(); + if (sub == null) { + break; } - for (int docID=0;docID bkdReaders = new ArrayList<>(); List docMaps = new ArrayList<>(); - List docIDBases = new ArrayList<>(); for(int i=0;i { + + private final List subs; + + // Used when indexSort != null: + private final PriorityQueue queue; + private boolean first; + + // Used when indexIsSorted + private T current; + private int nextIndex; + + public static abstract class Sub { + public int mappedDocID; + final MergeState.DocMap docMap; + final Bits liveDocs; + + // nocommit isn't liveDocs redundant? docMap returns -1 for us? + public Sub(MergeState.DocMap docMap, Bits liveDocs) { + this.docMap = docMap; + this.liveDocs = liveDocs; + } + + /** Returns the next document ID from this sub reader, and {@link DocIdSetIterator#NO_MORE_DOCS} when done */ + public abstract int nextDoc(); + } + + public DocIDMerger(List subs, int maxCount, boolean indexIsSorted) { + this.subs = subs; + + if (indexIsSorted) { + queue = new PriorityQueue(maxCount) { + @Override + protected boolean lessThan(Sub a, Sub b) { + assert a.mappedDocID != b.mappedDocID; + return a.mappedDocID < b.mappedDocID; + } + }; + } else { + // We simply concatentate + queue = null; + } + + reset(); + } + + // nocommit it's awkward that we must pass in this boolean, when the subs should "know" this based on what docMap they have? + public DocIDMerger(List subs, boolean indexIsSorted) { + this(subs, subs.size(), indexIsSorted); + } + + /** Reuse API, currently only used by postings during merge */ + public void reset() { + if (queue != null) { + assert queue.size() == 0; + for(T sub : subs) { + while (true) { + int docID = sub.nextDoc(); + if (docID == NO_MORE_DOCS) { + // all docs in this sub were deleted; do not add it to the queue! + break; + } else if (sub.liveDocs != null && sub.liveDocs.get(docID) == false) { + // nocommit is it sub's job to skip deleted docs? 
+ continue; + } else { + sub.mappedDocID = sub.docMap.get(docID); + assert sub.mappedDocID != -1; + queue.add(sub); + break; + } + } + } + first = true; + } else { + if (subs.size() > 0) { + current = subs.get(0); + nextIndex = 1; + } else { + current = null; + nextIndex = 0; + } + } + } + + /** Returns null when done */ + public T next() { + // Loop until we find a non-deleted document + if (queue != null) { + T top = queue.top(); + if (top == null) { + // NOTE: it's annoying that caller is allowed to call us again even after we returned null before + return null; + } + + if (first == false) { + while (true) { + int docID = top.nextDoc(); + if (docID == NO_MORE_DOCS) { + queue.pop(); + top = queue.top(); + break; + } else if (top.liveDocs != null && top.liveDocs.get(docID) == false) { + continue; + } else { + top.mappedDocID = top.docMap.get(docID); + top = queue.updateTop(); + break; + } + } + } + + first = false; + + return top; + + } else { + while (true) { + if (current == null) { + // NOTE: it's annoying that caller is allowed to call us again even after we returned null before + return null; + } + int docID = current.nextDoc(); + if (docID == NO_MORE_DOCS) { + if (nextIndex == subs.size()) { + current = null; + return null; + } + current = subs.get(nextIndex); + nextIndex++; + continue; + } else if (current.liveDocs != null && current.liveDocs.get(docID) == false) { + // Document is deleted + continue; + } + + current.mappedDocID = current.docMap.get(docID); + return current; + } + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index 65d6a144e74..3e8a2270297 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -178,7 +178,7 @@ class DocumentsWriterPerThread { pendingUpdates.clear(); deleteSlice = deleteQueue.newSlice(); - segmentInfo = new SegmentInfo(directoryOrig, Version.LATEST, segmentName, -1, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>()); + segmentInfo = new SegmentInfo(directoryOrig, Version.LATEST, segmentName, -1, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null); assert numDocsInRAM == 0; if (INFO_VERBOSE && infoStream.isEnabled("DWPT")) { infoStream.message("DWPT", Thread.currentThread().getName() + " init seg=" + segmentName + " delQueue=" + deleteQueue); diff --git a/lucene/core/src/java/org/apache/lucene/index/FilterCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/FilterCodecReader.java index c35dc6719c9..13b6e8d6d4d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FilterCodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/FilterCodecReader.java @@ -25,6 +25,7 @@ import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.TermVectorsReader; +import org.apache.lucene.search.Sort; import org.apache.lucene.util.Bits; /** @@ -101,6 +102,11 @@ public abstract class FilterCodecReader extends CodecReader { return in.maxDoc(); } + @Override + public Sort getIndexSort() { + return in.getIndexSort(); + } + @Override public void addCoreClosedListener(CoreClosedListener listener) { in.addCoreClosedListener(listener); diff --git a/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java 
b/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java index f273dba6fc9..886c12a6836 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java @@ -22,6 +22,7 @@ import java.util.Iterator; import java.util.Objects; import org.apache.lucene.search.QueryCache; +import org.apache.lucene.search.Sort; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; @@ -471,6 +472,12 @@ public abstract class FilterLeafReader extends LeafReader { return in.getDocsWithField(field); } + @Override + public Sort getIndexSort() { + ensureOpen(); + return in.getIndexSort(); + } + @Override public void checkIntegrity() throws IOException { ensureOpen(); diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index 2b45b6b38ab..2254ba4cd05 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.index; +// nocommit must add sorted indices to back compat tests import java.io.Closeable; import java.io.FileNotFoundException; @@ -32,8 +33,8 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Locale; -import java.util.Map; import java.util.Map.Entry; +import java.util.Map; import java.util.Queue; import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; @@ -49,6 +50,7 @@ import org.apache.lucene.index.FieldInfos.FieldNumbers; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.Sort; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; @@ -937,6 +939,8 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { // NOTE: this is correct even for an NRT reader because we'll pull FieldInfos even for the un-committed segments: globalFieldNumberMap = getFieldNumberMap(); + validateIndexSort(); + config.getFlushPolicy().init(config); docWriter = new DocumentsWriter(this, config, directoryOrig, directory); eventQueue = docWriter.eventQueue(); @@ -1000,6 +1004,22 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { } } + // nocommit can we improve this from just best effort? + + /** Confirms that the incoming index sort (if any) matches the existing index sort (if any). This is unfortunately just best effort, + * because it could be the old index only has flushed segments. */ + private void validateIndexSort() { + Sort indexSort = config.getIndexSort(); + if (indexSort != null) { + for(SegmentCommitInfo info : segmentInfos) { + Sort segmentIndexSort = info.info.getIndexSort(); + if (segmentIndexSort != null && indexSort.equals(segmentIndexSort) == false) { + throw new IllegalArgumentException("cannot change previous indexSort=" + segmentIndexSort + " (from segment=" + info + ") to new indexSort=" + indexSort); + } + } + } + } + // reads latest field infos for the commit // this is used on IW init and addIndexes(Dir) to create/update the global field map. // TODO: fix tests abusing this method! 
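Concretely, validateIndexSort above makes the second writer in this sketch fail fast; dir, analyzer and doc are placeholders:

    Sort byStamp = new Sort(new SortField("stamp", SortField.Type.LONG));
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(analyzer).setIndexSort(byStamp));
    w.addDocument(doc);
    w.forceMerge(1);           // the merged segment records byStamp in its SegmentInfo
    w.close();

    Sort reversed = new Sort(new SortField("stamp", SortField.Type.LONG, true));
    // throws IllegalArgumentException: cannot change previous indexSort=... to new indexSort=...
    new IndexWriter(dir, new IndexWriterConfig(analyzer).setIndexSort(reversed));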
@@ -2474,6 +2494,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { * @throws IllegalArgumentException if addIndexes would cause * the index to exceed {@link #MAX_DOCS} */ + // nocommit doesn't support index sorting? or sorts must be the same? public void addIndexes(Directory... dirs) throws IOException { ensureOpen(); @@ -2603,6 +2624,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { * @throws IllegalArgumentException * if addIndexes would cause the index to exceed {@link #MAX_DOCS} */ + // nocommit make sure if you add "sorted by X" to "sorted by Y" index, we catch it public void addIndexes(CodecReader... readers) throws IOException { ensureOpen(); @@ -2630,7 +2652,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { TrackingDirectoryWrapper trackingDir = new TrackingDirectoryWrapper(directory); SegmentInfo info = new SegmentInfo(directoryOrig, Version.LATEST, mergedName, -1, - false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>()); + false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), config.getIndexSort()); SegmentMerger merger = new SegmentMerger(Arrays.asList(readers), info, infoStream, trackingDir, globalFieldNumberMap, @@ -2715,7 +2737,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { // Same SI as before but we change directory and name SegmentInfo newInfo = new SegmentInfo(directoryOrig, info.info.getVersion(), segName, info.info.maxDoc(), info.info.getUseCompoundFile(), info.info.getCodec(), - info.info.getDiagnostics(), info.info.getId(), info.info.getAttributes()); + info.info.getDiagnostics(), info.info.getId(), info.info.getAttributes(), info.info.getIndexSort()); SegmentCommitInfo newInfoPerCommit = new SegmentCommitInfo(newInfo, info.getDelCount(), info.getDelGen(), info.getFieldInfosGen(), info.getDocValuesGen()); @@ -3243,16 +3265,13 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { private static class MergedDeletesAndUpdates { ReadersAndUpdates mergedDeletesAndUpdates = null; - MergePolicy.DocMap docMap = null; boolean initializedWritableLiveDocs = false; MergedDeletesAndUpdates() {} - final void init(ReaderPool readerPool, MergePolicy.OneMerge merge, MergeState mergeState, boolean initWritableLiveDocs) throws IOException { + final void init(ReaderPool readerPool, MergePolicy.OneMerge merge, boolean initWritableLiveDocs) throws IOException { if (mergedDeletesAndUpdates == null) { mergedDeletesAndUpdates = readerPool.get(merge.info, true); - docMap = merge.getDocMap(mergeState); - assert docMap.isConsistent(merge.info.info.maxDoc()); } if (initWritableLiveDocs && !initializedWritableLiveDocs) { mergedDeletesAndUpdates.initWritableLiveDocs(); @@ -3262,18 +3281,18 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { } - private void maybeApplyMergedDVUpdates(MergePolicy.OneMerge merge, MergeState mergeState, int docUpto, + private void maybeApplyMergedDVUpdates(MergePolicy.OneMerge merge, MergeState mergeState, MergedDeletesAndUpdates holder, String[] mergingFields, DocValuesFieldUpdates[] dvFieldUpdates, - DocValuesFieldUpdates.Iterator[] updatesIters, int curDoc) throws IOException { + DocValuesFieldUpdates.Iterator[] updatesIters, int segment, int curDoc) throws IOException { int newDoc = -1; for (int idx = 0; idx < mergingFields.length; idx++) { DocValuesFieldUpdates.Iterator updatesIter = updatesIters[idx]; if (updatesIter.doc() == 
curDoc) { // document has an update if (holder.mergedDeletesAndUpdates == null) { - holder.init(readerPool, merge, mergeState, false); + holder.init(readerPool, merge, false); } if (newDoc == -1) { // map once per all field updates, but only if there are any updates - newDoc = holder.docMap.map(docUpto); + newDoc = mergeState.docMaps[segment].get(curDoc); } DocValuesFieldUpdates dvUpdates = dvFieldUpdates[idx]; dvUpdates.add(newDoc, updatesIter.value()); @@ -3306,13 +3325,13 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { // Carefully merge deletes that occurred after we // started merging: - int docUpto = 0; long minGen = Long.MAX_VALUE; // Lazy init (only when we find a delete to carry over): final MergedDeletesAndUpdates holder = new MergedDeletesAndUpdates(); final DocValuesFieldUpdates.Container mergedDVUpdates = new DocValuesFieldUpdates.Container(); - + + assert sourceSegments.size() == mergeState.docMaps.length; for (int i = 0; i < sourceSegments.size(); i++) { SegmentCommitInfo info = sourceSegments.get(i); minGen = Math.min(info.getBufferedDeletesGen(), minGen); @@ -3375,21 +3394,20 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { // since we started the merge, so we // must merge them: for (int j = 0; j < maxDoc; j++) { - if (!prevLiveDocs.get(j)) { - assert !currentLiveDocs.get(j); - } else { - if (!currentLiveDocs.get(j)) { - if (holder.mergedDeletesAndUpdates == null || !holder.initializedWritableLiveDocs) { - holder.init(readerPool, merge, mergeState, true); - } - holder.mergedDeletesAndUpdates.delete(holder.docMap.map(docUpto)); - if (mergingFields != null) { // advance all iters beyond the deleted document - skipDeletedDoc(updatesIters, j); - } - } else if (mergingFields != null) { - maybeApplyMergedDVUpdates(merge, mergeState, docUpto, holder, mergingFields, dvFieldUpdates, updatesIters, j); + if (prevLiveDocs.get(j) == false) { + // if the document was deleted before, it better still be deleted! 
+ assert currentLiveDocs.get(j) == false; + } else if (currentLiveDocs.get(j) == false) { + // the document was deleted while we were merging: + if (holder.mergedDeletesAndUpdates == null || holder.initializedWritableLiveDocs == false) { + holder.init(readerPool, merge, true); } - docUpto++; + holder.mergedDeletesAndUpdates.delete(mergeState.docMaps[i].get(mergeState.leafDocMaps[i].get(j))); + if (mergingFields != null) { // advance all iters beyond the deleted document + skipDeletedDoc(updatesIters, j); + } + } else if (mergingFields != null) { + maybeApplyMergedDVUpdates(merge, mergeState, holder, mergingFields, dvFieldUpdates, updatesIters, i, j); } } } else if (mergingFields != null) { @@ -3397,50 +3415,38 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { for (int j = 0; j < maxDoc; j++) { if (prevLiveDocs.get(j)) { // document isn't deleted, check if any of the fields have an update to it - maybeApplyMergedDVUpdates(merge, mergeState, docUpto, holder, mergingFields, dvFieldUpdates, updatesIters, j); - // advance docUpto for every non-deleted document - docUpto++; + maybeApplyMergedDVUpdates(merge, mergeState, holder, mergingFields, dvFieldUpdates, updatesIters, i, j); } else { // advance all iters beyond the deleted document skipDeletedDoc(updatesIters, j); } } - } else { - docUpto += info.info.maxDoc() - info.getDelCount() - rld.getPendingDeleteCount(); } } else if (currentLiveDocs != null) { assert currentLiveDocs.length() == maxDoc; // This segment had no deletes before but now it // does: for (int j = 0; j < maxDoc; j++) { - if (!currentLiveDocs.get(j)) { + if (currentLiveDocs.get(j) == false) { if (holder.mergedDeletesAndUpdates == null || !holder.initializedWritableLiveDocs) { - holder.init(readerPool, merge, mergeState, true); + holder.init(readerPool, merge, true); } - holder.mergedDeletesAndUpdates.delete(holder.docMap.map(docUpto)); + holder.mergedDeletesAndUpdates.delete(mergeState.docMaps[i].get(mergeState.leafDocMaps[i].get(j))); if (mergingFields != null) { // advance all iters beyond the deleted document skipDeletedDoc(updatesIters, j); } } else if (mergingFields != null) { - maybeApplyMergedDVUpdates(merge, mergeState, docUpto, holder, mergingFields, dvFieldUpdates, updatesIters, j); + maybeApplyMergedDVUpdates(merge, mergeState, holder, mergingFields, dvFieldUpdates, updatesIters, i, j); } - docUpto++; } } else if (mergingFields != null) { // no deletions before or after, but there were updates for (int j = 0; j < maxDoc; j++) { - maybeApplyMergedDVUpdates(merge, mergeState, docUpto, holder, mergingFields, dvFieldUpdates, updatesIters, j); - // advance docUpto for every non-deleted document - docUpto++; + maybeApplyMergedDVUpdates(merge, mergeState, holder, mergingFields, dvFieldUpdates, updatesIters, i, j); } - } else { - // No deletes or updates before or after - docUpto += info.info.maxDoc(); } } - assert docUpto == merge.info.info.maxDoc(); - if (mergedDVUpdates.any()) { // System.out.println("[" + Thread.currentThread().getName() + "] IW.commitMergedDeletes: mergedDeletes.info=" + mergedDeletes.info + ", mergedFieldUpdates=" + mergedFieldUpdates); boolean success = false; @@ -3881,7 +3887,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { // ConcurrentMergePolicy we keep deterministic segment // names. 
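The new two-level mapping used above composes per-leaf sort reordering with the cross-leaf merge map; sketched for one deleted doc j in segment i:

    // leafDocMaps[i]: raw docID -> docID within the (possibly sort-wrapped) leaf reader
    // docMaps[i]:     leaf docID -> docID in the merged, globally sorted segment
    int leafDocID = mergeState.leafDocMaps[i].get(j);
    int mergedDocID = mergeState.docMaps[i].get(leafDocID);
    holder.mergedDeletesAndUpdates.delete(mergedDocID);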
final String mergeSegmentName = newSegmentName(); - SegmentInfo si = new SegmentInfo(directoryOrig, Version.LATEST, mergeSegmentName, -1, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>()); + SegmentInfo si = new SegmentInfo(directoryOrig, Version.LATEST, mergeSegmentName, -1, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), config.getIndexSort()); Map<String,String> details = new HashMap<>(); details.put("mergeMaxNumSegments", "" + merge.maxNumSegments); details.put("mergeFactor", Integer.toString(merge.segments.size())); @@ -4082,10 +4088,13 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { } // System.out.println("[" + Thread.currentThread().getName() + "] IW.mergeMiddle: merging " + merge.getMergeReaders()); - - // we pass merge.getMergeReaders() instead of merge.readers to allow the - // OneMerge to return a view over the actual segments to merge - final SegmentMerger merger = new SegmentMerger(merge.getMergeReaders(), + + // Let the merge wrap readers + List<CodecReader> mergeReaders = new ArrayList<>(); + for (SegmentReader reader : merge.readers) { + mergeReaders.add(merge.wrapForMerge(reader)); + } + final SegmentMerger merger = new SegmentMerger(mergeReaders, merge.info.info, infoStream, dirWrapper, globalFieldNumberMap, context); diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java index a90d625e305..a6b559900df 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java @@ -18,16 +18,19 @@ package org.apache.lucene.index; import java.io.PrintStream; +import java.util.EnumSet; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.index.DocumentsWriterPerThread.IndexingChain; import org.apache.lucene.index.IndexWriter.IndexReaderWarmer; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.util.InfoStream; import org.apache.lucene.util.PrintStreamInfoStream; -import org.apache.lucene.util.SetOnce; import org.apache.lucene.util.SetOnce.AlreadySetException; +import org.apache.lucene.util.SetOnce; /** * Holds all the configuration that is used to create an {@link IndexWriter}. @@ -439,6 +442,27 @@ public final class IndexWriterConfig extends LiveIndexWriterConfig { return this; } + /** We only allow sorting on these types */ + private static final EnumSet<SortField.Type> ALLOWED_INDEX_SORT_TYPES = EnumSet.of(SortField.Type.STRING, + SortField.Type.INT, + SortField.Type.FLOAT, + SortField.Type.LONG, + SortField.Type.DOUBLE, + SortField.Type.BYTES); + + /** + * Set the {@link Sort} order to use when merging segments. Note that newly flushed segments will remain unsorted.
+ */ + public IndexWriterConfig setIndexSort(Sort sort) { + for(SortField sortField : sort.getSort()) { + if (ALLOWED_INDEX_SORT_TYPES.contains(sortField.getType()) == false) { + throw new IllegalArgumentException("invalid SortField type: must be one of " + ALLOWED_INDEX_SORT_TYPES + " but got: " + sortField); + } + } + this.indexSort = sort; + return this; + } + @Override public String toString() { StringBuilder sb = new StringBuilder(super.toString()); diff --git a/lucene/core/src/java/org/apache/lucene/index/LeafReader.java b/lucene/core/src/java/org/apache/lucene/index/LeafReader.java index 9622d4e2f85..44e61e2787f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/LeafReader.java @@ -20,6 +20,7 @@ package org.apache.lucene.index; import java.io.IOException; import org.apache.lucene.index.IndexReader.ReaderClosedListener; +import org.apache.lucene.search.Sort; import org.apache.lucene.util.Bits; /** {@code LeafReader} is an abstract class, providing an interface for accessing an @@ -312,4 +313,7 @@ public abstract class LeafReader extends IndexReader { * @lucene.internal */ public abstract void checkIntegrity() throws IOException; + + /** Returns null if this leaf is unsorted, or the {@link Sort} that it was sorted by */ + public abstract Sort getIndexSort(); } diff --git a/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java index 1a0002c73f7..cec70c099aa 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java +++ b/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java @@ -23,6 +23,7 @@ import org.apache.lucene.index.DocumentsWriterPerThread.IndexingChain; import org.apache.lucene.index.IndexWriter.IndexReaderWarmer; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Sort; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.util.InfoStream; @@ -94,6 +95,9 @@ public class LiveIndexWriterConfig { /** True if calls to {@link IndexWriter#close()} should first do a commit. */ protected boolean commitOnClose = IndexWriterConfig.DEFAULT_COMMIT_ON_CLOSE; + /** The sort order to use to write merged segments. */ + protected Sort indexSort = null; + // used by IndexWriterConfig LiveIndexWriterConfig(Analyzer analyzer) { this.analyzer = analyzer; @@ -445,6 +449,14 @@ public class LiveIndexWriterConfig { return commitOnClose; } + /** + * Get the index-time {@link Sort} order, or null if none was set. Merged segments will be written + * in this order.
+ */ + public Sort getIndexSort() { + return indexSort; + } + @Override public String toString() { StringBuilder sb = new StringBuilder(); @@ -467,6 +479,7 @@ public class LiveIndexWriterConfig { sb.append("perThreadHardLimitMB=").append(getRAMPerThreadHardLimitMB()).append("\n"); sb.append("useCompoundFile=").append(getUseCompoundFile()).append("\n"); sb.append("commitOnClose=").append(getCommitOnClose()).append("\n"); + sb.append("indexSort=").append(getIndexSort()).append("\n"); return sb.toString(); } } diff --git a/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java b/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java index a06c34f3c80..c4333bc049d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java @@ -18,8 +18,11 @@ package org.apache.lucene.index; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import org.apache.lucene.index.MultiPostingsEnum.EnumWithSlice; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; /** @@ -30,52 +33,63 @@ import org.apache.lucene.util.BytesRef; */ final class MappingMultiPostingsEnum extends PostingsEnum { - private MultiPostingsEnum.EnumWithSlice[] subs; - int numSubs; - int upto; - MergeState.DocMap currentMap; - PostingsEnum current; - int currentBase; - int doc = -1; - private MergeState mergeState; MultiPostingsEnum multiDocsAndPositionsEnum; final String field; + final DocIDMerger docIDMerger; + private MappingPostingsSub current; + private final MappingPostingsSub[] allSubs; + private final List subs = new ArrayList<>(); + + private static class MappingPostingsSub extends DocIDMerger.Sub { + public PostingsEnum postings; + + public MappingPostingsSub(MergeState.DocMap docMap, Bits liveDocs) { + super(docMap, liveDocs); + } + + @Override + public int nextDoc() { + try { + return postings.nextDoc(); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + } + } /** Sole constructor. 
*/ - public MappingMultiPostingsEnum(String field, MergeState mergeState) { + public MappingMultiPostingsEnum(String field, MergeState mergeState) throws IOException { this.field = field; - this.mergeState = mergeState; + allSubs = new MappingPostingsSub[mergeState.fieldsProducers.length]; + for(int i=0;i(subs, allSubs.length, mergeState.segmentInfo.getIndexSort() != null); } - MappingMultiPostingsEnum reset(MultiPostingsEnum postingsEnum) { - this.numSubs = postingsEnum.getNumSubs(); - this.subs = postingsEnum.getSubs(); - upto = -1; - doc = -1; - current = null; + MappingMultiPostingsEnum reset(MultiPostingsEnum postingsEnum) throws IOException { this.multiDocsAndPositionsEnum = postingsEnum; + MultiPostingsEnum.EnumWithSlice[] subsArray = postingsEnum.getSubs(); + int count = postingsEnum.getNumSubs(); + subs.clear(); + for(int i=0;i IndexWriter.MAX_POSITION=" + IndexWriter.MAX_POSITION + "), field=\"" + field + "\" doc=" + doc, - mergeState.fieldsProducers[upto].toString()); + throw new CorruptIndexException("position=" + pos + " is too large (> IndexWriter.MAX_POSITION=" + IndexWriter.MAX_POSITION + "), field=\"" + field + "\" doc=" + current.mappedDocID, + current.postings.toString()); } return pos; } @Override public int startOffset() throws IOException { - return current.startOffset(); + return current.postings.startOffset(); } @Override public int endOffset() throws IOException { - return current.endOffset(); + return current.postings.endOffset(); } @Override public BytesRef getPayload() throws IOException { - return current.getPayload(); + return current.postings.getPayload(); } @Override public long cost() { long cost = 0; - for (EnumWithSlice enumWithSlice : subs) { - cost += enumWithSlice.postingsEnum.cost(); + for (MappingPostingsSub sub : subs) { + cost += sub.postings.cost(); } return cost; } diff --git a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java index 1d67c4a0abc..c42b052d288 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java @@ -58,31 +58,6 @@ import org.apache.lucene.util.FixedBitSet; */ public abstract class MergePolicy { - /** A map of doc IDs. */ - public static abstract class DocMap { - /** Sole constructor, typically invoked from sub-classes constructors. */ - protected DocMap() {} - - /** Return the new doc ID according to its old value. */ - public abstract int map(int old); - - /** Useful from an assert. */ - boolean isConsistent(int maxDoc) { - final FixedBitSet targets = new FixedBitSet(maxDoc); - for (int i = 0; i < maxDoc; ++i) { - final int target = map(i); - if (target < 0 || target >= maxDoc) { - assert false : "out of range: " + target + " not in [0-" + maxDoc + "["; - return false; - } else if (targets.get(target)) { - assert false : target + " is already taken (" + i + ")"; - return false; - } - } - return true; - } - } - /** OneMerge provides the information necessary to perform * an individual primitive merge operation, resulting in * a single new segment. The merge spec includes the @@ -140,25 +115,11 @@ public abstract class MergePolicy { public void mergeFinished() throws IOException { } - /** Expert: Get the list of readers to merge. Note that this list does not - * necessarily match the list of segments to merge and should only be used - * to feed SegmentMerger to initialize a merge. 
When a {@link OneMerge} - * reorders doc IDs, it must override {@link #getDocMap} too so that - * deletes that happened during the merge can be applied to the newly - * merged segment. */ - public List getMergeReaders() throws IOException { - if (readers == null) { - throw new IllegalStateException("IndexWriter has not initialized readers from the segment infos yet"); - } - final List readers = new ArrayList<>(this.readers.size()); - for (SegmentReader reader : this.readers) { - if (reader.numDocs() > 0) { - readers.add(reader); - } - } - return Collections.unmodifiableList(readers); + /** Wrap the reader in order to add/remove information to the merged segment. */ + public CodecReader wrapForMerge(CodecReader reader) throws IOException { + return reader; } - + /** * Expert: Sets the {@link SegmentCommitInfo} of the merged segment. * Allows sub-classes to e.g. set diagnostics properties. @@ -175,20 +136,6 @@ public abstract class MergePolicy { return info; } - /** Expert: If {@link #getMergeReaders()} reorders document IDs, this method - * must be overridden to return a mapping from the natural doc ID - * (the doc ID that would result from a natural merge) to the actual doc - * ID. This mapping is used to apply deletions that happened during the - * merge to the new segment. */ - public DocMap getDocMap(MergeState mergeState) { - return new DocMap() { - @Override - public int map(int docID) { - return docID; - } - }; - } - /** Record that an exception occurred while executing * this merge */ synchronized void setException(Throwable error) { diff --git a/lucene/misc/src/java/org/apache/lucene/index/MergeReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java similarity index 96% rename from lucene/misc/src/java/org/apache/lucene/index/MergeReaderWrapper.java rename to lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java index dba5c913f00..be3513a698e 100644 --- a/lucene/misc/src/java/org/apache/lucene/index/MergeReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java @@ -1,3 +1,5 @@ +package org.apache.lucene.index; + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -14,7 +16,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.index; import java.io.IOException; @@ -23,9 +24,10 @@ import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.TermVectorsReader; +import org.apache.lucene.search.Sort; import org.apache.lucene.util.Bits; -/** this is a hack to make SortingMP fast! */ +/** This is a hack to make index sorting fast, with a {@link LeafReader} that always returns merge instances when you ask for the codec readers. 
*/ class MergeReaderWrapper extends LeafReader { final SegmentReader in; final FieldsProducer fields; @@ -256,4 +258,9 @@ class MergeReaderWrapper extends LeafReader { public String toString() { return "MergeReaderWrapper(" + in + ")"; } + + @Override + public Sort getIndexSort() { + return in.getIndexSort(); + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java index 7242785e101..32e048086a7 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java @@ -18,6 +18,8 @@ package org.apache.lucene.index; import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; import java.util.List; import org.apache.lucene.codecs.DocValuesProducer; @@ -26,6 +28,7 @@ import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.TermVectorsReader; +import org.apache.lucene.search.Sort; import org.apache.lucene.util.Bits; import org.apache.lucene.util.InfoStream; import org.apache.lucene.util.packed.PackedInts; @@ -36,6 +39,13 @@ import org.apache.lucene.util.packed.PackedLongValues; * @lucene.experimental */ public class MergeState { + /** Maps document IDs from old segments to document IDs in the new segment */ + // nocommit in the unsorted case, this should map correctly, e.g. apply per segment docBase + public final DocMap[] docMaps; + + // nocommit can we somehow not need to expose this? should IW's reader pool always sort on load...? + public final DocMap[] leafDocMaps; + /** {@link SegmentInfo} of the newly merged segment. */ public final SegmentInfo segmentInfo; @@ -60,18 +70,12 @@ public class MergeState { /** Live docs for each reader */ public final Bits[] liveDocs; - /** Maps docIDs around deletions. */ - public final DocMap[] docMaps; - /** Postings to merge */ public final FieldsProducer[] fieldsProducers; /** Point readers to merge */ public final PointsReader[] pointsReaders; - /** New docID base per reader. */ - public final int[] docBase; - /** Max docs per reader */ public final int[] maxDocs; @@ -79,11 +83,13 @@ public class MergeState { public final InfoStream infoStream; /** Sole constructor. */ - MergeState(List readers, SegmentInfo segmentInfo, InfoStream infoStream) throws IOException { + MergeState(List originalReaders, SegmentInfo segmentInfo, InfoStream infoStream) throws IOException { + + final Sort indexSort = segmentInfo.getIndexSort(); + int numReaders = originalReaders.size(); + leafDocMaps = new DocMap[numReaders]; + List readers = maybeSortReaders(originalReaders, segmentInfo); - int numReaders = readers.size(); - docMaps = new DocMap[numReaders]; - docBase = new int[numReaders]; maxDocs = new int[numReaders]; fieldsProducers = new FieldsProducer[numReaders]; normsProducers = new NormsProducer[numReaders]; @@ -94,6 +100,7 @@ public class MergeState { fieldInfos = new FieldInfos[numReaders]; liveDocs = new Bits[numReaders]; + int numDocs = 0; for(int i=0;i readers) throws IOException { - final int numReaders = maxDocs.length; + private DocMap[] buildDocMaps(List readers, Sort indexSort) throws IOException { - // Remap docIDs - int docBase = 0; - for(int i=0;i 0; - } - - /** Creates a {@link DocMap} instance appropriate for - * this reader. 
*/ - public static DocMap build(CodecReader reader) { - final int maxDoc = reader.maxDoc(); - if (!reader.hasDeletions()) { - return new NoDelDocMap(maxDoc); - } - final Bits liveDocs = reader.getLiveDocs(); - return build(maxDoc, liveDocs); - } - - static DocMap build(final int maxDoc, final Bits liveDocs) { - assert liveDocs != null; - final PackedLongValues.Builder docMapBuilder = PackedLongValues.monotonicBuilder(PackedInts.COMPACT); - int del = 0; - for (int i = 0; i < maxDoc; ++i) { - docMapBuilder.add(i - del); - if (!liveDocs.get(i)) { - ++del; + final PackedLongValues delDocMap; + if (liveDocs != null) { + delDocMap = removeDeletes(reader.maxDoc(), liveDocs); + } else { + delDocMap = null; } - } - final PackedLongValues docMap = docMapBuilder.build(); - final int numDeletedDocs = del; - assert docMap.size() == maxDoc; - return new DocMap() { - @Override - public int get(int docID) { - if (!liveDocs.get(docID)) { - return -1; + final int docBase = totalDocs; + docMaps[i] = new DocMap() { + @Override + public int get(int docID) { + if (liveDocs == null) { + return docBase + docID; + } else if (liveDocs.get(docID)) { + return docBase + (int) delDocMap.get(docID); + } else { + return -1; + } } - return (int) docMap.get(docID); - } + }; + totalDocs += reader.numDocs(); + } - @Override - public int maxDoc() { - return maxDoc; - } + return docMaps; - @Override - public int numDeletedDocs() { - return numDeletedDocs; - } - }; + } else { + // do a merge sort of the incoming leaves: + return MultiSorter.sort(indexSort, readers); } } - private static final class NoDelDocMap extends DocMap { + private List maybeSortReaders(List originalReaders, SegmentInfo segmentInfo) throws IOException { - private final int maxDoc; - - NoDelDocMap(int maxDoc) { - this.maxDoc = maxDoc; + // Default to identity: + for(int i=0;i readers = new ArrayList<>(originalReaders.size()); + + //System.out.println("MergeState.maybeSortReaders indexSort=" + indexSort); + + for (CodecReader leaf : originalReaders) { + if (leaf instanceof SegmentReader) { + SegmentReader segmentReader = (SegmentReader) leaf; + Sort segmentSort = segmentReader.getSegmentInfo().info.getIndexSort(); + //System.out.println(" leaf=" + leaf + " sort=" + segmentSort); + + if (segmentSort == null) { + // TODO: fix IW to also sort when flushing? It's somewhat tricky because of stored fields and term vectors, which write "live" + // to the files on each indexed document: + + // This segment was written by flush, so documents are not yet sorted, so we sort them now: + Sorter.DocMap sortDocMap = sorter.sort(leaf); + if (sortDocMap != null) { + //System.out.println(" sort!"); + // nocommit what about MergedReaderWrapper in here? + leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(leaf, sortDocMap)); + leafDocMaps[readers.size()] = new DocMap() { + @Override + public int get(int docID) { + return sortDocMap.oldToNew(docID); + } + }; + } + + } else if (segmentSort.equals(indexSort) == false) { + throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort + " but to-be-merged segment has sort=" + segmentSort); + } + } else { + throw new IllegalArgumentException("cannot sort index with foreign readers; leaf=" + leaf); + } + + readers.add(leaf); } - @Override - public int numDeletedDocs() { - return 0; + return readers; + } + + /** A map of doc IDs. */ + public static abstract class DocMap { + /** Return the mapped docID or -1 if the given doc is not mapped. 
*/ + public abstract int get(int docID); + } + + static PackedLongValues removeDeletes(final int maxDoc, final Bits liveDocs) { + final PackedLongValues.Builder docMapBuilder = PackedLongValues.monotonicBuilder(PackedInts.COMPACT); + int del = 0; + for (int i = 0; i < maxDoc; ++i) { + docMapBuilder.add(i - del); + if (liveDocs.get(i) == false) { + ++del; + } } + return docMapBuilder.build(); } } diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiFields.java b/lucene/core/src/java/org/apache/lucene/index/MultiFields.java index 1736bace115..447e0aee4cb 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiFields.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiFields.java @@ -51,6 +51,8 @@ public final class MultiFields extends Fields { private final ReaderSlice[] subSlices; private final Map terms = new ConcurrentHashMap<>(); + // nocommit should we somehow throw exc if you try to pass in "sorted" Fields? + /** Returns a single {@link Fields} instance for this * reader, merging fields/terms/docs/positions on the * fly. This method will return null if the reader diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java b/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java index 54563254162..573bbe8c6db 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java @@ -57,7 +57,9 @@ public final class MultiPostingsEnum extends PostingsEnum { return this.parent == parent; } - /** Rre-use and reset this instance on the provided slices. */ + // nocommit is this class supposed to be aware of index sorting too??? + + /** Re-use and reset this instance on the provided slices. */ public MultiPostingsEnum reset(final EnumWithSlice[] subs, final int numSubs) { this.numSubs = numSubs; for(int i=0;i readers) throws IOException { + + SortField fields[] = sort.getSort(); + final CrossReaderComparator[] comparators = new CrossReaderComparator[fields.length]; + for(int i=0;i queue = new PriorityQueue(leafCount) { + @Override + public boolean lessThan(LeafAndDocID a, LeafAndDocID b) { + for(int i=0;i readers, SortField sortField) throws IOException { + switch(sortField.getType()) { + // TODO: use global ords for string sort + case INT: + { + List values = new ArrayList<>(); + List docsWithFields = new ArrayList<>(); + for(CodecReader reader : readers) { + values.add(DocValues.getNumeric(reader, sortField.getField())); + docsWithFields.add(DocValues.getDocsWithField(reader, sortField.getField())); + } + + final int reverseMul; + if (sortField.getReverse()) { + reverseMul = -1; + } else { + reverseMul = 1; + } + + final int missingValue; + + if (sortField.getMissingValue() != null) { + missingValue = (Integer) sortField.getMissingValue(); + } else { + missingValue = 0; + } + + return new CrossReaderComparator() { + @Override + public int compare(int readerIndexA, int docIDA, int readerIndexB, int docIDB) { + int valueA; + if (docsWithFields.get(readerIndexA).get(docIDA)) { + valueA = (int) values.get(readerIndexA).get(docIDA); + } else { + valueA = missingValue; + } + + int valueB; + if (docsWithFields.get(readerIndexB).get(docIDB)) { + valueB = (int) values.get(readerIndexB).get(docIDB); + } else { + valueB = missingValue; + } + return reverseMul * Integer.compare(valueA, valueB); + } + }; + } + case LONG: + // nocommit refactor/share at least numerics here: + { + List values = new ArrayList<>(); + List docsWithFields = new ArrayList<>(); + 
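// (Clarifying comment, added for readability: this LONG branch mirrors the
// INT case above. It collects one NumericDocValues and one docs-with-field
// Bits per incoming reader, compares (readerIndex, docID) pairs through
// them, substitutes the configured missing value when a document has no
// value, and multiplies by reverseMul so that reverse sorts flip the
// comparison.)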
for(CodecReader reader : readers) { + values.add(DocValues.getNumeric(reader, sortField.getField())); + docsWithFields.add(DocValues.getDocsWithField(reader, sortField.getField())); + } + + final int reverseMul; + if (sortField.getReverse()) { + reverseMul = -1; + } else { + reverseMul = 1; + } + + final long missingValue; + + if (sortField.getMissingValue() != null) { + missingValue = (Long) sortField.getMissingValue(); + } else { + missingValue = 0; + } + + return new CrossReaderComparator() { + @Override + public int compare(int readerIndexA, int docIDA, int readerIndexB, int docIDB) { + long valueA; + if (docsWithFields.get(readerIndexA).get(docIDA)) { + valueA = values.get(readerIndexA).get(docIDA); + } else { + valueA = missingValue; + } + + long valueB; + if (docsWithFields.get(readerIndexB).get(docIDB)) { + valueB = values.get(readerIndexB).get(docIDB); + } else { + valueB = missingValue; + } + return reverseMul * Long.compare(valueA, valueB); + } + }; + } + // nocommit do the rest: + default: + throw new IllegalArgumentException("unhandled SortField.getType()=" + sortField.getType()); + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java index 532265f59c7..ef9f28cf666 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java @@ -26,6 +26,7 @@ import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; +import org.apache.lucene.search.Sort; import org.apache.lucene.util.Bits; /** An {@link LeafReader} which reads multiple, parallel indexes. Each index @@ -55,6 +56,7 @@ public class ParallelLeafReader extends LeafReader { private final boolean closeSubReaders; private final int maxDoc, numDocs; private final boolean hasDeletions; + private final Sort indexSort; private final SortedMap<String,LeafReader> fieldToReader = new TreeMap<>(); private final SortedMap<String,LeafReader> tvFieldToReader = new TreeMap<>(); @@ -100,8 +102,17 @@ public class ParallelLeafReader extends LeafReader { // TODO: make this read-only in a cleaner way?
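// (Clarifying comment: besides building the merged FieldInfos, the loop
// below records the index sort of the first parallel reader and throws
// IllegalArgumentException if any later reader declares a different sort,
// since parallel readers must line up document-for-document.)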
FieldInfos.Builder builder = new FieldInfos.Builder(); + + Sort indexSort = null; + // build FieldInfos and fieldToReader map: for (final LeafReader reader : this.parallelReaders) { + if (indexSort == null) { + indexSort = reader.getIndexSort(); + } else if (indexSort.equals(reader.getIndexSort()) == false) { + throw new IllegalArgumentException("cannot combine LeafReaders that have different index sorts: saw both sort=" + indexSort + " and " + reader.getIndexSort()); + } + final FieldInfos readerFieldInfos = reader.getFieldInfos(); for (FieldInfo fieldInfo : readerFieldInfos) { // NOTE: first reader having a given field "wins": @@ -115,6 +126,7 @@ public class ParallelLeafReader extends LeafReader { } } fieldInfos = builder.finish(); + this.indexSort = indexSort; // build Fields instance for (final LeafReader reader : this.parallelReaders) { @@ -423,4 +435,10 @@ public class ParallelLeafReader extends LeafReader { ensureOpen(); return parallelReaders; } + + @Override + public Sort getIndexSort() { + return indexSort; + } + } diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java b/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java index bed84589576..5830201e087 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java @@ -28,6 +28,7 @@ import java.util.Set; import java.util.regex.Matcher; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.search.Sort; import org.apache.lucene.store.Directory; import org.apache.lucene.store.TrackingDirectoryWrapper; import org.apache.lucene.util.StringHelper; @@ -69,6 +70,8 @@ public final class SegmentInfo { private final Map attributes; + private final Sort indexSort; + // Tracks the Lucene version this segment was created with, since 3.1. Null // indicates an older than 3.0 index, and it's used to detect a too old index. // The format expected is "x.y" - "2.x" for pre-3.0 indexes (or null), and @@ -93,7 +96,7 @@ public final class SegmentInfo { */ public SegmentInfo(Directory dir, Version version, String name, int maxDoc, boolean isCompoundFile, Codec codec, Map diagnostics, - byte[] id, Map attributes) { + byte[] id, Map attributes, Sort indexSort) { assert !(dir instanceof TrackingDirectoryWrapper); this.dir = Objects.requireNonNull(dir); this.version = Objects.requireNonNull(version); @@ -107,6 +110,7 @@ public final class SegmentInfo { throw new IllegalArgumentException("invalid id: " + Arrays.toString(id)); } this.attributes = Objects.requireNonNull(attributes); + this.indexSort = indexSort; } /** @@ -194,13 +198,13 @@ public final class SegmentInfo { s.append('/').append(delCount); } - final String sorter_key = "sorter"; // SortingMergePolicy.SORTER_ID_PROP; // TODO: use this once we can import SortingMergePolicy (currently located in 'misc' instead of 'core') - final String sorter_val = diagnostics.get(sorter_key); - if (sorter_val != null) { - s.append(":["); - s.append(sorter_key); - s.append('='); - s.append(sorter_val); + // nocommit does search time "do the right thing" automatically when segment is sorted? + + // nocommit remove sorter_key from diagnostics + + if (indexSort != null) { + s.append(":[indexSort="); + s.append(indexSort); s.append(']'); } @@ -311,5 +315,10 @@ public final class SegmentInfo { public Map getAttributes() { return attributes; } + + /** Return the sort order of this segment, or null if the index has no sort. 
*/ + public Sort getIndexSort() { + return indexSort; + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java index 8ed93e376c3..e68f8186272 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java @@ -28,6 +28,7 @@ import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.TermVectorsReader; +import org.apache.lucene.search.Sort; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.util.Bits; @@ -303,4 +304,9 @@ public final class SegmentReader extends CodecReader { ensureOpen(); core.removeCoreClosedListener(listener); } + + @Override + public Sort getIndexSort() { + return si.info.getIndexSort(); + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java index 3a7370138a6..2742247381a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java @@ -26,6 +26,7 @@ import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.TermVectorsReader; +import org.apache.lucene.search.Sort; import org.apache.lucene.util.Bits; /** @@ -125,6 +126,16 @@ public final class SlowCodecReaderWrapper { public void removeCoreClosedListener(CoreClosedListener listener) { reader.removeCoreClosedListener(listener); } + + @Override + public String toString() { + return "SlowCodecReaderWrapper(" + reader + ")"; + } + + @Override + public Sort getIndexSort() { + return reader.getIndexSort(); + } }; } } diff --git a/lucene/misc/src/java/org/apache/lucene/index/Sorter.java b/lucene/core/src/java/org/apache/lucene/index/Sorter.java similarity index 99% rename from lucene/misc/src/java/org/apache/lucene/index/Sorter.java rename to lucene/core/src/java/org/apache/lucene/index/Sorter.java index 7e4e475a248..0ce7d64e0d1 100644 --- a/lucene/misc/src/java/org/apache/lucene/index/Sorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/Sorter.java @@ -33,6 +33,7 @@ import org.apache.lucene.util.packed.PackedLongValues; * IDs. * @lucene.experimental */ +// nocommit rename to IndexSorter? final class Sorter { final Sort sort; @@ -168,6 +169,7 @@ final class Sorter { } final PackedLongValues newToOld = newToOldBuilder.build(); + // invert the docs mapping: for (int i = 0; i < maxDoc; ++i) { docs[(int) newToOld.get(i)] = i; } // docs is now the oldToNew mapping @@ -196,7 +198,7 @@ final class Sorter { } }; } - + /** * Returns a mapping from the old document ID to its new location in the * sorted index. 
Implementations can use the auxiliary diff --git a/lucene/misc/src/java/org/apache/lucene/index/SortingLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java similarity index 96% rename from lucene/misc/src/java/org/apache/lucene/index/SortingLeafReader.java rename to lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java index 683c5c263bb..45d44828978 100644 --- a/lucene/misc/src/java/org/apache/lucene/index/SortingLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java @@ -1,3 +1,5 @@ +package org.apache.lucene.index; + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -14,7 +16,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.index; import java.io.IOException; import java.util.Arrays; @@ -35,21 +36,13 @@ import org.apache.lucene.util.automaton.CompiledAutomaton; /** * An {@link org.apache.lucene.index.LeafReader} which supports sorting documents by a given - * {@link Sort}. You can use this class to sort an index as follows: - * - *
      - * IndexWriter writer; // writer to which the sorted index will be added
      - * DirectoryReader reader; // reader on the input index
      - * Sort sort; // determines how the documents are sorted
      - * LeafReader sortingReader = SortingLeafReader.wrap(SlowCompositeReaderWrapper.wrap(reader), sort);
      - * writer.addIndexes(reader);
      - * writer.close();
      - * reader.close();
      - * 
      + * {@link Sort}. This is package private and is only used by Lucene when it needs to merge + * a newly flushed (unsorted) segment. * * @lucene.experimental */ -public class SortingLeafReader extends FilterLeafReader { + +class SortingLeafReader extends FilterLeafReader { private static class SortingFields extends FilterFields { @@ -111,25 +104,6 @@ public class SortingLeafReader extends FilterLeafReader { this.hasPositions = hasPositions; } - Bits newToOld(final Bits liveDocs) { - if (liveDocs == null) { - return null; - } - return new Bits() { - - @Override - public boolean get(int index) { - return liveDocs.get(docMap.oldToNew(index)); - } - - @Override - public int length() { - return liveDocs.length(); - } - - }; - } - @Override public PostingsEnum postings( PostingsEnum reuse, final int flags) throws IOException { @@ -363,11 +337,15 @@ public class SortingLeafReader extends FilterLeafReader { @Override public long nextOrd() { - return in.nextOrd(); + // nocommit + long v = in.nextOrd(); + //System.out.println(" slr.sssdv.nextOrd return " + v + " this=" + this); + return v; } @Override public void setDocument(int docID) { + //System.out.println(" slr.sssdv.setDocument docID=" + docID + " this=" + this); in.setDocument(docMap.newToOld(docID)); } @@ -865,7 +843,7 @@ public class SortingLeafReader extends FilterLeafReader { if (inPointValues == null) { return null; } else { - // TODO: this is untested! + // nocommit make sure this is tested return new SortingPointValues(inPointValues, docMap); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/Sort.java b/lucene/core/src/java/org/apache/lucene/search/Sort.java index 7493e9b6a89..77585a2a674 100644 --- a/lucene/core/src/java/org/apache/lucene/search/Sort.java +++ b/lucene/core/src/java/org/apache/lucene/search/Sort.java @@ -147,6 +147,9 @@ public class Sort { * etc. Finally, if there is still a tie after all SortFields * are checked, the internal Lucene docid is used to break it. */ public void setSort(SortField... 
fields) { + if (fields.length == 0) { + throw new IllegalArgumentException("There must be at least 1 sort field"); + } this.fields = fields; } diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java index 288ece4c51d..09eef266b6d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java @@ -299,9 +299,6 @@ public class BKDWriter implements Closeable { final BKDReader.IntersectState state; final MergeState.DocMap docMap; - /** Base offset for all our docIDs */ - final int docIDBase; - /** Current doc ID */ public int docID; @@ -314,7 +311,7 @@ public class BKDWriter implements Closeable { /** Which leaf block we are up to */ private int blockID; - public MergeReader(BKDReader bkd, MergeState.DocMap docMap, int docIDBase) throws IOException { + public MergeReader(BKDReader bkd, MergeState.DocMap docMap) throws IOException { this.bkd = bkd; state = new BKDReader.IntersectState(bkd.in.clone(), bkd.numDims, @@ -322,7 +319,6 @@ public class BKDWriter implements Closeable { bkd.maxPointsInLeafNode, null); this.docMap = docMap; - this.docIDBase = docIDBase; long minFP = Long.MAX_VALUE; //System.out.println("MR.init " + this + " bkdreader=" + bkd + " leafBlockFPs.length=" + bkd.leafBlockFPs.length); for(long fp : bkd.leafBlockFPs) { @@ -396,14 +392,14 @@ public class BKDWriter implements Closeable { } // Tie break by sorting smaller docIDs earlier: - return a.docIDBase < b.docIDBase; + return a.docID < b.docID; } } /** More efficient bulk-add for incoming {@link BKDReader}s. This does a merge sort of the already * sorted values and currently only works when numDims==1. This returns -1 if all documents containing * dimensional values were deleted. */ - public long merge(IndexOutput out, List docMaps, List readers, List docIDBases) throws IOException { + public long merge(IndexOutput out, List docMaps, List readers) throws IOException { if (numDims != 1) { throw new UnsupportedOperationException("numDims must be 1 but got " + numDims); } @@ -411,8 +407,6 @@ public class BKDWriter implements Closeable { throw new IllegalStateException("cannot mix add and merge"); } - //System.out.println("BKDW.merge segs=" + readers.size()); - // Catch user silliness: if (heapPointWriter == null && tempInput == null) { throw new IllegalStateException("already finished"); @@ -433,7 +427,7 @@ public class BKDWriter implements Closeable { } else { docMap = docMaps.get(i); } - MergeReader reader = new MergeReader(bkd, docMap, docIDBases.get(i)); + MergeReader reader = new MergeReader(bkd, docMap); if (reader.next()) { queue.add(reader); } @@ -468,7 +462,7 @@ public class BKDWriter implements Closeable { // System.out.println("iter reader=" + reader); // NOTE: doesn't work with subclasses (e.g. SimpleText!) 
- int docID = reader.docIDBase + reader.docID; + int docID = reader.docID; leafBlockDocIDs[leafCount] = docID; System.arraycopy(reader.state.scratchPackedValue, 0, leafBlockPackedValues[leafCount], 0, packedBytesLength); docsSeen.set(docID); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java index 73b4622b82c..bd1e9b6e14f 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java @@ -222,7 +222,7 @@ public class TestCodecs extends LuceneTestCase { final FieldInfos fieldInfos = builder.finish(); final Directory dir = newDirectory(); Codec codec = Codec.getDefault(); - final SegmentInfo si = new SegmentInfo(dir, Version.LATEST, SEGMENT, 10000, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>()); + final SegmentInfo si = new SegmentInfo(dir, Version.LATEST, SEGMENT, 10000, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null); this.write(si, fieldInfos, dir, fields); final FieldsProducer reader = codec.postingsFormat().fieldsProducer(new SegmentReadState(dir, si, fieldInfos, newIOContext(random()))); @@ -279,7 +279,7 @@ public class TestCodecs extends LuceneTestCase { } Codec codec = Codec.getDefault(); - final SegmentInfo si = new SegmentInfo(dir, Version.LATEST, SEGMENT, 10000, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>()); + final SegmentInfo si = new SegmentInfo(dir, Version.LATEST, SEGMENT, 10000, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null); this.write(si, fieldInfos, dir, fields); if (VERBOSE) { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDemoParallelLeafReader.java b/lucene/core/src/test/org/apache/lucene/index/TestDemoParallelLeafReader.java index 9f3339c8c69..0dc654cb212 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDemoParallelLeafReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDemoParallelLeafReader.java @@ -503,7 +503,7 @@ public class TestDemoParallelLeafReader extends LuceneTestCase { class ReindexingOneMerge extends OneMerge { - List parallelReaders; + final List parallelReaders = new ArrayList<>(); final long schemaGen; ReindexingOneMerge(List segments) { @@ -519,33 +519,23 @@ public class TestDemoParallelLeafReader extends LuceneTestCase { } @Override - public List getMergeReaders() throws IOException { - if (parallelReaders == null) { - parallelReaders = new ArrayList<>(); - for (CodecReader reader : super.getMergeReaders()) { - parallelReaders.add(getCurrentReader((SegmentReader)reader, schemaGen)); - } + public CodecReader wrapForMerge(CodecReader reader) throws IOException { + LeafReader wrapped = getCurrentReader((SegmentReader)reader, schemaGen); + if (wrapped instanceof ParallelLeafReader) { + parallelReaders.add((ParallelLeafReader) wrapped); } - - // TODO: fix ParallelLeafReader, if this is a good use case - List mergeReaders = new ArrayList<>(); - for (LeafReader reader : parallelReaders) { - mergeReaders.add(SlowCodecReaderWrapper.wrap(reader)); - } - return mergeReaders; + return SlowCodecReaderWrapper.wrap(wrapped); } @Override public void mergeFinished() throws IOException { Throwable th = null; - for(LeafReader r : parallelReaders) { - if (r instanceof ParallelLeafReader) { - try { - r.decRef(); - } catch (Throwable t) { - if (th == null) { - th = t; - } + for (ParallelLeafReader r : parallelReaders) { + try { 
+ r.decRef(); + } catch (Throwable t) { + if (th == null) { + th = t; } } } @@ -561,10 +551,6 @@ public class TestDemoParallelLeafReader extends LuceneTestCase { super.setMergeInfo(info); } - @Override - public MergePolicy.DocMap getDocMap(final MergeState mergeState) { - return super.getDocMap(mergeState); - } } class ReindexingMergeSpecification extends MergeSpecification { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDoc.java b/lucene/core/src/test/org/apache/lucene/index/TestDoc.java index 803b1d9bc65..8b24b4d7bc5 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDoc.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDoc.java @@ -218,7 +218,7 @@ public class TestDoc extends LuceneTestCase { final Codec codec = Codec.getDefault(); TrackingDirectoryWrapper trackingDir = new TrackingDirectoryWrapper(si1.info.dir); - final SegmentInfo si = new SegmentInfo(si1.info.dir, Version.LATEST, merged, -1, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>()); + final SegmentInfo si = new SegmentInfo(si1.info.dir, Version.LATEST, merged, -1, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null); SegmentMerger merger = new SegmentMerger(Arrays.asList(r1, r2), si, InfoStream.getDefault(), trackingDir, diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocIDMerger.java b/lucene/core/src/test/org/apache/lucene/index/TestDocIDMerger.java new file mode 100644 index 00000000000..9251b00d349 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/index/TestDocIDMerger.java @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.index; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; + +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +public class TestDocIDMerger extends LuceneTestCase { + + private static class TestSubUnsorted extends DocIDMerger.Sub { + private int docID = -1; + final int valueStart; + final int maxDoc; + + public TestSubUnsorted(MergeState.DocMap docMap, Bits liveDocs, int maxDoc, int valueStart) { + super(docMap, liveDocs); + this.maxDoc = maxDoc; + this.valueStart = valueStart; + } + + @Override + public int nextDoc() { + docID++; + if (docID == maxDoc) { + return NO_MORE_DOCS; + } else { + return docID; + } + } + + public int getValue() { + return valueStart + docID; + } + } + + public void testNoSort() throws Exception { + + int subCount = TestUtil.nextInt(random(), 1, 20); + List subs = new ArrayList<>(); + int valueStart = 0; + for(int i=0;i merger = new DocIDMerger<>(subs, false); + + int count = 0; + while (true) { + TestSubUnsorted sub = merger.next(); + if (sub == null) { + break; + } + assertEquals(count, sub.mappedDocID); + assertEquals(count, sub.getValue()); + count++; + } + + assertEquals(valueStart, count); + } + + private static class TestSubSorted extends DocIDMerger.Sub { + private int docID = -1; + final int maxDoc; + final int index; + + public TestSubSorted(MergeState.DocMap docMap, Bits liveDocs, int maxDoc, int index) { + super(docMap, liveDocs); + this.maxDoc = maxDoc; + this.index = index; + } + + @Override + public int nextDoc() { + docID++; + if (docID == maxDoc) { + return NO_MORE_DOCS; + } else { + return docID; + } + } + + @Override + public String toString() { + return "TestSubSorted(index=" + index + ", mappedDocID=" + mappedDocID+ ")"; + } + } + + public void testWithSort() throws Exception { + + int subCount = TestUtil.nextInt(random(), 1, 20); + List oldToNew = new ArrayList<>(); + // how many docs we've written to each sub: + List uptos = new ArrayList<>(); + int totDocCount = 0; + for(int i=0;i completedSubs = new ArrayList<>(); + + // randomly distribute target docIDs into the segments: + for(int docID=0;docID subs = new ArrayList<>(); + for(int i=0;i merger = new DocIDMerger<>(subs, true); + + int count = 0; + while (true) { + TestSubSorted sub = merger.next(); + if (sub == null) { + break; + } + assertEquals(count, sub.mappedDocID); + count++; + } + + assertEquals(totDocCount, count); + } + + // nocommit more tests, e.g. deleted docs +} diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java new file mode 100644 index 00000000000..2635b00ada7 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -0,0 +1,792 @@ +package org.apache.lucene.index; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.codecs.simpletext.SimpleTextCodec; +import org.apache.lucene.document.BinaryDocValuesField; +import org.apache.lucene.document.BinaryPoint; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.SortedNumericDocValuesField; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.PointValues.IntersectVisitor; +import org.apache.lucene.index.PointValues.Relation; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.search.CollectionStatistics; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TermStatistics; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.util.TestUtil; +import org.junit.AfterClass; +import org.junit.BeforeClass; + +// nocommit test tie break +// nocommit test multiple sorts +// nocommit test update dvs + +// nocommit test EarlyTerminatingCollector + +public class TestIndexSorting extends LuceneTestCase { + + public void testSortOnMerge(boolean withDeletes) throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + iwc.setCodec(new SimpleTextCodec()); // nocommit only simple-text supports sorting so far + Sort indexSort = new Sort(new SortField("foo", SortField.Type.LONG)); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + final int numDocs = atLeast(200); + final FixedBitSet deleted = new FixedBitSet(numDocs); + for (int i = 0; i < numDocs; ++i) { + Document doc = new Document(); + 
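// (Clarifying comment: each document gets a random "foo" value as the sort
// key, plus a stored "id" field and an "id" doc value so the checks below
// can locate and verify every document after sorting and merging.)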
doc.add(new NumericDocValuesField("foo", random().nextInt(20))); + doc.add(new StringField("id", Integer.toString(i), Store.YES)); + doc.add(new NumericDocValuesField("id", i)); + w.addDocument(doc); + if (random().nextInt(5) == 0) { + w.getReader().close(); + } else if (random().nextInt(30) == 0) { + w.forceMerge(2); + } else if (withDeletes && random().nextInt(4) == 0) { + final int id = TestUtil.nextInt(random(), 0, i); + deleted.set(id); + w.deleteDocuments(new Term("id", Integer.toString(id))); + } + } + + // Check that segments are sorted + DirectoryReader reader = w.getReader(); + for (LeafReaderContext ctx : reader.leaves()) { + final SegmentReader leaf = (SegmentReader) ctx.reader(); + SegmentInfo info = leaf.getSegmentInfo().info; + switch (info.getDiagnostics().get(IndexWriter.SOURCE)) { + case IndexWriter.SOURCE_FLUSH: + assertNull(info.getIndexSort()); + break; + case IndexWriter.SOURCE_MERGE: + assertEquals(indexSort, info.getIndexSort()); + final NumericDocValues values = leaf.getNumericDocValues("foo"); + long previous = Long.MIN_VALUE; + for (int i = 0; i < leaf.maxDoc(); ++i) { + final long value = values.get(i); + assertTrue(value >= previous); + previous = value; + } + break; + default: + fail(); + } + } + + // Now check that the index is consistent + IndexSearcher searcher = newSearcher(reader); + for (int i = 0; i < numDocs; ++i) { + TermQuery termQuery = new TermQuery(new Term("id", Integer.toString(i))); + final TopDocs topDocs = searcher.search(termQuery, 1); + if (deleted.get(i)) { + assertEquals(0, topDocs.totalHits); + } else { + assertEquals(1, topDocs.totalHits); + assertEquals(i, MultiDocValues.getNumericValues(reader, "id").get(topDocs.scoreDocs[0].doc)); + Document document = reader.document(topDocs.scoreDocs[0].doc); + assertEquals(Integer.toString(i), document.get("id")); + } + } + + reader.close(); + w.close(); + dir.close(); + } + + public void testSortOnMerge() throws IOException { + testSortOnMerge(false); + } + + public void testSortOnMergeWithDeletes() throws IOException { + testSortOnMerge(true); + } + + static class UpdateRunnable implements Runnable { + + private final int numDocs; + private final Random random; + private final AtomicInteger updateCount; + private final IndexWriter w; + private final Map<Integer,Long> values; + private final CountDownLatch latch; + + UpdateRunnable(int numDocs, Random random, CountDownLatch latch, AtomicInteger updateCount, IndexWriter w, Map<Integer,Long> values) { + this.numDocs = numDocs; + this.random = random; + this.latch = latch; + this.updateCount = updateCount; + this.w = w; + this.values = values; + } + + @Override + public void run() { + try { + latch.await(); + while (updateCount.decrementAndGet() >= 0) { + final int id = random.nextInt(numDocs); + final long value = random.nextInt(20); + Document doc = new Document(); + doc.add(new StringField("id", Integer.toString(id), Store.NO)); + doc.add(new NumericDocValuesField("foo", value)); + + synchronized (values) { + w.updateDocument(new Term("id", Integer.toString(id)), doc); + values.put(id, value); + } + + switch (random.nextInt(10)) { + case 0: + case 1: + // reopen + DirectoryReader.open(w).close(); + break; + case 2: + w.forceMerge(3); + break; + } + } + } catch (IOException | InterruptedException e) { + throw new RuntimeException(e); + } + } + + } + + // There is tricky logic to resolve deletes that happened while merging + public void testConcurrentUpdates() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
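// (Clarifying comment: same sorted-writer recipe as testSortOnMerge above,
// i.e. the SimpleTextCodec, which is the only codec this initial patch wires
// up for sorting, plus an index sort on the long field "foo".)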
iwc.setCodec(new SimpleTextCodec()); // nocommit only simple-text supports sorting so far + Sort indexSort = new Sort(new SortField("foo", SortField.Type.LONG)); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Map<Integer,Long> values = new HashMap<>(); + + final int numDocs = atLeast(100); + Thread[] threads = new Thread[2]; + final AtomicInteger updateCount = new AtomicInteger(atLeast(1000)); + final CountDownLatch latch = new CountDownLatch(1); + for (int i = 0; i < threads.length; ++i) { + Random r = new Random(random().nextLong()); + threads[i] = new Thread(new UpdateRunnable(numDocs, r, latch, updateCount, w, values)); + } + for (Thread thread : threads) { + thread.start(); + } + latch.countDown(); + for (Thread thread : threads) { + thread.join(); + } + w.forceMerge(1); + DirectoryReader reader = DirectoryReader.open(w); + IndexSearcher searcher = newSearcher(reader); + for (int i = 0; i < numDocs; ++i) { + final TopDocs topDocs = searcher.search(new TermQuery(new Term("id", Integer.toString(i))), 1); + if (values.containsKey(i) == false) { + assertEquals(0, topDocs.totalHits); + } else { + assertEquals(1, topDocs.totalHits); + assertEquals(values.get(i).longValue(), MultiDocValues.getNumericValues(reader, "foo").get(topDocs.scoreDocs[0].doc)); + } + } + reader.close(); + w.close(); + dir.close(); + } + + static class DVUpdateRunnable implements Runnable { + + private final int numDocs; + private final Random random; + private final AtomicInteger updateCount; + private final IndexWriter w; + private final Map<Integer,Long> values; + private final CountDownLatch latch; + + DVUpdateRunnable(int numDocs, Random random, CountDownLatch latch, AtomicInteger updateCount, IndexWriter w, Map<Integer,Long> values) { + this.numDocs = numDocs; + this.random = random; + this.latch = latch; + this.updateCount = updateCount; + this.w = w; + this.values = values; + } + + @Override + public void run() { + try { + latch.await(); + while (updateCount.decrementAndGet() >= 0) { + final int id = random.nextInt(numDocs); + final long value = random.nextInt(20); + + synchronized (values) { + w.updateDocValues(new Term("id", Integer.toString(id)), new NumericDocValuesField("foo", value)); + values.put(id, value); + } + + switch (random.nextInt(10)) { + case 0: + case 1: + // reopen + DirectoryReader.open(w).close(); + break; + case 2: + w.forceMerge(3); + break; + } + } + } catch (IOException | InterruptedException e) { + throw new RuntimeException(e); + } + } + + } + + // There is tricky logic to resolve dv updates that happened while merging + public void testConcurrentDVUpdates() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + iwc.setCodec(new SimpleTextCodec()); // nocommit only simple-text supports sorting so far + Sort indexSort = new Sort(new SortField("foo", SortField.Type.LONG)); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Map<Integer,Long> values = new HashMap<>(); + + final int numDocs = atLeast(100); + for (int i = 0; i < numDocs; ++i) { + Document doc = new Document(); + doc.add(new StringField("id", Integer.toString(i), Store.NO)); + doc.add(new NumericDocValuesField("foo", -1)); + w.addDocument(doc); + values.put(i, -1L); + } + Thread[] threads = new Thread[2]; + final AtomicInteger updateCount = new AtomicInteger(atLeast(1000)); + final CountDownLatch latch = new CountDownLatch(1); + for (int i = 0; i < threads.length; ++i) { + Random r = new Random(random().nextLong()); + threads[i] = new Thread(new
UpdateRunnable(numDocs, r, latch, updateCount, w, values)); + } + for (Thread thread : threads) { + thread.start(); + } + latch.countDown(); + for (Thread thread : threads) { + thread.join(); + } + w.forceMerge(1); + DirectoryReader reader = DirectoryReader.open(w); + IndexSearcher searcher = newSearcher(reader); + for (int i = 0; i < numDocs; ++i) { + final TopDocs topDocs = searcher.search(new TermQuery(new Term("id", Integer.toString(i))), 1); + assertEquals(1, topDocs.totalHits); + assertEquals(values.get(i).longValue(), MultiDocValues.getNumericValues(reader, "foo").get(topDocs.scoreDocs[0].doc)); + } + reader.close(); + w.close(); + dir.close(); + } + + public void testAddIndexes(boolean withDeletes) throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + final int numDocs = atLeast(100); + for (int i = 0; i < numDocs; ++i) { + Document doc = new Document(); + doc.add(new StringField("id", Integer.toString(i), Store.NO)); + doc.add(new NumericDocValuesField("foo", random().nextInt(20))); + w.addDocument(doc); + } + if (withDeletes) { + for (int i = random().nextInt(5); i < numDocs; i += TestUtil.nextInt(random(), 1, 5)) { + w.deleteDocuments(new Term("id", Integer.toString(i))); + } + } + final IndexReader reader = w.getReader(); + + Directory dir2 = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + iwc.setCodec(new SimpleTextCodec()); // nocommit only simple-text supports sorting so far + Sort indexSort = new Sort(new SortField("foo", SortField.Type.LONG)); + iwc.setIndexSort(indexSort); + IndexWriter w2 = new IndexWriter(dir2, iwc); + + CodecReader[] codecReaders = new CodecReader[reader.leaves().size()]; + for (int i = 0; i < codecReaders.length; ++i) { + codecReaders[i] = (CodecReader) reader.leaves().get(i).reader(); + } + w2.addIndexes(codecReaders); + final IndexReader reader2 = w2.getReader(); + final IndexSearcher searcher = newSearcher(reader); + final IndexSearcher searcher2 = newSearcher(reader2); + for (int i = 0; i < numDocs; ++i) { + Query query = new TermQuery(new Term("id", Integer.toString(i))); + final TopDocs topDocs = searcher.search(query, 1); + final TopDocs topDocs2 = searcher2.search(query, 1); + assertEquals(topDocs.totalHits, topDocs2.totalHits); + if (topDocs.totalHits == 1) { + assertEquals( + MultiDocValues.getNumericValues(reader, "foo").get(topDocs.scoreDocs[0].doc), + MultiDocValues.getNumericValues(reader2, "foo").get(topDocs2.scoreDocs[0].doc)); + } + } + + IOUtils.close(reader, reader2, w, w2, dir, dir2); + } + + public void testAddIndexes() throws Exception { + testAddIndexes(false); + } + + public void testAddIndexesWithDeletions() throws Exception { + testAddIndexes(true); + } + + public void testBadSort() throws Exception { + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { + iwc.setIndexSort(Sort.RELEVANCE); + }); + assertEquals("invalid SortField type: must be one of [STRING, INT, FLOAT, LONG, DOUBLE, BYTES] but got: ", expected.getMessage()); + } + + // you can't change the index sort on an existing index: + public void testIllegalChangeSort() throws Exception { + final Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + iwc.setCodec(new SimpleTextCodec()); // nocommit only simple-text supports sorting so far + iwc.setIndexSort(new Sort(new SortField("foo", 
SortField.Type.LONG))); + IndexWriter w = new IndexWriter(dir, iwc); + w.addDocument(new Document()); + DirectoryReader.open(w).close(); + w.addDocument(new Document()); + w.forceMerge(1); + w.close(); + + final IndexWriterConfig iwc2 = new IndexWriterConfig(new MockAnalyzer(random())); + iwc2.setCodec(new SimpleTextCodec()); // nocommit only simple-text supports sorting so far + iwc2.setIndexSort(new Sort(new SortField("bar", SortField.Type.LONG))); + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { + new IndexWriter(dir, iwc2); + }); + assertEquals("cannot change previous indexSort= (from segment=_2(7.0.0):c2:[indexSort=]) to new indexSort=", expected.getMessage()); + dir.close(); + } + + static final class NormsSimilarity extends Similarity { + + private final Similarity in; + + public NormsSimilarity(Similarity in) { + this.in = in; + } + + @Override + public long computeNorm(FieldInvertState state) { + if (state.getName().equals(NORMS_FIELD)) { + return Float.floatToIntBits(state.getBoost()); + } else { + return in.computeNorm(state); + } + } + + @Override + public SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) { + return in.computeWeight(collectionStats, termStats); + } + + @Override + public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException { + return in.simScorer(weight, context); + } + + } + + static final class PositionsTokenStream extends TokenStream { + + private final CharTermAttribute term; + private final PayloadAttribute payload; + private final OffsetAttribute offset; + + private int pos, off; + + public PositionsTokenStream() { + term = addAttribute(CharTermAttribute.class); + payload = addAttribute(PayloadAttribute.class); + offset = addAttribute(OffsetAttribute.class); + } + + @Override + public boolean incrementToken() throws IOException { + if (pos == 0) { + return false; + } + + clearAttributes(); + term.append(DOC_POSITIONS_TERM); + payload.setPayload(new BytesRef(Integer.toString(pos))); + offset.setOffset(off, off); + --pos; + ++off; + return true; + } + + void setId(int id) { + pos = id / 10 + 1; + off = 0; + } + } + + private static Directory dir; + private static IndexReader sortedReader; + + private static final FieldType TERM_VECTORS_TYPE = new FieldType(TextField.TYPE_NOT_STORED); + static { + TERM_VECTORS_TYPE.setStoreTermVectors(true); + TERM_VECTORS_TYPE.freeze(); + } + + private static final FieldType POSITIONS_TYPE = new FieldType(TextField.TYPE_NOT_STORED); + static { + POSITIONS_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + POSITIONS_TYPE.freeze(); + } + + private static final String ID_FIELD = "id"; + private static final String DOCS_ENUM_FIELD = "docs"; + private static final String DOCS_ENUM_TERM = "$all$"; + private static final String DOC_POSITIONS_FIELD = "positions"; + private static final String DOC_POSITIONS_TERM = "$all$"; + private static final String NUMERIC_DV_FIELD = "numeric"; + private static final String SORTED_NUMERIC_DV_FIELD = "sorted_numeric"; + private static final String NORMS_FIELD = "norm"; + private static final String BINARY_DV_FIELD = "binary"; + private static final String SORTED_DV_FIELD = "sorted"; + private static final String SORTED_SET_DV_FIELD = "sorted_set"; + private static final String TERM_VECTORS_FIELD = "term_vectors"; + private static final String DIMENSIONAL_FIELD = "numeric1d"; + + private static Document doc(final int id, PositionsTokenStream positions) { + 
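// Editor's note (commentary added while editing; not part of the original patch): + // this fixture document touches every structure a sorting merge must permute: stored + // fields, postings with positions/offsets/payloads (PositionsTokenStream), norms (via + // NormsSimilarity and setBoost), every doc-values type (numeric, binary, sorted, + // sorted-set, sorted-numeric), term vectors, and a one-dimensional point. Each field + // encodes the doc's id so the per-structure tests below can check that values still + // line up after the sort. + 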
final Document doc = new Document(); + doc.add(new StringField(ID_FIELD, Integer.toString(id), Store.YES)); + doc.add(new StringField(DOCS_ENUM_FIELD, DOCS_ENUM_TERM, Store.NO)); + positions.setId(id); + doc.add(new Field(DOC_POSITIONS_FIELD, positions, POSITIONS_TYPE)); + doc.add(new NumericDocValuesField(NUMERIC_DV_FIELD, id)); + TextField norms = new TextField(NORMS_FIELD, Integer.toString(id), Store.NO); + norms.setBoost(Float.intBitsToFloat(id)); + doc.add(norms); + doc.add(new BinaryDocValuesField(BINARY_DV_FIELD, new BytesRef(Integer.toString(id)))); + doc.add(new SortedDocValuesField(SORTED_DV_FIELD, new BytesRef(Integer.toString(id)))); + doc.add(new SortedSetDocValuesField(SORTED_SET_DV_FIELD, new BytesRef(Integer.toString(id)))); + doc.add(new SortedSetDocValuesField(SORTED_SET_DV_FIELD, new BytesRef(Integer.toString(id + 1)))); + doc.add(new SortedNumericDocValuesField(SORTED_NUMERIC_DV_FIELD, id)); + doc.add(new SortedNumericDocValuesField(SORTED_NUMERIC_DV_FIELD, id + 1)); + doc.add(new Field(TERM_VECTORS_FIELD, Integer.toString(id), TERM_VECTORS_TYPE)); + byte[] bytes = new byte[4]; + NumericUtils.intToSortableBytes(id, bytes, 0); + doc.add(new BinaryPoint(DIMENSIONAL_FIELD, bytes)); + return doc; + } + + @AfterClass + public static void afterClass() throws Exception { + if (sortedReader != null) { + sortedReader.close(); + sortedReader = null; + } + if (dir != null) { + dir.close(); + dir = null; + } + } + + @BeforeClass + public static void createIndex() throws Exception { + dir = newFSDirectory(createTempDir()); + int numDocs = atLeast(100); + + List<Integer> ids = new ArrayList<>(); + for (int i = 0; i < numDocs; i++) { + ids.add(Integer.valueOf(i * 10)); + } + // shuffle them for indexing + Collections.shuffle(ids, random()); + if (VERBOSE) { + System.out.println("Shuffled IDs for indexing: " + Arrays.toString(ids.toArray())); + } + + PositionsTokenStream positions = new PositionsTokenStream(); + IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); + // nocommit: + conf.setCodec(new SimpleTextCodec()); + conf.setMaxBufferedDocs(4); // create some segments + conf.setSimilarity(new NormsSimilarity(conf.getSimilarity())); // for testing norms field + // nocommit + conf.setMergeScheduler(new SerialMergeScheduler()); + // sort the index by id (as integer, in NUMERIC_DV_FIELD) + conf.setIndexSort(new Sort(new SortField(NUMERIC_DV_FIELD, SortField.Type.INT))); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf); + writer.setDoRandomForceMerge(false); + for (int id : ids) { + writer.addDocument(doc(id, positions)); + } + // delete some documents + writer.commit(); + // nocommit need thread safety test too + for (Integer id : ids) { + if (random().nextDouble() < 0.2) { + if (VERBOSE) { + System.out.println("delete doc_id " + id); + } + writer.deleteDocuments(new Term(ID_FIELD, id.toString())); + } + } + + sortedReader = writer.getReader(); + writer.close(); + + TestUtil.checkReader(sortedReader); + } + + // nocommit just do assertReaderEquals, don't use @BeforeClass, etc.? 
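+ + // Editor's sketch (illustrative addition, not part of the original patch): the distilled + // end-to-end usage that this test class exercises, using only APIs that appear elsewhere + // in this patch (IndexWriterConfig#setIndexSort, the SimpleTextCodec restriction); the + // field name "foo" is arbitrary. + public void testMinimalIndexSortSketch() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + iwc.setCodec(new SimpleTextCodec()); // nocommit only simple-text supports sorting so far + iwc.setIndexSort(new Sort(new SortField("foo", SortField.Type.LONG))); + IndexWriter w = new IndexWriter(dir, iwc); + for (int i = 0; i < 20; ++i) { + Document doc = new Document(); + doc.add(new NumericDocValuesField("foo", random().nextInt(100))); + w.addDocument(doc); + if (i % 7 == 0) { + w.commit(); // spread the docs over several flushed (insertion-ordered) segments + } + } + w.forceMerge(1); // the merge rewrites the documents in increasing order of "foo" + DirectoryReader reader = DirectoryReader.open(w); + NumericDocValues foo = MultiDocValues.getNumericValues(reader, "foo"); + long previous = Long.MIN_VALUE; + for (int docID = 0; docID < reader.maxDoc(); ++docID) { + assertTrue(foo.get(docID) >= previous); + previous = foo.get(docID); + } + IOUtils.close(reader, w, dir); + }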
+ + public void testBinaryDocValuesField() throws Exception { + for(LeafReaderContext ctx : sortedReader.leaves()) { + LeafReader reader = ctx.reader(); + BinaryDocValues dv = reader.getBinaryDocValues(BINARY_DV_FIELD); + boolean isSorted = reader.getIndexSort() != null; + int lastID = Integer.MIN_VALUE; + for (int docID = 0; docID < reader.maxDoc(); docID++) { + BytesRef bytes = dv.get(docID); + String idString = reader.document(docID).get(ID_FIELD); + assertEquals("incorrect binary DocValues for doc " + docID, idString, bytes.utf8ToString()); + if (isSorted) { + int id = Integer.parseInt(idString); + assertTrue("lastID=" + lastID + " vs id=" + id, lastID < id); + lastID = id; + } + } + } + } + + public void testPostings() throws Exception { + for(LeafReaderContext ctx : sortedReader.leaves()) { + LeafReader reader = ctx.reader(); + TermsEnum termsEnum = reader.terms(DOC_POSITIONS_FIELD).iterator(); + assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef(DOC_POSITIONS_TERM))); + PostingsEnum sortedPositions = termsEnum.postings(null, PostingsEnum.ALL); + int doc; + boolean isSorted = reader.getIndexSort() != null; + + // test nextDoc() + while ((doc = sortedPositions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + int freq = sortedPositions.freq(); + int id = Integer.parseInt(reader.document(doc).get(ID_FIELD)); + assertEquals("incorrect freq for doc=" + doc, id / 10 + 1, freq); + for (int i = 0; i < freq; i++) { + assertEquals("incorrect position for doc=" + doc, i, sortedPositions.nextPosition()); + assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.startOffset()); + assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.endOffset()); + assertEquals("incorrect payload for doc=" + doc, freq - i, Integer.parseInt(sortedPositions.getPayload().utf8ToString())); + } + } + + // test advance() + final PostingsEnum reuse = sortedPositions; + sortedPositions = termsEnum.postings(reuse, PostingsEnum.ALL); + + doc = 0; + while ((doc = sortedPositions.advance(doc + TestUtil.nextInt(random(), 1, 5))) != DocIdSetIterator.NO_MORE_DOCS) { + int freq = sortedPositions.freq(); + int id = Integer.parseInt(reader.document(doc).get(ID_FIELD)); + assertEquals("incorrect freq for doc=" + doc, id / 10 + 1, freq); + for (int i = 0; i < freq; i++) { + assertEquals("incorrect position for doc=" + doc, i, sortedPositions.nextPosition()); + assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.startOffset()); + assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.endOffset()); + assertEquals("incorrect payload for doc=" + doc, freq - i, Integer.parseInt(sortedPositions.getPayload().utf8ToString())); + } + } + } + } + + public void testDocsAreSortedByID() throws Exception { + for(LeafReaderContext ctx : sortedReader.leaves()) { + LeafReader reader = ctx.reader(); + if (reader.getIndexSort() != null) { + int maxDoc = reader.maxDoc(); + int lastID = Integer.MIN_VALUE; + for(int doc=0;doc<maxDoc;doc++) { + int id = Integer.parseInt(reader.document(doc).get(ID_FIELD)); + assertTrue("lastID=" + lastID + " vs id=" + id, id > lastID); + lastID = id; + } + } + } + } + + public void testNormValues() throws Exception { + for(LeafReaderContext ctx : sortedReader.leaves()) { + LeafReader reader = ctx.reader(); + NumericDocValues dv = reader.getNormValues(NORMS_FIELD); + int maxDoc = reader.maxDoc(); + boolean isSorted = reader.getIndexSort() != null; + for (int doc = 0; doc < maxDoc; doc++) { + int id = Integer.parseInt(reader.document(doc).get(ID_FIELD)); + assertEquals("incorrect norm value for doc " + doc, id, dv.get(doc)); + } + } + } + + public void testNumericDocValuesField() 
throws Exception { + for(LeafReaderContext ctx : sortedReader.leaves()) { + LeafReader reader = ctx.reader(); + NumericDocValues dv = reader.getNumericDocValues(NUMERIC_DV_FIELD); + int maxDoc = reader.maxDoc(); + for (int doc = 0; doc < maxDoc; doc++) { + int id = Integer.parseInt(reader.document(doc).get(ID_FIELD)); + assertEquals("incorrect numeric DocValues for doc " + doc, id, dv.get(doc)); + } + } + } + + public void testSortedDocValuesField() throws Exception { + for(LeafReaderContext ctx : sortedReader.leaves()) { + LeafReader reader = ctx.reader(); + SortedDocValues dv = reader.getSortedDocValues(SORTED_DV_FIELD); + int maxDoc = reader.maxDoc(); + for (int doc = 0; doc < maxDoc; doc++) { + final BytesRef bytes = dv.get(doc); + String id = reader.document(doc).get(ID_FIELD); + assertEquals("incorrect sorted DocValues for doc " + doc, id, bytes.utf8ToString()); + } + } + } + + public void testSortedSetDocValuesField() throws Exception { + for(LeafReaderContext ctx : sortedReader.leaves()) { + LeafReader reader = ctx.reader(); + SortedSetDocValues dv = reader.getSortedSetDocValues(SORTED_SET_DV_FIELD); + int maxDoc = reader.maxDoc(); + for (int doc = 0; doc < maxDoc; doc++) { + dv.setDocument(doc); + BytesRef bytes = dv.lookupOrd(dv.nextOrd()); + String id = reader.document(doc).get(ID_FIELD); + assertEquals("incorrect sorted-set DocValues for doc " + doc, id, bytes.utf8ToString()); + bytes = dv.lookupOrd(dv.nextOrd()); + assertEquals("incorrect sorted-set DocValues for doc " + doc, Integer.valueOf(Integer.parseInt(id) + 1).toString(), bytes.utf8ToString()); + assertEquals(SortedSetDocValues.NO_MORE_ORDS, dv.nextOrd()); + } + } + } + + public void testSortedNumericDocValuesField() throws Exception { + for(LeafReaderContext ctx : sortedReader.leaves()) { + LeafReader reader = ctx.reader(); + SortedNumericDocValues dv = reader.getSortedNumericDocValues(SORTED_NUMERIC_DV_FIELD); + int maxDoc = reader.maxDoc(); + for (int doc = 0; doc < maxDoc; doc++) { + dv.setDocument(doc); + assertEquals(2, dv.count()); + int id = Integer.parseInt(reader.document(doc).get(ID_FIELD)); + assertEquals("incorrect sorted-numeric DocValues for doc " + doc, id, dv.valueAt(0)); + assertEquals("incorrect sorted-numeric DocValues for doc " + doc, id + 1, dv.valueAt(1)); + } + } + } + + public void testTermVectors() throws Exception { + for(LeafReaderContext ctx : sortedReader.leaves()) { + LeafReader reader = ctx.reader(); + int maxDoc = reader.maxDoc(); + for (int doc = 0; doc < maxDoc; doc++) { + Terms terms = reader.getTermVector(doc, TERM_VECTORS_FIELD); + assertNotNull("term vectors not found for doc " + doc + " field [" + TERM_VECTORS_FIELD + "]", terms); + String id = reader.document(doc).get(ID_FIELD); + assertEquals("incorrect term vector for doc " + doc, id, terms.iterator().next().utf8ToString()); + } + } + } + + public void testPoints() throws Exception { + for(LeafReaderContext ctx : sortedReader.leaves()) { + final LeafReader reader = ctx.reader(); + PointValues values = reader.getPointValues(); + values.intersect(DIMENSIONAL_FIELD, + new IntersectVisitor() { + @Override + public void visit(int docID) { + throw new IllegalStateException(); + } + + @Override + public void visit(int docID, byte[] packedValues) throws IOException { + int id = Integer.parseInt(reader.document(docID).get(ID_FIELD)); + assertEquals(id, NumericUtils.sortableBytesToInt(packedValues, 0)); + } + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + return Relation.CELL_CROSSES_QUERY; + } + 
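// Editor's note (commentary added while editing; not part of the original patch): + // answering CELL_CROSSES_QUERY for every cell forces the BKD tree to call the + // value-carrying visit(docID, packedValue) for each document instead of taking the + // match-all visit(docID) shortcut; that is why the int-only visit above throws + // IllegalStateException: every point must be checked against its re-sorted doc id. + 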
}); + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 2c3543e0438..fb3e07ee95d 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -69,6 +69,8 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.BaseDirectoryWrapper; @@ -2759,5 +2761,6 @@ public class TestIndexWriter extends LuceneTestCase { w.close(); dir.close(); } + } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java index 58ceb445ef6..179d2663a58 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java @@ -51,7 +51,7 @@ public class TestSegmentInfos extends LuceneTestCase { SegmentInfos sis = new SegmentInfos(); SegmentInfo info = new SegmentInfo(dir, Version.LUCENE_6_0_0, "_0", 1, false, Codec.getDefault(), - Collections.emptyMap(), id, Collections.emptyMap()); + Collections.emptyMap(), id, Collections.emptyMap(), null); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); SegmentCommitInfo commitInfo = new SegmentCommitInfo(info, 0, -1, -1, -1); @@ -73,14 +73,14 @@ public class TestSegmentInfos extends LuceneTestCase { SegmentInfos sis = new SegmentInfos(); SegmentInfo info = new SegmentInfo(dir, Version.LUCENE_6_0_0, "_0", 1, false, Codec.getDefault(), - Collections.emptyMap(), id, Collections.emptyMap()); + Collections.emptyMap(), id, Collections.emptyMap(), null); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); SegmentCommitInfo commitInfo = new SegmentCommitInfo(info, 0, -1, -1, -1); sis.add(commitInfo); info = new SegmentInfo(dir, Version.LUCENE_6_0_0, "_1", 1, false, Codec.getDefault(), - Collections.emptyMap(), id, Collections.emptyMap()); + Collections.emptyMap(), id, Collections.emptyMap(), null); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); commitInfo = new SegmentCommitInfo(info, 0, -1, -1, -1); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java index e1075abe735..1ef37c0892f 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java @@ -35,6 +35,7 @@ import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.Version; +import org.apache.lucene.util.packed.PackedLongValues; public class TestSegmentMerger extends LuceneTestCase { //The variables for the new merged segment @@ -83,7 +84,7 @@ public class TestSegmentMerger extends LuceneTestCase { public void testMerge() throws IOException { final Codec codec = Codec.getDefault(); - final SegmentInfo si = new SegmentInfo(mergedDir, Version.LATEST, mergedSegment, -1, false, codec, 
Collections.emptyMap(), StringHelper.randomId(), new HashMap<>()); + final SegmentInfo si = new SegmentInfo(mergedDir, Version.LATEST, mergedSegment, -1, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null); SegmentMerger merger = new SegmentMerger(Arrays.asList(reader1, reader2), si, InfoStream.getDefault(), mergedDir, @@ -144,22 +145,9 @@ public class TestSegmentMerger extends LuceneTestCase { mergedReader.close(); } - private static boolean equals(MergeState.DocMap map1, MergeState.DocMap map2) { - if (map1.maxDoc() != map2.maxDoc()) { - return false; - } - for (int i = 0; i < map1.maxDoc(); ++i) { - if (map1.get(i) != map2.get(i)) { - return false; - } - } - return true; - } - public void testBuildDocMap() { final int maxDoc = TestUtil.nextInt(random(), 1, 128); final int numDocs = TestUtil.nextInt(random(), 0, maxDoc); - final int numDeletedDocs = maxDoc - numDocs; final FixedBitSet liveDocs = new FixedBitSet(maxDoc); for (int i = 0; i < numDocs; ++i) { while (true) { @@ -171,15 +159,11 @@ public class TestSegmentMerger extends LuceneTestCase { } } - final MergeState.DocMap docMap = MergeState.DocMap.build(maxDoc, liveDocs); + final PackedLongValues docMap = MergeState.removeDeletes(maxDoc, liveDocs); - assertEquals(maxDoc, docMap.maxDoc()); - assertEquals(numDocs, docMap.numDocs()); - assertEquals(numDeletedDocs, docMap.numDeletedDocs()); // assert the mapping is compact for (int i = 0, del = 0; i < maxDoc; ++i) { - if (!liveDocs.get(i)) { - assertEquals(-1, docMap.get(i)); + if (liveDocs.get(i) == false) { ++del; } else { assertEquals(i - del, docMap.get(i)); diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java index 38b3fb5c87c..b1a8f8d3f88 100644 --- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java @@ -25,6 +25,7 @@ import java.util.BitSet; import java.util.List; import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.MergeState; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.index.PointValues.Relation; import org.apache.lucene.store.CorruptingIndexOutput; @@ -554,7 +555,7 @@ public class TestBKD extends LuceneTestCase { } List toMerge = null; - List docIDBases = null; + List docMaps = null; int seg = 0; BKDWriter w = new BKDWriter(numValues, dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB, docValues.length, false); @@ -601,9 +602,15 @@ public class TestBKD extends LuceneTestCase { if (useMerge && segCount == valuesInThisSeg) { if (toMerge == null) { toMerge = new ArrayList<>(); - docIDBases = new ArrayList<>(); + docMaps = new ArrayList<>(); } - docIDBases.add(lastDocIDBase); + final int curDocIDBase = lastDocIDBase; + docMaps.add(new MergeState.DocMap() { + @Override + public int get(int docID) { + return curDocIDBase + docID; + } + }); toMerge.add(w.finish(out)); valuesInThisSeg = TestUtil.nextInt(random(), numValues/10, numValues/2); segCount = 0; @@ -620,8 +627,14 @@ public class TestBKD extends LuceneTestCase { if (toMerge != null) { if (segCount > 0) { - docIDBases.add(lastDocIDBase); toMerge.add(w.finish(out)); + final int curDocIDBase = lastDocIDBase; + docMaps.add(new MergeState.DocMap() { + @Override + public int get(int docID) { + return curDocIDBase + docID; + } + }); } out.close(); in = dir.openInput("bkd", IOContext.DEFAULT); @@ -633,7 +646,7 @@ public class TestBKD extends 
LuceneTestCase { readers.add(new BKDReader(in)); } out = dir.createOutput("bkd2", IOContext.DEFAULT); - indexFP = w.merge(out, null, readers, docIDBases); + indexFP = w.merge(out, docMaps, readers); out.close(); in.close(); in = dir.openInput("bkd2", IOContext.DEFAULT); diff --git a/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java b/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java index c672ed00bac..368c2854a22 100644 --- a/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java +++ b/lucene/misc/src/java/org/apache/lucene/index/IndexSplitter.java @@ -140,7 +140,7 @@ public class IndexSplitter { SegmentInfo info = infoPerCommit.info; // Same info just changing the dir: SegmentInfo newInfo = new SegmentInfo(destFSDir, info.getVersion(), info.name, info.maxDoc(), - info.getUseCompoundFile(), info.getCodec(), info.getDiagnostics(), info.getId(), new HashMap<>()); + info.getUseCompoundFile(), info.getCodec(), info.getDiagnostics(), info.getId(), new HashMap<>(), null); destInfos.add(new SegmentCommitInfo(newInfo, infoPerCommit.getDelCount(), infoPerCommit.getDelGen(), infoPerCommit.getFieldInfosGen(), infoPerCommit.getDocValuesGen())); diff --git a/lucene/misc/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java b/lucene/misc/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java index de79ab07f76..6cd990b00fe 100644 --- a/lucene/misc/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java +++ b/lucene/misc/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java @@ -24,6 +24,7 @@ import java.util.Map; import org.apache.lucene.index.MultiDocValues.MultiSortedDocValues; import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues; import org.apache.lucene.index.MultiDocValues.OrdinalMap; +import org.apache.lucene.search.Sort; import org.apache.lucene.util.Bits; /** @@ -67,6 +68,11 @@ public final class SlowCompositeReaderWrapper extends LeafReader { if (getFieldInfos().hasPointValues()) { throw new IllegalArgumentException("cannot wrap points"); } + for(LeafReaderContext context : reader.leaves()) { + if (context.reader().getIndexSort() != null) { + throw new IllegalArgumentException("cannot use index sort"); + } + } fields = MultiFields.getFields(in); in.registerParentReader(this); this.merging = merging; @@ -272,4 +278,9 @@ public final class SlowCompositeReaderWrapper extends LeafReader { ctx.reader().checkIntegrity(); } } + + @Override + public Sort getIndexSort() { + return null; + } } diff --git a/lucene/misc/src/java/org/apache/lucene/index/SortingMergePolicy.java b/lucene/misc/src/java/org/apache/lucene/index/SortingMergePolicy.java deleted file mode 100644 index cd8f84e056e..00000000000 --- a/lucene/misc/src/java/org/apache/lucene/index/SortingMergePolicy.java +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.index; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Map; - -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.MergePolicy; -import org.apache.lucene.index.MergeState; -import org.apache.lucene.index.MergeTrigger; -import org.apache.lucene.index.MultiReader; -import org.apache.lucene.index.SegmentCommitInfo; -import org.apache.lucene.index.SegmentInfo; -import org.apache.lucene.index.SegmentInfos; -import org.apache.lucene.index.SegmentReader; -import org.apache.lucene.index.SlowCompositeReaderWrapper; -import org.apache.lucene.search.Sort; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.InfoStream; -import org.apache.lucene.util.packed.PackedInts; -import org.apache.lucene.util.packed.PackedLongValues; - -/** A {@link MergePolicy} that reorders documents according to a {@link Sort} - * before merging them. As a consequence, all segments resulting from a merge - * will be sorted while segments resulting from a flush will be in the order - * in which documents have been added. - *
      NOTE: Never use this policy if you rely on - * {@link IndexWriter#addDocuments(Iterable) IndexWriter.addDocuments} - * to have sequentially-assigned doc IDs, this policy will scatter doc IDs. - *
      NOTE: This policy should only be used with idempotent {@code Sort}s - * so that the order of segments is predictable. For example, using - * {@link Sort#INDEXORDER} in reverse (which is not idempotent) will make - * the order of documents in a segment depend on the number of times the segment - * has been merged. - * @lucene.experimental */ -public final class SortingMergePolicy extends MergePolicyWrapper { - - /** - * Put in the {@link SegmentInfo#getDiagnostics() diagnostics} to denote that - * this segment is sorted. - */ - public static final String SORTER_ID_PROP = "sorter"; - - class SortingOneMerge extends OneMerge { - - List unsortedReaders; - Sorter.DocMap docMap; - LeafReader sortedView; - final InfoStream infoStream; - - SortingOneMerge(List segments, InfoStream infoStream) { - super(segments); - this.infoStream = infoStream; - } - - @Override - public List getMergeReaders() throws IOException { - if (unsortedReaders == null) { - unsortedReaders = super.getMergeReaders(); - if (infoStream.isEnabled("SMP")) { - infoStream.message("SMP", "sorting " + unsortedReaders); - for (LeafReader leaf : unsortedReaders) { - String sortDescription = getSortDescription(leaf); - if (sortDescription == null) { - sortDescription = "not sorted"; - } - infoStream.message("SMP", "seg=" + leaf + " " + sortDescription); - } - } - // wrap readers, to be optimal for merge; - List wrapped = new ArrayList<>(unsortedReaders.size()); - for (LeafReader leaf : unsortedReaders) { - if (leaf instanceof SegmentReader) { - leaf = new MergeReaderWrapper((SegmentReader)leaf); - } - wrapped.add(leaf); - } - final LeafReader atomicView; - if (wrapped.size() == 1) { - atomicView = wrapped.get(0); - } else { - final CompositeReader multiReader = new MultiReader(wrapped.toArray(new LeafReader[wrapped.size()])); - atomicView = new SlowCompositeReaderWrapper(multiReader, true); - } - docMap = sorter.sort(atomicView); - sortedView = SortingLeafReader.wrap(atomicView, docMap); - } - // a null doc map means that the readers are already sorted - if (docMap == null) { - if (infoStream.isEnabled("SMP")) { - infoStream.message("SMP", "readers already sorted, omitting sort"); - } - return unsortedReaders; - } else { - if (infoStream.isEnabled("SMP")) { - infoStream.message("SMP", "sorting readers by " + sort); - } - return Collections.singletonList(SlowCodecReaderWrapper.wrap(sortedView)); - } - } - - @Override - public void setMergeInfo(SegmentCommitInfo info) { - Map diagnostics = info.info.getDiagnostics(); - diagnostics.put(SORTER_ID_PROP, sorter.getID()); - super.setMergeInfo(info); - } - - private PackedLongValues getDeletes(List readers) { - PackedLongValues.Builder deletes = PackedLongValues.monotonicBuilder(PackedInts.COMPACT); - int deleteCount = 0; - for (LeafReader reader : readers) { - final int maxDoc = reader.maxDoc(); - final Bits liveDocs = reader.getLiveDocs(); - for (int i = 0; i < maxDoc; ++i) { - if (liveDocs != null && !liveDocs.get(i)) { - ++deleteCount; - } else { - deletes.add(deleteCount); - } - } - } - return deletes.build(); - } - - @Override - public MergePolicy.DocMap getDocMap(final MergeState mergeState) { - if (unsortedReaders == null) { - throw new IllegalStateException(); - } - if (docMap == null) { - return super.getDocMap(mergeState); - } - assert mergeState.docMaps.length == 1; // we returned a singleton reader - final PackedLongValues deletes = getDeletes(unsortedReaders); - return new MergePolicy.DocMap() { - @Override - public int map(int old) { - final int oldWithDeletes = old + 
(int) deletes.get(old); - final int newWithDeletes = docMap.oldToNew(oldWithDeletes); - return mergeState.docMaps[0].get(newWithDeletes); - } - }; - } - - @Override - public String toString() { - return "SortingMergePolicy.SortingOneMerge(segments=" + segString() + " sort=" + sort + ")"; - } - } - - class SortingMergeSpecification extends MergeSpecification { - final InfoStream infoStream; - - SortingMergeSpecification(InfoStream infoStream) { - this.infoStream = infoStream; - } - - @Override - public void add(OneMerge merge) { - super.add(new SortingOneMerge(merge.segments, infoStream)); - } - - @Override - public String segString(Directory dir) { - return "SortingMergeSpec(" + super.segString(dir) + ", sorter=" + sorter + ")"; - } - - } - - /** Returns {@code true} if the given {@code reader} is sorted by the - * {@code sort} given. Typically the given {@code sort} would be the - * {@link SortingMergePolicy#getSort()} order of a {@link SortingMergePolicy}. */ - public static boolean isSorted(LeafReader reader, Sort sort) { - String description = getSortDescription(reader); - if (description != null && description.equals(sort.toString())) { - return true; - } - return false; - } - - private static String getSortDescription(LeafReader reader) { - if (reader instanceof SegmentReader) { - final SegmentReader segReader = (SegmentReader) reader; - final Map diagnostics = segReader.getSegmentInfo().info.getDiagnostics(); - if (diagnostics != null) { - return diagnostics.get(SORTER_ID_PROP); - } - } else if (reader instanceof FilterLeafReader) { - return getSortDescription(FilterLeafReader.unwrap(reader)); - } - return null; - } - - private MergeSpecification sortedMergeSpecification(MergeSpecification specification, InfoStream infoStream) { - if (specification == null) { - return null; - } - MergeSpecification sortingSpec = new SortingMergeSpecification(infoStream); - for (OneMerge merge : specification.merges) { - sortingSpec.add(merge); - } - return sortingSpec; - } - - final Sorter sorter; - final Sort sort; - - /** Create a new {@code MergePolicy} that sorts documents with the given {@code sort}. */ - public SortingMergePolicy(MergePolicy in, Sort sort) { - super(in); - this.sorter = new Sorter(sort); - this.sort = sort; - } - - /** Return the {@link Sort} order that is used to sort segments when merging. 
*/ - public Sort getSort() { - return sort; - } - - @Override - public MergeSpecification findMerges(MergeTrigger mergeTrigger, - SegmentInfos segmentInfos, IndexWriter writer) throws IOException { - return sortedMergeSpecification(in.findMerges(mergeTrigger, segmentInfos, writer), writer.infoStream); - } - - @Override - public MergeSpecification findForcedMerges(SegmentInfos segmentInfos, - int maxSegmentCount, Map segmentsToMerge, IndexWriter writer) - throws IOException { - return sortedMergeSpecification(in.findForcedMerges(segmentInfos, maxSegmentCount, segmentsToMerge, writer), writer.infoStream); - } - - @Override - public MergeSpecification findForcedDeletesMerges(SegmentInfos segmentInfos, IndexWriter writer) - throws IOException { - return sortedMergeSpecification(in.findForcedDeletesMerges(segmentInfos, writer), writer.infoStream); - } - - @Override - public String toString() { - return "SortingMergePolicy(" + in + ", sorter=" + sorter + ")"; - } -} diff --git a/lucene/misc/src/java/org/apache/lucene/search/BlockJoinComparatorSource.java b/lucene/misc/src/java/org/apache/lucene/search/BlockJoinComparatorSource.java index 03a2cb8ee78..7633ff53cd6 100644 --- a/lucene/misc/src/java/org/apache/lucene/search/BlockJoinComparatorSource.java +++ b/lucene/misc/src/java/org/apache/lucene/search/BlockJoinComparatorSource.java @@ -20,13 +20,14 @@ import java.io.IOException; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.ReaderUtil; -import org.apache.lucene.index.SortingMergePolicy; import org.apache.lucene.util.BitSet; +// nocommit what to do here? + /** * Helper class to sort readers that contain blocks of documents. *
- * Note that this class is intended to used with {@link SortingMergePolicy}, + * Note that this class is intended to be used with index sorting, + * and for other purposes has some limitations: *
      • Cannot yet be used with {@link IndexSearcher#searchAfter(ScoreDoc, Query, int, Sort) IndexSearcher.searchAfter} diff --git a/lucene/misc/src/java/org/apache/lucene/search/EarlyTerminatingSortingCollector.java b/lucene/misc/src/java/org/apache/lucene/search/EarlyTerminatingSortingCollector.java index 5d82be41450..731c33551ee 100644 --- a/lucene/misc/src/java/org/apache/lucene/search/EarlyTerminatingSortingCollector.java +++ b/lucene/misc/src/java/org/apache/lucene/search/EarlyTerminatingSortingCollector.java @@ -20,14 +20,14 @@ import java.io.IOException; import java.util.Arrays; import java.util.concurrent.atomic.AtomicBoolean; -import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.SortingMergePolicy; -import org.apache.lucene.search.LeafCollector; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.CollectionTerminatedException; import org.apache.lucene.search.Collector; -import org.apache.lucene.search.FilterLeafCollector; import org.apache.lucene.search.FilterCollector; +import org.apache.lucene.search.FilterLeafCollector; +import org.apache.lucene.search.LeafCollector; import org.apache.lucene.search.Sort; import org.apache.lucene.search.TopDocsCollector; import org.apache.lucene.search.TotalHitCountCollector; @@ -39,8 +39,7 @@ import org.apache.lucene.search.TotalHitCountCollector; * *
* NOTE: the {@code Collector} detects segments sorted according to a - * {@link SortingMergePolicy}'s {@link Sort} and so it's best used in conjunction - * with a {@link SortingMergePolicy}. Also,it collects up to a specified + * sort, as specified via {@link IndexWriterConfig#setIndexSort}. Also, it collects up to a specified * {@code numDocsToCollect} from each segment, and therefore is mostly suitable * for use in conjunction with collectors such as {@link TopDocsCollector}, and * not e.g. {@link TotalHitCountCollector}. @@ -48,24 +47,14 @@ import org.apache.lucene.search.TotalHitCountCollector; * NOTE: If you wrap a {@code TopDocsCollector} that sorts in the same * order as the index order, the returned {@link TopDocsCollector#topDocs() TopDocs} * will be correct. However the total of {@link TopDocsCollector#getTotalHits() - * hit count} will be underestimated since not all matching documents will have + * hit count} will be vastly underestimated since not all matching documents will have * been collected. - *
- * NOTE: This {@code Collector} uses {@link Sort#toString()} to detect - * whether a segment was sorted with the same {@code Sort}. This has - * two implications: - * <ul> - * <li>if a custom comparator is not implemented correctly and returns - * different identifiers for equivalent instances, this collector will not - * detect sorted segments,</li> - * <li>if you suddenly change the {@link IndexWriter}'s - * {@code SortingMergePolicy} to sort according to another criterion and if both - * the old and the new {@code Sort}s have the same identifier, this - * {@code Collector} will incorrectly detect sorted segments.</li> - * </ul>
        * * @lucene.experimental */ + +// nocommit move to core too + public class EarlyTerminatingSortingCollector extends FilterCollector { /** Returns whether collection can be early-terminated if it sorts with the @@ -85,7 +74,6 @@ public class EarlyTerminatingSortingCollector extends FilterCollector { protected final Sort sort; /** Number of documents to collect in each segment */ protected final int numDocsToCollect; - private final Sort mergePolicySort; private final AtomicBoolean terminatedEarly = new AtomicBoolean(false); /** @@ -99,27 +87,26 @@ public class EarlyTerminatingSortingCollector extends FilterCollector { * the number of documents to collect on each segment. When wrapping * a {@link TopDocsCollector}, this number should be the number of * hits. - * @param mergePolicySort - * the sort your {@link SortingMergePolicy} uses * @throws IllegalArgumentException if the sort order doesn't allow for early * termination with the given merge policy. */ - public EarlyTerminatingSortingCollector(Collector in, Sort sort, int numDocsToCollect, Sort mergePolicySort) { + public EarlyTerminatingSortingCollector(Collector in, Sort sort, int numDocsToCollect) { super(in); if (numDocsToCollect <= 0) { throw new IllegalArgumentException("numDocsToCollect must always be > 0, got " + numDocsToCollect); } - if (canEarlyTerminate(sort, mergePolicySort) == false) { - throw new IllegalStateException("Cannot early terminate with sort order " + sort + " if segments are sorted with " + mergePolicySort); - } this.sort = sort; this.numDocsToCollect = numDocsToCollect; - this.mergePolicySort = mergePolicySort; } @Override public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException { - if (SortingMergePolicy.isSorted(context.reader(), mergePolicySort)) { + Sort segmentSort = context.reader().getIndexSort(); + if (segmentSort != null && canEarlyTerminate(sort, segmentSort) == false) { + throw new IllegalStateException("Cannot early terminate with sort order " + sort + " if segments are sorted with " + segmentSort); + } + + if (segmentSort != null) { // segment is sorted, can early-terminate return new FilterLeafCollector(super.getLeafCollector(context)) { private int numCollected; @@ -142,5 +129,4 @@ public class EarlyTerminatingSortingCollector extends FilterCollector { public boolean terminatedEarly() { return terminatedEarly.get(); } - } diff --git a/lucene/misc/src/test/org/apache/lucene/index/IndexSortingTest.java b/lucene/misc/src/test/org/apache/lucene/index/IndexSortingTest.java deleted file mode 100644 index 8b384f41db1..00000000000 --- a/lucene/misc/src/test/org/apache/lucene/index/IndexSortingTest.java +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.index; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.SlowCompositeReaderWrapper; -import org.apache.lucene.search.Sort; -import org.apache.lucene.search.SortField; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.TestUtil; -import org.junit.BeforeClass; - -public class IndexSortingTest extends SorterTestBase { - - private static final Sort[] SORT = new Sort[] { - new Sort(new SortField(NUMERIC_DV_FIELD, SortField.Type.LONG)), - new Sort(new SortField(null, SortField.Type.DOC, true)) - }; - - @BeforeClass - public static void beforeClassSorterUtilTest() throws Exception { - // NOTE: index was created by by super's @BeforeClass - - // only read the values of the undeleted documents, since after addIndexes, - // the deleted ones will be dropped from the index. - Bits liveDocs = unsortedReader.getLiveDocs(); - List values = new ArrayList<>(); - for (int i = 0; i < unsortedReader.maxDoc(); i++) { - if (liveDocs == null || liveDocs.get(i)) { - values.add(Integer.valueOf(unsortedReader.document(i).get(ID_FIELD))); - } - } - int idx = random().nextInt(SORT.length); - Sort sorter = SORT[idx]; - if (idx == 1) { // reverse doc sort - Collections.reverse(values); - } else { - Collections.sort(values); - if (random().nextBoolean()) { - sorter = new Sort(new SortField(NUMERIC_DV_FIELD, SortField.Type.LONG, true)); // descending - Collections.reverse(values); - } - } - sortedValues = values.toArray(new Integer[values.size()]); - if (VERBOSE) { - System.out.println("sortedValues: " + sortedValues); - System.out.println("Sorter: " + sorter); - } - - Directory target = newDirectory(); - IndexWriter writer = new IndexWriter(target, newIndexWriterConfig(null)); - LeafReader reader = SortingLeafReader.wrap(unsortedReader, sorter); - writer.addIndexes(SlowCodecReaderWrapper.wrap(reader)); - writer.close(); - // NOTE: also closes unsortedReader - reader.close(); - dir.close(); - - // CheckIndex the target directory - dir = target; - TestUtil.checkIndex(dir); - - // set reader for tests - sortedReader = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(dir)); - assertFalse("index should not have deletions", sortedReader.hasDeletions()); - } - -} diff --git a/lucene/misc/src/test/org/apache/lucene/index/SorterTestBase.java b/lucene/misc/src/test/org/apache/lucene/index/SorterTestBase.java deleted file mode 100644 index df1c80f881a..00000000000 --- a/lucene/misc/src/test/org/apache/lucene/index/SorterTestBase.java +++ /dev/null @@ -1,405 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.index; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Random; - -import org.apache.lucene.analysis.MockAnalyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.document.BinaryDocValuesField; -import org.apache.lucene.document.BinaryPoint; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field.Store; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.FieldType; -import org.apache.lucene.document.NumericDocValuesField; -import org.apache.lucene.document.SortedDocValuesField; -import org.apache.lucene.document.SortedNumericDocValuesField; -import org.apache.lucene.document.SortedSetDocValuesField; -import org.apache.lucene.document.StringField; -import org.apache.lucene.document.TextField; -import org.apache.lucene.index.PointValues.IntersectVisitor; -import org.apache.lucene.index.PointValues.Relation; -import org.apache.lucene.index.SortingLeafReader.SortingDocsEnum; -import org.apache.lucene.index.TermsEnum.SeekStatus; -import org.apache.lucene.search.CollectionStatistics; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.TermStatistics; -import org.apache.lucene.search.similarities.Similarity; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.FixedBitSet; -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.NumericUtils; -import org.apache.lucene.util.TestUtil; -import org.junit.AfterClass; -import org.junit.BeforeClass; - -public abstract class SorterTestBase extends LuceneTestCase { - - static final class NormsSimilarity extends Similarity { - - private final Similarity in; - - public NormsSimilarity(Similarity in) { - this.in = in; - } - - @Override - public long computeNorm(FieldInvertState state) { - if (state.getName().equals(NORMS_FIELD)) { - return Float.floatToIntBits(state.getBoost()); - } else { - return in.computeNorm(state); - } - } - - @Override - public SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... 
termStats) { - return in.computeWeight(collectionStats, termStats); - } - - @Override - public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException { - return in.simScorer(weight, context); - } - - } - - static final class PositionsTokenStream extends TokenStream { - - private final CharTermAttribute term; - private final PayloadAttribute payload; - private final OffsetAttribute offset; - - private int pos, off; - - public PositionsTokenStream() { - term = addAttribute(CharTermAttribute.class); - payload = addAttribute(PayloadAttribute.class); - offset = addAttribute(OffsetAttribute.class); - } - - @Override - public boolean incrementToken() throws IOException { - if (pos == 0) { - return false; - } - - clearAttributes(); - term.append(DOC_POSITIONS_TERM); - payload.setPayload(new BytesRef(Integer.toString(pos))); - offset.setOffset(off, off); - --pos; - ++off; - return true; - } - - void setId(int id) { - pos = id / 10 + 1; - off = 0; - } - } - - protected static final String ID_FIELD = "id"; - protected static final String DOCS_ENUM_FIELD = "docs"; - protected static final String DOCS_ENUM_TERM = "$all$"; - protected static final String DOC_POSITIONS_FIELD = "positions"; - protected static final String DOC_POSITIONS_TERM = "$all$"; - protected static final String NUMERIC_DV_FIELD = "numeric"; - protected static final String SORTED_NUMERIC_DV_FIELD = "sorted_numeric"; - protected static final String NORMS_FIELD = "norm"; - protected static final String BINARY_DV_FIELD = "binary"; - protected static final String SORTED_DV_FIELD = "sorted"; - protected static final String SORTED_SET_DV_FIELD = "sorted_set"; - protected static final String TERM_VECTORS_FIELD = "term_vectors"; - protected static final String DIMENSIONAL_FIELD = "numeric1d"; - - private static final FieldType TERM_VECTORS_TYPE = new FieldType(TextField.TYPE_NOT_STORED); - static { - TERM_VECTORS_TYPE.setStoreTermVectors(true); - TERM_VECTORS_TYPE.freeze(); - } - - private static final FieldType POSITIONS_TYPE = new FieldType(TextField.TYPE_NOT_STORED); - static { - POSITIONS_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - POSITIONS_TYPE.freeze(); - } - - protected static Directory dir; - protected static LeafReader unsortedReader; - protected static LeafReader sortedReader; - protected static Integer[] sortedValues; - - private static Document doc(final int id, PositionsTokenStream positions) { - final Document doc = new Document(); - doc.add(new StringField(ID_FIELD, Integer.toString(id), Store.YES)); - doc.add(new StringField(DOCS_ENUM_FIELD, DOCS_ENUM_TERM, Store.NO)); - positions.setId(id); - doc.add(new Field(DOC_POSITIONS_FIELD, positions, POSITIONS_TYPE)); - doc.add(new NumericDocValuesField(NUMERIC_DV_FIELD, id)); - TextField norms = new TextField(NORMS_FIELD, Integer.toString(id), Store.NO); - norms.setBoost(Float.intBitsToFloat(id)); - doc.add(norms); - doc.add(new BinaryDocValuesField(BINARY_DV_FIELD, new BytesRef(Integer.toString(id)))); - doc.add(new SortedDocValuesField(SORTED_DV_FIELD, new BytesRef(Integer.toString(id)))); - doc.add(new SortedSetDocValuesField(SORTED_SET_DV_FIELD, new BytesRef(Integer.toString(id)))); - doc.add(new SortedSetDocValuesField(SORTED_SET_DV_FIELD, new BytesRef(Integer.toString(id + 1)))); - doc.add(new SortedNumericDocValuesField(SORTED_NUMERIC_DV_FIELD, id)); - doc.add(new SortedNumericDocValuesField(SORTED_NUMERIC_DV_FIELD, id + 1)); - doc.add(new Field(TERM_VECTORS_FIELD, Integer.toString(id), TERM_VECTORS_TYPE)); - 
byte[] bytes = new byte[4]; - NumericUtils.intToSortableBytes(id, bytes, 0); - // TODO: index time sorting doesn't yet support points - //doc.add(new BinaryPoint(DIMENSIONAL_FIELD, bytes)); - return doc; - } - - /** Creates an unsorted index; subclasses then sort this index and open sortedReader. */ - private static void createIndex(Directory dir, int numDocs, Random random) throws IOException { - List ids = new ArrayList<>(); - for (int i = 0; i < numDocs; i++) { - ids.add(Integer.valueOf(i * 10)); - } - // shuffle them for indexing - Collections.shuffle(ids, random); - if (VERBOSE) { - System.out.println("Shuffled IDs for indexing: " + Arrays.toString(ids.toArray())); - } - - PositionsTokenStream positions = new PositionsTokenStream(); - IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random)); - conf.setMaxBufferedDocs(4); // create some segments - conf.setSimilarity(new NormsSimilarity(conf.getSimilarity())); // for testing norms field - RandomIndexWriter writer = new RandomIndexWriter(random, dir, conf); - writer.setDoRandomForceMerge(false); - for (int id : ids) { - writer.addDocument(doc(id, positions)); - } - // delete some documents - writer.commit(); - for (Integer id : ids) { - if (random.nextDouble() < 0.2) { - if (VERBOSE) { - System.out.println("delete doc_id " + id); - } - writer.deleteDocuments(new Term(ID_FIELD, id.toString())); - } - } - writer.close(); - } - - @BeforeClass - public static void beforeClassSorterTestBase() throws Exception { - dir = newDirectory(); - int numDocs = atLeast(20); - createIndex(dir, numDocs, random()); - - unsortedReader = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(dir)); - } - - @AfterClass - public static void afterClassSorterTestBase() throws Exception { - unsortedReader.close(); - sortedReader.close(); - dir.close(); - unsortedReader = sortedReader = null; - dir = null; - } - - public void testBinaryDocValuesField() throws Exception { - BinaryDocValues dv = sortedReader.getBinaryDocValues(BINARY_DV_FIELD); - for (int i = 0; i < sortedReader.maxDoc(); i++) { - final BytesRef bytes = dv.get(i); - assertEquals("incorrect binary DocValues for doc " + i, sortedValues[i].toString(), bytes.utf8ToString()); - } - } - - public void testDocsAndPositionsEnum() throws Exception { - TermsEnum termsEnum = sortedReader.terms(DOC_POSITIONS_FIELD).iterator(); - assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef(DOC_POSITIONS_TERM))); - PostingsEnum sortedPositions = termsEnum.postings(null, PostingsEnum.ALL); - int doc; - - // test nextDoc() - while ((doc = sortedPositions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - int freq = sortedPositions.freq(); - assertEquals("incorrect freq for doc=" + doc, sortedValues[doc].intValue() / 10 + 1, freq); - for (int i = 0; i < freq; i++) { - assertEquals("incorrect position for doc=" + doc, i, sortedPositions.nextPosition()); - assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.startOffset()); - assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.endOffset()); - assertEquals("incorrect payload for doc=" + doc, freq - i, Integer.parseInt(sortedPositions.getPayload().utf8ToString())); - } - } - - // test advance() - final PostingsEnum reuse = sortedPositions; - sortedPositions = termsEnum.postings(reuse, PostingsEnum.ALL); - if (sortedPositions instanceof SortingDocsEnum) { - assertTrue(((SortingDocsEnum) sortedPositions).reused(reuse)); // make sure reuse worked - } - doc = 0; - while ((doc = sortedPositions.advance(doc + 
TestUtil.nextInt(random(), 1, 5))) != DocIdSetIterator.NO_MORE_DOCS) { - int freq = sortedPositions.freq(); - assertEquals("incorrect freq for doc=" + doc, sortedValues[doc].intValue() / 10 + 1, freq); - for (int i = 0; i < freq; i++) { - assertEquals("incorrect position for doc=" + doc, i, sortedPositions.nextPosition()); - assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.startOffset()); - assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.endOffset()); - assertEquals("incorrect payload for doc=" + doc, freq - i, Integer.parseInt(sortedPositions.getPayload().utf8ToString())); - } - } - } - - Bits randomLiveDocs(int maxDoc) { - if (rarely()) { - if (random().nextBoolean()) { - return null; - } else { - return new Bits.MatchNoBits(maxDoc); - } - } - final FixedBitSet bits = new FixedBitSet(maxDoc); - final int bitsSet = TestUtil.nextInt(random(), 1, maxDoc - 1); - for (int i = 0; i < bitsSet; ++i) { - while (true) { - final int index = random().nextInt(maxDoc); - if (!bits.get(index)) { - bits.set(index); - break; - } - } - } - return bits; - } - - public void testDocsEnum() throws Exception { - TermsEnum termsEnum = sortedReader.terms(DOCS_ENUM_FIELD).iterator(); - assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef(DOCS_ENUM_TERM))); - PostingsEnum docs = termsEnum.postings(null); - - int doc; - while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - assertEquals("incorrect value; doc " + doc, sortedValues[doc].intValue(), Integer.parseInt(sortedReader.document(doc).get(ID_FIELD))); - } - - PostingsEnum reuse = docs; - docs = termsEnum.postings(reuse); - if (docs instanceof SortingDocsEnum) { - assertTrue(((SortingDocsEnum) docs).reused(reuse)); // make sure reuse worked - } - doc = -1; - while ((doc = docs.advance(doc + 1)) != DocIdSetIterator.NO_MORE_DOCS) { - assertEquals("incorrect value; doc " + doc, sortedValues[doc].intValue(), Integer.parseInt(sortedReader.document(doc).get(ID_FIELD))); - } - } - - public void testNormValues() throws Exception { - NumericDocValues dv = sortedReader.getNormValues(NORMS_FIELD); - int maxDoc = sortedReader.maxDoc(); - for (int i = 0; i < maxDoc; i++) { - assertEquals("incorrect norm value for doc " + i, sortedValues[i].intValue(), dv.get(i)); - } - } - - public void testNumericDocValuesField() throws Exception { - NumericDocValues dv = sortedReader.getNumericDocValues(NUMERIC_DV_FIELD); - int maxDoc = sortedReader.maxDoc(); - for (int i = 0; i < maxDoc; i++) { - assertEquals("incorrect numeric DocValues for doc " + i, sortedValues[i].intValue(), dv.get(i)); - } - } - - public void testSortedDocValuesField() throws Exception { - SortedDocValues dv = sortedReader.getSortedDocValues(SORTED_DV_FIELD); - int maxDoc = sortedReader.maxDoc(); - for (int i = 0; i < maxDoc; i++) { - final BytesRef bytes = dv.get(i); - assertEquals("incorrect sorted DocValues for doc " + i, sortedValues[i].toString(), bytes.utf8ToString()); - } - } - - public void testSortedSetDocValuesField() throws Exception { - SortedSetDocValues dv = sortedReader.getSortedSetDocValues(SORTED_SET_DV_FIELD); - int maxDoc = sortedReader.maxDoc(); - for (int i = 0; i < maxDoc; i++) { - dv.setDocument(i); - BytesRef bytes = dv.lookupOrd(dv.nextOrd()); - int value = sortedValues[i].intValue(); - assertEquals("incorrect sorted-set DocValues for doc " + i, Integer.valueOf(value).toString(), bytes.utf8ToString()); - bytes = dv.lookupOrd(dv.nextOrd()); - assertEquals("incorrect sorted-set DocValues for doc " + i, Integer.valueOf(value + 
1).toString(), bytes.utf8ToString()); - assertEquals(SortedSetDocValues.NO_MORE_ORDS, dv.nextOrd()); - } - } - - public void testSortedNumericDocValuesField() throws Exception { - SortedNumericDocValues dv = sortedReader.getSortedNumericDocValues(SORTED_NUMERIC_DV_FIELD); - int maxDoc = sortedReader.maxDoc(); - for (int i = 0; i < maxDoc; i++) { - dv.setDocument(i); - assertEquals(2, dv.count()); - int value = sortedValues[i].intValue(); - assertEquals("incorrect sorted-numeric DocValues for doc " + i, value, dv.valueAt(0)); - assertEquals("incorrect sorted-numeric DocValues for doc " + i, value + 1, dv.valueAt(1)); - } - } - - public void testTermVectors() throws Exception { - int maxDoc = sortedReader.maxDoc(); - for (int i = 0; i < maxDoc; i++) { - Terms terms = sortedReader.getTermVector(i, TERM_VECTORS_FIELD); - assertNotNull("term vectors not found for doc " + i + " field [" + TERM_VECTORS_FIELD + "]", terms); - assertEquals("incorrect term vector for doc " + i, sortedValues[i].toString(), terms.iterator().next().utf8ToString()); - } - } - - // TODO: index sorting doesn't yet support points - /* - public void testPoints() throws Exception { - PointValues values = sortedReader.getPointValues(); - values.intersect(DIMENSIONAL_FIELD, - new IntersectVisitor() { - @Override - public void visit(int docID) { - throw new IllegalStateException(); - } - - @Override - public void visit(int docID, byte[] packedValues) { - assertEquals(sortedValues[docID].intValue(), NumericUtils.bytesToInt(packedValues, 0)); - } - - @Override - public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { - return Relation.CELL_CROSSES_QUERY; - } - }); - } - */ -} diff --git a/lucene/misc/src/test/org/apache/lucene/index/SortingLeafReaderTest.java b/lucene/misc/src/test/org/apache/lucene/index/SortingLeafReaderTest.java deleted file mode 100644 index 3e8cb99ae07..00000000000 --- a/lucene/misc/src/test/org/apache/lucene/index/SortingLeafReaderTest.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.index; - -import java.util.Arrays; - -import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.search.Sort; -import org.apache.lucene.search.SortField; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.TestUtil; -import org.junit.BeforeClass; - -public class SortingLeafReaderTest extends SorterTestBase { - - @BeforeClass - public static void beforeClassSortingLeafReaderTest() throws Exception { - // NOTE: index was created by by super's @BeforeClass - - // sort the index by id (as integer, in NUMERIC_DV_FIELD) - Sort sort = new Sort(new SortField(NUMERIC_DV_FIELD, SortField.Type.INT)); - final Sorter.DocMap docMap = new Sorter(sort).sort(unsortedReader); - - // Sorter.compute also sorts the values - NumericDocValues dv = unsortedReader.getNumericDocValues(NUMERIC_DV_FIELD); - sortedValues = new Integer[unsortedReader.maxDoc()]; - for (int i = 0; i < unsortedReader.maxDoc(); ++i) { - sortedValues[docMap.oldToNew(i)] = (int)dv.get(i); - } - if (VERBOSE) { - System.out.println("docMap: " + docMap); - System.out.println("sortedValues: " + Arrays.toString(sortedValues)); - } - - // sort the index by id (as integer, in NUMERIC_DV_FIELD) - sortedReader = SortingLeafReader.wrap(unsortedReader, sort); - - if (VERBOSE) { - System.out.print("mapped-deleted-docs: "); - Bits mappedLiveDocs = sortedReader.getLiveDocs(); - for (int i = 0; i < mappedLiveDocs.length(); i++) { - if (!mappedLiveDocs.get(i)) { - System.out.print(i + " "); - } - } - System.out.println(); - } - - TestUtil.checkReader(sortedReader); - } - - public void testBadSort() throws Exception { - IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { - SortingLeafReader.wrap(sortedReader, Sort.RELEVANCE); - }); - assertEquals("Cannot sort an index with a Sort that refers to the relevance score", expected.getMessage()); - } - -} diff --git a/lucene/misc/src/test/org/apache/lucene/index/TestSortingMergePolicy.java b/lucene/misc/src/test/org/apache/lucene/index/TestSortingMergePolicy.java deleted file mode 100644 index a5486f4ce7e..00000000000 --- a/lucene/misc/src/test/org/apache/lucene/index/TestSortingMergePolicy.java +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.index; - -import java.io.IOException; -import java.lang.reflect.Method; -import java.lang.reflect.Modifier; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Random; -import java.util.Set; - -import org.apache.lucene.analysis.MockAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field.Store; -import org.apache.lucene.document.NumericDocValuesField; -import org.apache.lucene.document.StringField; -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.LogMergePolicy; -import org.apache.lucene.index.MergePolicy; -import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.index.RandomIndexWriter; -import org.apache.lucene.index.SlowCompositeReaderWrapper; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TieredMergePolicy; -import org.apache.lucene.search.Sort; -import org.apache.lucene.search.SortField; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.TestUtil; - -import com.carrotsearch.randomizedtesting.generators.RandomPicks; - -public class TestSortingMergePolicy extends BaseMergePolicyTestCase { - - private List terms; - private Directory dir1, dir2; - private Sort sort; - private boolean reversedSort; - private IndexReader reader; - private IndexReader sortedReader; - - @Override - public void setUp() throws Exception { - super.setUp(); - final Boolean reverse = (random().nextBoolean() ? null : new Boolean(random().nextBoolean())); - final SortField sort_field = (reverse == null - ? 
new SortField("ndv", SortField.Type.LONG) - : new SortField("ndv", SortField.Type.LONG, reverse.booleanValue())); - sort = new Sort(sort_field); - reversedSort = (null != reverse && reverse.booleanValue()); - createRandomIndexes(); - } - - private Document randomDocument() { - final Document doc = new Document(); - doc.add(new NumericDocValuesField("ndv", random().nextLong())); - doc.add(new StringField("s", RandomPicks.randomFrom(random(), terms), Store.YES)); - return doc; - } - - public MergePolicy mergePolicy() { - return newSortingMergePolicy(sort); - } - - public static SortingMergePolicy newSortingMergePolicy(Sort sort) { - // usually create a MP with a low merge factor so that many merges happen - MergePolicy mp; - int thingToDo = random().nextInt(3); - if (thingToDo == 0) { - TieredMergePolicy tmp = newTieredMergePolicy(random()); - final int numSegs = TestUtil.nextInt(random(), 3, 5); - tmp.setSegmentsPerTier(numSegs); - tmp.setMaxMergeAtOnce(TestUtil.nextInt(random(), 2, numSegs)); - mp = tmp; - } else if (thingToDo == 1) { - LogMergePolicy lmp = newLogMergePolicy(random()); - lmp.setMergeFactor(TestUtil.nextInt(random(), 3, 5)); - mp = lmp; - } else { - // just a regular random one from LTC (could be alcoholic etc) - mp = newMergePolicy(); - } - // wrap it with a sorting mp - if (VERBOSE) { - System.out.println("TEST: return SortingMergePolicy(mp=" + mp + " sort=" + sort + ")"); - } - return new SortingMergePolicy(mp, sort); - } - - private void createRandomIndexes() throws IOException { - dir1 = newDirectory(); - dir2 = newDirectory(); - final int numDocs = atLeast(150); - final int numTerms = TestUtil.nextInt(random(), 1, numDocs / 5); - Set<String> randomTerms = new HashSet<>(); - while (randomTerms.size() < numTerms) { - randomTerms.add(TestUtil.randomSimpleString(random())); - } - terms = new ArrayList<>(randomTerms); - final long seed = random().nextLong(); - final IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(new Random(seed))); - final IndexWriterConfig iwc2 = newIndexWriterConfig(new MockAnalyzer(new Random(seed))); - iwc2.setMergePolicy(mergePolicy()); - final RandomIndexWriter iw1 = new RandomIndexWriter(new Random(seed), dir1, iwc1); - final RandomIndexWriter iw2 = new RandomIndexWriter(new Random(seed), dir2, iwc2); - for (int i = 0; i < numDocs; ++i) { - if (random().nextInt(5) == 0 && i != numDocs - 1) { - final String term = RandomPicks.randomFrom(random(), terms); - iw1.deleteDocuments(new Term("s", term)); - iw2.deleteDocuments(new Term("s", term)); - } - final Document doc = randomDocument(); - iw1.addDocument(doc); - iw2.addDocument(doc); - if (random().nextInt(8) == 0) { - iw1.commit(); - iw2.commit(); - } - } - // Make sure we have something to merge - iw1.commit(); - iw2.commit(); - final Document doc = randomDocument(); - // NOTE: don't use RIW.addDocument directly, since it sometimes commits - // which may trigger a merge, in which case forceMerge may not do anything. - // With field updates this is a problem, since the updates can go into the - // single segment in the index, and therefore the index won't be sorted. - // This hurts the assumption of the test later on, that the index is sorted - // by SortingMP.
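(An aside before the rest of this deleted helper: all of the SortingMergePolicy wrapping above is exactly what this patch makes unnecessary, since the sort becomes a declared property of the index itself. A minimal sketch of the replacement, assuming only the setIndexSort API this patch adds to IndexWriterConfig; the RAMDirectory, StandardAnalyzer, and the "ndv" field name are illustrative choices, not part of the patch:

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.NumericDocValuesField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.search.Sort;
    import org.apache.lucene.search.SortField;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;

    public class IndexSortSketch {
      public static void main(String[] args) throws Exception {
        try (Directory dir = new RAMDirectory()) {
          IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
          // Declare the sort once; every flushed and merged segment is then
          // written in "ndv" order, with no merge-policy wrapper involved.
          iwc.setIndexSort(new Sort(new SortField("ndv", SortField.Type.LONG)));
          try (IndexWriter w = new IndexWriter(dir, iwc)) {
            Document doc = new Document();
            doc.add(new NumericDocValuesField("ndv", 42L));
            w.addDocument(doc);
            w.forceMerge(1);
          }
        }
      }
    }

The sort field must be indexed with doc values of a matching type, which is why the sketch adds a NumericDocValuesField.)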
- iw1.w.addDocument(doc); - iw2.w.addDocument(doc); - - // update NDV of docs belonging to one term (covers many documents) - final long value = random().nextLong(); - final String term = RandomPicks.randomFrom(random(), terms); - iw1.w.updateNumericDocValue(new Term("s", term), "ndv", value); - iw2.w.updateNumericDocValue(new Term("s", term), "ndv", value); - - iw1.forceMerge(1); - iw2.forceMerge(1); - iw1.close(); - iw2.close(); - reader = DirectoryReader.open(dir1); - sortedReader = DirectoryReader.open(dir2); - } - - @Override - public void tearDown() throws Exception { - reader.close(); - sortedReader.close(); - dir1.close(); - dir2.close(); - super.tearDown(); - } - - private static void assertSorted(LeafReader reader, boolean reverse) throws IOException { - final NumericDocValues ndv = reader.getNumericDocValues("ndv"); - for (int i = 1; i < reader.maxDoc(); ++i) { - final int lhs = (!reverse ? i-1 : i); - final int rhs = (!reverse ? i : i-1); - assertTrue("ndv(" + (i-1) + ")=" + ndv.get(i-1) + ",ndv(" + i + ")=" + ndv.get(i)+",reverse="+reverse, ndv.get(lhs) <= ndv.get(rhs)); - } - } - - public void testSortingMP() throws IOException { - final LeafReader sortedReader1 = SortingLeafReader.wrap(SlowCompositeReaderWrapper.wrap(reader), sort); - final LeafReader sortedReader2 = SlowCompositeReaderWrapper.wrap(sortedReader); - - assertSorted(sortedReader1, reversedSort); - assertSorted(sortedReader2, reversedSort); - - assertReaderEquals("", sortedReader1, sortedReader2); - } - - public void testBadSort() throws Exception { - IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { - new SortingMergePolicy(newMergePolicy(), Sort.RELEVANCE); - }); - assertEquals("Cannot sort an index with a Sort that refers to the relevance score", expected.getMessage()); - } - -} diff --git a/lucene/misc/src/test/org/apache/lucene/search/TestDiversifiedTopDocsCollector.java b/lucene/misc/src/test/org/apache/lucene/search/TestDiversifiedTopDocsCollector.java index 3fbe14a4217..cf860a2ac20 100644 --- a/lucene/misc/src/test/org/apache/lucene/search/TestDiversifiedTopDocsCollector.java +++ b/lucene/misc/src/test/org/apache/lucene/search/TestDiversifiedTopDocsCollector.java @@ -32,6 +32,7 @@ import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.MultiDocValues; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.SlowCompositeReaderWrapper; @@ -367,8 +368,7 @@ public class TestDiversifiedTopDocsCollector extends LuceneTestCase { reader = writer.getReader(); writer.close(); searcher = newSearcher(reader); - LeafReader ar = SlowCompositeReaderWrapper.wrap(reader); - artistDocValues = ar.getSortedDocValues("artist"); + artistDocValues = MultiDocValues.getSortedValues(reader, "artist"); // All searches sort by song popularity final Similarity base = searcher.getSimilarity(true); diff --git a/lucene/misc/src/test/org/apache/lucene/search/TestEarlyTerminatingSortingCollector.java b/lucene/misc/src/test/org/apache/lucene/search/TestEarlyTerminatingSortingCollector.java index 14bd43714fb..3bcc4924327 100644 --- a/lucene/misc/src/test/org/apache/lucene/search/TestEarlyTerminatingSortingCollector.java +++ b/lucene/misc/src/test/org/apache/lucene/search/TestEarlyTerminatingSortingCollector.java @@ -25,23 +25,23 @@ import java.util.Random; import 
java.util.Set; import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.codecs.Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.ExitableDirectoryReader; -import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.MockRandomMergePolicy; import org.apache.lucene.index.QueryTimeout; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.SerialMergeScheduler; -import org.apache.lucene.index.SortingMergePolicy; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TestSortingMergePolicy; -import org.apache.lucene.search.LeafCollector; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.LeafCollector; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; @@ -50,8 +50,8 @@ import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopFieldCollector; import org.apache.lucene.store.Directory; -import org.apache.lucene.uninverting.UninvertingReader; import org.apache.lucene.uninverting.UninvertingReader.Type; +import org.apache.lucene.uninverting.UninvertingReader; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; @@ -62,18 +62,11 @@ public class TestEarlyTerminatingSortingCollector extends LuceneTestCase { private int numDocs; private List terms; private Directory dir; - private Sort sort; + private final Sort sort = new Sort(new SortField("ndv1", SortField.Type.LONG)); private RandomIndexWriter iw; private IndexReader reader; - private SortingMergePolicy mergePolicy; private final int forceMergeMaxSegmentCount = 5; - @Override - public void setUp() throws Exception { - super.setUp(); - sort = new Sort(new SortField("ndv1", SortField.Type.LONG)); - } - private Document randomDocument() { final Document doc = new Document(); doc.add(new NumericDocValuesField("ndv1", random().nextInt(10))); @@ -93,9 +86,14 @@ public class TestEarlyTerminatingSortingCollector extends LuceneTestCase { terms = new ArrayList<>(randomTerms); final long seed = random().nextLong(); final IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(new Random(seed))); + if (iwc.getMergePolicy() instanceof MockRandomMergePolicy) { + // MockRandomMP randomly wraps the leaf readers which makes merging angry + iwc.setMergePolicy(newTieredMergePolicy()); + } iwc.setMergeScheduler(new SerialMergeScheduler()); // for reproducible tests - mergePolicy = TestSortingMergePolicy.newSortingMergePolicy(sort); - iwc.setMergePolicy(mergePolicy); + iwc.setIndexSort(sort); + // nocommit: + iwc.setCodec(Codec.forName("SimpleText")); iw = new RandomIndexWriter(new Random(seed), dir, iwc); iw.setDoRandomForceMerge(false); // don't do this, it may happen anyway with MockRandomMP for (int i = 0; i < numDocs; ++i) { @@ -151,7 +149,7 @@ public class TestEarlyTerminatingSortingCollector extends LuceneTestCase { query = new MatchAllDocsQuery(); } searcher.search(query, collector1); - searcher.search(query, new EarlyTerminatingSortingCollector(collector2, sort, numHits, mergePolicy.getSort())); + 
searcher.search(query, new EarlyTerminatingSortingCollector(collector2, sort, numHits)); assertTrue(collector1.getTotalHits() >= collector2.getTotalHits()); assertTopDocsEquals(collector1.topDocs().scoreDocs, collector2.topDocs().scoreDocs); } @@ -190,40 +188,16 @@ public class TestEarlyTerminatingSortingCollector extends LuceneTestCase { } public void testEarlyTerminationDifferentSorter() throws IOException { - createRandomIndex(false); - final int iters = atLeast(3); - for (int i = 0; i < iters; ++i) { - final IndexSearcher searcher = newSearcher(reader); - // test that the collector works correctly when the index was sorted by a - // different sorter than the one specified in the ctor. - final int numHits = TestUtil.nextInt(random(), 1, numDocs); - final Sort sort = new Sort(new SortField("ndv2", SortField.Type.LONG, false)); - final boolean fillFields = random().nextBoolean(); - final boolean trackDocScores = random().nextBoolean(); - final boolean trackMaxScore = random().nextBoolean(); - final TopFieldCollector collector1 = TopFieldCollector.create(sort, numHits, fillFields, trackDocScores, trackMaxScore); - final TopFieldCollector collector2 = TopFieldCollector.create(sort, numHits, fillFields, trackDocScores, trackMaxScore); - - final Query query; - if (random().nextBoolean()) { - query = new TermQuery(new Term("s", RandomPicks.randomFrom(random(), terms))); - } else { - query = new MatchAllDocsQuery(); - } - searcher.search(query, collector1); - Sort different = new Sort(new SortField("ndv2", SortField.Type.LONG)); + createRandomIndex(true); - searcher.search(query, new EarlyTerminatingSortingCollector(collector2, different, numHits, different) { - @Override - public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException { - final LeafCollector ret = super.getLeafCollector(context); - assertTrue("segment should not be recognized as sorted as different sorter was used", ret.getClass() == in.getLeafCollector(context).getClass()); - return ret; - } - }); - assertTrue(collector1.getTotalHits() >= collector2.getTotalHits()); - assertTopDocsEquals(collector1.topDocs().scoreDocs, collector2.topDocs().scoreDocs); - } + Sort sort = new Sort(new SortField("ndv2", SortField.Type.LONG, false)); + Collector c = new EarlyTerminatingSortingCollector(TopFieldCollector.create(sort, 10, true, true, true), sort, 10); + IndexSearcher searcher = newSearcher(reader); + Exception e = expectThrows(IllegalStateException.class, + () -> { + searcher.search(new MatchAllDocsQuery(), c); + }); + assertEquals("Cannot early terminate with sort order if segments are sorted with ", e.getMessage()); closeIndex(); } @@ -289,7 +263,7 @@ public class TestEarlyTerminatingSortingCollector extends LuceneTestCase { searcher.search(query, collector1); final TestTerminatedEarlySimpleCollector collector2 = new TestTerminatedEarlySimpleCollector(); - final EarlyTerminatingSortingCollector etsCollector = new EarlyTerminatingSortingCollector(collector2, sort, 1, mergePolicy.getSort()); + final EarlyTerminatingSortingCollector etsCollector = new EarlyTerminatingSortingCollector(collector2, sort, 1); searcher.search(query, etsCollector); assertTrue("collector1="+collector1.collectedSomething()+" vs. 
collector2="+collector2.collectedSomething(), collector1.collectedSomething() == collector2.collectedSomething()); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseCompoundFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseCompoundFormatTestCase.java index f09be9d05a6..7c19596aa81 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseCompoundFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseCompoundFormatTestCase.java @@ -627,7 +627,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest /** Returns a new fake segment */ protected static SegmentInfo newSegmentInfo(Directory dir, String name) { - return new SegmentInfo(dir, Version.LATEST, name, 10000, false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), new HashMap<>()); + return new SegmentInfo(dir, Version.LATEST, name, 10000, false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null); } /** Creates a file of the specified size with random data. */ diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseFieldInfoFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseFieldInfoFormatTestCase.java index d8e2296e336..528e92afc17 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseFieldInfoFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseFieldInfoFormatTestCase.java @@ -347,7 +347,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes /** Returns a new fake segment */ protected static SegmentInfo newSegmentInfo(Directory dir, String name) { - return new SegmentInfo(dir, Version.LATEST, name, 10000, false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), new HashMap<>()); + return new SegmentInfo(dir, Version.LATEST, name, 10000, false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java index 2c6f379f4b6..d7dc44bbeed 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java @@ -303,7 +303,7 @@ abstract class BaseIndexFileFormatTestCase extends LuceneTestCase { Directory dir = newFSDirectory(createTempDir("justSoYouGetSomeChannelErrors")); Codec codec = getCodec(); - SegmentInfo segmentInfo = new SegmentInfo(dir, Version.LATEST, "_0", 1, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>()); + SegmentInfo segmentInfo = new SegmentInfo(dir, Version.LATEST, "_0", 1, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null); FieldInfo proto = oneDocReader.getFieldInfos().fieldInfo("field"); FieldInfo field = new FieldInfo(proto.name, proto.number, proto.hasVectors(), proto.omitsNorms(), proto.hasPayloads(), proto.getIndexOptions(), proto.getDocValuesType(), proto.getDocValuesGen(), new HashMap<>(), diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseSegmentInfoFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseSegmentInfoFormatTestCase.java index 1136afa64fb..27e01c1fbef 100644 --- 
a/lucene/test-framework/src/java/org/apache/lucene/index/BaseSegmentInfoFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseSegmentInfoFormatTestCase.java @@ -26,7 +26,8 @@ import java.util.Set; import org.apache.lucene.codecs.Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.StoredField; -import org.apache.lucene.document.TextField; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.MockDirectoryWrapper; @@ -52,7 +53,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT Codec codec = getCodec(); byte id[] = StringHelper.randomId(); SegmentInfo info = new SegmentInfo(dir, getVersions()[0], "_123", 1, false, codec, - Collections.emptyMap(), id, new HashMap<>()); + Collections.emptyMap(), id, new HashMap<>(), null); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT); @@ -66,7 +67,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT Codec codec = getCodec(); byte id[] = StringHelper.randomId(); SegmentInfo info = new SegmentInfo(dir, getVersions()[0], "_123", 1, false, codec, - Collections.emptyMap(), id, new HashMap<>()); + Collections.emptyMap(), id, new HashMap<>(), null); Set originalFiles = Collections.singleton("_123.a"); info.setFiles(originalFiles); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); @@ -95,7 +96,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT diagnostics.put("key1", "value1"); diagnostics.put("key2", "value2"); SegmentInfo info = new SegmentInfo(dir, getVersions()[0], "_123", 1, false, codec, - diagnostics, id, new HashMap<>()); + diagnostics, id, new HashMap<>(), null); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT); @@ -118,7 +119,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT attributes.put("key1", "value1"); attributes.put("key2", "value2"); SegmentInfo info = new SegmentInfo(dir, getVersions()[0], "_123", 1, false, codec, - Collections.emptyMap(), id, attributes); + Collections.emptyMap(), id, attributes, null); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT); @@ -138,7 +139,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT Directory dir = newDirectory(); byte id[] = StringHelper.randomId(); SegmentInfo info = new SegmentInfo(dir, getVersions()[0], "_123", 1, false, codec, - Collections.emptyMap(), id, new HashMap<>()); + Collections.emptyMap(), id, new HashMap<>(), null); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT); @@ -153,7 +154,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT Directory dir = newDirectory(); byte id[] = StringHelper.randomId(); SegmentInfo info = new SegmentInfo(dir, v, "_123", 1, false, codec, - Collections.emptyMap(), id, new HashMap<>()); + 
Collections.emptyMap(), id, new HashMap<>(), null); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT); @@ -161,7 +162,51 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT dir.close(); } } - + + /** Test sort */ + public void testSort() throws IOException { + final int iters = atLeast(5); + for (int i = 0; i < iters; ++i) { + Sort sort; + if (i == 0) { + sort = null; + } else { + final int numSortFields = TestUtil.nextInt(random(), 1, 3); + SortField[] sortFields = new SortField[numSortFields]; + for (int j = 0; j < numSortFields; ++j) { + sortFields[j] = new SortField( + TestUtil.randomSimpleString(random()), + random().nextBoolean() ? SortField.Type.LONG : SortField.Type.STRING, + random().nextBoolean()); + if (random().nextBoolean()) { + switch (sortFields[j].getType()) { + case LONG: + sortFields[j].setMissingValue(random().nextLong()); + break; + case STRING: + sortFields[j].setMissingValue(random().nextBoolean() ? SortField.STRING_FIRST : SortField.STRING_LAST); + break; + default: + fail(); + } + } + } + sort = new Sort(sortFields); + } + + Directory dir = newDirectory(); + Codec codec = getCodec(); + byte id[] = StringHelper.randomId(); + SegmentInfo info = new SegmentInfo(dir, getVersions()[0], "_123", 1, false, codec, + Collections.emptyMap(), id, new HashMap<>(), sort); + info.setFiles(Collections.emptySet()); + codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); + SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT); + assertEquals(sort, info2.getIndexSort()); + dir.close(); + } + } + /** * Test segment infos write that hits exception immediately on open. * make sure we get our exception back, no file handle leaks, etc. 
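An aside on the new testSort case above: those are exactly the two missing-value flavors the segment info format must round-trip. A hedged sketch of the caller-facing side, with illustrative field names ("count", "title"):

    import org.apache.lucene.search.Sort;
    import org.apache.lucene.search.SortField;

    public class IndexSortMissingValues {
      public static Sort newIndexSort() {
        // LONG sort fields accept any long as the missing value; MIN_VALUE
        // makes documents without the field sort first in ascending order.
        SortField byCount = new SortField("count", SortField.Type.LONG);
        byCount.setMissingValue(Long.MIN_VALUE);

        // STRING sort fields accept only the STRING_FIRST / STRING_LAST
        // sentinels, which is what the format encodes as "missing".
        SortField byTitle = new SortField("title", SortField.Type.STRING);
        byTitle.setMissingValue(SortField.STRING_LAST);

        return new Sort(byCount, byTitle);
      }
    }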
@@ -183,7 +228,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT Codec codec = getCodec(); byte id[] = StringHelper.randomId(); SegmentInfo info = new SegmentInfo(dir, getVersions()[0], "_123", 1, false, codec, - Collections.emptyMap(), id, new HashMap<>()); + Collections.emptyMap(), id, new HashMap<>(), null); info.setFiles(Collections.emptySet()); fail.setDoFail(); @@ -216,7 +261,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT Codec codec = getCodec(); byte id[] = StringHelper.randomId(); SegmentInfo info = new SegmentInfo(dir, getVersions()[0], "_123", 1, false, codec, - Collections.emptyMap(), id, new HashMap<>()); + Collections.emptyMap(), id, new HashMap<>(), null); info.setFiles(Collections.emptySet()); fail.setDoFail(); @@ -249,7 +294,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT Codec codec = getCodec(); byte id[] = StringHelper.randomId(); SegmentInfo info = new SegmentInfo(dir, getVersions()[0], "_123", 1, false, codec, - Collections.emptyMap(), id, new HashMap<>()); + Collections.emptyMap(), id, new HashMap<>(), null); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); @@ -283,7 +328,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT Codec codec = getCodec(); byte id[] = StringHelper.randomId(); SegmentInfo info = new SegmentInfo(dir, getVersions()[0], "_123", 1, false, codec, - Collections.emptyMap(), id, new HashMap<>()); + Collections.emptyMap(), id, new HashMap<>(), null); info.setFiles(Collections.emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); @@ -332,7 +377,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT TestUtil.randomUnicodeString(random())); } - SegmentInfo info = new SegmentInfo(dir, version, name, docCount, isCompoundFile, codec, diagnostics, id, attributes); + SegmentInfo info = new SegmentInfo(dir, version, name, docCount, isCompoundFile, codec, diagnostics, id, attributes, null); info.setFiles(files); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); SegmentInfo info2 = codec.segmentInfoFormat().read(dir, name, id, IOContext.DEFAULT); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java b/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java index b40ac2685d5..93898881416 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java @@ -138,7 +138,6 @@ public class MockRandomMergePolicy extends MergePolicy { static class MockRandomOneMerge extends OneMerge { final Random r; - ArrayList readers; MockRandomOneMerge(List segments, long seed) { super(segments); @@ -146,34 +145,30 @@ public class MockRandomMergePolicy extends MergePolicy { } @Override - public List getMergeReaders() throws IOException { - if (readers == null) { - readers = new ArrayList(super.getMergeReaders()); - for (int i = 0; i < readers.size(); i++) { - // wrap it (e.g. prevent bulk merge etc) - // TODO: cut this over to FilterCodecReader api, we can explicitly - // enable/disable bulk merge for portions of the index we want. 
- int thingToDo = r.nextInt(7); - if (thingToDo == 0) { - // simple no-op FilterReader - if (LuceneTestCase.VERBOSE) { - System.out.println("NOTE: MockRandomMergePolicy now swaps in a SlowCodecReaderWrapper for merging reader=" + readers.get(i)); - } - readers.set(i, SlowCodecReaderWrapper.wrap(new FilterLeafReader(readers.get(i)) {})); - } else if (thingToDo == 1) { - // renumber fields - // NOTE: currently this only "blocks" bulk merges just by - // being a FilterReader. But it might find bugs elsewhere, - // and maybe the situation can be improved in the future. - if (LuceneTestCase.VERBOSE) { - System.out.println("NOTE: MockRandomMergePolicy now swaps in a MismatchedLeafReader for merging reader=" + readers.get(i)); - } - readers.set(i, SlowCodecReaderWrapper.wrap(new MismatchedLeafReader(readers.get(i), r))); - } - // otherwise, reader is unchanged + public CodecReader wrapForMerge(CodecReader reader) throws IOException { + // wrap it (e.g. prevent bulk merge etc) + // TODO: cut this over to FilterCodecReader api, we can explicitly + // enable/disable bulk merge for portions of the index we want. + int thingToDo = r.nextInt(7); + if (thingToDo == 0) { + // simple no-op FilterReader + if (LuceneTestCase.VERBOSE) { + System.out.println("NOTE: MockRandomMergePolicy now swaps in a SlowCodecReaderWrapper for merging reader=" + reader); } + return SlowCodecReaderWrapper.wrap(new FilterLeafReader(reader) {}); + } else if (thingToDo == 1) { + // renumber fields + // NOTE: currently this only "blocks" bulk merges just by + // being a FilterReader. But it might find bugs elsewhere, + // and maybe the situation can be improved in the future. + if (LuceneTestCase.VERBOSE) { + System.out.println("NOTE: MockRandomMergePolicy now swaps in a MismatchedLeafReader for merging reader=" + reader); + } + return SlowCodecReaderWrapper.wrap(new MismatchedLeafReader(reader, r)); + } else { + // otherwise, reader is unchanged + return reader; } - return readers; } } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java index 90064c4d7ff..d4159279311 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java @@ -611,7 +611,7 @@ public class RandomPostingsTester { // maxAllowed = the "highest" we can index, but we will still // randomly index at lower IndexOption public FieldsProducer buildIndex(Codec codec, Directory dir, IndexOptions maxAllowed, boolean allowPayloads, boolean alwaysTestMax) throws IOException { - SegmentInfo segmentInfo = new SegmentInfo(dir, Version.LATEST, "_0", maxDoc, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>()); + SegmentInfo segmentInfo = new SegmentInfo(dir, Version.LATEST, "_0", maxDoc, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null); int maxIndexOption = Arrays.asList(IndexOptions.values()).indexOf(maxAllowed); if (LuceneTestCase.VERBOSE) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java b/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java index b517af00cf1..74a46d4f5e7 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java +++ b/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java @@ -283,6 +283,11 @@ public class QueryUtils { @Override protected void doClose() 
throws IOException {} + + @Override + public Sort getIndexSort() { + return null; + } }; } From 8fe78da23c05af70c2ff6047a58a4a34708f96a5 Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Fri, 6 May 2016 16:41:26 -0400 Subject: [PATCH 02/16] LUCENE-6766: make new Lucene62Codec, with new segment info format that reads/writes the index sort, to support index sorting; remove all usage of SlowCompositeReaderWrapper; also sort ids when merging norms; CheckIndex verifies sort --- .../lucene/codecs/lucene50/Lucene50Codec.java | 2 +- .../lucene50/Lucene50SegmentInfoFormat.java | 105 +++++ .../lucene/codecs/lucene60/Lucene60Codec.java | 2 +- .../services/org.apache.lucene.codecs.Codec | 1 + .../codecs/lucene50/Lucene50RWCodec.java | 7 + .../Lucene50RWSegmentInfoFormat.java} | 39 +- .../byTask/tasks/CreateIndexTask.java | 4 +- lucene/classification/build.xml | 6 +- .../classification/utils/DatasetSplitter.java | 27 +- .../utils/DataSplitterTest.java | 6 +- .../SimpleTextSegmentInfoFormat.java | 2 + .../java/org/apache/lucene/codecs/Codec.java | 2 +- .../lucene/codecs/DocValuesConsumer.java | 16 +- .../apache/lucene/codecs/NormsConsumer.java | 73 ++-- .../CompressingStoredFieldsWriter.java | 6 + .../CompressingTermVectorsWriter.java | 5 + .../lucene/codecs/lucene54/package-info.java | 4 +- .../codecs/lucene60/Lucene60PointsWriter.java | 7 + .../lucene/codecs/lucene60/package-info.java | 397 +----------------- .../lucene/codecs/lucene62/Lucene62Codec.java | 178 ++++++++ .../lucene62/Lucene62SegmentInfoFormat.java | 289 +++++++++++++ .../lucene/codecs/lucene62/package-info.java | 24 ++ .../org/apache/lucene/index/CheckIndex.java | 87 ++++ .../index/MappingMultiPostingsEnum.java | 7 +- .../java/org/apache/lucene/index/Sorter.java | 1 - .../lucene/index/SortingLeafReader.java | 5 +- .../services/org.apache.lucene.codecs.Codec | 2 +- ...ne50StoredFieldsFormatHighCompression.java | 8 +- .../lucene53/TestLucene53NormsFormat.java | 4 +- .../TestLucene62SegmentInfoFormat.java} | 8 +- .../org/apache/lucene/index/Test2BPoints.java | 4 +- .../org/apache/lucene/index/Test2BTerms.java | 2 +- .../apache/lucene/index/TestIndexSorting.java | 24 +- .../apache/lucene/index/TestPointValues.java | 8 +- .../lucene/search/TestPointQueries.java | 4 +- .../highlight/TermVectorLeafReader.java | 7 +- .../lucene/index/memory/MemoryIndex.java | 5 + .../TestDiversifiedTopDocsCollector.java | 1 - .../TestEarlyTerminatingSortingCollector.java | 19 +- .../apache/lucene/document/TestNearest.java | 2 +- .../lucene/spatial3d/TestGeo3DPoint.java | 4 +- .../analyzing/AnalyzingInfixSuggester.java | 6 +- .../suggest/document/TestSuggestField.java | 4 +- .../lucene/geo/BaseGeoPointTestCase.java | 2 +- .../index/BaseSegmentInfoFormatTestCase.java | 6 + .../util/TestRuleSetupAndRestoreClassEnv.java | 6 +- .../java/org/apache/lucene/util/TestUtil.java | 4 +- 47 files changed, 855 insertions(+), 577 deletions(-) create mode 100644 lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java rename lucene/{core => backward-codecs}/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java (99%) rename lucene/{core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java => backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50RWSegmentInfoFormat.java} (77%) create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java create mode 100644 
lucene/core/src/java/org/apache/lucene/codecs/lucene62/package-info.java rename lucene/core/src/test/org/apache/lucene/codecs/{lucene50/TestLucene50SegmentInfoFormat.java => lucene62/TestLucene62SegmentInfoFormat.java} (89%) diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java index 001439ce14d..19d6e3bbe90 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java @@ -108,7 +108,7 @@ public class Lucene50Codec extends Codec { } @Override - public final SegmentInfoFormat segmentInfoFormat() { + public SegmentInfoFormat segmentInfoFormat() { return segmentInfosFormat; } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java new file mode 100644 index 00000000000..9c5453f65b5 --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene50; + + +import java.io.IOException; +import java.util.Collections; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexWriter; // javadocs +import org.apache.lucene.index.SegmentInfo; // javadocs +import org.apache.lucene.index.SegmentInfos; // javadocs +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataOutput; // javadocs +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.Version; + +/** + * Lucene 5.0 Segment info format. + * @deprecated Only for reading old 5.0-6.0 segments + */ +@Deprecated +public class Lucene50SegmentInfoFormat extends SegmentInfoFormat { + + /** Sole constructor. 
*/ + public Lucene50SegmentInfoFormat() { + } + + @Override + public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context) throws IOException { + final String fileName = IndexFileNames.segmentFileName(segment, "", Lucene50SegmentInfoFormat.SI_EXTENSION); + try (ChecksumIndexInput input = dir.openChecksumInput(fileName, context)) { + Throwable priorE = null; + SegmentInfo si = null; + try { + int format = CodecUtil.checkIndexHeader(input, Lucene50SegmentInfoFormat.CODEC_NAME, + Lucene50SegmentInfoFormat.VERSION_START, + Lucene50SegmentInfoFormat.VERSION_CURRENT, + segmentID, ""); + final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt()); + + final int docCount = input.readInt(); + if (docCount < 0) { + throw new CorruptIndexException("invalid docCount: " + docCount, input); + } + final boolean isCompoundFile = input.readByte() == SegmentInfo.YES; + + final Map diagnostics; + final Set files; + final Map attributes; + + if (format >= VERSION_SAFE_MAPS) { + diagnostics = input.readMapOfStrings(); + files = input.readSetOfStrings(); + attributes = input.readMapOfStrings(); + } else { + diagnostics = Collections.unmodifiableMap(input.readStringStringMap()); + files = Collections.unmodifiableSet(input.readStringSet()); + attributes = Collections.unmodifiableMap(input.readStringStringMap()); + } + + si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, segmentID, attributes, null); + si.setFiles(files); + } catch (Throwable exception) { + priorE = exception; + } finally { + CodecUtil.checkFooter(input, priorE); + } + return si; + } + } + + @Override + public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException { + throw new UnsupportedOperationException("this codec can only be used for reading"); + } + + /** File extension used to store {@link SegmentInfo}. */ + public final static String SI_EXTENSION = "si"; + static final String CODEC_NAME = "Lucene50SegmentInfo"; + static final int VERSION_START = 0; + static final int VERSION_SAFE_MAPS = 1; + static final int VERSION_CURRENT = VERSION_SAFE_MAPS; +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java index c696fbe5f31..7210b3f0dcf 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java @@ -151,7 +151,7 @@ public class Lucene60Codec extends Codec { /** Returns the docvalues format that should be used for writing * new segments of field. * - * The default implementation always returns "Lucene50". + * The default implementation always returns "Lucene54". *

        * WARNING: if you subclass, you are responsible for index * backwards compatibility: future version of Lucene are only diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 7f66de899e7..71aa938e21e 100644 --- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -16,3 +16,4 @@ org.apache.lucene.codecs.lucene50.Lucene50Codec org.apache.lucene.codecs.lucene53.Lucene53Codec org.apache.lucene.codecs.lucene54.Lucene54Codec +org.apache.lucene.codecs.lucene60.Lucene60Codec diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50RWCodec.java index 359e2ec3d22..8fdeb2041d2 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50RWCodec.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50RWCodec.java @@ -18,6 +18,7 @@ package org.apache.lucene.codecs.lucene50; import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.SegmentInfoFormat; /** * Codec for testing 5.0 index format @@ -26,9 +27,15 @@ import org.apache.lucene.codecs.NormsFormat; @Deprecated final class Lucene50RWCodec extends Lucene50Codec { private final NormsFormat normsFormat = new Lucene50RWNormsFormat(); + private final SegmentInfoFormat segmentInfoFormat = new Lucene50RWSegmentInfoFormat(); @Override public NormsFormat normsFormat() { return normsFormat; } + + @Override + public SegmentInfoFormat segmentInfoFormat() { + return segmentInfoFormat; + } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50RWSegmentInfoFormat.java similarity index 77% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java rename to lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50RWSegmentInfoFormat.java index 9786ec11aa3..d457243f67b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50SegmentInfoFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50RWSegmentInfoFormat.java @@ -37,43 +37,14 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.Version; /** - * Lucene 5.0 Segment info format. - *

- * Files:
- *   • .si: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, Attributes, Footer
- * Data types:
- *   • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
- *   • SegSize --> {@link DataOutput#writeInt Int32}
- *   • SegVersion --> {@link DataOutput#writeString String}
- *   • Files --> {@link DataOutput#writeSetOfStrings Set<String>}
- *   • Diagnostics,Attributes --> {@link DataOutput#writeMapOfStrings Map<String,String>}
- *   • IsCompoundFile --> {@link DataOutput#writeByte Int8}
- *   • Footer --> {@link CodecUtil#writeFooter CodecFooter}
- * Field Descriptions:
- *   • SegVersion is the code version that created the segment.
- *   • SegSize is the number of documents contained in the segment index.
- *   • IsCompoundFile records whether the segment is written as a compound file or
- *     not. If this is -1, the segment is not a compound file. If it is 1, the segment
- *     is a compound file.
- *   • The Diagnostics Map is privately written by {@link IndexWriter}, as a debugging aid,
- *     for each segment it creates. It includes metadata like the current Lucene
- *     version, OS, Java version, why the segment was created (merge, flush,
- *     addIndexes), etc.
- *   • Files is a list of files referred to by this segment.
        - * - * @see SegmentInfos - * @lucene.experimental + * Read-write version of 5.0 SegmentInfoFormat for testing + * @deprecated for test purposes only */ -public class Lucene50SegmentInfoFormat extends SegmentInfoFormat { +@Deprecated +public class Lucene50RWSegmentInfoFormat extends Lucene50SegmentInfoFormat { /** Sole constructor. */ - public Lucene50SegmentInfoFormat() { + public Lucene50RWSegmentInfoFormat() { } @Override diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java index 74486d6c38c..df8a1b49404 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java @@ -29,7 +29,7 @@ import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene60.Lucene60Codec; +import org.apache.lucene.codecs.lucene62.Lucene62Codec; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexCommit; import org.apache.lucene.index.IndexDeletionPolicy; @@ -139,7 +139,7 @@ public class CreateIndexTask extends PerfTask { if (defaultCodec == null && postingsFormat != null) { try { final PostingsFormat postingsFormatChosen = PostingsFormat.forName(postingsFormat); - iwConf.setCodec(new Lucene60Codec() { + iwConf.setCodec(new Lucene62Codec() { @Override public PostingsFormat getPostingsFormatForField(String field) { return postingsFormatChosen; diff --git a/lucene/classification/build.xml b/lucene/classification/build.xml index 3ddb9bd26e6..704cae8973d 100644 --- a/lucene/classification/build.xml +++ b/lucene/classification/build.xml @@ -28,7 +28,6 @@ - @@ -37,17 +36,16 @@ - + - - diff --git a/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java b/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java index fce786bf1e9..c1c8ad19ee6 100644 --- a/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java @@ -29,6 +29,7 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.Terms; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; @@ -38,7 +39,6 @@ import org.apache.lucene.search.grouping.GroupDocs; import org.apache.lucene.search.grouping.GroupingSearch; import org.apache.lucene.search.grouping.TopGroups; import org.apache.lucene.store.Directory; -import org.apache.lucene.uninverting.UninvertingReader; /** * Utility class for creating training / test / cross validation indexes from the original index. 
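One behavioral note before the DatasetSplitter hunks below: the splitter no longer uninverts the class field, so callers must index the label with sorted doc values up front, or split() fails with the IllegalStateException shown in the hunk. A sketch of the indexing side, mirroring the DataSplitterTest change; the "class" field name and helper are illustrative only:

    import java.io.IOException;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.SortedDocValuesField;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.util.BytesRef;

    public class LabeledDocHelper {
      static void addLabeledDoc(IndexWriter writer, String label) throws IOException {
        Document doc = new Document();
        doc.add(new StringField("class", label, Field.Store.YES));
        // The sorted-doc-values twin of the label field is now mandatory:
        // DatasetSplitter reads getSortedDocValues("class") directly.
        doc.add(new SortedDocValuesField("class", new BytesRef(label)));
        writer.addDocument(doc);
      }
    }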
@@ -68,7 +68,7 @@ public class DatasetSplitter { * @param crossValidationIndex a {@link Directory} used to write the cross validation index * @param analyzer {@link Analyzer} used to create the new docs * @param termVectors {@code true} if term vectors should be kept - * @param classFieldName names of the field used as the label for classification + * @param classFieldName name of the field used as the label for classification; this must be indexed with sorted doc values * @param fieldNames names of fields that need to be put in the new indexes or null if all should be used * @throws IOException if any writing operation fails on any of the indexes */ @@ -80,30 +80,23 @@ public class DatasetSplitter { IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(analyzer)); IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(analyzer)); - // try to get the exact no. of existing classes - Terms terms = originalIndex.terms(classFieldName); - long noOfClasses = -1; - if (terms != null) { - noOfClasses = terms.size(); - - } - if (noOfClasses == -1) { - noOfClasses = 10000; // fallback + // get the exact no. of existing classes + SortedDocValues classValues = originalIndex.getSortedDocValues(classFieldName); + if (classValues == null) { + throw new IllegalStateException("the classFieldName \"" + classFieldName + "\" must index sorted doc values"); } - HashMap mapping = new HashMap<>(); - mapping.put(classFieldName, UninvertingReader.Type.SORTED); - UninvertingReader uninvertingReader = new UninvertingReader(originalIndex, mapping); + int noOfClasses = classValues.getValueCount(); try { - IndexSearcher indexSearcher = new IndexSearcher(uninvertingReader); + IndexSearcher indexSearcher = new IndexSearcher(originalIndex); GroupingSearch gs = new GroupingSearch(classFieldName); gs.setGroupSort(Sort.INDEXORDER); gs.setSortWithinGroup(Sort.INDEXORDER); gs.setAllGroups(true); gs.setGroupDocsLimit(originalIndex.maxDoc()); - TopGroups topGroups = gs.search(indexSearcher, new MatchAllDocsQuery(), 0, (int) noOfClasses); + TopGroups topGroups = gs.search(indexSearcher, new MatchAllDocsQuery(), 0, noOfClasses); // set the type to be indexed, stored, with term vectors FieldType ft = new FieldType(TextField.TYPE_STORED); @@ -156,7 +149,7 @@ public class DatasetSplitter { testWriter.close(); cvWriter.close(); trainingWriter.close(); - uninvertingReader.close(); + originalIndex.close(); } } diff --git a/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java b/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java index 0b6f077cdfd..fdd4b0bb4a8 100644 --- a/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java +++ b/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java @@ -70,7 +70,9 @@ public class DataSplitterTest extends LuceneTestCase { doc = new Document(); doc.add(new Field(idFieldName, "id" + Integer.toString(i), ft)); doc.add(new Field(textFieldName, TestUtil.randomUnicodeString(rnd, 1024), ft)); - doc.add(new Field(classFieldName, Integer.toString(rnd.nextInt(10)), ft)); + String className = Integer.toString(rnd.nextInt(10)); + doc.add(new Field(classFieldName, className, ft)); + doc.add(new SortedDocValuesField(classFieldName, new BytesRef(className))); indexWriter.addDocument(doc); } @@ -89,13 +91,11 @@ public class DataSplitterTest extends LuceneTestCase { super.tearDown(); } - @Test public void 
testSplitOnAllFields() throws Exception { assertSplit(originalIndex, 0.1, 0.1); } - @Test public void testSplitOnSomeFields() throws Exception { assertSplit(originalIndex, 0.2, 0.35, idFieldName, textFieldName); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java index 594fccf2472..bf9d3ded573 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java @@ -216,6 +216,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { break; } break; + // nocommit need the rest default: throw new AssertionError(); } @@ -337,6 +338,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { case INT: sortType = "int"; break; + // nocommit the rest: default: throw new IllegalStateException("Unexpected sort type: " + sortField.getType()); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java index 5d704ca017d..442445c2237 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java @@ -57,7 +57,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI { } // TODO: should we use this, or maybe a system property is better? - static Codec defaultCodec = LOADER.lookup("Lucene60"); + static Codec defaultCodec = LOADER.lookup("Lucene62"); } private final String name; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java index 52bf9b2f82d..79cc42227d4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java @@ -316,16 +316,14 @@ public abstract class DocValuesConsumer implements Closeable { } private boolean setNext() { - while (true) { - NumericDocValuesSub sub = docIDMerger.next(); - if (sub == null) { - return false; - } - nextIsSet = true; - nextValue = sub.values.get(sub.docID); - nextHasValue = nextValue != 0 || sub.docsWithField.get(sub.docID); - return true; + NumericDocValuesSub sub = docIDMerger.next(); + if (sub == null) { + return false; } + nextIsSet = true; + nextValue = sub.values.get(sub.docID); + nextHasValue = nextValue != 0 || sub.docsWithField.get(sub.docID); + return true; } }; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/NormsConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/NormsConsumer.java index b771aabf43a..76f8be727f5 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/NormsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/NormsConsumer.java @@ -16,7 +16,6 @@ */ package org.apache.lucene.codecs; - import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; @@ -24,6 +23,7 @@ import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; +import org.apache.lucene.index.DocIDMerger; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.MergeState; @@ -31,6 +31,8 @@ import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.Bits; +import static 
org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + /** * Abstract API that consumes normalization values. * Concrete implementations of this @@ -98,6 +100,30 @@ public abstract class NormsConsumer implements Closeable { } } + /** Tracks state of one numeric sub-reader that we are merging */ + private static class NumericDocValuesSub extends DocIDMerger.Sub { + + private final NumericDocValues values; + private int docID = -1; + private final int maxDoc; + + public NumericDocValuesSub(MergeState.DocMap docMap, Bits liveDocs, NumericDocValues values, int maxDoc) { + super(docMap, liveDocs); + this.values = values; + this.maxDoc = maxDoc; + } + + @Override + public int nextDoc() { + docID++; + if (docID == maxDoc) { + return NO_MORE_DOCS; + } else { + return docID; + } + } + } + /** * Merges the norms from toMerge. *

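The hunk below replaces NormsConsumer's manual readerUpto/docIDUpto bookkeeping with the new DocIDMerger: each segment contributes a cursor (the NumericDocValuesSub defined above), and DocIDMerger.next() returns whichever sub owns the next document in merged order, honoring the index sort when one is declared and simply concatenating segments otherwise. A hedged sketch of the consumption pattern; it reuses NumericDocValuesSub from the hunk above and, as in the patch, assumes it runs where that class and its fields are visible:

import java.util.List;

import org.apache.lucene.index.DocIDMerger;

class DocIDMergerSketch {
  // subs holds one cursor per segment being merged; indexIsSorted mirrors
  // mergeState.segmentInfo.getIndexSort() != null in the patch.
  static void consumeNorms(List<NumericDocValuesSub> subs, boolean indexIsSorted) {
    DocIDMerger<NumericDocValuesSub> merger = new DocIDMerger<>(subs, indexIsSorted);
    NumericDocValuesSub sub;
    while ((sub = merger.next()) != null) {
      // sub.docID is the doc within that sub-reader; its value becomes the
      // norm for the next document of the merged segment
      long value = sub.values.get(sub.docID);
    }
  }
}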
@@ -111,13 +137,18 @@ public abstract class NormsConsumer implements Closeable { new Iterable<Number>() { @Override public Iterator<Number> iterator() { + + // We must make a new DocIDMerger for each iterator: + List<NumericDocValuesSub> subs = new ArrayList<>(); + assert mergeState.docMaps.length == toMerge.size(); + for(int i=0;i<toMerge.size();i++) { + subs.add(new NumericDocValuesSub(mergeState.docMaps[i], mergeState.liveDocs[i], toMerge.get(i), mergeState.maxDocs[i])); + } + + final DocIDMerger<NumericDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); + return new Iterator<Number>() { - int readerUpto = -1; - int docIDUpto; long nextValue; - int maxDoc; - NumericDocValues currentValues; - Bits currentLiveDocs; boolean nextIsSet; @Override @@ -141,31 +172,13 @@ public abstract class NormsConsumer implements Closeable { } private boolean setNext() { - while (true) { - if (readerUpto == toMerge.size()) { - return false; - } - - if (currentValues == null || docIDUpto == maxDoc) { - readerUpto++; - if (readerUpto < toMerge.size()) { - currentValues = toMerge.get(readerUpto); - currentLiveDocs = mergeState.liveDocs[readerUpto]; - maxDoc = mergeState.maxDocs[readerUpto]; - } - docIDUpto = 0; - continue; - } - - if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) { - nextIsSet = true; - nextValue = currentValues.get(docIDUpto); - docIDUpto++; - return true; - } - - docIDUpto++; + NumericDocValuesSub sub = docIDMerger.next(); + if (sub == null) { + return false; } + nextIsSet = true; + nextValue = sub.values.get(sub.docID); + return true; } }; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java index 79dfb27ad53..d5bf4ad8e88 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java @@ -486,6 +486,12 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { @Override public int merge(MergeState mergeState) throws IOException { + if (mergeState.segmentInfo.getIndexSort() != null) { + // TODO: can we gain back some optos even if index is sorted? E.g. if sort results in large chunks of contiguous docs from one sub + // being copied over...? + return super.merge(mergeState); + } + int docCount = 0; int numReaders = mergeState.maxDocs.length; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java index 07f27117f0d..9f8f44ef36d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java @@ -730,6 +730,11 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter { @Override public int merge(MergeState mergeState) throws IOException { + if (mergeState.segmentInfo.getIndexSort() != null) { + // TODO: can we gain back some optos even if index is sorted? E.g. if sort results in large chunks of contiguous docs from one sub + // being copied over...?
+ return super.merge(mergeState); + } int docCount = 0; int numReaders = mergeState.maxDocs.length; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene54/package-info.java index ebae8491bc2..5dec06bf16a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene54/package-info.java @@ -135,7 +135,7 @@ *

        Each segment index maintains the following:

        *
          *
        • - * {@link org.apache.lucene.codecs.lucene50.Lucene50SegmentInfoFormat Segment info}. + * {@link org.apache.lucene.codecs.lucene62.Lucene62SegmentInfoFormat Segment info}. * This contains metadata about a segment, such as the number of documents, * what files it uses, *
        • @@ -235,7 +235,7 @@ * file. * * - * {@link org.apache.lucene.codecs.lucene50.Lucene50SegmentInfoFormat Segment Info} + * {@link org.apache.lucene.codecs.lucene62.Lucene62SegmentInfoFormat Segment Info} * .si * Stores metadata about a segment * diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java index f6c73bdcc02..63308c422b3 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java @@ -123,6 +123,13 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable { @Override public void merge(MergeState mergeState) throws IOException { + if (mergeState.segmentInfo.getIndexSort() != null) { + // TODO: can we gain back some optos even if index is sorted? E.g. if sort results in large chunks of contiguous docs from one sub + // being copied over...? + super.merge(mergeState); + return; + } + for(PointsReader reader : mergeState.pointsReaders) { if (reader instanceof Lucene60PointsReader == false) { // We can only bulk merge when all to-be-merged segments use our format: diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java index 03a17ba2e38..8968a6d624c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java @@ -16,400 +16,7 @@ */ /** - * Lucene 6.0 file format. - * - *

          Apache Lucene - Index File Formats

          - * - * - *

          Introduction

          - *
          - *

          This document defines the index file formats used in this version of Lucene. - * If you are using a different version of Lucene, please consult the copy of - * docs/ that was distributed with - * the version you are using.

          - *

          Apache Lucene is written in Java, but several efforts are underway to write - * versions of - * Lucene in other programming languages. If these versions are to remain - * compatible with Apache Lucene, then a language-independent definition of the - * Lucene index format is required. This document thus attempts to provide a - * complete and independent definition of the Apache Lucene file formats.

          - *

          As Lucene evolves, this document should evolve. Versions of Lucene in - * different programming languages should endeavor to agree on file formats, and - * generate new versions of this document.

          - *
          - * - *

          Definitions

          - *
          - *

          The fundamental concepts in Lucene are index, document, field and term.

          - *

          An index contains a sequence of documents.

          - *
            - *
          • A document is a sequence of fields.
          • - *
          • A field is a named sequence of terms.
          • - *
          • A term is a sequence of bytes.
          • - *
          - *

          The same sequence of bytes in two different fields is considered a different - * term. Thus terms are represented as a pair: the string naming the field, and the - * bytes within the field.

          - * - *

          Inverted Indexing

          - *

          The index stores statistics about terms in order to make term-based search - * more efficient. Lucene's index falls into the family of indexes known as an - * inverted index. This is because it can list, for a term, the documents - * that contain it. This is the inverse of the natural relationship, in which - * documents list terms.

          - * - *

          Types of Fields

          - *

          In Lucene, fields may be stored, in which case their text is stored - * in the index literally, in a non-inverted manner. Fields that are inverted are - * called indexed. A field may be both stored and indexed.

          - *

          The text of a field may be tokenized into terms to be indexed, or the - * text of a field may be used literally as a term to be indexed. Most fields are - * tokenized, but sometimes it is useful for certain identifier fields to be - * indexed literally.

          - *

          See the {@link org.apache.lucene.document.Field Field} - * java docs for more information on Fields.

          - * - *

          Segments

          - *

          Lucene indexes may be composed of multiple sub-indexes, or segments. - * Each segment is a fully independent index, which could be searched separately. - * Indexes evolve by:

          - *
            - *
          1. Creating new segments for newly added documents.
          2. - *
          3. Merging existing segments.
          4. - *
          - *

          Searches may involve multiple segments and/or multiple indexes, each index - * potentially composed of a set of segments.

          - * - *

          Document Numbers

          - *

          Internally, Lucene refers to documents by an integer document number. - * The first document added to an index is numbered zero, and each subsequent - * document added gets a number one greater than the previous.

          - *

          Note that a document's number may change, so caution should be taken when - * storing these numbers outside of Lucene. In particular, numbers may change in - * the following situations:

          - *
            - *
          • - *

- *   The numbers stored in each segment are unique only within the segment, and
- *   must be converted before they can be used in a larger context. The standard
- *   technique is to allocate each segment a range of values, based on the range of
- *   numbers used in that segment. To convert a document number from a segment to an
- *   external value, the segment's base document number is added. To convert an
- *   external value back to a segment-specific value, the segment is identified by
- *   the range that the external value is in, and the segment's base value is
- *   subtracted. For example, two five-document segments might be combined, so that
- *   the first segment has a base value of zero, and the second of five. Document
- *   three from the second segment would have an external value of eight.
- *
- *   When documents are deleted, gaps are created in the numbering. These are
- *   eventually removed as the index evolves through merging. Deleted documents are
- *   dropped when segments are merged. A freshly-merged segment thus has no gaps in
- *   its numbering.

            - *
          • - *
          - *
          - * - *
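The base-offset arithmetic described above is easy to sanity-check; a tiny self-contained example (illustrative code, not a Lucene API):

class DocBaseExample {
  public static void main(String[] args) {
    int[] segmentMaxDocs = {5, 5}; // two five-document segments
    int[] docBases = new int[segmentMaxDocs.length];
    int base = 0;
    for (int i = 0; i < segmentMaxDocs.length; i++) {
      docBases[i] = base; // segment i's base document number
      base += segmentMaxDocs[i];
    }
    // document three of the second segment: external value = 5 + 3 = 8
    System.out.println(docBases[1] + 3);
  }
}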

          Index Structure Overview

          - *
          - *

          Each segment index maintains the following:

          - *
            - *
          • - * {@link org.apache.lucene.codecs.lucene50.Lucene50SegmentInfoFormat Segment info}. - * This contains metadata about a segment, such as the number of documents, - * what files it uses, - *
          • - *
          • - * {@link org.apache.lucene.codecs.lucene50.Lucene50FieldInfosFormat Field names}. - * This contains the set of field names used in the index. - *
          • - *
          • - * {@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Stored Field values}. - * This contains, for each document, a list of attribute-value pairs, where the attributes - * are field names. These are used to store auxiliary information about the document, such as - * its title, url, or an identifier to access a database. The set of stored fields are what is - * returned for each hit when searching. This is keyed by document number. - *
          • - *
          • - * {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term dictionary}. - * A dictionary containing all of the terms used in all of the - * indexed fields of all of the documents. The dictionary also contains the number - * of documents which contain the term, and pointers to the term's frequency and - * proximity data. - *
          • - *
          • - * {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Frequency data}. - * For each term in the dictionary, the numbers of all the - * documents that contain that term, and the frequency of the term in that - * document, unless frequencies are omitted (IndexOptions.DOCS_ONLY) - *
          • - *
          • - * {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Proximity data}. - * For each term in the dictionary, the positions that the - * term occurs in each document. Note that this will not exist if all fields in - * all documents omit position data. - *
          • - *
          • - * {@link org.apache.lucene.codecs.lucene53.Lucene53NormsFormat Normalization factors}. - * For each field in each document, a value is stored - * that is multiplied into the score for hits on that field. - *
          • - *
          • - * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vectors}. - * For each field in each document, the term vector (sometimes - * called document vector) may be stored. A term vector consists of term text and - * term frequency. To add Term Vectors to your index see the - * {@link org.apache.lucene.document.Field Field} constructors - *
          • - *
          • - * {@link org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat Per-document values}. - * Like stored values, these are also keyed by document - * number, but are generally intended to be loaded into main memory for fast - * access. Whereas stored values are generally intended for summary results from - * searches, per-document values are useful for things like scoring factors. - *
          • - *
          • - * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}. - * An optional file indicating which documents are live. - *
          • - *
          • - * {@link org.apache.lucene.codecs.lucene60.Lucene60PointsFormat Point values}. - * Optional pair of files, recording dimensionally indexed fields, to enable fast - * numeric range filtering and large numeric values like BigInteger and BigDecimal (1D) - * and geographic shape intersection (2D, 3D). - *
          • - *
          - *

          Details on each of these are provided in their linked pages.

          - *
          - * - *

          File Naming

          - *
          - *

          All files belonging to a segment have the same name with varying extensions. - * The extensions correspond to the different file formats described below. When - * using the Compound File format (default in 1.4 and greater) these files (except - * for the Segment info file, the Lock file, and Deleted documents file) are collapsed - * into a single .cfs file (see below for details)

          - *

          Typically, all segments in an index are stored in a single directory, - * although this is not required.

          - *

          As of version 2.1 (lock-less commits), file names are never re-used. - * That is, when any file is saved - * to the Directory it is given a never before used filename. This is achieved - * using a simple generations approach. For example, the first segments file is - * segments_1, then segments_2, etc. The generation is a sequential long integer - * represented in alpha-numeric (base 36) form.

          - *
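The generation encoding described above fits in a couple of lines; a hedged sketch (the helper name is illustrative; radix 36 gives the alpha-numeric form):

class SegmentsFileNameExample {
  static String segmentsFileName(long generation) {
    // Character.MAX_RADIX is 36: digits 0-9, then a-z
    return "segments_" + Long.toString(generation, Character.MAX_RADIX);
  }
  public static void main(String[] args) {
    System.out.println(segmentsFileName(1));   // segments_1
    System.out.println(segmentsFileName(100)); // segments_2s
  }
}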
          - * - *

          Summary of File Extensions

          - *
          - *

          The following table summarizes the names and extensions of the files in - * Lucene:

- * Name | Extension | Brief Description
- * {@link org.apache.lucene.index.SegmentInfos Segments File} | segments_N | Stores information about a commit point
- * Lock File | write.lock | The Write lock prevents multiple IndexWriters from writing to the same file.
- * {@link org.apache.lucene.codecs.lucene50.Lucene50SegmentInfoFormat Segment Info} | .si | Stores metadata about a segment
- * {@link org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat Compound File} | .cfs, .cfe | An optional "virtual" file consisting of all the other index files for systems that frequently run out of file handles.
- * {@link org.apache.lucene.codecs.lucene50.Lucene50FieldInfosFormat Fields} | .fnm | Stores information about the fields
- * {@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Field Index} | .fdx | Contains pointers to field data
- * {@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Field Data} | .fdt | The stored fields for documents
- * {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Dictionary} | .tim | The term dictionary, stores term info
- * {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Index} | .tip | The index into the Term Dictionary
- * {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Frequencies} | .doc | Contains the list of docs which contain each term along with frequency
- * {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Positions} | .pos | Stores position information about where a term occurs in the index
- * {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Payloads} | .pay | Stores additional per-position metadata information such as character offsets and user payloads
- * {@link org.apache.lucene.codecs.lucene53.Lucene53NormsFormat Norms} | .nvd, .nvm | Encodes length and boost factors for docs and fields
- * {@link org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat Per-Document Values} | .dvd, .dvm | Encodes additional scoring factors or other per-document information.
- * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index} | .tvx | Stores offset into the document data file
- * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Documents} | .tvd | Contains information about each document that has term vectors
- * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Fields} | .tvf | The field level info about term vectors
- * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents} | .liv | Info about what documents are live
- * {@link org.apache.lucene.codecs.lucene60.Lucene60PointsFormat Point values} | .dii, .dim | Holds indexed points, if any
          - *
          - * - *

          Lock File

          - * The write lock, which is stored in the index directory by default, is named - * "write.lock". If the lock directory is different from the index directory then - * the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix - * derived from the full path to the index directory. When this file is present, a - * writer is currently modifying the index (adding or removing documents). This - * lock file ensures that only one writer is modifying the index at a time. - * - *

          History

          - *

          Compatibility notes are provided in this document, describing how file - * formats have changed from prior versions:

          - *
            - *
          • In version 2.1, the file format was changed to allow lock-less commits (ie, - * no more commit lock). The change is fully backwards compatible: you can open a - * pre-2.1 index for searching or adding/deleting of docs. When the new segments - * file is saved (committed), it will be written in the new file format (meaning - * no specific "upgrade" process is needed). But note that once a commit has - * occurred, pre-2.1 Lucene will not be able to read the index.
          • - *
          • In version 2.3, the file format was changed to allow segments to share a - * single set of doc store (vectors & stored fields) files. This allows for - * faster indexing in certain cases. The change is fully backwards compatible (in - * the same way as the lock-less commits change in 2.1).
          • - *
• In version 2.4, Strings are now written as a true UTF-8 byte sequence, not - * Java's modified UTF-8. See - * LUCENE-510 for details.
          • - *
          • In version 2.9, an optional opaque Map<String,String> CommitUserData - * may be passed to IndexWriter's commit methods (and later retrieved), which is - * recorded in the segments_N file. See - * LUCENE-1382 for details. Also, - * diagnostics were added to each segment written recording details about why it - * was written (due to flush, merge; which OS/JRE was used; etc.). See issue - * LUCENE-1654 for details.
          • - *
          • In version 3.0, compressed fields are no longer written to the index (they - * can still be read, but on merge the new segment will write them, uncompressed). - * See issue LUCENE-1960 - * for details.
          • - *
          • In version 3.1, segments records the code version that created them. See - * LUCENE-2720 for details. - * Additionally segments track explicitly whether or not they have term vectors. - * See LUCENE-2811 - * for details.
          • - *
          • In version 3.2, numeric fields are written as natively to stored fields - * file, previously they were stored in text format only.
          • - *
          • In version 3.4, fields can omit position data while still indexing term - * frequencies.
          • - *
          • In version 4.0, the format of the inverted index became extensible via - * the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage - * ({@code DocValues}) was introduced. Normalization factors need no longer be a - * single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. - * Terms need not be unicode strings, they can be any byte sequence. Term offsets - * can optionally be indexed into the postings lists. Payloads can be stored in the - * term vectors.
          • - *
          • In version 4.1, the format of the postings list changed to use either - * of FOR compression or variable-byte encoding, depending upon the frequency - * of the term. Terms appearing only once were changed to inline directly into - * the term dictionary. Stored fields are compressed by default.
          • - *
          • In version 4.2, term vectors are compressed by default. DocValues has - * a new multi-valued type (SortedSet), that can be used for faceting/grouping/joining - * on multi-valued fields.
          • - *
          • In version 4.5, DocValues were extended to explicitly represent missing values.
          • - *
          • In version 4.6, FieldInfos were extended to support per-field DocValues generation, to - * allow updating NumericDocValues fields.
          • - *
          • In version 4.8, checksum footers were added to the end of each index file - * for improved data integrity. Specifically, the last 8 bytes of every index file - * contain the zlib-crc32 checksum of the file.
          • - *
          • In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) - * that is suitable for faceting/sorting/analytics. - *
          • In version 5.4, DocValues have been improved to store more information on disk: - * addresses for binary fields and ord indexes for multi-valued fields. - *
          • In version 6.0, Points were added, for multi-dimensional range/distance search. - *
          • - *
          - * - *

          Limitations

          - *
          - *

          Lucene uses a Java int to refer to - * document numbers, and the index file format uses an Int32 - * on-disk to store document numbers. This is a limitation - * of both the index file format and the current implementation. Eventually these - * should be replaced with either UInt64 values, or - * better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.

          - *
          + * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene62} + * for an overview of the index format. */ package org.apache.lucene.codecs.lucene60; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java new file mode 100644 index 00000000000..aa0adaed8cd --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java @@ -0,0 +1,178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene62; + +import java.util.Objects; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.CompoundFormat; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.FieldInfosFormat; +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.LiveDocsFormat; +import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.PointsFormat; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat; +import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat; +import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode; +import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat; +import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat; +import org.apache.lucene.codecs.lucene53.Lucene53NormsFormat; +import org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat; +import org.apache.lucene.codecs.lucene60.Lucene60PointsFormat; +import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; + +// nocommit if somehow this does NOT land in 6.2, rename all this!! + +/** + * Implements the Lucene 6.2 index format, with configurable per-field postings + * and docvalues formats. + *

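Per-field configurability is this codec's main extension point: subclasses override getPostingsFormatForField or getDocValuesFormatForField, exactly as the CreateIndexTask change earlier in this patch does. A hedged sketch; "Memory" is an illustrative postings format name and assumes the lucene-codecs module is on the classpath:

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene62.Lucene62Codec;

class PerFieldCodecExample {
  static Codec idFieldOnMemoryPostings() {
    return new Lucene62Codec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        if ("id".equals(field)) {
          return PostingsFormat.forName("Memory"); // assumed available via SPI
        }
        return super.getPostingsFormatForField(field);
      }
    };
  }
}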
          + * If you want to reuse functionality of this codec in another codec, extend + * {@link FilterCodec}. + * + * @see org.apache.lucene.codecs.lucene60 package documentation for file format details. + * + * @lucene.experimental + */ +public class Lucene62Codec extends Codec { + private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat(); + private final FieldInfosFormat fieldInfosFormat = new Lucene60FieldInfosFormat(); + private final SegmentInfoFormat segmentInfosFormat = new Lucene62SegmentInfoFormat(); + private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat(); + private final CompoundFormat compoundFormat = new Lucene50CompoundFormat(); + + private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return Lucene62Codec.this.getPostingsFormatForField(field); + } + }; + + private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() { + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return Lucene62Codec.this.getDocValuesFormatForField(field); + } + }; + + private final StoredFieldsFormat storedFieldsFormat; + + /** + * Instantiates a new codec. + */ + public Lucene62Codec() { + this(Mode.BEST_SPEED); + } + + /** + * Instantiates a new codec, specifying the stored fields compression + * mode to use. + * @param mode stored fields compression mode to use for newly + * flushed/merged segments. + */ + public Lucene62Codec(Mode mode) { + super("Lucene62"); + this.storedFieldsFormat = new Lucene50StoredFieldsFormat(Objects.requireNonNull(mode)); + } + + @Override + public final StoredFieldsFormat storedFieldsFormat() { + return storedFieldsFormat; + } + + @Override + public final TermVectorsFormat termVectorsFormat() { + return vectorsFormat; + } + + @Override + public final PostingsFormat postingsFormat() { + return postingsFormat; + } + + @Override + public final FieldInfosFormat fieldInfosFormat() { + return fieldInfosFormat; + } + + @Override + public final SegmentInfoFormat segmentInfoFormat() { + return segmentInfosFormat; + } + + @Override + public final LiveDocsFormat liveDocsFormat() { + return liveDocsFormat; + } + + @Override + public final CompoundFormat compoundFormat() { + return compoundFormat; + } + + @Override + public final PointsFormat pointsFormat() { + return new Lucene60PointsFormat(); + } + + /** Returns the postings format that should be used for writing + * new segments of field. + * + * The default implementation always returns "Lucene50". + *

+ * WARNING: if you subclass, you are responsible for index + * backwards compatibility: future versions of Lucene are only + * guaranteed to be able to read the default implementation. + */ + public PostingsFormat getPostingsFormatForField(String field) { + return defaultFormat; + } + + /** Returns the docvalues format that should be used for writing + * new segments of field. + * + * The default implementation always returns "Lucene54". + *

          + * WARNING: if you subclass, you are responsible for index + * backwards compatibility: future version of Lucene are only + * guaranteed to be able to read the default implementation. + */ + public DocValuesFormat getDocValuesFormatForField(String field) { + return defaultDVFormat; + } + + @Override + public final DocValuesFormat docValuesFormat() { + return docValuesFormat; + } + + private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene50"); + private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene54"); + + private final NormsFormat normsFormat = new Lucene53NormsFormat(); + + @Override + public final NormsFormat normsFormat() { + return normsFormat; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java new file mode 100644 index 00000000000..53d273474a8 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java @@ -0,0 +1,289 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene62; + +import java.io.IOException; +import java.util.Collections; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexWriter; // javadocs +import org.apache.lucene.index.SegmentInfo; // javadocs +import org.apache.lucene.index.SegmentInfos; // javadocs +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataOutput; // javadocs +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.Version; + +// nocommit fixup javadocs here: + +/** + * Lucene 6.2 Segment info format. + *

          + * Files: + *

            + *
          • .si: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, Attributes, IndexSort, Footer + *
          + * Data types: + *
            + *
          • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
          • + *
          • SegSize --> {@link DataOutput#writeInt Int32}
          • + *
          • SegVersion --> {@link DataOutput#writeString String}
          • + *
          • Files --> {@link DataOutput#writeSetOfStrings Set<String>}
          • + *
          • Diagnostics,Attributes --> {@link DataOutput#writeMapOfStrings Map<String,String>}
          • + *
          • IsCompoundFile --> {@link DataOutput#writeByte Int8}
          • + *
          • IndexSort --> {@link DataOutput#writeInt Int32} count, followed by {@code count} SortField
          • + *
          • Footer --> {@link CodecUtil#writeFooter CodecFooter}
          • + *
          + * Field Descriptions: + *
            + *
+ *   SegVersion is the code version that created the segment.
+ *
+ *   SegSize is the number of documents contained in the segment index.
+ *
+ *   IsCompoundFile records whether the segment is written as a compound file or
+ *   not. If this is -1, the segment is not a compound file. If it is 1, the segment
+ *   is a compound file.
+ *
+ *   The Diagnostics Map is privately written by {@link IndexWriter}, as a debugging aid,
+ *   for each segment it creates. It includes metadata like the current Lucene
+ *   version, OS, Java version, why the segment was created (merge, flush,
+ *   addIndexes), etc.
+ *
+ *   Files is a list of files referred to by this segment.
+ *
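Given the write() method below, the IndexSort block for a single-field long sort serializes as: count=1, then the field name, typeID=1 (LONG), a reverse byte of 1 (not reversed), and a missing-value byte of 0 (no missing value). A hedged sketch of building such a sort; the field name is illustrative:

import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;

class IndexSortEncodingExample {
  static Sort timestampSort() {
    // serialized into the .si file by Lucene62SegmentInfoFormat.write()
    return new Sort(new SortField("timestamp", SortField.Type.LONG));
  }
}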
          + * + * @see SegmentInfos + * @lucene.experimental + */ +public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { + + /** Sole constructor. */ + public Lucene62SegmentInfoFormat() { + } + + @Override + public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context) throws IOException { + final String fileName = IndexFileNames.segmentFileName(segment, "", Lucene62SegmentInfoFormat.SI_EXTENSION); + try (ChecksumIndexInput input = dir.openChecksumInput(fileName, context)) { + Throwable priorE = null; + SegmentInfo si = null; + try { + int format = CodecUtil.checkIndexHeader(input, Lucene62SegmentInfoFormat.CODEC_NAME, + Lucene62SegmentInfoFormat.VERSION_START, + Lucene62SegmentInfoFormat.VERSION_CURRENT, + segmentID, ""); + final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt()); + + final int docCount = input.readInt(); + if (docCount < 0) { + throw new CorruptIndexException("invalid docCount: " + docCount, input); + } + final boolean isCompoundFile = input.readByte() == SegmentInfo.YES; + + final Map diagnostics = input.readMapOfStrings(); + final Set files = input.readSetOfStrings(); + final Map attributes = input.readMapOfStrings(); + + int numSortFields = input.readVInt(); + Sort indexSort; + if (numSortFields > 0) { + SortField[] sortFields = new SortField[numSortFields]; + for(int i=0;i= 5 but got: " + version.major + " segment=" + si); + } + // Write the Lucene version that created this segment, since 3.1 + output.writeInt(version.major); + output.writeInt(version.minor); + output.writeInt(version.bugfix); + assert version.prerelease == 0; + output.writeInt(si.maxDoc()); + + output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO)); + output.writeMapOfStrings(si.getDiagnostics()); + Set files = si.files(); + for (String file : files) { + if (!IndexFileNames.parseSegmentName(file).equals(si.name)) { + throw new IllegalArgumentException("invalid files: expected segment=" + si.name + ", got=" + files); + } + } + output.writeSetOfStrings(files); + output.writeMapOfStrings(si.getAttributes()); + + Sort indexSort = si.getIndexSort(); + int numSortFields = indexSort == null ? 0 : indexSort.getSort().length; + output.writeVInt(numSortFields); + for (int i = 0; i < numSortFields; ++i) { + SortField sortField = indexSort.getSort()[i]; + output.writeString(sortField.getField()); + int sortTypeID; + switch (sortField.getType()) { + case STRING: + sortTypeID = 0; + break; + case LONG: + sortTypeID = 1; + break; + case INT: + sortTypeID = 2; + break; + // nocommit the rest: + default: + throw new IllegalStateException("Unexpected sort type: " + sortField.getType()); + } + output.writeVInt(sortTypeID); + output.writeByte((byte) (sortField.getReverse() ? 
0 : 1)); + + // write missing value + Object missingValue = sortField.getMissingValue(); + if (missingValue == null) { + output.writeByte((byte) 0); + } else { + switch(sortField.getType()) { + case STRING: + if (missingValue == SortField.STRING_LAST) { + output.writeByte((byte) 1); + } else if (missingValue == SortField.STRING_FIRST) { + output.writeByte((byte) 2); + } else { + throw new AssertionError("unrecognized missing value for STRING field \"" + sortField.getField() + "\": " + missingValue); + } + break; + case LONG: + output.writeByte((byte) 1); + output.writeLong(((Long) missingValue).longValue()); + break; + case INT: + output.writeByte((byte) 1); + output.writeInt(((Integer) missingValue).intValue()); + break; + // nocommit the rest: + default: + throw new IllegalStateException("Unexpected sort type: " + sortField.getType()); + } + } + } + + CodecUtil.writeFooter(output); + } + } + + /** File extension used to store {@link SegmentInfo}. */ + public final static String SI_EXTENSION = "si"; + static final String CODEC_NAME = "Lucene62SegmentInfo"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/package-info.java new file mode 100644 index 00000000000..2fe2dc74b4a --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Components from the Lucene 6.2 index format + * See {@link org.apache.lucene.codecs.lucene62} for an overview + * of the index format. + */ + +package org.apache.lucene.codecs.lucene62; diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 5a68a3d0f66..fb2dc80ce3f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -43,6 +43,9 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.DocumentStoredFieldVisitor; import org.apache.lucene.index.CheckIndex.Status.DocValuesStatus; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.LeafFieldComparator; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; @@ -217,6 +220,9 @@ public final class CheckIndex implements Closeable { /** Status for testing of PointValues (null if PointValues could not be tested). 
*/ public PointsStatus pointsStatus; + + /** Status of index sort */ + public IndexSortStatus indexSortStatus; } /** @@ -374,6 +380,16 @@ public final class CheckIndex implements Closeable { /** Exception thrown during doc values test (null on success) */ public Throwable error = null; } + + /** + * Status from testing index sort + */ + public static final class IndexSortStatus { + + /** Exception thrown during term index test (null on success) */ + public Throwable error = null; + } + } /** Create a new CheckIndex on the directory. */ @@ -642,6 +658,10 @@ public final class CheckIndex implements Closeable { msg(infoStream, " compound=" + info.info.getUseCompoundFile()); segInfoStat.compound = info.info.getUseCompoundFile(); msg(infoStream, " numFiles=" + info.files().size()); + Sort indexSort = info.info.getIndexSort(); + if (indexSort != null) { + msg(infoStream, " sort=" + indexSort); + } segInfoStat.numFiles = info.files().size(); segInfoStat.sizeMB = info.sizeInBytes()/(1024.*1024.); msg(infoStream, " size (MB)=" + nf.format(segInfoStat.sizeMB)); @@ -722,6 +742,9 @@ public final class CheckIndex implements Closeable { // Test PointValues segInfoStat.pointsStatus = testPoints(reader, infoStream, failFast); + // Test index sort + segInfoStat.indexSortStatus = testSort(reader, indexSort, infoStream, failFast); + // Rethrow the first exception we encountered // This will cause stats for failed segments to be incremented properly if (segInfoStat.liveDocStatus.error != null) { @@ -790,6 +813,70 @@ public final class CheckIndex implements Closeable { return result; } + + public static Status.IndexSortStatus testSort(CodecReader reader, Sort sort, PrintStream infoStream, boolean failFast) throws IOException { + // This segment claims its documents are sorted according to the incoming sort ... let's make sure: + + long startNS = System.nanoTime(); + + Status.IndexSortStatus status = new Status.IndexSortStatus(); + + if (sort != null) { + if (infoStream != null) { + infoStream.print(" test: check index sort....."); + } + + SortField fields[] = sort.getSort(); + final int reverseMul[] = new int[fields.length]; + final LeafFieldComparator comparators[] = new LeafFieldComparator[fields.length]; + + LeafReaderContext readerContext = new LeafReaderContext(reader); + + for (int i = 0; i < fields.length; i++) { + reverseMul[i] = fields[i].getReverse() ? -1 : 1; + comparators[i] = fields[i].getComparator(1, i).getLeafComparator(readerContext); + // nocommit we prevent SCORE? + //comparators[i].setScorer(FAKESCORER); + } + + int maxDoc = reader.maxDoc(); + + try { + + for(int docID=1;docID < maxDoc;docID++) { + + int cmp = 0; + + for (int i = 0; i < comparators.length; i++) { + // TODO: would be better if copy() didnt cause a term lookup in TermOrdVal & co, + // the segments are always the same here... + comparators[i].copy(0, docID-1); + comparators[i].setBottom(0); + cmp = reverseMul[i] * comparators[i].compareBottom(docID); + if (cmp != 0) { + break; + } + } + + if (cmp > 0) { + throw new RuntimeException("segment has indexSort=" + sort + " but docID=" + (docID-1) + " sorts after docID=" + docID); + } + } + msg(infoStream, String.format(Locale.ROOT, "OK [took %.3f sec]", nsToSec(System.nanoTime()-startNS))); + } catch (Throwable e) { + if (failFast) { + IOUtils.reThrow(e); + } + msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]"); + status.error = e; + if (infoStream != null) { + e.printStackTrace(infoStream); + } + } + } + + return status; + } /** * Test live docs. 
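The new sort check runs as part of a normal CheckIndex pass; a hedged sketch of invoking it over an existing index directory (argument handling illustrative):

import java.nio.file.Paths;

import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

class CheckSortedIndex {
  public static void main(String[] args) throws Exception {
    try (Directory dir = FSDirectory.open(Paths.get(args[0]));
         CheckIndex checker = new CheckIndex(dir)) {
      CheckIndex.Status status = checker.checkIndex();
      // with this patch, segments that declare an index sort also carry an
      // indexSortStatus in status.segmentInfos
      System.out.println(status.clean ? "OK" : "corrupt");
    }
  }
}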
diff --git a/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java b/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java index c4333bc049d..41c2a46f218 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java @@ -62,7 +62,6 @@ final class MappingMultiPostingsEnum extends PostingsEnum { this.field = field; allSubs = new MappingPostingsSub[mergeState.fieldsProducers.length]; for(int i=0;i(subs, allSubs.length, mergeState.segmentInfo.getIndexSort() != null); @@ -89,7 +88,11 @@ final class MappingMultiPostingsEnum extends PostingsEnum { @Override public int docID() { - return current.mappedDocID; + if (current == null) { + return -1; + } else { + return current.mappedDocID; + } } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/Sorter.java b/lucene/core/src/java/org/apache/lucene/index/Sorter.java index 0ce7d64e0d1..cf75c18f6f0 100644 --- a/lucene/core/src/java/org/apache/lucene/index/Sorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/Sorter.java @@ -33,7 +33,6 @@ import org.apache.lucene.util.packed.PackedLongValues; * IDs. * @lucene.experimental */ -// nocommit rename to IndexSorter? final class Sorter { final Sort sort; diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java index 45d44828978..b6558f7fd15 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java @@ -337,10 +337,7 @@ class SortingLeafReader extends FilterLeafReader { @Override public long nextOrd() { - // nocommit - long v = in.nextOrd(); - //System.out.println(" slr.sssdv.nextOrd return " + v + " this=" + this); - return v; + return in.nextOrd(); } @Override diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 875aba527e2..548f8d09244 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-org.apache.lucene.codecs.lucene60.Lucene60Codec +org.apache.lucene.codecs.lucene62.Lucene62Codec diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java index 59e48144d44..f945c2d0dc0 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java @@ -19,7 +19,7 @@ package org.apache.lucene.codecs.lucene50; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode; -import org.apache.lucene.codecs.lucene60.Lucene60Codec; +import org.apache.lucene.codecs.lucene62.Lucene62Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.StoredField; import org.apache.lucene.index.BaseStoredFieldsFormatTestCase; @@ -33,7 +33,7 @@ import com.carrotsearch.randomizedtesting.generators.RandomPicks; public class TestLucene50StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase { @Override protected Codec getCodec() { - return new Lucene60Codec(Mode.BEST_COMPRESSION); + return new Lucene62Codec(Mode.BEST_COMPRESSION); } /** @@ -44,7 +44,7 @@ public class TestLucene50StoredFieldsFormatHighCompression extends BaseStoredFie Directory dir = newDirectory(); for (int i = 0; i < 10; i++) { IndexWriterConfig iwc = newIndexWriterConfig(); - iwc.setCodec(new Lucene60Codec(RandomPicks.randomFrom(random(), Mode.values()))); + iwc.setCodec(new Lucene62Codec(RandomPicks.randomFrom(random(), Mode.values()))); IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig()); Document doc = new Document(); doc.add(new StoredField("field1", "value1")); @@ -71,7 +71,7 @@ public class TestLucene50StoredFieldsFormatHighCompression extends BaseStoredFie public void testInvalidOptions() throws Exception { expectThrows(NullPointerException.class, () -> { - new Lucene60Codec(null); + new Lucene62Codec(null); }); expectThrows(NullPointerException.class, () -> { diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene53/TestLucene53NormsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene53/TestLucene53NormsFormat.java index c915de0bfce..a0ad87fca87 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene53/TestLucene53NormsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene53/TestLucene53NormsFormat.java @@ -18,14 +18,14 @@ package org.apache.lucene.codecs.lucene53; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.lucene60.Lucene60Codec; +import org.apache.lucene.codecs.lucene62.Lucene62Codec; import org.apache.lucene.index.BaseNormsFormatTestCase; /** * Tests Lucene53NormsFormat */ public class TestLucene53NormsFormat extends BaseNormsFormatTestCase { - private final Codec codec = new Lucene60Codec(); + private final Codec codec = new Lucene62Codec(); @Override protected Codec getCodec() { diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50SegmentInfoFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene62/TestLucene62SegmentInfoFormat.java similarity index 89% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50SegmentInfoFormat.java rename to lucene/core/src/test/org/apache/lucene/codecs/lucene62/TestLucene62SegmentInfoFormat.java index 81143300ee5..8c758f29e5a 
100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50SegmentInfoFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene62/TestLucene62SegmentInfoFormat.java @@ -1,3 +1,5 @@ +package org.apache.lucene.codecs.lucene62; + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -14,8 +16,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene50; - import org.apache.lucene.codecs.Codec; import org.apache.lucene.index.BaseSegmentInfoFormatTestCase; @@ -23,9 +23,9 @@ import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.Version; /** - * Tests Lucene50SegmentInfoFormat + * Tests Lucene62SegmentInfoFormat */ -public class TestLucene50SegmentInfoFormat extends BaseSegmentInfoFormatTestCase { +public class TestLucene62SegmentInfoFormat extends BaseSegmentInfoFormatTestCase { @Override protected Version[] getVersions() { diff --git a/lucene/core/src/test/org/apache/lucene/index/Test2BPoints.java b/lucene/core/src/test/org/apache/lucene/index/Test2BPoints.java index 2f3a3a69890..da8dbac0f5e 100644 --- a/lucene/core/src/test/org/apache/lucene/index/Test2BPoints.java +++ b/lucene/core/src/test/org/apache/lucene/index/Test2BPoints.java @@ -24,8 +24,6 @@ import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.PointsFormat; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.PointsWriter; -import org.apache.lucene.codecs.lucene60.Lucene60PointsReader; -import org.apache.lucene.codecs.lucene60.Lucene60PointsWriter; import org.apache.lucene.document.Document; import org.apache.lucene.document.LongPoint; import org.apache.lucene.search.IndexSearcher; @@ -143,6 +141,6 @@ public class Test2BPoints extends LuceneTestCase { } private static Codec getCodec() { - return Codec.forName("Lucene60"); + return Codec.forName("Lucene62"); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/Test2BTerms.java b/lucene/core/src/test/org/apache/lucene/index/Test2BTerms.java index 22b3605965b..22d12346d4e 100644 --- a/lucene/core/src/test/org/apache/lucene/index/Test2BTerms.java +++ b/lucene/core/src/test/org/apache/lucene/index/Test2BTerms.java @@ -53,7 +53,7 @@ import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; // disk (but, should run successfully). 
Best to run w/ // -Dtests.codec=, and w/ plenty of RAM, eg: // -// ant test -Dtests.monster=true -Dtests.heapsize=8g -Dtests.codec=Lucene60 -Dtestcase=Test2BTerms +// ant test -Dtests.monster=true -Dtests.heapsize=8g -Dtests.codec=Lucene62 -Dtestcase=Test2BTerms // @SuppressCodecs({ "SimpleText", "Memory", "Direct" }) @Monster("very slow, use 5g minimum heap") diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 2635b00ada7..8df81bab264 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -33,7 +33,6 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.codecs.simpletext.SimpleTextCodec; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.BinaryPoint; import org.apache.lucene.document.Document; @@ -76,16 +75,17 @@ import org.junit.BeforeClass; // nocommit test EarlyTerminatingCollector +// nocommit must test all supported SortField.Type + public class TestIndexSorting extends LuceneTestCase { public void testSortOnMerge(boolean withDeletes) throws IOException { Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); - iwc.setCodec(new SimpleTextCodec()); // nocommit only simple-text supports sorting so far Sort indexSort = new Sort(new SortField("foo", SortField.Type.LONG)); iwc.setIndexSort(indexSort); IndexWriter w = new IndexWriter(dir, iwc); - final int numDocs = atLeast(200); + final int numDocs = atLeast(1000); final FixedBitSet deleted = new FixedBitSet(numDocs); for (int i = 0; i < numDocs; ++i) { Document doc = new Document(); @@ -212,7 +212,6 @@ public class TestIndexSorting extends LuceneTestCase { public void testConcurrentUpdates() throws Exception { Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); - iwc.setCodec(new SimpleTextCodec()); // nocommit only simple-text supports sorting so far Sort indexSort = new Sort(new SortField("foo", SortField.Type.LONG)); iwc.setIndexSort(indexSort); IndexWriter w = new IndexWriter(dir, iwc); @@ -303,7 +302,6 @@ public class TestIndexSorting extends LuceneTestCase { public void testConcurrentDVUpdates() throws Exception { Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); - iwc.setCodec(new SimpleTextCodec()); // nocommit only simple-text supports sorting so far Sort indexSort = new Sort(new SortField("foo", SortField.Type.LONG)); iwc.setIndexSort(indexSort); IndexWriter w = new IndexWriter(dir, iwc); @@ -314,6 +312,7 @@ public class TestIndexSorting extends LuceneTestCase { Document doc = new Document(); doc.add(new StringField("id", Integer.toString(i), Store.NO)); doc.add(new NumericDocValuesField("foo", -1)); + w.addDocument(doc); values.put(i, -1L); } Thread[] threads = new Thread[2]; @@ -321,7 +320,7 @@ public class TestIndexSorting extends LuceneTestCase { final CountDownLatch latch = new CountDownLatch(1); for (int i = 0; i < threads.length; ++i) { Random r = new Random(random().nextLong()); - threads[i] = new Thread(new UpdateRunnable(numDocs, r, latch, updateCount, w, values)); + threads[i] = new Thread(new 
DVUpdateRunnable(numDocs, r, latch, updateCount, w, values)); } for (Thread thread : threads) { thread.start(); @@ -362,7 +361,6 @@ public class TestIndexSorting extends LuceneTestCase { Directory dir2 = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); - iwc.setCodec(new SimpleTextCodec()); // nocommit only simple-text supports sorting so far Sort indexSort = new Sort(new SortField("foo", SortField.Type.LONG)); iwc.setIndexSort(indexSort); IndexWriter w2 = new IndexWriter(dir2, iwc); @@ -410,7 +408,6 @@ public class TestIndexSorting extends LuceneTestCase { public void testIllegalChangeSort() throws Exception { final Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); - iwc.setCodec(new SimpleTextCodec()); // nocommit only simple-text supports sorting so far iwc.setIndexSort(new Sort(new SortField("foo", SortField.Type.LONG))); IndexWriter w = new IndexWriter(dir, iwc); w.addDocument(new Document()); @@ -420,12 +417,13 @@ public class TestIndexSorting extends LuceneTestCase { w.close(); final IndexWriterConfig iwc2 = new IndexWriterConfig(new MockAnalyzer(random())); - iwc2.setCodec(new SimpleTextCodec()); // nocommit only simple-text supports sorting so far iwc2.setIndexSort(new Sort(new SortField("bar", SortField.Type.LONG))); - IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> { new IndexWriter(dir, iwc2); }); - assertEquals("cannot change previous indexSort= (from segment=_2(7.0.0):c2:[indexSort=]) to new indexSort=", expected.getMessage()); + String message = e.getMessage(); + assertTrue(message.contains("cannot change previous indexSort=")); + assertTrue(message.contains("to new indexSort=")); dir.close(); } @@ -574,8 +572,6 @@ public class TestIndexSorting extends LuceneTestCase { PositionsTokenStream positions = new PositionsTokenStream(); IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); - // nocommit: - conf.setCodec(new SimpleTextCodec()); conf.setMaxBufferedDocs(4); // create some segments conf.setSimilarity(new NormsSimilarity(conf.getSimilarity())); // for testing norms field // nocommit @@ -633,7 +629,6 @@ public class TestIndexSorting extends LuceneTestCase { assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef(DOC_POSITIONS_TERM))); PostingsEnum sortedPositions = termsEnum.postings(null, PostingsEnum.ALL); int doc; - boolean isSorted = reader.getIndexSort() != null; // test nextDoc() while ((doc = sortedPositions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { @@ -687,7 +682,6 @@ public class TestIndexSorting extends LuceneTestCase { LeafReader reader = ctx.reader(); NumericDocValues dv = reader.getNormValues(NORMS_FIELD); int maxDoc = reader.maxDoc(); - boolean isSorted = reader.getIndexSort() != null; for (int doc = 0; doc < maxDoc; doc++) { int id = Integer.parseInt(reader.document(doc).get(ID_FIELD)); assertEquals("incorrect norm value for doc " + doc, id, dv.get(doc)); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPointValues.java b/lucene/core/src/test/org/apache/lucene/index/TestPointValues.java index 9f55ec3fdcc..9693c5c32b1 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPointValues.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPointValues.java @@ -394,11 +394,11 @@ public class TestPointValues extends LuceneTestCase { dir.close(); } - // Write point values, one segment 
with Lucene60, another with SimpleText, then forceMerge with SimpleText + // Write point values, one segment with Lucene62, another with SimpleText, then forceMerge with SimpleText public void testDifferentCodecs1() throws Exception { Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); - iwc.setCodec(Codec.forName("Lucene60")); + iwc.setCodec(Codec.forName("Lucene62")); IndexWriter w = new IndexWriter(dir, iwc); Document doc = new Document(); doc.add(new IntPoint("int", 1)); @@ -417,7 +417,7 @@ dir.close(); } - // Write point values, one segment with Lucene60, another with SimpleText, then forceMerge with Lucene60 + // Write point values, one segment with Lucene62, another with SimpleText, then forceMerge with Lucene62 public void testDifferentCodecs2() throws Exception { Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); @@ -429,7 +429,7 @@ w.close(); iwc = new IndexWriterConfig(new MockAnalyzer(random())); - iwc.setCodec(Codec.forName("Lucene60")); + iwc.setCodec(Codec.forName("Lucene62")); w = new IndexWriter(dir, iwc); doc = new Document(); doc.add(new IntPoint("int", 1)); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java index 88d89d29417..078c8da3653 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java @@ -1151,14 +1151,14 @@ public class TestPointQueries extends LuceneTestCase { } private static Codec getCodec() { - if (Codec.getDefault().getName().equals("Lucene60")) { + if (Codec.getDefault().getName().equals("Lucene62")) { int maxPointsInLeafNode = TestUtil.nextInt(random(), 16, 2048); double maxMBSortInHeap = 5.0 + (3*random().nextDouble()); if (VERBOSE) { System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap); } - return new FilterCodec("Lucene60", Codec.getDefault()) { + return new FilterCodec("Lucene62", Codec.getDefault()) { @Override public PointsFormat pointsFormat() { return new PointsFormat() { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java index 4d76fa9dd39..55f360ad308 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java @@ -21,7 +21,6 @@ import java.util.Collections; import java.util.Iterator; import org.apache.lucene.index.BinaryDocValues; -import org.apache.lucene.index.PointValues; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; @@ -29,11 +28,13 @@ import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.PointValues; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.StoredFieldVisitor; import
org.apache.lucene.index.Terms; +import org.apache.lucene.search.Sort; import org.apache.lucene.util.Bits; /** @@ -178,4 +179,8 @@ public class TermVectorLeafReader extends LeafReader { public void document(int docID, StoredFieldVisitor visitor) throws IOException { } + @Override + public Sort getIndexSort() { + return null; + } } diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 5b133013ed5..e3aa4b14a13 100644 --- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -40,6 +40,7 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.SimpleCollector; +import org.apache.lucene.search.Sort; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.*; @@ -1606,6 +1607,10 @@ public class MemoryIndex { return info.getNormDocValues(); } + @Override + public Sort getIndexSort() { + return null; + } } /** diff --git a/lucene/misc/src/test/org/apache/lucene/search/TestDiversifiedTopDocsCollector.java b/lucene/misc/src/test/org/apache/lucene/search/TestDiversifiedTopDocsCollector.java index cf860a2ac20..54ad7445c79 100644 --- a/lucene/misc/src/test/org/apache/lucene/search/TestDiversifiedTopDocsCollector.java +++ b/lucene/misc/src/test/org/apache/lucene/search/TestDiversifiedTopDocsCollector.java @@ -35,7 +35,6 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.MultiDocValues; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.RandomIndexWriter; -import org.apache.lucene.index.SlowCompositeReaderWrapper; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause.Occur; diff --git a/lucene/misc/src/test/org/apache/lucene/search/TestEarlyTerminatingSortingCollector.java b/lucene/misc/src/test/org/apache/lucene/search/TestEarlyTerminatingSortingCollector.java index 3bcc4924327..6108992ce67 100644 --- a/lucene/misc/src/test/org/apache/lucene/search/TestEarlyTerminatingSortingCollector.java +++ b/lucene/misc/src/test/org/apache/lucene/search/TestEarlyTerminatingSortingCollector.java @@ -50,8 +50,6 @@ import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopFieldCollector; import org.apache.lucene.store.Directory; -import org.apache.lucene.uninverting.UninvertingReader.Type; -import org.apache.lucene.uninverting.UninvertingReader; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; @@ -236,27 +234,12 @@ public class TestEarlyTerminatingSortingCollector extends LuceneTestCase { } } - private IndexSearcher newSearcherForTestTerminatedEarly(IndexReader r) throws IOException { - switch(random().nextInt(2)) { - case 0: - return new IndexSearcher(r); - case 1: - assertTrue(r+" is not a DirectoryReader", (r instanceof DirectoryReader)); - final DirectoryReader directoryReader = ExitableDirectoryReader.wrap( - UninvertingReader.wrap((DirectoryReader) r, new HashMap()), - new TestEarlyTerminatingSortingcollectorQueryTimeout(false)); - return new IndexSearcher(directoryReader); - } - fail("newSearcherForTestTerminatedEarly("+r+") fell through switch"); - return null; - } - public void 
testTerminatedEarly() throws IOException { final int iters = atLeast(8); for (int i = 0; i < iters; ++i) { createRandomIndex(true); - final IndexSearcher searcher = newSearcherForTestTerminatedEarly(reader); // future TODO: use newSearcher(reader); + final IndexSearcher searcher = new IndexSearcher(reader); // future TODO: use newSearcher(reader); final Query query = new MatchAllDocsQuery(); // search for everything/anything final TestTerminatedEarlySimpleCollector collector1 = new TestTerminatedEarlySimpleCollector(); diff --git a/lucene/sandbox/src/test/org/apache/lucene/document/TestNearest.java b/lucene/sandbox/src/test/org/apache/lucene/document/TestNearest.java index 66630df2bca..0b19254d985 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/document/TestNearest.java +++ b/lucene/sandbox/src/test/org/apache/lucene/document/TestNearest.java @@ -247,7 +247,7 @@ public class TestNearest extends LuceneTestCase { private IndexWriterConfig getIndexWriterConfig() { IndexWriterConfig iwc = newIndexWriterConfig(); - iwc.setCodec(Codec.forName("Lucene60")); + iwc.setCodec(Codec.forName("Lucene62")); return iwc; } } diff --git a/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java b/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java index 0f8f2026fe4..c2cb93b13f0 100644 --- a/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java +++ b/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java @@ -85,14 +85,14 @@ import com.carrotsearch.randomizedtesting.generators.RandomInts; public class TestGeo3DPoint extends LuceneTestCase { private static Codec getCodec() { - if (Codec.getDefault().getName().equals("Lucene60")) { + if (Codec.getDefault().getName().equals("Lucene62")) { int maxPointsInLeafNode = TestUtil.nextInt(random(), 16, 2048); double maxMBSortInHeap = 3.0 + (3*random().nextDouble()); if (VERBOSE) { System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap); } - return new FilterCodec("Lucene60", Codec.getDefault()) { + return new FilterCodec("Lucene62", Codec.getDefault()) { @Override public PointsFormat pointsFormat() { return new PointsFormat() { diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java index 2b14d6e8016..004aef44d94 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java @@ -56,7 +56,6 @@ import org.apache.lucene.index.MultiDocValues; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.SegmentReader; import org.apache.lucene.index.SortedSetDocValues; -import org.apache.lucene.index.SortingMergePolicy; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanClause; @@ -232,7 +231,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { // This way all merged segments will be sorted at // merge time, allow for per-segment early termination // when those segments are searched: - iwc.setMergePolicy(new SortingMergePolicy(iwc.getMergePolicy(), SORT)); + iwc.setIndexSort(SORT); return iwc; } @@ -586,8 +585,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { // We 
sorted postings by weight during indexing, so we // only retrieve the first num hits now: - final SortingMergePolicy sortingMergePolicy = (SortingMergePolicy) writer.getConfig().getMergePolicy(); - Collector c2 = new EarlyTerminatingSortingCollector(c, SORT, num, sortingMergePolicy.getSort()); + Collector c2 = new EarlyTerminatingSortingCollector(c, SORT, num); IndexSearcher searcher = searcherMgr.acquire(); List results = null; try { diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java index 62ed08b8d0e..6b1c2d1b21d 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java @@ -32,7 +32,7 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene60.Lucene60Codec; +import org.apache.lucene.codecs.lucene62.Lucene62Codec; import org.apache.lucene.document.IntPoint; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -646,7 +646,7 @@ public class TestSuggestField extends LuceneTestCase { static IndexWriterConfig iwcWithSuggestField(Analyzer analyzer, final Set suggestFields) { IndexWriterConfig iwc = newIndexWriterConfig(random(), analyzer); iwc.setMergePolicy(newLogMergePolicy()); - Codec filterCodec = new Lucene60Codec() { + Codec filterCodec = new Lucene62Codec() { PostingsFormat postingsFormat = new Completion50PostingsFormat(); @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/geo/BaseGeoPointTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/geo/BaseGeoPointTestCase.java index bda4cdebaad..275c1864857 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/geo/BaseGeoPointTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/geo/BaseGeoPointTestCase.java @@ -1242,7 +1242,7 @@ public abstract class BaseGeoPointTestCase extends LuceneTestCase { // Else seeds may not reproduce: iwc.setMergeScheduler(new SerialMergeScheduler()); int pointsInLeaf = 2 + random().nextInt(4); - iwc.setCodec(new FilterCodec("Lucene60", TestUtil.getDefaultCodec()) { + iwc.setCodec(new FilterCodec("Lucene62", TestUtil.getDefaultCodec()) { @Override public PointsFormat pointsFormat() { return new PointsFormat() { diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseSegmentInfoFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseSegmentInfoFormatTestCase.java index 27e01c1fbef..49d19ae4322 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseSegmentInfoFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseSegmentInfoFormatTestCase.java @@ -163,8 +163,14 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT } } + protected boolean supportsIndexSort() { + return true; + } + /** Test sort */ public void testSort() throws IOException { + assumeTrue("test requires a codec that can read/write index sort", supportsIndexSort()); + final int iters = atLeast(5); for (int i = 0; i < iters; ++i) { Sort sort; diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java 
b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java index 7ac40375f15..5c88dc7ec92 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java @@ -32,7 +32,7 @@ import org.apache.lucene.codecs.asserting.AssertingPostingsFormat; import org.apache.lucene.codecs.cheapbastard.CheapBastardCodec; import org.apache.lucene.codecs.compressing.CompressingCodec; import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat; -import org.apache.lucene.codecs.lucene60.Lucene60Codec; +import org.apache.lucene.codecs.lucene62.Lucene62Codec; import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat; import org.apache.lucene.codecs.simpletext.SimpleTextCodec; import org.apache.lucene.index.RandomCodec; @@ -181,8 +181,8 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule { codec = new AssertingCodec(); } else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) { codec = CompressingCodec.randomInstance(random); - } else if ("Lucene60".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene60"))) { - codec = new Lucene60Codec(RandomPicks.randomFrom(random, Lucene50StoredFieldsFormat.Mode.values())); + } else if ("Lucene62".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene62"))) { + codec = new Lucene62Codec(RandomPicks.randomFrom(random, Lucene50StoredFieldsFormat.Mode.values())); } else if (!"random".equals(TEST_CODEC)) { codec = Codec.forName(TEST_CODEC); } else if ("random".equals(TEST_POSTINGSFORMAT)) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java index d772ae321d3..b63216085b3 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java @@ -54,7 +54,7 @@ import org.apache.lucene.codecs.blockterms.LuceneFixedGap; import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat; import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat; import org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat; -import org.apache.lucene.codecs.lucene60.Lucene60Codec; +import org.apache.lucene.codecs.lucene62.Lucene62Codec; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; import org.apache.lucene.document.BinaryDocValuesField; @@ -911,7 +911,7 @@ public final class TestUtil { * This may be different than {@link Codec#getDefault()} because that is randomized. 
*/ public static Codec getDefaultCodec() { - return new Lucene60Codec(); + return new Lucene62Codec(); } /** From eb8b1a92d87c3670579f1c45e48762b21510107f Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Fri, 6 May 2016 19:02:41 -0400 Subject: [PATCH 03/16] LUCENE-6766: MultiXXX now refuse to merge if there is an index sort --- .../lucene50/Lucene50RWSegmentInfoFormat.java | 6 +- .../org/apache/lucene/index/DocIDMerger.java | 4 +- .../apache/lucene/index/MultiDocValues.java | 21 +++++++ .../org/apache/lucene/index/MultiFields.java | 9 ++- .../org/apache/lucene/index/MultiReader.java | 2 + .../apache/lucene/search/IndexSearcher.java | 40 ++++++++----- .../apache/lucene/index/TestIndexSorting.java | 18 ++++-- .../lucene/index/TestMultiDocValues.java | 57 +++++++++++++++++++ .../apache/lucene/index/TestMultiFields.java | 26 +++++++++ .../analyzing/AnalyzingInfixSuggester.java | 30 ++++++---- .../AnalyzingInfixSuggesterTest.java | 2 +- 11 files changed, 179 insertions(+), 36 deletions(-) diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50RWSegmentInfoFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50RWSegmentInfoFormat.java index d457243f67b..0a373b1dc76 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50RWSegmentInfoFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50RWSegmentInfoFormat.java @@ -94,10 +94,8 @@ public class Lucene50RWSegmentInfoFormat extends Lucene50SegmentInfoFormat { @Override public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException { final String fileName = IndexFileNames.segmentFileName(si.name, "", Lucene50SegmentInfoFormat.SI_EXTENSION); - // nocommit indexSort - if (si.getIndexSort() != null) { - throw new IllegalArgumentException("teach me to write indexSort"); - } + + assert si.getIndexSort() == null; try (IndexOutput output = dir.createOutput(fileName, ioContext)) { // Only add the file once we've successfully created it, else IFD assert can trip: diff --git a/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java b/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java index fb1cdf280f0..fdc705660b4 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java @@ -84,7 +84,9 @@ public class DocIDMerger { /** Reuse API, currently only used by postings during merge */ public void reset() { if (queue != null) { - assert queue.size() == 0; + queue.clear(); + // nocommit why does bloom filter wrapper trip this? 
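An aside for readers following the DocIDMerger hunks: the heart of the new class is a small priority-queue merge of per-segment doc ID streams. The sketch below is illustrative only, not the patch's actual class; the Sub interface and nextMappedDoc() are invented stand-ins for DocIDMerger's sub API, which maps segment-local doc IDs into the sorted merged space.

import java.util.List;
import java.util.PriorityQueue;
import java.util.function.IntConsumer;

class DocIDMergeSketch {
  // Hypothetical stand-in for DocIDMerger.Sub: yields ascending mapped doc IDs, -1 when exhausted.
  interface Sub {
    int nextMappedDoc();
  }

  // Interleave all subs into one ascending stream of mapped doc IDs.
  static void merge(List<Sub> subs, IntConsumer consume) {
    PriorityQueue<int[]> queue = new PriorityQueue<>((a, b) -> Integer.compare(a[0], b[0]));
    for (int i = 0; i < subs.size(); i++) {
      int doc = subs.get(i).nextMappedDoc();
      if (doc != -1) {
        queue.add(new int[] {doc, i});   // {mapped docID, sub index}
      }
    }
    while (!queue.isEmpty()) {
      int[] top = queue.poll();
      consume.accept(top[0]);
      int next = subs.get(top[1]).nextMappedDoc();
      if (next != -1) {
        queue.add(new int[] {next, top[1]});
      }
    }
  }
}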
+ // assert queue.size() == 0: "queue.size() = " + queue.size(); for(T sub : subs) { while (true) { int docID = sub.nextDoc(); diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java index 33947974bce..ae61183527b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java @@ -78,6 +78,9 @@ public class MultiDocValues { final int[] starts = new int[size+1]; for (int i = 0; i < size; i++) { LeafReaderContext context = leaves.get(i); + if (context.reader().getIndexSort() != null) { + throw new IllegalArgumentException("cannot handle index sort: reader=" + context.reader()); + } NumericDocValues v = context.reader().getNormValues(field); if (v == null) { v = DocValues.emptyNumeric(); @@ -120,6 +123,9 @@ public class MultiDocValues { final int[] starts = new int[size+1]; for (int i = 0; i < size; i++) { LeafReaderContext context = leaves.get(i); + if (context.reader().getIndexSort() != null) { + throw new IllegalArgumentException("cannot handle index sort: reader=" + context.reader()); + } NumericDocValues v = context.reader().getNumericDocValues(field); if (v == null) { v = DocValues.emptyNumeric(); @@ -165,6 +171,9 @@ public class MultiDocValues { final int[] starts = new int[size+1]; for (int i = 0; i < size; i++) { LeafReaderContext context = leaves.get(i); + if (context.reader().getIndexSort() != null) { + throw new IllegalArgumentException("cannot handle index sort: reader=" + context.reader()); + } Bits v = context.reader().getDocsWithField(field); if (v == null) { v = new Bits.MatchNoBits(context.reader().maxDoc()); @@ -210,6 +219,9 @@ public class MultiDocValues { final int[] starts = new int[size+1]; for (int i = 0; i < size; i++) { LeafReaderContext context = leaves.get(i); + if (context.reader().getIndexSort() != null) { + throw new IllegalArgumentException("cannot handle index sort: reader=" + context.reader()); + } BinaryDocValues v = context.reader().getBinaryDocValues(field); if (v == null) { v = DocValues.emptyBinary(); @@ -254,6 +266,9 @@ public class MultiDocValues { final int[] starts = new int[size+1]; for (int i = 0; i < size; i++) { LeafReaderContext context = leaves.get(i); + if (context.reader().getIndexSort() != null) { + throw new IllegalArgumentException("cannot handle index sort: reader=" + context.reader()); + } SortedNumericDocValues v = context.reader().getSortedNumericDocValues(field); if (v == null) { v = DocValues.emptySortedNumeric(context.reader().maxDoc()); @@ -312,6 +327,9 @@ public class MultiDocValues { final int[] starts = new int[size+1]; for (int i = 0; i < size; i++) { LeafReaderContext context = leaves.get(i); + if (context.reader().getIndexSort() != null) { + throw new IllegalArgumentException("cannot handle index sort: reader=" + context.reader()); + } SortedDocValues v = context.reader().getSortedDocValues(field); if (v == null) { v = DocValues.emptySorted(); @@ -352,6 +370,9 @@ public class MultiDocValues { final int[] starts = new int[size+1]; for (int i = 0; i < size; i++) { LeafReaderContext context = leaves.get(i); + if (context.reader().getIndexSort() != null) { + throw new IllegalArgumentException("cannot handle index sort: reader=" + context.reader()); + } SortedSetDocValues v = context.reader().getSortedSetDocValues(field); if (v == null) { v = DocValues.emptySortedSet(); diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiFields.java 
b/lucene/core/src/java/org/apache/lucene/index/MultiFields.java index 447e0aee4cb..d8e79acad77 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiFields.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiFields.java @@ -51,7 +51,7 @@ public final class MultiFields extends Fields { private final ReaderSlice[] subSlices; private final Map<String,Terms> terms = new ConcurrentHashMap<>(); - // nocommit should we somehow throw exc if you try to pass in "sorted" Fields? + // nocommit make test for sorted fields /** Returns a single {@link Fields} instance for this * reader, merging fields/terms/docs/positions on the * fly. This method will return null if the reader @@ -72,6 +72,9 @@ public final class MultiFields extends Fields { final List<ReaderSlice> slices = new ArrayList<>(leaves.size()); for (final LeafReaderContext ctx : leaves) { final LeafReader r = ctx.reader(); + if (r.getIndexSort() != null) { + throw new IllegalArgumentException("cannot handle index sort: reader=" + r); + } final Fields f = r.fields(); fields.add(f); slices.add(new ReaderSlice(ctx.docBase, r.maxDoc(), fields.size()-1)); @@ -107,6 +110,10 @@ public final class MultiFields extends Fields { for (int i = 0; i < size; i++) { // record all liveDocs, even if they are null final LeafReaderContext ctx = leaves.get(i); + if (ctx.reader().getIndexSort() != null) { + throw new IllegalArgumentException("cannot handle index sort: reader=" + ctx.reader()); + } + liveDocs[i] = ctx.reader().getLiveDocs(); starts[i] = ctx.docBase; } diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiReader.java b/lucene/core/src/java/org/apache/lucene/index/MultiReader.java index 8f1bb66ae63..15d170bd518 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiReader.java @@ -65,6 +65,8 @@ public class MultiReader extends BaseCompositeReader<IndexReader> { } } + // nocommit what if there is an indexSort?
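To make the intent of these new guards concrete, here is a minimal standalone sketch of the same check, written against the getIndexSort() accessor this patch series adds to LeafReader; the class and method names are illustrative, not part of the patch itself.

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;

class IndexSortGuardSketch {
  // A docID-concatenating merged view is only coherent when no leaf carries an
  // index sort, so refuse up front, mirroring the checks added to MultiDocValues
  // and MultiFields above.
  static void checkNoIndexSort(IndexReader reader) {
    for (LeafReaderContext ctx : reader.leaves()) {
      if (ctx.reader().getIndexSort() != null) {
        throw new IllegalArgumentException("cannot handle index sort: reader=" + ctx.reader());
      }
    }
  }
}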
+ @Override protected synchronized void doClose() throws IOException { IOException ioe = null; diff --git a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java index 3ac64c84c5d..c91fc773408 100644 --- a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java @@ -37,7 +37,6 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReaderContext; import org.apache.lucene.index.IndexWriter; // javadocs import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.index.Term; @@ -803,21 +802,34 @@ public class IndexSearcher { * @lucene.experimental */ public CollectionStatistics collectionStatistics(String field) throws IOException { - final int docCount; - final long sumTotalTermFreq; - final long sumDocFreq; + int docCount = 0; + long sumTotalTermFreq = 0; + long sumDocFreq = 0; assert field != null; - - Terms terms = MultiFields.getTerms(reader, field); - if (terms == null) { - docCount = 0; - sumTotalTermFreq = 0; - sumDocFreq = 0; - } else { - docCount = terms.getDocCount(); - sumTotalTermFreq = terms.getSumTotalTermFreq(); - sumDocFreq = terms.getSumDocFreq(); + + for(LeafReaderContext ctx : reader.leaves()) { + Terms terms = ctx.reader().fields().terms(field); + if (terms != null) { + int subDocCount = terms.getDocCount(); + if (subDocCount == -1) { + docCount = -1; + } else if (docCount != -1) { + docCount += subDocCount; + } + long subSumDocFreq = terms.getSumDocFreq(); + if (subSumDocFreq == -1) { + sumDocFreq = -1; + } else if (sumDocFreq != -1) { + sumDocFreq += subSumDocFreq; + } + long subSumTotalTermFreq = terms.getSumTotalTermFreq(); + if (subSumTotalTermFreq == -1) { + sumTotalTermFreq = -1; + } else if (sumTotalTermFreq != -1) { + sumTotalTermFreq += subSumTotalTermFreq; + } + } } return new CollectionStatistics(field, reader.maxDoc(), docCount, sumTotalTermFreq, sumDocFreq); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 8df81bab264..6e93986c415 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -137,7 +137,7 @@ public class TestIndexSorting extends LuceneTestCase { assertEquals(0, topDocs.totalHits); } else { assertEquals(1, topDocs.totalHits); - assertEquals(i, MultiDocValues.getNumericValues(reader, "id").get(topDocs.scoreDocs[0].doc)); + assertEquals(i, getNumericDocValue(reader, "id", topDocs.scoreDocs[0].doc)); Document document = reader.document(topDocs.scoreDocs[0].doc); assertEquals(Integer.toString(i), document.get("id")); } @@ -148,6 +148,14 @@ public class TestIndexSorting extends LuceneTestCase { dir.close(); } + private static long getNumericDocValue(IndexReader reader, String field, int docID) throws IOException { + // We can't use MultiDocValues because it gets angry about the sorting: + List leaves = reader.leaves(); + int sub = ReaderUtil.subIndex(docID, leaves); + LeafReaderContext leaf = leaves.get(sub); + return leaf.reader().getNumericDocValues(field).get(docID - leaf.docBase); + } + public void testSortOnMerge() throws IOException { testSortOnMerge(false); } @@ -241,7 +249,7 @@ public class TestIndexSorting 
extends LuceneTestCase { assertEquals(0, topDocs.totalHits); } else { assertEquals(1, topDocs.totalHits); - assertEquals(values.get(i).longValue(), MultiDocValues.getNumericValues(reader, "foo").get(topDocs.scoreDocs[0].doc)); + assertEquals(values.get(i).longValue(), getNumericDocValue(reader, "foo", topDocs.scoreDocs[0].doc)); } } reader.close(); @@ -335,7 +343,7 @@ public class TestIndexSorting extends LuceneTestCase { for (int i = 0; i < numDocs; ++i) { final TopDocs topDocs = searcher.search(new TermQuery(new Term("id", Integer.toString(i))), 1); assertEquals(1, topDocs.totalHits); - assertEquals(values.get(i).longValue(), MultiDocValues.getNumericValues(reader, "foo").get(topDocs.scoreDocs[0].doc)); + assertEquals(values.get(i).longValue(), getNumericDocValue(reader, "foo", topDocs.scoreDocs[0].doc)); } reader.close(); w.close(); @@ -380,8 +388,8 @@ public class TestIndexSorting extends LuceneTestCase { assertEquals(topDocs.totalHits, topDocs2.totalHits); if (topDocs.totalHits == 1) { assertEquals( - MultiDocValues.getNumericValues(reader, "foo").get(topDocs.scoreDocs[0].doc), - MultiDocValues.getNumericValues(reader2, "foo").get(topDocs2.scoreDocs[0].doc)); + getNumericDocValue(reader, "foo", topDocs.scoreDocs[0].doc), + getNumericDocValue(reader2, "foo", topDocs2.scoreDocs[0].doc)); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestMultiDocValues.java b/lucene/core/src/test/org/apache/lucene/index/TestMultiDocValues.java index 5b70c38a7fa..5ab54483f19 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestMultiDocValues.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestMultiDocValues.java @@ -26,6 +26,8 @@ import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; import org.apache.lucene.store.Directory; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; @@ -412,4 +414,59 @@ public class TestMultiDocValues extends LuceneTestCase { ir2.close(); dir.close(); } + + public void testNoIndexSort() throws Exception { + IndexWriterConfig iwc = newIndexWriterConfig(); + iwc.setIndexSort(new Sort(new SortField("foo", SortField.Type.INT))); + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, iwc); + w.addDocument(new Document()); + DirectoryReader.open(w).close(); + w.addDocument(new Document()); + // this makes a sorted segment: + w.forceMerge(1); + // this makes another segment, so that MultiDocValues isn't just a no-op: + w.addDocument(new Document()); + IndexReader r = DirectoryReader.open(w); + + String message = expectThrows(IllegalArgumentException.class, () -> { + MultiDocValues.getDocsWithField(r, "foo"); + }).getMessage(); + assertTrue(message.contains("cannot handle index sort")); + assertTrue(message.contains("indexSort=")); + + message = expectThrows(IllegalArgumentException.class, () -> { + MultiDocValues.getNumericValues(r, "foo"); + }).getMessage(); + assertTrue(message.contains("cannot handle index sort")); + assertTrue(message.contains("indexSort=")); + + message = expectThrows(IllegalArgumentException.class, () -> { + MultiDocValues.getBinaryValues(r, "foo"); + }).getMessage(); + assertTrue(message.contains("cannot handle index sort")); + assertTrue(message.contains("indexSort=")); + + message = expectThrows(IllegalArgumentException.class, 
() -> { + MultiDocValues.getSortedValues(r, "foo"); + }).getMessage(); + assertTrue(message.contains("cannot handle index sort")); + assertTrue(message.contains("indexSort=")); + + message = expectThrows(IllegalArgumentException.class, () -> { + MultiDocValues.getSortedSetValues(r, "foo"); + }).getMessage(); + assertTrue(message.contains("cannot handle index sort")); + assertTrue(message.contains("indexSort=")); + + message = expectThrows(IllegalArgumentException.class, () -> { + MultiDocValues.getSortedNumericValues(r, "foo"); + }).getMessage(); + assertTrue(message.contains("cannot handle index sort")); + assertTrue(message.contains("indexSort=")); + + r.close(); + w.close(); + dir.close(); + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestMultiFields.java b/lucene/core/src/test/org/apache/lucene/index/TestMultiFields.java index 27f2f1a3699..0aae9a1be90 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestMultiFields.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestMultiFields.java @@ -29,6 +29,8 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; import org.apache.lucene.store.Directory; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; @@ -199,4 +201,28 @@ public class TestMultiFields extends LuceneTestCase { r.close(); dir.close(); } + + public void testNoIndexSort() throws Exception { + IndexWriterConfig iwc = newIndexWriterConfig(); + iwc.setIndexSort(new Sort(new SortField("foo", SortField.Type.INT))); + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, iwc); + w.addDocument(new Document()); + DirectoryReader.open(w).close(); + w.addDocument(new Document()); + // this makes a sorted segment: + w.forceMerge(1); + // this makes another segment, so that MultiFields.getFields isn't just a no-op: + w.addDocument(new Document()); + IndexReader r = DirectoryReader.open(w); + + Exception e = expectThrows(IllegalArgumentException.class, () -> { + MultiFields.getFields(r); + }); + assertTrue(e.getMessage().contains("cannot handle index sort")); + assertTrue(e.getMessage().contains("indexSort=")); + r.close(); + w.close(); + dir.close(); + } } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java index 004aef44d94..7124aae0180 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java @@ -48,6 +48,7 @@ import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FilterLeafReader; import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReader; @@ -586,8 +587,8 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { // We sorted postings by weight during indexing, so we // only retrieve the first num hits now: Collector c2 = new EarlyTerminatingSortingCollector(c, SORT, num); - IndexSearcher searcher = searcherMgr.acquire(); List results = null; + 
IndexSearcher searcher = searcherMgr.acquire(); try { //System.out.println("got searcher=" + searcher); searcher.search(finalQuery, c2); @@ -607,6 +608,19 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { return results; } + private static BytesRef getBinaryDocValue(IndexReader reader, String field, int docID) throws IOException { + // We can't use MultiDocValues because it gets angry about the sorting: + List<LeafReaderContext> leaves = reader.leaves(); + int sub = ReaderUtil.subIndex(docID, leaves); + LeafReaderContext leaf = leaves.get(sub); + BinaryDocValues bdv = leaf.reader().getBinaryDocValues(field); + if (bdv == null) { + return null; + } else { + return bdv.get(docID - leaf.docBase); + } + } + /** * Create the results based on the search hits. * Can be overridden by subclass to add particular behavior (e.g. weight transformation). @@ -621,24 +635,20 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { boolean doHighlight, Set<String> matchedTokens, String prefixToken) throws IOException { - BinaryDocValues textDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), TEXT_FIELD_NAME); - // This will just be null if app didn't pass payloads to build(): // TODO: maybe just stored fields? they compress... - BinaryDocValues payloadsDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), "payloads"); List<LeafReaderContext> leaves = searcher.getIndexReader().leaves(); List<LookupResult> results = new ArrayList<>(); for (int i=0;i<hits.scoreDocs.length;i++) { ... diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java ... assertEquals("a penny saved is a penny <b>earned</b>", results.get(0).highlightKey); assertEquals(10, results.get(0).value); - assertEquals(new BytesRef("foobaz"), results.get(0).payload); + assertEquals("foobaz", results.get(0).payload.utf8ToString()); assertEquals("lend me your ear", results.get(1).key); assertEquals("lend me your ear", results.get(1).highlightKey); From 54650eccf397b20788bd08abaa09cef217b8cd3c Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Sat, 7 May 2016 11:45:59 -0400 Subject: [PATCH 04/16] LUCENE-6766: don't throw exc from MultiXXX if there is an index sort --- .../apache/lucene/index/MultiDocValues.java | 21 ------- .../org/apache/lucene/index/MultiFields.java | 9 --- .../org/apache/lucene/index/MultiSorter.java | 2 +- .../apache/lucene/search/IndexSearcher.java | 41 +++++-------- .../apache/lucene/index/TestIndexSorting.java | 18 ++---- .../lucene/index/TestMultiDocValues.java | 57 ------------------- .../apache/lucene/index/TestMultiFields.java | 26 --------- .../analyzing/AnalyzingInfixSuggester.java | 28 +++------ 8 files changed, 30 insertions(+), 172 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java index ae61183527b..33947974bce 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java @@ -78,9 +78,6 @@ public class MultiDocValues { final int[] starts = new int[size+1]; for (int i = 0; i < size; i++) { LeafReaderContext context = leaves.get(i); - if (context.reader().getIndexSort() != null) { - throw new IllegalArgumentException("cannot handle index sort: 
reader=" + context.reader()); - } NumericDocValues v = context.reader().getNumericDocValues(field); if (v == null) { v = DocValues.emptyNumeric(); @@ -171,9 +165,6 @@ public class MultiDocValues { final int[] starts = new int[size+1]; for (int i = 0; i < size; i++) { LeafReaderContext context = leaves.get(i); - if (context.reader().getIndexSort() != null) { - throw new IllegalArgumentException("cannot handle index sort: reader=" + context.reader()); - } Bits v = context.reader().getDocsWithField(field); if (v == null) { v = new Bits.MatchNoBits(context.reader().maxDoc()); @@ -219,9 +210,6 @@ public class MultiDocValues { final int[] starts = new int[size+1]; for (int i = 0; i < size; i++) { LeafReaderContext context = leaves.get(i); - if (context.reader().getIndexSort() != null) { - throw new IllegalArgumentException("cannot handle index sort: reader=" + context.reader()); - } BinaryDocValues v = context.reader().getBinaryDocValues(field); if (v == null) { v = DocValues.emptyBinary(); @@ -266,9 +254,6 @@ public class MultiDocValues { final int[] starts = new int[size+1]; for (int i = 0; i < size; i++) { LeafReaderContext context = leaves.get(i); - if (context.reader().getIndexSort() != null) { - throw new IllegalArgumentException("cannot handle index sort: reader=" + context.reader()); - } SortedNumericDocValues v = context.reader().getSortedNumericDocValues(field); if (v == null) { v = DocValues.emptySortedNumeric(context.reader().maxDoc()); @@ -327,9 +312,6 @@ public class MultiDocValues { final int[] starts = new int[size+1]; for (int i = 0; i < size; i++) { LeafReaderContext context = leaves.get(i); - if (context.reader().getIndexSort() != null) { - throw new IllegalArgumentException("cannot handle index sort: reader=" + context.reader()); - } SortedDocValues v = context.reader().getSortedDocValues(field); if (v == null) { v = DocValues.emptySorted(); @@ -370,9 +352,6 @@ public class MultiDocValues { final int[] starts = new int[size+1]; for (int i = 0; i < size; i++) { LeafReaderContext context = leaves.get(i); - if (context.reader().getIndexSort() != null) { - throw new IllegalArgumentException("cannot handle index sort: reader=" + context.reader()); - } SortedSetDocValues v = context.reader().getSortedSetDocValues(field); if (v == null) { v = DocValues.emptySortedSet(); diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiFields.java b/lucene/core/src/java/org/apache/lucene/index/MultiFields.java index d8e79acad77..1736bace115 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiFields.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiFields.java @@ -51,8 +51,6 @@ public final class MultiFields extends Fields { private final ReaderSlice[] subSlices; private final Map terms = new ConcurrentHashMap<>(); - // nocommit make test for sorted fields - /** Returns a single {@link Fields} instance for this * reader, merging fields/terms/docs/positions on the * fly. 
This method will return null if the reader @@ -72,9 +70,6 @@ public final class MultiFields extends Fields { final List<ReaderSlice> slices = new ArrayList<>(leaves.size()); for (final LeafReaderContext ctx : leaves) { final LeafReader r = ctx.reader(); - if (r.getIndexSort() != null) { - throw new IllegalArgumentException("cannot handle index sort: reader=" + r); - } final Fields f = r.fields(); fields.add(f); slices.add(new ReaderSlice(ctx.docBase, r.maxDoc(), fields.size()-1)); @@ -110,10 +105,6 @@ public final class MultiFields extends Fields { for (int i = 0; i < size; i++) { // record all liveDocs, even if they are null final LeafReaderContext ctx = leaves.get(i); - if (ctx.reader().getIndexSort() != null) { - throw new IllegalArgumentException("cannot handle index sort: reader=" + ctx.reader()); - } - liveDocs[i] = ctx.reader().getLiveDocs(); starts[i] = ctx.docBase; } diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index 062dde9dc5d..7f71eb55880 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -123,7 +123,7 @@ final class MultiSorter { private static CrossReaderComparator getComparator(List readers, SortField sortField) throws IOException { switch(sortField.getType()) { - // TODO: use global ords for string sort + // nocommit: use segment-local ords for string sort case INT: { List values = new ArrayList<>(); diff --git a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java index c91fc773408..b81b8079a1c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java @@ -37,6 +37,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReaderContext; import org.apache.lucene.index.IndexWriter; // javadocs import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.index.Term; @@ -802,35 +803,23 @@ public class IndexSearcher { * @lucene.experimental */ public CollectionStatistics collectionStatistics(String field) throws IOException { - int docCount = 0; - long sumTotalTermFreq = 0; - long sumDocFreq = 0; + final int docCount; + final long sumTotalTermFreq; + final long sumDocFreq; assert field != null; - - for(LeafReaderContext ctx : reader.leaves()) { - Terms terms = ctx.reader().fields().terms(field); - if (terms != null) { - int subDocCount = terms.getDocCount(); - if (subDocCount == -1) { - docCount = -1; - } else if (docCount != -1) { - docCount += subDocCount; - } - long subSumDocFreq = terms.getSumDocFreq(); - if (subSumDocFreq == -1) { - sumDocFreq = -1; - } else if (sumDocFreq != -1) { - sumDocFreq += subSumDocFreq; - } - long subSumTotalTermFreq = terms.getSumTotalTermFreq(); - if (subSumTotalTermFreq == -1) { - sumTotalTermFreq = -1; - } else if (sumTotalTermFreq != -1) { - sumTotalTermFreq += subSumTotalTermFreq; - } - } + + Terms terms = MultiFields.getTerms(reader, field); + if (terms == null) { + docCount = 0; + sumTotalTermFreq = 0; + sumDocFreq = 0; + } else { + docCount = terms.getDocCount(); + sumTotalTermFreq = terms.getSumTotalTermFreq(); + sumDocFreq = terms.getSumDocFreq(); } + return new CollectionStatistics(field, reader.maxDoc(), docCount,
sumTotalTermFreq, sumDocFreq); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 6e93986c415..8df81bab264 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -137,7 +137,7 @@ public class TestIndexSorting extends LuceneTestCase { assertEquals(0, topDocs.totalHits); } else { assertEquals(1, topDocs.totalHits); - assertEquals(i, getNumericDocValue(reader, "id", topDocs.scoreDocs[0].doc)); + assertEquals(i, MultiDocValues.getNumericValues(reader, "id").get(topDocs.scoreDocs[0].doc)); Document document = reader.document(topDocs.scoreDocs[0].doc); assertEquals(Integer.toString(i), document.get("id")); } @@ -148,14 +148,6 @@ public class TestIndexSorting extends LuceneTestCase { dir.close(); } - private static long getNumericDocValue(IndexReader reader, String field, int docID) throws IOException { - // We can't use MultiDocValues because it gets angry about the sorting: - List leaves = reader.leaves(); - int sub = ReaderUtil.subIndex(docID, leaves); - LeafReaderContext leaf = leaves.get(sub); - return leaf.reader().getNumericDocValues(field).get(docID - leaf.docBase); - } - public void testSortOnMerge() throws IOException { testSortOnMerge(false); } @@ -249,7 +241,7 @@ public class TestIndexSorting extends LuceneTestCase { assertEquals(0, topDocs.totalHits); } else { assertEquals(1, topDocs.totalHits); - assertEquals(values.get(i).longValue(), getNumericDocValue(reader, "foo", topDocs.scoreDocs[0].doc)); + assertEquals(values.get(i).longValue(), MultiDocValues.getNumericValues(reader, "foo").get(topDocs.scoreDocs[0].doc)); } } reader.close(); @@ -343,7 +335,7 @@ public class TestIndexSorting extends LuceneTestCase { for (int i = 0; i < numDocs; ++i) { final TopDocs topDocs = searcher.search(new TermQuery(new Term("id", Integer.toString(i))), 1); assertEquals(1, topDocs.totalHits); - assertEquals(values.get(i).longValue(), getNumericDocValue(reader, "foo", topDocs.scoreDocs[0].doc)); + assertEquals(values.get(i).longValue(), MultiDocValues.getNumericValues(reader, "foo").get(topDocs.scoreDocs[0].doc)); } reader.close(); w.close(); @@ -388,8 +380,8 @@ public class TestIndexSorting extends LuceneTestCase { assertEquals(topDocs.totalHits, topDocs2.totalHits); if (topDocs.totalHits == 1) { assertEquals( - getNumericDocValue(reader, "foo", topDocs.scoreDocs[0].doc), - getNumericDocValue(reader2, "foo", topDocs2.scoreDocs[0].doc)); + MultiDocValues.getNumericValues(reader, "foo").get(topDocs.scoreDocs[0].doc), + MultiDocValues.getNumericValues(reader2, "foo").get(topDocs2.scoreDocs[0].doc)); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestMultiDocValues.java b/lucene/core/src/test/org/apache/lucene/index/TestMultiDocValues.java index 5ab54483f19..5b70c38a7fa 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestMultiDocValues.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestMultiDocValues.java @@ -26,8 +26,6 @@ import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.document.SortedSetDocValuesField; -import org.apache.lucene.search.Sort; -import org.apache.lucene.search.SortField; import org.apache.lucene.store.Directory; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; @@ -414,59 +412,4 
@@ public class TestMultiDocValues extends LuceneTestCase { ir2.close(); dir.close(); } - - public void testNoIndexSort() throws Exception { - IndexWriterConfig iwc = newIndexWriterConfig(); - iwc.setIndexSort(new Sort(new SortField("foo", SortField.Type.INT))); - Directory dir = newDirectory(); - IndexWriter w = new IndexWriter(dir, iwc); - w.addDocument(new Document()); - DirectoryReader.open(w).close(); - w.addDocument(new Document()); - // this makes a sorted segment: - w.forceMerge(1); - // this makes another segment, so that MultiDocValues isn't just a no-op: - w.addDocument(new Document()); - IndexReader r = DirectoryReader.open(w); - - String message = expectThrows(IllegalArgumentException.class, () -> { - MultiDocValues.getDocsWithField(r, "foo"); - }).getMessage(); - assertTrue(message.contains("cannot handle index sort")); - assertTrue(message.contains("indexSort=")); - - message = expectThrows(IllegalArgumentException.class, () -> { - MultiDocValues.getNumericValues(r, "foo"); - }).getMessage(); - assertTrue(message.contains("cannot handle index sort")); - assertTrue(message.contains("indexSort=")); - - message = expectThrows(IllegalArgumentException.class, () -> { - MultiDocValues.getBinaryValues(r, "foo"); - }).getMessage(); - assertTrue(message.contains("cannot handle index sort")); - assertTrue(message.contains("indexSort=")); - - message = expectThrows(IllegalArgumentException.class, () -> { - MultiDocValues.getSortedValues(r, "foo"); - }).getMessage(); - assertTrue(message.contains("cannot handle index sort")); - assertTrue(message.contains("indexSort=")); - - message = expectThrows(IllegalArgumentException.class, () -> { - MultiDocValues.getSortedSetValues(r, "foo"); - }).getMessage(); - assertTrue(message.contains("cannot handle index sort")); - assertTrue(message.contains("indexSort=")); - - message = expectThrows(IllegalArgumentException.class, () -> { - MultiDocValues.getSortedNumericValues(r, "foo"); - }).getMessage(); - assertTrue(message.contains("cannot handle index sort")); - assertTrue(message.contains("indexSort=")); - - r.close(); - w.close(); - dir.close(); - } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestMultiFields.java b/lucene/core/src/test/org/apache/lucene/index/TestMultiFields.java index 0aae9a1be90..27f2f1a3699 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestMultiFields.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestMultiFields.java @@ -29,8 +29,6 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.Sort; -import org.apache.lucene.search.SortField; import org.apache.lucene.store.Directory; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; @@ -201,28 +199,4 @@ public class TestMultiFields extends LuceneTestCase { r.close(); dir.close(); } - - public void testNoIndexSort() throws Exception { - IndexWriterConfig iwc = newIndexWriterConfig(); - iwc.setIndexSort(new Sort(new SortField("foo", SortField.Type.INT))); - Directory dir = newDirectory(); - IndexWriter w = new IndexWriter(dir, iwc); - w.addDocument(new Document()); - DirectoryReader.open(w).close(); - w.addDocument(new Document()); - // this makes a sorted segment: - w.forceMerge(1); - // this makes another segment, so that MultiFields.getFields isn't just a no-op: - w.addDocument(new Document()); - IndexReader r = DirectoryReader.open(w); - - Exception e = 
expectThrows(IllegalArgumentException.class, () -> { - MultiFields.getFields(r); - }); - assertTrue(e.getMessage().contains("cannot handle index sort")); - assertTrue(e.getMessage().contains("indexSort=")); - r.close(); - w.close(); - dir.close(); - } } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java index 7124aae0180..16e9406310f 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java @@ -48,7 +48,6 @@ import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FilterLeafReader; import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReader; @@ -608,19 +607,6 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { return results; } - private static BytesRef getBinaryDocValue(IndexReader reader, String field, int docID) throws IOException { - // We can't use MultiDocValues because it gets angry about the sorting: - List<LeafReaderContext> leaves = reader.leaves(); - int sub = ReaderUtil.subIndex(docID, leaves); - LeafReaderContext leaf = leaves.get(sub); - BinaryDocValues bdv = leaf.reader().getBinaryDocValues(field); - if (bdv == null) { - return null; - } else { - return bdv.get(docID - leaf.docBase); - } - } - /** * Create the results based on the search hits. * Can be overridden by subclass to add particular behavior (e.g. weight transformation). @@ -635,20 +621,24 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { boolean doHighlight, Set<String> matchedTokens, String prefixToken) throws IOException { + BinaryDocValues textDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), TEXT_FIELD_NAME); + // This will just be null if app didn't pass payloads to build(): + // TODO: maybe just stored fields? they compress...
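For context, the getBinaryDocValue helper deleted above (like its numeric twin in TestIndexSorting) resolved a top-level docID to the leaf that owns it before reading a per-segment doc value; the added lines replace that with a single MultiDocValues view. A minimal standalone sketch of the per-leaf pattern, assuming the Lucene 6.x doc-values API used throughout this patch (the class name is ours):

  import java.io.IOException;
  import java.util.List;

  import org.apache.lucene.index.BinaryDocValues;
  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.index.LeafReaderContext;
  import org.apache.lucene.index.ReaderUtil;
  import org.apache.lucene.util.BytesRef;

  class PerLeafLookup {
    // Find the leaf containing docID via binary search over the leaf docBases,
    // then read the value with a leaf-local docID (top-level docID minus docBase).
    static BytesRef getBinaryDocValue(IndexReader reader, String field, int docID) throws IOException {
      List<LeafReaderContext> leaves = reader.leaves();
      int sub = ReaderUtil.subIndex(docID, leaves);
      LeafReaderContext leaf = leaves.get(sub);
      BinaryDocValues bdv = leaf.reader().getBinaryDocValues(field);
      return bdv == null ? null : bdv.get(docID - leaf.docBase);
    }
  }

The helper only existed because MultiDocValues used to refuse sorted indexes; with that restriction gone (see the tests removed above), the one-liner MultiDocValues.getBinaryValues(reader, field) suffices.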
+ BinaryDocValues payloadsDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), "payloads"); List leaves = searcher.getIndexReader().leaves(); List results = new ArrayList<>(); for (int i=0;i Date: Sat, 7 May 2016 11:46:17 -0400 Subject: [PATCH 05/16] LUCENE-6766: implement STRING sort, using segment-local ordinals --- .../org/apache/lucene/index/MergeState.java | 45 +++++++++---------- .../org/apache/lucene/index/MultiSorter.java | 42 ++++++++++++++++- .../apache/lucene/index/TestIndexSorting.java | 34 ++++++++++++++ .../lucene/index/MockRandomMergePolicy.java | 1 + 4 files changed, 96 insertions(+), 26 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java index 32e048086a7..31065e31656 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java @@ -213,34 +213,29 @@ public class MergeState { //System.out.println("MergeState.maybeSortReaders indexSort=" + indexSort); for (CodecReader leaf : originalReaders) { - if (leaf instanceof SegmentReader) { - SegmentReader segmentReader = (SegmentReader) leaf; - Sort segmentSort = segmentReader.getSegmentInfo().info.getIndexSort(); - //System.out.println(" leaf=" + leaf + " sort=" + segmentSort); + Sort segmentSort = leaf.getIndexSort(); + //System.out.println(" leaf=" + leaf + " sort=" + segmentSort); - if (segmentSort == null) { - // TODO: fix IW to also sort when flushing? It's somewhat tricky because of stored fields and term vectors, which write "live" - // to the files on each indexed document: + if (segmentSort == null) { + // TODO: fix IW to also sort when flushing? It's somewhat tricky because of stored fields and term vectors, which write "live" + // to the files on each indexed document: - // This segment was written by flush, so documents are not yet sorted, so we sort them now: - Sorter.DocMap sortDocMap = sorter.sort(leaf); - if (sortDocMap != null) { - //System.out.println(" sort!"); - // nocommit what about MergedReaderWrapper in here? - leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(leaf, sortDocMap)); - leafDocMaps[readers.size()] = new DocMap() { - @Override - public int get(int docID) { - return sortDocMap.oldToNew(docID); - } - }; - } - - } else if (segmentSort.equals(indexSort) == false) { - throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort + " but to-be-merged segment has sort=" + segmentSort); + // This segment was written by flush, so documents are not yet sorted, so we sort them now: + Sorter.DocMap sortDocMap = sorter.sort(leaf); + if (sortDocMap != null) { + //System.out.println(" sort!"); + // nocommit what about MergedReaderWrapper in here? 
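For intuition on the Sorter.DocMap being wrapped in the MergeState hunk above: it records the permutation between a flushed segment's original docIDs and their positions once the segment is sorted, and the merge adapts its oldToNew side into a MergeState.DocMap. A toy standalone illustration (all names here are ours, not Lucene API):

  import java.util.Arrays;
  import java.util.Comparator;

  class DocPermutationDemo {
    public static void main(String[] args) {
      long[] values = {18, -1, 7};          // sort key for original docIDs 0..2
      Integer[] newToOld = {0, 1, 2};
      Arrays.sort(newToOld, Comparator.comparingLong(d -> values[d]));
      // newToOld is now [1, 2, 0]: sorted position -> original docID
      int[] oldToNew = new int[newToOld.length];
      for (int newID = 0; newID < newToOld.length; newID++) {
        oldToNew[newToOld[newID]] = newID;  // invert: original docID -> sorted position
      }
      System.out.println(Arrays.toString(oldToNew)); // prints [2, 0, 1]
    }
  }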
+ leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(leaf, sortDocMap)); + leafDocMaps[readers.size()] = new DocMap() { + @Override + public int get(int docID) { + return sortDocMap.oldToNew(docID); + } + }; } - } else { - throw new IllegalArgumentException("cannot sort index with foreign readers; leaf=" + leaf); + + } else if (segmentSort.equals(indexSort) == false) { + throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort + " but to-be-merged segment has sort=" + segmentSort); } readers.add(leaf); diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index 7f71eb55880..ca1ebe57780 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -123,7 +123,47 @@ final class MultiSorter { private static CrossReaderComparator getComparator(List readers, SortField sortField) throws IOException { switch(sortField.getType()) { - // ncommit: use segment-local ords for string sort + + case STRING: + { + // this uses the efficient segment-local ordinal map: + MultiReader multiReader = new MultiReader(readers.toArray(new LeafReader[readers.size()])); + final SortedDocValues sorted = MultiDocValues.getSortedValues(multiReader, sortField.getField()); + final int[] docStarts = new int[readers.size()]; + List leaves = multiReader.leaves(); + for(int i=0;i values = new ArrayList<>(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 8df81bab264..1da6c82cc0e 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -79,6 +79,40 @@ import org.junit.BeforeClass; public class TestIndexSorting extends LuceneTestCase { + public void testBasicString() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + Sort indexSort = new Sort(new SortField("foo", SortField.Type.STRING)); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new SortedDocValuesField("foo", new BytesRef("zzz"))); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + doc = new Document(); + doc.add(new SortedDocValuesField("foo", new BytesRef("aaa"))); + w.addDocument(doc); + w.commit(); + + doc = new Document(); + doc.add(new SortedDocValuesField("foo", new BytesRef("mmm"))); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + SortedDocValues values = leaf.getSortedDocValues("foo"); + assertEquals("aaa", values.get(0).utf8ToString()); + assertEquals("mmm", values.get(1).utf8ToString()); + assertEquals("zzz", values.get(2).utf8ToString()); + r.close(); + w.close(); + dir.close(); + } + public void testSortOnMerge(boolean withDeletes) throws IOException { Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java b/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java index 93898881416..f32e4d3c118 
100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java @@ -146,6 +146,7 @@ public class MockRandomMergePolicy extends MergePolicy { @Override public CodecReader wrapForMerge(CodecReader reader) throws IOException { + // wrap it (e.g. prevent bulk merge etc) // TODO: cut this over to FilterCodecReader api, we can explicitly // enable/disable bulk merge for portions of the index we want. From 8098a911beb9786570065c6e810c96453079153d Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Sat, 7 May 2016 18:36:13 -0400 Subject: [PATCH 06/16] LUCENE-6766: add float, double --- .../SimpleTextSegmentInfoFormat.java | 36 ++++- .../lucene62/Lucene62SegmentInfoFormat.java | 32 ++++ .../org/apache/lucene/index/CheckIndex.java | 13 +- .../org/apache/lucene/index/MultiSorter.java | 111 ++++++++++++-- .../lucene/index/SortingLeafReader.java | 1 - .../apache/lucene/index/TestIndexSorting.java | 139 ++++++++++++++++++ 6 files changed, 313 insertions(+), 19 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java index bf9d3ded573..8ab45be2ede 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java @@ -168,6 +168,12 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { case "int": type = SortField.Type.INT; break; + case "double": + type = SortField.Type.DOUBLE; + break; + case "float": + type = SortField.Type.FLOAT; + break; default: throw new CorruptIndexException("unable to parse sort type string: " + typeAsString, input); } @@ -216,6 +222,26 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { break; } break; + case DOUBLE: + switch (missingLastAsString) { + case "null": + missingValue = null; + break; + default: + missingValue = Double.parseDouble(missingLastAsString); + break; + } + break; + case FLOAT: + switch (missingLastAsString) { + case "null": + missingValue = null; + break; + default: + missingValue = Float.parseFloat(missingLastAsString); + break; + } + break; // nocommit need the rest default: throw new AssertionError(); @@ -338,6 +364,12 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { case INT: sortType = "int"; break; + case DOUBLE: + sortType = "double"; + break; + case FLOAT: + sortType = "float"; + break; // nocommit the rest: default: throw new IllegalStateException("Unexpected sort type: " + sortField.getType()); @@ -358,10 +390,8 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { missing = "first"; } else if (missingValue == SortField.STRING_LAST) { missing = "last"; - } else if (missingValue instanceof Long) { - missing = Long.toString((Long) missingValue); } else { - throw new IllegalStateException("Unexpected missing sort value: " + missingValue); + missing = missingValue.toString(); } SimpleTextUtil.write(output, missing, scratch); SimpleTextUtil.writeNewline(output); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java index 53d273474a8..bb52eebc06e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java 
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java @@ -121,6 +121,12 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { case 2: sortType = SortField.Type.INT; break; + case 3: + sortType = SortField.Type.DOUBLE; + break; + case 4: + sortType = SortField.Type.FLOAT; + break; default: throw new CorruptIndexException("invalid index sort field type ID: " + sortTypeID, input); } @@ -163,6 +169,18 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { } missingValue = input.readInt(); break; + case DOUBLE: + if (b != 1) { + throw new CorruptIndexException("invalid missing value flag: " + b, input); + } + missingValue = Double.longBitsToDouble(input.readLong()); + break; + case FLOAT: + if (b != 1) { + throw new CorruptIndexException("invalid missing value flag: " + b, input); + } + missingValue = Float.intBitsToFloat(input.readInt()); + break; default: throw new AssertionError("unhandled sortType=" + sortType); } @@ -240,6 +258,12 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { case INT: sortTypeID = 2; break; + case DOUBLE: + sortTypeID = 3; + break; + case FLOAT: + sortTypeID = 4; + break; // nocommit the rest: default: throw new IllegalStateException("Unexpected sort type: " + sortField.getType()); @@ -270,6 +294,14 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { output.writeByte((byte) 1); output.writeInt(((Integer) missingValue).intValue()); break; + case DOUBLE: + output.writeByte((byte) 1); + output.writeLong(Double.doubleToLongBits(((Double) missingValue).doubleValue())); + break; + case FLOAT: + output.writeByte((byte) 1); + output.writeLong(Float.floatToIntBits(((Float) missingValue).floatValue())); + break; // nocommit the rest: default: throw new IllegalStateException("Unexpected sort type: " + sortField.getType()); diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index fb2dc80ce3f..1031d22e22a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -648,6 +648,7 @@ public final class CheckIndex implements Closeable { int toLoseDocCount = info.info.maxDoc(); SegmentReader reader = null; + Sort previousIndexSort = null; try { msg(infoStream, " version=" + (version == null ? "3.0" : version)); @@ -661,6 +662,13 @@ public final class CheckIndex implements Closeable { Sort indexSort = info.info.getIndexSort(); if (indexSort != null) { msg(infoStream, " sort=" + indexSort); + if (previousIndexSort != null) { + if (previousIndexSort.equals(indexSort) == false) { + throw new RuntimeException("index sort changed from " + previousIndexSort + " to " + indexSort); + } + } else { + previousIndexSort = indexSort; + } } segInfoStat.numFiles = info.files().size(); segInfoStat.sizeMB = info.sizeInBytes()/(1024.*1024.); @@ -835,8 +843,6 @@ public final class CheckIndex implements Closeable { for (int i = 0; i < fields.length; i++) { reverseMul[i] = fields[i].getReverse() ? -1 : 1; comparators[i] = fields[i].getComparator(1, i).getLeafComparator(readerContext); - // nocommit we prevent SCORE? 
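The CheckIndex hunk above makes the cross-segment invariant explicit: segments may be unsorted (freshly flushed), but every segment that declares an index sort must declare the same one. Distilled into a hedged standalone sketch (class and method names are ours; getIndexSort is the accessor this patch series adds to SegmentInfo):

  import java.util.List;

  import org.apache.lucene.index.SegmentCommitInfo;
  import org.apache.lucene.search.Sort;

  class IndexSortCheck {
    static void assertSingleIndexSort(List<SegmentCommitInfo> infos) {
      Sort previousIndexSort = null;
      for (SegmentCommitInfo info : infos) {
        Sort indexSort = info.info.getIndexSort();
        if (indexSort == null) {
          continue; // unsorted segments are allowed
        }
        if (previousIndexSort == null) {
          previousIndexSort = indexSort;
        } else if (previousIndexSort.equals(indexSort) == false) {
          throw new RuntimeException("index sort changed from " + previousIndexSort + " to " + indexSort);
        }
      }
    }
  }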
- //comparators[i].setScorer(FAKESCORER); } int maxDoc = reader.maxDoc(); @@ -2585,9 +2591,6 @@ public final class CheckIndex implements Closeable { } } - // nocommit must check index is sorted, if it claims to be - // nocommit must check that all segments have the same sort, if any segment is sorted - /** * Parse command line args into fields * @param args The command line arguments diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index ca1ebe57780..4c78aa1233f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -164,6 +164,51 @@ final class MultiSorter { }; } + case LONG: + { + List values = new ArrayList<>(); + List docsWithFields = new ArrayList<>(); + for(CodecReader reader : readers) { + values.add(DocValues.getNumeric(reader, sortField.getField())); + docsWithFields.add(DocValues.getDocsWithField(reader, sortField.getField())); + } + + final int reverseMul; + if (sortField.getReverse()) { + reverseMul = -1; + } else { + reverseMul = 1; + } + + final int missingValue; + + if (sortField.getMissingValue() != null) { + missingValue = (Integer) sortField.getMissingValue(); + } else { + missingValue = 0; + } + + return new CrossReaderComparator() { + @Override + public int compare(int readerIndexA, int docIDA, int readerIndexB, int docIDB) { + long valueA; + if (docsWithFields.get(readerIndexA).get(docIDA)) { + valueA = (int) values.get(readerIndexA).get(docIDA); + } else { + valueA = missingValue; + } + + long valueB; + if (docsWithFields.get(readerIndexB).get(docIDB)) { + valueB = (int) values.get(readerIndexB).get(docIDB); + } else { + valueB = missingValue; + } + return reverseMul * Long.compare(valueA, valueB); + } + }; + } + case INT: { List values = new ArrayList<>(); @@ -208,8 +253,8 @@ final class MultiSorter { } }; } - case LONG: - // nocommit refactor/share at least numerics here: + + case DOUBLE: { List values = new ArrayList<>(); List docsWithFields = new ArrayList<>(); @@ -225,34 +270,80 @@ final class MultiSorter { reverseMul = 1; } - final int missingValue; + final double missingValue; if (sortField.getMissingValue() != null) { - missingValue = (Integer) sortField.getMissingValue(); + missingValue = (Double) sortField.getMissingValue(); } else { - missingValue = 0; + missingValue = 0.0; } return new CrossReaderComparator() { @Override public int compare(int readerIndexA, int docIDA, int readerIndexB, int docIDB) { - long valueA; + double valueA; if (docsWithFields.get(readerIndexA).get(docIDA)) { - valueA = (int) values.get(readerIndexA).get(docIDA); + valueA = Double.longBitsToDouble(values.get(readerIndexA).get(docIDA)); } else { valueA = missingValue; } - long valueB; + double valueB; if (docsWithFields.get(readerIndexB).get(docIDB)) { - valueB = (int) values.get(readerIndexB).get(docIDB); + valueB = Double.longBitsToDouble(values.get(readerIndexB).get(docIDB)); } else { valueB = missingValue; } - return reverseMul * Long.compare(valueA, valueB); + return reverseMul * Double.compare(valueA, valueB); } }; } + + case FLOAT: + { + List values = new ArrayList<>(); + List docsWithFields = new ArrayList<>(); + for(CodecReader reader : readers) { + values.add(DocValues.getNumeric(reader, sortField.getField())); + docsWithFields.add(DocValues.getDocsWithField(reader, sortField.getField())); + } + + final int reverseMul; + if (sortField.getReverse()) { + reverseMul = -1; + } else { + reverseMul = 
1; + } + + final float missingValue; + + if (sortField.getMissingValue() != null) { + missingValue = (Float) sortField.getMissingValue(); + } else { + missingValue = 0.0f; + } + + return new CrossReaderComparator() { + @Override + public int compare(int readerIndexA, int docIDA, int readerIndexB, int docIDB) { + float valueA; + if (docsWithFields.get(readerIndexA).get(docIDA)) { + valueA = Float.intBitsToFloat((int) values.get(readerIndexA).get(docIDA)); + } else { + valueA = missingValue; + } + + float valueB; + if (docsWithFields.get(readerIndexB).get(docIDB)) { + valueB = Float.intBitsToFloat((int) values.get(readerIndexB).get(docIDB)); + } else { + valueB = missingValue; + } + return reverseMul * Float.compare(valueA, valueB); + } + }; + } + // nocommit do the rest: default: throw new IllegalArgumentException("unhandled SortField.getType()=" + sortField.getType()); diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java index b6558f7fd15..70d5d204439 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java @@ -840,7 +840,6 @@ class SortingLeafReader extends FilterLeafReader { if (inPointValues == null) { return null; } else { - // nocommit make sure this is tested return new SortingPointValues(inPointValues, docMap); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 1da6c82cc0e..4e2606374f0 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -36,9 +36,11 @@ import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.BinaryPoint; import org.apache.lucene.document.Document; +import org.apache.lucene.document.DoubleDocValuesField; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.FloatDocValuesField; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.SortedNumericDocValuesField; @@ -72,6 +74,7 @@ import org.junit.BeforeClass; // nocommit test tie break // nocommit test multiple sorts // nocommit test update dvs +// nocommit test missing value // nocommit test EarlyTerminatingCollector @@ -113,6 +116,142 @@ public class TestIndexSorting extends LuceneTestCase { dir.close(); } + public void testBasicLong() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + Sort indexSort = new Sort(new SortField("foo", SortField.Type.LONG)); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new NumericDocValuesField("foo", 18)); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + doc = new Document(); + doc.add(new NumericDocValuesField("foo", -1)); + w.addDocument(doc); + w.commit(); + + doc = new Document(); + doc.add(new NumericDocValuesField("foo", 7)); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); 
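// Editorial aside (not part of the patch): DOUBLE and FLOAT index sorts piggyback
// on NumericDocValues by storing raw IEEE-754 bits, which is why the comparators
// above decode with Double.longBitsToDouble / Float.intBitsToFloat before calling
// Double.compare / Float.compare, and why the testBasicDouble/testBasicFloat
// assertions below decode the same way. Round trip, for reference:
//   long dbits = Double.doubleToLongBits(-1.0);   // raw bits stored in the DV field
//   double d = Double.longBitsToDouble(dbits);    // d == -1.0
//   int fbits = Float.floatToIntBits(7.0f);
//   float f = Float.intBitsToFloat(fbits);        // f == 7.0f
// Comparing the raw bits as integers would reverse the relative order of negative
// values, so decoding before comparison is required.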
+ LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + NumericDocValues values = leaf.getNumericDocValues("foo"); + assertEquals(-1, values.get(0)); + assertEquals(7, values.get(1)); + assertEquals(18, values.get(2)); + r.close(); + w.close(); + dir.close(); + } + + public void testBasicInt() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + Sort indexSort = new Sort(new SortField("foo", SortField.Type.INT)); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new NumericDocValuesField("foo", 18)); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + doc = new Document(); + doc.add(new NumericDocValuesField("foo", -1)); + w.addDocument(doc); + w.commit(); + + doc = new Document(); + doc.add(new NumericDocValuesField("foo", 7)); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + NumericDocValues values = leaf.getNumericDocValues("foo"); + assertEquals(-1, values.get(0)); + assertEquals(7, values.get(1)); + assertEquals(18, values.get(2)); + r.close(); + w.close(); + dir.close(); + } + + public void testBasicDouble() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + Sort indexSort = new Sort(new SortField("foo", SortField.Type.DOUBLE)); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new DoubleDocValuesField("foo", 18.0)); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + doc = new Document(); + doc.add(new DoubleDocValuesField("foo", -1.0)); + w.addDocument(doc); + w.commit(); + + doc = new Document(); + doc.add(new DoubleDocValuesField("foo", 7.0)); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + NumericDocValues values = leaf.getNumericDocValues("foo"); + assertEquals(-1.0, Double.longBitsToDouble(values.get(0)), 0.0); + assertEquals(7.0, Double.longBitsToDouble(values.get(1)), 0.0); + assertEquals(18.0, Double.longBitsToDouble(values.get(2)), 0.0); + r.close(); + w.close(); + dir.close(); + } + + public void testBasicFloat() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + Sort indexSort = new Sort(new SortField("foo", SortField.Type.FLOAT)); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new FloatDocValuesField("foo", 18.0f)); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + doc = new Document(); + doc.add(new FloatDocValuesField("foo", -1.0f)); + w.addDocument(doc); + w.commit(); + + doc = new Document(); + doc.add(new FloatDocValuesField("foo", 7.0f)); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + NumericDocValues values = 
leaf.getNumericDocValues("foo"); + assertEquals(-1.0f, Float.intBitsToFloat((int) values.get(0)), 0.0f); + assertEquals(7.0f, Float.intBitsToFloat((int) values.get(1)), 0.0f); + assertEquals(18.0f, Float.intBitsToFloat((int) values.get(2)), 0.0f); + r.close(); + w.close(); + dir.close(); + } + public void testSortOnMerge(boolean withDeletes) throws IOException { Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); From 7683f33586df5323fd74d97cb008da2234c6ccf8 Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Sun, 8 May 2016 05:00:19 -0400 Subject: [PATCH 07/16] LUCENE-6766: remove unused SortField.Type.BYTES; don't pass liveDocs to DocIDMerger (it's redundant with MergeState.DocMap) --- .../lucene/codecs/lucene60/Lucene60Codec.java | 3 -- .../lucene/codecs/DocValuesConsumer.java | 34 +++++++++---------- .../apache/lucene/codecs/NormsConsumer.java | 6 ++-- .../lucene/codecs/StoredFieldsWriter.java | 6 ++-- .../lucene/codecs/TermVectorsWriter.java | 6 ++-- .../org/apache/lucene/index/DocIDMerger.java | 30 +++++++++------- .../org/apache/lucene/index/IndexWriter.java | 2 -- .../lucene/index/IndexWriterConfig.java | 5 ++- .../index/MappingMultiPostingsEnum.java | 6 ++-- .../lucene/index/MergeReaderWrapper.java | 4 +-- .../org/apache/lucene/index/MergeState.java | 4 +-- .../org/apache/lucene/index/MultiSorter.java | 1 - .../org/apache/lucene/index/SegmentInfo.java | 4 --- .../org/apache/lucene/search/SortField.java | 3 -- .../apache/lucene/index/TestDocIDMerger.java | 12 +++---- .../apache/lucene/index/TestIndexSorting.java | 2 +- 16 files changed, 58 insertions(+), 70 deletions(-) diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java index 7210b3f0dcf..fd6b6fe82f9 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/Lucene60Codec.java @@ -16,9 +16,6 @@ */ package org.apache.lucene.codecs.lucene60; -// nocommit if index time sorting is in use, don't try to bulk merge ... later we can make crazy bulk merger that looks for long runs from -// one sub? 
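Context for this commit: the DocIDMerger subs previously consulted liveDocs on top of the MergeState.DocMap, but the DocMap already maps deleted documents to -1, so liveDocs carried no extra information. A hedged sketch of the loop shape the DocIDMerger hunks below standardize on (class name is ours; DocMap semantics as in this patch):

  import org.apache.lucene.index.MergeState;
  import org.apache.lucene.search.DocIdSetIterator;

  abstract class SkipDeletedSub {
    final MergeState.DocMap docMap;
    int mappedDocID;

    SkipDeletedSub(MergeState.DocMap docMap) {
      this.docMap = docMap;
    }

    /** Next segment-local docID, or DocIdSetIterator.NO_MORE_DOCS when exhausted. */
    abstract int nextDoc();

    /** Skips deleted docs; returns false once this sub is exhausted. */
    boolean advanceToLiveDoc() {
      while (true) {
        int docID = nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          return false;
        }
        int mapped = docMap.get(docID); // -1 means the doc was deleted
        if (mapped != -1) {
          mappedDocID = mapped;
          return true;
        }
      }
    }
  }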
- import java.util.Objects; import org.apache.lucene.codecs.Codec; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java index 79cc42227d4..427b520aa4a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java @@ -251,8 +251,8 @@ public abstract class DocValuesConsumer implements Closeable { private int docID = -1; private final int maxDoc; - public NumericDocValuesSub(MergeState.DocMap docMap, Bits liveDocs, NumericDocValues values, Bits docsWithField, int maxDoc) { - super(docMap, liveDocs); + public NumericDocValuesSub(MergeState.DocMap docMap, NumericDocValues values, Bits docsWithField, int maxDoc) { + super(docMap); this.values = values; this.docsWithField = docsWithField; this.maxDoc = maxDoc; @@ -285,7 +285,7 @@ public abstract class DocValuesConsumer implements Closeable { List subs = new ArrayList<>(); assert mergeState.docMaps.length == toMerge.size(); for(int i=0;i docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); @@ -338,8 +338,8 @@ public abstract class DocValuesConsumer implements Closeable { private int docID = -1; private final int maxDoc; - public BinaryDocValuesSub(MergeState.DocMap docMap, Bits liveDocs, BinaryDocValues values, Bits docsWithField, int maxDoc) { - super(docMap, liveDocs); + public BinaryDocValuesSub(MergeState.DocMap docMap, BinaryDocValues values, Bits docsWithField, int maxDoc) { + super(docMap); this.values = values; this.docsWithField = docsWithField; this.maxDoc = maxDoc; @@ -372,7 +372,7 @@ public abstract class DocValuesConsumer implements Closeable { List subs = new ArrayList<>(); assert mergeState.docMaps.length == toMerge.size(); for(int i=0;i docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); @@ -429,8 +429,8 @@ public abstract class DocValuesConsumer implements Closeable { private int docID = -1; private final int maxDoc; - public SortedNumericDocValuesSub(MergeState.DocMap docMap, Bits liveDocs, SortedNumericDocValues values, int maxDoc) { - super(docMap, liveDocs); + public SortedNumericDocValuesSub(MergeState.DocMap docMap, SortedNumericDocValues values, int maxDoc) { + super(docMap); this.values = values; this.maxDoc = maxDoc; } @@ -465,7 +465,7 @@ public abstract class DocValuesConsumer implements Closeable { List subs = new ArrayList<>(); assert mergeState.docMaps.length == toMerge.size(); for(int i=0;i docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); @@ -516,7 +516,7 @@ public abstract class DocValuesConsumer implements Closeable { List subs = new ArrayList<>(); assert mergeState.docMaps.length == toMerge.size(); for(int i=0;i docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); @@ -581,8 +581,8 @@ public abstract class DocValuesConsumer implements Closeable { private final int maxDoc; private final LongValues map; - public SortedDocValuesSub(MergeState.DocMap docMap, Bits liveDocs, SortedDocValues values, int maxDoc, LongValues map) { - super(docMap, liveDocs); + public SortedDocValuesSub(MergeState.DocMap docMap, SortedDocValues values, int maxDoc, LongValues map) { + super(docMap); this.values = values; this.maxDoc = maxDoc; this.map = map; @@ -678,7 +678,7 @@ public abstract class DocValuesConsumer implements Closeable { List subs = new ArrayList<>(); assert mergeState.docMaps.length == 
toMerge.size(); for(int i=0;i docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); @@ -735,8 +735,8 @@ public abstract class DocValuesConsumer implements Closeable { private final int maxDoc; private final LongValues map; - public SortedSetDocValuesSub(MergeState.DocMap docMap, Bits liveDocs, SortedSetDocValues values, int maxDoc, LongValues map) { - super(docMap, liveDocs); + public SortedSetDocValuesSub(MergeState.DocMap docMap, SortedSetDocValues values, int maxDoc, LongValues map) { + super(docMap); this.values = values; this.maxDoc = maxDoc; this.map = map; @@ -837,7 +837,7 @@ public abstract class DocValuesConsumer implements Closeable { List subs = new ArrayList<>(); assert mergeState.docMaps.length == toMerge.size(); for(int i=0;i docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); @@ -895,7 +895,7 @@ public abstract class DocValuesConsumer implements Closeable { List subs = new ArrayList<>(); assert mergeState.docMaps.length == toMerge.size(); for(int i=0;i docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/NormsConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/NormsConsumer.java index 76f8be727f5..39d39022a75 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/NormsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/NormsConsumer.java @@ -107,8 +107,8 @@ public abstract class NormsConsumer implements Closeable { private int docID = -1; private final int maxDoc; - public NumericDocValuesSub(MergeState.DocMap docMap, Bits liveDocs, NumericDocValues values, int maxDoc) { - super(docMap, liveDocs); + public NumericDocValuesSub(MergeState.DocMap docMap, NumericDocValues values, int maxDoc) { + super(docMap); this.values = values; this.maxDoc = maxDoc; } @@ -142,7 +142,7 @@ public abstract class NormsConsumer implements Closeable { List subs = new ArrayList<>(); assert mergeState.docMaps.length == toMerge.size(); for(int i=0;i docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java index b76247d7a89..26652aa8231 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java @@ -85,8 +85,8 @@ public abstract class StoredFieldsWriter implements Closeable { private final MergeVisitor visitor; int docID = -1; - public StoredFieldsMergeSub(MergeVisitor visitor, MergeState.DocMap docMap, Bits liveDocs, StoredFieldsReader reader, int maxDoc) { - super(docMap, liveDocs); + public StoredFieldsMergeSub(MergeVisitor visitor, MergeState.DocMap docMap, StoredFieldsReader reader, int maxDoc) { + super(docMap); this.maxDoc = maxDoc; this.reader = reader; this.visitor = visitor; @@ -115,7 +115,7 @@ public abstract class StoredFieldsWriter implements Closeable { for(int i=0;i docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java index 6ab115745b1..81dd095953b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java @@ -169,8 +169,8 @@ public abstract class 
TermVectorsWriter implements Closeable { private final int maxDoc; int docID = -1; - public TermVectorsMergeSub(MergeState.DocMap docMap, Bits liveDocs, TermVectorsReader reader, int maxDoc) { - super(docMap, liveDocs); + public TermVectorsMergeSub(MergeState.DocMap docMap, TermVectorsReader reader, int maxDoc) { + super(docMap); this.maxDoc = maxDoc; this.reader = reader; } @@ -204,7 +204,7 @@ public abstract class TermVectorsWriter implements Closeable { reader.checkIntegrity(); } // nocommit make sure the else case tested here - subs.add(new TermVectorsMergeSub(mergeState.docMaps[i], mergeState.liveDocs[i], reader, mergeState.maxDocs[i])); + subs.add(new TermVectorsMergeSub(mergeState.docMaps[i], reader, mergeState.maxDocs[i])); } final DocIDMerger docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null); diff --git a/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java b/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java index fdc705660b4..2d4198b0e2b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java @@ -45,12 +45,9 @@ public class DocIDMerger { public static abstract class Sub { public int mappedDocID; final MergeState.DocMap docMap; - final Bits liveDocs; - // nocommit isn't liveDocs redundant? docMap returns -1 for us? - public Sub(MergeState.DocMap docMap, Bits liveDocs) { + public Sub(MergeState.DocMap docMap) { this.docMap = docMap; - this.liveDocs = liveDocs; } /** Returns the next document ID from this sub reader, and {@link DocIdSetIterator#NO_MORE_DOCS} when done */ @@ -93,12 +90,14 @@ public class DocIDMerger { if (docID == NO_MORE_DOCS) { // all docs in this sub were deleted; do not add it to the queue! break; - } else if (sub.liveDocs != null && sub.liveDocs.get(docID) == false) { - // nocommit is it sub's job to skip deleted docs? + } + + int mappedDocID = sub.docMap.get(docID); + if (mappedDocID == -1) { + // doc was deleted continue; } else { - sub.mappedDocID = sub.docMap.get(docID); - assert sub.mappedDocID != -1; + sub.mappedDocID = mappedDocID; queue.add(sub); break; } @@ -133,10 +132,13 @@ public class DocIDMerger { queue.pop(); top = queue.top(); break; - } else if (top.liveDocs != null && top.liveDocs.get(docID) == false) { + } + int mappedDocID = top.docMap.get(docID); + if (mappedDocID == -1) { + // doc was deleted continue; } else { - top.mappedDocID = top.docMap.get(docID); + top.mappedDocID = mappedDocID; top = queue.updateTop(); break; } @@ -162,12 +164,14 @@ public class DocIDMerger { current = subs.get(nextIndex); nextIndex++; continue; - } else if (current.liveDocs != null && current.liveDocs.get(docID) == false) { - // Document is deleted + } + int mappedDocID = current.docMap.get(docID); + if (mappedDocID == -1) { + // doc is deleted continue; } - current.mappedDocID = current.docMap.get(docID); + current.mappedDocID = mappedDocID; return current; } } diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index 2254ba4cd05..0289c612743 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -1004,8 +1004,6 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { } } - // nocommit can we improve this from just best effort? - /** Confirms that the incoming index sort (if any) matches the existing index sort (if any). 
This is unfortunately just best effort, * because it could be the old index only has flushed segments. */ private void validateIndexSort() { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java index a6b559900df..e2957d74316 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java @@ -444,11 +444,10 @@ public final class IndexWriterConfig extends LiveIndexWriterConfig { /** We only allow sorting on these types */ private static final EnumSet ALLOWED_INDEX_SORT_TYPES = EnumSet.of(SortField.Type.STRING, - SortField.Type.INT, - SortField.Type.FLOAT, SortField.Type.LONG, + SortField.Type.INT, SortField.Type.DOUBLE, - SortField.Type.BYTES); + SortField.Type.FLOAT); /** * Set the {@link Sort} order to use when merging segments. Note that newly flushed segments will remain unsorted. diff --git a/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java b/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java index 41c2a46f218..166878d3f8f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java @@ -43,8 +43,8 @@ final class MappingMultiPostingsEnum extends PostingsEnum { private static class MappingPostingsSub extends DocIDMerger.Sub { public PostingsEnum postings; - public MappingPostingsSub(MergeState.DocMap docMap, Bits liveDocs) { - super(docMap, liveDocs); + public MappingPostingsSub(MergeState.DocMap docMap) { + super(docMap); } @Override @@ -62,7 +62,7 @@ final class MappingMultiPostingsEnum extends PostingsEnum { this.field = field; allSubs = new MappingPostingsSub[mergeState.fieldsProducers.length]; for(int i=0;i(subs, allSubs.length, mergeState.segmentInfo.getIndexSort() != null); } diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java index be3513a698e..2401d0fa3ee 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java @@ -29,14 +29,14 @@ import org.apache.lucene.util.Bits; /** This is a hack to make index sorting fast, with a {@link LeafReader} that always returns merge instances when you ask for the codec readers. */ class MergeReaderWrapper extends LeafReader { - final SegmentReader in; + final CodecReader in; final FieldsProducer fields; final NormsProducer norms; final DocValuesProducer docValues; final StoredFieldsReader store; final TermVectorsReader vectors; - MergeReaderWrapper(SegmentReader in) throws IOException { + MergeReaderWrapper(CodecReader in) throws IOException { this.in = in; FieldsProducer fields = in.getPostingsReader(); diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java index 31065e31656..95609837ff5 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java @@ -40,7 +40,6 @@ import org.apache.lucene.util.packed.PackedLongValues; public class MergeState { /** Maps document IDs from old segments to document IDs in the new segment */ - // nocommit in the unsorted case, this should map correctly, e.g. 
apply per segment docBase public final DocMap[] docMaps; // nocommit can we somehow not need to expose this? should IW's reader pool always sort on load...? @@ -224,8 +223,7 @@ public class MergeState { Sorter.DocMap sortDocMap = sorter.sort(leaf); if (sortDocMap != null) { //System.out.println(" sort!"); - // nocommit what about MergedReaderWrapper in here? - leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(leaf, sortDocMap)); + leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(new MergeReaderWrapper(leaf), sortDocMap)); leafDocMaps[readers.size()] = new DocMap() { @Override public int get(int docID) { diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index 4c78aa1233f..39ef8d8f214 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -344,7 +344,6 @@ final class MultiSorter { }; } - // nocommit do the rest: default: throw new IllegalArgumentException("unhandled SortField.getType()=" + sortField.getType()); } diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java b/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java index 5830201e087..ec12365e958 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java @@ -198,10 +198,6 @@ public final class SegmentInfo { s.append('/').append(delCount); } - // nocommit does search time "do the right thing" automatically when segment is sorted? - - // nocommit remove sorter_key from diagnostics - if (indexSort != null) { s.append(":[indexSort="); s.append(indexSort); diff --git a/lucene/core/src/java/org/apache/lucene/search/SortField.java b/lucene/core/src/java/org/apache/lucene/search/SortField.java index 880697bd605..412a50ab4b4 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SortField.java +++ b/lucene/core/src/java/org/apache/lucene/search/SortField.java @@ -77,9 +77,6 @@ public class SortField { * uses ordinals to do the sorting. */ STRING_VAL, - /** Sort use byte[] index values. 
*/ - BYTES, - /** Force rewriting of SortField using {@link SortField#rewrite(IndexSearcher)} * before it can be used for sorting */ REWRITEABLE diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocIDMerger.java b/lucene/core/src/test/org/apache/lucene/index/TestDocIDMerger.java index 9251b00d349..949deced60c 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDocIDMerger.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDocIDMerger.java @@ -33,8 +33,8 @@ public class TestDocIDMerger extends LuceneTestCase { final int valueStart; final int maxDoc; - public TestSubUnsorted(MergeState.DocMap docMap, Bits liveDocs, int maxDoc, int valueStart) { - super(docMap, liveDocs); + public TestSubUnsorted(MergeState.DocMap docMap, int maxDoc, int valueStart) { + super(docMap); this.maxDoc = maxDoc; this.valueStart = valueStart; } @@ -67,7 +67,7 @@ public class TestDocIDMerger extends LuceneTestCase { public int get(int docID) { return docBase + docID; } - }, null, maxDoc, valueStart)); + }, maxDoc, valueStart)); valueStart += maxDoc; } @@ -92,8 +92,8 @@ public class TestDocIDMerger extends LuceneTestCase { final int maxDoc; final int index; - public TestSubSorted(MergeState.DocMap docMap, Bits liveDocs, int maxDoc, int index) { - super(docMap, liveDocs); + public TestSubSorted(MergeState.DocMap docMap, int maxDoc, int index) { + super(docMap); this.maxDoc = maxDoc; this.index = index; } @@ -155,7 +155,7 @@ public class TestDocIDMerger extends LuceneTestCase { public int get(int docID) { return docMap[docID]; } - }, null, docMap.length, i)); + }, docMap.length, i)); } // nocommit test w/ deletions too diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 4e2606374f0..15d18fbb11c 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -574,7 +574,7 @@ public class TestIndexSorting extends LuceneTestCase { IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { iwc.setIndexSort(Sort.RELEVANCE); }); - assertEquals("invalid SortField type: must be one of [STRING, INT, FLOAT, LONG, DOUBLE, BYTES] but got: ", expected.getMessage()); + assertEquals("invalid SortField type: must be one of [STRING, INT, FLOAT, LONG, DOUBLE] but got: ", expected.getMessage()); } // you can't change the index sort on an existing index: From da473399a310c3b485ca14257db230e3696b4e7f Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Sun, 8 May 2016 06:20:59 -0400 Subject: [PATCH 08/16] LUCENE-6766: simplify test case --- .../lucene62/Lucene62SegmentInfoFormat.java | 2 - .../apache/lucene/index/TestIndexSorting.java | 364 ++++-------------- 2 files changed, 82 insertions(+), 284 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java index bb52eebc06e..762b2c084bc 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java @@ -264,7 +264,6 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { case FLOAT: sortTypeID = 4; break; - // nocommit the rest: default: throw new IllegalStateException("Unexpected sort type: " + sortField.getType()); } @@ -302,7 +301,6 @@ public class 
Lucene62SegmentInfoFormat extends SegmentInfoFormat { output.writeByte((byte) 1); output.writeLong(Float.floatToIntBits(((Float) missingValue).floatValue())); break; - // nocommit the rest: default: throw new IllegalStateException("Unexpected sort type: " + sortField.getType()); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 15d18fbb11c..278aadc333f 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -22,9 +22,11 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Random; +import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.concurrent.atomic.AtomicInteger; @@ -610,7 +612,7 @@ public class TestIndexSorting extends LuceneTestCase { @Override public long computeNorm(FieldInvertState state) { - if (state.getName().equals(NORMS_FIELD)) { + if (state.getName().equals("norms")) { return Float.floatToIntBits(state.getBoost()); } else { return in.computeNorm(state); @@ -650,7 +652,7 @@ public class TestIndexSorting extends LuceneTestCase { } clearAttributes(); - term.append(DOC_POSITIONS_TERM); + term.append("#all#"); payload.setPayload(new BytesRef(Integer.toString(pos))); offset.setOffset(off, off); --pos; @@ -664,296 +666,94 @@ public class TestIndexSorting extends LuceneTestCase { } } - private static Directory dir; - private static IndexReader sortedReader; + // nocommit testrandom1 with deletions - private static final FieldType TERM_VECTORS_TYPE = new FieldType(TextField.TYPE_NOT_STORED); - static { - TERM_VECTORS_TYPE.setStoreTermVectors(true); - TERM_VECTORS_TYPE.freeze(); - } - - private static final FieldType POSITIONS_TYPE = new FieldType(TextField.TYPE_NOT_STORED); - static { - POSITIONS_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - POSITIONS_TYPE.freeze(); - } - - private static final String ID_FIELD = "id"; - private static final String DOCS_ENUM_FIELD = "docs"; - private static final String DOCS_ENUM_TERM = "$all$"; - private static final String DOC_POSITIONS_FIELD = "positions"; - private static final String DOC_POSITIONS_TERM = "$all$"; - private static final String NUMERIC_DV_FIELD = "numeric"; - private static final String SORTED_NUMERIC_DV_FIELD = "sorted_numeric"; - private static final String NORMS_FIELD = "norm"; - private static final String BINARY_DV_FIELD = "binary"; - private static final String SORTED_DV_FIELD = "sorted"; - private static final String SORTED_SET_DV_FIELD = "sorted_set"; - private static final String TERM_VECTORS_FIELD = "term_vectors"; - private static final String DIMENSIONAL_FIELD = "numeric1d"; - - private static Document doc(final int id, PositionsTokenStream positions) { - final Document doc = new Document(); - doc.add(new StringField(ID_FIELD, Integer.toString(id), Store.YES)); - doc.add(new StringField(DOCS_ENUM_FIELD, DOCS_ENUM_TERM, Store.NO)); - positions.setId(id); - doc.add(new Field(DOC_POSITIONS_FIELD, positions, POSITIONS_TYPE)); - doc.add(new NumericDocValuesField(NUMERIC_DV_FIELD, id)); - TextField norms = new TextField(NORMS_FIELD, Integer.toString(id), Store.NO); - norms.setBoost(Float.intBitsToFloat(id)); - doc.add(norms); - doc.add(new BinaryDocValuesField(BINARY_DV_FIELD, new BytesRef(Integer.toString(id)))); - 
doc.add(new SortedDocValuesField(SORTED_DV_FIELD, new BytesRef(Integer.toString(id)))); - doc.add(new SortedSetDocValuesField(SORTED_SET_DV_FIELD, new BytesRef(Integer.toString(id)))); - doc.add(new SortedSetDocValuesField(SORTED_SET_DV_FIELD, new BytesRef(Integer.toString(id + 1)))); - doc.add(new SortedNumericDocValuesField(SORTED_NUMERIC_DV_FIELD, id)); - doc.add(new SortedNumericDocValuesField(SORTED_NUMERIC_DV_FIELD, id + 1)); - doc.add(new Field(TERM_VECTORS_FIELD, Integer.toString(id), TERM_VECTORS_TYPE)); - byte[] bytes = new byte[4]; - NumericUtils.intToSortableBytes(id, bytes, 0); - doc.add(new BinaryPoint(DIMENSIONAL_FIELD, bytes)); - return doc; - } - - @AfterClass - public static void afterClass() throws Exception { - if (sortedReader != null) { - sortedReader.close(); - sortedReader = null; - } - if (dir != null) { - dir.close(); - dir = null; - } - } - - @BeforeClass - public static void createIndex() throws Exception { - dir = newFSDirectory(createTempDir()); + public void testRandom1() throws Exception { int numDocs = atLeast(100); - List<Integer> ids = new ArrayList<>(); - for (int i = 0; i < numDocs; i++) { - ids.add(Integer.valueOf(i * 10)); + FieldType POSITIONS_TYPE = new FieldType(TextField.TYPE_NOT_STORED); + POSITIONS_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + POSITIONS_TYPE.freeze(); + + FieldType TERM_VECTORS_TYPE = new FieldType(TextField.TYPE_NOT_STORED); + TERM_VECTORS_TYPE.setStoreTermVectors(true); + TERM_VECTORS_TYPE.freeze(); + + List<Document> docs = new ArrayList<>(); + for (int i=0;i Date: Sun, 8 May 2016 06:41:55 -0400 Subject: [PATCH 09/16] LUCENE-6766: add missing first/last tests --- .../lucene62/Lucene62SegmentInfoFormat.java | 2 +- .../org/apache/lucene/index/MultiSorter.java | 12 +- .../apache/lucene/index/SegmentMerger.java | 7 + .../apache/lucene/index/TestIndexSorting.java | 381 +++++++++++++++++- 4 files changed, 383 insertions(+), 19 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java index 762b2c084bc..da1959419e3 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java @@ -299,7 +299,7 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { break; case FLOAT: output.writeByte((byte) 1); - output.writeLong(Float.floatToIntBits(((Float) missingValue).floatValue())); + output.writeInt(Float.floatToIntBits(((Float) missingValue).floatValue())); break; default: throw new IllegalStateException("Unexpected sort type: " + sortField.getType()); } diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index 39ef8d8f214..1c67fd5cb34 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -136,9 +136,9 @@ final class MultiSorter { } final int missingOrd; if (sortField.getMissingValue() == SortField.STRING_LAST) { - missingOrd = Integer.MIN_VALUE; - } else { missingOrd = Integer.MAX_VALUE; + } else { + missingOrd = Integer.MIN_VALUE; } final int reverseMul; @@ -180,10 +180,10 @@ reverseMul = 1; } - final int missingValue; + final long missingValue; if (sortField.getMissingValue() != null) { -
missingValue = (Long) sortField.getMissingValue(); } else { missingValue = 0; } @@ -193,14 +193,14 @@ final class MultiSorter { public int compare(int readerIndexA, int docIDA, int readerIndexB, int docIDB) { long valueA; if (docsWithFields.get(readerIndexA).get(docIDA)) { - valueA = (int) values.get(readerIndexA).get(docIDA); + valueA = values.get(readerIndexA).get(docIDA); } else { valueA = missingValue; } long valueB; if (docsWithFields.get(readerIndexB).get(docIDB)) { - valueB = (int) values.get(readerIndexB).get(docIDB); + valueB = values.get(readerIndexB).get(docIDB); } else { valueB = missingValue; } diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java index b0d9bcff50b..0cc1823def7 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java @@ -48,6 +48,8 @@ final class SegmentMerger { final MergeState mergeState; private final FieldInfos.Builder fieldInfosBuilder; + // nocommit make sure infoStream states per-segment-being-merged if they are already sorted + // note, just like in codec apis Directory 'dir' is NOT the same as segmentInfo.dir!! SegmentMerger(List readers, SegmentInfo segmentInfo, InfoStream infoStream, Directory dir, FieldInfos.FieldNumbers fieldNumbers, IOContext context) throws IOException { @@ -59,6 +61,11 @@ final class SegmentMerger { this.codec = segmentInfo.getCodec(); this.context = context; this.fieldInfosBuilder = new FieldInfos.Builder(fieldNumbers); + if (mergeState.infoStream.isEnabled("SM")) { + if (segmentInfo.getIndexSort() != null) { + mergeState.infoStream.message("SM", "index sort during merge: " + segmentInfo.getIndexSort()); + } + } } /** True if any merging should happen */ diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 278aadc333f..3eb30ec77c3 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -118,6 +118,76 @@ public class TestIndexSorting extends LuceneTestCase { dir.close(); } + public void testMissingStringFirst() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + SortField sortField = new SortField("foo", SortField.Type.STRING); + sortField.setMissingValue(SortField.STRING_FIRST); + Sort indexSort = new Sort(sortField); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new SortedDocValuesField("foo", new BytesRef("zzz"))); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + // missing + w.addDocument(new Document()); + w.commit(); + + doc = new Document(); + doc.add(new SortedDocValuesField("foo", new BytesRef("mmm"))); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + SortedDocValues values = leaf.getSortedDocValues("foo"); + assertEquals(-1, values.getOrd(0)); + assertEquals("mmm", values.get(1).utf8ToString()); + assertEquals("zzz", values.get(2).utf8ToString()); + r.close(); + w.close(); + dir.close(); + } + + public void testMissingStringLast() throws Exception { + Directory 
dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + SortField sortField = new SortField("foo", SortField.Type.STRING); + sortField.setMissingValue(SortField.STRING_LAST); + Sort indexSort = new Sort(sortField); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new SortedDocValuesField("foo", new BytesRef("zzz"))); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + // missing + w.addDocument(new Document()); + w.commit(); + + doc = new Document(); + doc.add(new SortedDocValuesField("foo", new BytesRef("mmm"))); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + SortedDocValues values = leaf.getSortedDocValues("foo"); + assertEquals("mmm", values.get(0).utf8ToString()); + assertEquals("zzz", values.get(1).utf8ToString()); + assertEquals(-1, values.getOrd(2)); + r.close(); + w.close(); + dir.close(); + } + public void testBasicLong() throws Exception { Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); @@ -152,6 +222,80 @@ public class TestIndexSorting extends LuceneTestCase { dir.close(); } + public void testMissingLongFirst() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + SortField sortField = new SortField("foo", SortField.Type.LONG); + sortField.setMissingValue(Long.valueOf(Long.MIN_VALUE)); + Sort indexSort = new Sort(sortField); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new NumericDocValuesField("foo", 18)); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + // missing + w.addDocument(new Document()); + w.commit(); + + doc = new Document(); + doc.add(new NumericDocValuesField("foo", 7)); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + NumericDocValues values = leaf.getNumericDocValues("foo"); + Bits docsWithField = leaf.getDocsWithField("foo"); + assertEquals(0, values.get(0)); + assertFalse(docsWithField.get(0)); + assertEquals(7, values.get(1)); + assertEquals(18, values.get(2)); + r.close(); + w.close(); + dir.close(); + } + + public void testMissingLongLast() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + SortField sortField = new SortField("foo", SortField.Type.LONG); + sortField.setMissingValue(Long.valueOf(Long.MAX_VALUE)); + Sort indexSort = new Sort(sortField); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new NumericDocValuesField("foo", 18)); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + // missing + w.addDocument(new Document()); + w.commit(); + + doc = new Document(); + doc.add(new NumericDocValuesField("foo", 7)); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader 
leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + NumericDocValues values = leaf.getNumericDocValues("foo"); + Bits docsWithField = leaf.getDocsWithField("foo"); + assertEquals(7, values.get(0)); + assertEquals(18, values.get(1)); + assertEquals(0, values.get(2)); + assertFalse(docsWithField.get(2)); + r.close(); + w.close(); + dir.close(); + } + public void testBasicInt() throws Exception { Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); @@ -186,6 +330,80 @@ public class TestIndexSorting extends LuceneTestCase { dir.close(); } + public void testMissingIntFirst() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + SortField sortField = new SortField("foo", SortField.Type.INT); + sortField.setMissingValue(Integer.valueOf(Integer.MIN_VALUE)); + Sort indexSort = new Sort(sortField); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new NumericDocValuesField("foo", 18)); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + // missing + w.addDocument(new Document()); + w.commit(); + + doc = new Document(); + doc.add(new NumericDocValuesField("foo", 7)); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + NumericDocValues values = leaf.getNumericDocValues("foo"); + Bits docsWithField = leaf.getDocsWithField("foo"); + assertEquals(0, values.get(0)); + assertFalse(docsWithField.get(0)); + assertEquals(7, values.get(1)); + assertEquals(18, values.get(2)); + r.close(); + w.close(); + dir.close(); + } + + public void testMissingIntLast() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + SortField sortField = new SortField("foo", SortField.Type.INT); + sortField.setMissingValue(Integer.valueOf(Integer.MAX_VALUE)); + Sort indexSort = new Sort(sortField); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new NumericDocValuesField("foo", 18)); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + // missing + w.addDocument(new Document()); + w.commit(); + + doc = new Document(); + doc.add(new NumericDocValuesField("foo", 7)); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + NumericDocValues values = leaf.getNumericDocValues("foo"); + Bits docsWithField = leaf.getDocsWithField("foo"); + assertEquals(7, values.get(0)); + assertEquals(18, values.get(1)); + assertEquals(0, values.get(2)); + assertFalse(docsWithField.get(2)); + r.close(); + w.close(); + dir.close(); + } + public void testBasicDouble() throws Exception { Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); @@ -220,6 +438,80 @@ public class TestIndexSorting extends LuceneTestCase { dir.close(); } + public void testMissingDoubleFirst() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + SortField 
sortField = new SortField("foo", SortField.Type.DOUBLE); + sortField.setMissingValue(Double.NEGATIVE_INFINITY); + Sort indexSort = new Sort(sortField); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new DoubleDocValuesField("foo", 18.0)); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + // missing + w.addDocument(new Document()); + w.commit(); + + doc = new Document(); + doc.add(new DoubleDocValuesField("foo", 7.0)); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + NumericDocValues values = leaf.getNumericDocValues("foo"); + Bits docsWithField = leaf.getDocsWithField("foo"); + assertEquals(0.0, Double.longBitsToDouble(values.get(0)), 0.0); + assertFalse(docsWithField.get(0)); + assertEquals(7.0, Double.longBitsToDouble(values.get(1)), 0.0); + assertEquals(18.0, Double.longBitsToDouble(values.get(2)), 0.0); + r.close(); + w.close(); + dir.close(); + } + + public void testMissingDoubleLast() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + SortField sortField = new SortField("foo", SortField.Type.DOUBLE); + sortField.setMissingValue(Double.POSITIVE_INFINITY); + Sort indexSort = new Sort(sortField); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new DoubleDocValuesField("foo", 18.0)); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + // missing + w.addDocument(new Document()); + w.commit(); + + doc = new Document(); + doc.add(new DoubleDocValuesField("foo", 7.0)); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + NumericDocValues values = leaf.getNumericDocValues("foo"); + Bits docsWithField = leaf.getDocsWithField("foo"); + assertEquals(7.0, Double.longBitsToDouble(values.get(0)), 0.0); + assertEquals(18.0, Double.longBitsToDouble(values.get(1)), 0.0); + assertEquals(0.0, Double.longBitsToDouble(values.get(2)), 0.0); + assertFalse(docsWithField.get(2)); + r.close(); + w.close(); + dir.close(); + } + public void testBasicFloat() throws Exception { Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); @@ -254,7 +546,82 @@ public class TestIndexSorting extends LuceneTestCase { dir.close(); } - public void testSortOnMerge(boolean withDeletes) throws IOException { + public void testMissingFloatFirst() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + SortField sortField = new SortField("foo", SortField.Type.FLOAT); + sortField.setMissingValue(Float.NEGATIVE_INFINITY); + Sort indexSort = new Sort(sortField); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new FloatDocValuesField("foo", 18.0f)); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + // missing + w.addDocument(new Document()); + w.commit(); 
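+    // the just-committed document has no value for "foo"; with Float.NEGATIVE_INFINITY as the
+    // missing value it must sort first, so the forceMerge below should place it at docID 0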
+ + doc = new Document(); + doc.add(new FloatDocValuesField("foo", 7.0f)); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + NumericDocValues values = leaf.getNumericDocValues("foo"); + Bits docsWithField = leaf.getDocsWithField("foo"); + assertEquals(0.0f, Float.intBitsToFloat((int) values.get(0)), 0.0f); + assertFalse(docsWithField.get(0)); + assertEquals(7.0f, Float.intBitsToFloat((int) values.get(1)), 0.0f); + assertEquals(18.0f, Float.intBitsToFloat((int) values.get(2)), 0.0f); + r.close(); + w.close(); + dir.close(); + } + + public void testMissingFloatLast() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + SortField sortField = new SortField("foo", SortField.Type.FLOAT); + sortField.setMissingValue(Float.POSITIVE_INFINITY); + Sort indexSort = new Sort(sortField); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new FloatDocValuesField("foo", 18.0f)); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + // missing + w.addDocument(new Document()); + w.commit(); + + doc = new Document(); + doc.add(new FloatDocValuesField("foo", 7.0f)); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + NumericDocValues values = leaf.getNumericDocValues("foo"); + Bits docsWithField = leaf.getDocsWithField("foo"); + assertEquals(7.0f, Float.intBitsToFloat((int) values.get(0)), 0.0f); + assertEquals(18.0f, Float.intBitsToFloat((int) values.get(1)), 0.0f); + assertEquals(0.0f, Float.intBitsToFloat((int) values.get(2)), 0.0f); + assertFalse(docsWithField.get(2)); + r.close(); + w.close(); + dir.close(); + } + + public void testRandom1() throws IOException { + boolean withDeletes = random().nextBoolean(); Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); Sort indexSort = new Sort(new SortField("foo", SortField.Type.LONG)); @@ -323,14 +690,6 @@ public class TestIndexSorting extends LuceneTestCase { dir.close(); } - public void testSortOnMerge() throws IOException { - testSortOnMerge(false); - } - - public void testSortOnMergeWithDeletes() throws IOException { - testSortOnMerge(true); - } - static class UpdateRunnable implements Runnable { private final int numDocs; @@ -666,9 +1025,7 @@ public class TestIndexSorting extends LuceneTestCase { } } - // nocommit testrandom1 with deletions - - public void testRandom1() throws Exception { + public void testRandom2() throws Exception { int numDocs = atLeast(100); FieldType POSITIONS_TYPE = new FieldType(TextField.TYPE_NOT_STORED); From 87690f8b13b1def6c822ba36a42e4cb6939ab4c2 Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Sun, 8 May 2016 07:58:20 -0400 Subject: [PATCH 10/16] LUCENE-6766: add another random test case; move early terminating collector to core --- .../org/apache/lucene/index/DocIDMerger.java | 1 + .../org/apache/lucene/index/MergeState.java | 6 +- .../org/apache/lucene/index/MultiSorter.java | 2 + .../EarlyTerminatingSortingCollector.java | 2 - .../apache/lucene/index/TestIndexSorting.java | 186 ++++++++++++++++++ .../TestEarlyTerminatingSortingCollector.java | 0 6 files changed, 190 insertions(+), 7 
deletions(-) rename lucene/{misc => core}/src/java/org/apache/lucene/search/EarlyTerminatingSortingCollector.java (99%) rename lucene/{misc => core}/src/test/org/apache/lucene/search/TestEarlyTerminatingSortingCollector.java (100%) diff --git a/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java b/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java index 2d4198b0e2b..eec3301036f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java @@ -58,6 +58,7 @@ public class DocIDMerger { this.subs = subs; if (indexIsSorted) { + // nocommit: we could optimize the (silly) single-sub case and pretend it's a concatenation instead queue = new PriorityQueue(maxCount) { @Override protected boolean lessThan(Sub a, Sub b) { diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java index 95609837ff5..63eab98e548 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java @@ -209,20 +209,16 @@ public class MergeState { final Sorter sorter = new Sorter(indexSort); List readers = new ArrayList<>(originalReaders.size()); - //System.out.println("MergeState.maybeSortReaders indexSort=" + indexSort); - for (CodecReader leaf : originalReaders) { Sort segmentSort = leaf.getIndexSort(); - //System.out.println(" leaf=" + leaf + " sort=" + segmentSort); if (segmentSort == null) { // TODO: fix IW to also sort when flushing? It's somewhat tricky because of stored fields and term vectors, which write "live" - // to the files on each indexed document: + // to their index files on each indexed document: // This segment was written by flush, so documents are not yet sorted, so we sort them now: Sorter.DocMap sortDocMap = sorter.sort(leaf); if (sortDocMap != null) { - //System.out.println(" sort!"); leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(new MergeReaderWrapper(leaf), sortDocMap)); leafDocMaps[readers.size()] = new DocMap() { @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index 1c67fd5cb34..3448c9064c5 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -36,6 +36,8 @@ final class MultiSorter { * documents into the merged segment. The documents for each incoming leaf reader must already be sorted by the same sort! 
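+ * <p>In other words (an illustrative note): for leaf {@code i} and a live document {@code d},
+ * {@code docMaps[i].get(d)} is that document's docID in the merged, globally sorted segment;
+ * deleted documents map to -1.</p>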
*/ static MergeState.DocMap[] sort(Sort sort, List readers) throws IOException { + // nocommit optimize if only 1 reader is incoming + SortField fields[] = sort.getSort(); final CrossReaderComparator[] comparators = new CrossReaderComparator[fields.length]; for(int i=0;i docs = new ArrayList<>(); + + Sort sort = randomSort(); + if (VERBOSE) { + System.out.println("TEST: numDocs=" + numDocs + " use sort=" + sort); + } + + // no index sorting, all search-time sorting: + Directory dir1 = newFSDirectory(createTempDir()); + IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random())); + IndexWriter w1 = new IndexWriter(dir1, iwc1); + + // use index sorting: + Directory dir2 = newFSDirectory(createTempDir()); + IndexWriterConfig iwc2 = newIndexWriterConfig(new MockAnalyzer(random())); + iwc2.setIndexSort(sort); + IndexWriter w2 = new IndexWriter(dir2, iwc2); + + for(int id=0;id Date: Sun, 8 May 2016 08:03:11 -0400 Subject: [PATCH 11/16] LUCENE-6766: add deletions to random test --- .../org/apache/lucene/index/TestIndexSorting.java | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index e14606ee2be..ba171319d40 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -1214,6 +1214,10 @@ public class TestIndexSorting extends LuceneTestCase { iwc2.setIndexSort(sort); IndexWriter w2 = new IndexWriter(dir2, iwc2); + Set toDelete = new HashSet<>(); + + double deleteChance = random().nextDouble(); + for(int id=0;id Date: Tue, 10 May 2016 04:49:33 -0400 Subject: [PATCH 12/16] LUCENE-6766: resolve remaining nocommits; add more IW infoStream logging during merge --- .../SimpleTextSegmentInfoFormat.java | 2 - .../lucene/codecs/TermVectorsWriter.java | 1 - .../lucene/codecs/lucene62/Lucene62Codec.java | 2 - .../lucene62/Lucene62SegmentInfoFormat.java | 6 +- .../org/apache/lucene/index/DocIDMerger.java | 17 +- .../org/apache/lucene/index/DocValues.java | 2 +- .../org/apache/lucene/index/IndexWriter.java | 22 +- .../org/apache/lucene/index/MergeState.java | 23 +- .../lucene/index/MultiPostingsEnum.java | 2 - .../org/apache/lucene/index/MultiReader.java | 2 - .../org/apache/lucene/index/MultiSorter.java | 2 +- .../apache/lucene/index/SegmentMerger.java | 2 - .../apache/lucene/index/TestAddIndexes.java | 51 ++++ .../apache/lucene/index/TestDocIDMerger.java | 38 ++- .../apache/lucene/index/TestIndexSorting.java | 86 +++++-- .../TestEarlyTerminatingSortingCollector.java | 2 - .../search/BlockJoinComparatorSource.java | 225 ------------------ .../lucene/index/TestBlockJoinSorter.java | 128 ---------- 18 files changed, 195 insertions(+), 418 deletions(-) delete mode 100644 lucene/misc/src/java/org/apache/lucene/search/BlockJoinComparatorSource.java delete mode 100644 lucene/misc/src/test/org/apache/lucene/index/TestBlockJoinSorter.java diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java index 8ab45be2ede..146e92a6a29 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java @@ -242,7 +242,6 @@ public class SimpleTextSegmentInfoFormat extends 
SegmentInfoFormat { break; } break; - // nocommit need the rest default: throw new AssertionError(); } @@ -370,7 +369,6 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { case FLOAT: sortType = "float"; break; - // nocommit the rest: default: throw new IllegalStateException("Unexpected sort type: " + sortField.getType()); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java index 81dd095953b..5756d5beb87 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java @@ -203,7 +203,6 @@ public abstract class TermVectorsWriter implements Closeable { if (reader != null) { reader.checkIntegrity(); } - // nocommit make sure the else case tested here subs.add(new TermVectorsMergeSub(mergeState.docMaps[i], reader, mergeState.maxDocs[i])); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java index aa0adaed8cd..50710752694 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62Codec.java @@ -41,8 +41,6 @@ import org.apache.lucene.codecs.lucene60.Lucene60PointsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; -// nocommit if somehow this does NOT land in 6.2, rename all this!! - /** * Implements the Lucene 6.2 index format, with configurable per-field postings * and docvalues formats. diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java index da1959419e3..fe78572680c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java @@ -37,8 +37,6 @@ import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.Version; -// nocommit fixup javadocs here: - /** * Lucene 6.2 Segment info format. *

@@ -54,7 +52,9 @@ import org.apache.lucene.util.Version;
 *   <li>Files --> {@link DataOutput#writeSetOfStrings Set<String>}</li>
 *   <li>Diagnostics,Attributes --> {@link DataOutput#writeMapOfStrings Map<String,String>}</li>
 *   <li>IsCompoundFile --> {@link DataOutput#writeByte Int8}</li>
- *   <li>IndexSort --> {@link DataOutput#writeInt Int32} count, followed by {@code count} SortField</li>
+ *   <li>IndexSort --> {@link DataOutput#writeVInt Int32} count, followed by {@code count} SortField</li>
+ *   <li>SortField --> {@link DataOutput#writeString String} field name, followed by {@link DataOutput#writeVInt Int32} sort type ID,
+ *       followed by {@link DataOutput#writeByte Int8} indicating reversed sort, followed by a type-specific encoding of the optional missing value</li>
 *   <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
 * </ul>
        * Field Descriptions: diff --git a/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java b/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java index eec3301036f..e8ffc6c955e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java @@ -58,7 +58,6 @@ public class DocIDMerger { this.subs = subs; if (indexIsSorted) { - // nocommit: we could optimize the (silly) single-sub case and pretend it's a concatenation instead queue = new PriorityQueue(maxCount) { @Override protected boolean lessThan(Sub a, Sub b) { @@ -74,7 +73,6 @@ public class DocIDMerger { reset(); } - // nocommit it's awkward that we must pass in this boolean, when the subs should "know" this based on what docMap they have? public DocIDMerger(List subs, boolean indexIsSorted) { this(subs, subs.size(), indexIsSorted); } @@ -82,9 +80,8 @@ public class DocIDMerger { /** Reuse API, currently only used by postings during merge */ public void reset() { if (queue != null) { + // caller may not have fully consumed the queue: queue.clear(); - // nocommit why does bloom filter wrapper trip this? - // assert queue.size() == 0: "queue.size() = " + queue.size(); for(T sub : subs) { while (true) { int docID = sub.nextDoc(); @@ -105,14 +102,12 @@ public class DocIDMerger { } } first = true; + } else if (subs.size() > 0) { + current = subs.get(0); + nextIndex = 1; } else { - if (subs.size() > 0) { - current = subs.get(0); - nextIndex = 1; - } else { - current = null; - nextIndex = 0; - } + current = null; + nextIndex = 0; } } diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValues.java b/lucene/core/src/java/org/apache/lucene/index/DocValues.java index feceb3bd3ff..15b15c6c72b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocValues.java @@ -210,7 +210,7 @@ public final class DocValues { (expected.length == 1 ? "(expected=" + expected[0] : "(expected one of " + Arrays.toString(expected)) + "). " + - "Use UninvertingReader or index with docvalues."); + " Re-index with correct docvalues type."); } } diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index 0289c612743..159f5917c42 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -16,8 +16,6 @@ */ package org.apache.lucene.index; -// nocommit must add sorted indices to back compat tests - import java.io.Closeable; import java.io.FileNotFoundException; import java.io.IOException; @@ -2490,9 +2488,9 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @throws IllegalArgumentException if addIndexes would cause - * the index to exceed {@link #MAX_DOCS} + * the index to exceed {@link #MAX_DOCS}, or if the indoming + * index sort does not match this index's index sort */ - // nocommit doesn't support index sorting? or sorts must be the same? public void addIndexes(Directory... 
dirs) throws IOException { ensureOpen(); @@ -2500,6 +2498,8 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { List locks = acquireWriteLocks(dirs); + Sort indexSort = config.getIndexSort(); + boolean successTop = false; try { @@ -2532,6 +2532,13 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { for (SegmentCommitInfo info : sis) { assert !infos.contains(info): "dup info dir=" + info.info.dir + " name=" + info.info.name; + Sort segmentIndexSort = info.info.getIndexSort(); + + if (indexSort != null && segmentIndexSort != null && indexSort.equals(segmentIndexSort) == false) { + // TODO: we could make this smarter, e.g. if the incoming indexSort is congruent with our sort ("starts with") then it's OK + throw new IllegalArgumentException("cannot change index sort from " + segmentIndexSort + " to " + indexSort); + } + String newSegName = newSegmentName(); if (infoStream.isEnabled("IW")) { @@ -2622,13 +2629,14 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { * @throws IllegalArgumentException * if addIndexes would cause the index to exceed {@link #MAX_DOCS} */ - // nocommit make sure if you add "sorted by X" to "sorted by Y" index, we catch it public void addIndexes(CodecReader... readers) throws IOException { ensureOpen(); // long so we can detect int overflow: long numDocs = 0; + Sort indexSort = config.getIndexSort(); + try { if (infoStream.isEnabled("IW")) { infoStream.message("IW", "flush at addIndexes(CodecReader...)"); @@ -2638,6 +2646,10 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable { String mergedName = newSegmentName(); for (CodecReader leaf : readers) { numDocs += leaf.numDocs(); + Sort leafIndexSort = leaf.getIndexSort(); + if (indexSort != null && leafIndexSort != null && indexSort.equals(leafIndexSort) == false) { + throw new IllegalArgumentException("cannot change index sort from " + leafIndexSort + " to " + indexSort); + } } // Best-effort up front check: diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java index 63eab98e548..7737ff2cc40 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java @@ -42,8 +42,8 @@ public class MergeState { /** Maps document IDs from old segments to document IDs in the new segment */ public final DocMap[] docMaps; - // nocommit can we somehow not need to expose this? should IW's reader pool always sort on load...? - public final DocMap[] leafDocMaps; + // Only used by IW when it must remap deletes that arrived against the merging segmetns while a merge was running: + final DocMap[] leafDocMaps; /** {@link SegmentInfo} of the newly merged segment. */ public final SegmentInfo segmentInfo; @@ -84,6 +84,8 @@ public class MergeState { /** Sole constructor. 
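+   * If the merged segment has an index sort, readers for segments that were flushed (and are
+   * therefore still in docID order) are first wrapped in a sorting view; segments already
+   * written with the same sort are passed through unchanged, and per-leaf docID maps are
+   * then built for the merge.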
*/ MergeState(List originalReaders, SegmentInfo segmentInfo, InfoStream infoStream) throws IOException { + this.infoStream = infoStream; + final Sort indexSort = segmentInfo.getIndexSort(); int numReaders = originalReaders.size(); leafDocMaps = new DocMap[numReaders]; @@ -138,7 +140,6 @@ public class MergeState { segmentInfo.setMaxDoc(numDocs); this.segmentInfo = segmentInfo; - this.infoStream = infoStream; this.docMaps = buildDocMaps(readers, indexSort); } @@ -219,6 +220,9 @@ public class MergeState { // This segment was written by flush, so documents are not yet sorted, so we sort them now: Sorter.DocMap sortDocMap = sorter.sort(leaf); if (sortDocMap != null) { + if (infoStream.isEnabled("SM")) { + infoStream.message("SM", "segment " + leaf + " is not sorted; wrapping for sort " + indexSort + " now"); + } leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(new MergeReaderWrapper(leaf), sortDocMap)); leafDocMaps[readers.size()] = new DocMap() { @Override @@ -226,10 +230,19 @@ public class MergeState { return sortDocMap.oldToNew(docID); } }; + } else { + if (infoStream.isEnabled("SM")) { + infoStream.message("SM", "segment " + leaf + " is not sorted, but is already accidentally in sort " + indexSort + " order"); + } } - } else if (segmentSort.equals(indexSort) == false) { - throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort + " but to-be-merged segment has sort=" + segmentSort); + } else { + if (segmentSort.equals(indexSort) == false) { + throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort + " but to-be-merged segment has sort=" + segmentSort); + } + if (infoStream.isEnabled("SM")) { + infoStream.message("SM", "segment " + leaf + " already sorted"); + } } readers.add(leaf); diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java b/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java index 573bbe8c6db..42e3f41cdd7 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java @@ -57,8 +57,6 @@ public final class MultiPostingsEnum extends PostingsEnum { return this.parent == parent; } - // nocommit is this class supposed to be aware of index sorting too??? - /** Re-use and reset this instance on the provided slices. */ public MultiPostingsEnum reset(final EnumWithSlice[] subs, final int numSubs) { this.numSubs = numSubs; diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiReader.java b/lucene/core/src/java/org/apache/lucene/index/MultiReader.java index 15d170bd518..8f1bb66ae63 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiReader.java @@ -65,8 +65,6 @@ public class MultiReader extends BaseCompositeReader { } } - // nocommit what if there is an indexSort? - @Override protected synchronized void doClose() throws IOException { IOException ioe = null; diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index 3448c9064c5..6a5eb5a0d38 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -36,7 +36,7 @@ final class MultiSorter { * documents into the merged segment. The documents for each incoming leaf reader must already be sorted by the same sort! 
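+ * <p>Usage sketch (illustrative only; names as in this patch):</p>
+ * <pre>{@code
+ * MergeState.DocMap[] docMaps = MultiSorter.sort(indexSort, readers);
+ * int newDocID = docMaps[i].get(oldDocID); // docID in the merged, sorted segment
+ * }</pre>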
*/ static MergeState.DocMap[] sort(Sort sort, List readers) throws IOException { - // nocommit optimize if only 1 reader is incoming + // TODO: optimize if only 1 reader is incoming, though that's a rare case SortField fields[] = sort.getSort(); final CrossReaderComparator[] comparators = new CrossReaderComparator[fields.length]; diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java index 0cc1823def7..d23f01024d8 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java @@ -48,8 +48,6 @@ final class SegmentMerger { final MergeState mergeState; private final FieldInfos.Builder fieldInfosBuilder; - // nocommit make sure infoStream states per-segment-being-merged if they are already sorted - // note, just like in codec apis Directory 'dir' is NOT the same as segmentInfo.dir!! SegmentMerger(List readers, SegmentInfo segmentInfo, InfoStream infoStream, Directory dir, FieldInfos.FieldNumbers fieldNumbers, IOContext context) throws IOException { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java index d1148ef13a6..9d00c3f42d2 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -39,6 +39,8 @@ import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.BaseDirectoryWrapper; import org.apache.lucene.store.Directory; @@ -1281,4 +1283,53 @@ public class TestAddIndexes extends LuceneTestCase { w2.close(); IOUtils.close(src, dest); } + + public void testIllegalIndexSortChange1() throws Exception { + Directory dir1 = newDirectory(); + IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random())); + iwc1.setIndexSort(new Sort(new SortField("foo", SortField.Type.INT))); + RandomIndexWriter w1 = new RandomIndexWriter(random(), dir1, iwc1); + w1.addDocument(new Document()); + w1.commit(); + w1.addDocument(new Document()); + w1.commit(); + // so the index sort is in fact burned into the index: + w1.forceMerge(1); + w1.close(); + + Directory dir2 = newDirectory(); + IndexWriterConfig iwc2 = newIndexWriterConfig(new MockAnalyzer(random())); + iwc2.setIndexSort(new Sort(new SortField("foo", SortField.Type.STRING))); + RandomIndexWriter w2 = new RandomIndexWriter(random(), dir2, iwc2); + String message = expectThrows(IllegalArgumentException.class, () -> { + w2.addIndexes(dir1); + }).getMessage(); + assertEquals("cannot change index sort from to ", message); + IOUtils.close(dir1, w2, dir2); + } + + public void testIllegalIndexSortChange2() throws Exception { + Directory dir1 = newDirectory(); + IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random())); + iwc1.setIndexSort(new Sort(new SortField("foo", SortField.Type.INT))); + RandomIndexWriter w1 = new RandomIndexWriter(random(), dir1, iwc1); + w1.addDocument(new Document()); + w1.commit(); + w1.addDocument(new Document()); + w1.commit(); + // so the index sort is in fact burned into the index: + w1.forceMerge(1); + w1.close(); + + Directory dir2 = newDirectory(); + 
IndexWriterConfig iwc2 = newIndexWriterConfig(new MockAnalyzer(random())); + iwc2.setIndexSort(new Sort(new SortField("foo", SortField.Type.STRING))); + RandomIndexWriter w2 = new RandomIndexWriter(random(), dir2, iwc2); + IndexReader r1 = DirectoryReader.open(dir1); + String message = expectThrows(IllegalArgumentException.class, () -> { + w2.addIndexes((SegmentReader) getOnlyLeafReader(r1)); + }).getMessage(); + assertEquals("cannot change index sort from to ", message); + IOUtils.close(r1, dir1, w2, dir2); + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocIDMerger.java b/lucene/core/src/test/org/apache/lucene/index/TestDocIDMerger.java index 949deced60c..003db9e4529 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDocIDMerger.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDocIDMerger.java @@ -21,6 +21,7 @@ import java.util.Collections; import java.util.List; import org.apache.lucene.util.Bits; +import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; @@ -147,19 +148,35 @@ public class TestDocIDMerger extends LuceneTestCase { } assertEquals(0, oldToNew.size()); + // sometimes do some deletions: + final FixedBitSet liveDocs; + if (random().nextBoolean()) { + liveDocs = new FixedBitSet(totDocCount); + liveDocs.set(0, totDocCount); + int deleteAttemptCount = TestUtil.nextInt(random(), 1, totDocCount); + for(int i=0;i subs = new ArrayList<>(); for(int i=0;i merger = new DocIDMerger<>(subs, true); int count = 0; @@ -168,12 +185,21 @@ public class TestDocIDMerger extends LuceneTestCase { if (sub == null) { break; } + if (liveDocs != null) { + count = liveDocs.nextSetBit(count); + } assertEquals(count, sub.mappedDocID); count++; } - assertEquals(totDocCount, count); + if (liveDocs != null) { + if (count < totDocCount) { + assertEquals(NO_MORE_DOCS, liveDocs.nextSetBit(count)); + } else { + assertEquals(totDocCount, count); + } + } else { + assertEquals(totDocCount, count); + } } - - // nocommit more tests, e.g. 
deleted docs } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index ba171319d40..4deadd30fe4 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -47,6 +47,7 @@ import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.PointValues.IntersectVisitor; @@ -78,15 +79,6 @@ import org.apache.lucene.util.TestUtil; import org.junit.AfterClass; import org.junit.BeforeClass; -// nocommit test tie break -// nocommit test multiple sorts -// nocommit test update dvs -// nocommit test missing value - -// nocommit test EarlyTerminatingCollector - -// nocommit must test all supported SortField.Type - public class TestIndexSorting extends LuceneTestCase { public void testBasicString() throws Exception { @@ -881,8 +873,13 @@ public class TestIndexSorting extends LuceneTestCase { dir.close(); } - public void testAddIndexes(boolean withDeletes) throws Exception { + public void testAddIndexes(boolean withDeletes, boolean useReaders) throws Exception { Directory dir = newDirectory(); + Sort indexSort = new Sort(new SortField("foo", SortField.Type.LONG)); + IndexWriterConfig iwc1 = newIndexWriterConfig(); + if (random().nextBoolean()) { + iwc1.setIndexSort(indexSort); + } RandomIndexWriter w = new RandomIndexWriter(random(), dir); final int numDocs = atLeast(100); for (int i = 0; i < numDocs; ++i) { @@ -896,19 +893,26 @@ public class TestIndexSorting extends LuceneTestCase { w.deleteDocuments(new Term("id", Integer.toString(i))); } } + if (random().nextBoolean()) { + w.forceMerge(1); + } final IndexReader reader = w.getReader(); + w.close(); Directory dir2 = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); - Sort indexSort = new Sort(new SortField("foo", SortField.Type.LONG)); iwc.setIndexSort(indexSort); IndexWriter w2 = new IndexWriter(dir2, iwc); - CodecReader[] codecReaders = new CodecReader[reader.leaves().size()]; - for (int i = 0; i < codecReaders.length; ++i) { - codecReaders[i] = (CodecReader) reader.leaves().get(i).reader(); + if (useReaders) { + CodecReader[] codecReaders = new CodecReader[reader.leaves().size()]; + for (int i = 0; i < codecReaders.length; ++i) { + codecReaders[i] = (CodecReader) reader.leaves().get(i).reader(); + } + w2.addIndexes(codecReaders); + } else { + w2.addIndexes(dir); } - w2.addIndexes(codecReaders); final IndexReader reader2 = w2.getReader(); final IndexSearcher searcher = newSearcher(reader); final IndexSearcher searcher2 = newSearcher(reader2); @@ -924,15 +928,23 @@ public class TestIndexSorting extends LuceneTestCase { } } - IOUtils.close(reader, reader2, w, w2, dir, dir2); + IOUtils.close(reader, reader2, w2, dir, dir2); } public void testAddIndexes() throws Exception { - testAddIndexes(false); + testAddIndexes(false, true); } public void testAddIndexesWithDeletions() throws Exception { - testAddIndexes(true); + testAddIndexes(true, true); + } + + public void testAddIndexesWithDirectory() throws Exception { + testAddIndexes(false, false); + } + + public void 
testAddIndexesWithDeletionsAndDirectory() throws Exception { + testAddIndexes(true, false); } public void testBadSort() throws Exception { @@ -1126,7 +1138,6 @@ public class TestIndexSorting extends LuceneTestCase { public final float floatValue; public final double doubleValue; public final byte[] bytesValue; - // nocommit postings, points, term vectors public RandomDoc(int id) { this.id = id; @@ -1194,7 +1205,7 @@ public class TestIndexSorting extends LuceneTestCase { if (TEST_NIGHTLY) { numDocs = atLeast(100000); } else { - numDocs = atLeast(1000); + numDocs = atLeast(10000); } List docs = new ArrayList<>(); @@ -1309,4 +1320,39 @@ public class TestIndexSorting extends LuceneTestCase { IOUtils.close(r1, r2, w1, w2, dir1, dir2); } + + public void testTieBreak() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); + iwc.setIndexSort(new Sort(new SortField("foo", SortField.Type.STRING))); + iwc.setMergePolicy(newLogMergePolicy()); + IndexWriter w = new IndexWriter(dir, iwc); + for(int id=0;id<1000;id++) { + Document doc = new Document(); + doc.add(new StoredField("id", id)); + String value; + if (id < 500) { + value = "bar2"; + } else { + value = "bar1"; + } + doc.add(new SortedDocValuesField("foo", new BytesRef(value))); + w.addDocument(doc); + if (id == 500) { + w.commit(); + } + } + w.forceMerge(1); + DirectoryReader r = DirectoryReader.open(w); + for(int docID=0;docID<1000;docID++) { + int expectedID; + if (docID < 500) { + expectedID = 500 + docID; + } else { + expectedID = docID - 500; + } + assertEquals(expectedID, r.document(docID).getField("id").numericValue().intValue()); + } + IOUtils.close(r, w, dir); + } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestEarlyTerminatingSortingCollector.java b/lucene/core/src/test/org/apache/lucene/search/TestEarlyTerminatingSortingCollector.java index 6108992ce67..84d326ff48c 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestEarlyTerminatingSortingCollector.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestEarlyTerminatingSortingCollector.java @@ -90,8 +90,6 @@ public class TestEarlyTerminatingSortingCollector extends LuceneTestCase { } iwc.setMergeScheduler(new SerialMergeScheduler()); // for reproducible tests iwc.setIndexSort(sort); - // nocommit: - iwc.setCodec(Codec.forName("SimpleText")); iw = new RandomIndexWriter(new Random(seed), dir, iwc); iw.setDoRandomForceMerge(false); // don't do this, it may happen anyway with MockRandomMP for (int i = 0; i < numDocs; ++i) { diff --git a/lucene/misc/src/java/org/apache/lucene/search/BlockJoinComparatorSource.java b/lucene/misc/src/java/org/apache/lucene/search/BlockJoinComparatorSource.java deleted file mode 100644 index 7633ff53cd6..00000000000 --- a/lucene/misc/src/java/org/apache/lucene/search/BlockJoinComparatorSource.java +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.search; - -import java.io.IOException; - -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.ReaderUtil; -import org.apache.lucene.util.BitSet; - -// nocommit what to do here? - -/** - * Helper class to sort readers that contain blocks of documents. - *

- * Note that this class is intended to be used with index sorting,
- * and for other purposes has some limitations:
- * <ul>
- *   <li>Cannot yet be used with {@link IndexSearcher#searchAfter(ScoreDoc, Query, int, Sort) IndexSearcher.searchAfter}</li>
- *   <li>Filling sort field values is not yet supported.</li>
- * </ul>
        - * @lucene.experimental - */ -// TODO: can/should we clean this thing up (e.g. return a proper sort value) -// and move to the join/ module? -public class BlockJoinComparatorSource extends FieldComparatorSource { - final Query parentsFilter; - final Sort parentSort; - final Sort childSort; - - /** - * Create a new BlockJoinComparatorSource, sorting only blocks of documents - * with {@code parentSort} and not reordering children with a block. - * - * @param parentsFilter Filter identifying parent documents - * @param parentSort Sort for parent documents - */ - public BlockJoinComparatorSource(Query parentsFilter, Sort parentSort) { - this(parentsFilter, parentSort, new Sort(SortField.FIELD_DOC)); - } - - /** - * Create a new BlockJoinComparatorSource, specifying the sort order for both - * blocks of documents and children within a block. - * - * @param parentsFilter Filter identifying parent documents - * @param parentSort Sort for parent documents - * @param childSort Sort for child documents in the same block - */ - public BlockJoinComparatorSource(Query parentsFilter, Sort parentSort, Sort childSort) { - this.parentsFilter = parentsFilter; - this.parentSort = parentSort; - this.childSort = childSort; - } - - @Override - @SuppressWarnings({"unchecked", "rawtypes"}) - public FieldComparator newComparator(String fieldname, int numHits, int sortPos, boolean reversed) throws IOException { - // we keep parallel slots: the parent ids and the child ids - final int parentSlots[] = new int[numHits]; - final int childSlots[] = new int[numHits]; - - SortField parentFields[] = parentSort.getSort(); - final int parentReverseMul[] = new int[parentFields.length]; - final FieldComparator parentComparators[] = new FieldComparator[parentFields.length]; - for (int i = 0; i < parentFields.length; i++) { - parentReverseMul[i] = parentFields[i].getReverse() ? -1 : 1; - parentComparators[i] = parentFields[i].getComparator(1, i); - } - - SortField childFields[] = childSort.getSort(); - final int childReverseMul[] = new int[childFields.length]; - final FieldComparator childComparators[] = new FieldComparator[childFields.length]; - for (int i = 0; i < childFields.length; i++) { - childReverseMul[i] = childFields[i].getReverse() ? -1 : 1; - childComparators[i] = childFields[i].getComparator(1, i); - } - - // NOTE: we could return parent ID as value but really our sort "value" is more complex... - // So we throw UOE for now. At the moment you really should only use this at indexing time. 
- return new FieldComparator() { - int bottomParent; - int bottomChild; - BitSet parentBits; - LeafFieldComparator[] parentLeafComparators; - LeafFieldComparator[] childLeafComparators; - - @Override - public int compare(int slot1, int slot2) { - try { - return compare(childSlots[slot1], parentSlots[slot1], childSlots[slot2], parentSlots[slot2]); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - @Override - public void setTopValue(Integer value) { - // we dont have enough information (the docid is needed) - throw new UnsupportedOperationException("this comparator cannot be used with deep paging"); - } - - @Override - public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException { - if (parentBits != null) { - throw new IllegalStateException("This comparator can only be used on a single segment"); - } - IndexSearcher searcher = new IndexSearcher(ReaderUtil.getTopLevelContext(context)); - searcher.setQueryCache(null); - final Weight weight = searcher.createNormalizedWeight(parentsFilter, false); - final Scorer parents = weight.scorer(context); - if (parents == null) { - throw new IllegalStateException("LeafReader " + context.reader() + " contains no parents!"); - } - parentBits = BitSet.of(parents.iterator(), context.reader().maxDoc()); - parentLeafComparators = new LeafFieldComparator[parentComparators.length]; - for (int i = 0; i < parentComparators.length; i++) { - parentLeafComparators[i] = parentComparators[i].getLeafComparator(context); - } - childLeafComparators = new LeafFieldComparator[childComparators.length]; - for (int i = 0; i < childComparators.length; i++) { - childLeafComparators[i] = childComparators[i].getLeafComparator(context); - } - - return new LeafFieldComparator() { - - @Override - public int compareBottom(int doc) throws IOException { - return compare(bottomChild, bottomParent, doc, parent(doc)); - } - - @Override - public int compareTop(int doc) throws IOException { - // we dont have enough information (the docid is needed) - throw new UnsupportedOperationException("this comparator cannot be used with deep paging"); - } - - @Override - public void copy(int slot, int doc) throws IOException { - childSlots[slot] = doc; - parentSlots[slot] = parent(doc); - } - - @Override - public void setBottom(int slot) { - bottomParent = parentSlots[slot]; - bottomChild = childSlots[slot]; - } - - @Override - public void setScorer(Scorer scorer) { - for (LeafFieldComparator comp : parentLeafComparators) { - comp.setScorer(scorer); - } - for (LeafFieldComparator comp : childLeafComparators) { - comp.setScorer(scorer); - } - } - - }; - } - - @Override - public Integer value(int slot) { - // really our sort "value" is more complex... 
- throw new UnsupportedOperationException("filling sort field values is not yet supported"); - } - - int parent(int doc) { - return parentBits.nextSetBit(doc); - } - - int compare(int docID1, int parent1, int docID2, int parent2) throws IOException { - if (parent1 == parent2) { // both are in the same block - if (docID1 == parent1 || docID2 == parent2) { - // keep parents at the end of blocks - return docID1 - docID2; - } else { - return compare(docID1, docID2, childLeafComparators, childReverseMul); - } - } else { - int cmp = compare(parent1, parent2, parentLeafComparators, parentReverseMul); - if (cmp == 0) { - return parent1 - parent2; - } else { - return cmp; - } - } - } - - int compare(int docID1, int docID2, LeafFieldComparator comparators[], int reverseMul[]) throws IOException { - for (int i = 0; i < comparators.length; i++) { - // TODO: would be better if copy() didnt cause a term lookup in TermOrdVal & co, - // the segments are always the same here... - comparators[i].copy(0, docID1); - comparators[i].setBottom(0); - int comp = reverseMul[i] * comparators[i].compareBottom(docID2); - if (comp != 0) { - return comp; - } - } - return 0; // no need to docid tiebreak - } - }; - } - - @Override - public String toString() { - return "blockJoin(parentSort=" + parentSort + ",childSort=" + childSort + ")"; - } -} diff --git a/lucene/misc/src/test/org/apache/lucene/index/TestBlockJoinSorter.java b/lucene/misc/src/test/org/apache/lucene/index/TestBlockJoinSorter.java deleted file mode 100644 index 4a0d2b5a594..00000000000 --- a/lucene/misc/src/test/org/apache/lucene/index/TestBlockJoinSorter.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.index; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.apache.lucene.analysis.MockAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field.Store; -import org.apache.lucene.document.NumericDocValuesField; -import org.apache.lucene.document.StringField; -import org.apache.lucene.search.BlockJoinComparatorSource; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Sort; -import org.apache.lucene.search.SortField; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.Weight; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.BitSet; -import org.apache.lucene.util.LuceneTestCase; - -public class TestBlockJoinSorter extends LuceneTestCase { - - public void test() throws IOException { - final int numParents = atLeast(200); - IndexWriterConfig cfg = newIndexWriterConfig(new MockAnalyzer(random())); - cfg.setMergePolicy(newLogMergePolicy()); - final RandomIndexWriter writer = new RandomIndexWriter(random(), newDirectory(), cfg); - final Document parentDoc = new Document(); - final NumericDocValuesField parentVal = new NumericDocValuesField("parent_val", 0L); - parentDoc.add(parentVal); - final StringField parent = new StringField("parent", "true", Store.YES); - parentDoc.add(parent); - for (int i = 0; i < numParents; ++i) { - List documents = new ArrayList<>(); - final int numChildren = random().nextInt(10); - for (int j = 0; j < numChildren; ++j) { - final Document childDoc = new Document(); - childDoc.add(new NumericDocValuesField("child_val", random().nextInt(5))); - documents.add(childDoc); - } - parentVal.setLongValue(random().nextInt(50)); - documents.add(parentDoc); - writer.addDocuments(documents); - } - writer.forceMerge(1); - IndexReader indexReader = writer.getReader(); - writer.close(); - - IndexSearcher searcher = newSearcher(indexReader); - indexReader = searcher.getIndexReader(); // newSearcher may have wrapped it - assertEquals(1, indexReader.leaves().size()); - final LeafReader reader = indexReader.leaves().get(0).reader(); - final Query parentsFilter = new TermQuery(new Term("parent", "true")); - - final Weight weight = searcher.createNormalizedWeight(parentsFilter, false); - final Scorer parents = weight.scorer(indexReader.leaves().get(0)); - final BitSet parentBits = BitSet.of(parents.iterator(), reader.maxDoc()); - final NumericDocValues parentValues = reader.getNumericDocValues("parent_val"); - final NumericDocValues childValues = reader.getNumericDocValues("child_val"); - - final Sort parentSort = new Sort(new SortField("parent_val", SortField.Type.LONG)); - final Sort childSort = new Sort(new SortField("child_val", SortField.Type.LONG)); - - final Sort sort = new Sort(new SortField("custom", new BlockJoinComparatorSource(parentsFilter, parentSort, childSort))); - final Sorter sorter = new Sorter(sort); - final Sorter.DocMap docMap = sorter.sort(reader); - assertEquals(reader.maxDoc(), docMap.size()); - - int[] children = new int[1]; - int numChildren = 0; - int previousParent = -1; - for (int i = 0; i < docMap.size(); ++i) { - final int oldID = docMap.newToOld(i); - if (parentBits.get(oldID)) { - // check that we have the right children - for (int j = 0; j < numChildren; ++j) { - assertEquals(oldID, parentBits.nextSetBit(children[j])); - } - // check that children are sorted - for (int j = 1; j < 
numChildren; ++j) { - final int doc1 = children[j-1]; - final int doc2 = children[j]; - if (childValues.get(doc1) == childValues.get(doc2)) { - assertTrue(doc1 < doc2); // sort is stable - } else { - assertTrue(childValues.get(doc1) < childValues.get(doc2)); - } - } - // check that parents are sorted - if (previousParent != -1) { - if (parentValues.get(previousParent) == parentValues.get(oldID)) { - assertTrue(previousParent < oldID); - } else { - assertTrue(parentValues.get(previousParent) < parentValues.get(oldID)); - } - } - // reset - previousParent = oldID; - numChildren = 0; - } else { - children = ArrayUtil.grow(children, numChildren+1); - children[numChildren++] = oldID; - } - } - indexReader.close(); - writer.w.getDirectory().close(); - } - -} From 8361de87becd64c8b217313877b996ac20167856 Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Tue, 10 May 2016 05:31:02 -0400 Subject: [PATCH 13/16] LUCENE-6766: fix parallel reader's detection of conflicting index sort --- .../org/apache/lucene/index/DocValues.java | 2 +- .../lucene/index/ParallelLeafReader.java | 7 ++- .../lucene/index/TestParallelLeafReader.java | 59 ++++++++++++++++++- 3 files changed, 63 insertions(+), 5 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValues.java b/lucene/core/src/java/org/apache/lucene/index/DocValues.java index 15b15c6c72b..4de42387042 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocValues.java @@ -210,7 +210,7 @@ public final class DocValues { (expected.length == 1 ? "(expected=" + expected[0] : "(expected one of " + Arrays.toString(expected)) + "). " + - " Re-index with correct docvalues type."); + "Re-index with correct docvalues type."); } } diff --git a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java index ef9f28cf666..d85ff2d0fa2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java @@ -107,10 +107,11 @@ public class ParallelLeafReader extends LeafReader { // build FieldInfos and fieldToReader map: for (final LeafReader reader : this.parallelReaders) { + Sort leafIndexSort = reader.getIndexSort(); if (indexSort == null) { - indexSort = reader.getIndexSort(); - } else if (indexSort.equals(reader.getIndexSort()) == false) { - throw new IllegalArgumentException("cannot combine LeafReaders that have different index sorts: saw both sort=" + indexSort + " and " + reader.getIndexSort()); + indexSort = leafIndexSort; + } else if (leafIndexSort != null && indexSort.equals(leafIndexSort) == false) { + throw new IllegalArgumentException("cannot combine LeafReaders that have different index sorts: saw both sort=" + indexSort + " and " + leafIndexSort); } final FieldInfos readerFieldInfos = reader.getFieldInfos(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestParallelLeafReader.java b/lucene/core/src/test/org/apache/lucene/index/TestParallelLeafReader.java index f7f401fca9a..35523f352a7 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestParallelLeafReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestParallelLeafReader.java @@ -23,10 +23,11 @@ import java.util.Random; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.search.BooleanClause.Occur; import 
org.apache.lucene.search.*; +import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; @@ -314,4 +315,60 @@ public class TestParallelLeafReader extends LuceneTestCase { return dir2; } + + // not ok to have one leaf w/ index sort and another with a different index sort + public void testWithIndexSort1() throws Exception { + Directory dir1 = newDirectory(); + IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random())); + iwc1.setIndexSort(new Sort(new SortField("foo", SortField.Type.INT))); + IndexWriter w1 = new IndexWriter(dir1, iwc1); + w1.addDocument(new Document()); + w1.commit(); + w1.addDocument(new Document()); + w1.forceMerge(1); + w1.close(); + IndexReader r1 = DirectoryReader.open(dir1); + + Directory dir2 = newDirectory(); + IndexWriterConfig iwc2 = newIndexWriterConfig(new MockAnalyzer(random())); + iwc2.setIndexSort(new Sort(new SortField("bar", SortField.Type.INT))); + IndexWriter w2 = new IndexWriter(dir2, iwc2); + w2.addDocument(new Document()); + w2.commit(); + w2.addDocument(new Document()); + w2.forceMerge(1); + w2.close(); + IndexReader r2 = DirectoryReader.open(dir2); + + String message = expectThrows(IllegalArgumentException.class, () -> { + new ParallelLeafReader(getOnlyLeafReader(r1), getOnlyLeafReader(r2)); + }).getMessage(); + assertEquals("cannot combine LeafReaders that have different index sorts: saw both sort=<int: \"foo\"> and <int: \"bar\">", message); + IOUtils.close(r1, dir1, r2, dir2); + } + + // ok to have one leaf w/ index sort and the other with no sort + public void testWithIndexSort2() throws Exception { + Directory dir1 = newDirectory(); + IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random())); + iwc1.setIndexSort(new Sort(new SortField("foo", SortField.Type.INT))); + IndexWriter w1 = new IndexWriter(dir1, iwc1); + w1.addDocument(new Document()); + w1.commit(); + w1.addDocument(new Document()); + w1.forceMerge(1); + w1.close(); + IndexReader r1 = DirectoryReader.open(dir1); + + Directory dir2 = newDirectory(); + IndexWriterConfig iwc2 = newIndexWriterConfig(new MockAnalyzer(random())); + IndexWriter w2 = new IndexWriter(dir2, iwc2); + w2.addDocument(new Document()); + w2.addDocument(new Document()); + w2.close(); + + IndexReader r2 = DirectoryReader.open(dir2); + new ParallelLeafReader(false, getOnlyLeafReader(r1), getOnlyLeafReader(r2)).close(); + new ParallelLeafReader(false, getOnlyLeafReader(r2), getOnlyLeafReader(r1)).close(); + IOUtils.close(r1, dir1, r2, dir2); + } } From e283271aaf6da3033156f36b421d3241b5499d4e Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Thu, 12 May 2016 15:25:11 -0400 Subject: [PATCH 14/16] LUCENE-6766: more IW.infoStream logging around sorting; fix test bug --- .../org/apache/lucene/index/CheckIndex.java | 2 +- .../org/apache/lucene/index/DocIDMerger.java | 3 ++- .../org/apache/lucene/index/MergeState.java | 17 +++++++++--- .../apache/lucene/index/TestIndexSorting.java | 27 ++++++++++++++++--- 4 files changed, 40 insertions(+), 9 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 1031d22e22a..d752c257c1e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -831,7 +831,7 @@ public final class CheckIndex
implements Closeable { if (sort != null) { if (infoStream != null) { - infoStream.print(" test: check index sort....."); + infoStream.print(" test: index sort.........."); } SortField fields[] = sort.getSort(); diff --git a/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java b/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java index e8ffc6c955e..84f08c7cf6c 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java @@ -57,7 +57,8 @@ public class DocIDMerger<T extends DocIDMerger.Sub> { public DocIDMerger(List<T> subs, int maxCount, boolean indexIsSorted) { this.subs = subs; - if (indexIsSorted) { + // nocommit safe? + if (indexIsSorted && maxCount > 1) { queue = new PriorityQueue<Sub>(maxCount) { @Override protected boolean lessThan(Sub a, Sub b) { diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java index 7737ff2cc40..12310c6668d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.Locale; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.FieldsProducer; @@ -185,7 +186,13 @@ public class MergeState { } else { // do a merge sort of the incoming leaves: - return MultiSorter.sort(indexSort, readers); + long t0 = System.nanoTime(); + DocMap[] result = MultiSorter.sort(indexSort, readers); + long t1 = System.nanoTime(); + if (infoStream.isEnabled("SM")) { + infoStream.message("SM", String.format(Locale.ROOT, "%.2f msec to build merge sorted DocMaps", (t1-t0)/1000000.0)); + } + return result; } } @@ -218,10 +225,14 @@ public class MergeState { // to their index files on each indexed document: // This segment was written by flush, so documents are not yet sorted, so we sort them now: + long t0 = System.nanoTime(); Sorter.DocMap sortDocMap = sorter.sort(leaf); + long t1 = System.nanoTime(); + double msec = (t1-t0)/1000000.0; + if (sortDocMap != null) { if (infoStream.isEnabled("SM")) { - infoStream.message("SM", "segment " + leaf + " is not sorted; wrapping for sort " + indexSort + " now"); + infoStream.message("SM", String.format(Locale.ROOT, "segment %s is not sorted; wrapping for sort %s now (%.2f msec to sort)", leaf, indexSort, msec)); } leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(new MergeReaderWrapper(leaf), sortDocMap)); leafDocMaps[readers.size()] = new DocMap() { @@ -232,7 +243,7 @@ public class MergeState { }; } else { if (infoStream.isEnabled("SM")) { - infoStream.message("SM", "segment " + leaf + " is not sorted, but is already accidentally in sort " + indexSort + " order"); + infoStream.message("SM", String.format(Locale.ROOT, "segment %s is not sorted, but is already accidentally in sort %s order (%.2f msec to sort)", leaf, indexSort, msec)); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 4deadd30fe4..4e775f3e5da 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -30,8 +30,11 @@ import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.concurrent.atomic.AtomicInteger; +import org.apache.lucene.analysis.Analyzer; import
org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; @@ -1053,6 +1056,14 @@ public class TestIndexSorting extends LuceneTestCase { TERM_VECTORS_TYPE.setStoreTermVectors(true); TERM_VECTORS_TYPE.freeze(); + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(); + return new TokenStreamComponents(tokenizer, tokenizer); + } + }; + List docs = new ArrayList<>(); for (int i=0;i Date: Thu, 12 May 2016 15:57:22 -0400 Subject: [PATCH 15/16] LUCENE-6766: remove leftover sop --- .../src/java/org/apache/lucene/util/LuceneTestCase.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java index 52aca7e8f52..98cd2a790ea 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java @@ -2008,9 +2008,9 @@ public abstract class LuceneTestCase extends Assert { return; } assertTermsStatisticsEquals(info, leftTerms, rightTerms); - assertEquals(leftTerms.hasOffsets(), rightTerms.hasOffsets()); - assertEquals(leftTerms.hasPositions(), rightTerms.hasPositions()); - assertEquals(leftTerms.hasPayloads(), rightTerms.hasPayloads()); + assertEquals("hasOffsets", leftTerms.hasOffsets(), rightTerms.hasOffsets()); + assertEquals("hasPositions", leftTerms.hasPositions(), rightTerms.hasPositions()); + assertEquals("hasPayloads", leftTerms.hasPayloads(), rightTerms.hasPayloads()); TermsEnum leftTermsEnum = leftTerms.iterator(); TermsEnum rightTermsEnum = rightTerms.iterator(); From 3cde9eb3d027b273a3c136e9eb284ae18f1824fe Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Fri, 13 May 2016 04:29:48 -0400 Subject: [PATCH 16/16] LUCENE-6766: keep SortingMergePolicy for solr back-compat; fix Solr tests; fix precommit failures --- .../lucene/codecs/lucene60/package.html | 25 +++++++ .../org/apache/lucene/index/CheckIndex.java | 6 ++ .../org/apache/lucene/index/DocIDMerger.java | 7 +- .../org/apache/lucene/index/MergeState.java | 4 ++ .../lucene/index/MultiPostingsEnum.java | 3 + .../org/apache/lucene/index/MultiSorter.java | 3 +- .../index/SlowCompositeReaderWrapper.java | 5 -- .../java/org/apache/lucene/index/package.html | 18 ----- .../apache/solr/core/SchemaCodecFactory.java | 6 +- .../apache/solr/index/SortingMergePolicy.java | 65 +++++++++++++++++++ .../solr/index/SortingMergePolicyFactory.java | 1 - .../apache/solr/search/SolrIndexSearcher.java | 2 +- .../solr/update/DefaultSolrCoreState.java | 4 +- .../apache/solr/update/SolrIndexConfig.java | 17 +++-- .../org/apache/solr/search/TestDocSet.java | 8 ++- .../solr/update/SolrIndexConfigTest.java | 2 +- 16 files changed, 138 insertions(+), 38 deletions(-) create mode 100644 lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/package.html create mode 100644 solr/core/src/java/org/apache/solr/index/SortingMergePolicy.java diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/package.html 
b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/package.html new file mode 100644 index 00000000000..6b4e234826d --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/package.html @@ -0,0 +1,25 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<html> +<body> + +Lucene 6.0 file format. + +</body> +</html> diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index d752c257c1e..9dee2d14e1c 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -385,6 +385,8 @@ public final class CheckIndex implements Closeable { * Status from testing index sort */ public static final class IndexSortStatus { + IndexSortStatus() { + } /** Exception thrown during term index test (null on success) */ public Throwable error = null; @@ -822,6 +824,10 @@ public final class CheckIndex implements Closeable { return result; } + /** + * Tests index sort order. + * @lucene.experimental + */ public static Status.IndexSortStatus testSort(CodecReader reader, Sort sort, PrintStream infoStream, boolean failFast) throws IOException { // This segment claims its documents are sorted according to the incoming sort ... let's make sure: diff --git a/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java b/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java index 84f08c7cf6c..07c9e725270 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java @@ -42,10 +42,14 @@ public class DocIDMerger<T extends DocIDMerger.Sub> { private T current; private int nextIndex; + /** Represents one sub-reader being merged */ public static abstract class Sub { + /** Mapped doc ID */ public int mappedDocID; + final MergeState.DocMap docMap; + /** Sole constructor */ public Sub(MergeState.DocMap docMap) { this.docMap = docMap; } @@ -54,10 +58,10 @@ public class DocIDMerger<T extends DocIDMerger.Sub> { public abstract int nextDoc(); } + /** Construct this from the provided subs, specifying the maximum sub count */ public DocIDMerger(List<T> subs, int maxCount, boolean indexIsSorted) { this.subs = subs; - // nocommit safe? if (indexIsSorted && maxCount > 1) { queue = new PriorityQueue<Sub>(maxCount) { @Override @@ -74,6 +78,7 @@ public class DocIDMerger<T extends DocIDMerger.Sub> { reset(); } + /** Construct this from the provided subs */ public DocIDMerger(List<T> subs, boolean indexIsSorted) { this(subs, subs.size(), indexIsSorted); } diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java index 12310c6668d..3723f19476f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java @@ -264,6 +264,10 @@ public class MergeState { /** A map of doc IDs. */ public static abstract class DocMap { + /** Sole constructor */ + public DocMap() { + } + /** Return the mapped docID or -1 if the given doc is not mapped.
*/ public abstract int get(int docID); } diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java b/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java index 42e3f41cdd7..062fc303c09 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java @@ -171,6 +171,9 @@ public final class MultiPostingsEnum extends PostingsEnum { /** {@link ReaderSlice} describing how this sub-reader * fits into the composite reader. */ public ReaderSlice slice; + + EnumWithSlice() { + } @Override public String toString() { diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index 6a5eb5a0d38..8f5be86ede2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import org.apache.lucene.index.MergeState.DocMap; import org.apache.lucene.index.MergeState; import org.apache.lucene.search.LeafFieldComparator; import org.apache.lucene.search.Sort; @@ -32,7 +33,7 @@ import org.apache.lucene.util.packed.PackedLongValues; final class MultiSorter { - /** Does a merge sort of the leaves of the incoming reader, returning {@link MergeState#DocMap} to map each leaf's + /** Does a merge sort of the leaves of the incoming reader, returning {@link DocMap} to map each leaf's * documents into the merged segment. The documents for each incoming leaf reader must already be sorted by the same sort! */ static MergeState.DocMap[] sort(Sort sort, List<CodecReader> readers) throws IOException { diff --git a/lucene/misc/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java b/lucene/misc/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java index 6cd990b00fe..de711fda460 100644 --- a/lucene/misc/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java +++ b/lucene/misc/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java @@ -68,11 +68,6 @@ public final class SlowCompositeReaderWrapper extends LeafReader { if (getFieldInfos().hasPointValues()) { throw new IllegalArgumentException("cannot wrap points"); } - for(LeafReaderContext context : reader.leaves()) { - if (context.reader().getIndexSort() != null) { - throw new IllegalArgumentException("cannot use index sort"); - } - } fields = MultiFields.getFields(in); in.registerParentReader(this); this.merging = merging; diff --git a/lucene/misc/src/java/org/apache/lucene/index/package.html b/lucene/misc/src/java/org/apache/lucene/index/package.html index dc9cbb7b676..33ce964eaf2 100644 --- a/lucene/misc/src/java/org/apache/lucene/index/package.html +++ b/lucene/misc/src/java/org/apache/lucene/index/package.html @@ -18,23 +18,5 @@ Misc index tools and index support. - -SortingMergePolicy: -

        Provides index sorting capablities. The application can use any -Sort specification, e.g. to sort by fields using DocValues or FieldCache, or to -reverse the order of the documents (by using SortField.Type.DOC in reverse). -Multi-level sorts can be specified the same way you would when searching, by -building Sort from multiple SortFields. - -
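The removed javadoc paragraph above still describes the feature accurately; only the entry point changes. With LUCENE-6766 the same kind of multi-level Sort is handed to IndexWriterConfig rather than to a merge policy. A minimal sketch, assuming lucene-analyzers-common on the classpath; the field names and the timestamp value are hypothetical:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;

public class IndexSortExample {
  public static void main(String[] args) throws Exception {
    // Multi-level sort: by country, then most-recent timestamp first.
    Sort indexSort = new Sort(
        new SortField("country", SortField.Type.STRING),
        new SortField("timestamp", SortField.Type.LONG, /* reverse */ true));

    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    iwc.setIndexSort(indexSort); // segments are written, and merged, in this order

    try (Directory dir = new RAMDirectory();
         IndexWriter w = new IndexWriter(dir, iwc)) {
      Document doc = new Document();
      // The sort fields must be indexed as doc values:
      doc.add(new SortedDocValuesField("country", new BytesRef("fr")));
      doc.add(new NumericDocValuesField("timestamp", 1462345678L));
      w.addDocument(doc);
      w.forceMerge(1);
    }
  }
}

As the deleted text says, any SortField combination usable for searching works here, provided the sort fields carry doc values.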

        {@link org.apache.lucene.index.SortingMergePolicy} can be used to -make Lucene sort segments before merging them. This will ensure that every -segment resulting from a merge will be sorted according to the provided -{@link org.apache.lucene.search.Sort}. This however makes merging and -thus indexing slower. - -
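The paragraph above is retired along with the Lucene-side SortingMergePolicy, but the Solr-side wrapper added later in this patch (org.apache.solr.index.SortingMergePolicy) keeps the old configuration surface: it merely carries the Sort, which SolrIndexConfig copies onto IndexWriterConfig. A sketch of that bridge, assuming TieredMergePolicy as a hypothetical base policy:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.TieredMergePolicy;
import org.apache.lucene.search.Sort;
import org.apache.solr.index.SortingMergePolicy;

public class SortingMergePolicyBridge {
  /** Builds an IndexWriterConfig whose index sort is taken from a
   *  SortingMergePolicy, mirroring the SolrIndexConfig change in this patch. */
  public static IndexWriterConfig configure(Analyzer analyzer, Sort indexSort) {
    MergePolicy mergePolicy = new SortingMergePolicy(new TieredMergePolicy(), indexSort);
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    iwc.setMergePolicy(mergePolicy);
    // With LUCENE-6766 the sort lives on IndexWriterConfig, not on the merge policy:
    if (mergePolicy instanceof SortingMergePolicy) {
      iwc.setIndexSort(((SortingMergePolicy) mergePolicy).getSort());
    }
    return iwc;
  }
}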

        Sorted segments allow for early query termination when the sort order -matches index order. This makes query execution faster since not all documents -need to be visited. Please note that this is an expert feature and should not -be used without a deep understanding of Lucene merging and document collection. diff --git a/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java b/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java index dc423d99212..c575ecbca10 100644 --- a/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java +++ b/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java @@ -24,9 +24,9 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode; -import org.apache.lucene.codecs.lucene60.Lucene60Codec; -import org.apache.solr.common.SolrException; +import org.apache.lucene.codecs.lucene62.Lucene62Codec; import org.apache.solr.common.SolrException.ErrorCode; +import org.apache.solr.common.SolrException; import org.apache.solr.common.util.NamedList; import org.apache.solr.schema.SchemaField; import org.apache.solr.util.plugin.SolrCoreAware; @@ -91,7 +91,7 @@ public class SchemaCodecFactory extends CodecFactory implements SolrCoreAware { compressionMode = SOLR_DEFAULT_COMPRESSION_MODE; log.info("Using default compressionMode: " + compressionMode); } - codec = new Lucene60Codec(compressionMode) { + codec = new Lucene62Codec(compressionMode) { @Override public PostingsFormat getPostingsFormatForField(String field) { final SchemaField schemaField = core.getLatestSchema().getFieldOrNull(field); diff --git a/solr/core/src/java/org/apache/solr/index/SortingMergePolicy.java b/solr/core/src/java/org/apache/solr/index/SortingMergePolicy.java new file mode 100644 index 00000000000..b58d0a4ba16 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/index/SortingMergePolicy.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.index; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.MergePolicy; +import org.apache.lucene.index.MergePolicyWrapper; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.MergeTrigger; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.index.SegmentCommitInfo; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentInfos; +import org.apache.lucene.index.SegmentReader; +import org.apache.lucene.index.SlowCompositeReaderWrapper; +import org.apache.lucene.search.Sort; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.InfoStream; +import org.apache.lucene.util.packed.PackedInts; +import org.apache.lucene.util.packed.PackedLongValues; + +// TODO: remove this and add indexSort specification directly to solrconfig.xml? But for BWC, also accept SortingMergePolicy specification? + +public final class SortingMergePolicy extends MergePolicyWrapper { + + private final Sort sort; + + /** Create a new {@code MergePolicy} that sorts documents with the given {@code sort}. */ + public SortingMergePolicy(MergePolicy in, Sort sort) { + super(in); + this.sort = sort; + } + + /** Return the {@link Sort} order that is used to sort segments when merging. */ + public Sort getSort() { + return sort; + } + + @Override + public String toString() { + return "SortingMergePolicy(" + in + ", sort=" + sort + ")"; + } +} diff --git a/solr/core/src/java/org/apache/solr/index/SortingMergePolicyFactory.java b/solr/core/src/java/org/apache/solr/index/SortingMergePolicyFactory.java index 53190b5f4f9..b22df3b3f97 100644 --- a/solr/core/src/java/org/apache/solr/index/SortingMergePolicyFactory.java +++ b/solr/core/src/java/org/apache/solr/index/SortingMergePolicyFactory.java @@ -17,7 +17,6 @@ package org.apache.solr.index; import org.apache.lucene.index.MergePolicy; -import org.apache.lucene.index.SortingMergePolicy; import org.apache.lucene.search.Sort; import org.apache.solr.core.SolrResourceLoader; diff --git a/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java b/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java index 6ff54699a8a..5fe336ba192 100644 --- a/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java +++ b/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java @@ -237,7 +237,7 @@ public class SolrIndexSearcher extends IndexSearcher implements Closeable, SolrI !EarlyTerminatingSortingCollector.canEarlyTerminate(cmdSort, mergeSort)) { log.warn("unsupported combination: segmentTerminateEarly=true cmdSort={} cmdLen={} mergeSort={}", cmdSort, cmdLen, mergeSort); } else { - collector = earlyTerminatingSortingCollector = new EarlyTerminatingSortingCollector(collector, cmdSort, cmd.getLen(), mergeSort); + collector = earlyTerminatingSortingCollector = new EarlyTerminatingSortingCollector(collector, cmdSort, cmd.getLen()); } } diff --git a/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java b/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java index 8eab83f52e8..a29d57dcb5f 100644 --- a/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java +++ b/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java @@ -29,16 +29,16 @@ import
java.util.concurrent.locks.ReentrantReadWriteLock; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.MergePolicy; -import org.apache.lucene.index.SortingMergePolicy; import org.apache.lucene.search.Sort; import org.apache.solr.cloud.ActionThrottle; import org.apache.solr.cloud.RecoveryStrategy; -import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; +import org.apache.solr.common.SolrException; import org.apache.solr.core.CoreContainer; import org.apache.solr.core.CoreDescriptor; import org.apache.solr.core.DirectoryFactory; import org.apache.solr.core.SolrCore; +import org.apache.solr.index.SortingMergePolicy; import org.apache.solr.logging.MDCLoggingContext; import org.apache.solr.util.RefCounted; import org.slf4j.Logger; diff --git a/solr/core/src/java/org/apache/solr/update/SolrIndexConfig.java b/solr/core/src/java/org/apache/solr/update/SolrIndexConfig.java index bc2d6545210..696b3ed38be 100644 --- a/solr/core/src/java/org/apache/solr/update/SolrIndexConfig.java +++ b/solr/core/src/java/org/apache/solr/update/SolrIndexConfig.java @@ -16,8 +16,7 @@ */ package org.apache.solr.update; -import static org.apache.solr.core.Config.assertWarnOrFail; - +import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.List; import java.util.Map; @@ -31,6 +30,7 @@ import org.apache.lucene.index.LogMergePolicy; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.index.MergeScheduler; import org.apache.lucene.index.TieredMergePolicy; +import org.apache.lucene.search.Sort; import org.apache.lucene.util.InfoStream; import org.apache.lucene.util.Version; import org.apache.solr.common.util.NamedList; @@ -44,11 +44,14 @@ import org.apache.solr.core.SolrResourceLoader; import org.apache.solr.index.DefaultMergePolicyFactory; import org.apache.solr.index.MergePolicyFactory; import org.apache.solr.index.MergePolicyFactoryArgs; +import org.apache.solr.index.SortingMergePolicy; import org.apache.solr.schema.IndexSchema; import org.apache.solr.util.SolrPluginUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static org.apache.solr.core.Config.assertWarnOrFail; + /** * This config object encapsulates IndexWriter config params, * defined in the <indexConfig> section of solrconfig.xml @@ -222,7 +225,7 @@ public class SolrIndexConfig implements MapSerializable { } } - public IndexWriterConfig toIndexWriterConfig(SolrCore core) { + public IndexWriterConfig toIndexWriterConfig(SolrCore core) throws IOException { IndexSchema schema = core.getLatestSchema(); IndexWriterConfig iwc = new IndexWriterConfig(new DelayedSchemaAnalyzer(core)); if (maxBufferedDocs != -1) @@ -232,10 +235,16 @@ public class SolrIndexConfig implements MapSerializable { iwc.setRAMBufferSizeMB(ramBufferSizeMB); iwc.setSimilarity(schema.getSimilarity()); - iwc.setMergePolicy(buildMergePolicy(schema)); + MergePolicy mergePolicy = buildMergePolicy(schema); + iwc.setMergePolicy(mergePolicy); iwc.setMergeScheduler(buildMergeScheduler(schema)); iwc.setInfoStream(infoStream); + if (mergePolicy instanceof SortingMergePolicy) { + Sort indexSort = ((SortingMergePolicy) mergePolicy).getSort(); + iwc.setIndexSort(indexSort); + } + // do this after buildMergePolicy since the backcompat logic // there may modify the effective useCompoundFile iwc.setUseCompoundFile(getUseCompoundFile()); diff --git a/solr/core/src/test/org/apache/solr/search/TestDocSet.java b/solr/core/src/test/org/apache/solr/search/TestDocSet.java index 
9c46d5baa57..cdddd86e7ec 100644 --- a/solr/core/src/test/org/apache/solr/search/TestDocSet.java +++ b/solr/core/src/test/org/apache/solr/search/TestDocSet.java @@ -22,7 +22,6 @@ import java.util.List; import java.util.Random; import org.apache.lucene.index.BinaryDocValues; -import org.apache.lucene.index.PointValues; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Fields; @@ -32,12 +31,14 @@ import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.MultiReader; import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.PointValues; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Sort; import org.apache.lucene.util.BitSetIterator; import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; @@ -467,6 +468,11 @@ public class TestDocSet extends LuceneTestCase { @Override public void checkIntegrity() throws IOException { } + + @Override + public Sort getIndexSort() { + return null; + } }; } diff --git a/solr/core/src/test/org/apache/solr/update/SolrIndexConfigTest.java b/solr/core/src/test/org/apache/solr/update/SolrIndexConfigTest.java index ffb495e9f4b..08a9037608a 100644 --- a/solr/core/src/test/org/apache/solr/update/SolrIndexConfigTest.java +++ b/solr/core/src/test/org/apache/solr/update/SolrIndexConfigTest.java @@ -23,7 +23,6 @@ import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.index.SimpleMergedSegmentWarmer; -import org.apache.lucene.index.SortingMergePolicy; import org.apache.lucene.index.TieredMergePolicy; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; @@ -31,6 +30,7 @@ import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.core.DirectoryFactory; import org.apache.solr.core.SolrConfig; import org.apache.solr.core.TestMergePolicyConfig; +import org.apache.solr.index.SortingMergePolicy; import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.IndexSchemaFactory; import org.junit.BeforeClass;
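The SolrIndexSearcher hunk above drops the merge-policy Sort argument from EarlyTerminatingSortingCollector because, after this patch series, each segment's sort is recorded in the index itself and read back per leaf. A usage sketch of the new three-argument form; the helper method and its parameters are hypothetical, and "indexSort" must match the sort the index was written with:

import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.EarlyTerminatingSortingCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.store.Directory;

public class EarlyTerminationExample {
  /** Collects the first n hits in index-sort order, terminating collection
   *  early on every segment whose sort matches the query sort. */
  public static TopDocs firstN(Directory dir, Sort indexSort, int n) throws IOException {
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
      IndexSearcher searcher = new IndexSearcher(reader);
      TopFieldCollector topCollector = TopFieldCollector.create(indexSort, n, true, false, false);
      // New 3-arg constructor from this patch: no merge-policy Sort needed,
      // since segment sorts are now read from the index.
      searcher.search(new MatchAllDocsQuery(),
                      new EarlyTerminatingSortingCollector(topCollector, indexSort, n));
      return topCollector.topDocs();
    }
  }
}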