From 6c3c6bc3797307efa13cae06778d41f24a26bccb Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Tue, 15 Nov 2016 16:22:51 -0500 Subject: [PATCH] LUCENE-7537: Index time sorting now supports multi-valued sorts using selectors (MIN, MAX, etc.) --- lucene/CHANGES.txt | 3 + .../SimpleTextSegmentInfoFormat.java | 148 ++- .../lucene62/Lucene62SegmentInfoFormat.java | 118 ++- .../lucene/index/IndexWriterConfig.java | 3 +- .../org/apache/lucene/index/MultiSorter.java | 68 +- .../java/org/apache/lucene/index/Sorter.java | 102 +-- .../lucene/search/SortedNumericSortField.java | 5 + .../apache/lucene/index/TestIndexSorting.java | 854 +++++++++++++++++- .../index/BaseSegmentInfoFormatTestCase.java | 91 +- 9 files changed, 1220 insertions(+), 172 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a6c6dbedaeb..bdc118b95f2 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -90,6 +90,9 @@ Improvements which can be overridden to return false to eek out more speed in some cases. (Timothy M. Rodriguez, David Smiley) +* LUCENE-7537: Index time sorting now supports multi-valued sorts + using selectors (MIN, MAX, etc.) (Jim Ferenczi via Mike McCandless) + Other * LUCENE-7546: Fixed references to benchmark wikipedia data and the Jenkins line-docs file diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java index 146e92a6a29..3d38d72385f 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java @@ -33,9 +33,14 @@ import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; +import org.apache.lucene.search.SortedNumericSelector; +import org.apache.lucene.search.SortedNumericSortField; +import org.apache.lucene.search.SortedSetSelector; +import org.apache.lucene.search.SortedSetSortField; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; @@ -64,6 +69,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { final static BytesRef SI_SORT = new BytesRef(" sort "); final static BytesRef SI_SORT_FIELD = new BytesRef(" field "); final static BytesRef SI_SORT_TYPE = new BytesRef(" type "); + final static BytesRef SI_SELECTOR_TYPE = new BytesRef(" selector "); final static BytesRef SI_SORT_REVERSE = new BytesRef(" reverse "); final static BytesRef SI_SORT_MISSING = new BytesRef(" missing "); @@ -158,6 +164,8 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { final String typeAsString = readString(SI_SORT_TYPE.length, scratch); final SortField.Type type; + SortedSetSelector.Type selectorSet = null; + SortedNumericSelector.Type selectorNumeric = null; switch (typeAsString) { case "string": type = SortField.Type.STRING; @@ -174,6 +182,26 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { case "float": type = SortField.Type.FLOAT; break; + case "multi_valued_string": + type = SortField.Type.STRING; + selectorSet = readSetSelector(input, scratch); + break; + case "multi_valued_long": + type = SortField.Type.LONG; + selectorNumeric = readNumericSelector(input, scratch); + break; + case "multi_valued_int": + type = SortField.Type.INT; + selectorNumeric = readNumericSelector(input, scratch); + break; + case "multi_valued_double": + type = SortField.Type.DOUBLE; + selectorNumeric = readNumericSelector(input, scratch); + break; + case "multi_valued_float": + type = SortField.Type.FLOAT; + selectorNumeric = readNumericSelector(input, scratch); + break; default: throw new CorruptIndexException("unable to parse sort type string: " + typeAsString, input); } @@ -245,7 +273,13 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { default: throw new AssertionError(); } - sortField[i] = new SortField(field, type, reverse); + if (selectorSet != null) { + sortField[i] = new SortedSetSortField(field, reverse); + } else if (selectorNumeric != null) { + sortField[i] = new SortedNumericSortField(field, type, reverse); + } else { + sortField[i] = new SortField(field, type, reverse); + } if (missingValue != null) { sortField[i].setMissingValue(missingValue); } @@ -265,6 +299,38 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { private String readString(int offset, BytesRefBuilder scratch) { return new String(scratch.bytes(), offset, scratch.length()-offset, StandardCharsets.UTF_8); } + + private SortedSetSelector.Type readSetSelector(IndexInput input, BytesRefBuilder scratch) throws IOException { + SimpleTextUtil.readLine(input, scratch); + assert StringHelper.startsWith(scratch.get(), SI_SELECTOR_TYPE); + final String selectorAsString = readString(SI_SELECTOR_TYPE.length, scratch); + switch (selectorAsString) { + case "min": + return SortedSetSelector.Type.MIN; + case "middle_min": + return SortedSetSelector.Type.MIDDLE_MIN; + case "middle_max": + return SortedSetSelector.Type.MIDDLE_MAX; + case "max": + return SortedSetSelector.Type.MAX; + default: + throw new CorruptIndexException("unable to parse SortedSetSelector type: " + selectorAsString, input); + } + } + + private SortedNumericSelector.Type readNumericSelector(IndexInput input, BytesRefBuilder scratch) throws IOException { + SimpleTextUtil.readLine(input, scratch); + assert StringHelper.startsWith(scratch.get(), SI_SELECTOR_TYPE); + final String selectorAsString = readString(SI_SELECTOR_TYPE.length, scratch); + switch (selectorAsString) { + case "min": + return SortedNumericSelector.Type.MIN; + case "max": + return SortedNumericSelector.Type.MAX; + default: + throw new CorruptIndexException("unable to parse SortedNumericSelector type: " + selectorAsString, input); + } + } @Override public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException { @@ -352,29 +418,93 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { SimpleTextUtil.writeNewline(output); SimpleTextUtil.write(output, SI_SORT_TYPE); - final String sortType; - switch (sortField.getType()) { + final String sortTypeString; + final SortField.Type sortType; + final boolean multiValued; + if (sortField instanceof SortedSetSortField) { + sortType = SortField.Type.STRING; + multiValued = true; + } else if (sortField instanceof SortedNumericSortField) { + sortType = ((SortedNumericSortField) sortField).getNumericType(); + multiValued = true; + } else { + sortType = sortField.getType(); + multiValued = false; + } + switch (sortType) { case STRING: - sortType = "string"; + if (multiValued) { + sortTypeString = "multi_valued_string"; + } else { + sortTypeString = "string"; + } break; case LONG: - sortType = "long"; + if (multiValued) { + sortTypeString = "multi_valued_long"; + } else { + sortTypeString = "long"; + } break; case INT: - sortType = "int"; + if (multiValued) { + sortTypeString = "multi_valued_int"; + } else { + sortTypeString = "int"; + } break; case DOUBLE: - sortType = "double"; + if (multiValued) { + sortTypeString = "multi_valued_double"; + } else { + sortTypeString = "double"; + } break; case FLOAT: - sortType = "float"; + if (multiValued) { + sortTypeString = "multi_valued_float"; + } else { + sortTypeString = "float"; + } break; default: throw new IllegalStateException("Unexpected sort type: " + sortField.getType()); } - SimpleTextUtil.write(output, sortType, scratch); + SimpleTextUtil.write(output, sortTypeString, scratch); SimpleTextUtil.writeNewline(output); + if (sortField instanceof SortedSetSortField) { + SortedSetSelector.Type selector = ((SortedSetSortField) sortField).getSelector(); + final String selectorString; + if (selector == SortedSetSelector.Type.MIN) { + selectorString = "min"; + } else if (selector == SortedSetSelector.Type.MIDDLE_MIN) { + selectorString = "middle_min"; + } else if (selector == SortedSetSelector.Type.MIDDLE_MAX) { + selectorString = "middle_max"; + } else if (selector == SortedSetSelector.Type.MAX) { + selectorString = "max"; + } else { + throw new IllegalStateException("Unexpected SortedSetSelector type selector: " + selector); + } + SimpleTextUtil.write(output, SI_SELECTOR_TYPE); + SimpleTextUtil.write(output, selectorString, scratch); + SimpleTextUtil.writeNewline(output); + } else if (sortField instanceof SortedNumericSortField) { + SortedNumericSelector.Type selector = ((SortedNumericSortField) sortField).getSelector(); + final String selectorString; + if (selector == SortedNumericSelector.Type.MIN) { + selectorString = "min"; + } else if (selector == SortedNumericSelector.Type.MAX) { + selectorString = "max"; + } else { + throw new IllegalStateException("Unexpected SortedNumericSelector type selector: " + selector); + } + SimpleTextUtil.write(output, SI_SELECTOR_TYPE); + SimpleTextUtil.write(output, selectorString, scratch); + SimpleTextUtil.writeNewline(output); + } + SimpleTextUtil.write(output, SI_SORT_REVERSE); SimpleTextUtil.write(output, Boolean.toString(sortField.getReverse()), scratch); SimpleTextUtil.writeNewline(output); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java index 1ee52588a55..da6e395e27a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene62/Lucene62SegmentInfoFormat.java @@ -29,6 +29,10 @@ import org.apache.lucene.index.SegmentInfo; // javadocs import org.apache.lucene.index.SegmentInfos; // javadocs import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; +import org.apache.lucene.search.SortedNumericSelector; +import org.apache.lucene.search.SortedNumericSortField; +import org.apache.lucene.search.SortedSetSelector; +import org.apache.lucene.search.SortedSetSortField; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.DataOutput; // javadocs import org.apache.lucene.store.Directory; @@ -69,7 +73,7 @@ import org.apache.lucene.util.Version; * addIndexes), etc. *
  • Files is a list of files referred to by this segment.
  • * - * + * * @see SegmentInfos * @lucene.experimental */ @@ -78,7 +82,7 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { /** Sole constructor. */ public Lucene62SegmentInfoFormat() { } - + @Override public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context) throws IOException { final String fileName = IndexFileNames.segmentFileName(segment, "", Lucene62SegmentInfoFormat.SI_EXTENSION); @@ -91,13 +95,13 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { Lucene62SegmentInfoFormat.VERSION_CURRENT, segmentID, ""); final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt()); - + final int docCount = input.readInt(); if (docCount < 0) { throw new CorruptIndexException("invalid docCount: " + docCount, input); } final boolean isCompoundFile = input.readByte() == SegmentInfo.YES; - + final Map diagnostics = input.readMapOfStrings(); final Set files = input.readSetOfStrings(); final Map attributes = input.readMapOfStrings(); @@ -110,6 +114,8 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { String fieldName = input.readString(); int sortTypeID = input.readVInt(); SortField.Type sortType; + SortedSetSelector.Type sortedSetSelector = null; + SortedNumericSelector.Type sortedNumericSelector = null; switch(sortTypeID) { case 0: sortType = SortField.Type.STRING; @@ -126,6 +132,43 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { case 4: sortType = SortField.Type.FLOAT; break; + case 5: + sortType = SortField.Type.STRING; + byte selector = input.readByte(); + if (selector == 0) { + sortedSetSelector = SortedSetSelector.Type.MIN; + } else if (selector == 1) { + sortedSetSelector = SortedSetSelector.Type.MAX; + } else if (selector == 2) { + sortedSetSelector = SortedSetSelector.Type.MIDDLE_MIN; + } else if (selector == 3) { + sortedSetSelector = SortedSetSelector.Type.MIDDLE_MAX; + } else { + throw new CorruptIndexException("invalid index SortedSetSelector ID: " + selector, input); + } + break; + case 6: + byte type = input.readByte(); + if (type == 0) { + sortType = SortField.Type.LONG; + } else if (type == 1) { + sortType = SortField.Type.INT; + } else if (type == 2) { + sortType = SortField.Type.DOUBLE; + } else if (type == 3) { + sortType = SortField.Type.FLOAT; + } else { + throw new CorruptIndexException("invalid index SortedNumericSortField type ID: " + type, input); + } + byte numericSelector = input.readByte(); + if (numericSelector == 0) { + sortedNumericSelector = SortedNumericSelector.Type.MIN; + } else if (numericSelector == 1) { + sortedNumericSelector = SortedNumericSelector.Type.MAX; + } else { + throw new CorruptIndexException("invalid index SortedNumericSelector ID: " + numericSelector, input); + } + break; default: throw new CorruptIndexException("invalid index sort field type ID: " + sortTypeID, input); } @@ -139,7 +182,13 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { throw new CorruptIndexException("invalid index sort reverse: " + b, input); } - sortFields[i] = new SortField(fieldName, sortType, reverse); + if (sortedSetSelector != null) { + sortFields[i] = new SortedSetSortField(fieldName, reverse, sortedSetSelector); + } else if (sortedNumericSelector != null) { + sortFields[i] = new SortedNumericSortField(fieldName, sortType, reverse, sortedNumericSelector); + } else { + sortFields[i] = new SortField(fieldName, sortType, reverse); + } Object missingValue; b = input.readByte(); @@ -194,7 +243,7 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { } else { indexSort = null; } - + si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, segmentID, attributes, indexSort); si.setFiles(files); } catch (Throwable exception) { @@ -213,8 +262,8 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { try (IndexOutput output = dir.createOutput(fileName, ioContext)) { // Only add the file once we've successfully created it, else IFD assert can trip: si.addFile(fileName); - CodecUtil.writeIndexHeader(output, - Lucene62SegmentInfoFormat.CODEC_NAME, + CodecUtil.writeIndexHeader(output, + Lucene62SegmentInfoFormat.CODEC_NAME, Lucene62SegmentInfoFormat.VERSION_CURRENT, si.getId(), ""); @@ -245,6 +294,7 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { output.writeVInt(numSortFields); for (int i = 0; i < numSortFields; ++i) { SortField sortField = indexSort.getSort()[i]; + SortField.Type sortType = sortField.getType(); output.writeString(sortField.getField()); int sortTypeID; switch (sortField.getType()) { @@ -263,10 +313,55 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { case FLOAT: sortTypeID = 4; break; + case CUSTOM: + if (sortField instanceof SortedSetSortField) { + sortTypeID = 5; + sortType = SortField.Type.STRING; + } else if (sortField instanceof SortedNumericSortField) { + sortTypeID = 6; + sortType = ((SortedNumericSortField) sortField).getNumericType(); + } else { + throw new IllegalStateException("Unexpected SortedNumericSortField " + sortField); + } + break; default: throw new IllegalStateException("Unexpected sort type: " + sortField.getType()); } output.writeVInt(sortTypeID); + if (sortTypeID == 5) { + SortedSetSortField ssf = (SortedSetSortField) sortField; + if (ssf.getSelector() == SortedSetSelector.Type.MIN) { + output.writeByte((byte) 0); + } else if (ssf.getSelector() == SortedSetSelector.Type.MAX) { + output.writeByte((byte) 1); + } else if (ssf.getSelector() == SortedSetSelector.Type.MIDDLE_MIN) { + output.writeByte((byte) 2); + } else if (ssf.getSelector() == SortedSetSelector.Type.MIDDLE_MAX) { + output.writeByte((byte) 3); + } else { + throw new IllegalStateException("Unexpected SortedSetSelector type: " + ssf.getSelector()); + } + } else if (sortTypeID == 6) { + SortedNumericSortField snsf = (SortedNumericSortField) sortField; + if (snsf.getNumericType() == SortField.Type.LONG) { + output.writeByte((byte) 0); + } else if (snsf.getNumericType() == SortField.Type.INT) { + output.writeByte((byte) 1); + } else if (snsf.getNumericType() == SortField.Type.DOUBLE) { + output.writeByte((byte) 2); + } else if (snsf.getNumericType() == SortField.Type.FLOAT) { + output.writeByte((byte) 3); + } else { + throw new IllegalStateException("Unexpected SortedNumericSelector type: " + snsf.getNumericType()); + } + if (snsf.getSelector() == SortedNumericSelector.Type.MIN) { + output.writeByte((byte) 0); + } else if (snsf.getSelector() == SortedNumericSelector.Type.MAX) { + output.writeByte((byte) 1); + } else { + throw new IllegalStateException("Unexpected sorted numeric selector type: " + snsf.getSelector()); + } + } output.writeByte((byte) (sortField.getReverse() ? 0 : 1)); // write missing value @@ -274,7 +369,7 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { if (missingValue == null) { output.writeByte((byte) 0); } else { - switch(sortField.getType()) { + switch(sortType) { case STRING: if (missingValue == SortField.STRING_LAST) { output.writeByte((byte) 1); @@ -305,7 +400,7 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { } } } - + CodecUtil.writeFooter(output); } } @@ -314,5 +409,6 @@ public class Lucene62SegmentInfoFormat extends SegmentInfoFormat { public final static String SI_EXTENSION = "si"; static final String CODEC_NAME = "Lucene62SegmentInfo"; static final int VERSION_START = 0; - static final int VERSION_CURRENT = VERSION_START; + static final int VERSION_MULTI_VALUED_SORT = 1; + static final int VERSION_CURRENT = VERSION_MULTI_VALUED_SORT; } diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java index 368259a5553..4f642eed52a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java @@ -468,7 +468,8 @@ public final class IndexWriterConfig extends LiveIndexWriterConfig { */ public IndexWriterConfig setIndexSort(Sort sort) { for(SortField sortField : sort.getSort()) { - if (ALLOWED_INDEX_SORT_TYPES.contains(sortField.getType()) == false) { + final SortField.Type sortType = Sorter.getSortFieldType(sortField); + if (ALLOWED_INDEX_SORT_TYPES.contains(sortType) == false) { throw new IllegalArgumentException("invalid SortField type: must be one of " + ALLOWED_INDEX_SORT_TYPES + " but got: " + sortField); } } diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index ee969c7b6b6..5ca6b65a7bb 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -141,33 +141,25 @@ final class MultiSorter { private static ComparableProvider[] getComparableProviders(List readers, SortField sortField) throws IOException { ComparableProvider[] providers = new ComparableProvider[readers.size()]; + final int reverseMul = sortField.getReverse() ? -1 : 1; + final SortField.Type sortType = Sorter.getSortFieldType(sortField); - switch(sortField.getType()) { + switch(sortType) { case STRING: { // this uses the efficient segment-local ordinal map: final SortedDocValues[] values = new SortedDocValues[readers.size()]; for(int i=0;i