LUCENE-9484: Allow sorting an index after the fact (#1789)

Today we need to decide on an index sort before we create the index.
In some situations it can make a lot of sense to sort an index after the fact,
for instance once the index no longer changes, or in order to compress older indices.
This change adds the ability to wrap readers from an unsorted index and merge them
into a sorted index via IW#addIndexes.
Simon Willnauer 2020-09-03 12:54:40 +02:00 committed by GitHub
parent 73371cb4b6
commit 430a43520a
11 changed files with 1604 additions and 1286 deletions
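
For orientation, a minimal sketch of the workflow this change enables, modeled on the test at the bottom of this diff; unsortedDir, sortedDir and the "foo" field are illustrative names, not part of the patch:

import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SlowCodecReaderWrapper;
import org.apache.lucene.index.SortingCodecReader;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;

// rewrite the contents of unsortedDir into sortedDir, sorted by the numeric DV field "foo"
Sort indexSort = new Sort(new SortField("foo", SortField.Type.INT));
try (IndexWriter writer = new IndexWriter(sortedDir, new IndexWriterConfig().setIndexSort(indexSort));
     DirectoryReader reader = DirectoryReader.open(unsortedDir)) {
  List<CodecReader> sortedViews = new ArrayList<>();
  for (LeafReaderContext ctx : reader.leaves()) {
    // wrap each unsorted leaf so that addIndexes consumes its documents in sorted order
    sortedViews.add(SortingCodecReader.wrap(SlowCodecReaderWrapper.wrap(ctx.reader()), indexSort));
  }
  writer.addIndexes(sortedViews.toArray(new CodecReader[0]));
  writer.commit();
}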

lucene/CHANGES.txt

@@ -182,6 +182,10 @@ New Features
small segments on getReader, subject to a configurable timeout, to improve
search performance by reducing the number of small segments for searching. (Simon Willnauer)
* LUCENE-9484: Allow sorting an index after it was created. With SortingCodecReader, existing
unsorted segments can be wrapped and merged into a fresh index using the IndexWriter#addIndexes
API. (Simon Willnauer, Adrien Grand)
Improvements
---------------------

lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java

@@ -24,6 +24,7 @@ import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.Counter;
@@ -99,7 +100,7 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
bytesUsed = newBytesUsed;
}
private SortingLeafReader.CachedBinaryDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException {
static CachedBinaryDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException {
FixedBitSet docsWithField = new FixedBitSet(maxDoc);
BytesRef[] values = new BytesRef[maxDoc];
while (true) {
@@ -111,7 +112,7 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
docsWithField.set(newDocID);
values[newDocID] = BytesRef.deepCopyOf(oldValues.binaryValue());
}
return new SortingLeafReader.CachedBinaryDVs(values, docsWithField);
return new CachedBinaryDVs(values, docsWithField);
}
@Override
@@ -128,7 +129,7 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
if (finalLengths == null) {
finalLengths = this.lengths.build();
}
final SortingLeafReader.CachedBinaryDVs sorted;
final CachedBinaryDVs sorted;
if (sortMap != null) {
sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap,
new BufferedBinaryDocValues(finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator()));
@@ -145,7 +146,7 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
if (sorted == null) {
return new BufferedBinaryDocValues(finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator());
} else {
return new SortingLeafReader.SortingBinaryDocValues(sorted);
return new SortingBinaryDocValues(sorted);
}
}
});
@@ -202,4 +203,63 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
return value.get();
}
}
static class SortingBinaryDocValues extends BinaryDocValues {
private final CachedBinaryDVs dvs;
private int docID = -1;
private long cost = -1;
SortingBinaryDocValues(CachedBinaryDVs dvs) {
this.dvs = dvs;
}
@Override
public int nextDoc() {
if (docID+1 == dvs.docsWithField.length()) {
docID = NO_MORE_DOCS;
} else {
docID = dvs.docsWithField.nextSetBit(docID+1);
}
return docID;
}
@Override
public int docID() {
return docID;
}
@Override
public int advance(int target) {
throw new UnsupportedOperationException("use nextDoc instead");
}
@Override
public boolean advanceExact(int target) throws IOException {
throw new UnsupportedOperationException("use nextDoc instead");
}
@Override
public BytesRef binaryValue() {
return dvs.values[docID];
}
@Override
public long cost() {
if (cost == -1) {
cost = dvs.docsWithField.cardinality();
}
return cost;
}
}
static class CachedBinaryDVs {
// TODO: at least cutover to BytesRefArray here:
private final BytesRef[] values;
private final BitSet docsWithField;
CachedBinaryDVs(BytesRef[] values, BitSet docsWithField) {
this.values = values;
this.docsWithField = docsWithField;
}
}
}

lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java

@@ -26,9 +26,16 @@ import java.util.Map;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.ByteBuffersDataInput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.TimSorter;
import org.apache.lucene.util.automaton.CompiledAutomaton;
final class FreqProxTermsWriter extends TermsHash {
@@ -88,7 +95,20 @@ final class FreqProxTermsWriter extends TermsHash {
Fields fields = new FreqProxFields(allFields);
applyDeletes(state, fields);
if (sortMap != null) {
fields = new SortingLeafReader.SortingFields(fields, state.fieldInfos, sortMap);
final Sorter.DocMap docMap = sortMap;
final FieldInfos infos = state.fieldInfos;
fields = new FilterLeafReader.FilterFields(fields) {
@Override
public Terms terms(final String field) throws IOException {
Terms terms = in.terms(field);
if (terms == null) {
return null;
} else {
return new SortingTerms(terms, infos.fieldInfo(field).getIndexOptions(), docMap);
}
}
};
}
FieldsConsumer consumer = state.segmentInfo.getCodec().postingsFormat().fieldsConsumer(state);
@@ -110,4 +130,503 @@ final class FreqProxTermsWriter extends TermsHash {
public TermsHashPerField addField(FieldInvertState invertState, FieldInfo fieldInfo) {
return new FreqProxTermsWriterPerField(invertState, this, fieldInfo, nextTermsHash.addField(invertState, fieldInfo));
}
static class SortingTerms extends FilterLeafReader.FilterTerms {
private final Sorter.DocMap docMap;
private final IndexOptions indexOptions;
SortingTerms(final Terms in, IndexOptions indexOptions, final Sorter.DocMap docMap) {
super(in);
this.docMap = docMap;
this.indexOptions = indexOptions;
}
@Override
public TermsEnum iterator() throws IOException {
return new SortingTermsEnum(in.iterator(), docMap, indexOptions, hasPositions());
}
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm)
throws IOException {
return new SortingTermsEnum(in.intersect(compiled, startTerm), docMap, indexOptions, hasPositions());
}
}
private static class SortingTermsEnum extends FilterLeafReader.FilterTermsEnum {
final Sorter.DocMap docMap; // pkg-protected to avoid synthetic accessor methods
private final IndexOptions indexOptions;
private final boolean hasPositions;
SortingTermsEnum(final TermsEnum in, Sorter.DocMap docMap, IndexOptions indexOptions, boolean hasPositions) {
super(in);
this.docMap = docMap;
this.indexOptions = indexOptions;
this.hasPositions = hasPositions;
}
@Override
public PostingsEnum postings(PostingsEnum reuse, final int flags) throws IOException {
if (hasPositions && PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS)) {
final PostingsEnum inReuse;
final SortingPostingsEnum wrapReuse;
if (reuse != null && reuse instanceof SortingPostingsEnum) {
// if we're asked to reuse the given DocsEnum and it is Sorting, return
// the wrapped one, since some Codecs expect it.
wrapReuse = (SortingPostingsEnum) reuse;
inReuse = wrapReuse.getWrapped();
} else {
wrapReuse = null;
inReuse = reuse;
}
final PostingsEnum inDocsAndPositions = in.postings(inReuse, flags);
// we ignore the fact that offsets may be stored but not asked for,
// since this code is expected to be used during addIndexes which will
// ask for everything. if that assumption changes in the future, we can
// factor in whether 'flags' says offsets are not required.
final boolean storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
return new SortingPostingsEnum(docMap.size(), wrapReuse, inDocsAndPositions, docMap, storeOffsets);
}
final PostingsEnum inReuse;
final SortingDocsEnum wrapReuse;
if (reuse != null && reuse instanceof SortingDocsEnum) {
// if we're asked to reuse the given DocsEnum and it is Sorting, return
// the wrapped one, since some Codecs expect it.
wrapReuse = (SortingDocsEnum) reuse;
inReuse = wrapReuse.getWrapped();
} else {
wrapReuse = null;
inReuse = reuse;
}
final PostingsEnum inDocs = in.postings(inReuse, flags);
final boolean withFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >=0 && PostingsEnum.featureRequested(flags, PostingsEnum.FREQS);
return new SortingDocsEnum(docMap.size(), wrapReuse, inDocs, withFreqs, docMap);
}
}
static class SortingDocsEnum extends FilterLeafReader.FilterPostingsEnum {
private static final class DocFreqSorter extends TimSorter {
private int[] docs;
private int[] freqs;
private final int[] tmpDocs;
private int[] tmpFreqs;
DocFreqSorter(int maxDoc) {
super(maxDoc / 64);
this.tmpDocs = new int[maxDoc / 64];
}
public void reset(int[] docs, int[] freqs) {
this.docs = docs;
this.freqs = freqs;
if (freqs != null && tmpFreqs == null) {
tmpFreqs = new int[tmpDocs.length];
}
}
@Override
protected int compare(int i, int j) {
return docs[i] - docs[j];
}
@Override
protected void swap(int i, int j) {
int tmpDoc = docs[i];
docs[i] = docs[j];
docs[j] = tmpDoc;
if (freqs != null) {
int tmpFreq = freqs[i];
freqs[i] = freqs[j];
freqs[j] = tmpFreq;
}
}
@Override
protected void copy(int src, int dest) {
docs[dest] = docs[src];
if (freqs != null) {
freqs[dest] = freqs[src];
}
}
@Override
protected void save(int i, int len) {
System.arraycopy(docs, i, tmpDocs, 0, len);
if (freqs != null) {
System.arraycopy(freqs, i, tmpFreqs, 0, len);
}
}
@Override
protected void restore(int i, int j) {
docs[j] = tmpDocs[i];
if (freqs != null) {
freqs[j] = tmpFreqs[i];
}
}
@Override
protected int compareSaved(int i, int j) {
return tmpDocs[i] - docs[j];
}
}
private final int maxDoc;
private final DocFreqSorter sorter;
private int[] docs;
private int[] freqs;
private int docIt = -1;
private final int upto;
private final boolean withFreqs;
SortingDocsEnum(int maxDoc, SortingDocsEnum reuse, final PostingsEnum in, boolean withFreqs, final Sorter.DocMap docMap) throws IOException {
super(in);
this.maxDoc = maxDoc;
this.withFreqs = withFreqs;
if (reuse != null) {
if (reuse.maxDoc == maxDoc) {
sorter = reuse.sorter;
} else {
sorter = new DocFreqSorter(maxDoc);
}
docs = reuse.docs;
freqs = reuse.freqs; // maybe null
} else {
docs = new int[64];
sorter = new DocFreqSorter(maxDoc);
}
docIt = -1;
int i = 0;
int doc;
if (withFreqs) {
if (freqs == null || freqs.length < docs.length) {
freqs = new int[docs.length];
}
while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS){
if (i >= docs.length) {
docs = ArrayUtil.grow(docs, docs.length + 1);
freqs = ArrayUtil.grow(freqs, freqs.length + 1);
}
docs[i] = docMap.oldToNew(doc);
freqs[i] = in.freq();
++i;
}
} else {
freqs = null;
while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS){
if (i >= docs.length) {
docs = ArrayUtil.grow(docs, docs.length + 1);
}
docs[i++] = docMap.oldToNew(doc);
}
}
// TimSort can save much time compared to other sorts in case of
// reverse sorting, or when sorting a concatenation of sorted readers
sorter.reset(docs, freqs);
sorter.sort(0, i);
upto = i;
}
// for testing
boolean reused(PostingsEnum other) {
if (other == null || !(other instanceof SortingDocsEnum)) {
return false;
}
return docs == ((SortingDocsEnum) other).docs;
}
@Override
public int advance(final int target) throws IOException {
// need to support it for checkIndex, but in practice it won't be called, so
// don't bother to implement efficiently for now.
return slowAdvance(target);
}
@Override
public int docID() {
return docIt < 0 ? -1 : docIt >= upto ? NO_MORE_DOCS : docs[docIt];
}
@Override
public int freq() throws IOException {
return withFreqs && docIt < upto ? freqs[docIt] : 1;
}
@Override
public int nextDoc() throws IOException {
if (++docIt >= upto) return NO_MORE_DOCS;
return docs[docIt];
}
/** Returns the wrapped {@link PostingsEnum}. */
PostingsEnum getWrapped() {
return in;
}
// we buffer up docs/freqs only, don't forward any positions requests to underlying enum
@Override
public int nextPosition() throws IOException {
return -1;
}
@Override
public int startOffset() throws IOException {
return -1;
}
@Override
public int endOffset() throws IOException {
return -1;
}
@Override
public BytesRef getPayload() throws IOException {
return null;
}
}
static class SortingPostingsEnum extends FilterLeafReader.FilterPostingsEnum {
/**
* A {@link TimSorter} which sorts two parallel arrays of doc IDs and
* offsets in one go. Every time a doc ID is 'swapped', its corresponding offset
* is swapped too.
*/
private static final class DocOffsetSorter extends TimSorter {
private int[] docs;
private long[] offsets;
private final int[] tmpDocs;
private final long[] tmpOffsets;
public DocOffsetSorter(int maxDoc) {
super(maxDoc / 64);
this.tmpDocs = new int[maxDoc / 64];
this.tmpOffsets = new long[maxDoc / 64];
}
public void reset(int[] docs, long[] offsets) {
this.docs = docs;
this.offsets = offsets;
}
@Override
protected int compare(int i, int j) {
return docs[i] - docs[j];
}
@Override
protected void swap(int i, int j) {
int tmpDoc = docs[i];
docs[i] = docs[j];
docs[j] = tmpDoc;
long tmpOffset = offsets[i];
offsets[i] = offsets[j];
offsets[j] = tmpOffset;
}
@Override
protected void copy(int src, int dest) {
docs[dest] = docs[src];
offsets[dest] = offsets[src];
}
@Override
protected void save(int i, int len) {
System.arraycopy(docs, i, tmpDocs, 0, len);
System.arraycopy(offsets, i, tmpOffsets, 0, len);
}
@Override
protected void restore(int i, int j) {
docs[j] = tmpDocs[i];
offsets[j] = tmpOffsets[i];
}
@Override
protected int compareSaved(int i, int j) {
return tmpDocs[i] - docs[j];
}
}
private final int maxDoc;
private final DocOffsetSorter sorter;
private int[] docs;
private long[] offsets;
private final int upto;
private final ByteBuffersDataInput postingInput;
private final boolean storeOffsets;
private int docIt = -1;
private int pos;
private int startOffset = -1;
private int endOffset = -1;
private final BytesRef payload;
private int currFreq;
private final ByteBuffersDataOutput buffer;
SortingPostingsEnum(int maxDoc, SortingPostingsEnum reuse, final PostingsEnum in, Sorter.DocMap docMap, boolean storeOffsets) throws IOException {
super(in);
this.maxDoc = maxDoc;
this.storeOffsets = storeOffsets;
if (reuse != null) {
docs = reuse.docs;
offsets = reuse.offsets;
payload = reuse.payload;
buffer = reuse.buffer;
buffer.reset();
if (reuse.maxDoc == maxDoc) {
sorter = reuse.sorter;
} else {
sorter = new DocOffsetSorter(maxDoc);
}
} else {
docs = new int[32];
offsets = new long[32];
payload = new BytesRef(32);
buffer = ByteBuffersDataOutput.newResettableInstance();
sorter = new DocOffsetSorter(maxDoc);
}
int doc;
int i = 0;
while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (i == docs.length) {
final int newLength = ArrayUtil.oversize(i + 1, 4);
docs = ArrayUtil.growExact(docs, newLength);
offsets = ArrayUtil.growExact(offsets, newLength);
}
docs[i] = docMap.oldToNew(doc);
offsets[i] = buffer.size();
addPositions(in, buffer);
i++;
}
upto = i;
sorter.reset(docs, offsets);
sorter.sort(0, upto);
this.postingInput = buffer.toDataInput();
}
// for testing
boolean reused(PostingsEnum other) {
if (other == null || !(other instanceof SortingPostingsEnum)) {
return false;
}
return docs == ((SortingPostingsEnum) other).docs;
}
private void addPositions(final PostingsEnum in, final DataOutput out) throws IOException {
int freq = in.freq();
out.writeVInt(freq);
int previousPosition = 0;
int previousEndOffset = 0;
for (int i = 0; i < freq; i++) {
final int pos = in.nextPosition();
final BytesRef payload = in.getPayload();
// The low-order bit of token is set only if there is a payload, the
// previous bits are the delta-encoded position.
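// e.g. previousPosition=4, pos=7 with a payload: token = (3 << 1) | 1 = 7; nextPosition
// below recovers the delta as token >>> 1 == 3 and the payload flag as (token & 1) != 0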
final int token = (pos - previousPosition) << 1 | (payload == null ? 0 : 1);
out.writeVInt(token);
previousPosition = pos;
if (storeOffsets) { // don't encode offsets if they are not stored
final int startOffset = in.startOffset();
final int endOffset = in.endOffset();
out.writeVInt(startOffset - previousEndOffset);
out.writeVInt(endOffset - startOffset);
previousEndOffset = endOffset;
}
if (payload != null) {
out.writeVInt(payload.length);
out.writeBytes(payload.bytes, payload.offset, payload.length);
}
}
}
@Override
public int advance(final int target) throws IOException {
// need to support it for checkIndex, but in practice it won't be called, so
// don't bother to implement efficiently for now.
return slowAdvance(target);
}
@Override
public int docID() {
return docIt < 0 ? -1 : docIt >= upto ? NO_MORE_DOCS : docs[docIt];
}
@Override
public int endOffset() throws IOException {
return endOffset;
}
@Override
public int freq() throws IOException {
return currFreq;
}
@Override
public BytesRef getPayload() throws IOException {
return payload.length == 0 ? null : payload;
}
@Override
public int nextDoc() throws IOException {
if (++docIt >= upto) return DocIdSetIterator.NO_MORE_DOCS;
postingInput.seek(offsets[docIt]);
currFreq = postingInput.readVInt();
// reset variables used in nextPosition
pos = 0;
endOffset = 0;
return docs[docIt];
}
@Override
public int nextPosition() throws IOException {
final int token = postingInput.readVInt();
pos += token >>> 1;
if (storeOffsets) {
startOffset = endOffset + postingInput.readVInt();
endOffset = startOffset + postingInput.readVInt();
}
if ((token & 1) != 0) {
payload.offset = 0;
payload.length = postingInput.readVInt();
if (payload.length > payload.bytes.length) {
payload.bytes = new byte[ArrayUtil.oversize(payload.length, 1)];
}
postingInput.readBytes(payload.bytes, 0, payload.length);
} else {
payload.length = 0;
}
return pos;
}
@Override
public int startOffset() throws IOException {
return startOffset;
}
/** Returns the wrapped {@link PostingsEnum}. */
PostingsEnum getWrapped() {
return in;
}
}
}
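
DocFreqSorter and DocOffsetSorter above follow the same TimSorter contract: implement compare/swap/copy for in-place moves plus save/restore/compareSaved against a bounded temp buffer, then call sort(from, to). A stripped-down sketch of that contract for a single int[] (hypothetical helper, not part of the patch):

import org.apache.lucene.util.TimSorter;

final class IntTimSorter extends TimSorter {
  private final int[] arr;
  private final int[] tmp; // scratch space for runs saved during merges

  IntTimSorter(int[] arr, int maxTempSlots) {
    super(maxTempSlots); // upper bound on how many slots save() may be asked to hold
    this.arr = arr;
    this.tmp = new int[maxTempSlots];
  }

  @Override
  protected int compare(int i, int j) {
    return Integer.compare(arr[i], arr[j]);
  }

  @Override
  protected void swap(int i, int j) {
    int t = arr[i]; arr[i] = arr[j]; arr[j] = t;
  }

  @Override
  protected void copy(int src, int dest) {
    arr[dest] = arr[src];
  }

  @Override
  protected void save(int i, int len) {
    System.arraycopy(arr, i, tmp, 0, len); // stash a run before it gets overwritten
  }

  @Override
  protected void restore(int i, int j) {
    arr[j] = tmp[i];
  }

  @Override
  protected int compareSaved(int i, int j) {
    return Integer.compare(tmp[i], arr[j]);
  }
}

// usage, mirroring DocFreqSorter: new IntTimSorter(docs, maxDoc / 64).sort(0, upto);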

lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java

@@ -70,7 +70,7 @@ class NormValuesWriter {
public void flush(SegmentWriteState state, Sorter.DocMap sortMap, NormsConsumer normsConsumer) throws IOException {
final PackedLongValues values = pending.build();
final SortingLeafReader.CachedNumericDVs sorted;
final NumericDocValuesWriter.CachedNumericDVs sorted;
if (sortMap != null) {
sorted = NumericDocValuesWriter.sortDocValues(state.segmentInfo.maxDoc(), sortMap,
new BufferedNorms(values, docsWithField.iterator()));
@@ -87,7 +87,7 @@ class NormValuesWriter {
if (sorted == null) {
return new BufferedNorms(values, docsWithField.iterator());
} else {
return new SortingLeafReader.SortingNumericDocValues(sorted);
return new NumericDocValuesWriter.SortingNumericDocValues(sorted);
}
}

lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java

@@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.packed.PackedInts;
@@ -40,7 +41,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
private final FieldInfo fieldInfo;
private int lastDocID = -1;
public NumericDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
NumericDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
docsWithField = new DocsWithFieldSet();
bytesUsed = pending.ramBytesUsed() + docsWithField.ramBytesUsed();
@@ -76,7 +77,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
return new BufferedNumericDocValues(finalValues, docsWithField.iterator());
}
static SortingLeafReader.CachedNumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException {
static CachedNumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException {
FixedBitSet docsWithField = new FixedBitSet(maxDoc);
long[] values = new long[maxDoc];
while (true) {
@@ -88,7 +89,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
docsWithField.set(newDocID);
values[newDocID] = oldDocValues.longValue();
}
return new SortingLeafReader.CachedNumericDVs(values, docsWithField);
return new CachedNumericDVs(values, docsWithField);
}
@Override
@@ -96,7 +97,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
if (finalValues == null) {
finalValues = pending.build();
}
final SortingLeafReader.CachedNumericDVs sorted;
final CachedNumericDVs sorted;
if (sortMap != null) {
NumericDocValues oldValues = new BufferedNumericDocValues(finalValues, docsWithField.iterator());
sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap, oldValues);
@@ -114,7 +115,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
if (sorted == null) {
return new BufferedNumericDocValues(finalValues, docsWithField.iterator());
} else {
return new SortingLeafReader.SortingNumericDocValues(sorted);
return new SortingNumericDocValues(sorted);
}
}
});
@@ -165,4 +166,65 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
return value;
}
}
static class SortingNumericDocValues extends NumericDocValues {
private final CachedNumericDVs dvs;
private int docID = -1;
private long cost = -1;
SortingNumericDocValues(CachedNumericDVs dvs) {
this.dvs = dvs;
}
@Override
public int docID() {
return docID;
}
@Override
public int nextDoc() {
if (docID+1 == dvs.docsWithField.length()) {
docID = NO_MORE_DOCS;
} else {
docID = dvs.docsWithField.nextSetBit(docID+1);
}
return docID;
}
@Override
public int advance(int target) {
throw new UnsupportedOperationException("use nextDoc() instead");
}
@Override
public boolean advanceExact(int target) throws IOException {
// needed in IndexSorter#{Long|Int|Double|Float}Sorter
docID = target;
return dvs.docsWithField.get(target);
}
@Override
public long longValue() {
return dvs.values[docID];
}
@Override
public long cost() {
if (cost == -1) {
cost = dvs.docsWithField.cardinality();
}
return cost;
}
}
static class CachedNumericDVs {
private final long[] values;
private final BitSet docsWithField;
CachedNumericDVs(long[] values, BitSet docsWithField) {
this.values = values;
this.docsWithField = docsWithField;
}
}
}

lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java

@@ -160,7 +160,7 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
if (sorted == null) {
return buf;
}
return new SortingLeafReader.SortingSortedDocValues(buf, sorted);
return new SortingSortedDocValues(buf, sorted);
}
});
}
@@ -233,4 +233,72 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
}
}
static class SortingSortedDocValues extends SortedDocValues {
private final SortedDocValues in;
private final int[] ords;
private int docID = -1;
SortingSortedDocValues(SortedDocValues in, int[] ords) {
this.in = in;
this.ords = ords;
assert ords != null;
}
@Override
public int docID() {
return docID;
}
@Override
public int nextDoc() {
while (true) {
docID++;
if (docID == ords.length) {
docID = NO_MORE_DOCS;
break;
}
if (ords[docID] != -1) {
break;
}
// skip missing docs
}
return docID;
}
@Override
public int advance(int target) {
throw new UnsupportedOperationException("use nextDoc instead");
}
@Override
public boolean advanceExact(int target) throws IOException {
// needed in IndexSorter#StringSorter
docID = target;
return ords[target] != -1;
}
@Override
public int ordValue() {
return ords[docID];
}
@Override
public long cost() {
return in.cost();
}
@Override
public BytesRef lookupOrd(int ord) throws IOException {
return in.lookupOrd(ord);
}
@Override
public int getValueCount() {
return in.getValueCount();
}
}
}

lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java

@@ -108,7 +108,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
return new BufferedSortedNumericDocValues(finalValues, finalValuesCount, docsWithField.iterator());
}
private long[][] sortDocValues(int maxDoc, Sorter.DocMap sortMap, SortedNumericDocValues oldValues) throws IOException {
static long[][] sortDocValues(int maxDoc, Sorter.DocMap sortMap, SortedNumericDocValues oldValues) throws IOException {
long[][] values = new long[maxDoc][];
int docID;
while ((docID = oldValues.nextDoc()) != NO_MORE_DOCS) {
@@ -155,7 +155,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
if (sorted == null) {
return buf;
} else {
return new SortingLeafReader.SortingSortedNumericDocValues(buf, sorted);
return new SortingSortedNumericDocValues(buf, sorted);
}
}
});
@@ -168,7 +168,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
private int valueCount;
private int valueUpto;
public BufferedSortedNumericDocValues(PackedLongValues values, PackedLongValues valueCounts, DocIdSetIterator docsWithField) {
BufferedSortedNumericDocValues(PackedLongValues values, PackedLongValues valueCounts, DocIdSetIterator docsWithField) {
valuesIter = values.iterator();
valueCountsIter = valueCounts.iterator();
this.docsWithField = docsWithField;
@@ -223,4 +223,68 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
}
}
static class SortingSortedNumericDocValues extends SortedNumericDocValues {
private final SortedNumericDocValues in;
private final long[][] values;
private int docID = -1;
private int upto;
SortingSortedNumericDocValues(SortedNumericDocValues in, long[][] values) {
this.in = in;
this.values = values;
}
@Override
public int docID() {
return docID;
}
@Override
public int nextDoc() {
while (true) {
docID++;
if (docID == values.length) {
docID = NO_MORE_DOCS;
break;
}
if (values[docID] != null) {
break;
}
// skip missing docs
}
upto = 0;
return docID;
}
@Override
public int advance(int target) {
throw new UnsupportedOperationException("use nextDoc instead");
}
@Override
public boolean advanceExact(int target) throws IOException {
docID = target;
upto = 0;
return values[docID] != null;
}
@Override
public long nextValue() {
if (upto == values[docID].length) {
throw new AssertionError();
} else {
return values[docID][upto++];
}
}
@Override
public long cost() {
return in.cost();
}
@Override
public int docValueCount() {
return values[docID].length;
}
}
}

lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java

@@ -139,7 +139,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
bytesUsed = newBytesUsed;
}
private long[][] sortDocValues(int maxDoc, Sorter.DocMap sortMap, SortedSetDocValues oldValues) throws IOException {
static long[][] sortDocValues(int maxDoc, Sorter.DocMap sortMap, SortedSetDocValues oldValues) throws IOException {
long[][] ords = new long[maxDoc][];
int docID;
while ((docID = oldValues.nextDoc()) != NO_MORE_DOCS) {
@@ -222,7 +222,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
if (sorted == null) {
return buf;
} else {
return new SortingLeafReader.SortingSortedSetDocValues(buf, sorted);
return new SortingSortedSetDocValues(buf, sorted);
}
}
});
@@ -241,7 +241,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
private int ordCount;
private int ordUpto;
public BufferedSortedSetDocValues(int[] sortedValues, int[] ordMap, BytesRefHash hash, PackedLongValues ords, PackedLongValues ordCounts, int maxCount, DocIdSetIterator docsWithField) {
BufferedSortedSetDocValues(int[] sortedValues, int[] ordMap, BytesRefHash hash, PackedLongValues ords, PackedLongValues ordCounts, int maxCount, DocIdSetIterator docsWithField) {
this.currentDoc = new int[maxCount];
this.sortedValues = sortedValues;
this.ordMap = ordMap;
@@ -308,4 +308,75 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
}
}
static class SortingSortedSetDocValues extends SortedSetDocValues {
private final SortedSetDocValues in;
private final long[][] ords;
private int docID = -1;
private int ordUpto;
SortingSortedSetDocValues(SortedSetDocValues in, long[][] ords) {
this.in = in;
this.ords = ords;
}
@Override
public int docID() {
return docID;
}
@Override
public int nextDoc() {
while (true) {
docID++;
if (docID == ords.length) {
docID = NO_MORE_DOCS;
break;
}
if (ords[docID] != null) {
break;
}
// skip missing docs
}
ordUpto = 0;
return docID;
}
@Override
public int advance(int target) {
throw new UnsupportedOperationException("use nextDoc instead");
}
@Override
public boolean advanceExact(int target) throws IOException {
// needed in IndexSorter#StringSorter
docID = target;
ordUpto = 0;
return ords[docID] != null;
}
@Override
public long nextOrd() {
if (ordUpto == ords[docID].length) {
return NO_MORE_ORDS;
} else {
return ords[docID][ordUpto++];
}
}
@Override
public long cost() {
return in.cost();
}
@Override
public BytesRef lookupOrd(long ord) throws IOException {
return in.lookupOrd(ord);
}
@Override
public long getValueCount() {
return in.getValueCount();
}
}
}

lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java

@@ -0,0 +1,513 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.search.Sort;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
/**
* An {@link org.apache.lucene.index.CodecReader} which supports sorting documents by a given
* {@link Sort}. This can be used to re-sort an index after it's been created by wrapping all
* readers of the index with this reader and adding them to a fresh IndexWriter via
* {@link IndexWriter#addIndexes(CodecReader...)}.
*
* @lucene.experimental
*/
public final class SortingCodecReader extends FilterCodecReader {
private final Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNumericDVs = new HashMap<>();
private final Map<String, BinaryDocValuesWriter.CachedBinaryDVs> cachedBinaryDVs = new HashMap<>();
private final Map<String, int[]> cachedSortedDVs = new HashMap<>();
// TODO: pack long[][] into an int[] (offset) and long[] instead:
private final Map<String, long[][]> cachedSortedSetDVs = new HashMap<>();
private final Map<String, long[][]> cachedSortedNumericDVs = new HashMap<>();
private static class SortingBits implements Bits {
private final Bits in;
private final Sorter.DocMap docMap;
SortingBits(final Bits in, Sorter.DocMap docMap) {
this.in = in;
this.docMap = docMap;
}
@Override
public boolean get(int index) {
return in.get(docMap.newToOld(index));
}
@Override
public int length() {
return in.length();
}
}
private static class SortingPointValues extends PointValues {
private final PointValues in;
private final Sorter.DocMap docMap;
SortingPointValues(final PointValues in, Sorter.DocMap docMap) {
this.in = in;
this.docMap = docMap;
}
@Override
public void intersect(IntersectVisitor visitor) throws IOException {
in.intersect(new IntersectVisitor() {
@Override
public void visit(int docID) throws IOException {
visitor.visit(docMap.oldToNew(docID));
}
@Override
public void visit(int docID, byte[] packedValue) throws IOException {
visitor.visit(docMap.oldToNew(docID), packedValue);
}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
return visitor.compare(minPackedValue, maxPackedValue);
}
});
}
@Override
public long estimatePointCount(IntersectVisitor visitor) {
return in.estimatePointCount(visitor);
}
@Override
public byte[] getMinPackedValue() throws IOException {
return in.getMinPackedValue();
}
@Override
public byte[] getMaxPackedValue() throws IOException {
return in.getMaxPackedValue();
}
@Override
public int getNumDimensions() throws IOException {
return in.getNumDimensions();
}
@Override
public int getNumIndexDimensions() throws IOException {
return in.getNumIndexDimensions();
}
@Override
public int getBytesPerDimension() throws IOException {
return in.getBytesPerDimension();
}
@Override
public long size() {
return in.size();
}
@Override
public int getDocCount() {
return in.getDocCount();
}
}
/** Return a sorted view of <code>reader</code> according to the order
* defined by <code>sort</code>. If the reader is already sorted, this
* method might return the reader as-is. */
public static CodecReader wrap(CodecReader reader, Sort sort) throws IOException {
return wrap(reader, new Sorter(sort).sort(reader), sort);
}
/** Expert: same as {@link #wrap(org.apache.lucene.index.CodecReader, Sort)} but operates directly on a {@link Sorter.DocMap}. */
static CodecReader wrap(CodecReader reader, Sorter.DocMap docMap, Sort sort) {
LeafMetaData metaData = reader.getMetaData();
LeafMetaData newMetaData = new LeafMetaData(metaData.getCreatedVersionMajor(), metaData.getMinVersion(), sort);
if (docMap == null) {
// the reader is already sorted
return new FilterCodecReader(reader) {
@Override
public CacheHelper getCoreCacheHelper() {
return null;
}
@Override
public CacheHelper getReaderCacheHelper() {
return null;
}
@Override
public LeafMetaData getMetaData() {
return newMetaData;
}
@Override
public String toString() {
return "SortingCodecReader(" + in + ")";
}
};
}
if (reader.maxDoc() != docMap.size()) {
throw new IllegalArgumentException("reader.maxDoc() should be equal to docMap.size(), got " + reader.maxDoc() + " != " + docMap.size());
}
assert Sorter.isConsistent(docMap);
return new SortingCodecReader(reader, docMap, newMetaData);
}
final Sorter.DocMap docMap; // pkg-protected to avoid synthetic accessor methods
final LeafMetaData metaData;
private SortingCodecReader(final CodecReader in, final Sorter.DocMap docMap, LeafMetaData metaData) {
super(in);
this.docMap = docMap;
this.metaData = metaData;
}
@Override
public FieldsProducer getPostingsReader() {
FieldsProducer postingsReader = in.getPostingsReader();
return new FieldsProducer() {
@Override
public void close() throws IOException {
postingsReader.close();
}
@Override
public void checkIntegrity() throws IOException {
postingsReader.checkIntegrity();
}
@Override
public Iterator<String> iterator() {
return postingsReader.iterator();
}
@Override
public Terms terms(String field) throws IOException {
Terms terms = postingsReader.terms(field);
return terms == null ? null : new FreqProxTermsWriter.SortingTerms(terms,
in.getFieldInfos().fieldInfo(field).getIndexOptions(), docMap);
}
@Override
public int size() {
return postingsReader.size();
}
@Override
public long ramBytesUsed() {
return postingsReader.ramBytesUsed();
}
};
}
@Override
public StoredFieldsReader getFieldsReader() {
StoredFieldsReader delegate = in.getFieldsReader();
return newStoredFieldsReader(delegate);
}
private StoredFieldsReader newStoredFieldsReader(StoredFieldsReader delegate) {
return new StoredFieldsReader() {
@Override
public void visitDocument(int docID, StoredFieldVisitor visitor) throws IOException {
delegate.visitDocument(docMap.newToOld(docID), visitor);
}
@Override
public StoredFieldsReader clone() {
return newStoredFieldsReader(delegate.clone());
}
@Override
public void checkIntegrity() throws IOException {
delegate.checkIntegrity();
}
@Override
public void close() throws IOException {
delegate.close();
}
@Override
public long ramBytesUsed() {
return delegate.ramBytesUsed();
}
};
}
@Override
public Bits getLiveDocs() {
final Bits inLiveDocs = in.getLiveDocs();
if (inLiveDocs == null) {
return null;
} else {
return new SortingBits(inLiveDocs, docMap);
}
}
@Override
public PointsReader getPointsReader() {
final PointsReader delegate = in.getPointsReader();
return new PointsReader() {
@Override
public void checkIntegrity() throws IOException {
delegate.checkIntegrity();
}
@Override
public PointValues getValues(String field) throws IOException {
return new SortingPointValues(delegate.getValues(field), docMap);
}
@Override
public void close() throws IOException {
delegate.close();
}
@Override
public long ramBytesUsed() {
return delegate.ramBytesUsed();
}
};
}
private final Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNorms = new HashMap<>();
@Override
public NormsProducer getNormsReader() {
final NormsProducer delegate = in.getNormsReader();
return new NormsProducer() {
@Override
public NumericDocValues getNorms(FieldInfo field) throws IOException {
return produceNumericDocValues(field, delegate.getNorms(field), cachedNorms);
}
@Override
public void checkIntegrity() throws IOException {
delegate.checkIntegrity();
}
@Override
public void close() throws IOException {
delegate.close();
}
@Override
public long ramBytesUsed() {
return delegate.ramBytesUsed();
}
};
}
private NumericDocValues produceNumericDocValues(FieldInfo field, NumericDocValues oldNorms,
Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNorms) throws IOException {
NumericDocValuesWriter.CachedNumericDVs norms;
synchronized (cachedNorms) {
norms = cachedNorms.get(field.name);
if (norms == null) {
FixedBitSet docsWithField = new FixedBitSet(maxDoc());
long[] values = new long[maxDoc()];
while (true) {
int docID = oldNorms.nextDoc();
if (docID == NO_MORE_DOCS) {
break;
}
int newDocID = docMap.oldToNew(docID);
docsWithField.set(newDocID);
values[newDocID] = oldNorms.longValue();
}
norms = new NumericDocValuesWriter.CachedNumericDVs(values, docsWithField);
cachedNorms.put(field.name, norms);
}
}
return new NumericDocValuesWriter.SortingNumericDocValues(norms);
}
@Override
public DocValuesProducer getDocValuesReader() {
final DocValuesProducer delegate = in.getDocValuesReader();
return new DocValuesProducer() {
@Override
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
return produceNumericDocValues(field, delegate.getNumeric(field), cachedNumericDVs);
}
@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
final BinaryDocValues oldDocValues = delegate.getBinary(field);
BinaryDocValuesWriter.CachedBinaryDVs dvs;
synchronized (cachedBinaryDVs) {
dvs = cachedBinaryDVs.get(field.name);
if (dvs == null) {
dvs = BinaryDocValuesWriter.sortDocValues(maxDoc(), docMap, oldDocValues);
cachedBinaryDVs.put(field.name, dvs);
}
}
return new BinaryDocValuesWriter.SortingBinaryDocValues(dvs);
}
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
SortedDocValues oldDocValues = delegate.getSorted(field);
int[] ords;
synchronized (cachedSortedDVs) {
ords = cachedSortedDVs.get(field.name);
if (ords == null) {
ords = new int[maxDoc()];
Arrays.fill(ords, -1);
int docID;
while ((docID = oldDocValues.nextDoc()) != NO_MORE_DOCS) {
int newDocID = docMap.oldToNew(docID);
ords[newDocID] = oldDocValues.ordValue();
}
cachedSortedDVs.put(field.name, ords);
}
}
return new SortedDocValuesWriter.SortingSortedDocValues(oldDocValues, ords);
}
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
final SortedNumericDocValues oldDocValues = delegate.getSortedNumeric(field);
long[][] values;
synchronized (cachedSortedNumericDVs) {
values = cachedSortedNumericDVs.get(field.name);
if (values == null) {
values = SortedNumericDocValuesWriter.sortDocValues(maxDoc(), docMap, oldDocValues);
cachedSortedNumericDVs.put(field.name, values);
}
}
return new SortedNumericDocValuesWriter.SortingSortedNumericDocValues(oldDocValues, values);
}
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
SortedSetDocValues oldDocValues = delegate.getSortedSet(field);
long[][] ords;
synchronized (cachedSortedSetDVs) {
ords = cachedSortedSetDVs.get(field.name);
if (ords == null) {
ords = SortedSetDocValuesWriter.sortDocValues(maxDoc(), docMap, oldDocValues);
cachedSortedSetDVs.put(field.name, ords);
}
}
return new SortedSetDocValuesWriter.SortingSortedSetDocValues(oldDocValues, ords);
}
@Override
public void checkIntegrity() throws IOException {
delegate.checkIntegrity();
}
@Override
public void close() throws IOException {
delegate.close();
}
@Override
public long ramBytesUsed() {
return delegate.ramBytesUsed();
}
};
}
@Override
public TermVectorsReader getTermVectorsReader() {
return newTermVectorsReader(in.getTermVectorsReader());
}
private TermVectorsReader newTermVectorsReader(TermVectorsReader delegate) {
return new TermVectorsReader() {
@Override
public Fields get(int doc) throws IOException {
return delegate.get(docMap.newToOld(doc));
}
@Override
public void checkIntegrity() throws IOException {
delegate.checkIntegrity();
}
@Override
public TermVectorsReader clone() {
return newTermVectorsReader(delegate.clone());
}
@Override
public void close() throws IOException {
delegate.close();
}
@Override
public long ramBytesUsed() {
return delegate.ramBytesUsed();
}
};
}
@Override
public String toString() {
return "SortingCodecReader(" + in + ")";
}
// no caching on sorted views
@Override
public CacheHelper getCoreCacheHelper() {
return null;
}
@Override
public CacheHelper getReaderCacheHelper() {
return null;
}
@Override
public LeafMetaData getMetaData() {
return metaData;
}
}
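
Segments produced this way carry the index sort in their LeafMetaData, which the test below asserts via getMetaData().getSort(). A quick standalone check on the resulting index (sketch; sortedDir and indexSort as in the snippet near the top of this page):

try (DirectoryReader r = DirectoryReader.open(sortedDir)) {
  for (LeafReaderContext ctx : r.leaves()) {
    // every leaf written under setIndexSort(indexSort) reports that sort
    assert indexSort.equals(ctx.reader().getMetaData().getSort());
  }
}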

lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java

@@ -0,0 +1,224 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.search.SortedSetSortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
public class TestSortingCodecReader extends LuceneTestCase {
public void testSortOnAddIndicesInt() throws IOException {
Directory tmpDir = newDirectory();
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
IndexWriter w = new IndexWriter(tmpDir, iwc);
Document doc = new Document();
doc.add(new NumericDocValuesField("foo", 18));
w.addDocument(doc);
doc = new Document();
doc.add(new NumericDocValuesField("foo", -1));
w.addDocument(doc);
w.commit();
doc = new Document();
doc.add(new NumericDocValuesField("foo", 7));
w.addDocument(doc);
w.commit();
w.close();
Sort indexSort = new Sort(new SortField("foo", SortField.Type.INT));
iwc = new IndexWriterConfig(new MockAnalyzer(random())).setIndexSort(indexSort);
w = new IndexWriter(dir, iwc);
try (DirectoryReader reader = DirectoryReader.open(tmpDir)) {
List<CodecReader> readers = new ArrayList<>();
for (LeafReaderContext ctx : reader.leaves()) {
CodecReader wrap = SortingCodecReader.wrap(SlowCodecReaderWrapper.wrap(ctx.reader()), indexSort);
assertTrue(wrap.toString(), wrap.toString().startsWith("SortingCodecReader("));
readers.add(wrap);
}
w.addIndexes(readers.toArray(new CodecReader[0]));
}
DirectoryReader r = DirectoryReader.open(w);
LeafReader leaf = getOnlyLeafReader(r);
assertEquals(3, leaf.maxDoc());
NumericDocValues values = leaf.getNumericDocValues("foo");
assertEquals(0, values.nextDoc());
assertEquals(-1, values.longValue());
assertEquals(1, values.nextDoc());
assertEquals(7, values.longValue());
assertEquals(2, values.nextDoc());
assertEquals(18, values.longValue());
assertNotNull(leaf.getMetaData().getSort());
IOUtils.close(r, w, dir, tmpDir);
}
public void testSortOnAddIndicesRandom() throws IOException {
try (Directory dir = newDirectory()) {
int numDocs = atLeast(200);
int actualNumDocs;
List<Integer> docIds = new ArrayList<>(numDocs);
for (int i = 0; i < numDocs; i++) {
docIds.add(i);
}
Collections.shuffle(docIds, random());
try (RandomIndexWriter iw = new RandomIndexWriter(random(), dir)) {
for (int i = 0; i < numDocs; i++) {
int docId = docIds.get(i);
Document doc = new Document();
doc.add(new NumericDocValuesField("foo", random().nextInt(20)));
doc.add(new StringField("id", Integer.toString(docId), Field.Store.YES));
doc.add(new LongPoint("id", docId));
doc.add(new TextField("text_field", RandomStrings.randomRealisticUnicodeOfLength(random(), 25), Field.Store.YES));
doc.add(new SortedNumericDocValuesField("sorted_numeric_dv", docId));
doc.add(new SortedDocValuesField("binary_sorted_dv", new BytesRef(Integer.toString(docId))));
doc.add(new BinaryDocValuesField("binary_dv", new BytesRef(Integer.toString(docId))));
doc.add(new SortedSetDocValuesField("sorted_set_dv", new BytesRef(Integer.toString(docId))));
FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
ft.setStoreTermVectors(true);
doc.add(new Field("term_vectors", "test" + docId, ft));
if (rarely() == false) {
doc.add(new NumericDocValuesField("id", docId));
doc.add(new SortedSetDocValuesField("sorted_set_sort_field", new BytesRef(String.format(Locale.ROOT, "%06d", docId))));
doc.add(new SortedDocValuesField("sorted_binary_sort_field", new BytesRef(String.format(Locale.ROOT, "%06d", docId))));
doc.add(new SortedNumericDocValuesField("sorted_numeric_sort_field", docId));
} else {
doc.add(new NumericDocValuesField("alt_id", docId));
}
iw.addDocument(doc);
if (i > 0 && random().nextInt(5) == 0) {
final int id = RandomPicks.randomFrom(random(), docIds.subList(0, i));
iw.deleteDocuments(new Term("id", Integer.toString(id)));
}
}
iw.commit();
actualNumDocs = iw.getDocStats().numDocs;
}
Sort indexSort = RandomPicks.randomFrom(random(), Arrays.asList(
new Sort(new SortField("id", SortField.Type.INT),
new SortField("alt_id", SortField.Type.INT)),
new Sort(new SortedSetSortField("sorted_set_sort_field", false),
new SortField("alt_id", SortField.Type.INT)),
new Sort(new SortedNumericSortField("sorted_numeric_sort_field", SortField.Type.INT),
new SortField("alt_id", SortField.Type.INT)),
new Sort(new SortField("sorted_binary_sort_field", SortField.Type.STRING, false),
new SortField("alt_id", SortField.Type.INT))
));
try (Directory sortDir = newDirectory()) {
try (IndexWriter writer = new IndexWriter(sortDir, newIndexWriterConfig().setIndexSort(indexSort))) {
try (DirectoryReader reader = DirectoryReader.open(dir)) {
List<CodecReader> readers = new ArrayList<>();
for (LeafReaderContext ctx : reader.leaves()) {
CodecReader wrap = SortingCodecReader.wrap(SlowCodecReaderWrapper.wrap(ctx.reader()), indexSort);
readers.add(wrap);
TermVectorsReader termVectorsReader = wrap.getTermVectorsReader();
TermVectorsReader clone = termVectorsReader.clone();
assertNotSame(termVectorsReader, clone);
clone.close();
}
writer.addIndexes(readers.toArray(new CodecReader[0]));
}
assumeTrue("must have at least one doc", actualNumDocs > 0);
try (DirectoryReader r = DirectoryReader.open(writer)) {
LeafReader leaf = getOnlyLeafReader(r);
assertEquals(actualNumDocs, leaf.maxDoc());
BinaryDocValues binary_dv = leaf.getBinaryDocValues("binary_dv");
SortedNumericDocValues sorted_numeric_dv = leaf.getSortedNumericDocValues("sorted_numeric_dv");
SortedSetDocValues sorted_set_dv = leaf.getSortedSetDocValues("sorted_set_dv");
SortedDocValues binary_sorted_dv = leaf.getSortedDocValues("binary_sorted_dv");
NumericDocValues ids = leaf.getNumericDocValues("id");
long prevValue = -1;
boolean usingAltIds = false;
for (int i = 0; i < actualNumDocs; i++) {
int idNext = ids.nextDoc();
if (idNext == DocIdSetIterator.NO_MORE_DOCS) {
assertFalse(usingAltIds);
usingAltIds = true;
ids = leaf.getNumericDocValues("alt_id");
idNext = ids.nextDoc();
binary_dv = leaf.getBinaryDocValues("binary_dv");
sorted_numeric_dv = leaf.getSortedNumericDocValues("sorted_numeric_dv");
sorted_set_dv = leaf.getSortedSetDocValues("sorted_set_dv");
binary_sorted_dv = leaf.getSortedDocValues("binary_sorted_dv");
prevValue = -1;
}
assertTrue(prevValue + " < " + ids.longValue(), prevValue < ids.longValue());
prevValue = ids.longValue();
assertTrue(binary_dv.advanceExact(idNext));
assertTrue(sorted_numeric_dv.advanceExact(idNext));
assertTrue(sorted_set_dv.advanceExact(idNext));
assertTrue(binary_sorted_dv.advanceExact(idNext));
assertEquals(new BytesRef(ids.longValue() + ""), binary_dv.binaryValue());
assertEquals(new BytesRef(ids.longValue() + ""), binary_sorted_dv.binaryValue());
assertEquals(new BytesRef(ids.longValue() + ""), sorted_set_dv.lookupOrd(sorted_set_dv.nextOrd()));
assertEquals(1, sorted_numeric_dv.docValueCount());
assertEquals(ids.longValue(), sorted_numeric_dv.nextValue());
Fields termVectors = leaf.getTermVectors(idNext);
assertTrue(termVectors.terms("term_vectors").iterator().seekExact(new BytesRef("test" + ids.longValue())));
assertEquals(Long.toString(ids.longValue()), leaf.document(idNext).get("id"));
IndexSearcher searcher = new IndexSearcher(r);
TopDocs result = searcher.search(LongPoint.newExactQuery("id", ids.longValue()), 1);
assertEquals(1, result.totalHits.value);
assertEquals(idNext, result.scoreDocs[0].doc);
result = searcher.search(new TermQuery(new Term("id", "" + ids.longValue())), 1);
assertEquals(1, result.totalHits.value);
assertEquals(idNext, result.scoreDocs[0].doc);
}
assertEquals(DocIdSetIterator.NO_MORE_DOCS, ids.nextDoc());
}
}
}
}
}
}