mirror of https://github.com/apache/lucene.git
Use radix sort to sort postings when index sorting is enabled. (#12114)
This switches from TimSorter to LSBRadixSorter for sorting postings whose index options are `DOCS`. On a synthetic benchmark this made barely any difference when the index order is the same as the sort order or its reverse, but yielded almost a 3x speedup for writing postings when the index order is mostly random.
This commit is contained in:
parent
d407edf4b8
commit
805eb0b613
|
@ -35,9 +35,11 @@ import org.apache.lucene.util.Counter;
|
||||||
import org.apache.lucene.util.FixedBitSet;
|
import org.apache.lucene.util.FixedBitSet;
|
||||||
import org.apache.lucene.util.IntBlockPool;
|
import org.apache.lucene.util.IntBlockPool;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
import org.apache.lucene.util.LSBRadixSorter;
|
||||||
import org.apache.lucene.util.LongsRef;
|
import org.apache.lucene.util.LongsRef;
|
||||||
import org.apache.lucene.util.TimSorter;
|
import org.apache.lucene.util.TimSorter;
|
||||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||||
|
import org.apache.lucene.util.packed.PackedInts;
|
||||||
|
|
||||||
final class FreqProxTermsWriter extends TermsHash {
|
final class FreqProxTermsWriter extends TermsHash {
|
||||||
|
|
||||||
|
@ -153,13 +155,12 @@ final class FreqProxTermsWriter extends TermsHash {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TermsEnum iterator() throws IOException {
|
public TermsEnum iterator() throws IOException {
|
||||||
return new SortingTermsEnum(in.iterator(), docMap, indexOptions, hasPositions());
|
return new SortingTermsEnum(in.iterator(), docMap, indexOptions);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
|
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
|
||||||
return new SortingTermsEnum(
|
return new SortingTermsEnum(in.intersect(compiled, startTerm), docMap, indexOptions);
|
||||||
in.intersect(compiled, startTerm), docMap, indexOptions, hasPositions());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -167,20 +168,18 @@ final class FreqProxTermsWriter extends TermsHash {
|
||||||
|
|
||||||
final Sorter.DocMap docMap; // pkg-protected to avoid synthetic accessor methods
|
final Sorter.DocMap docMap; // pkg-protected to avoid synthetic accessor methods
|
||||||
private final IndexOptions indexOptions;
|
private final IndexOptions indexOptions;
|
||||||
private final boolean hasPositions;
|
|
||||||
|
|
||||||
SortingTermsEnum(
|
SortingTermsEnum(final TermsEnum in, Sorter.DocMap docMap, IndexOptions indexOptions) {
|
||||||
final TermsEnum in, Sorter.DocMap docMap, IndexOptions indexOptions, boolean hasPositions) {
|
|
||||||
super(in);
|
super(in);
|
||||||
this.docMap = docMap;
|
this.docMap = docMap;
|
||||||
this.indexOptions = indexOptions;
|
this.indexOptions = indexOptions;
|
||||||
this.hasPositions = hasPositions;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public PostingsEnum postings(PostingsEnum reuse, final int flags) throws IOException {
|
public PostingsEnum postings(PostingsEnum reuse, final int flags) throws IOException {
|
||||||
|
|
||||||
if (hasPositions && PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS)) {
|
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0
|
||||||
|
&& PostingsEnum.featureRequested(flags, PostingsEnum.FREQS)) {
|
||||||
final PostingsEnum inReuse;
|
final PostingsEnum inReuse;
|
||||||
final SortingPostingsEnum wrapReuse;
|
final SortingPostingsEnum wrapReuse;
|
||||||
if (reuse != null && reuse instanceof SortingPostingsEnum) {
|
if (reuse != null && reuse instanceof SortingPostingsEnum) {
|
||||||
|
@ -194,14 +193,16 @@ final class FreqProxTermsWriter extends TermsHash {
|
||||||
}
|
}
|
||||||
|
|
||||||
final PostingsEnum inDocsAndPositions = in.postings(inReuse, flags);
|
final PostingsEnum inDocsAndPositions = in.postings(inReuse, flags);
|
||||||
// we ignore the fact that offsets may be stored but not asked for,
|
// we ignore the fact that positions/offsets may be stored but not asked for,
|
||||||
// since this code is expected to be used during addIndexes which will
|
// since this code is expected to be used during addIndexes which will
|
||||||
// ask for everything. if that assumption changes in the future, we can
|
// ask for everything. if that assumption changes in the future, we can
|
||||||
// factor in whether 'flags' says offsets are not required.
|
// factor in whether 'flags' says offsets are not required.
|
||||||
|
final boolean storePositions =
|
||||||
|
indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||||
final boolean storeOffsets =
|
final boolean storeOffsets =
|
||||||
indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||||
return new SortingPostingsEnum(
|
return new SortingPostingsEnum(
|
||||||
docMap.size(), wrapReuse, inDocsAndPositions, docMap, storeOffsets);
|
docMap.size(), wrapReuse, inDocsAndPositions, docMap, storePositions, storeOffsets);
|
||||||
}
|
}
|
||||||
|
|
||||||
final PostingsEnum inReuse;
|
final PostingsEnum inReuse;
|
||||||
|
@ -213,161 +214,53 @@ final class FreqProxTermsWriter extends TermsHash {
|
||||||
inReuse = wrapReuse.getWrapped();
|
inReuse = wrapReuse.getWrapped();
|
||||||
} else {
|
} else {
|
||||||
wrapReuse = null;
|
wrapReuse = null;
|
||||||
inReuse = reuse;
|
inReuse = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
final PostingsEnum inDocs = in.postings(inReuse, flags);
|
final PostingsEnum inDocs = in.postings(inReuse, flags);
|
||||||
final boolean withFreqs =
|
return new SortingDocsEnum(docMap.size(), wrapReuse, inDocs, docMap);
|
||||||
indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0
|
|
||||||
&& PostingsEnum.featureRequested(flags, PostingsEnum.FREQS);
|
|
||||||
return new SortingDocsEnum(docMap.size(), wrapReuse, inDocs, withFreqs, docMap);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static class SortingDocsEnum extends FilterLeafReader.FilterPostingsEnum {
|
static class SortingDocsEnum extends PostingsEnum {
|
||||||
|
|
||||||
private static final class DocFreqSorter extends TimSorter {
|
private final PostingsEnum in;
|
||||||
|
private final LSBRadixSorter sorter;
|
||||||
private int[] docs;
|
|
||||||
private int[] freqs;
|
|
||||||
private int[] tmpDocs;
|
|
||||||
private int[] tmpFreqs;
|
|
||||||
|
|
||||||
DocFreqSorter(int maxDoc) {
|
|
||||||
super(maxDoc / 8);
|
|
||||||
this.tmpDocs = IntsRef.EMPTY_INTS;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void reset(int[] docs, int[] freqs) {
|
|
||||||
this.docs = docs;
|
|
||||||
this.freqs = freqs;
|
|
||||||
if (freqs != null && tmpFreqs == null) {
|
|
||||||
tmpFreqs = new int[tmpDocs.length];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected int compare(int i, int j) {
|
|
||||||
return docs[i] - docs[j];
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected void swap(int i, int j) {
|
|
||||||
int tmpDoc = docs[i];
|
|
||||||
docs[i] = docs[j];
|
|
||||||
docs[j] = tmpDoc;
|
|
||||||
|
|
||||||
if (freqs != null) {
|
|
||||||
int tmpFreq = freqs[i];
|
|
||||||
freqs[i] = freqs[j];
|
|
||||||
freqs[j] = tmpFreq;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected void copy(int src, int dest) {
|
|
||||||
docs[dest] = docs[src];
|
|
||||||
if (freqs != null) {
|
|
||||||
freqs[dest] = freqs[src];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected void save(int i, int len) {
|
|
||||||
if (tmpDocs.length < len) {
|
|
||||||
tmpDocs = new int[ArrayUtil.oversize(len, Integer.BYTES)];
|
|
||||||
if (freqs != null) {
|
|
||||||
tmpFreqs = new int[tmpDocs.length];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
System.arraycopy(docs, i, tmpDocs, 0, len);
|
|
||||||
if (freqs != null) {
|
|
||||||
System.arraycopy(freqs, i, tmpFreqs, 0, len);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected void restore(int i, int j) {
|
|
||||||
docs[j] = tmpDocs[i];
|
|
||||||
if (freqs != null) {
|
|
||||||
freqs[j] = tmpFreqs[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected int compareSaved(int i, int j) {
|
|
||||||
return tmpDocs[i] - docs[j];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private final int maxDoc;
|
|
||||||
private final DocFreqSorter sorter;
|
|
||||||
private int[] docs;
|
private int[] docs;
|
||||||
private int[] freqs;
|
|
||||||
private int docIt = -1;
|
private int docIt = -1;
|
||||||
private final int upto;
|
private final int upTo;
|
||||||
private final boolean withFreqs;
|
|
||||||
|
|
||||||
SortingDocsEnum(
|
SortingDocsEnum(
|
||||||
int maxDoc,
|
int maxDoc, SortingDocsEnum reuse, final PostingsEnum in, final Sorter.DocMap docMap)
|
||||||
SortingDocsEnum reuse,
|
|
||||||
final PostingsEnum in,
|
|
||||||
boolean withFreqs,
|
|
||||||
final Sorter.DocMap docMap)
|
|
||||||
throws IOException {
|
throws IOException {
|
||||||
super(in);
|
|
||||||
this.maxDoc = maxDoc;
|
|
||||||
this.withFreqs = withFreqs;
|
|
||||||
if (reuse != null) {
|
if (reuse != null) {
|
||||||
if (reuse.maxDoc == maxDoc) {
|
sorter = reuse.sorter;
|
||||||
sorter = reuse.sorter;
|
|
||||||
} else {
|
|
||||||
sorter = new DocFreqSorter(maxDoc);
|
|
||||||
}
|
|
||||||
docs = reuse.docs;
|
docs = reuse.docs;
|
||||||
freqs = reuse.freqs; // maybe null
|
|
||||||
} else {
|
} else {
|
||||||
docs = new int[64];
|
sorter = new LSBRadixSorter();
|
||||||
sorter = new DocFreqSorter(maxDoc);
|
docs = IntsRef.EMPTY_INTS;
|
||||||
}
|
}
|
||||||
docIt = -1;
|
this.in = in;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
int doc;
|
for (int doc = in.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = in.nextDoc()) {
|
||||||
if (withFreqs) {
|
if (docs.length <= i) {
|
||||||
if (freqs == null || freqs.length < docs.length) {
|
docs = ArrayUtil.grow(docs);
|
||||||
freqs = new int[docs.length];
|
|
||||||
}
|
|
||||||
while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
|
|
||||||
if (i >= docs.length) {
|
|
||||||
docs = ArrayUtil.grow(docs, docs.length + 1);
|
|
||||||
freqs = ArrayUtil.grow(freqs, freqs.length + 1);
|
|
||||||
}
|
|
||||||
docs[i] = docMap.oldToNew(doc);
|
|
||||||
freqs[i] = in.freq();
|
|
||||||
++i;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
freqs = null;
|
|
||||||
while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
|
|
||||||
if (i >= docs.length) {
|
|
||||||
docs = ArrayUtil.grow(docs, docs.length + 1);
|
|
||||||
}
|
|
||||||
docs[i++] = docMap.oldToNew(doc);
|
|
||||||
}
|
}
|
||||||
|
docs[i++] = docMap.oldToNew(doc);
|
||||||
}
|
}
|
||||||
// TimSort can save much time compared to other sorts in case of
|
upTo = i;
|
||||||
// reverse sorting, or when sorting a concatenation of sorted readers
|
if (docs.length == upTo) {
|
||||||
sorter.reset(docs, freqs);
|
docs = ArrayUtil.grow(docs);
|
||||||
sorter.sort(0, i);
|
}
|
||||||
upto = i;
|
docs[upTo] = DocIdSetIterator.NO_MORE_DOCS;
|
||||||
|
final int numBits = PackedInts.bitsRequired(Math.max(0, maxDoc - 1));
|
||||||
|
// Even though LSBRadixSorter cannot take advantage of partial ordering like TimSorter it is
|
||||||
|
// often still faster for nearly-sorted inputs.
|
||||||
|
sorter.sort(numBits, docs, upTo);
|
||||||
}
|
}
|
||||||
|
|
||||||
// for testing
|
PostingsEnum getWrapped() {
|
||||||
boolean reused(PostingsEnum other) {
|
return in;
|
||||||
if (other == null || !(other instanceof SortingDocsEnum)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return docs == ((SortingDocsEnum) other).docs;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -379,26 +272,23 @@ final class FreqProxTermsWriter extends TermsHash {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int docID() {
|
public int docID() {
|
||||||
return docIt < 0 ? -1 : docIt >= upto ? NO_MORE_DOCS : docs[docIt];
|
return docIt < 0 ? -1 : docs[docIt];
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int freq() throws IOException {
|
|
||||||
return withFreqs && docIt < upto ? freqs[docIt] : 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int nextDoc() throws IOException {
|
public int nextDoc() throws IOException {
|
||||||
if (++docIt >= upto) return NO_MORE_DOCS;
|
return docs[++docIt];
|
||||||
return docs[docIt];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns the wrapped {@link PostingsEnum}. */
|
@Override
|
||||||
PostingsEnum getWrapped() {
|
public long cost() {
|
||||||
return in;
|
return upTo;
|
||||||
}
|
}
|
||||||
|
|
||||||
// we buffer up docs/freqs only, don't forward any positions requests to underlying enum
|
@Override
|
||||||
|
public int freq() throws IOException {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int nextPosition() throws IOException {
|
public int nextPosition() throws IOException {
|
||||||
|
@ -496,7 +386,7 @@ final class FreqProxTermsWriter extends TermsHash {
|
||||||
private final int upto;
|
private final int upto;
|
||||||
|
|
||||||
private final ByteBuffersDataInput postingInput;
|
private final ByteBuffersDataInput postingInput;
|
||||||
private final boolean storeOffsets;
|
private final boolean storePositions, storeOffsets;
|
||||||
|
|
||||||
private int docIt = -1;
|
private int docIt = -1;
|
||||||
private int pos;
|
private int pos;
|
||||||
|
@ -512,10 +402,12 @@ final class FreqProxTermsWriter extends TermsHash {
|
||||||
SortingPostingsEnum reuse,
|
SortingPostingsEnum reuse,
|
||||||
final PostingsEnum in,
|
final PostingsEnum in,
|
||||||
Sorter.DocMap docMap,
|
Sorter.DocMap docMap,
|
||||||
|
boolean storePositions,
|
||||||
boolean storeOffsets)
|
boolean storeOffsets)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
super(in);
|
super(in);
|
||||||
this.maxDoc = maxDoc;
|
this.maxDoc = maxDoc;
|
||||||
|
this.storePositions = storePositions;
|
||||||
this.storeOffsets = storeOffsets;
|
this.storeOffsets = storeOffsets;
|
||||||
if (reuse != null) {
|
if (reuse != null) {
|
||||||
docs = reuse.docs;
|
docs = reuse.docs;
|
||||||
|
@ -556,37 +448,31 @@ final class FreqProxTermsWriter extends TermsHash {
|
||||||
this.postingInput = buffer.toDataInput();
|
this.postingInput = buffer.toDataInput();
|
||||||
}
|
}
|
||||||
|
|
||||||
// for testing
|
|
||||||
boolean reused(PostingsEnum other) {
|
|
||||||
if (other == null || !(other instanceof SortingPostingsEnum)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return docs == ((SortingPostingsEnum) other).docs;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void addPositions(final PostingsEnum in, final DataOutput out) throws IOException {
|
private void addPositions(final PostingsEnum in, final DataOutput out) throws IOException {
|
||||||
int freq = in.freq();
|
int freq = in.freq();
|
||||||
out.writeVInt(freq);
|
out.writeVInt(freq);
|
||||||
int previousPosition = 0;
|
if (storePositions) {
|
||||||
int previousEndOffset = 0;
|
int previousPosition = 0;
|
||||||
for (int i = 0; i < freq; i++) {
|
int previousEndOffset = 0;
|
||||||
final int pos = in.nextPosition();
|
for (int i = 0; i < freq; i++) {
|
||||||
final BytesRef payload = in.getPayload();
|
final int pos = in.nextPosition();
|
||||||
// The low-order bit of token is set only if there is a payload, the
|
final BytesRef payload = in.getPayload();
|
||||||
// previous bits are the delta-encoded position.
|
// The low-order bit of token is set only if there is a payload, the
|
||||||
final int token = (pos - previousPosition) << 1 | (payload == null ? 0 : 1);
|
// previous bits are the delta-encoded position.
|
||||||
out.writeVInt(token);
|
final int token = (pos - previousPosition) << 1 | (payload == null ? 0 : 1);
|
||||||
previousPosition = pos;
|
out.writeVInt(token);
|
||||||
if (storeOffsets) { // don't encode offsets if they are not stored
|
previousPosition = pos;
|
||||||
final int startOffset = in.startOffset();
|
if (storeOffsets) { // don't encode offsets if they are not stored
|
||||||
final int endOffset = in.endOffset();
|
final int startOffset = in.startOffset();
|
||||||
out.writeVInt(startOffset - previousEndOffset);
|
final int endOffset = in.endOffset();
|
||||||
out.writeVInt(endOffset - startOffset);
|
out.writeVInt(startOffset - previousEndOffset);
|
||||||
previousEndOffset = endOffset;
|
out.writeVInt(endOffset - startOffset);
|
||||||
}
|
previousEndOffset = endOffset;
|
||||||
if (payload != null) {
|
}
|
||||||
out.writeVInt(payload.length);
|
if (payload != null) {
|
||||||
out.writeBytes(payload.bytes, payload.offset, payload.length);
|
out.writeVInt(payload.length);
|
||||||
|
out.writeBytes(payload.bytes, payload.offset, payload.length);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -631,6 +517,9 @@ final class FreqProxTermsWriter extends TermsHash {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int nextPosition() throws IOException {
|
public int nextPosition() throws IOException {
|
||||||
|
if (storePositions == false) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
final int token = postingInput.readVInt();
|
final int token = postingInput.readVInt();
|
||||||
pos += token >>> 1;
|
pos += token >>> 1;
|
||||||
if (storeOffsets) {
|
if (storeOffsets) {
|
||||||
|
|
|
@ -62,6 +62,7 @@ import org.apache.lucene.document.StoredField;
|
||||||
import org.apache.lucene.document.StringField;
|
import org.apache.lucene.document.StringField;
|
||||||
import org.apache.lucene.document.TextField;
|
import org.apache.lucene.document.TextField;
|
||||||
import org.apache.lucene.search.CollectionStatistics;
|
import org.apache.lucene.search.CollectionStatistics;
|
||||||
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
import org.apache.lucene.search.FieldDoc;
|
import org.apache.lucene.search.FieldDoc;
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||||
|
@ -83,6 +84,7 @@ import org.apache.lucene.tests.index.RandomIndexWriter;
|
||||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||||
import org.apache.lucene.tests.util.TestUtil;
|
import org.apache.lucene.tests.util.TestUtil;
|
||||||
import org.apache.lucene.util.Bits;
|
import org.apache.lucene.util.Bits;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.FixedBitSet;
|
import org.apache.lucene.util.FixedBitSet;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.NumericUtils;
|
import org.apache.lucene.util.NumericUtils;
|
||||||
|
@ -2905,4 +2907,270 @@ public class TestIndexSorting extends LuceneTestCase {
|
||||||
w.close();
|
w.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testSortDocs() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriterConfig config = newIndexWriterConfig();
|
||||||
|
config.setIndexSort(new Sort(new SortField("sort", SortField.Type.LONG)));
|
||||||
|
IndexWriter w = new IndexWriter(dir, config);
|
||||||
|
Document doc = new Document();
|
||||||
|
NumericDocValuesField sort = new NumericDocValuesField("sort", 0L);
|
||||||
|
doc.add(sort);
|
||||||
|
StringField field = new StringField("field", "a", Field.Store.NO);
|
||||||
|
doc.add(field);
|
||||||
|
w.addDocument(doc);
|
||||||
|
sort.setLongValue(1);
|
||||||
|
field.setStringValue("b");
|
||||||
|
w.addDocument(doc);
|
||||||
|
sort.setLongValue(-1);
|
||||||
|
field.setStringValue("a");
|
||||||
|
w.addDocument(doc);
|
||||||
|
sort.setLongValue(2);
|
||||||
|
field.setStringValue("a");
|
||||||
|
w.addDocument(doc);
|
||||||
|
sort.setLongValue(3);
|
||||||
|
field.setStringValue("b");
|
||||||
|
w.addDocument(doc);
|
||||||
|
w.forceMerge(1);
|
||||||
|
DirectoryReader reader = DirectoryReader.open(w);
|
||||||
|
w.close();
|
||||||
|
LeafReader leafReader = getOnlyLeafReader(reader);
|
||||||
|
TermsEnum fieldTerms = leafReader.terms("field").iterator();
|
||||||
|
assertEquals(new BytesRef("a"), fieldTerms.next());
|
||||||
|
PostingsEnum postings = fieldTerms.postings(null, PostingsEnum.ALL);
|
||||||
|
assertEquals(0, postings.nextDoc());
|
||||||
|
assertEquals(1, postings.nextDoc());
|
||||||
|
assertEquals(3, postings.nextDoc());
|
||||||
|
assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
assertEquals(new BytesRef("b"), fieldTerms.next());
|
||||||
|
postings = fieldTerms.postings(postings, PostingsEnum.ALL);
|
||||||
|
assertEquals(2, postings.nextDoc());
|
||||||
|
assertEquals(4, postings.nextDoc());
|
||||||
|
assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
assertNull(fieldTerms.next());
|
||||||
|
reader.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSortDocsAndFreqs() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriterConfig config = newIndexWriterConfig();
|
||||||
|
config.setIndexSort(new Sort(new SortField("sort", SortField.Type.LONG)));
|
||||||
|
IndexWriter w = new IndexWriter(dir, config);
|
||||||
|
FieldType ft = new FieldType();
|
||||||
|
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
||||||
|
ft.setTokenized(false);
|
||||||
|
ft.freeze();
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new NumericDocValuesField("sort", 0L));
|
||||||
|
doc.add(new Field("field", "a", ft));
|
||||||
|
doc.add(new Field("field", "a", ft));
|
||||||
|
w.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new NumericDocValuesField("sort", 1L));
|
||||||
|
doc.add(new Field("field", "b", ft));
|
||||||
|
w.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new NumericDocValuesField("sort", -1L));
|
||||||
|
doc.add(new Field("field", "a", ft));
|
||||||
|
doc.add(new Field("field", "a", ft));
|
||||||
|
doc.add(new Field("field", "a", ft));
|
||||||
|
w.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new NumericDocValuesField("sort", 2L));
|
||||||
|
doc.add(new Field("field", "a", ft));
|
||||||
|
w.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new NumericDocValuesField("sort", 3L));
|
||||||
|
doc.add(new Field("field", "b", ft));
|
||||||
|
doc.add(new Field("field", "b", ft));
|
||||||
|
doc.add(new Field("field", "b", ft));
|
||||||
|
w.addDocument(doc);
|
||||||
|
w.forceMerge(1);
|
||||||
|
DirectoryReader reader = DirectoryReader.open(w);
|
||||||
|
w.close();
|
||||||
|
LeafReader leafReader = getOnlyLeafReader(reader);
|
||||||
|
TermsEnum fieldTerms = leafReader.terms("field").iterator();
|
||||||
|
assertEquals(new BytesRef("a"), fieldTerms.next());
|
||||||
|
PostingsEnum postings = fieldTerms.postings(null, PostingsEnum.ALL);
|
||||||
|
assertEquals(0, postings.nextDoc());
|
||||||
|
assertEquals(3, postings.freq());
|
||||||
|
assertEquals(1, postings.nextDoc());
|
||||||
|
assertEquals(2, postings.freq());
|
||||||
|
assertEquals(3, postings.nextDoc());
|
||||||
|
assertEquals(1, postings.freq());
|
||||||
|
assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
assertEquals(new BytesRef("b"), fieldTerms.next());
|
||||||
|
postings = fieldTerms.postings(postings, PostingsEnum.ALL);
|
||||||
|
assertEquals(2, postings.nextDoc());
|
||||||
|
assertEquals(1, postings.freq());
|
||||||
|
assertEquals(4, postings.nextDoc());
|
||||||
|
assertEquals(3, postings.freq());
|
||||||
|
assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
assertNull(fieldTerms.next());
|
||||||
|
reader.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSortDocsAndFreqsAndPositions() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriterConfig config = newIndexWriterConfig(new MockAnalyzer(random()));
|
||||||
|
config.setIndexSort(new Sort(new SortField("sort", SortField.Type.LONG)));
|
||||||
|
IndexWriter w = new IndexWriter(dir, config);
|
||||||
|
FieldType ft = new FieldType();
|
||||||
|
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
|
||||||
|
ft.setTokenized(true);
|
||||||
|
ft.freeze();
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new NumericDocValuesField("sort", 0L));
|
||||||
|
doc.add(new Field("field", "a a b", ft));
|
||||||
|
w.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new NumericDocValuesField("sort", 1L));
|
||||||
|
doc.add(new Field("field", "b", ft));
|
||||||
|
w.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new NumericDocValuesField("sort", -1L));
|
||||||
|
doc.add(new Field("field", "b a b b", ft));
|
||||||
|
w.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new NumericDocValuesField("sort", 2L));
|
||||||
|
doc.add(new Field("field", "a", ft));
|
||||||
|
w.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new NumericDocValuesField("sort", 3L));
|
||||||
|
doc.add(new Field("field", "b b", ft));
|
||||||
|
w.addDocument(doc);
|
||||||
|
w.forceMerge(1);
|
||||||
|
DirectoryReader reader = DirectoryReader.open(w);
|
||||||
|
w.close();
|
||||||
|
LeafReader leafReader = getOnlyLeafReader(reader);
|
||||||
|
TermsEnum fieldTerms = leafReader.terms("field").iterator();
|
||||||
|
assertEquals(new BytesRef("a"), fieldTerms.next());
|
||||||
|
PostingsEnum postings = fieldTerms.postings(null, PostingsEnum.ALL);
|
||||||
|
assertEquals(0, postings.nextDoc());
|
||||||
|
assertEquals(1, postings.freq());
|
||||||
|
assertEquals(1, postings.nextPosition());
|
||||||
|
assertEquals(1, postings.nextDoc());
|
||||||
|
assertEquals(2, postings.freq());
|
||||||
|
assertEquals(0, postings.nextPosition());
|
||||||
|
assertEquals(1, postings.nextPosition());
|
||||||
|
assertEquals(3, postings.nextDoc());
|
||||||
|
assertEquals(1, postings.freq());
|
||||||
|
assertEquals(0, postings.nextPosition());
|
||||||
|
assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
assertEquals(new BytesRef("b"), fieldTerms.next());
|
||||||
|
postings = fieldTerms.postings(postings, PostingsEnum.ALL);
|
||||||
|
assertEquals(0, postings.nextDoc());
|
||||||
|
assertEquals(3, postings.freq());
|
||||||
|
assertEquals(0, postings.nextPosition());
|
||||||
|
assertEquals(2, postings.nextPosition());
|
||||||
|
assertEquals(3, postings.nextPosition());
|
||||||
|
assertEquals(1, postings.nextDoc());
|
||||||
|
assertEquals(1, postings.freq());
|
||||||
|
assertEquals(2, postings.nextPosition());
|
||||||
|
assertEquals(2, postings.nextDoc());
|
||||||
|
assertEquals(1, postings.freq());
|
||||||
|
assertEquals(0, postings.nextPosition());
|
||||||
|
assertEquals(4, postings.nextDoc());
|
||||||
|
assertEquals(2, postings.freq());
|
||||||
|
assertEquals(0, postings.nextPosition());
|
||||||
|
assertEquals(1, postings.nextPosition());
|
||||||
|
assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
assertNull(fieldTerms.next());
|
||||||
|
reader.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSortDocsAndFreqsAndPositionsAndOffsets() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriterConfig config = newIndexWriterConfig(new MockAnalyzer(random()));
|
||||||
|
config.setIndexSort(new Sort(new SortField("sort", SortField.Type.LONG)));
|
||||||
|
IndexWriter w = new IndexWriter(dir, config);
|
||||||
|
FieldType ft = new FieldType();
|
||||||
|
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||||
|
ft.setTokenized(true);
|
||||||
|
ft.freeze();
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new NumericDocValuesField("sort", 0L));
|
||||||
|
doc.add(new Field("field", "a a b", ft));
|
||||||
|
w.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new NumericDocValuesField("sort", 1L));
|
||||||
|
doc.add(new Field("field", "b", ft));
|
||||||
|
w.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new NumericDocValuesField("sort", -1L));
|
||||||
|
doc.add(new Field("field", "b a b b", ft));
|
||||||
|
w.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new NumericDocValuesField("sort", 2L));
|
||||||
|
doc.add(new Field("field", "a", ft));
|
||||||
|
w.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new NumericDocValuesField("sort", 3L));
|
||||||
|
doc.add(new Field("field", "b b", ft));
|
||||||
|
w.addDocument(doc);
|
||||||
|
w.forceMerge(1);
|
||||||
|
DirectoryReader reader = DirectoryReader.open(w);
|
||||||
|
w.close();
|
||||||
|
LeafReader leafReader = getOnlyLeafReader(reader);
|
||||||
|
TermsEnum fieldTerms = leafReader.terms("field").iterator();
|
||||||
|
assertEquals(new BytesRef("a"), fieldTerms.next());
|
||||||
|
PostingsEnum postings = fieldTerms.postings(null, PostingsEnum.ALL);
|
||||||
|
assertEquals(0, postings.nextDoc());
|
||||||
|
assertEquals(1, postings.freq());
|
||||||
|
assertEquals(1, postings.nextPosition());
|
||||||
|
assertEquals(2, postings.startOffset());
|
||||||
|
assertEquals(3, postings.endOffset());
|
||||||
|
assertEquals(1, postings.nextDoc());
|
||||||
|
assertEquals(2, postings.freq());
|
||||||
|
assertEquals(0, postings.nextPosition());
|
||||||
|
assertEquals(0, postings.startOffset());
|
||||||
|
assertEquals(1, postings.endOffset());
|
||||||
|
assertEquals(1, postings.nextPosition());
|
||||||
|
assertEquals(2, postings.startOffset());
|
||||||
|
assertEquals(3, postings.endOffset());
|
||||||
|
assertEquals(3, postings.nextDoc());
|
||||||
|
assertEquals(1, postings.freq());
|
||||||
|
assertEquals(0, postings.nextPosition());
|
||||||
|
assertEquals(0, postings.startOffset());
|
||||||
|
assertEquals(1, postings.endOffset());
|
||||||
|
assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
assertEquals(new BytesRef("b"), fieldTerms.next());
|
||||||
|
postings = fieldTerms.postings(postings, PostingsEnum.ALL);
|
||||||
|
assertEquals(0, postings.nextDoc());
|
||||||
|
assertEquals(3, postings.freq());
|
||||||
|
assertEquals(0, postings.nextPosition());
|
||||||
|
assertEquals(0, postings.startOffset());
|
||||||
|
assertEquals(1, postings.endOffset());
|
||||||
|
assertEquals(2, postings.nextPosition());
|
||||||
|
assertEquals(4, postings.startOffset());
|
||||||
|
assertEquals(5, postings.endOffset());
|
||||||
|
assertEquals(3, postings.nextPosition());
|
||||||
|
assertEquals(6, postings.startOffset());
|
||||||
|
assertEquals(7, postings.endOffset());
|
||||||
|
assertEquals(1, postings.nextDoc());
|
||||||
|
assertEquals(1, postings.freq());
|
||||||
|
assertEquals(2, postings.nextPosition());
|
||||||
|
assertEquals(4, postings.startOffset());
|
||||||
|
assertEquals(5, postings.endOffset());
|
||||||
|
assertEquals(2, postings.nextDoc());
|
||||||
|
assertEquals(1, postings.freq());
|
||||||
|
assertEquals(0, postings.nextPosition());
|
||||||
|
assertEquals(0, postings.startOffset());
|
||||||
|
assertEquals(1, postings.endOffset());
|
||||||
|
assertEquals(4, postings.nextDoc());
|
||||||
|
assertEquals(2, postings.freq());
|
||||||
|
assertEquals(0, postings.nextPosition());
|
||||||
|
assertEquals(0, postings.startOffset());
|
||||||
|
assertEquals(1, postings.endOffset());
|
||||||
|
assertEquals(1, postings.nextPosition());
|
||||||
|
assertEquals(2, postings.startOffset());
|
||||||
|
assertEquals(3, postings.endOffset());
|
||||||
|
assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());
|
||||||
|
assertNull(fieldTerms.next());
|
||||||
|
reader.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue