LUCENE-4830: Sorter API: Make the doc ID mapping an abstract class.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1456787 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2013-03-15 07:10:26 +00:00
parent 67083534ff
commit 44ca68f294
5 changed files with 219 additions and 129 deletions

View File

@ -18,8 +18,6 @@ package org.apache.lucene.index.sorter;
*/
import java.io.IOException;
import java.util.AbstractList;
import java.util.List;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.NumericDocValues;
@ -39,27 +37,19 @@ public class NumericDocValuesSorter extends Sorter {
}
@Override
public int[] oldToNew(final AtomicReader reader) throws IOException {
public Sorter.DocMap sort(final AtomicReader reader) throws IOException {
final NumericDocValues ndv = reader.getNumericDocValues(fieldName);
final int maxDoc = reader.maxDoc();
final int[] docs = new int[maxDoc];
final List<Long> values = new AbstractList<Long>() {
final DocComparator comparator = new DocComparator() {
@Override
public Long get(int doc) {
return ndv.get(doc);
}
@Override
public int size() {
return reader.maxDoc();
public int compare(int docID1, int docID2) {
final long v1 = ndv.get(docID1);
final long v2 = ndv.get(docID2);
return v1 < v2 ? -1 : v1 == v2 ? 0 : 1;
}
};
for (int i = 0; i < maxDoc; i++) {
docs[i] = i;
}
return compute(docs, values);
return sort(reader.maxDoc(), comparator);
}
}

View File

@ -18,58 +18,102 @@ package org.apache.lucene.index.sorter;
*/
import java.io.IOException;
import java.util.List;
import java.util.Comparator;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.SorterTemplate;
import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;
/**
* Sorts documents in a given index by returning a permutation on the docs.
* Implementations can call {@link #compute(int[], List)} to compute the
* Implementations can call {@link #sort(int, DocComparator)} to compute the
* old-to-new permutation over the given documents and values.
*
* @lucene.experimental
*/
public abstract class Sorter {
/**
* A permutation of doc IDs. For every document ID between <tt>0</tt> and
* {@link IndexReader#maxDoc()}, <code>oldToNew(newToOld(docID))</code> must
* return <code>docID</code>.
*/
public static abstract class DocMap {
/** Given a doc ID from the original index, return its ordinal in the
* sorted index. */
public abstract int oldToNew(int docID);
/** Given the ordinal of a doc ID, return its doc ID in the original index. */
public abstract int newToOld(int docID);
}
/** Check consistency of a {@link DocMap}, useful for assertions. */
static boolean isConsistent(DocMap docMap, int maxDoc) {
for (int i = 0; i < maxDoc; ++i) {
final int newID = docMap.oldToNew(i);
final int oldID = docMap.newToOld(newID);
assert newID >= 0 && newID < maxDoc : "doc IDs must be in [0-" + maxDoc + "[, got " + newID;
assert i == oldID : "mapping is inconsistent: " + i + " --oldToNew--> " + newID + " --newToOld--> " + oldID;
if (i != oldID || newID < 0 || newID >= maxDoc) {
return false;
}
}
return true;
}
/** A comparator of doc IDs. */
public static abstract class DocComparator {
/** Compare docID1 against docID2. The contract for the return value is the
* same as {@link Comparator#compare(Object, Object)}. */
public abstract int compare(int docID1, int docID2);
}
/** Sorts documents in reverse order. */
public static final Sorter REVERSE_DOCS = new Sorter() {
@Override
public int[] oldToNew(final AtomicReader reader) throws IOException {
public DocMap sort(final AtomicReader reader) throws IOException {
final int maxDoc = reader.maxDoc();
int[] reverseDocs = new int[maxDoc];
for (int i = 0; i < maxDoc; i++) {
reverseDocs[i] = maxDoc - (i + 1);
}
return reverseDocs;
return new DocMap() {
public int oldToNew(int docID) {
return maxDoc - docID - 1;
}
public int newToOld(int docID) {
return maxDoc - docID - 1;
}
};
}
};
private static final class DocValueSorterTemplate<T extends Comparable<? super T>> extends SorterTemplate {
private static final class DocValueSorterTemplate extends SorterTemplate {
private final int[] docs;
private final List<T> values;
private final Sorter.DocComparator comparator;
private T pivot;
private int pivot;
public DocValueSorterTemplate(int[] docs, List<T> values) {
public DocValueSorterTemplate(int[] docs, Sorter.DocComparator comparator) {
this.docs = docs;
this.values = values;
this.comparator = comparator;
}
@Override
protected int compare(int i, int j) {
return values.get(docs[i]).compareTo(values.get(docs[j]));
return comparator.compare(docs[i], docs[j]);
}
@Override
protected int comparePivot(int j) {
return pivot.compareTo(values.get(docs[j]));
return comparator.compare(pivot, docs[j]);
}
@Override
protected void setPivot(int i) {
pivot = values.get(docs[i]);
pivot = docs[i];
}
@Override
@ -80,27 +124,73 @@ public abstract class Sorter {
}
}
/** Computes the old-to-new permutation over the given documents and values. */
protected static <T extends Comparable<? super T>> int[] compute(int[] docs, List<T> values) {
SorterTemplate sorter = new DocValueSorterTemplate<T>(docs, values);
sorter.quickSort(0, docs.length - 1);
final int[] oldToNew = new int[docs.length];
for (int i = 0; i < docs.length; i++) {
oldToNew[docs[i]] = i;
/** Computes the old-to-new permutation over the given comparator. */
protected static Sorter.DocMap sort(final int maxDoc, DocComparator comparator) {
// check if the index is sorted
boolean sorted = true;
for (int i = 1; i < maxDoc; ++i) {
if (comparator.compare(i-1, i) > 0) {
sorted = false;
break;
}
}
return oldToNew;
if (sorted) {
return null;
}
// sort doc IDs
final int[] docs = new int[maxDoc];
for (int i = 0; i < maxDoc; i++) {
docs[i] = i;
}
SorterTemplate sorter = new DocValueSorterTemplate(docs, comparator);
// TODO: use a stable sort instead?
sorter.quickSort(0, docs.length - 1); // docs is now the newToOld mapping
// The reason why we use MonotonicAppendingLongBuffer here is that it
// wastes very little memory if the index is in random order but can save
// a lot of memory if the index is already "almost" sorted
final MonotonicAppendingLongBuffer newToOld = new MonotonicAppendingLongBuffer();
for (int i = 0; i < maxDoc; ++i) {
newToOld.add(docs[i]);
}
for (int i = 0; i < maxDoc; ++i) {
docs[(int) newToOld.get(i)] = i;
} // docs is now the oldToNew mapping
final MonotonicAppendingLongBuffer oldToNew = new MonotonicAppendingLongBuffer();
for (int i = 0; i < maxDoc; ++i) {
oldToNew.add(docs[i]);
}
return new Sorter.DocMap() {
@Override
public int oldToNew(int docID) {
return (int) oldToNew.get(docID);
}
@Override
public int newToOld(int docID) {
return (int) newToOld.get(docID);
}
};
}
/**
* Returns a mapping from the old document ID to its new location in the
* sorted index. Implementations can use the auxiliary
* {@link #compute(int[], List)} to compute the old-to-new permutation
* given an array of documents and their corresponding values.
* {@link #sort(int, DocComparator)} to compute the old-to-new permutation
* given a list of documents and their corresponding values.
* <p>
* A return value of <tt>null</tt> is allowed and means that
* <code>reader</code> is already sorted.
* <p>
* <b>NOTE:</b> deleted documents are expected to appear in the mapping as
* well, they will however be dropped when the index is actually sorted.
*/
public abstract int[] oldToNew(AtomicReader reader) throws IOException;
public abstract DocMap sort(AtomicReader reader) throws IOException;
}

View File

@ -43,7 +43,6 @@ import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.SorterTemplate;
/**
@ -66,13 +65,13 @@ public class SortingAtomicReader extends FilterAtomicReader {
private static class SortingFields extends FilterFields {
private final int[] old2new;
private final Sorter.DocMap docMap;
private final Bits inLiveDocs;
private final FieldInfos infos;
public SortingFields(final Fields in, final Bits inLiveDocs, FieldInfos infos, final int[] old2new) {
public SortingFields(final Fields in, final Bits inLiveDocs, FieldInfos infos, Sorter.DocMap docMap) {
super(in);
this.old2new = old2new;
this.docMap = docMap;
this.inLiveDocs = inLiveDocs;
this.infos = infos;
}
@ -83,7 +82,7 @@ public class SortingAtomicReader extends FilterAtomicReader {
if (terms == null) {
return null;
} else {
return new SortingTerms(terms, inLiveDocs, infos.fieldInfo(field).getIndexOptions(), old2new);
return new SortingTerms(terms, inLiveDocs, infos.fieldInfo(field).getIndexOptions(), docMap);
}
}
@ -91,33 +90,33 @@ public class SortingAtomicReader extends FilterAtomicReader {
private static class SortingTerms extends FilterTerms {
private final int[] old2new;
private final Sorter.DocMap docMap;
private final Bits inLiveDocs;
private final IndexOptions indexOptions;
public SortingTerms(final Terms in, final Bits inLiveDocs, IndexOptions indexOptions, final int[] old2new) {
public SortingTerms(final Terms in, final Bits inLiveDocs, IndexOptions indexOptions, final Sorter.DocMap docMap) {
super(in);
this.old2new = old2new;
this.docMap = docMap;
this.inLiveDocs = inLiveDocs;
this.indexOptions = indexOptions;
}
@Override
public TermsEnum iterator(final TermsEnum reuse) throws IOException {
return new SortingTermsEnum(in.iterator(reuse), inLiveDocs, old2new, indexOptions);
return new SortingTermsEnum(in.iterator(reuse), inLiveDocs, docMap, indexOptions);
}
}
private static class SortingTermsEnum extends FilterTermsEnum {
private final int[] old2new;
private final Sorter.DocMap docMap;
private final Bits inLiveDocs;
private final IndexOptions indexOptions;
public SortingTermsEnum(final TermsEnum in, final Bits inLiveDocs, final int[] old2new, IndexOptions indexOptions) {
public SortingTermsEnum(final TermsEnum in, final Bits inLiveDocs, Sorter.DocMap docMap, IndexOptions indexOptions) {
super(in);
this.old2new = old2new;
this.docMap = docMap;
this.inLiveDocs = inLiveDocs;
this.indexOptions = indexOptions;
}
@ -134,7 +133,7 @@ public class SortingAtomicReader extends FilterAtomicReader {
reuse = ((SortingDocsEnum) reuse).getWrapped();
}
boolean withFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >=0 && (flags & DocsEnum.FLAG_FREQS) != 0;
return new SortingDocsEnum(in.docs(liveDocs, reuse, flags), withFreqs, old2new);
return new SortingDocsEnum(in.docs(liveDocs, reuse, flags), withFreqs, docMap);
}
@Override
@ -158,7 +157,7 @@ public class SortingAtomicReader extends FilterAtomicReader {
// ask for everything. if that assumption changes in the future, we can
// factor in whether 'flags' says offsets are not required.
boolean storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
return new SortingDocsAndPositionsEnum(positions, old2new, storeOffsets);
return new SortingDocsAndPositionsEnum(positions, docMap, storeOffsets);
}
}
@ -167,48 +166,48 @@ public class SortingAtomicReader extends FilterAtomicReader {
private static class SortingBinaryDocValues extends BinaryDocValues {
private final BinaryDocValues in;
private final int[] new2old;
private final Sorter.DocMap docMap;
SortingBinaryDocValues(BinaryDocValues in, int[] new2old) {
SortingBinaryDocValues(BinaryDocValues in, Sorter.DocMap docMap) {
this.in = in;
this.new2old = new2old;
this.docMap = docMap;
}
@Override
public void get(int docID, BytesRef result) {
in.get(new2old[docID], result);
in.get(docMap.newToOld(docID), result);
}
}
private static class SortingNumericDocValues extends NumericDocValues {
private final NumericDocValues in;
private final int[] new2old;
private final Sorter.DocMap docMap;
public SortingNumericDocValues(final NumericDocValues in, final int[] new2old) {
public SortingNumericDocValues(final NumericDocValues in, Sorter.DocMap docMap) {
this.in = in;
this.new2old = new2old;
this.docMap = docMap;
}
@Override
public long get(int docID) {
return in.get(new2old[docID]);
return in.get(docMap.newToOld(docID));
}
}
private static class SortingSortedDocValues extends SortedDocValues {
private final SortedDocValues in;
private final int[] new2old;
private final Sorter.DocMap docMap;
SortingSortedDocValues(SortedDocValues in, int[] new2old) {
SortingSortedDocValues(SortedDocValues in, Sorter.DocMap docMap) {
this.in = in;
this.new2old = new2old;
this.docMap = docMap;
}
@Override
public int getOrd(int docID) {
return in.getOrd(new2old[docID]);
return in.getOrd(docMap.newToOld(docID));
}
@Override
@ -223,7 +222,7 @@ public class SortingAtomicReader extends FilterAtomicReader {
@Override
public void get(int docID, BytesRef result) {
in.get(new2old[docID], result);
in.get(docMap.newToOld(docID), result);
}
@Override
@ -235,11 +234,11 @@ public class SortingAtomicReader extends FilterAtomicReader {
private static class SortingSortedSetDocValues extends SortedSetDocValues {
private final SortedSetDocValues in;
private final int[] new2old;
private final Sorter.DocMap docMap;
SortingSortedSetDocValues(SortedSetDocValues in, int[] new2old) {
SortingSortedSetDocValues(SortedSetDocValues in, Sorter.DocMap docMap) {
this.in = in;
this.new2old = new2old;
this.docMap = docMap;
}
@Override
@ -249,7 +248,7 @@ public class SortingAtomicReader extends FilterAtomicReader {
@Override
public void setDocument(int docID) {
in.setDocument(new2old[docID]);
in.setDocument(docMap.newToOld(docID));
}
@Override
@ -315,7 +314,7 @@ public class SortingAtomicReader extends FilterAtomicReader {
private final int upto;
private final boolean withFreqs;
public SortingDocsEnum(final DocsEnum in, boolean withFreqs, final int[] old2new) throws IOException {
public SortingDocsEnum(final DocsEnum in, boolean withFreqs, final Sorter.DocMap docMap) throws IOException {
super(in);
this.withFreqs = withFreqs;
int i = 0;
@ -327,7 +326,7 @@ public class SortingAtomicReader extends FilterAtomicReader {
docs = ArrayUtil.grow(docs, docs.length + 1);
freqs = ArrayUtil.grow(freqs, freqs.length + 1);
}
docs[i] = old2new[doc];
docs[i] = docMap.oldToNew(doc);
freqs[i] = in.freq();
++i;
}
@ -339,7 +338,7 @@ public class SortingAtomicReader extends FilterAtomicReader {
if (i >= docs.length) {
docs = ArrayUtil.grow(docs, docs.length + 1);
}
docs[i++] = old2new[doc];
docs[i++] = docMap.oldToNew(doc);
}
Arrays.sort(docs, 0, i);
}
@ -436,7 +435,7 @@ public class SortingAtomicReader extends FilterAtomicReader {
private final BytesRef payload = new BytesRef(32);
private int currFreq;
public SortingDocsAndPositionsEnum(final DocsAndPositionsEnum in, final int[] old2new, boolean storeOffsets) throws IOException {
public SortingDocsAndPositionsEnum(final DocsAndPositionsEnum in, Sorter.DocMap docMap, boolean storeOffsets) throws IOException {
super(in);
this.storeOffsets = storeOffsets;
final RAMFile file = new RAMFile();
@ -454,7 +453,7 @@ public class SortingAtomicReader extends FilterAtomicReader {
System.arraycopy(offsets, 0, tmp, 0, offsets.length);
offsets = tmp;
}
docs[i] = old2new[doc];
docs[i] = docMap.oldToNew(doc);
offsets[i] = out.getFilePointer();
addPositions(in, out);
i++;
@ -551,38 +550,29 @@ public class SortingAtomicReader extends FilterAtomicReader {
}
}
private final int[] old2new, new2old;
private final FixedBitSet mappedLiveDocs;
/** Return a sorted view of <code>reader</code> according to the order
* defined by <code>sorter</code>. If the reader is already sorted, this
* method might return the reader as-is. */
public static AtomicReader sort(AtomicReader reader, Sorter sorter) throws IOException {
final Sorter.DocMap docMap = sorter.sort(reader);
if (docMap == null) {
// the reader is already sorter
return reader;
}
assert Sorter.isConsistent(docMap, reader.maxDoc());
return new SortingAtomicReader(reader, docMap);
}
public SortingAtomicReader(final AtomicReader in, final Sorter sorter) throws IOException {
private final Sorter.DocMap docMap;
private SortingAtomicReader(final AtomicReader in, final Sorter.DocMap docMap) {
super(in);
old2new = sorter.oldToNew(in);
if (old2new.length != in.maxDoc()) {
throw new IllegalArgumentException("sorter should provide mapping for every document in the index, including deleted ones");
}
new2old = new int[old2new.length];
for (int i = 0; i < new2old.length; i++) {
new2old[old2new[i]] = i;
}
if (!in.hasDeletions()) {
mappedLiveDocs = null;
} else {
mappedLiveDocs = new FixedBitSet(in.maxDoc());
mappedLiveDocs.set(0, in.maxDoc());
Bits liveDocs = in.getLiveDocs();
int len = liveDocs.length();
for (int i = 0; i < len; i++) {
if (!liveDocs.get(i)) {
mappedLiveDocs.clear(old2new[i]);
}
}
}
this.docMap = docMap;
}
@Override
public void document(final int docID, final StoredFieldVisitor visitor) throws IOException {
in.document(new2old[docID], visitor);
in.document(docMap.newToOld(docID), visitor);
}
@Override
@ -591,7 +581,7 @@ public class SortingAtomicReader extends FilterAtomicReader {
if (fields == null) {
return null;
} else {
return new SortingFields(fields, in.getLiveDocs(), in.getFieldInfos(), old2new);
return new SortingFields(fields, in.getLiveDocs(), in.getFieldInfos(), docMap);
}
}
@ -601,14 +591,29 @@ public class SortingAtomicReader extends FilterAtomicReader {
if (oldDocValues == null) {
return null;
} else {
return new SortingBinaryDocValues(oldDocValues, new2old);
return new SortingBinaryDocValues(oldDocValues, docMap);
}
}
@Override
public Bits getLiveDocs() {
ensureOpen();
return mappedLiveDocs;
final Bits inLiveDocs = in.getLiveDocs();
if (inLiveDocs == null) {
return null;
}
return new Bits() {
@Override
public boolean get(int index) {
return inLiveDocs.get(docMap.newToOld(index));
}
@Override
public int length() {
return inLiveDocs.length();
}
};
}
@Override
@ -617,7 +622,7 @@ public class SortingAtomicReader extends FilterAtomicReader {
if (norm == null) {
return null;
} else {
return new SortingNumericDocValues(norm, new2old);
return new SortingNumericDocValues(norm, docMap);
}
}
@ -625,7 +630,7 @@ public class SortingAtomicReader extends FilterAtomicReader {
public NumericDocValues getNumericDocValues(String field) throws IOException {
final NumericDocValues oldDocValues = in.getNumericDocValues(field);
if (oldDocValues == null) return null;
return new SortingNumericDocValues(oldDocValues, new2old);
return new SortingNumericDocValues(oldDocValues, docMap);
}
@Override
@ -634,7 +639,7 @@ public class SortingAtomicReader extends FilterAtomicReader {
if (sortedDV == null) {
return null;
} else {
return new SortingSortedDocValues(sortedDV, new2old);
return new SortingSortedDocValues(sortedDV, docMap);
}
}
@ -644,13 +649,13 @@ public class SortingAtomicReader extends FilterAtomicReader {
if (sortedSetDV == null) {
return null;
} else {
return new SortingSortedSetDocValues(sortedSetDV, new2old);
return new SortingSortedSetDocValues(sortedSetDV, docMap);
}
}
@Override
public Fields getTermVectors(final int docID) throws IOException {
return in.getTermVectors(new2old[docID]);
return in.getTermVectors(docMap.newToOld(docID));
}
}

View File

@ -61,7 +61,7 @@ public class IndexSortingTest extends SorterTestBase {
Directory target = newDirectory();
IndexWriter writer = new IndexWriter(target, newIndexWriterConfig(TEST_VERSION_CURRENT, null));
reader = new SortingAtomicReader(reader, sorter);
reader = SortingAtomicReader.sort(reader, sorter);
writer.addIndexes(reader);
writer.close();
reader.close();

View File

@ -19,7 +19,6 @@ package org.apache.lucene.index.sorter;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.util.Bits;
@ -33,28 +32,34 @@ public class SortingAtomicReaderTest extends SorterTestBase {
// build the mapping from the reader, since we deleted documents, some of
// them might have disappeared from the index (e.g. if an entire segment is
// dropped b/c all its docs are deleted)
Integer[] values = new Integer[reader.maxDoc()];
int[] docs = new int[reader.maxDoc()];
final int[] values = new int[reader.maxDoc()];
for (int i = 0; i < reader.maxDoc(); i++) {
docs[i] = i;
values[i] = Integer.valueOf(reader.document(i).get(ID_FIELD));
}
final Sorter.DocComparator comparator = new Sorter.DocComparator() {
@Override
public int compare(int docID1, int docID2) {
final int v1 = values[docID1];
final int v2 = values[docID2];
return v1 < v2 ? -1 : v1 == v2 ? 0 : 1;
}
};
final int[] oldToNew = Sorter.compute(docs, Collections.unmodifiableList(Arrays.asList(values)));
final Sorter.DocMap docMap = Sorter.sort(reader.maxDoc(), comparator);
// Sorter.compute also sorts the values
sortedValues = new Integer[reader.maxDoc()];
for (int i = 0; i < reader.maxDoc(); ++i) {
sortedValues[oldToNew[i]] = values[i];
sortedValues[docMap.oldToNew(i)] = values[i];
}
if (VERBOSE) {
System.out.println("oldToNew: " + Arrays.toString(oldToNew));
System.out.println("docMap: " + docMap);
System.out.println("sortedValues: " + Arrays.toString(sortedValues));
}
reader = new SortingAtomicReader(reader, new Sorter() {
reader = SortingAtomicReader.sort(reader, new Sorter() {
@Override
public int[] oldToNew(AtomicReader reader) throws IOException {
return oldToNew;
public Sorter.DocMap sort(AtomicReader reader) throws IOException {
return docMap;
}
});