make multidocvalues.getSortedValues more efficient

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1442294 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-02-04 18:35:35 +00:00
parent 563c6fa093
commit 15803eece1
2 changed files with 102 additions and 69 deletions

View File

@ -18,18 +18,12 @@ package org.apache.lucene.index;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.apache.lucene.codecs.Codec; import org.apache.lucene.index.MultiTermsEnum.TermsEnumIndex;
import org.apache.lucene.index.FieldInfo.DocValuesType; import org.apache.lucene.index.MultiTermsEnum.TermsEnumWithSlice;
import org.apache.lucene.index.IndexReader.ReaderClosedListener;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.packed.AppendingLongBuffer;
import org.apache.lucene.util.Version;
/** /**
* A wrapper for CompositeIndexReader providing access to DocValues. * A wrapper for CompositeIndexReader providing access to DocValues.
@ -45,7 +39,6 @@ import org.apache.lucene.util.Version;
* @lucene.experimental * @lucene.experimental
* @lucene.internal * @lucene.internal
*/ */
// nocommit move this back to test-framework!!!
public class MultiDocValues { public class MultiDocValues {
/** No instantiation */ /** No instantiation */
@ -194,75 +187,115 @@ public class MultiDocValues {
*/ */
public static SortedDocValues getSortedValues(final IndexReader r, final String field) throws IOException { public static SortedDocValues getSortedValues(final IndexReader r, final String field) throws IOException {
final List<AtomicReaderContext> leaves = r.leaves(); final List<AtomicReaderContext> leaves = r.leaves();
if (leaves.size() == 1) { final int size = leaves.size();
if (size == 0) {
return null;
} else if (size == 1) {
return leaves.get(0).reader().getSortedDocValues(field); return leaves.get(0).reader().getSortedDocValues(field);
} }
boolean anyReal = false; boolean anyReal = false;
final SortedDocValues[] values = new SortedDocValues[size];
for(AtomicReaderContext ctx : leaves) { final int[] starts = new int[size+1];
SortedDocValues values = ctx.reader().getSortedDocValues(field); for (int i = 0; i < size; i++) {
AtomicReaderContext context = leaves.get(i);
if (values != null) { SortedDocValues v = context.reader().getSortedDocValues(field);
if (v == null) {
v = SortedDocValues.EMPTY;
} else {
anyReal = true; anyReal = true;
} }
values[i] = v;
starts[i] = context.docBase;
} }
starts[size] = r.maxDoc();
if (!anyReal) { if (!anyReal) {
return null; return null;
} else { } else {
// its called slow-wrapper for a reason right? OrdinalMapping mapping = new OrdinalMapping(values);
final Directory scratch = new RAMDirectory(); return new MultiSortedDocValues(values, starts, mapping);
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_50, null); }
config.setCodec(Codec.forName("SimpleText")); }
IndexWriter writer = new IndexWriter(scratch, config);
List<AtomicReader> newLeaves = new ArrayList<AtomicReader>(); /** maps per-segment ordinals to/from global ordinal space */
// fake up fieldinfos // TODO: use more efficient packed ints structures (these are all positive values!)
FieldInfo fi = new FieldInfo(field, false, 0, false, false, false, null, DocValuesType.SORTED, null, null); // nocommit: cache this in SlowWrapper, it can create MultiSortedDV with it directly.
final FieldInfos fis = new FieldInfos(new FieldInfo[] { fi }); static class OrdinalMapping {
for (AtomicReaderContext ctx : leaves) { // globalOrd -> (globalOrd - segmentOrd)
final AtomicReader a = ctx.reader(); final AppendingLongBuffer globalOrdDeltas;
newLeaves.add(new FilterAtomicReader(a) { // globalOrd -> sub index
@Override final AppendingLongBuffer subIndexes;
public Bits getLiveDocs() { // segmentOrd -> (globalOrd - segmentOrd)
return null; // lie final AppendingLongBuffer ordDeltas[];
}
@Override OrdinalMapping(SortedDocValues subs[]) throws IOException {
public int numDocs() { // create the ordinal mappings by pulling a termsenum over each sub's
return maxDoc(); // lie // unique terms, and walking a multitermsenum over those
} globalOrdDeltas = new AppendingLongBuffer();
@Override subIndexes = new AppendingLongBuffer();
public boolean hasDeletions() { ordDeltas = new AppendingLongBuffer[subs.length];
return false; // lie for (int i = 0; i < ordDeltas.length; i++) {
} ordDeltas[i] = new AppendingLongBuffer();
@Override
public FieldInfos getFieldInfos() {
return fis;
}
@Override
public Fields getTermVectors(int docID) throws IOException {
return null; // lie
}
@Override
public void document(int docID, StoredFieldVisitor visitor) throws IOException {
// lie
}
@Override
public Fields fields() throws IOException {
return null; // lie
}
});
} }
writer.addIndexes(newLeaves.toArray(new AtomicReader[0])); int segmentOrds[] = new int[subs.length];
writer.close(); ReaderSlice slices[] = new ReaderSlice[subs.length];
final IndexReader newR = DirectoryReader.open(scratch); TermsEnumIndex indexes[] = new TermsEnumIndex[slices.length];
assert newR.leaves().size() == 1; for (int i = 0; i < slices.length; i++) {
r.addReaderClosedListener(new ReaderClosedListener() { slices[i] = new ReaderSlice(0, 0, i);
@Override indexes[i] = new TermsEnumIndex(new SortedDocValuesTermsEnum(subs[i]), i);
public void onClose(IndexReader reader) { }
IOUtils.closeWhileHandlingException(newR, scratch); MultiTermsEnum mte = new MultiTermsEnum(slices);
mte.reset(indexes);
int globalOrd = 0;
while (mte.next() != null) {
TermsEnumWithSlice matches[] = mte.getMatchArray();
for (int i = 0; i < mte.getMatchCount(); i++) {
int subIndex = matches[i].index;
// for each unique term, just mark the first subindex/delta where it occurs
if (i == 0) {
subIndexes.add(subIndex);
globalOrdDeltas.add(globalOrd - segmentOrds[subIndex]);
}
// for each per-segment ord, map it back to the global term.
ordDeltas[subIndex].add(globalOrd - segmentOrds[subIndex]);
segmentOrds[subIndex]++;
} }
}); globalOrd++;
return newR.leaves().get(0).reader().getSortedDocValues(field); }
}
}
/** implements SortedDocValues over n subs, using an OrdinalMapping */
static class MultiSortedDocValues extends SortedDocValues {
final int docStarts[];
final SortedDocValues values[];
final OrdinalMapping mapping;
/**
 * Stores the per-sub values, the doc start offsets and the ordinal mapping
 * that the lookup methods of this class read from.
 *
 * @param values    one SortedDocValues per sub
 * @param docStarts doc start offsets, searched with ReaderUtil.subIndex to
 *                  pick the sub for a docID
 * @param mapping   mapping between per-segment ordinals and global ordinals
 */
MultiSortedDocValues(SortedDocValues values[], int docStarts[], OrdinalMapping mapping) throws IOException {
  this.mapping = mapping;
  this.docStarts = docStarts;
  this.values = values;
}
/**
 * Returns the global ordinal for {@code docID}: the sub is picked with
 * ReaderUtil.subIndex over the doc starts, the sub is asked for its own
 * ordinal of the sub-relative doc, and the stored
 * (globalOrd - segmentOrd) delta for that ordinal is added back.
 */
@Override
public int getOrd(int docID) {
  final int sub = ReaderUtil.subIndex(docID, docStarts);
  // doc id relative to the start of the selected sub
  final int localDoc = docID - docStarts[sub];
  final int localOrd = values[sub].getOrd(localDoc);
  final long delta = mapping.ordDeltas[sub].get(localOrd);
  return (int) (localOrd + delta);
}
/**
 * Resolves a global ordinal into its term bytes: reads the sub index
 * recorded for {@code ord}, subtracts the recorded
 * (globalOrd - segmentOrd) delta to get that sub's ordinal, and delegates
 * the lookup to the sub, which fills {@code result}.
 */
@Override
public void lookupOrd(int ord, BytesRef result) {
  final int sub = (int) mapping.subIndexes.get(ord);
  final long delta = mapping.globalOrdDeltas.get(ord);
  final int localOrd = (int) (ord - delta);
  values[sub].lookupOrd(localOrd, result);
}
@Override
public int getValueCount() {
return mapping.globalOrdDeltas.size();
} }
} }
} }

View File

@ -495,7 +495,7 @@ public final class MultiTermsEnum extends TermsEnum {
} }
} }
private final static class TermsEnumWithSlice { final static class TermsEnumWithSlice {
private final ReaderSlice subSlice; private final ReaderSlice subSlice;
private TermsEnum terms; private TermsEnum terms;
public BytesRef current; public BytesRef current;