LUCENE-6766: implement STRING sort, using segment-local ordinals

This commit is contained in:
Mike McCandless 2016-05-07 11:46:17 -04:00
parent 54650eccf3
commit 231cb3eb42
4 changed files with 96 additions and 26 deletions

View File

@ -213,9 +213,7 @@ public class MergeState {
//System.out.println("MergeState.maybeSortReaders indexSort=" + indexSort);
for (CodecReader leaf : originalReaders) {
if (leaf instanceof SegmentReader) {
SegmentReader segmentReader = (SegmentReader) leaf;
Sort segmentSort = segmentReader.getSegmentInfo().info.getIndexSort();
Sort segmentSort = leaf.getIndexSort();
//System.out.println(" leaf=" + leaf + " sort=" + segmentSort);
if (segmentSort == null) {
@ -239,9 +237,6 @@ public class MergeState {
} else if (segmentSort.equals(indexSort) == false) {
throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort + " but to-be-merged segment has sort=" + segmentSort);
}
} else {
throw new IllegalArgumentException("cannot sort index with foreign readers; leaf=" + leaf);
}
readers.add(leaf);
}

View File

@ -123,7 +123,47 @@ final class MultiSorter {
private static CrossReaderComparator getComparator(List<CodecReader> readers, SortField sortField) throws IOException {
switch(sortField.getType()) {
// ncommit: use segment-local ords for string sort
case STRING:
  {
    // Compare documents by ordinal rather than by term bytes: view all readers as one
    // MultiReader and ask MultiDocValues for a single SortedDocValues spanning it.
    MultiReader multiReader = new MultiReader(readers.toArray(new LeafReader[readers.size()]));
    final SortedDocValues sorted = MultiDocValues.getSortedValues(multiReader, sortField.getField());

    // docStarts[i] is the docBase of reader i inside multiReader; adding it to a
    // reader-local docID yields the docID that `sorted` expects.
    final int[] docStarts = new int[readers.size()];
    List<LeafReaderContext> leaves = multiReader.leaves();
    for (int i = 0; i < readers.size(); i++) {
      docStarts[i] = leaves.get(i).docBase;
    }

    // Ordinal substituted for documents that have no value (getOrd == -1).
    // STRING_LAST must compare greater than every real ordinal so missing docs sort
    // after all others; otherwise they must compare less than every real ordinal.
    final int missingOrd;
    if (sortField.getMissingValue() == SortField.STRING_LAST) {
      missingOrd = Integer.MAX_VALUE;
    } else {
      missingOrd = Integer.MIN_VALUE;
    }

    // The sign flip is applied to the whole comparison, including missing-vs-present.
    final int reverseMul;
    if (sortField.getReverse()) {
      reverseMul = -1;
    } else {
      reverseMul = 1;
    }

    return new CrossReaderComparator() {
      @Override
      public int compare(int readerIndexA, int docIDA, int readerIndexB, int docIDB) {
        int ordA = sorted.getOrd(docStarts[readerIndexA] + docIDA);
        if (ordA == -1) {
          ordA = missingOrd;
        }
        int ordB = sorted.getOrd(docStarts[readerIndexB] + docIDB);
        if (ordB == -1) {
          ordB = missingOrd;
        }
        // Integer.compare avoids the overflow that subtraction would hit with the
        // MIN_VALUE/MAX_VALUE sentinels.
        return reverseMul * Integer.compare(ordA, ordB);
      }
    };
  }
case INT:
{
List<NumericDocValues> values = new ArrayList<>();

View File

@ -79,6 +79,40 @@ import org.junit.BeforeClass;
public class TestIndexSorting extends LuceneTestCase {
/**
 * Indexes three documents whose "foo" doc values are out of order, force-merges
 * them into one segment under a STRING index sort on "foo", and checks that the
 * merged segment holds the documents in ascending "foo" order.
 */
public void testBasicString() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random()));
  config.setIndexSort(new Sort(new SortField("foo", SortField.Type.STRING)));
  IndexWriter writer = new IndexWriter(dir, config);

  // Values in insertion order, deliberately unsorted.
  String[] inserted = {"zzz", "aaa", "mmm"};
  for (int i = 0; i < inserted.length; i++) {
    Document doc = new Document();
    doc.add(new SortedDocValuesField("foo", new BytesRef(inserted[i])));
    writer.addDocument(doc);
    if (i < inserted.length - 1) {
      // Commit between documents to end up with several segments: forceMerge then
      // has real merging to do, and a sorted segment is only produced by a merge.
      writer.commit();
    }
  }
  writer.forceMerge(1);

  DirectoryReader reader = DirectoryReader.open(writer);
  LeafReader onlyLeaf = getOnlyLeafReader(reader);
  assertEquals(3, onlyLeaf.maxDoc());

  // After the merge, docID order must follow the index sort.
  SortedDocValues fooValues = onlyLeaf.getSortedDocValues("foo");
  String[] expected = {"aaa", "mmm", "zzz"};
  for (int docID = 0; docID < expected.length; docID++) {
    assertEquals(expected[docID], fooValues.get(docID).utf8ToString());
  }

  reader.close();
  writer.close();
  dir.close();
}
public void testSortOnMerge(boolean withDeletes) throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));

View File

@ -146,6 +146,7 @@ public class MockRandomMergePolicy extends MergePolicy {
@Override
public CodecReader wrapForMerge(CodecReader reader) throws IOException {
// wrap it (e.g. prevent bulk merge etc)
// TODO: cut this over to FilterCodecReader api, we can explicitly
// enable/disable bulk merge for portions of the index we want.