mirror of https://github.com/apache/lucene.git
LUCENE-6766: implement STRING sort, using segment-local ordinals
This commit is contained in:
parent
54650eccf3
commit
231cb3eb42
|
@ -213,34 +213,29 @@ public class MergeState {
|
|||
//System.out.println("MergeState.maybeSortReaders indexSort=" + indexSort);
|
||||
|
||||
for (CodecReader leaf : originalReaders) {
|
||||
if (leaf instanceof SegmentReader) {
|
||||
SegmentReader segmentReader = (SegmentReader) leaf;
|
||||
Sort segmentSort = segmentReader.getSegmentInfo().info.getIndexSort();
|
||||
//System.out.println(" leaf=" + leaf + " sort=" + segmentSort);
|
||||
Sort segmentSort = leaf.getIndexSort();
|
||||
//System.out.println(" leaf=" + leaf + " sort=" + segmentSort);
|
||||
|
||||
if (segmentSort == null) {
|
||||
// TODO: fix IW to also sort when flushing? It's somewhat tricky because of stored fields and term vectors, which write "live"
|
||||
// to the files on each indexed document:
|
||||
if (segmentSort == null) {
|
||||
// TODO: fix IW to also sort when flushing? It's somewhat tricky because of stored fields and term vectors, which write "live"
|
||||
// to the files on each indexed document:
|
||||
|
||||
// This segment was written by flush, so documents are not yet sorted, so we sort them now:
|
||||
Sorter.DocMap sortDocMap = sorter.sort(leaf);
|
||||
if (sortDocMap != null) {
|
||||
//System.out.println(" sort!");
|
||||
// nocommit what about MergedReaderWrapper in here?
|
||||
leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(leaf, sortDocMap));
|
||||
leafDocMaps[readers.size()] = new DocMap() {
|
||||
@Override
|
||||
public int get(int docID) {
|
||||
return sortDocMap.oldToNew(docID);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
} else if (segmentSort.equals(indexSort) == false) {
|
||||
throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort + " but to-be-merged segment has sort=" + segmentSort);
|
||||
// This segment was written by flush, so documents are not yet sorted, so we sort them now:
|
||||
Sorter.DocMap sortDocMap = sorter.sort(leaf);
|
||||
if (sortDocMap != null) {
|
||||
//System.out.println(" sort!");
|
||||
// nocommit what about MergedReaderWrapper in here?
|
||||
leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(leaf, sortDocMap));
|
||||
leafDocMaps[readers.size()] = new DocMap() {
|
||||
@Override
|
||||
public int get(int docID) {
|
||||
return sortDocMap.oldToNew(docID);
|
||||
}
|
||||
};
|
||||
}
|
||||
} else {
|
||||
throw new IllegalArgumentException("cannot sort index with foreign readers; leaf=" + leaf);
|
||||
|
||||
} else if (segmentSort.equals(indexSort) == false) {
|
||||
throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort + " but to-be-merged segment has sort=" + segmentSort);
|
||||
}
|
||||
|
||||
readers.add(leaf);
|
||||
|
|
|
@ -123,7 +123,47 @@ final class MultiSorter {
|
|||
|
||||
private static CrossReaderComparator getComparator(List<CodecReader> readers, SortField sortField) throws IOException {
|
||||
switch(sortField.getType()) {
|
||||
// ncommit: use segment-local ords for string sort
|
||||
|
||||
case STRING:
|
||||
{
|
||||
// this uses the efficient segment-local ordinal map:
|
||||
MultiReader multiReader = new MultiReader(readers.toArray(new LeafReader[readers.size()]));
|
||||
final SortedDocValues sorted = MultiDocValues.getSortedValues(multiReader, sortField.getField());
|
||||
final int[] docStarts = new int[readers.size()];
|
||||
List<LeafReaderContext> leaves = multiReader.leaves();
|
||||
for(int i=0;i<readers.size();i++) {
|
||||
docStarts[i] = leaves.get(i).docBase;
|
||||
}
|
||||
final int missingOrd;
|
||||
if (sortField.getMissingValue() == SortField.STRING_LAST) {
|
||||
missingOrd = Integer.MIN_VALUE;
|
||||
} else {
|
||||
missingOrd = Integer.MAX_VALUE;
|
||||
}
|
||||
|
||||
final int reverseMul;
|
||||
if (sortField.getReverse()) {
|
||||
reverseMul = -1;
|
||||
} else {
|
||||
reverseMul = 1;
|
||||
}
|
||||
|
||||
return new CrossReaderComparator() {
|
||||
@Override
|
||||
public int compare(int readerIndexA, int docIDA, int readerIndexB, int docIDB) {
|
||||
int ordA = sorted.getOrd(docStarts[readerIndexA] + docIDA);
|
||||
if (ordA == -1) {
|
||||
ordA = missingOrd;
|
||||
}
|
||||
int ordB = sorted.getOrd(docStarts[readerIndexB] + docIDB);
|
||||
if (ordB == -1) {
|
||||
ordB = missingOrd;
|
||||
}
|
||||
return reverseMul * Integer.compare(ordA, ordB);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
case INT:
|
||||
{
|
||||
List<NumericDocValues> values = new ArrayList<>();
|
||||
|
|
|
@ -79,6 +79,40 @@ import org.junit.BeforeClass;
|
|||
|
||||
public class TestIndexSorting extends LuceneTestCase {
|
||||
|
||||
public void testBasicString() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
|
||||
Sort indexSort = new Sort(new SortField("foo", SortField.Type.STRING));
|
||||
iwc.setIndexSort(indexSort);
|
||||
IndexWriter w = new IndexWriter(dir, iwc);
|
||||
Document doc = new Document();
|
||||
doc.add(new SortedDocValuesField("foo", new BytesRef("zzz")));
|
||||
w.addDocument(doc);
|
||||
// so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging:
|
||||
w.commit();
|
||||
|
||||
doc = new Document();
|
||||
doc.add(new SortedDocValuesField("foo", new BytesRef("aaa")));
|
||||
w.addDocument(doc);
|
||||
w.commit();
|
||||
|
||||
doc = new Document();
|
||||
doc.add(new SortedDocValuesField("foo", new BytesRef("mmm")));
|
||||
w.addDocument(doc);
|
||||
w.forceMerge(1);
|
||||
|
||||
DirectoryReader r = DirectoryReader.open(w);
|
||||
LeafReader leaf = getOnlyLeafReader(r);
|
||||
assertEquals(3, leaf.maxDoc());
|
||||
SortedDocValues values = leaf.getSortedDocValues("foo");
|
||||
assertEquals("aaa", values.get(0).utf8ToString());
|
||||
assertEquals("mmm", values.get(1).utf8ToString());
|
||||
assertEquals("zzz", values.get(2).utf8ToString());
|
||||
r.close();
|
||||
w.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testSortOnMerge(boolean withDeletes) throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
|
||||
|
|
|
@ -146,6 +146,7 @@ public class MockRandomMergePolicy extends MergePolicy {
|
|||
|
||||
@Override
|
||||
public CodecReader wrapForMerge(CodecReader reader) throws IOException {
|
||||
|
||||
// wrap it (e.g. prevent bulk merge etc)
|
||||
// TODO: cut this over to FilterCodecReader api, we can explicitly
|
||||
// enable/disable bulk merge for portions of the index we want.
|
||||
|
|
Loading…
Reference in New Issue