mirror of https://github.com/apache/lucene.git
LUCENE-6766: implement STRING sort, using segment-local ordinals
This commit is contained in:
parent
54650eccf3
commit
231cb3eb42
|
@ -213,34 +213,29 @@ public class MergeState {
|
||||||
//System.out.println("MergeState.maybeSortReaders indexSort=" + indexSort);
|
//System.out.println("MergeState.maybeSortReaders indexSort=" + indexSort);
|
||||||
|
|
||||||
for (CodecReader leaf : originalReaders) {
|
for (CodecReader leaf : originalReaders) {
|
||||||
if (leaf instanceof SegmentReader) {
|
Sort segmentSort = leaf.getIndexSort();
|
||||||
SegmentReader segmentReader = (SegmentReader) leaf;
|
//System.out.println(" leaf=" + leaf + " sort=" + segmentSort);
|
||||||
Sort segmentSort = segmentReader.getSegmentInfo().info.getIndexSort();
|
|
||||||
//System.out.println(" leaf=" + leaf + " sort=" + segmentSort);
|
|
||||||
|
|
||||||
if (segmentSort == null) {
|
if (segmentSort == null) {
|
||||||
// TODO: fix IW to also sort when flushing? It's somewhat tricky because of stored fields and term vectors, which write "live"
|
// TODO: fix IW to also sort when flushing? It's somewhat tricky because of stored fields and term vectors, which write "live"
|
||||||
// to the files on each indexed document:
|
// to the files on each indexed document:
|
||||||
|
|
||||||
// This segment was written by flush, so documents are not yet sorted, so we sort them now:
|
// This segment was written by flush, so documents are not yet sorted, so we sort them now:
|
||||||
Sorter.DocMap sortDocMap = sorter.sort(leaf);
|
Sorter.DocMap sortDocMap = sorter.sort(leaf);
|
||||||
if (sortDocMap != null) {
|
if (sortDocMap != null) {
|
||||||
//System.out.println(" sort!");
|
//System.out.println(" sort!");
|
||||||
// nocommit what about MergedReaderWrapper in here?
|
// nocommit what about MergedReaderWrapper in here?
|
||||||
leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(leaf, sortDocMap));
|
leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(leaf, sortDocMap));
|
||||||
leafDocMaps[readers.size()] = new DocMap() {
|
leafDocMaps[readers.size()] = new DocMap() {
|
||||||
@Override
|
@Override
|
||||||
public int get(int docID) {
|
public int get(int docID) {
|
||||||
return sortDocMap.oldToNew(docID);
|
return sortDocMap.oldToNew(docID);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
|
||||||
|
|
||||||
} else if (segmentSort.equals(indexSort) == false) {
|
|
||||||
throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort + " but to-be-merged segment has sort=" + segmentSort);
|
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
throw new IllegalArgumentException("cannot sort index with foreign readers; leaf=" + leaf);
|
} else if (segmentSort.equals(indexSort) == false) {
|
||||||
|
throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort + " but to-be-merged segment has sort=" + segmentSort);
|
||||||
}
|
}
|
||||||
|
|
||||||
readers.add(leaf);
|
readers.add(leaf);
|
||||||
|
|
|
@ -123,7 +123,47 @@ final class MultiSorter {
|
||||||
|
|
||||||
private static CrossReaderComparator getComparator(List<CodecReader> readers, SortField sortField) throws IOException {
|
private static CrossReaderComparator getComparator(List<CodecReader> readers, SortField sortField) throws IOException {
|
||||||
switch(sortField.getType()) {
|
switch(sortField.getType()) {
|
||||||
// ncommit: use segment-local ords for string sort
|
|
||||||
|
case STRING:
|
||||||
|
{
|
||||||
|
// this uses the efficient segment-local ordinal map:
|
||||||
|
MultiReader multiReader = new MultiReader(readers.toArray(new LeafReader[readers.size()]));
|
||||||
|
final SortedDocValues sorted = MultiDocValues.getSortedValues(multiReader, sortField.getField());
|
||||||
|
final int[] docStarts = new int[readers.size()];
|
||||||
|
List<LeafReaderContext> leaves = multiReader.leaves();
|
||||||
|
for(int i=0;i<readers.size();i++) {
|
||||||
|
docStarts[i] = leaves.get(i).docBase;
|
||||||
|
}
|
||||||
|
final int missingOrd;
|
||||||
|
if (sortField.getMissingValue() == SortField.STRING_LAST) {
|
||||||
|
missingOrd = Integer.MIN_VALUE;
|
||||||
|
} else {
|
||||||
|
missingOrd = Integer.MAX_VALUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
final int reverseMul;
|
||||||
|
if (sortField.getReverse()) {
|
||||||
|
reverseMul = -1;
|
||||||
|
} else {
|
||||||
|
reverseMul = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return new CrossReaderComparator() {
|
||||||
|
@Override
|
||||||
|
public int compare(int readerIndexA, int docIDA, int readerIndexB, int docIDB) {
|
||||||
|
int ordA = sorted.getOrd(docStarts[readerIndexA] + docIDA);
|
||||||
|
if (ordA == -1) {
|
||||||
|
ordA = missingOrd;
|
||||||
|
}
|
||||||
|
int ordB = sorted.getOrd(docStarts[readerIndexB] + docIDB);
|
||||||
|
if (ordB == -1) {
|
||||||
|
ordB = missingOrd;
|
||||||
|
}
|
||||||
|
return reverseMul * Integer.compare(ordA, ordB);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
case INT:
|
case INT:
|
||||||
{
|
{
|
||||||
List<NumericDocValues> values = new ArrayList<>();
|
List<NumericDocValues> values = new ArrayList<>();
|
||||||
|
|
|
@ -79,6 +79,40 @@ import org.junit.BeforeClass;
|
||||||
|
|
||||||
public class TestIndexSorting extends LuceneTestCase {
|
public class TestIndexSorting extends LuceneTestCase {
|
||||||
|
|
||||||
|
public void testBasicString() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
|
||||||
|
Sort indexSort = new Sort(new SortField("foo", SortField.Type.STRING));
|
||||||
|
iwc.setIndexSort(indexSort);
|
||||||
|
IndexWriter w = new IndexWriter(dir, iwc);
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new SortedDocValuesField("foo", new BytesRef("zzz")));
|
||||||
|
w.addDocument(doc);
|
||||||
|
// so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging:
|
||||||
|
w.commit();
|
||||||
|
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new SortedDocValuesField("foo", new BytesRef("aaa")));
|
||||||
|
w.addDocument(doc);
|
||||||
|
w.commit();
|
||||||
|
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new SortedDocValuesField("foo", new BytesRef("mmm")));
|
||||||
|
w.addDocument(doc);
|
||||||
|
w.forceMerge(1);
|
||||||
|
|
||||||
|
DirectoryReader r = DirectoryReader.open(w);
|
||||||
|
LeafReader leaf = getOnlyLeafReader(r);
|
||||||
|
assertEquals(3, leaf.maxDoc());
|
||||||
|
SortedDocValues values = leaf.getSortedDocValues("foo");
|
||||||
|
assertEquals("aaa", values.get(0).utf8ToString());
|
||||||
|
assertEquals("mmm", values.get(1).utf8ToString());
|
||||||
|
assertEquals("zzz", values.get(2).utf8ToString());
|
||||||
|
r.close();
|
||||||
|
w.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
public void testSortOnMerge(boolean withDeletes) throws IOException {
|
public void testSortOnMerge(boolean withDeletes) throws IOException {
|
||||||
Directory dir = newDirectory();
|
Directory dir = newDirectory();
|
||||||
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
|
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
|
||||||
|
|
|
@ -146,6 +146,7 @@ public class MockRandomMergePolicy extends MergePolicy {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public CodecReader wrapForMerge(CodecReader reader) throws IOException {
|
public CodecReader wrapForMerge(CodecReader reader) throws IOException {
|
||||||
|
|
||||||
// wrap it (e.g. prevent bulk merge etc)
|
// wrap it (e.g. prevent bulk merge etc)
|
||||||
// TODO: cut this over to FilterCodecReader api, we can explicitly
|
// TODO: cut this over to FilterCodecReader api, we can explicitly
|
||||||
// enable/disable bulk merge for portions of the index we want.
|
// enable/disable bulk merge for portions of the index we want.
|
||||||
|
|
Loading…
Reference in New Issue