From 231cb3eb42e03de721329a8c40fbb47a9bde4bbd Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Sat, 7 May 2016 11:46:17 -0400 Subject: [PATCH] LUCENE-6766: implement STRING sort, using segment-local ordinals --- .../org/apache/lucene/index/MergeState.java | 45 +++++++++---------- .../org/apache/lucene/index/MultiSorter.java | 42 ++++++++++++++++- .../apache/lucene/index/TestIndexSorting.java | 34 ++++++++++++++ .../lucene/index/MockRandomMergePolicy.java | 1 + 4 files changed, 96 insertions(+), 26 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java index 32e048086a7..31065e31656 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java @@ -213,34 +213,29 @@ public class MergeState { //System.out.println("MergeState.maybeSortReaders indexSort=" + indexSort); for (CodecReader leaf : originalReaders) { - if (leaf instanceof SegmentReader) { - SegmentReader segmentReader = (SegmentReader) leaf; - Sort segmentSort = segmentReader.getSegmentInfo().info.getIndexSort(); - //System.out.println(" leaf=" + leaf + " sort=" + segmentSort); + Sort segmentSort = leaf.getIndexSort(); + //System.out.println(" leaf=" + leaf + " sort=" + segmentSort); - if (segmentSort == null) { - // TODO: fix IW to also sort when flushing? It's somewhat tricky because of stored fields and term vectors, which write "live" - // to the files on each indexed document: + if (segmentSort == null) { + // TODO: fix IW to also sort when flushing? It's somewhat tricky because of stored fields and term vectors, which write "live" + // to the files on each indexed document: - // This segment was written by flush, so documents are not yet sorted, so we sort them now: - Sorter.DocMap sortDocMap = sorter.sort(leaf); - if (sortDocMap != null) { - //System.out.println(" sort!"); - // nocommit what about MergedReaderWrapper in here? - leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(leaf, sortDocMap)); - leafDocMaps[readers.size()] = new DocMap() { - @Override - public int get(int docID) { - return sortDocMap.oldToNew(docID); - } - }; - } - - } else if (segmentSort.equals(indexSort) == false) { - throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort + " but to-be-merged segment has sort=" + segmentSort); + // This segment was written by flush, so documents are not yet sorted, so we sort them now: + Sorter.DocMap sortDocMap = sorter.sort(leaf); + if (sortDocMap != null) { + //System.out.println(" sort!"); + // nocommit what about MergedReaderWrapper in here? + leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(leaf, sortDocMap)); + leafDocMaps[readers.size()] = new DocMap() { + @Override + public int get(int docID) { + return sortDocMap.oldToNew(docID); + } + }; } - } else { - throw new IllegalArgumentException("cannot sort index with foreign readers; leaf=" + leaf); + + } else if (segmentSort.equals(indexSort) == false) { + throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort + " but to-be-merged segment has sort=" + segmentSort); } readers.add(leaf); diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java index 7f71eb55880..ca1ebe57780 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java @@ -123,7 +123,47 @@ final class MultiSorter { private static CrossReaderComparator getComparator(List readers, SortField sortField) throws IOException { switch(sortField.getType()) { - // ncommit: use segment-local ords for string sort + + case STRING: + { + // this uses the efficient segment-local ordinal map: + MultiReader multiReader = new MultiReader(readers.toArray(new LeafReader[readers.size()])); + final SortedDocValues sorted = MultiDocValues.getSortedValues(multiReader, sortField.getField()); + final int[] docStarts = new int[readers.size()]; + List leaves = multiReader.leaves(); + for(int i=0;i values = new ArrayList<>(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java index 8df81bab264..1da6c82cc0e 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java @@ -79,6 +79,40 @@ import org.junit.BeforeClass; public class TestIndexSorting extends LuceneTestCase { + public void testBasicString() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + Sort indexSort = new Sort(new SortField("foo", SortField.Type.STRING)); + iwc.setIndexSort(indexSort); + IndexWriter w = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new SortedDocValuesField("foo", new BytesRef("zzz"))); + w.addDocument(doc); + // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging: + w.commit(); + + doc = new Document(); + doc.add(new SortedDocValuesField("foo", new BytesRef("aaa"))); + w.addDocument(doc); + w.commit(); + + doc = new Document(); + doc.add(new SortedDocValuesField("foo", new BytesRef("mmm"))); + w.addDocument(doc); + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(3, leaf.maxDoc()); + SortedDocValues values = leaf.getSortedDocValues("foo"); + assertEquals("aaa", values.get(0).utf8ToString()); + assertEquals("mmm", values.get(1).utf8ToString()); + assertEquals("zzz", values.get(2).utf8ToString()); + r.close(); + w.close(); + dir.close(); + } + public void testSortOnMerge(boolean withDeletes) throws IOException { Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java b/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java index 93898881416..f32e4d3c118 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java @@ -146,6 +146,7 @@ public class MockRandomMergePolicy extends MergePolicy { @Override public CodecReader wrapForMerge(CodecReader reader) throws IOException { + // wrap it (e.g. prevent bulk merge etc) // TODO: cut this over to FilterCodecReader api, we can explicitly // enable/disable bulk merge for portions of the index we want.