mirror of https://github.com/apache/lucene.git
LUCENE-7101: OfflineSorter had O(N^2) merge cost, and used too many temporary file descriptors, for large sorts
commit 287d6c8be6
parent 4e3f376ca4
@@ -189,6 +189,9 @@ Bug Fixes
   On top of that with score mode average, the explain would fail with a NPE.
   (Martijn van Groningen)
 
+* LUCENE-7101: OfflineSorter had O(N^2) merge cost, and used too many
+  temporary file descriptors, for large sorts (Mike McCandless)
+
 Other
 
 * LUCENE-7035: Upgrade icu4j to 56.1/unicode 8. (Robert Muir)
@@ -64,7 +64,7 @@ public class OfflineSorter {
   /**
    * Maximum number of temporary files before doing an intermediate merge.
    */
-  public final static int MAX_TEMPFILES = 128;
+  public final static int MAX_TEMPFILES = 10;
 
   private final Directory dir;
@@ -232,6 +232,7 @@ public class OfflineSorter {
     sortInfo.totalTime = System.currentTimeMillis();
 
     List<String> segments = new ArrayList<>();
+    int[] levelCounts = new int[1];
 
     // So we can remove any partially written temp files on exception:
     TrackingDirectoryWrapper trackingDir = new TrackingDirectoryWrapper(dir);
@@ -244,15 +245,26 @@ public class OfflineSorter {
         segments.add(sortPartition(trackingDir));
         sortInfo.tempMergeFiles++;
         sortInfo.lineCount += lineCount;
+        levelCounts[0]++;
 
-        // Handle intermediate merges.
-        if (segments.size() == maxTempFiles) {
+        // Handle intermediate merges; we need a while loop to "cascade" the merge when necessary:
+        int mergeLevel = 0;
+        while (levelCounts[mergeLevel] == maxTempFiles) {
           mergePartitions(trackingDir, segments);
+          if (mergeLevel+2 > levelCounts.length) {
+            levelCounts = ArrayUtil.grow(levelCounts, mergeLevel+2);
+          }
+          levelCounts[mergeLevel+1]++;
+          levelCounts[mergeLevel] = 0;
+          mergeLevel++;
         }
       }
 
-      // Merge the partitions to the output file with a priority queue.
-      if (segments.size() > 1) {
+      // TODO: we shouldn't have to do this?  Can't we return a merged reader to
+      // the caller, who often consumes the result just once, instead?
+
+      // Merge all partitions down to 1 (basically a forceMerge(1)):
+      while (segments.size() > 1) {
         mergePartitions(trackingDir, segments);
       }
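The multi-level cascade above is the core of the fix: previously every intermediate merge re-merged all accumulated partitions, giving O(N^2) total merge cost, whereas now partitions accumulate at level 0 and each full batch of maxTempFiles partitions merges into a single partition at the next level. A minimal standalone sketch (not Lucene code; CascadeMergeSim and its variables are hypothetical) that simulates the policy:

    import java.util.Arrays;

    public class CascadeMergeSim {
      public static void main(String[] args) {
        int maxTempFiles = 10;   // fan-in, as in OfflineSorter.MAX_TEMPFILES
        int numPartitions = 20;  // e.g. 20 MB of input sorted in 1 MB buffers
        int[] levelCounts = new int[1];
        int liveSegments = 0;    // temp files alive at once stay bounded
        int mergeRounds = 0;

        for (int i = 0; i < numPartitions; i++) {
          liveSegments++;
          levelCounts[0]++;
          int mergeLevel = 0;
          // A merge at one level may fill the next level, so keep cascading:
          while (levelCounts[mergeLevel] == maxTempFiles) {
            liveSegments -= maxTempFiles - 1;  // maxTempFiles inputs -> 1 output
            mergeRounds++;
            if (mergeLevel + 2 > levelCounts.length) {
              levelCounts = Arrays.copyOf(levelCounts, mergeLevel + 2);
            }
            levelCounts[mergeLevel + 1]++;
            levelCounts[mergeLevel] = 0;
            mergeLevel++;
          }
        }

        // Final "forceMerge(1)": merge whatever remains down to one partition.
        while (liveSegments > 1) {
          liveSegments -= Math.min(liveSegments, maxTempFiles) - 1;
          mergeRounds++;
        }
        System.out.println(mergeRounds + " merge rounds");  // prints "3 merge rounds"
      }
    }

Each partition now participates in O(log N) merges instead of O(N), and at most maxTempFiles inputs are open during any single merge.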
@@ -304,19 +316,25 @@ public class OfflineSorter {
     }
   }
 
-  /** Merge a list of sorted temporary files (partitions) into an output file.  Note that this closes the
-   *  incoming {@link IndexOutput}. */
+  /** Merge the most recent {@code maxTempFiles} partitions into a new partition. */
   void mergePartitions(Directory trackingDir, List<String> segments) throws IOException {
     long start = System.currentTimeMillis();
 
-    PriorityQueue<FileAndTop> queue = new PriorityQueue<FileAndTop>(segments.size()) {
+    List<String> segmentsToMerge;
+    if (segments.size() > maxTempFiles) {
+      segmentsToMerge = segments.subList(segments.size() - maxTempFiles, segments.size());
+    } else {
+      segmentsToMerge = segments;
+    }
+
+    PriorityQueue<FileAndTop> queue = new PriorityQueue<FileAndTop>(segmentsToMerge.size()) {
       @Override
       protected boolean lessThan(FileAndTop a, FileAndTop b) {
         return comparator.compare(a.current.get(), b.current.get()) < 0;
       }
     };
 
-    ByteSequencesReader[] streams = new ByteSequencesReader[segments.size()];
+    ByteSequencesReader[] streams = new ByteSequencesReader[segmentsToMerge.size()];
 
     String newSegmentName = null;
@@ -326,8 +344,8 @@ public class OfflineSorter {
       newSegmentName = out.getName();
 
       // Open streams and read the top for each file
-      for (int i = 0; i < segments.size(); i++) {
-        streams[i] = getReader(dir.openInput(segments.get(i), IOContext.READONCE));
+      for (int i = 0; i < segmentsToMerge.size(); i++) {
+        streams[i] = getReader(dir.openInput(segmentsToMerge.get(i), IOContext.READONCE));
         BytesRefBuilder bytes = new BytesRefBuilder();
         boolean result = streams[i].read(bytes);
         assert result;
@@ -354,9 +372,9 @@ public class OfflineSorter {
       IOUtils.close(streams);
     }
 
-    IOUtils.deleteFiles(trackingDir, segments);
+    IOUtils.deleteFiles(trackingDir, segmentsToMerge);
 
-    segments.clear();
+    segmentsToMerge.clear();
     segments.add(newSegmentName);
 
     sortInfo.tempMergeFiles++;
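A side note on why clearing segmentsToMerge also removes the merged-away names from segments: List.subList returns a view backed by the parent list, so mutating the view mutates the parent. A small self-contained demonstration, with hypothetical segment names:

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    public class SubListDemo {
      public static void main(String[] args) {
        List<String> segments = new ArrayList<>(Arrays.asList("s0", "s1", "s2", "s3", "s4"));
        int maxTempFiles = 3;

        // Select only the most recent maxTempFiles partitions, as above:
        List<String> segmentsToMerge =
            segments.subList(segments.size() - maxTempFiles, segments.size());
        System.out.println(segmentsToMerge);  // [s2, s3, s4]

        segmentsToMerge.clear();              // also removes s2..s4 from segments
        segments.add("merged0");
        System.out.println(segments);         // [s0, s1, merged0]
      }
    }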
@@ -212,11 +212,11 @@ public class BKDWriter implements Closeable {
     }
   }
 
-  /** If the current segment has too many points then we switchover to temp files / offline sort. */
-  private void switchToOffline() throws IOException {
+  /** If the current segment has too many points then we spill over to temp files / offline sort. */
+  private void spillToOffline() throws IOException {
 
     // For each .add we just append to this input file, then in .finish we sort this input and recursively build the tree:
-    offlinePointWriter = new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength, longOrds, "switch");
+    offlinePointWriter = new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength, longOrds, "spill");
     tempInput = offlinePointWriter.out;
     PointReader reader = heapPointWriter.getReader(0);
     for(int i=0;i<pointCount;i++) {
@@ -235,7 +235,7 @@ public class BKDWriter implements Closeable {
 
     if (pointCount >= maxPointsSortInHeap) {
       if (offlinePointWriter == null) {
-        switchToOffline();
+        spillToOffline();
       }
       offlinePointWriter.append(packedValue, pointCount, docID);
     } else {
@@ -733,7 +733,7 @@ public class BKDWriter implements Closeable {
     // TODO: this is sort of a sneaky way to get the final OfflinePointWriter from OfflineSorter:
     IndexOutput[] lastWriter = new IndexOutput[1];
 
-    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, cmp, OfflineSorter.BufferSize.megabytes(Math.max(1, (long) maxMBSortInHeap)), OfflineSorter.MAX_TEMPFILES) {
+    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, OfflineSorter.BufferSize.megabytes(Math.max(1, (long) maxMBSortInHeap)), OfflineSorter.MAX_TEMPFILES) {
 
      /** We write/read a fixed-byte-width file that {@link OfflinePointReader} can read. */
      @Override
@@ -742,9 +742,7 @@ public class BKDWriter implements Closeable {
         return new ByteSequencesWriter(out) {
           @Override
           public void write(byte[] bytes, int off, int len) throws IOException {
-            if (len != bytesPerDoc) {
-              throw new IllegalArgumentException("len=" + len + " bytesPerDoc=" + bytesPerDoc);
-            }
+            assert len == bytesPerDoc: "len=" + len + " bytesPerDoc=" + bytesPerDoc;
             out.writeBytes(bytes, off, len);
           }
         };
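Replacing the IllegalArgumentException with an assert removes the per-record length check from production indexing: Java asserts compile in but only execute when the JVM runs with -ea, which Lucene's test infrastructure enables. A tiny illustration of the assert-with-message form, with hypothetical values:

    public class AssertDemo {
      public static void main(String[] args) {
        int len = 8;
        int bytesPerDoc = 8;
        // Fails with "AssertionError: len=... bytesPerDoc=..." only under: java -ea AssertDemo
        assert len == bytesPerDoc : "len=" + len + " bytesPerDoc=" + bytesPerDoc;
        System.out.println("length check passed");
      }
    }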
@@ -49,7 +49,6 @@ import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
 public class Test2BPoints extends LuceneTestCase {
   public void test1D() throws Exception {
     Directory dir = FSDirectory.open(createTempDir("2BPoints1D"));
-    System.out.println("DIR: " + ((FSDirectory) dir).getDirectory());
 
     IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()))
       .setCodec(getCodec())
@@ -144,24 +143,6 @@ public class Test2BPoints extends LuceneTestCase {
   }
 
   private static Codec getCodec() {
-    return new FilterCodec("Lucene60", Codec.forName("Lucene60")) {
-      @Override
-      public PointsFormat pointsFormat() {
-        return new PointsFormat() {
-          @Override
-          public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
-            int maxPointsInLeafNode = 1024;
-            double maxMBSortInHeap = 256.0;
-            return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
-          }
-
-          @Override
-          public PointsReader fieldsReader(SegmentReadState readState) throws IOException {
-            return new Lucene60PointsReader(readState);
-          }
-        };
-      }
-    };
+    return Codec.forName("Lucene60");
   }
 }
@@ -82,7 +82,7 @@ public class TestOfflineSorter extends LuceneTestCase {
     try (Directory dir = newDirectory()) {
       SortInfo sortInfo = checkSort(dir, new OfflineSorter(dir, "foo", OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(1), OfflineSorter.MAX_TEMPFILES),
                                     generateRandom((int)OfflineSorter.MB * 20));
-      assertEquals(1, sortInfo.mergeRounds);
+      assertEquals(3, sortInfo.mergeRounds);
     }
   }
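Worked arithmetic for the updated expectation, assuming the 1 MB buffer turns the ~20 MB of random input into about 20 level-0 partitions:

    public class MergeRoundsArithmetic {
      public static void main(String[] args) {
        int partitions = 20;    // ~20 MB input / ~1 MB sort buffer
        int maxTempFiles = 10;  // OfflineSorter.MAX_TEMPFILES after this commit
        // Each full batch of 10 level-0 partitions cascade-merges into one level-1 partition:
        int levelZeroMerges = partitions / maxTempFiles;  // 2
        // The final forceMerge(1) combines the two level-1 partitions:
        int finalMerges = 1;
        System.out.println((levelZeroMerges + finalMerges) + " merge rounds");  // 3 merge rounds
      }
    }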
@@ -55,7 +55,7 @@ public class Test2BBKDPoints extends LuceneTestCase {
 
     final int numDocs = (Integer.MAX_VALUE / 26) + 100;
 
-    BKDWriter w = new BKDWriter(numDocs, dir, "_0", 1, Long.BYTES, 1024, 256, 26L * numDocs);
+    BKDWriter w = new BKDWriter(numDocs, dir, "_0", 1, Long.BYTES, 26L * numDocs);
     int counter = 0;
     byte[] packedBytes = new byte[Long.BYTES];
     for (int docID = 0; docID < numDocs; docID++) {
@@ -88,7 +88,7 @@ public class Test2BBKDPoints extends LuceneTestCase {
 
     final int numDocs = (Integer.MAX_VALUE / 26) + 100;
 
-    BKDWriter w = new BKDWriter(numDocs, dir, "_0", 2, Long.BYTES, 1024, 256, 26L * numDocs);
+    BKDWriter w = new BKDWriter(numDocs, dir, "_0", 2, Long.BYTES, 26L * numDocs);
     int counter = 0;
     byte[] packedBytes = new byte[2*Long.BYTES];
     for (int docID = 0; docID < numDocs; docID++) {