mirror of https://github.com/apache/lucene.git
LUCENE-5312: Add BlockJoinSorter, a block-join-friendly index sorter.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1538750 13f79535-47bb-0310-9956-ffa450edef68
Parent: 7b9e044b06
Commit: 04fed2814b
@@ -142,6 +142,10 @@ New Features

* LUCENE-5323: Add .sizeInBytes method to all suggesters (Lookup).
  (Areek Zillur via Mike McCandless)

* LUCENE-5312: Add BlockJoinSorter, a new Sorter implementation that makes sure
  to never split up blocks of documents indexed with IndexWriter.addDocuments.
  (Adrien Grand)

Bug Fixes

* LUCENE-4998: Fixed a few places to pass IOContext.READONCE instead
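Usage note (not part of the commit): a concrete subclass only has to supply the parent-level comparator. The sketch below is a minimal, hypothetical example that sorts parent blocks by a "parent_val" doc-values field, mirroring the test added further down; the class name, field name, and parentsFilter are illustrative assumptions, and the filter must produce a FixedBitSet of parent documents, as sort() enforces.

package org.apache.lucene.index.sorter; // same package as Sorter, purely for illustration

import java.io.IOException;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.Filter;

// Hypothetical sketch: sort blocks by the numeric value stored on each parent document.
public class ParentValueBlockSorterExample {

  public static Sorter.DocMap sortByParentValue(final AtomicReader reader, Filter parentsFilter)
      throws IOException {
    final NumericDocValues parentValues = reader.getNumericDocValues("parent_val");
    final Sorter sorter = new BlockJoinSorter(parentsFilter) {

      @Override
      public String getID() {
        return "ParentValueBlockSorter";
      }

      @Override
      protected DocComparator getParentComparator(AtomicReader r) {
        // only ever called on parent doc IDs
        return new DocComparator() {
          @Override
          public int compare(int docID1, int docID2) {
            return Long.compare(parentValues.get(docID1), parentValues.get(docID2));
          }
        };
      }
    };
    // The resulting DocMap never splits a block: children move together with their parent.
    return sorter.sort(reader);
  }
}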
@@ -0,0 +1,88 @@
package org.apache.lucene.index.sorter;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.FixedBitSet;

/**
 * Helper class to sort readers that contain blocks of documents.
 */
public abstract class BlockJoinSorter extends Sorter {

  protected final Filter parentsFilter;

  /** Sole constructor. */
  public BlockJoinSorter(Filter parentsFilter) {
    this.parentsFilter = parentsFilter;
  }

  /** Return a {@link Sorter.DocComparator} instance that will be called on
   *  parent doc IDs. */
  protected abstract DocComparator getParentComparator(AtomicReader reader);

  /** Return a {@link Sorter.DocComparator} instance that will be called on
   *  children of the same parent. By default, children of the same parent are
   *  not reordered. */
  protected DocComparator getChildComparator(AtomicReader reader) {
    return INDEX_ORDER_COMPARATOR;
  }

  @Override
  public final DocMap sort(AtomicReader reader) throws IOException {
    final DocIdSet parents = parentsFilter.getDocIdSet(reader.getContext(), null);
    if (parents == null) {
      throw new IllegalStateException("AtomicReader " + reader + " contains no parents!");
    }
    if (!(parents instanceof FixedBitSet)) {
      throw new IllegalStateException("parentFilter must return FixedBitSet; got " + parents);
    }
    final FixedBitSet parentBits = (FixedBitSet) parents;
    final DocComparator parentComparator = getParentComparator(reader);
    final DocComparator childComparator = getChildComparator(reader);
    final DocComparator comparator = new DocComparator() {

      @Override
      public int compare(int docID1, int docID2) {
        final int parent1 = parentBits.nextSetBit(docID1);
        final int parent2 = parentBits.nextSetBit(docID2);
        if (parent1 == parent2) { // both are in the same block
          if (docID1 == parent1 || docID2 == parent2) {
            // keep parents at the end of blocks
            return docID1 - docID2;
          } else {
            return childComparator.compare(docID1, docID2);
          }
        } else {
          int cmp = parentComparator.compare(parent1, parent2);
          if (cmp == 0) {
            cmp = parent1 - parent2;
          }
          return cmp;
        }
      }

    };
    return sort(reader.maxDoc(), comparator);
  }

}
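A brief note on the compare logic above (an illustration, not part of the file): under the usual block-join convention, followed by the test below, each parent is indexed as the last document of its block, so parentBits.nextSetBit(docID) resolves any document to the parent that closes its block. A tiny standalone example with assumed doc IDs:

import org.apache.lucene.util.FixedBitSet;

// Illustration only: a 6-document segment with two blocks, children first, parent last.
// Docs 0 and 1 are children of parent 2; docs 3 and 4 are children of parent 5.
public class NextSetBitIllustration {
  public static void main(String[] args) {
    final FixedBitSet parentBits = new FixedBitSet(6);
    parentBits.set(2);
    parentBits.set(5);
    System.out.println(parentBits.nextSetBit(0)); // 2: child 0 resolves to its parent
    System.out.println(parentBits.nextSetBit(1)); // 2
    System.out.println(parentBits.nextSetBit(2)); // 2: a parent resolves to itself
    System.out.println(parentBits.nextSetBit(4)); // 5
    // Two documents therefore compare through their parents unless they share one,
    // which keeps blocks contiguous and leaves each parent at the end of its block.
  }
}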
@@ -36,6 +36,14 @@ import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;
 */
public abstract class Sorter {

  /** A comparator that keeps documents in index order. */
  public static final DocComparator INDEX_ORDER_COMPARATOR = new DocComparator() {
    @Override
    public int compare(int docID1, int docID2) {
      return docID1 - docID2;
    }
  };

  /**
   * A permutation of doc IDs. For every document ID between <tt>0</tt> and
   * {@link IndexReader#maxDoc()}, <code>oldToNew(newToOld(docID))</code> must
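As the (truncated) Javadoc above states, a DocMap is a permutation of doc IDs, so oldToNew(newToOld(docID)) must map every document back to itself. A hypothetical sanity check, assuming only the DocMap methods exercised elsewhere in this commit (size(), newToOld(), oldToNew()):

package org.apache.lucene.index.sorter; // same package as Sorter, purely for illustration

// Hypothetical helper, not part of the commit: checks the documented permutation contract.
public class DocMapChecks {

  public static void checkPermutation(Sorter.DocMap docMap) {
    for (int newID = 0; newID < docMap.size(); ++newID) {
      final int oldID = docMap.newToOld(newID);
      if (docMap.oldToNew(oldID) != newID) {
        throw new AssertionError("oldToNew(newToOld(" + newID + ")) != " + newID);
      }
    }
  }
}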
@@ -0,0 +1,176 @@
package org.apache.lucene.index.sorter;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.CachingWrapperFilter;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;

public class TestBlockJoinSorter extends LuceneTestCase {

  private static class FixedBitSetCachingWrapperFilter extends CachingWrapperFilter {

    public FixedBitSetCachingWrapperFilter(Filter filter) {
      super(filter);
    }

    @Override
    protected DocIdSet cacheImpl(DocIdSetIterator iterator, AtomicReader reader)
        throws IOException {
      final FixedBitSet cached = new FixedBitSet(reader.maxDoc());
      cached.or(iterator);
      return cached;
    }

  }

  public void test() throws IOException {
    final int numParents = atLeast(200);
    IndexWriterConfig cfg = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
    cfg.setMergePolicy(newLogMergePolicy());
    final RandomIndexWriter writer = new RandomIndexWriter(random(), newDirectory(), cfg);
    final Document parentDoc = new Document();
    final NumericDocValuesField parentVal = new NumericDocValuesField("parent_val", 0L);
    parentDoc.add(parentVal);
    final StringField parent = new StringField("parent", "true", Store.YES);
    parentDoc.add(parent);
    for (int i = 0; i < numParents; ++i) {
      List<Document> documents = new ArrayList<Document>();
      final int numChildren = random().nextInt(10);
      for (int j = 0; j < numChildren; ++j) {
        final Document childDoc = new Document();
        childDoc.add(new NumericDocValuesField("child_val", random().nextInt(5)));
        documents.add(childDoc);
      }
      parentVal.setLongValue(random().nextInt(50));
      documents.add(parentDoc);
      writer.addDocuments(documents);
    }
    writer.forceMerge(1);
    final DirectoryReader indexReader = writer.getReader();
    writer.close();

    final AtomicReader reader = getOnlySegmentReader(indexReader);
    final Filter parentsFilter = new FixedBitSetCachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("parent", "true"))));
    final FixedBitSet parentBits = (FixedBitSet) parentsFilter.getDocIdSet(reader.getContext(), null);

    final NumericDocValues parentValues = reader.getNumericDocValues("parent_val");
    final Sorter.DocComparator parentComparator = new Sorter.DocComparator() {
      @Override
      public int compare(int docID1, int docID2) {
        assertTrue(parentBits.get(docID1));
        assertTrue(parentBits.get(docID2));
        return Long.compare(parentValues.get(docID1), parentValues.get(docID2));
      }
    };

    final NumericDocValues childValues = reader.getNumericDocValues("child_val");
    final Sorter.DocComparator childComparator = new Sorter.DocComparator() {
      @Override
      public int compare(int docID1, int docID2) {
        assertFalse(parentBits.get(docID1));
        assertFalse(parentBits.get(docID2));
        return Long.compare(childValues.get(docID1), childValues.get(docID2));
      }
    };

    final Sorter sorter = new BlockJoinSorter(parentsFilter) {

      @Override
      public String getID() {
        return "Dummy";
      }

      @Override
      protected DocComparator getParentComparator(AtomicReader r) {
        assertEquals(reader, r);
        return parentComparator;
      }

      @Override
      protected DocComparator getChildComparator(AtomicReader r) {
        assertEquals(reader, r);
        return childComparator;
      }

    };
    final Sorter.DocMap docMap = sorter.sort(reader);
    assertEquals(reader.maxDoc(), docMap.size());

    int[] children = new int[1];
    int numChildren = 0;
    int previousParent = -1;
    for (int i = 0; i < docMap.size(); ++i) {
      final int oldID = docMap.newToOld(i);
      if (parentBits.get(oldID)) {
        // check that we have the right children
        for (int j = 0; j < numChildren; ++j) {
          assertEquals(oldID, parentBits.nextSetBit(children[j]));
        }
        // check that children are sorted
        for (int j = 1; j < numChildren; ++j) {
          final int doc1 = children[j-1];
          final int doc2 = children[j];
          if (childValues.get(doc1) == childValues.get(doc2)) {
            assertTrue(doc1 < doc2); // sort is stable
          } else {
            assertTrue(childValues.get(doc1) < childValues.get(doc2));
          }
        }
        // check that parents are sorted
        if (previousParent != -1) {
          if (parentValues.get(previousParent) == parentValues.get(oldID)) {
            assertTrue(previousParent < oldID);
          } else {
            assertTrue(parentValues.get(previousParent) < parentValues.get(oldID));
          }
        }
        // reset
        previousParent = oldID;
        numChildren = 0;
      } else {
        children = ArrayUtil.grow(children, numChildren+1);
        children[numChildren++] = oldID;
      }
    }
    indexReader.close();
    writer.w.getDirectory().close();
  }

}