LUCENE-5312: Add BlockJoinSorter, a block-join-friendly index sorter.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1538750 13f79535-47bb-0310-9956-ffa450edef68
Adrien Grand 2013-11-04 20:17:38 +00:00
parent 7b9e044b06
commit 04fed2814b
4 changed files with 276 additions and 0 deletions

lucene/CHANGES.txt

@@ -142,6 +142,10 @@ New Features
* LUCENE-5323: Add .sizeInBytes method to all suggesters (Lookup).
  (Areek Zillur via Mike McCandless)

* LUCENE-5312: Add BlockJoinSorter, a new Sorter implementation that makes sure
  to never split up blocks of documents indexed with IndexWriter.addDocuments.
  (Adrien Grand)

Bug Fixes

* LUCENE-4998: Fixed a few places to pass IOContext.READONCE instead

org/apache/lucene/index/sorter/BlockJoinSorter.java

@@ -0,0 +1,88 @@
package org.apache.lucene.index.sorter;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.FixedBitSet;

/**
 * Helper class to sort readers that contain blocks of documents.
 */
public abstract class BlockJoinSorter extends Sorter {

  protected final Filter parentsFilter;

  /** Sole constructor. */
  public BlockJoinSorter(Filter parentsFilter) {
    this.parentsFilter = parentsFilter;
  }

  /** Return a {@link Sorter.DocComparator} instance that will be called on
   *  parent doc IDs. */
  protected abstract DocComparator getParentComparator(AtomicReader reader);

  /** Return a {@link Sorter.DocComparator} instance that will be called on
   *  children of the same parent. By default, children of the same parent are
   *  not reordered. */
  protected DocComparator getChildComparator(AtomicReader reader) {
    return INDEX_ORDER_COMPARATOR;
  }

  @Override
  public final DocMap sort(AtomicReader reader) throws IOException {
    final DocIdSet parents = parentsFilter.getDocIdSet(reader.getContext(), null);
    if (parents == null) {
      throw new IllegalStateException("AtomicReader " + reader + " contains no parents!");
    }
    if (!(parents instanceof FixedBitSet)) {
      throw new IllegalStateException("parentFilter must return FixedBitSet; got " + parents);
    }
    final FixedBitSet parentBits = (FixedBitSet) parents;
    final DocComparator parentComparator = getParentComparator(reader);
    final DocComparator childComparator = getChildComparator(reader);
    final DocComparator comparator = new DocComparator() {

      @Override
      public int compare(int docID1, int docID2) {
        final int parent1 = parentBits.nextSetBit(docID1);
        final int parent2 = parentBits.nextSetBit(docID2);
        if (parent1 == parent2) { // both are in the same block
          if (docID1 == parent1 || docID2 == parent2) {
            // keep parents at the end of blocks
            return docID1 - docID2;
          } else {
            return childComparator.compare(docID1, docID2);
          }
        } else {
          int cmp = parentComparator.compare(parent1, parent2);
          if (cmp == 0) {
            cmp = parent1 - parent2;
          }
          return cmp;
        }
      }

    };
    return sort(reader.maxDoc(), comparator);
  }

}
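For illustration only, not part of this commit: a minimal sketch of a concrete BlockJoinSorter subclass that orders whole blocks by a numeric doc values field stored on the parent documents, following the comparator pattern used in TestBlockJoinSorter below. The class name NumericParentBlockJoinSorter, the getID value, and the field handling are assumptions made for the example.

import java.io.IOException;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.sorter.BlockJoinSorter;
import org.apache.lucene.search.Filter;

/** Example only: orders parent blocks by a numeric doc values field on the parents. */
public class NumericParentBlockJoinSorter extends BlockJoinSorter {

  private final String parentField;

  public NumericParentBlockJoinSorter(Filter parentsFilter, String parentField) {
    super(parentsFilter);
    this.parentField = parentField;
  }

  @Override
  public String getID() {
    return "NumericParentBlockJoin(" + parentField + ")";
  }

  @Override
  protected DocComparator getParentComparator(AtomicReader reader) {
    final NumericDocValues values;
    try {
      // assumes every segment has the field; a real implementation would handle null
      values = reader.getNumericDocValues(parentField);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    return new DocComparator() {
      @Override
      public int compare(int docID1, int docID2) {
        // only ever called on parent doc IDs, so this orders whole blocks
        return Long.compare(values.get(docID1), values.get(docID2));
      }
    };
  }

}

Such a sorter could then be wrapped in a SortingMergePolicy from the same package so that merged segments come out block-sorted without any block ever being split.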

org/apache/lucene/index/sorter/Sorter.java

@@ -36,6 +36,14 @@ import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;
*/
public abstract class Sorter {
  /** A comparator that keeps documents in index order. */
  public static final DocComparator INDEX_ORDER_COMPARATOR = new DocComparator() {
    @Override
    public int compare(int docID1, int docID2) {
      return docID1 - docID2;
    }
  };

  /**
   * A permutation of doc IDs. For every document ID between <tt>0</tt> and
   * {@link IndexReader#maxDoc()}, <code>oldToNew(newToOld(docID))</code> must

org/apache/lucene/index/sorter/TestBlockJoinSorter.java

@@ -0,0 +1,176 @@
package org.apache.lucene.index.sorter;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.CachingWrapperFilter;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;

public class TestBlockJoinSorter extends LuceneTestCase {

  private static class FixedBitSetCachingWrapperFilter extends CachingWrapperFilter {

    public FixedBitSetCachingWrapperFilter(Filter filter) {
      super(filter);
    }

    @Override
    protected DocIdSet cacheImpl(DocIdSetIterator iterator, AtomicReader reader)
        throws IOException {
      final FixedBitSet cached = new FixedBitSet(reader.maxDoc());
      cached.or(iterator);
      return cached;
    }

  }
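  // The wrapper above caches the parents filter into a FixedBitSet because
  // BlockJoinSorter.sort rejects any DocIdSet that is not a FixedBitSet
  // (see the instanceof check in BlockJoinSorter).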
  public void test() throws IOException {
    final int numParents = atLeast(200);
    IndexWriterConfig cfg = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
    cfg.setMergePolicy(newLogMergePolicy());
    final RandomIndexWriter writer = new RandomIndexWriter(random(), newDirectory(), cfg);
    final Document parentDoc = new Document();
    final NumericDocValuesField parentVal = new NumericDocValuesField("parent_val", 0L);
    parentDoc.add(parentVal);
    final StringField parent = new StringField("parent", "true", Store.YES);
    parentDoc.add(parent);
    for (int i = 0; i < numParents; ++i) {
      List<Document> documents = new ArrayList<Document>();
      final int numChildren = random().nextInt(10);
      for (int j = 0; j < numChildren; ++j) {
        final Document childDoc = new Document();
        childDoc.add(new NumericDocValuesField("child_val", random().nextInt(5)));
        documents.add(childDoc);
      }
      parentVal.setLongValue(random().nextInt(50));
      documents.add(parentDoc);
      writer.addDocuments(documents);
    }
    writer.forceMerge(1);
    final DirectoryReader indexReader = writer.getReader();
    writer.close();

    final AtomicReader reader = getOnlySegmentReader(indexReader);
    final Filter parentsFilter = new FixedBitSetCachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("parent", "true"))));
    final FixedBitSet parentBits = (FixedBitSet) parentsFilter.getDocIdSet(reader.getContext(), null);
    final NumericDocValues parentValues = reader.getNumericDocValues("parent_val");
    final Sorter.DocComparator parentComparator = new Sorter.DocComparator() {
      @Override
      public int compare(int docID1, int docID2) {
        assertTrue(parentBits.get(docID1));
        assertTrue(parentBits.get(docID2));
        return Long.compare(parentValues.get(docID1), parentValues.get(docID2));
      }
    };
    final NumericDocValues childValues = reader.getNumericDocValues("child_val");
    final Sorter.DocComparator childComparator = new Sorter.DocComparator() {
      @Override
      public int compare(int docID1, int docID2) {
        assertFalse(parentBits.get(docID1));
        assertFalse(parentBits.get(docID2));
        return Long.compare(childValues.get(docID1), childValues.get(docID2));
      }
    };

    final Sorter sorter = new BlockJoinSorter(parentsFilter) {

      @Override
      public String getID() {
        return "Dummy";
      }

      @Override
      protected DocComparator getParentComparator(AtomicReader r) {
        assertEquals(reader, r);
        return parentComparator;
      }

      @Override
      protected DocComparator getChildComparator(AtomicReader r) {
        assertEquals(reader, r);
        return childComparator;
      }

    };
    final Sorter.DocMap docMap = sorter.sort(reader);
    assertEquals(reader.maxDoc(), docMap.size());

    int[] children = new int[1];
    int numChildren = 0;
    int previousParent = -1;
    for (int i = 0; i < docMap.size(); ++i) {
      final int oldID = docMap.newToOld(i);
      if (parentBits.get(oldID)) {
        // check that we have the right children
        for (int j = 0; j < numChildren; ++j) {
          assertEquals(oldID, parentBits.nextSetBit(children[j]));
        }
        // check that children are sorted
        for (int j = 1; j < numChildren; ++j) {
          final int doc1 = children[j-1];
          final int doc2 = children[j];
          if (childValues.get(doc1) == childValues.get(doc2)) {
            assertTrue(doc1 < doc2); // sort is stable
          } else {
            assertTrue(childValues.get(doc1) < childValues.get(doc2));
          }
        }
        // check that parents are sorted
        if (previousParent != -1) {
          if (parentValues.get(previousParent) == parentValues.get(oldID)) {
            assertTrue(previousParent < oldID);
          } else {
            assertTrue(parentValues.get(previousParent) < parentValues.get(oldID));
          }
        }
        // reset
        previousParent = oldID;
        numChildren = 0;
      } else {
        children = ArrayUtil.grow(children, numChildren+1);
        children[numChildren++] = oldID;
      }
    }
    indexReader.close();
    writer.w.getDirectory().close();
  }

}
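Not part of the commit, but a rough end-to-end sketch of how the pieces fit together at indexing time: children are added before their parent in a single IndexWriter.addDocuments call, and a SortingMergePolicy wrapping a BlockJoinSorter keeps merged segments ordered by parent value without ever splitting a block. Assumptions: NumericParentBlockJoinSorter is the hypothetical subclass sketched earlier, the field names mirror the test, and SortingMergePolicy is assumed to take a (MergePolicy, Sorter) constructor as in the sorter package at the time of this commit.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.sorter.Sorter;
import org.apache.lucene.index.sorter.SortingMergePolicy;
import org.apache.lucene.search.CachingWrapperFilter;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.Version;

public class BlockJoinSorterUsage {

  public static void main(String[] args) throws IOException {
    Directory dir = new RAMDirectory();
    IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_CURRENT,
        new StandardAnalyzer(Version.LUCENE_CURRENT));

    // BlockJoinSorter.sort requires the parents filter to produce FixedBitSet
    // instances, so cache it into one (same trick as the test's inner filter class).
    Filter parentsFilter = new CachingWrapperFilter(
        new QueryWrapperFilter(new TermQuery(new Term("parent", "true")))) {
      @Override
      protected DocIdSet cacheImpl(DocIdSetIterator iterator, AtomicReader reader) throws IOException {
        final FixedBitSet cached = new FixedBitSet(reader.maxDoc());
        cached.or(iterator);
        return cached;
      }
    };

    // NumericParentBlockJoinSorter is the hypothetical subclass sketched earlier.
    Sorter sorter = new NumericParentBlockJoinSorter(parentsFilter, "parent_val");
    cfg.setMergePolicy(new SortingMergePolicy(cfg.getMergePolicy(), sorter));

    IndexWriter writer = new IndexWriter(dir, cfg);
    List<Document> block = new ArrayList<Document>();
    for (int i = 0; i < 3; ++i) {
      Document child = new Document();
      child.add(new NumericDocValuesField("child_val", i));
      block.add(child);
    }
    Document parent = new Document();
    parent.add(new StringField("parent", "true", Store.YES));
    parent.add(new NumericDocValuesField("parent_val", 42L));
    block.add(parent); // the parent document comes last in its block
    writer.addDocuments(block); // indexed as one block; the sorter never splits it
    writer.close();
    dir.close();
  }

}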