randomize how BKDWriter splits in RandomCodec so we exercise geo shape APIs with more exotic rectangles

2016-03-25 15:40:16 -04:00 · 2016-03-25 15:40:16 -04:00 · f785d2a034
parent 9cbaa028dc
commit f785d2a034
3 changed files with 70 additions and 5 deletions
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java
@ -42,8 +42,8 @@ import org.apache.lucene.util.bkd.BKDWriter;
 /** Writes dimensional values */
 public class Lucene60PointsWriter extends PointsWriter implements Closeable {
  
-  final IndexOutput dataOut;
-  final Map<String,Long> indexFPs = new HashMap<>();
+  protected final IndexOutput dataOut;
+  protected final Map<String,Long> indexFPs = new HashMap<>();
  final SegmentWriteState writeState;
  final int maxPointsInLeafNode;
  final double maxMBSortInHeap;
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@ -1033,8 +1033,7 @@ public class BKDWriter implements Closeable {
    return true;
  }

-  // TODO: make this protected when we want to subclass to play with different splitting criteria
-  private int split(byte[] minPackedValue, byte[] maxPackedValue) {
+  protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
    // Find which dim has the largest span so we can split on it:
    int splitDim = -1;
    for(int dim=0;dim<numDims;dim++) {
--- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
@ -52,8 +52,11 @@ import org.apache.lucene.codecs.memory.MemoryPostingsFormat;
 import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
 import org.apache.lucene.codecs.simpletext.SimpleTextDocValuesFormat;
 import org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat;
+import org.apache.lucene.index.PointValues.IntersectVisitor;
+import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.bkd.BKDWriter;

 /**
 * Codec that assigns per-field random postings formats.
@ -93,13 +96,55 @@ public class RandomCodec extends AssertingCodec {
  // TODO: improve how we randomize this...
  private final int maxPointsInLeafNode;
  private final double maxMBSortInHeap;
+  private final int bkdSplitRandomSeed;

  @Override
  public PointsFormat pointsFormat() {
    return new AssertingPointsFormat(new PointsFormat() {
      @Override
      public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
-        return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
+
+        // Randomize how BKDWriter chooses its splis:
+
+        return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap) {
+          @Override
+          public void writeField(FieldInfo fieldInfo, PointsReader values) throws IOException {
+
+            boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
+
+            try (BKDWriter writer = new RandomlySplittingBKDWriter(writeState.segmentInfo.maxDoc(),
+                                                                   writeState.directory,
+                                                                   writeState.segmentInfo.name,
+                                                                   fieldInfo.getPointDimensionCount(),
+                                                                   fieldInfo.getPointNumBytes(),
+                                                                   maxPointsInLeafNode,
+                                                                   maxMBSortInHeap,
+                                                                   values.size(fieldInfo.name),
+                                                                   singleValuePerDoc,
+                                                                   bkdSplitRandomSeed ^ fieldInfo.name.hashCode())) {
+                values.intersect(fieldInfo.name, new IntersectVisitor() {
+                    @Override
+                    public void visit(int docID) {
+                      throw new IllegalStateException();
+                    }
+
+                    public void visit(int docID, byte[] packedValue) throws IOException {
+                      writer.add(packedValue, docID);
+                    }
+
+                    @Override
+                    public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+                      return PointValues.Relation.CELL_CROSSES_QUERY;
+                    }
+                  });
+
+                // We could have 0 points on merge since all docs with dimensional fields may be deleted:
+                if (writer.getPointCount() > 0) {
+                  indexFPs.put(fieldInfo.name, writer.finish(dataOut));
+                }
+              }
+          }
+        };
      }

      @Override
@ -152,6 +197,7 @@ public class RandomCodec extends AssertingCodec {

    maxPointsInLeafNode = TestUtil.nextInt(random, 16, 2048);
    maxMBSortInHeap = 4.0 + (3*random.nextDouble());
+    bkdSplitRandomSeed = random.nextInt();

    add(avoidCodecs,
        TestUtil.getDefaultPostingsFormat(minItemsPerBlock, maxItemsPerBlock),
@ -221,4 +267,24 @@ public class RandomCodec extends AssertingCodec {
           ", maxPointsInLeafNode=" + maxPointsInLeafNode +
           ", maxMBSortInHeap=" + maxMBSortInHeap;
  }
+
+  /** Just like {@link BKDWriter} except it evilly picks random ways to split cells on
+   *  recursion to try to provoke geo APIs that get upset at fun rectangles. */
+  private static class RandomlySplittingBKDWriter extends BKDWriter {
+
+    final Random random;
+
+    public RandomlySplittingBKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims,
+                                      int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap,
+                                      long totalPointCount, boolean singleValuePerDoc, int randomSeed) throws IOException {
+      super(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount, singleValuePerDoc);
+      this.random = new Random(randomSeed);
+    }
+
+    @Override
+    protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
+      // BKD normally defaults by the widest dimension, to try to make as squarish cells as possible, but we just pick a random one ;)
+      return random.nextInt(numDims);
+    }
+  }
 }