LUCENE-6066: new DiversifiedTopDocsCollector in misc and PriorityQueue.remove method

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1659195 13f79535-47bb-0310-9956-ffa450edef68
Mark Harwood 2015-02-12 10:10:16 +00:00
parent 17bfed1212
commit ef3a99b385
5 changed files with 847 additions and 27 deletions

lucene/CHANGES.txt

@@ -33,6 +33,10 @@ API Changes
New Features
* LUCENE-6066: Added DiversifiedTopDocsCollector to misc for collecting no more
than a given number of results per value of a chosen key. Introduces a new
remove method on core's PriorityQueue. (Mark Harwood)
* LUCENE-3922: Added JapaneseNumberFilter that normalizes Japanese numbers
in kansuji form to regular/Arabic numbers. (Gaute Lambertsen, Christian Moen)

lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java

@@ -17,17 +17,19 @@ package org.apache.lucene.util;
* limitations under the License.
*/
/**
* A PriorityQueue maintains a partial ordering of its elements such that the
* least element can always be found in constant time. Put()'s and pop()'s
* require log(size) time but the remove() cost implemented here is linear.
*
* <p>
* <b>NOTE</b>: This class will pre-allocate a full array of length
* <code>maxSize+1</code> if instantiated via the
* {@link #PriorityQueue(int,boolean)} constructor with <code>prepopulate</code>
* set to <code>true</code>.
*
* @lucene.internal
*/
public abstract class PriorityQueue<T> {
private int size = 0;
private final int maxSize;
@@ -130,7 +132,7 @@ public abstract class PriorityQueue<T> {
public final T add(T element) {
size++;
heap[size] = element;
upHeap(size);
return heap[1];
}
@@ -174,7 +176,7 @@ public abstract class PriorityQueue<T> {
heap[1] = heap[size]; // move last to first
heap[size] = null; // permit GC of objects
size--;
downHeap(1); // adjust heap
return result;
} else {
return null;
@@ -201,7 +203,7 @@ public abstract class PriorityQueue<T> {
* @return the new 'top' element.
*/
public final T updateTop() {
downHeap(1);
return heap[1];
}
@@ -226,8 +228,31 @@ public abstract class PriorityQueue<T> {
size = 0;
}
/**
* Removes an existing element currently stored in the PriorityQueue. Cost is
* linear with the size of the queue. (A specialization of PriorityQueue which
* tracks element positions would provide a constant remove time but the
* trade-off would be extra cost to all additions/insertions)
*/
public final boolean remove(T element) {
for (int i = 1; i <= size; i++) {
if (heap[i] == element) {
heap[i] = heap[size];
heap[size] = null; // permit GC of objects
size--;
if (i <= size) {
if (!upHeap(i)) {
downHeap(i);
}
}
return true;
}
}
return false;
}
private final boolean upHeap(int origPos) {
int i = origPos;
T node = heap[i]; // save bottom node
int j = i >>> 1;
while (j > 0 && lessThan(node, heap[j])) {
@@ -236,10 +261,10 @@ public abstract class PriorityQueue<T> {
j = j >>> 1;
}
heap[i] = node; // install saved node
return i != origPos;
}
private final void downHeap(int i) {
T node = heap[i]; // save top node
int j = i << 1; // find smaller child
int k = j + 1;
@ -257,7 +282,7 @@ public abstract class PriorityQueue<T> {
}
heap[i] = node; // install saved node
}
/** This method returns the internal heap array as Object[].
* @lucene.internal
*/
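A minimal usage sketch of the new remove method, assuming a hypothetical MinIntQueue subclass; note that remove matches elements by reference (==), not equals():

import org.apache.lucene.util.PriorityQueue;

/** Hypothetical minimal queue ordering Integers ascending. */
class MinIntQueue extends PriorityQueue<Integer> {
  MinIntQueue(int maxSize) {
    super(maxSize);
  }

  @Override
  protected boolean lessThan(Integer a, Integer b) {
    return a < b;
  }

  public static void main(String[] args) {
    MinIntQueue pq = new MinIntQueue(10);
    Integer three = 3, seven = 7, five = 5;
    pq.add(seven);
    pq.add(three);
    pq.add(five);
    // Linear scan for the same reference, then a single up- or
    // down-heap move repairs ordering around the hole.
    pq.remove(seven);
    System.out.println(pq.pop()); // 3 - least element still surfaces first
    System.out.println(pq.pop()); // 5
  }
}

Because lookup is by reference, callers must retain the exact objects they added; the DiversifiedTopDocsCollector later in this commit does exactly that, handing the same ScoreDocKey instances to its per-key and global queues.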

lucene/core/src/test/org/apache/lucene/util/TestPriorityQueue.java

@@ -17,21 +17,41 @@ package org.apache.lucene.util;
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.Random;
public class TestPriorityQueue extends LuceneTestCase {
private static class IntegerQueue extends PriorityQueue<Integer> {
public IntegerQueue(int count) {
super(count);
}
@Override
protected boolean lessThan(Integer a, Integer b) {
if (a.equals(b)) {
assert (a != b);
int hashA = System.identityHashCode(a);
int hashB = System.identityHashCode(b);
assert (hashA != hashB);
return hashA < hashB;
}
return (a < b);
}
protected final void checkValidity() {
Object[] heapArray = getHeapArray();
for (int i = 1; i <= size(); i++) {
int parent = i >>> 1;
if (parent > 0) {
assertTrue(lessThan((Integer) heapArray[parent],
(Integer) heapArray[i]));
}
}
}
}
public void testPQ() throws Exception {
testPQ(atLeast(10000), random());
}
@@ -111,5 +131,61 @@ public class TestPriorityQueue extends LuceneTestCase {
assertEquals(size, pq.size());
assertEquals((Integer) 2, pq.top());
}
public void testRemovalsAndInsertions() {
Random random = random();
int numDocsInPQ = TestUtil.nextInt(random, 1, 100);
IntegerQueue pq = new IntegerQueue(numDocsInPQ);
Integer lastLeast = null;
// Basic insertion of new content
ArrayList<Integer> sds = new ArrayList<Integer>(numDocsInPQ);
for (int i = 0; i < numDocsInPQ * 10; i++) {
Integer newEntry = new Integer(Math.abs(random.nextInt()));
sds.add(newEntry);
Integer evicted = pq.insertWithOverflow(newEntry);
pq.checkValidity();
if (evicted != null) {
assertTrue(sds.remove(evicted));
if (evicted != newEntry) {
assertTrue(evicted == lastLeast);
}
}
Integer newLeast = pq.top();
if ((lastLeast != null) && (newLeast != newEntry)
&& (newLeast != lastLeast)) {
// If there has been a change of least entry and it wasn't our new
// addition we expect the scores to increase
assertTrue(newLeast <= newEntry);
assertTrue(newLeast >= lastLeast);
}
lastLeast = newLeast;
}
// Try many random additions to existing entries - we should always see
// increasing scores in the lowest entry in the PQ
for (int p = 0; p < 500000; p++) {
int element = (int) (random.nextFloat() * (sds.size() - 1));
Integer objectToRemove = sds.get(element);
assertTrue(sds.remove(element) == objectToRemove);
assertTrue(pq.remove(objectToRemove));
pq.checkValidity();
Integer newEntry = new Integer(Math.abs(random.nextInt()));
sds.add(newEntry);
assertNull(pq.insertWithOverflow(newEntry));
pq.checkValidity();
Integer newLeast = pq.top();
if ((objectToRemove != lastLeast) && (lastLeast != null)
&& (newLeast != newEntry)) {
// If there has been a change of least entry and it wasn't our new
// addition or the loss of our randomly removed entry we expect the
// scores to increase
assertTrue(newLeast <= newEntry);
assertTrue(newLeast >= lastLeast);
}
lastLeast = newLeast;
}
}
}

lucene/misc/src/java/org/apache/lucene/search/DiversifiedTopDocsCollector.java

@@ -0,0 +1,251 @@
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.DiversifiedTopDocsCollector.ScoreDocKey;
import org.apache.lucene.util.PriorityQueue;
/**
* A {@link TopDocsCollector} that controls diversity in results by ensuring no
* more than maxHitsPerKey results from a common source are collected in the
* final results.
*
* An example application might be a product search in a marketplace where no
* more than 3 results per retailer are permitted in search results.
*
* <p>
* To compare behaviour with other forms of collector, a useful analogy might be
* the problem of making a compilation album of 1967's top hit records:
* <ol>
* <li>A vanilla query's results might look like a "Best of the Beatles" album -
* high quality but not much diversity</li>
* <li>A GroupingSearch would produce the equivalent of "The 10 top-selling
* artists of 1967" - some killer and quite a lot of filler</li>
* <li>A "diversified" query would be the top 20 hit records of that year - with
* a max of 3 Beatles hits in order to maintain diversity</li>
* </ol>
* This collector improves on the "GroupingSearch" type queries by
* <ul>
* <li>Working in one pass over the data</li>
* <li>Not requiring the client to guess how many groups are required</li>
* <li>Removing low-scoring "filler" which sits at the end of each group's hits</li>
* </ul>
*
* This is an abstract class and subclasses have to provide a source of keys for
* documents which is then used to help identify duplicate sources.
*
* @lucene.experimental
*
*/
public abstract class DiversifiedTopDocsCollector extends
TopDocsCollector<ScoreDocKey> {
ScoreDocKey spare;
private ScoreDocKeyQueue globalQueue;
private int numHits;
private Map<Long, ScoreDocKeyQueue> perKeyQueues;
protected int maxNumPerKey;
private Stack<ScoreDocKeyQueue> sparePerKeyQueues = new Stack<>();
public DiversifiedTopDocsCollector(int numHits, int maxHitsPerKey) {
super(new ScoreDocKeyQueue(numHits));
// Need to access pq.lessThan() which is protected so have to cast here...
this.globalQueue = (ScoreDocKeyQueue) pq;
perKeyQueues = new HashMap<Long, ScoreDocKeyQueue>();
this.numHits = numHits;
this.maxNumPerKey = maxHitsPerKey;
}
/**
* Get a source of values used for grouping keys
*/
protected abstract NumericDocValues getKeys(LeafReaderContext context);
@Override
public boolean needsScores() {
return true;
}
@Override
protected TopDocs newTopDocs(ScoreDoc[] results, int start) {
if (results == null) {
return EMPTY_TOPDOCS;
}
// We need to compute maxScore in order to set it in TopDocs. If start == 0,
// it means the largest element is already in results, use its score as
// maxScore. Otherwise pop everything else, until the largest element is
// extracted and use its score as maxScore.
float maxScore = Float.NaN;
if (start == 0) {
maxScore = results[0].score;
} else {
for (int i = globalQueue.size(); i > 1; i--) {
globalQueue.pop();
}
maxScore = globalQueue.pop().score;
}
return new TopDocs(totalHits, results, maxScore);
}
protected ScoreDocKey insert(ScoreDocKey addition, int docBase,
NumericDocValues keys) {
if ((globalQueue.size() >= numHits)
&& (globalQueue.lessThan(addition, globalQueue.top()))) {
// Queue is full and proposed addition is not a globally
// competitive score
return addition;
}
// The addition stands a chance of being entered - check the
// key-specific restrictions.
// We delay fetching the key until we are certain the score is globally
// competitive. We need to adjust the ScoreDoc's global doc value to be
// a leaf reader value when looking up keys
addition.key = keys.get(addition.doc - docBase);
// For this to work the choice of key class needs to implement
// hashCode() and equals().
ScoreDocKeyQueue thisKeyQ = perKeyQueues.get(addition.key);
if (thisKeyQ == null) {
if (sparePerKeyQueues.size() == 0) {
thisKeyQ = new ScoreDocKeyQueue(maxNumPerKey);
} else {
thisKeyQ = sparePerKeyQueues.pop();
}
perKeyQueues.put(addition.key, thisKeyQ);
}
ScoreDocKey perKeyOverflow = thisKeyQ.insertWithOverflow(addition);
if (perKeyOverflow == addition) {
// This key group has reached capacity and our proposed addition
// was not competitive in the group - do not insert into the
// main PQ or the key will be overly-populated in final results.
return addition;
}
if (perKeyOverflow == null) {
// This proposed addition is also locally competitive within the
// key group - make a global entry and return
ScoreDocKey globalOverflow = globalQueue.insertWithOverflow(addition);
perKeyGroupRemove(globalOverflow);
return globalOverflow;
}
// For the given key, we have reached max capacity but the new addition
// is better than a prior entry that still exists in the global results
// - request the weaker-scoring entry to be removed from the global
// queue.
globalQueue.remove(perKeyOverflow);
// Add the locally-competitive addition into the global queue
globalQueue.add(addition);
return perKeyOverflow;
}
private void perKeyGroupRemove(ScoreDocKey globalOverflow) {
if (globalOverflow == null) {
return;
}
ScoreDocKeyQueue q = perKeyQueues.get(globalOverflow.key);
ScoreDocKey perKeyLowest = q.pop();
// The least globally-competitive item should also always be the least
// key-local item
assert (globalOverflow == perKeyLowest);
if (q.size() == 0) {
perKeyQueues.remove(globalOverflow.key);
sparePerKeyQueues.push(q);
}
}
@Override
public LeafCollector getLeafCollector(LeafReaderContext context)
throws IOException {
final int base = context.docBase;
final NumericDocValues keySource = getKeys(context);
return new LeafCollector() {
Scorer scorer;
@Override
public void setScorer(Scorer scorer) throws IOException {
this.scorer = scorer;
}
@Override
public void collect(int doc) throws IOException {
float score = scorer.score();
// This collector cannot handle NaN
assert !Float.isNaN(score);
totalHits++;
doc += base;
if (spare == null) {
spare = new ScoreDocKey(doc, score);
} else {
spare.doc = doc;
spare.score = score;
}
spare = insert(spare, base, keySource);
}
};
}
static class ScoreDocKeyQueue extends PriorityQueue<ScoreDocKey> {
ScoreDocKeyQueue(int size) {
super(size);
}
@Override
protected final boolean lessThan(ScoreDocKey hitA, ScoreDocKey hitB) {
if (hitA.score == hitB.score)
return hitA.doc > hitB.doc;
else
return hitA.score < hitB.score;
}
}
/**
* An extension to ScoreDoc that includes a key used for grouping purposes
*/
public static class ScoreDocKey extends ScoreDoc {
Long key;
protected ScoreDocKey(int doc, float score) {
super(doc, score);
}
public Long getKey() {
return key;
}
@Override
public String toString() {
return "key:" + key + " doc=" + doc + " s=" + score;
}
}
}
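A sketch of wiring a concrete subclass into a search; the class name NumericKeyDiversifiedCollector and the "artistId" field are hypothetical, while the test below shows the same pattern against real index data:

import java.io.IOException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.DiversifiedTopDocsCollector;

/** Hypothetical collector keying diversity on a numeric doc-values field. */
class NumericKeyDiversifiedCollector extends DiversifiedTopDocsCollector {
  private final String keyField;

  NumericKeyDiversifiedCollector(int numHits, int maxHitsPerKey, String keyField) {
    super(numHits, maxHitsPerKey);
    this.keyField = keyField;
  }

  @Override
  protected NumericDocValues getKeys(LeafReaderContext context) {
    try {
      // One long key per document; hits beyond maxHitsPerKey for the
      // same key value are pruned from the final results.
      return DocValues.getNumeric(context.reader(), keyField);
    } catch (IOException e) {
      // getKeys does not declare IOException, so wrap it here
      throw new RuntimeException(e);
    }
  }
}

// Usage: top 20 hits overall, at most 3 per distinct key value.
// NumericKeyDiversifiedCollector tdc =
//     new NumericKeyDiversifiedCollector(20, 3, "artistId");
// searcher.search(query, tdc);
// TopDocs diversified = tdc.topDocs();

The per-key queues of size maxHitsPerKey, coordinated with the global queue in insert(), are what keep any one key from exceeding its quota while the collection stays single-pass.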

lucene/misc/src/test/org/apache/lucene/search/TestDiversifiedTopDocsCollector.java

@@ -0,0 +1,464 @@
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FloatDocValuesField;
import org.apache.lucene.document.FloatField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
/**
* Demonstrates an application of the {@link DiversifiedTopDocsCollector} in
* assembling a collection of top results but without over-representation of any
* one source (in this case top-selling singles from the 60s without having them
* all be Beatles records...). Results are ranked by the number of weeks a
* single is top of the charts and de-duped by the artist name.
*
*/
public class TestDiversifiedTopDocsCollector extends LuceneTestCase {
public void testNonDiversifiedResults() throws Exception {
int numberOfTracksOnCompilation = 10;
int expectedMinNumOfBeatlesHits = 5;
TopDocs res = searcher.search(getTestQuery(), numberOfTracksOnCompilation);
assertEquals(numberOfTracksOnCompilation, res.scoreDocs.length);
// Due to randomization of segment merging in tests, the exact number of
// Beatles hits varies between 5 and 6, but either way it proves the point:
// they are over-represented in the results of a standard search.
assertTrue(getMaxNumRecordsPerArtist(res.scoreDocs) >= expectedMinNumOfBeatlesHits);
}
public void testFirstPageDiversifiedResults() throws Exception {
// Using a diversified collector we can limit the results from
// any one artist.
int requiredMaxHitsPerArtist = 2;
int numberOfTracksOnCompilation = 10;
DiversifiedTopDocsCollector tdc = doDiversifiedSearch(
numberOfTracksOnCompilation, requiredMaxHitsPerArtist);
ScoreDoc[] sd = tdc.topDocs(0).scoreDocs;
assertEquals(numberOfTracksOnCompilation, sd.length);
assertTrue(getMaxNumRecordsPerArtist(sd) <= requiredMaxHitsPerArtist);
}
public void testSecondPageResults() throws Exception {
int numberOfTracksPerCompilation = 10;
int numberOfCompilations = 2;
int requiredMaxHitsPerArtist = 1;
// Volume 2 of our hits compilation - start at position 10
DiversifiedTopDocsCollector tdc = doDiversifiedSearch(
numberOfTracksPerCompilation * numberOfCompilations,
requiredMaxHitsPerArtist);
ScoreDoc[] volume2 = tdc.topDocs(numberOfTracksPerCompilation,
numberOfTracksPerCompilation).scoreDocs;
assertEquals(numberOfTracksPerCompilation, volume2.length);
assertTrue(getMaxNumRecordsPerArtist(volume2) <= requiredMaxHitsPerArtist);
}
public void testInvalidArguments() throws Exception {
int numResults = 5;
DiversifiedTopDocsCollector tdc = doDiversifiedSearch(numResults, 15);
// start < 0
assertEquals(0, tdc.topDocs(-1).scoreDocs.length);
// start > pq.size()
assertEquals(0, tdc.topDocs(numResults + 1).scoreDocs.length);
// start == pq.size()
assertEquals(0, tdc.topDocs(numResults).scoreDocs.length);
// howMany < 0
assertEquals(0, tdc.topDocs(0, -1).scoreDocs.length);
// howMany == 0
assertEquals(0, tdc.topDocs(0, 0).scoreDocs.length);
}
// Diversifying collector that looks up de-dup keys using SortedDocValues
// from a top-level Reader
private static final class DocValuesDiversifiedCollector extends
DiversifiedTopDocsCollector {
private final SortedDocValues sdv;
public DocValuesDiversifiedCollector(int size, int maxHitsPerKey,
SortedDocValues sdv) {
super(size, maxHitsPerKey);
this.sdv = sdv;
}
@Override
protected NumericDocValues getKeys(final LeafReaderContext context) {
return new NumericDocValues() {
@Override
public long get(int docID) {
// Keys are always expressed as a long so we obtain the
// ordinal for our String-based artist name here
return sdv.getOrd(context.docBase + docID);
}
};
}
}
// Alternative, faster implementation for converting String keys to longs
// but with the potential for hash collisions
private static final class HashedDocValuesDiversifiedCollector extends
DiversifiedTopDocsCollector {
private final String field;
private BinaryDocValues vals;
public HashedDocValuesDiversifiedCollector(int size, int maxHitsPerKey,
String field) {
super(size, maxHitsPerKey);
this.field = field;
}
@Override
protected NumericDocValues getKeys(LeafReaderContext context) {
return new NumericDocValues() {
@Override
public long get(int docID) {
return vals == null ? -1 : vals.get(docID).hashCode();
}
};
}
@Override
public LeafCollector getLeafCollector(LeafReaderContext context)
throws IOException {
this.vals = DocValues.getBinary(context.reader(), field);
return super.getLeafCollector(context);
}
}
// Test data - format is year, artist, song, weeks at top of charts
private static String[] hitsOfThe60s = {
"1966\tSPENCER DAVIS GROUP\tKEEP ON RUNNING\t1",
"1966\tOVERLANDERS\tMICHELLE\t3",
"1966\tNANCY SINATRA\tTHESE BOOTS ARE MADE FOR WALKIN'\t4",
"1966\tWALKER BROTHERS\tTHE SUN AIN'T GONNA SHINE ANYMORE\t4",
"1966\tSPENCER DAVIS GROUP\tSOMEBODY HELP ME\t2",
"1966\tDUSTY SPRINGFIELD\tYOU DON'T HAVE TO SAY YOU LOVE ME\t1",
"1966\tMANFRED MANN\tPRETTY FLAMINGO\t3",
"1966\tROLLING STONES\tPAINT IT, BLACK\t1",
"1966\tFRANK SINATRA\tSTRANGERS IN THE NIGHT\t3",
"1966\tBEATLES\tPAPERBACK WRITER\t5",
"1966\tKINKS\tSUNNY AFTERNOON\t2",
"1966\tGEORGIE FAME AND THE BLUE FLAMES\tGETAWAY\t1",
"1966\tCHRIS FARLOWE\tOUT OF TIME\t1",
"1966\tTROGGS\tWITH A GIRL LIKE YOU\t2",
"1966\tBEATLES\tYELLOW SUBMARINE/ELEANOR RIGBY\t4",
"1966\tSMALL FACES\tALL OR NOTHING\t1",
"1966\tJIM REEVES\tDISTANT DRUMS\t5",
"1966\tFOUR TOPS\tREACH OUT I'LL BE THERE\t3",
"1966\tBEACH BOYS\tGOOD VIBRATIONS\t2",
"1966\tTOM JONES\tGREEN GREEN GRASS OF HOME\t4",
"1967\tMONKEES\tI'M A BELIEVER\t4",
"1967\tPETULA CLARK\tTHIS IS MY SONG\t2",
"1967\tENGELBERT HUMPERDINCK\tRELEASE ME\t4",
"1967\tNANCY SINATRA AND FRANK SINATRA\tSOMETHIN' STUPID\t2",
"1967\tSANDIE SHAW\tPUPPET ON A STRING\t3",
"1967\tTREMELOES\tSILENCE IS GOLDEN\t3",
"1967\tPROCOL HARUM\tA WHITER SHADE OF PALE\t4",
"1967\tBEATLES\tALL YOU NEED IS LOVE\t7",
"1967\tSCOTT MCKENZIE\tSAN FRANCISCO (BE SURE TO WEAR SOME FLOWERS INYOUR HAIR)\t4",
"1967\tENGELBERT HUMPERDINCK\tTHE LAST WALTZ\t5",
"1967\tBEE GEES\tMASSACHUSETTS (THE LIGHTS WENT OUT IN)\t4",
"1967\tFOUNDATIONS\tBABY NOW THAT I'VE FOUND YOU\t2",
"1967\tLONG JOHN BALDRY\tLET THE HEARTACHES BEGIN\t2",
"1967\tBEATLES\tHELLO GOODBYE\t5",
"1968\tGEORGIE FAME\tTHE BALLAD OF BONNIE AND CLYDE\t1",
"1968\tLOVE AFFAIR\tEVERLASTING LOVE\t2",
"1968\tMANFRED MANN\tMIGHTY QUINN\t2",
"1968\tESTHER AND ABI OFARIM\tCINDERELLA ROCKEFELLA\t3",
"1968\tDAVE DEE, DOZY, BEAKY, MICK AND TICH\tTHE LEGEND OF XANADU\t1",
"1968\tBEATLES\tLADY MADONNA\t2",
"1968\tCLIFF RICHARD\tCONGRATULATIONS\t2",
"1968\tLOUIS ARMSTRONG\tWHAT A WONDERFUL WORLD/CABARET\t4",
"1968\tGARRY PUCKETT AND THE UNION GAP\tYOUNG GIRL\t4",
"1968\tROLLING STONES\tJUMPING JACK FLASH\t2",
"1968\tEQUALS\tBABY COME BACK\t3", "1968\tDES O'CONNOR\tI PRETEND\t1",
"1968\tTOMMY JAMES AND THE SHONDELLS\tMONY MONY\t2",
"1968\tCRAZY WORLD OF ARTHUR BROWN\tFIRE!\t1",
"1968\tTOMMY JAMES AND THE SHONDELLS\tMONY MONY\t1",
"1968\tBEACH BOYS\tDO IT AGAIN\t1",
"1968\tBEE GEES\tI'VE GOTTA GET A MESSAGE TO YOU\t1",
"1968\tBEATLES\tHEY JUDE\t8",
"1968\tMARY HOPKIN\tTHOSE WERE THE DAYS\t6",
"1968\tJOE COCKER\tWITH A LITTLE HELP FROM MY FRIENDS\t1",
"1968\tHUGO MONTENEGRO\tTHE GOOD THE BAD AND THE UGLY\t4",
"1968\tSCAFFOLD\tLILY THE PINK\t3",
"1969\tMARMALADE\tOB-LA-DI, OB-LA-DA\t1",
"1969\tSCAFFOLD\tLILY THE PINK\t1",
"1969\tMARMALADE\tOB-LA-DI, OB-LA-DA\t2",
"1969\tFLEETWOOD MAC\tALBATROSS\t1", "1969\tMOVE\tBLACKBERRY WAY\t1",
"1969\tAMEN CORNER\t(IF PARADISE IS) HALF AS NICE\t2",
"1969\tPETER SARSTEDT\tWHERE DO YOU GO TO (MY LOVELY)\t4",
"1969\tMARVIN GAYE\tI HEARD IT THROUGH THE GRAPEVINE\t3",
"1969\tDESMOND DEKKER AND THE ACES\tTHE ISRAELITES\t1",
"1969\tBEATLES\tGET BACK\t6", "1969\tTOMMY ROE\tDIZZY\t1",
"1969\tBEATLES\tTHE BALLAD OF JOHN AND YOKO\t3",
"1969\tTHUNDERCLAP NEWMAN\tSOMETHING IN THE AIR\t3",
"1969\tROLLING STONES\tHONKY TONK WOMEN\t5",
"1969\tZAGER AND EVANS\tIN THE YEAR 2525 (EXORDIUM AND TERMINUS)\t3",
"1969\tCREEDENCE CLEARWATER REVIVAL\tBAD MOON RISING\t3",
"1969\tJANE BIRKIN AND SERGE GAINSBOURG\tJE T'AIME... MOI NON PLUS\t1",
"1969\tBOBBIE GENTRY\tI'LL NEVER FALL IN LOVE AGAIN\t1",
"1969\tARCHIES\tSUGAR, SUGAR\t4" };
private static final Map<String, Record> parsedRecords = new HashMap<String, Record>();
private Directory dir;
private IndexReader reader;
private IndexSearcher searcher;
private SortedDocValues artistDocValues;
static class Record {
String year;
String artist;
String song;
float weeks;
String id;
public Record(String id, String year, String artist, String song,
float weeks) {
super();
this.id = id;
this.year = year;
this.artist = artist;
this.song = song;
this.weeks = weeks;
}
@Override
public String toString() {
return "Record [id=" + id + ", artist=" + artist + ", weeks=" + weeks
+ ", year=" + year + ", song=" + song + "]";
}
}
private DiversifiedTopDocsCollector doDiversifiedSearch(int numResults,
int maxResultsPerArtist) throws IOException {
// Alternate between implementations used for key lookups
if (random().nextBoolean()) {
// Faster key lookup but with potential for collisions on larger datasets
return doFuzzyDiversifiedSearch(numResults, maxResultsPerArtist);
} else {
// Slower key lookup but 100% accurate
return doAccurateDiversifiedSearch(numResults, maxResultsPerArtist);
}
}
private DiversifiedTopDocsCollector doFuzzyDiversifiedSearch(int numResults,
int maxResultsPerArtist) throws IOException {
DiversifiedTopDocsCollector tdc = new HashedDocValuesDiversifiedCollector(
numResults, maxResultsPerArtist, "artist");
searcher.search(getTestQuery(), tdc);
return tdc;
}
private DiversifiedTopDocsCollector doAccurateDiversifiedSearch(
int numResults, int maxResultsPerArtist) throws IOException {
DiversifiedTopDocsCollector tdc = new DocValuesDiversifiedCollector(
numResults, maxResultsPerArtist, artistDocValues);
searcher.search(getTestQuery(), tdc);
return tdc;
}
private Query getTestQuery() {
BooleanQuery testQuery = new BooleanQuery();
testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1966")),
Occur.SHOULD));
testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1967")),
Occur.SHOULD));
testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1968")),
Occur.SHOULD));
testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1969")),
Occur.SHOULD));
return testQuery;
}
@Override
public void setUp() throws Exception {
super.setUp();
// populate an index with documents - artist, song and weeksAtNumberOne
dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
Field yearField = newTextField("year", "", Field.Store.NO);
SortedDocValuesField artistField = new SortedDocValuesField("artist",
new BytesRef(""));
Field weeksAtNumberOneField = new FloatDocValuesField("weeksAtNumberOne",
0.0F);
Field weeksStoredField = new FloatField("weeks", 0.0F, Store.YES);
Field idField = newStringField("id", "", Field.Store.YES);
Field songField = newTextField("song", "", Field.Store.NO);
Field storedArtistField = newTextField("artistName", "", Field.Store.NO);
doc.add(idField);
doc.add(weeksAtNumberOneField);
doc.add(storedArtistField);
doc.add(songField);
doc.add(weeksStoredField);
doc.add(yearField);
doc.add(artistField);
parsedRecords.clear();
for (int i = 0; i < hitsOfThe60s.length; i++) {
String cols[] = hitsOfThe60s[i].split("\t");
Record record = new Record(String.valueOf(i), cols[0], cols[1], cols[2],
Float.valueOf(cols[3]));
parsedRecords.put(record.id, record);
idField.setStringValue(record.id);
yearField.setStringValue(record.year);
storedArtistField.setStringValue(record.artist);
artistField.setBytesValue(new BytesRef(record.artist));
songField.setStringValue(record.song);
weeksStoredField.setFloatValue(record.weeks);
weeksAtNumberOneField.setFloatValue(record.weeks);
writer.addDocument(doc);
if (i % 10 == 0) {
// Causes the creation of multiple segments for our test
writer.commit();
}
}
reader = writer.getReader();
writer.close();
searcher = newSearcher(reader);
LeafReader ar = SlowCompositeReaderWrapper.wrap(reader);
artistDocValues = ar.getSortedDocValues("artist");
// All searches sort by song popularity
final Similarity base = searcher.getSimilarity();
searcher.setSimilarity(new DocValueSimilarity(base, "weeksAtNumberOne"));
}
@Override
public void tearDown() throws Exception {
reader.close();
dir.close();
dir = null;
super.tearDown();
}
private int getMaxNumRecordsPerArtist(ScoreDoc[] sd) throws IOException {
int result = 0;
HashMap<String, Integer> artistCounts = new HashMap<String, Integer>();
for (int i = 0; i < sd.length; i++) {
StoredDocument doc = reader.document(sd[i].doc);
Record record = parsedRecords.get(doc.get("id"));
Integer count = artistCounts.get(record.artist);
int newCount = 1;
if (count != null) {
newCount = count.intValue() + 1;
}
result = Math.max(result, newCount);
artistCounts.put(record.artist, newCount);
}
return result;
}
/**
* Similarity that wraps another similarity and replaces the final score
* according to what's in a docvalues field.
*
* @lucene.experimental
*/
static class DocValueSimilarity extends Similarity {
private final Similarity sim;
private final String scoreValueField;
public DocValueSimilarity(Similarity sim, String scoreValueField) {
this.sim = sim;
this.scoreValueField = scoreValueField;
}
@Override
public long computeNorm(FieldInvertState state) {
return sim.computeNorm(state);
}
@Override
public SimWeight computeWeight(float queryBoost,
CollectionStatistics collectionStats, TermStatistics... termStats) {
return sim.computeWeight(queryBoost, collectionStats, termStats);
}
@Override
public SimScorer simScorer(SimWeight stats, LeafReaderContext context)
throws IOException {
final SimScorer sub = sim.simScorer(stats, context);
final NumericDocValues values = DocValues.getNumeric(context.reader(),
scoreValueField);
return new SimScorer() {
@Override
public float score(int doc, float freq) {
return Float.intBitsToFloat((int) values.get(doc));
}
@Override
public float computeSlopFactor(int distance) {
return sub.computeSlopFactor(distance);
}
@Override
public float computePayloadFactor(int doc, int start, int end,
BytesRef payload) {
return sub.computePayloadFactor(doc, start, end, payload);
}
@Override
public Explanation explain(int doc, Explanation freq) {
return new Explanation(Float.intBitsToFloat((int) values.get(doc)),
"indexDocValue(" + scoreValueField + ")");
}
};
}
}
}