mirror of https://github.com/apache/lucene.git
LUCENE-6066: new DiversifiedTopDocsCollector in misc and PriorityQueue.remove method
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1659195 13f79535-47bb-0310-9956-ffa450edef68
parent 17bfed1212
commit ef3a99b385
@@ -33,6 +33,10 @@ API Changes

New Features

* LUCENE-6066: Added DiversifiedTopDocsCollector to misc for collecting no more
  than a given number of results under a choice of key. Introduces new remove
  method to core's PriorityQueue. (Mark Harwood)

* LUCENE-3922: Added JapaneseNumberFilter that normalizes Japanese numbers
  in kansuji form to regular/Arabic numbers. (Gaute Lambertsen, Christian Moen)
@@ -17,17 +17,19 @@ package org.apache.lucene.util;
 * limitations under the License.
 */

/** A PriorityQueue maintains a partial ordering of its elements such that the
 * least element can always be found in constant time. Put()'s and pop()'s
 * require log(size) time.
/**
 * A PriorityQueue maintains a partial ordering of its elements such that the
 * least element can always be found in constant time. Put()'s and pop()'s
 * require log(size) time but the remove() cost implemented here is linear.
 *
 * <p><b>NOTE</b>: This class will pre-allocate a full array of
 * length <code>maxSize+1</code> if instantiated via the
 * {@link #PriorityQueue(int,boolean)} constructor with
 * <code>prepopulate</code> set to <code>true</code>.
 * <p>
 * <b>NOTE</b>: This class will pre-allocate a full array of length
 * <code>maxSize+1</code> if instantiated via the
 * {@link #PriorityQueue(int,boolean)} constructor with <code>prepopulate</code>
 * set to <code>true</code>.
 *
 * @lucene.internal
 */
 */
public abstract class PriorityQueue<T> {
  private int size = 0;
  private final int maxSize;

@@ -130,7 +132,7 @@ public abstract class PriorityQueue<T> {
  public final T add(T element) {
    size++;
    heap[size] = element;
    upHeap();
    upHeap(size);
    return heap[1];
  }

@@ -174,7 +176,7 @@ public abstract class PriorityQueue<T> {
      heap[1] = heap[size];     // move last to first
      heap[size] = null;        // permit GC of objects
      size--;
      downHeap();               // adjust heap
      downHeap(1);              // adjust heap
      return result;
    } else {
      return null;

@@ -201,7 +203,7 @@ public abstract class PriorityQueue<T> {
   * @return the new 'top' element.
   */
  public final T updateTop() {
    downHeap();
    downHeap(1);
    return heap[1];
  }

@@ -226,8 +228,31 @@ public abstract class PriorityQueue<T> {
    size = 0;
  }

  private final void upHeap() {
    int i = size;
  /**
   * Removes an existing element currently stored in the PriorityQueue. Cost is
   * linear with the size of the queue. (A specialization of PriorityQueue which
   * tracks element positions would provide a constant remove time but the
   * trade-off would be extra cost to all additions/insertions)
   */
  public final boolean remove(T element) {
    for (int i = 1; i <= size; i++) {
      if (heap[i] == element) {
        heap[i] = heap[size];
        heap[size] = null; // permit GC of objects
        size--;
        if (i <= size) {
          if (!upHeap(i)) {
            downHeap(i);
          }
        }
        return true;
      }
    }
    return false;
  }

  private final boolean upHeap(int origPos) {
    int i = origPos;
    T node = heap[i];          // save bottom node
    int j = i >>> 1;
    while (j > 0 && lessThan(node, heap[j])) {

@@ -236,10 +261,10 @@ public abstract class PriorityQueue<T> {
      j = j >>> 1;
    }
    heap[i] = node;            // install saved node
    return i != origPos;
  }

  private final void downHeap() {
    int i = 1;

  private final void downHeap(int i) {
    T node = heap[i];          // save top node
    int j = i << 1;            // find smaller child
    int k = j + 1;

@@ -257,7 +282,7 @@ public abstract class PriorityQueue<T> {
    }
    heap[i] = node;            // install saved node
  }

  /** This method returns the internal heap array as Object[].
   * @lucene.internal
   */
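Usage sketch (not part of this patch): how the new remove(T) call might be exercised on a small PriorityQueue subclass. MinIntQueue is a hypothetical helper written only for this note; remove() matches elements by object identity (==), exactly as the loop above does, so the same reference that was added must be handed back.

// Sketch only - assumes the remove(T) method introduced by this patch.
import org.apache.lucene.util.PriorityQueue;

public class PriorityQueueRemoveSketch {

  // Smallest Integer sits on top, mirroring the IntegerQueue used in the tests.
  static final class MinIntQueue extends PriorityQueue<Integer> {
    MinIntQueue(int maxSize) {
      super(maxSize);
    }

    @Override
    protected boolean lessThan(Integer a, Integer b) {
      return a < b;
    }
  }

  public static void main(String[] args) {
    MinIntQueue pq = new MinIntQueue(4);
    Integer three = 3;   // keep the reference - remove() matches by identity
    pq.add(7);
    pq.add(three);
    pq.add(5);

    boolean removed = pq.remove(three);               // linear scan, then re-heapify from the hole
    System.out.println(removed + " top=" + pq.top()); // prints "true top=5"
  }
}

The linear scan keeps add() and pop() costs unchanged, which is the trade-off called out in the remove() javadoc above.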
@@ -17,21 +17,41 @@ package org.apache.lucene.util;
 * limitations under the License.
 */

import java.util.ArrayList;
import java.util.Random;

public class TestPriorityQueue extends LuceneTestCase {

  private static class IntegerQueue extends PriorityQueue<Integer> {
    public IntegerQueue(int count) {
      super(count);
    }

    @Override
    protected boolean lessThan(Integer a, Integer b) {
      return (a < b);
    }
  private static class IntegerQueue extends PriorityQueue<Integer> {
    public IntegerQueue(int count) {
      super(count);
    }

    @Override
    protected boolean lessThan(Integer a, Integer b) {
      if (a.equals(b)) {
        assert (a != b);
        int hashA = System.identityHashCode(a);
        int hashB = System.identityHashCode(b);
        assert (hashA != hashB);
        return hashA < hashB;
      }
      return (a < b);
    }

    protected final void checkValidity() {
      Object[] heapArray = getHeapArray();
      for (int i = 1; i <= size(); i++) {
        int parent = i >>> 1;
        if (parent > 1) {
          assertTrue(lessThan((Integer) heapArray[parent],
              (Integer) heapArray[i]));
        }
      }
    }

  }

  public void testPQ() throws Exception {
    testPQ(atLeast(10000), random());
  }

@@ -111,5 +131,61 @@ public class TestPriorityQueue extends LuceneTestCase {
    assertEquals(size, pq.size());
    assertEquals((Integer) 2, pq.top());
  }

  public void testRemovalsAndInsertions() {
    Random random = random();
    int numDocsInPQ = TestUtil.nextInt(random, 1, 100);
    IntegerQueue pq = new IntegerQueue(numDocsInPQ);
    Integer lastLeast = null;

    // Basic insertion of new content
    ArrayList<Integer> sds = new ArrayList<Integer>(numDocsInPQ);
    for (int i = 0; i < numDocsInPQ * 10; i++) {
      Integer newEntry = new Integer(Math.abs(random.nextInt()));
      sds.add(newEntry);
      Integer evicted = pq.insertWithOverflow(newEntry);
      pq.checkValidity();
      if (evicted != null) {
        assertTrue(sds.remove(evicted));
        if (evicted != newEntry) {
          assertTrue(evicted == lastLeast);
        }
      }
      Integer newLeast = pq.top();
      if ((lastLeast != null) && (newLeast != newEntry)
          && (newLeast != lastLeast)) {
        // If there has been a change of least entry and it wasn't our new
        // addition we expect the scores to increase
        assertTrue(newLeast <= newEntry);
        assertTrue(newLeast >= lastLeast);
      }
      lastLeast = newLeast;

    }

    // Try many random additions to existing entries - we should always see
    // increasing scores in the lowest entry in the PQ
    for (int p = 0; p < 500000; p++) {
      int element = (int) (random.nextFloat() * (sds.size() - 1));
      Integer objectToRemove = sds.get(element);
      assertTrue(sds.remove(element) == objectToRemove);
      assertTrue(pq.remove(objectToRemove));
      pq.checkValidity();
      Integer newEntry = new Integer(Math.abs(random.nextInt()));
      sds.add(newEntry);
      assertNull(pq.insertWithOverflow(newEntry));
      pq.checkValidity();
      Integer newLeast = pq.top();
      if ((objectToRemove != lastLeast) && (lastLeast != null)
          && (newLeast != newEntry)) {
        // If there has been a change of least entry and it wasn't our new
        // addition or the loss of our randomly removed entry we expect the
        // scores to increase
        assertTrue(newLeast <= newEntry);
        assertTrue(newLeast >= lastLeast);
      }
      lastLeast = newLeast;
    }
  }

}
@@ -0,0 +1,251 @@
package org.apache.lucene.search;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.DiversifiedTopDocsCollector.ScoreDocKey;
import org.apache.lucene.util.PriorityQueue;

/**
 * A {@link TopDocsCollector} that controls diversity in results by ensuring no
 * more than maxHitsPerKey results from a common source are collected in the
 * final results.
 *
 * An example application might be a product search in a marketplace where no
 * more than 3 results per retailer are permitted in search results.
 *
 * <p>
 * To compare behaviour with other forms of collector, a useful analogy might be
 * the problem of making a compilation album of 1967's top hit records:
 * <ol>
 * <li>A vanilla query's results might look like a "Best of the Beatles" album -
 * high quality but not much diversity</li>
 * <li>A GroupingSearch would produce the equivalent of "The 10 top-selling
 * artists of 1967 - some killer and quite a lot of filler"</li>
 * <li>A "diversified" query would be the top 20 hit records of that year - with
 * a max of 3 Beatles hits in order to maintain diversity</li>
 * </ol>
 * This collector improves on the "GroupingSearch" type queries by
 * <ul>
 * <li>Working in one pass over the data</li>
 * <li>Not requiring the client to guess how many groups are required</li>
 * <li>Removing low-scoring "filler" which sits at the end of each group's hits</li>
 * </ul>
 *
 * This is an abstract class and subclasses have to provide a source of keys for
 * documents which is then used to help identify duplicate sources.
 *
 * @lucene.experimental
 *
 */
public abstract class DiversifiedTopDocsCollector extends
    TopDocsCollector<ScoreDocKey> {
  ScoreDocKey spare;
  private ScoreDocKeyQueue globalQueue;
  private int numHits;
  private Map<Long, ScoreDocKeyQueue> perKeyQueues;
  protected int maxNumPerKey;
  private Stack<ScoreDocKeyQueue> sparePerKeyQueues = new Stack<>();

  public DiversifiedTopDocsCollector(int numHits, int maxHitsPerKey) {
    super(new ScoreDocKeyQueue(numHits));
    // Need to access pq.lessThan() which is protected so have to cast here...
    this.globalQueue = (ScoreDocKeyQueue) pq;
    perKeyQueues = new HashMap<Long, ScoreDocKeyQueue>();
    this.numHits = numHits;
    this.maxNumPerKey = maxHitsPerKey;
  }

  /**
   * Get a source of values used for grouping keys
   */
  protected abstract NumericDocValues getKeys(LeafReaderContext context);

  @Override
  public boolean needsScores() {
    return true;
  }

  @Override
  protected TopDocs newTopDocs(ScoreDoc[] results, int start) {
    if (results == null) {
      return EMPTY_TOPDOCS;
    }

    // We need to compute maxScore in order to set it in TopDocs. If start == 0,
    // it means the largest element is already in results, use its score as
    // maxScore. Otherwise pop everything else, until the largest element is
    // extracted and use its score as maxScore.
    float maxScore = Float.NaN;
    if (start == 0) {
      maxScore = results[0].score;
    } else {
      for (int i = globalQueue.size(); i > 1; i--) {
        globalQueue.pop();
      }
      maxScore = globalQueue.pop().score;
    }

    return new TopDocs(totalHits, results, maxScore);
  }

  protected ScoreDocKey insert(ScoreDocKey addition, int docBase,
      NumericDocValues keys) {
    if ((globalQueue.size() >= numHits)
        && (globalQueue.lessThan(addition, globalQueue.top()))) {
      // Queue is full and proposed addition is not a globally
      // competitive score
      return addition;
    }
    // The addition stands a chance of being entered - check the
    // key-specific restrictions.
    // We delay fetching the key until we are certain the score is globally
    // competitive. We need to adjust the ScoreDoc's global doc value to be
    // a leaf reader value when looking up keys
    addition.key = keys.get(addition.doc - docBase);

    // For this to work the choice of key class needs to implement
    // hashcode and equals.
    ScoreDocKeyQueue thisKeyQ = perKeyQueues.get(addition.key);

    if (thisKeyQ == null) {
      if (sparePerKeyQueues.size() == 0) {
        thisKeyQ = new ScoreDocKeyQueue(maxNumPerKey);
      } else {
        thisKeyQ = sparePerKeyQueues.pop();
      }
      perKeyQueues.put(addition.key, thisKeyQ);
    }
    ScoreDocKey perKeyOverflow = thisKeyQ.insertWithOverflow(addition);
    if (perKeyOverflow == addition) {
      // This key group has reached capacity and our proposed addition
      // was not competitive in the group - do not insert into the
      // main PQ or the key will be overly-populated in final results.
      return addition;
    }
    if (perKeyOverflow == null) {
      // This proposed addition is also locally competitive within the
      // key group - make a global entry and return
      ScoreDocKey globalOverflow = globalQueue.insertWithOverflow(addition);
      perKeyGroupRemove(globalOverflow);
      return globalOverflow;
    }
    // For the given key, we have reached max capacity but the new addition
    // is better than a prior entry that still exists in the global results
    // - request the weaker-scoring entry to be removed from the global
    // queue.
    globalQueue.remove(perKeyOverflow);
    // Add the locally-competitive addition into the global queue
    globalQueue.add(addition);
    return perKeyOverflow;
  }

  private void perKeyGroupRemove(ScoreDocKey globalOverflow) {
    if (globalOverflow == null) {
      return;
    }
    ScoreDocKeyQueue q = perKeyQueues.get(globalOverflow.key);
    ScoreDocKey perKeyLowest = q.pop();
    // The least globally-competitive item should also always be the least
    // key-local item
    assert (globalOverflow == perKeyLowest);
    if (q.size() == 0) {
      perKeyQueues.remove(globalOverflow.key);
      sparePerKeyQueues.push(q);
    }
  }

  @Override
  public LeafCollector getLeafCollector(LeafReaderContext context)
      throws IOException {
    final int base = context.docBase;
    final NumericDocValues keySource = getKeys(context);

    return new LeafCollector() {
      Scorer scorer;

      @Override
      public void setScorer(Scorer scorer) throws IOException {
        this.scorer = scorer;
      }

      @Override
      public void collect(int doc) throws IOException {
        float score = scorer.score();

        // This collector cannot handle NaN
        assert !Float.isNaN(score);

        totalHits++;

        doc += base;

        if (spare == null) {
          spare = new ScoreDocKey(doc, score);
        } else {
          spare.doc = doc;
          spare.score = score;
        }
        spare = insert(spare, base, keySource);
      }
    };
  }

  static class ScoreDocKeyQueue extends PriorityQueue<ScoreDocKey> {

    ScoreDocKeyQueue(int size) {
      super(size);
    }

    @Override
    protected final boolean lessThan(ScoreDocKey hitA, ScoreDocKey hitB) {
      if (hitA.score == hitB.score)
        return hitA.doc > hitB.doc;
      else
        return hitA.score < hitB.score;
    }
  }

  /**
   * An extension to ScoreDoc that includes a key used for grouping purposes
   */
  static public class ScoreDocKey extends ScoreDoc {
    Long key;

    protected ScoreDocKey(int doc, float score) {
      super(doc, score);
    }

    public Long getKey() {
      return key;
    }

    @Override
    public String toString() {
      return "key:" + key + " doc=" + doc + " s=" + score;
    }

  }

}
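Usage sketch (not part of this patch): one plausible way to wire the collector to a per-segment NumericDocValues key source. The "storeId" field name and the StoreKeyCollector class are assumptions made for this note only; the page size of 20 and per-key limit of 3 mirror the marketplace example in the javadoc above.

// Sketch only - assumes an index whose documents carry a numeric doc-values
// field named "storeId"; everything else uses the API introduced above.
import java.io.IOException;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.DiversifiedTopDocsCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;

public class DiversifiedSearchSketch {

  /** Keys come straight from a per-segment numeric doc-values field. */
  static final class StoreKeyCollector extends DiversifiedTopDocsCollector {
    StoreKeyCollector(int numHits, int maxHitsPerStore) {
      super(numHits, maxHitsPerStore);
    }

    @Override
    protected NumericDocValues getKeys(LeafReaderContext context) {
      try {
        // Leaf-relative values; the collector subtracts docBase before the lookup.
        return DocValues.getNumeric(context.reader(), "storeId");
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  }

  /** Top 20 hits overall, but never more than 3 from the same store. */
  static TopDocs searchDiversified(IndexSearcher searcher, Query query) throws IOException {
    StoreKeyCollector collector = new StoreKeyCollector(20, 3);
    searcher.search(query, collector);
    return collector.topDocs();
  }
}

Because getKeys(...) is invoked once per segment and insert() adjusts the global doc id by docBase, the key source should be leaf-relative, as it is here; the test below shows an alternative that resolves keys against a top-level SortedDocValues ordinal instead.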
@@ -0,0 +1,464 @@
package org.apache.lucene.search;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FloatDocValuesField;
import org.apache.lucene.document.FloatField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;

/**
 * Demonstrates an application of the {@link DiversifiedTopDocsCollector} in
 * assembling a collection of top results but without over-representation of any
 * one source (in this case top-selling singles from the 60s without having them
 * all be Beatles records...). Results are ranked by the number of weeks a
 * single is top of the charts and de-duped by the artist name.
 *
 */
public class TestDiversifiedTopDocsCollector extends LuceneTestCase {

  public void testNonDiversifiedResults() throws Exception {
    int numberOfTracksOnCompilation = 10;
    int expectedMinNumOfBeatlesHits = 5;
    TopDocs res = searcher.search(getTestQuery(), numberOfTracksOnCompilation);
    assertEquals(numberOfTracksOnCompilation, res.scoreDocs.length);
    // due to randomization of segment merging in tests the exact number of Beatles hits
    // selected varies between 5 and 6 but we prove the point they are over-represented
    // in our result set using a standard search.
    assertTrue(getMaxNumRecordsPerArtist(res.scoreDocs) >= expectedMinNumOfBeatlesHits);
  }

  public void testFirstPageDiversifiedResults() throws Exception {
    // Using a diversified collector we can limit the results from
    // any one artist.
    int requiredMaxHitsPerArtist = 2;
    int numberOfTracksOnCompilation = 10;
    DiversifiedTopDocsCollector tdc = doDiversifiedSearch(
        numberOfTracksOnCompilation, requiredMaxHitsPerArtist);
    ScoreDoc[] sd = tdc.topDocs(0).scoreDocs;
    assertEquals(numberOfTracksOnCompilation, sd.length);
    assertTrue(getMaxNumRecordsPerArtist(sd) <= requiredMaxHitsPerArtist);
  }

  public void testSecondPageResults() throws Exception {
    int numberOfTracksPerCompilation = 10;
    int numberOfCompilations = 2;
    int requiredMaxHitsPerArtist = 1;

    // Volume 2 of our hits compilation - start at position 10
    DiversifiedTopDocsCollector tdc = doDiversifiedSearch(
        numberOfTracksPerCompilation * numberOfCompilations,
        requiredMaxHitsPerArtist);
    ScoreDoc[] volume2 = tdc.topDocs(numberOfTracksPerCompilation,
        numberOfTracksPerCompilation).scoreDocs;
    assertEquals(numberOfTracksPerCompilation, volume2.length);
    assertTrue(getMaxNumRecordsPerArtist(volume2) <= requiredMaxHitsPerArtist);

  }

  public void testInvalidArguments() throws Exception {
    int numResults = 5;
    DiversifiedTopDocsCollector tdc = doDiversifiedSearch(numResults, 15);

    // start < 0
    assertEquals(0, tdc.topDocs(-1).scoreDocs.length);

    // start > pq.size()
    assertEquals(0, tdc.topDocs(numResults + 1).scoreDocs.length);

    // start == pq.size()
    assertEquals(0, tdc.topDocs(numResults).scoreDocs.length);

    // howMany < 0
    assertEquals(0, tdc.topDocs(0, -1).scoreDocs.length);

    // howMany == 0
    assertEquals(0, tdc.topDocs(0, 0).scoreDocs.length);

  }

  // Diversifying collector that looks up de-dup keys using SortedDocValues
  // from a top-level Reader
  private static final class DocValuesDiversifiedCollector extends
      DiversifiedTopDocsCollector {
    private final SortedDocValues sdv;

    public DocValuesDiversifiedCollector(int size, int maxHitsPerKey,
        SortedDocValues sdv) {
      super(size, maxHitsPerKey);
      this.sdv = sdv;
    }

    @Override
    protected NumericDocValues getKeys(final LeafReaderContext context) {

      return new NumericDocValues() {
        @Override
        public long get(int docID) {
          // Keys are always expressed as a long so we obtain the
          // ordinal for our String-based artist name here
          return sdv.getOrd(context.docBase + docID);
        }
      };
    }
  }

  // Alternative, faster implementation for converting String keys to longs
  // but with the potential for hash collisions
  private static final class HashedDocValuesDiversifiedCollector extends
      DiversifiedTopDocsCollector {

    private final String field;
    private BinaryDocValues vals;

    public HashedDocValuesDiversifiedCollector(int size, int maxHitsPerKey,
        String field) {
      super(size, maxHitsPerKey);
      this.field = field;
    }

    @Override
    protected NumericDocValues getKeys(LeafReaderContext context) {
      return new NumericDocValues() {
        @Override
        public long get(int docID) {
          return vals == null ? -1 : vals.get(docID).hashCode();
        }
      };
    }

    @Override
    public LeafCollector getLeafCollector(LeafReaderContext context)
        throws IOException {
      this.vals = DocValues.getBinary(context.reader(), field);
      return super.getLeafCollector(context);
    }
  }

  // Test data - format is year, artist, song, weeks at top of charts
  private static String[] hitsOfThe60s = {
      "1966\tSPENCER DAVIS GROUP\tKEEP ON RUNNING\t1",
      "1966\tOVERLANDERS\tMICHELLE\t3",
      "1966\tNANCY SINATRA\tTHESE BOOTS ARE MADE FOR WALKIN'\t4",
      "1966\tWALKER BROTHERS\tTHE SUN AIN'T GONNA SHINE ANYMORE\t4",
      "1966\tSPENCER DAVIS GROUP\tSOMEBODY HELP ME\t2",
      "1966\tDUSTY SPRINGFIELD\tYOU DON'T HAVE TO SAY YOU LOVE ME\t1",
      "1966\tMANFRED MANN\tPRETTY FLAMINGO\t3",
      "1966\tROLLING STONES\tPAINT IT, BLACK\t1",
      "1966\tFRANK SINATRA\tSTRANGERS IN THE NIGHT\t3",
      "1966\tBEATLES\tPAPERBACK WRITER\t5",
      "1966\tKINKS\tSUNNY AFTERNOON\t2",
      "1966\tGEORGIE FAME AND THE BLUE FLAMES\tGETAWAY\t1",
      "1966\tCHRIS FARLOWE\tOUT OF TIME\t1",
      "1966\tTROGGS\tWITH A GIRL LIKE YOU\t2",
      "1966\tBEATLES\tYELLOW SUBMARINE/ELEANOR RIGBY\t4",
      "1966\tSMALL FACES\tALL OR NOTHING\t1",
      "1966\tJIM REEVES\tDISTANT DRUMS\t5",
      "1966\tFOUR TOPS\tREACH OUT I'LL BE THERE\t3",
      "1966\tBEACH BOYS\tGOOD VIBRATIONS\t2",
      "1966\tTOM JONES\tGREEN GREEN GRASS OF HOME\t4",
      "1967\tMONKEES\tI'M A BELIEVER\t4",
      "1967\tPETULA CLARK\tTHIS IS MY SONG\t2",
      "1967\tENGELBERT HUMPERDINCK\tRELEASE ME\t4",
      "1967\tNANCY SINATRA AND FRANK SINATRA\tSOMETHIN' STUPID\t2",
      "1967\tSANDIE SHAW\tPUPPET ON A STRING\t3",
      "1967\tTREMELOES\tSILENCE IS GOLDEN\t3",
      "1967\tPROCOL HARUM\tA WHITER SHADE OF PALE\t4",
      "1967\tBEATLES\tALL YOU NEED IS LOVE\t7",
      "1967\tSCOTT MCKENZIE\tSAN FRANCISCO (BE SURE TO WEAR SOME FLOWERS INYOUR HAIR)\t4",
      "1967\tENGELBERT HUMPERDINCK\tTHE LAST WALTZ\t5",
      "1967\tBEE GEES\tMASSACHUSETTS (THE LIGHTS WENT OUT IN)\t4",
      "1967\tFOUNDATIONS\tBABY NOW THAT I'VE FOUND YOU\t2",
      "1967\tLONG JOHN BALDRY\tLET THE HEARTACHES BEGIN\t2",
      "1967\tBEATLES\tHELLO GOODBYE\t5",
      "1968\tGEORGIE FAME\tTHE BALLAD OF BONNIE AND CLYDE\t1",
      "1968\tLOVE AFFAIR\tEVERLASTING LOVE\t2",
      "1968\tMANFRED MANN\tMIGHTY QUINN\t2",
      "1968\tESTHER AND ABI OFARIM\tCINDERELLA ROCKEFELLA\t3",
      "1968\tDAVE DEE, DOZY, BEAKY, MICK AND TICH\tTHE LEGEND OF XANADU\t1",
      "1968\tBEATLES\tLADY MADONNA\t2",
      "1968\tCLIFF RICHARD\tCONGRATULATIONS\t2",
      "1968\tLOUIS ARMSTRONG\tWHAT A WONDERFUL WORLD/CABARET\t4",
      "1968\tGARRY PUCKETT AND THE UNION GAP\tYOUNG GIRL\t4",
      "1968\tROLLING STONES\tJUMPING JACK FLASH\t2",
      "1968\tEQUALS\tBABY COME BACK\t3", "1968\tDES O'CONNOR\tI PRETEND\t1",
      "1968\tTOMMY JAMES AND THE SHONDELLS\tMONY MONY\t2",
      "1968\tCRAZY WORLD OF ARTHUR BROWN\tFIRE!\t1",
      "1968\tTOMMY JAMES AND THE SHONDELLS\tMONY MONY\t1",
      "1968\tBEACH BOYS\tDO IT AGAIN\t1",
      "1968\tBEE GEES\tI'VE GOTTA GET A MESSAGE TO YOU\t1",
      "1968\tBEATLES\tHEY JUDE\t8",
      "1968\tMARY HOPKIN\tTHOSE WERE THE DAYS\t6",
      "1968\tJOE COCKER\tWITH A LITTLE HELP FROM MY FRIENDS\t1",
      "1968\tHUGO MONTENEGRO\tTHE GOOD THE BAD AND THE UGLY\t4",
      "1968\tSCAFFOLD\tLILY THE PINK\t3",
      "1969\tMARMALADE\tOB-LA-DI, OB-LA-DA\t1",
      "1969\tSCAFFOLD\tLILY THE PINK\t1",
      "1969\tMARMALADE\tOB-LA-DI, OB-LA-DA\t2",
      "1969\tFLEETWOOD MAC\tALBATROSS\t1", "1969\tMOVE\tBLACKBERRY WAY\t1",
      "1969\tAMEN CORNER\t(IF PARADISE IS) HALF AS NICE\t2",
      "1969\tPETER SARSTEDT\tWHERE DO YOU GO TO (MY LOVELY)\t4",
      "1969\tMARVIN GAYE\tI HEARD IT THROUGH THE GRAPEVINE\t3",
      "1969\tDESMOND DEKKER AND THE ACES\tTHE ISRAELITES\t1",
      "1969\tBEATLES\tGET BACK\t6", "1969\tTOMMY ROE\tDIZZY\t1",
      "1969\tBEATLES\tTHE BALLAD OF JOHN AND YOKO\t3",
      "1969\tTHUNDERCLAP NEWMAN\tSOMETHING IN THE AIR\t3",
      "1969\tROLLING STONES\tHONKY TONK WOMEN\t5",
      "1969\tZAGER AND EVANS\tIN THE YEAR 2525 (EXORDIUM AND TERMINUS)\t3",
      "1969\tCREEDENCE CLEARWATER REVIVAL\tBAD MOON RISING\t3",
      "1969\tJANE BIRKIN AND SERGE GAINSBOURG\tJE T'AIME... MOI NON PLUS\t1",
      "1969\tBOBBIE GENTRY\tI'LL NEVER FALL IN LOVE AGAIN\t1",
      "1969\tARCHIES\tSUGAR, SUGAR\t4" };

  private static final Map<String, Record> parsedRecords = new HashMap<String, Record>();
  private Directory dir;
  private IndexReader reader;
  private IndexSearcher searcher;
  private SortedDocValues artistDocValues;

  static class Record {
    String year;
    String artist;
    String song;
    float weeks;
    String id;

    public Record(String id, String year, String artist, String song,
        float weeks) {
      super();
      this.id = id;
      this.year = year;
      this.artist = artist;
      this.song = song;
      this.weeks = weeks;
    }

    @Override
    public String toString() {
      return "Record [id=" + id + ", artist=" + artist + ", weeks=" + weeks
          + ", year=" + year + ", song=" + song + "]";
    }

  }

  private DiversifiedTopDocsCollector doDiversifiedSearch(int numResults,
      int maxResultsPerArtist) throws IOException {
    // Alternate between implementations used for key lookups
    if (random().nextBoolean()) {
      // Faster key lookup but with potential for collisions on larger datasets
      return doFuzzyDiversifiedSearch(numResults, maxResultsPerArtist);
    } else {
      // Slower key lookup but 100% accurate
      return doAccurateDiversifiedSearch(numResults, maxResultsPerArtist);
    }
  }

  private DiversifiedTopDocsCollector doFuzzyDiversifiedSearch(int numResults,
      int maxResultsPerArtist) throws IOException {
    DiversifiedTopDocsCollector tdc = new HashedDocValuesDiversifiedCollector(
        numResults, maxResultsPerArtist, "artist");
    searcher.search(getTestQuery(), tdc);
    return tdc;
  }

  private DiversifiedTopDocsCollector doAccurateDiversifiedSearch(
      int numResults, int maxResultsPerArtist) throws IOException {
    DiversifiedTopDocsCollector tdc = new DocValuesDiversifiedCollector(
        numResults, maxResultsPerArtist, artistDocValues);
    searcher.search(getTestQuery(), tdc);
    return tdc;
  }

  private Query getTestQuery() {
    BooleanQuery testQuery = new BooleanQuery();
    testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1966")),
        Occur.SHOULD));
    testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1967")),
        Occur.SHOULD));
    testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1968")),
        Occur.SHOULD));
    testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1969")),
        Occur.SHOULD));
    return testQuery;
  }

  @Override
  public void setUp() throws Exception {
    super.setUp();

    // populate an index with documents - artist, song and weeksAtNumberOne
    dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    Document doc = new Document();

    Field yearField = newTextField("year", "", Field.Store.NO);
    SortedDocValuesField artistField = new SortedDocValuesField("artist",
        new BytesRef(""));
    Field weeksAtNumberOneField = new FloatDocValuesField("weeksAtNumberOne",
        0.0F);
    Field weeksStoredField = new FloatField("weeks", 0.0F, Store.YES);
    Field idField = newStringField("id", "", Field.Store.YES);
    Field songField = newTextField("song", "", Field.Store.NO);
    Field storedArtistField = newTextField("artistName", "", Field.Store.NO);

    doc.add(idField);
    doc.add(weeksAtNumberOneField);
    doc.add(storedArtistField);
    doc.add(songField);
    doc.add(weeksStoredField);
    doc.add(yearField);
    doc.add(artistField);

    parsedRecords.clear();
    for (int i = 0; i < hitsOfThe60s.length; i++) {
      String cols[] = hitsOfThe60s[i].split("\t");
      Record record = new Record(String.valueOf(i), cols[0], cols[1], cols[2],
          Float.valueOf(cols[3]));
      parsedRecords.put(record.id, record);
      idField.setStringValue(record.id);
      yearField.setStringValue(record.year);
      storedArtistField.setStringValue(record.artist);
      artistField.setBytesValue(new BytesRef(record.artist));
      songField.setStringValue(record.song);
      weeksStoredField.setFloatValue(record.weeks);
      weeksAtNumberOneField.setFloatValue(record.weeks);
      writer.addDocument(doc);
      if (i % 10 == 0) {
        // Causes the creation of multiple segments for our test
        writer.commit();
      }
    }
    reader = writer.getReader();
    writer.close();
    searcher = newSearcher(reader);
    LeafReader ar = SlowCompositeReaderWrapper.wrap(reader);
    artistDocValues = ar.getSortedDocValues("artist");

    // All searches sort by song popularity
    final Similarity base = searcher.getSimilarity();
    searcher.setSimilarity(new DocValueSimilarity(base, "weeksAtNumberOne"));
  }

  @Override
  public void tearDown() throws Exception {
    reader.close();
    dir.close();
    dir = null;
    super.tearDown();
  }

  private int getMaxNumRecordsPerArtist(ScoreDoc[] sd) throws IOException {
    int result = 0;
    HashMap<String, Integer> artistCounts = new HashMap<String, Integer>();
    for (int i = 0; i < sd.length; i++) {
      StoredDocument doc = reader.document(sd[i].doc);
      Record record = parsedRecords.get(doc.get("id"));
      Integer count = artistCounts.get(record.artist);
      int newCount = 1;
      if (count != null) {
        newCount = count.intValue() + 1;
      }
      result = Math.max(result, newCount);
      artistCounts.put(record.artist, newCount);
    }
    return result;
  }

  /**
   * Similarity that wraps another similarity and replaces the final score
   * according to what's in a docvalues field.
   *
   * @lucene.experimental
   */
  static class DocValueSimilarity extends Similarity {
    private final Similarity sim;
    private final String scoreValueField;

    public DocValueSimilarity(Similarity sim, String scoreValueField) {
      this.sim = sim;
      this.scoreValueField = scoreValueField;
    }

    @Override
    public long computeNorm(FieldInvertState state) {
      return sim.computeNorm(state);
    }

    @Override
    public SimWeight computeWeight(float queryBoost,
        CollectionStatistics collectionStats, TermStatistics... termStats) {
      return sim.computeWeight(queryBoost, collectionStats, termStats);
    }

    @Override
    public SimScorer simScorer(SimWeight stats, LeafReaderContext context)
        throws IOException {
      final SimScorer sub = sim.simScorer(stats, context);
      final NumericDocValues values = DocValues.getNumeric(context.reader(),
          scoreValueField);

      return new SimScorer() {
        @Override
        public float score(int doc, float freq) {
          return Float.intBitsToFloat((int) values.get(doc));
        }

        @Override
        public float computeSlopFactor(int distance) {
          return sub.computeSlopFactor(distance);
        }

        @Override
        public float computePayloadFactor(int doc, int start, int end,
            BytesRef payload) {
          return sub.computePayloadFactor(doc, start, end, payload);
        }

        @Override
        public Explanation explain(int doc, Explanation freq) {
          return new Explanation(Float.intBitsToFloat((int) values.get(doc)),
              "indexDocValue(" + scoreValueField + ")");
        }
      };
    }
  }

}