LUCENE-6066: new DiversifiedTopDocsCollector in misc and PriorityQueue.remove method

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1659195 13f79535-47bb-0310-9956-ffa450edef68
Mark Harwood 2015-02-12 10:10:16 +00:00
parent 17bfed1212
commit ef3a99b385
5 changed files with 847 additions and 27 deletions

lucene/CHANGES.txt

@@ -33,6 +33,10 @@ API Changes
New Features
* LUCENE-6066: Added DiversifiedTopDocsCollector to misc for collecting no more
than a given number of results per value of a chosen key. Introduces a new
remove method on core's PriorityQueue. (Mark Harwood)
* LUCENE-3922: Added JapaneseNumberFilter that normalizes Japanese numbers
in kansuji form to regular/Arabic numbers. (Gaute Lambertsen, Christian Moen)

lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java

@@ -17,17 +17,19 @@ package org.apache.lucene.util;
* limitations under the License.
*/
/**
* A PriorityQueue maintains a partial ordering of its elements such that the
* least element can always be found in constant time. Put()'s and pop()'s
* require log(size) time but the remove() cost implemented here is linear.
*
* <p>
* <b>NOTE</b>: This class will pre-allocate a full array of length
* <code>maxSize+1</code> if instantiated via the
* {@link #PriorityQueue(int,boolean)} constructor with <code>prepopulate</code>
* set to <code>true</code>.
*
* @lucene.internal
*/
public abstract class PriorityQueue<T> {
private int size = 0;
private final int maxSize;
@@ -130,7 +132,7 @@ public abstract class PriorityQueue<T> {
public final T add(T element) {
size++;
heap[size] = element;
upHeap(size);
return heap[1];
}
@@ -174,7 +176,7 @@ public abstract class PriorityQueue<T> {
heap[1] = heap[size]; // move last to first
heap[size] = null; // permit GC of objects
size--;
downHeap(1); // adjust heap
return result;
} else {
return null;
@@ -201,7 +203,7 @@ public abstract class PriorityQueue<T> {
* @return the new 'top' element.
*/
public final T updateTop() {
downHeap(1);
return heap[1];
}
@@ -226,8 +228,31 @@ public abstract class PriorityQueue<T> {
size = 0;
}
/**
* Removes an existing element currently stored in the PriorityQueue. Cost is
* linear with the size of the queue. (A specialization of PriorityQueue which
* tracks element positions would provide a constant remove time but the
* trade-off would be extra cost to all additions/insertions)
*/
public final boolean remove(T element) {
for (int i = 1; i <= size; i++) {
if (heap[i] == element) {
heap[i] = heap[size];
heap[size] = null; // permit GC of objects
size--;
if (i <= size) {
if (!upHeap(i)) {
downHeap(i);
}
}
return true;
}
}
return false;
}
private final boolean upHeap(int origPos) {
int i = origPos;
T node = heap[i]; // save bottom node
int j = i >>> 1;
while (j > 0 && lessThan(node, heap[j])) {
@@ -236,10 +261,10 @@ public abstract class PriorityQueue<T> {
j = j >>> 1;
}
heap[i] = node; // install saved node
return i != origPos;
}
private final void downHeap(int i) {
T node = heap[i]; // save top node
int j = i << 1; // find smaller child
int k = j + 1;
@ -257,7 +282,7 @@ public abstract class PriorityQueue<T> {
}
heap[i] = node; // install saved node
}
/** This method returns the internal heap array as Object[].
* @lucene.internal
*/
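A minimal usage sketch of the new remove method, assuming a hypothetical MinIntQueue subclass; note that remove matches elements by reference (==), not equals():

import org.apache.lucene.util.PriorityQueue;

/** Hypothetical minimal queue ordering Integers ascending. */
class MinIntQueue extends PriorityQueue<Integer> {
  MinIntQueue(int maxSize) {
    super(maxSize);
  }

  @Override
  protected boolean lessThan(Integer a, Integer b) {
    return a < b;
  }

  public static void main(String[] args) {
    MinIntQueue pq = new MinIntQueue(10);
    Integer three = 3, seven = 7, five = 5;
    pq.add(seven);
    pq.add(three);
    pq.add(five);
    // Linear scan for the same reference, then a single up- or
    // down-heap move repairs ordering around the hole.
    pq.remove(seven);
    System.out.println(pq.pop()); // 3 - least element still surfaces first
    System.out.println(pq.pop()); // 5
  }
}

Because lookup is by reference, callers must retain the exact objects they added; the DiversifiedTopDocsCollector later in this commit does exactly that, handing the same ScoreDocKey instances to its per-key and global queues.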

lucene/core/src/test/org/apache/lucene/util/TestPriorityQueue.java

@@ -17,21 +17,41 @@ package org.apache.lucene.util;
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.Random;
public class TestPriorityQueue extends LuceneTestCase {
private static class IntegerQueue extends PriorityQueue<Integer> {
public IntegerQueue(int count) {
super(count);
}
@Override
protected boolean lessThan(Integer a, Integer b) {
if (a.equals(b)) {
assert (a != b);
int hashA = System.identityHashCode(a);
int hashB = System.identityHashCode(b);
assert (hashA != hashB);
return hashA < hashB;
}
return (a < b);
}
protected final void checkValidity() {
Object[] heapArray = getHeapArray();
for (int i = 1; i <= size(); i++) {
int parent = i >>> 1;
if (parent > 0) {
assertTrue(lessThan((Integer) heapArray[parent],
(Integer) heapArray[i]));
}
}
}
}
public void testPQ() throws Exception {
testPQ(atLeast(10000), random());
}
@@ -111,5 +131,61 @@ public class TestPriorityQueue extends LuceneTestCase {
assertEquals(size, pq.size());
assertEquals((Integer) 2, pq.top());
}
public void testRemovalsAndInsertions() {
Random random = random();
int numDocsInPQ = TestUtil.nextInt(random, 1, 100);
IntegerQueue pq = new IntegerQueue(numDocsInPQ);
Integer lastLeast = null;
// Basic insertion of new content
ArrayList<Integer> sds = new ArrayList<Integer>(numDocsInPQ);
for (int i = 0; i < numDocsInPQ * 10; i++) {
Integer newEntry = new Integer(Math.abs(random.nextInt()));
sds.add(newEntry);
Integer evicted = pq.insertWithOverflow(newEntry);
pq.checkValidity();
if (evicted != null) {
assertTrue(sds.remove(evicted));
if (evicted != newEntry) {
assertTrue(evicted == lastLeast);
}
}
Integer newLeast = pq.top();
if ((lastLeast != null) && (newLeast != newEntry)
&& (newLeast != lastLeast)) {
// If there has been a change of least entry and it wasn't our new
// addition we expect the scores to increase
assertTrue(newLeast <= newEntry);
assertTrue(newLeast >= lastLeast);
}
lastLeast = newLeast;
}
// Try many random additions to existing entries - we should always see
// increasing scores in the lowest entry in the PQ
for (int p = 0; p < 500000; p++) {
int element = (int) (random.nextFloat() * (sds.size() - 1));
Integer objectToRemove = sds.get(element);
assertTrue(sds.remove(element) == objectToRemove);
assertTrue(pq.remove(objectToRemove));
pq.checkValidity();
Integer newEntry = new Integer(Math.abs(random.nextInt()));
sds.add(newEntry);
assertNull(pq.insertWithOverflow(newEntry));
pq.checkValidity();
Integer newLeast = pq.top();
if ((objectToRemove != lastLeast) && (lastLeast != null)
&& (newLeast != newEntry)) {
// If there has been a change of least entry and it wasn't our new
// addition or the loss of our randomly removed entry we expect the
// scores to increase
assertTrue(newLeast <= newEntry);
assertTrue(newLeast >= lastLeast);
}
lastLeast = newLeast;
}
}
}

lucene/misc/src/java/org/apache/lucene/search/DiversifiedTopDocsCollector.java

@@ -0,0 +1,251 @@
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.DiversifiedTopDocsCollector.ScoreDocKey;
import org.apache.lucene.util.PriorityQueue;
/**
* A {@link TopDocsCollector} that controls diversity in results by ensuring no
* more than maxHitsPerKey results from a common source are collected in the
* final results.
*
* An example application might be a product search in a marketplace where no
* more than 3 results per retailer are permitted in search results.
*
* <p>
* To compare behaviour with other forms of collector, a useful analogy might be
* the problem of making a compilation album of 1967's top hit records:
* <ol>
* <li>A vanilla query's results might look like a "Best of the Beatles" album -
* high quality but not much diversity</li>
* <li>A GroupingSearch would produce the equivalent of "The 10 top-selling
* artists of 1967" - some killer and quite a lot of filler</li>
* <li>A "diversified" query would be the top 20 hit records of that year - with
* a max of 3 Beatles hits in order to maintain diversity</li>
* </ol>
* This collector improves on the "GroupingSearch" type queries by
* <ul>
* <li>Working in one pass over the data</li>
* <li>Not requiring the client to guess how many groups are required</li>
* <li>Removing low-scoring "filler" which sits at the end of each group's hits</li>
* </ul>
*
* This is an abstract class and subclasses have to provide a source of keys for
* documents which is then used to help identify duplicate sources.
*
* @lucene.experimental
*
*/
public abstract class DiversifiedTopDocsCollector extends
TopDocsCollector<ScoreDocKey> {
ScoreDocKey spare;
private ScoreDocKeyQueue globalQueue;
private int numHits;
private Map<Long, ScoreDocKeyQueue> perKeyQueues;
protected int maxNumPerKey;
private Stack<ScoreDocKeyQueue> sparePerKeyQueues = new Stack<>();
public DiversifiedTopDocsCollector(int numHits, int maxHitsPerKey) {
super(new ScoreDocKeyQueue(numHits));
// Need to access pq.lessThan() which is protected so have to cast here...
this.globalQueue = (ScoreDocKeyQueue) pq;
perKeyQueues = new HashMap<Long, ScoreDocKeyQueue>();
this.numHits = numHits;
this.maxNumPerKey = maxHitsPerKey;
}
/**
* Get a source of values used for grouping keys
*/
protected abstract NumericDocValues getKeys(LeafReaderContext context);
@Override
public boolean needsScores() {
return true;
}
@Override
protected TopDocs newTopDocs(ScoreDoc[] results, int start) {
if (results == null) {
return EMPTY_TOPDOCS;
}
// We need to compute maxScore in order to set it in TopDocs. If start == 0,
// it means the largest element is already in results, use its score as
// maxScore. Otherwise pop everything else, until the largest element is
// extracted and use its score as maxScore.
float maxScore = Float.NaN;
if (start == 0) {
maxScore = results[0].score;
} else {
for (int i = globalQueue.size(); i > 1; i--) {
globalQueue.pop();
}
maxScore = globalQueue.pop().score;
}
return new TopDocs(totalHits, results, maxScore);
}
protected ScoreDocKey insert(ScoreDocKey addition, int docBase,
NumericDocValues keys) {
if ((globalQueue.size() >= numHits)
&& (globalQueue.lessThan(addition, globalQueue.top()))) {
// Queue is full and proposed addition is not a globally
// competitive score
return addition;
}
// The addition stands a chance of being entered - check the
// key-specific restrictions.
// We delay fetching the key until we are certain the score is globally
// competitive. We need to adjust the ScoreDoc's global doc value to be
// a leaf reader value when looking up keys
addition.key = keys.get(addition.doc - docBase);
// For this to work the choice of key class needs to implement
// hashCode() and equals().
ScoreDocKeyQueue thisKeyQ = perKeyQueues.get(addition.key);
if (thisKeyQ == null) {
if (sparePerKeyQueues.size() == 0) {
thisKeyQ = new ScoreDocKeyQueue(maxNumPerKey);
} else {
thisKeyQ = sparePerKeyQueues.pop();
}
perKeyQueues.put(addition.key, thisKeyQ);
}
ScoreDocKey perKeyOverflow = thisKeyQ.insertWithOverflow(addition);
if (perKeyOverflow == addition) {
// This key group has reached capacity and our proposed addition
// was not competitive in the group - do not insert into the
// main PQ or the key will be overly-populated in final results.
return addition;
}
if (perKeyOverflow == null) {
// This proposed addition is also locally competitive within the
// key group - make a global entry and return
ScoreDocKey globalOverflow = globalQueue.insertWithOverflow(addition);
perKeyGroupRemove(globalOverflow);
return globalOverflow;
}
// For the given key, we have reached max capacity but the new addition
// is better than a prior entry that still exists in the global results
// - request the weaker-scoring entry to be removed from the global
// queue.
globalQueue.remove(perKeyOverflow);
// Add the locally-competitive addition into the global queue
globalQueue.add(addition);
return perKeyOverflow;
}
private void perKeyGroupRemove(ScoreDocKey globalOverflow) {
if (globalOverflow == null) {
return;
}
ScoreDocKeyQueue q = perKeyQueues.get(globalOverflow.key);
ScoreDocKey perKeyLowest = q.pop();
// The least globally-competitive item should also always be the least
// key-local item
assert (globalOverflow == perKeyLowest);
if (q.size() == 0) {
perKeyQueues.remove(globalOverflow.key);
sparePerKeyQueues.push(q);
}
}
@Override
public LeafCollector getLeafCollector(LeafReaderContext context)
throws IOException {
final int base = context.docBase;
final NumericDocValues keySource = getKeys(context);
return new LeafCollector() {
Scorer scorer;
@Override
public void setScorer(Scorer scorer) throws IOException {
this.scorer = scorer;
}
@Override
public void collect(int doc) throws IOException {
float score = scorer.score();
// This collector cannot handle NaN
assert !Float.isNaN(score);
totalHits++;
doc += base;
if (spare == null) {
spare = new ScoreDocKey(doc, score);
} else {
spare.doc = doc;
spare.score = score;
}
spare = insert(spare, base, keySource);
}
};
}
static class ScoreDocKeyQueue extends PriorityQueue<ScoreDocKey> {
ScoreDocKeyQueue(int size) {
super(size);
}
@Override
protected final boolean lessThan(ScoreDocKey hitA, ScoreDocKey hitB) {
if (hitA.score == hitB.score)
return hitA.doc > hitB.doc;
else
return hitA.score < hitB.score;
}
}
/**
* An extension to ScoreDoc that includes a key used for grouping purposes
*/
public static class ScoreDocKey extends ScoreDoc {
Long key;
protected ScoreDocKey(int doc, float score) {
super(doc, score);
}
public Long getKey() {
return key;
}
@Override
public String toString() {
return "key:" + key + " doc=" + doc + " s=" + score;
}
}
}
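A sketch of wiring a concrete subclass into a search; the class name NumericKeyDiversifiedCollector and the "artistId" field are hypothetical, while the test below shows the same pattern against real index data:

import java.io.IOException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.DiversifiedTopDocsCollector;

/** Hypothetical collector keying diversity on a numeric doc-values field. */
class NumericKeyDiversifiedCollector extends DiversifiedTopDocsCollector {
  private final String keyField;

  NumericKeyDiversifiedCollector(int numHits, int maxHitsPerKey, String keyField) {
    super(numHits, maxHitsPerKey);
    this.keyField = keyField;
  }

  @Override
  protected NumericDocValues getKeys(LeafReaderContext context) {
    try {
      // One long key per document; hits beyond maxHitsPerKey for the
      // same key value are pruned from the final results.
      return DocValues.getNumeric(context.reader(), keyField);
    } catch (IOException e) {
      // getKeys does not declare IOException, so wrap it here
      throw new RuntimeException(e);
    }
  }
}

// Usage: top 20 hits overall, at most 3 per distinct key value.
// NumericKeyDiversifiedCollector tdc =
//     new NumericKeyDiversifiedCollector(20, 3, "artistId");
// searcher.search(query, tdc);
// TopDocs diversified = tdc.topDocs();

The per-key queues of size maxHitsPerKey, coordinated with the global queue in insert(), are what keep any one key from exceeding its quota while the collection stays single-pass.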

lucene/misc/src/test/org/apache/lucene/search/TestDiversifiedTopDocsCollector.java

@@ -0,0 +1,464 @@
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FloatDocValuesField;
import org.apache.lucene.document.FloatField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
/**
* Demonstrates an application of the {@link DiversifiedTopDocsCollector} in
* assembling a collection of top results but without over-representation of any
* one source (in this case top-selling singles from the 60s without having them
* all be Beatles records...). Results are ranked by the number of weeks a
* single is top of the charts and de-duped by the artist name.
*
*/
public class TestDiversifiedTopDocsCollector extends LuceneTestCase {
public void testNonDiversifiedResults() throws Exception {
int numberOfTracksOnCompilation = 10;
int expectedMinNumOfBeatlesHits = 5;
TopDocs res = searcher.search(getTestQuery(), numberOfTracksOnCompilation);
assertEquals(numberOfTracksOnCompilation, res.scoreDocs.length);
// Due to randomization of segment merging in tests, the exact number of
// Beatles hits varies between 5 and 6, but either way it proves the point:
// they are over-represented in the results of a standard search.
assertTrue(getMaxNumRecordsPerArtist(res.scoreDocs) >= expectedMinNumOfBeatlesHits);
}
public void testFirstPageDiversifiedResults() throws Exception {
// Using a diversified collector we can limit the results from
// any one artist.
int requiredMaxHitsPerArtist = 2;
int numberOfTracksOnCompilation = 10;
DiversifiedTopDocsCollector tdc = doDiversifiedSearch(
numberOfTracksOnCompilation, requiredMaxHitsPerArtist);
ScoreDoc[] sd = tdc.topDocs(0).scoreDocs;
assertEquals(numberOfTracksOnCompilation, sd.length);
assertTrue(getMaxNumRecordsPerArtist(sd) <= requiredMaxHitsPerArtist);
}
public void testSecondPageResults() throws Exception {
int numberOfTracksPerCompilation = 10;
int numberOfCompilations = 2;
int requiredMaxHitsPerArtist = 1;
// Volume 2 of our hits compilation - start at position 10
DiversifiedTopDocsCollector tdc = doDiversifiedSearch(
numberOfTracksPerCompilation * numberOfCompilations,
requiredMaxHitsPerArtist);
ScoreDoc[] volume2 = tdc.topDocs(numberOfTracksPerCompilation,
numberOfTracksPerCompilation).scoreDocs;
assertEquals(numberOfTracksPerCompilation, volume2.length);
assertTrue(getMaxNumRecordsPerArtist(volume2) <= requiredMaxHitsPerArtist);
}
public void testInvalidArguments() throws Exception {
int numResults = 5;
DiversifiedTopDocsCollector tdc = doDiversifiedSearch(numResults, 15);
// start < 0
assertEquals(0, tdc.topDocs(-1).scoreDocs.length);
// start > pq.size()
assertEquals(0, tdc.topDocs(numResults + 1).scoreDocs.length);
// start == pq.size()
assertEquals(0, tdc.topDocs(numResults).scoreDocs.length);
// howMany < 0
assertEquals(0, tdc.topDocs(0, -1).scoreDocs.length);
// howMany == 0
assertEquals(0, tdc.topDocs(0, 0).scoreDocs.length);
}
// Diversifying collector that looks up de-dup keys using SortedDocValues
// from a top-level Reader
private static final class DocValuesDiversifiedCollector extends
DiversifiedTopDocsCollector {
private final SortedDocValues sdv;
public DocValuesDiversifiedCollector(int size, int maxHitsPerKey,
SortedDocValues sdv) {
super(size, maxHitsPerKey);
this.sdv = sdv;
}
@Override
protected NumericDocValues getKeys(final LeafReaderContext context) {
return new NumericDocValues() {
@Override
public long get(int docID) {
// Keys are always expressed as a long so we obtain the
// ordinal for our String-based artist name here
return sdv.getOrd(context.docBase + docID);
}
};
}
}
// Alternative, faster implementation for converting String keys to longs
// but with the potential for hash collisions
private static final class HashedDocValuesDiversifiedCollector extends
DiversifiedTopDocsCollector {
private final String field;
private BinaryDocValues vals;
public HashedDocValuesDiversifiedCollector(int size, int maxHitsPerKey,
String field) {
super(size, maxHitsPerKey);
this.field = field;
}
@Override
protected NumericDocValues getKeys(LeafReaderContext context) {
return new NumericDocValues() {
@Override
public long get(int docID) {
return vals == null ? -1 : vals.get(docID).hashCode();
}
};
}
@Override
public LeafCollector getLeafCollector(LeafReaderContext context)
throws IOException {
this.vals = DocValues.getBinary(context.reader(), field);
return super.getLeafCollector(context);
}
}
// Test data - format is year, artist, song, weeks at top of charts
private static String[] hitsOfThe60s = {
"1966\tSPENCER DAVIS GROUP\tKEEP ON RUNNING\t1",
"1966\tOVERLANDERS\tMICHELLE\t3",
"1966\tNANCY SINATRA\tTHESE BOOTS ARE MADE FOR WALKIN'\t4",
"1966\tWALKER BROTHERS\tTHE SUN AIN'T GONNA SHINE ANYMORE\t4",
"1966\tSPENCER DAVIS GROUP\tSOMEBODY HELP ME\t2",
"1966\tDUSTY SPRINGFIELD\tYOU DON'T HAVE TO SAY YOU LOVE ME\t1",
"1966\tMANFRED MANN\tPRETTY FLAMINGO\t3",
"1966\tROLLING STONES\tPAINT IT, BLACK\t1",
"1966\tFRANK SINATRA\tSTRANGERS IN THE NIGHT\t3",
"1966\tBEATLES\tPAPERBACK WRITER\t5",
"1966\tKINKS\tSUNNY AFTERNOON\t2",
"1966\tGEORGIE FAME AND THE BLUE FLAMES\tGETAWAY\t1",
"1966\tCHRIS FARLOWE\tOUT OF TIME\t1",
"1966\tTROGGS\tWITH A GIRL LIKE YOU\t2",
"1966\tBEATLES\tYELLOW SUBMARINE/ELEANOR RIGBY\t4",
"1966\tSMALL FACES\tALL OR NOTHING\t1",
"1966\tJIM REEVES\tDISTANT DRUMS\t5",
"1966\tFOUR TOPS\tREACH OUT I'LL BE THERE\t3",
"1966\tBEACH BOYS\tGOOD VIBRATIONS\t2",
"1966\tTOM JONES\tGREEN GREEN GRASS OF HOME\t4",
"1967\tMONKEES\tI'M A BELIEVER\t4",
"1967\tPETULA CLARK\tTHIS IS MY SONG\t2",
"1967\tENGELBERT HUMPERDINCK\tRELEASE ME\t4",
"1967\tNANCY SINATRA AND FRANK SINATRA\tSOMETHIN' STUPID\t2",
"1967\tSANDIE SHAW\tPUPPET ON A STRING\t3",
"1967\tTREMELOES\tSILENCE IS GOLDEN\t3",
"1967\tPROCOL HARUM\tA WHITER SHADE OF PALE\t4",
"1967\tBEATLES\tALL YOU NEED IS LOVE\t7",
"1967\tSCOTT MCKENZIE\tSAN FRANCISCO (BE SURE TO WEAR SOME FLOWERS INYOUR HAIR)\t4",
"1967\tENGELBERT HUMPERDINCK\tTHE LAST WALTZ\t5",
"1967\tBEE GEES\tMASSACHUSETTS (THE LIGHTS WENT OUT IN)\t4",
"1967\tFOUNDATIONS\tBABY NOW THAT I'VE FOUND YOU\t2",
"1967\tLONG JOHN BALDRY\tLET THE HEARTACHES BEGIN\t2",
"1967\tBEATLES\tHELLO GOODBYE\t5",
"1968\tGEORGIE FAME\tTHE BALLAD OF BONNIE AND CLYDE\t1",
"1968\tLOVE AFFAIR\tEVERLASTING LOVE\t2",
"1968\tMANFRED MANN\tMIGHTY QUINN\t2",
"1968\tESTHER AND ABI OFARIM\tCINDERELLA ROCKEFELLA\t3",
"1968\tDAVE DEE, DOZY, BEAKY, MICK AND TICH\tTHE LEGEND OF XANADU\t1",
"1968\tBEATLES\tLADY MADONNA\t2",
"1968\tCLIFF RICHARD\tCONGRATULATIONS\t2",
"1968\tLOUIS ARMSTRONG\tWHAT A WONDERFUL WORLD/CABARET\t4",
"1968\tGARRY PUCKETT AND THE UNION GAP\tYOUNG GIRL\t4",
"1968\tROLLING STONES\tJUMPING JACK FLASH\t2",
"1968\tEQUALS\tBABY COME BACK\t3", "1968\tDES O'CONNOR\tI PRETEND\t1",
"1968\tTOMMY JAMES AND THE SHONDELLS\tMONY MONY\t2",
"1968\tCRAZY WORLD OF ARTHUR BROWN\tFIRE!\t1",
"1968\tTOMMY JAMES AND THE SHONDELLS\tMONY MONY\t1",
"1968\tBEACH BOYS\tDO IT AGAIN\t1",
"1968\tBEE GEES\tI'VE GOTTA GET A MESSAGE TO YOU\t1",
"1968\tBEATLES\tHEY JUDE\t8",
"1968\tMARY HOPKIN\tTHOSE WERE THE DAYS\t6",
"1968\tJOE COCKER\tWITH A LITTLE HELP FROM MY FRIENDS\t1",
"1968\tHUGO MONTENEGRO\tTHE GOOD THE BAD AND THE UGLY\t4",
"1968\tSCAFFOLD\tLILY THE PINK\t3",
"1969\tMARMALADE\tOB-LA-DI, OB-LA-DA\t1",
"1969\tSCAFFOLD\tLILY THE PINK\t1",
"1969\tMARMALADE\tOB-LA-DI, OB-LA-DA\t2",
"1969\tFLEETWOOD MAC\tALBATROSS\t1", "1969\tMOVE\tBLACKBERRY WAY\t1",
"1969\tAMEN CORNER\t(IF PARADISE IS) HALF AS NICE\t2",
"1969\tPETER SARSTEDT\tWHERE DO YOU GO TO (MY LOVELY)\t4",
"1969\tMARVIN GAYE\tI HEARD IT THROUGH THE GRAPEVINE\t3",
"1969\tDESMOND DEKKER AND THE ACES\tTHE ISRAELITES\t1",
"1969\tBEATLES\tGET BACK\t6", "1969\tTOMMY ROE\tDIZZY\t1",
"1969\tBEATLES\tTHE BALLAD OF JOHN AND YOKO\t3",
"1969\tTHUNDERCLAP NEWMAN\tSOMETHING IN THE AIR\t3",
"1969\tROLLING STONES\tHONKY TONK WOMEN\t5",
"1969\tZAGER AND EVANS\tIN THE YEAR 2525 (EXORDIUM AND TERMINUS)\t3",
"1969\tCREEDENCE CLEARWATER REVIVAL\tBAD MOON RISING\t3",
"1969\tJANE BIRKIN AND SERGE GAINSBOURG\tJE T'AIME... MOI NON PLUS\t1",
"1969\tBOBBIE GENTRY\tI'LL NEVER FALL IN LOVE AGAIN\t1",
"1969\tARCHIES\tSUGAR, SUGAR\t4" };
private static final Map<String, Record> parsedRecords = new HashMap<String, Record>();
private Directory dir;
private IndexReader reader;
private IndexSearcher searcher;
private SortedDocValues artistDocValues;
static class Record {
String year;
String artist;
String song;
float weeks;
String id;
public Record(String id, String year, String artist, String song,
float weeks) {
super();
this.id = id;
this.year = year;
this.artist = artist;
this.song = song;
this.weeks = weeks;
}
@Override
public String toString() {
return "Record [id=" + id + ", artist=" + artist + ", weeks=" + weeks
+ ", year=" + year + ", song=" + song + "]";
}
}
private DiversifiedTopDocsCollector doDiversifiedSearch(int numResults,
int maxResultsPerArtist) throws IOException {
// Alternate between implementations used for key lookups
if (random().nextBoolean()) {
// Faster key lookup but with potential for collisions on larger datasets
return doFuzzyDiversifiedSearch(numResults, maxResultsPerArtist);
} else {
// Slower key lookup but 100% accurate
return doAccurateDiversifiedSearch(numResults, maxResultsPerArtist);
}
}
private DiversifiedTopDocsCollector doFuzzyDiversifiedSearch(int numResults,
int maxResultsPerArtist) throws IOException {
DiversifiedTopDocsCollector tdc = new HashedDocValuesDiversifiedCollector(
numResults, maxResultsPerArtist, "artist");
searcher.search(getTestQuery(), tdc);
return tdc;
}
private DiversifiedTopDocsCollector doAccurateDiversifiedSearch(
int numResults, int maxResultsPerArtist) throws IOException {
DiversifiedTopDocsCollector tdc = new DocValuesDiversifiedCollector(
numResults, maxResultsPerArtist, artistDocValues);
searcher.search(getTestQuery(), tdc);
return tdc;
}
private Query getTestQuery() {
BooleanQuery testQuery = new BooleanQuery();
testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1966")),
Occur.SHOULD));
testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1967")),
Occur.SHOULD));
testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1968")),
Occur.SHOULD));
testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1969")),
Occur.SHOULD));
return testQuery;
}
@Override
public void setUp() throws Exception {
super.setUp();
// populate an index with documents - artist, song and weeksAtNumberOne
dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
Field yearField = newTextField("year", "", Field.Store.NO);
SortedDocValuesField artistField = new SortedDocValuesField("artist",
new BytesRef(""));
Field weeksAtNumberOneField = new FloatDocValuesField("weeksAtNumberOne",
0.0F);
Field weeksStoredField = new FloatField("weeks", 0.0F, Store.YES);
Field idField = newStringField("id", "", Field.Store.YES);
Field songField = newTextField("song", "", Field.Store.NO);
Field storedArtistField = newTextField("artistName", "", Field.Store.NO);
doc.add(idField);
doc.add(weeksAtNumberOneField);
doc.add(storedArtistField);
doc.add(songField);
doc.add(weeksStoredField);
doc.add(yearField);
doc.add(artistField);
parsedRecords.clear();
for (int i = 0; i < hitsOfThe60s.length; i++) {
String cols[] = hitsOfThe60s[i].split("\t");
Record record = new Record(String.valueOf(i), cols[0], cols[1], cols[2],
Float.valueOf(cols[3]));
parsedRecords.put(record.id, record);
idField.setStringValue(record.id);
yearField.setStringValue(record.year);
storedArtistField.setStringValue(record.artist);
artistField.setBytesValue(new BytesRef(record.artist));
songField.setStringValue(record.song);
weeksStoredField.setFloatValue(record.weeks);
weeksAtNumberOneField.setFloatValue(record.weeks);
writer.addDocument(doc);
if (i % 10 == 0) {
// Causes the creation of multiple segments for our test
writer.commit();
}
}
reader = writer.getReader();
writer.close();
searcher = newSearcher(reader);
LeafReader ar = SlowCompositeReaderWrapper.wrap(reader);
artistDocValues = ar.getSortedDocValues("artist");
// All searches sort by song popularity
final Similarity base = searcher.getSimilarity();
searcher.setSimilarity(new DocValueSimilarity(base, "weeksAtNumberOne"));
}
@Override
public void tearDown() throws Exception {
reader.close();
dir.close();
dir = null;
super.tearDown();
}
private int getMaxNumRecordsPerArtist(ScoreDoc[] sd) throws IOException {
int result = 0;
HashMap<String, Integer> artistCounts = new HashMap<String, Integer>();
for (int i = 0; i < sd.length; i++) {
StoredDocument doc = reader.document(sd[i].doc);
Record record = parsedRecords.get(doc.get("id"));
Integer count = artistCounts.get(record.artist);
int newCount = 1;
if (count != null) {
newCount = count.intValue() + 1;
}
result = Math.max(result, newCount);
artistCounts.put(record.artist, newCount);
}
return result;
}
/**
* Similarity that wraps another similarity and replaces the final score
* according to what's in a docvalues field.
*
* @lucene.experimental
*/
static class DocValueSimilarity extends Similarity {
private final Similarity sim;
private final String scoreValueField;
public DocValueSimilarity(Similarity sim, String scoreValueField) {
this.sim = sim;
this.scoreValueField = scoreValueField;
}
@Override
public long computeNorm(FieldInvertState state) {
return sim.computeNorm(state);
}
@Override
public SimWeight computeWeight(float queryBoost,
CollectionStatistics collectionStats, TermStatistics... termStats) {
return sim.computeWeight(queryBoost, collectionStats, termStats);
}
@Override
public SimScorer simScorer(SimWeight stats, LeafReaderContext context)
throws IOException {
final SimScorer sub = sim.simScorer(stats, context);
final NumericDocValues values = DocValues.getNumeric(context.reader(),
scoreValueField);
return new SimScorer() {
@Override
public float score(int doc, float freq) {
return Float.intBitsToFloat((int) values.get(doc));
}
@Override
public float computeSlopFactor(int distance) {
return sub.computeSlopFactor(distance);
}
@Override
public float computePayloadFactor(int doc, int start, int end,
BytesRef payload) {
return sub.computePayloadFactor(doc, start, end, payload);
}
@Override
public Explanation explain(int doc, Explanation freq) {
return new Explanation(Float.intBitsToFloat((int) values.get(doc)),
"indexDocValue(" + scoreValueField + ")");
}
};
}
}
}