LUCENE-3097: Added a new grouping collector that can be used to retrieve all group heads (the most relevant document of each group).

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1150470 13f79535-47bb-0310-9956-ffa450edef68
Martijn van Groningen 2011-07-24 19:06:51 +00:00
parent 30283e1b4d
commit 49075985fb
5 changed files with 1230 additions and 0 deletions


@ -553,6 +553,10 @@ New Features
  AbstractField.setOmitTermFrequenciesAndPositions is deprecated,
  you should use DOCS_ONLY instead. (Robert Muir)
* LUCENE-3097: Added a new grouping collector that can be used to retrieve all group heads
  (the most relevant document of each group). This can be useful when one wants to compute
  grouping based facets / statistics on the complete query result. (Martijn van Groningen)
Optimizations
* LUCENE-3201, LUCENE-3218: CompoundFileSystem code has been consolidated


@ -0,0 +1,179 @@
package org.apache.lucene.search.grouping;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Collector;
import org.apache.lucene.util.FixedBitSet;
import java.io.IOException;
import java.util.Collection;
/**
* This collector specializes in collecting the most relevant document (group head) for each group that matches the query.
*
* @lucene.experimental
*/
public abstract class AbstractAllGroupHeadsCollector<GH extends AbstractAllGroupHeadsCollector.GroupHead> extends Collector {
protected final int[] reversed;
protected final int compIDXEnd;
protected final TemporalResult temporalResult;
protected AbstractAllGroupHeadsCollector(int numberOfSorts) {
this.reversed = new int[numberOfSorts];
this.compIDXEnd = numberOfSorts - 1;
temporalResult = new TemporalResult();
}
/**
* @param maxDoc The maxDoc of the top level {@link IndexReader}.
* @return a {@link FixedBitSet} containing all group heads.
*/
public FixedBitSet retrieveGroupHeads(int maxDoc) {
FixedBitSet bitSet = new FixedBitSet(maxDoc);
Collection<GH> groupHeads = getCollectedGroupHeads();
for (GroupHead groupHead : groupHeads) {
bitSet.set(groupHead.doc);
}
return bitSet;
}
/**
* @return an int array containing all group heads. The size of the array is equal to the number of collected unique groups.
*/
public int[] retrieveGroupHeads() {
Collection<GH> groupHeads = getCollectedGroupHeads();
int[] docHeads = new int[groupHeads.size()];
int i = 0;
for (GroupHead groupHead : groupHeads) {
docHeads[i++] = groupHead.doc;
}
return docHeads;
}
/**
* @return the number of group heads found for a query.
*/
public int groupHeadsSize() {
return getCollectedGroupHeads().size();
}
/**
* Retrieves the group head for the specified document and puts it into {@link #temporalResult}.
* If the group head wasn't encountered before, it is added to the collected group heads.
* <p/>
* The {@link TemporalResult#stop} property will be <code>true</code> if the group head wasn't encountered
* before, otherwise <code>false</code>.
*
* @param doc The document to retrieve the group head for.
* @throws IOException If I/O related errors occur
*/
protected abstract void retrieveGroupHeadAndAddIfNotExist(int doc) throws IOException;
/**
* Returns the collected group heads.
* Subsequent calls should return the same group heads.
*
* @return the collected group heads
*/
protected abstract Collection<GH> getCollectedGroupHeads();
public void collect(int doc) throws IOException {
retrieveGroupHeadAndAddIfNotExist(doc);
if (temporalResult.stop) {
return;
}
GH groupHead = temporalResult.groupHead;
// Check whether the current doc is more relevant than the current group head for this group.
// reversed[compIDX] flips the comparison outcome for descending sort fields.
for (int compIDX = 0; ; compIDX++) {
final int c = reversed[compIDX] * groupHead.compare(compIDX, doc);
if (c < 0) {
// Definitely not competitive. So don't even bother to continue
return;
} else if (c > 0) {
// Definitely competitive.
break;
} else if (compIDX == compIDXEnd) {
// Here c=0. If we're at the last comparator, this doc is not
// competitive, since docs are visited in doc id order, which means
// this doc cannot compete with the current group head.
return;
}
}
groupHead.updateDocHead(doc);
}
public boolean acceptsDocsOutOfOrder() {
return true;
}
/**
* Contains the result of group head retrieval.
* A single instance is reused for every collect call to prevent unnecessary object creation.
*/
protected class TemporalResult {
protected GH groupHead;
protected boolean stop;
}
/**
* Represents a group head. A group head is the most relevant document for a particular group.
* The relevance is usually based on the sort.
*
* The group head contains a group value with its associated most relevant document id.
*/
public static abstract class GroupHead<GROUP_VALUE_TYPE> {
public final GROUP_VALUE_TYPE groupValue;
public int doc;
protected GroupHead(GROUP_VALUE_TYPE groupValue, int doc) {
this.groupValue = groupValue;
this.doc = doc;
}
/**
* Compares the specified document for a specified comparator against the current most relevant document.
*
* @param compIDX The comparator index of the specified comparator.
* @param doc The specified document.
* @return -1 if the specified document wasn't competitive against the current most relevant document, 1 if the
* specified document was competitive against the current most relevant document. Otherwise 0.
* @throws IOException If I/O related errors occur
*/
protected abstract int compare(int compIDX, int doc) throws IOException;
/**
* Updates the current most relevant document with the specified document.
*
* @param doc The specified document
* @throws IOException If I/O related errors occur
*/
protected abstract void updateDocHead(int doc) throws IOException;
}
}
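A minimal usage sketch of the collector above (the field names "author", "id" and "content" are assumptions for illustration, not part of this patch; the factory method comes from TermAllGroupHeadsCollector, added below):

import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.grouping.AbstractAllGroupHeadsCollector;
import org.apache.lucene.search.grouping.TermAllGroupHeadsCollector;
import org.apache.lucene.util.FixedBitSet;

public class GroupHeadsExample {
  // Collects the group head (most relevant document) of every distinct "author" value.
  public static FixedBitSet collectGroupHeads(IndexSearcher searcher) throws Exception {
    // Within each group the document with the highest "id" wins (reverse INT sort).
    Sort sortWithinGroup = new Sort(new SortField("id", SortField.Type.INT, true));
    AbstractAllGroupHeadsCollector collector =
        TermAllGroupHeadsCollector.create("author", sortWithinGroup);
    searcher.search(new TermQuery(new Term("content", "random")), collector);
    // Doc ids are top level ids, so the bit set is sized to the top level reader.
    return collector.retrieveGroupHeads(searcher.maxDoc());
  }
}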


@ -0,0 +1,540 @@
package org.apache.lucene.search.grouping;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.util.*;
/**
* A base implementation of {@link AbstractAllGroupHeadsCollector} for retrieving group heads when grouping
* on a string based group field. More specifically, all concrete implementations of this base class
* use {@link org.apache.lucene.search.FieldCache.DocTermsIndex}.
*
* @lucene.experimental
*/
public abstract class TermAllGroupHeadsCollector<GH extends AbstractAllGroupHeadsCollector.GroupHead> extends AbstractAllGroupHeadsCollector<GH> {
private static final int DEFAULT_INITIAL_SIZE = 128;
final String groupField;
final BytesRef scratchBytesRef = new BytesRef();
FieldCache.DocTermsIndex groupIndex;
IndexReader.AtomicReaderContext readerContext;
protected TermAllGroupHeadsCollector(String groupField, int numberOfSorts) {
super(numberOfSorts);
this.groupField = groupField;
}
/**
* Creates an <code>AbstractAllGroupHeadsCollector</code> instance based on the supplied arguments.
* This factory method decides which implementation is best suited.
*
* Delegates to {@link #create(String, org.apache.lucene.search.Sort, int)} with an initialSize of 128.
*
* @param groupField The field to group by
* @param sortWithinGroup The sort within each group
* @return an <code>AbstractAllGroupHeadsCollector</code> instance based on the supplied arguments
* @throws IOException If I/O related errors occur
*/
public static AbstractAllGroupHeadsCollector create(String groupField, Sort sortWithinGroup) throws IOException {
return create(groupField, sortWithinGroup, DEFAULT_INITIAL_SIZE);
}
/**
* Creates an <code>AbstractAllGroupHeadsCollector</code> instance based on the supplied arguments.
* This factory method decides which implementation is best suited.
*
* @param groupField The field to group by
* @param sortWithinGroup The sort within each group
* @param initialSize The initial allocation size of the internal int set and group list which should roughly match
* the total number of expected unique groups. Be aware that the heap usage is
* 4 bytes * initialSize.
* @return an <code>AbstractAllGroupHeadsCollector</code> instance based on the supplied arguments
* @throws IOException If I/O related errors occur
*/
public static AbstractAllGroupHeadsCollector create(String groupField, Sort sortWithinGroup, int initialSize) throws IOException {
boolean sortAllScore = true;
boolean sortAllFieldValue = true;
for (SortField sortField : sortWithinGroup.getSort()) {
if (sortField.getType() == SortField.Type.SCORE) {
sortAllFieldValue = false;
} else if (needGeneralImpl(sortField)) {
return new GeneralAllGroupHeadsCollector(groupField, sortWithinGroup);
} else {
sortAllScore = false;
}
}
if (sortAllScore) {
return new ScoreAllGroupHeadsCollector(groupField, sortWithinGroup, initialSize);
} else if (sortAllFieldValue) {
return new OrdAllGroupHeadsCollector(groupField, sortWithinGroup, initialSize);
} else {
return new OrdScoreAllGroupHeadsCollector(groupField, sortWithinGroup, initialSize);
}
}
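// Illustration (not part of the original patch) of which implementation the factory above
// selects for some example sorts; the "price" field name is hypothetical:
//   new Sort(SortField.FIELD_SCORE)                           -> ScoreAllGroupHeadsCollector
//   new Sort(new SortField("price", SortField.Type.STRING))   -> OrdAllGroupHeadsCollector
//   new Sort(SortField.FIELD_SCORE,
//            new SortField("price", SortField.Type.STRING))   -> OrdScoreAllGroupHeadsCollector
//   new Sort(new SortField("price", SortField.Type.INT))      -> GeneralAllGroupHeadsCollector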
// Returns true if a sort field requires the general impl.
private static boolean needGeneralImpl(SortField sortField) {
SortField.Type sortType = sortField.getType();
// Note (MvG): We can also make an optimized impl when sorting is SortField.DOC
return sortType != SortField.Type.STRING_VAL && sortType != SortField.Type.STRING && sortType != SortField.Type.SCORE;
}
// A general impl that works for any group sort.
static class GeneralAllGroupHeadsCollector extends TermAllGroupHeadsCollector<GeneralAllGroupHeadsCollector.GroupHead> {
private final Sort sortWithinGroup;
private final Map<BytesRef, GroupHead> groups;
private Scorer scorer;
GeneralAllGroupHeadsCollector(String groupField, Sort sortWithinGroup) throws IOException {
super(groupField, sortWithinGroup.getSort().length);
this.sortWithinGroup = sortWithinGroup;
groups = new HashMap<BytesRef, GroupHead>();
final SortField[] sortFields = sortWithinGroup.getSort();
for (int i = 0; i < sortFields.length; i++) {
reversed[i] = sortFields[i].getReverse() ? -1 : 1;
}
}
protected void retrieveGroupHeadAndAddIfNotExist(int doc) throws IOException {
final int ord = groupIndex.getOrd(doc);
final BytesRef groupValue = ord == 0 ? null : groupIndex.lookup(ord, scratchBytesRef);
GroupHead groupHead = groups.get(groupValue);
if (groupHead == null) {
groupHead = new GroupHead(groupValue, sortWithinGroup, doc);
groups.put(groupValue == null ? null : new BytesRef(groupValue), groupHead);
temporalResult.stop = true;
} else {
temporalResult.stop = false;
}
temporalResult.groupHead = groupHead;
}
protected Collection<GroupHead> getCollectedGroupHeads() {
return groups.values();
}
public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException {
this.readerContext = context;
groupIndex = FieldCache.DEFAULT.getTermsIndex(context.reader, groupField);
for (GroupHead groupHead : groups.values()) {
for (int i = 0; i < groupHead.comparators.length; i++) {
groupHead.comparators[i] = groupHead.comparators[i].setNextReader(context);
}
}
}
public void setScorer(Scorer scorer) throws IOException {
this.scorer = scorer;
for (GroupHead groupHead : groups.values()) {
for (FieldComparator comparator : groupHead.comparators) {
comparator.setScorer(scorer);
}
}
}
class GroupHead extends AbstractAllGroupHeadsCollector.GroupHead<BytesRef> {
final FieldComparator[] comparators;
private GroupHead(BytesRef groupValue, Sort sort, int doc) throws IOException {
super(groupValue, doc + readerContext.docBase);
final SortField[] sortFields = sort.getSort();
comparators = new FieldComparator[sortFields.length];
for (int i = 0; i < sortFields.length; i++) {
comparators[i] = sortFields[i].getComparator(1, i).setNextReader(readerContext);
comparators[i].setScorer(scorer);
comparators[i].copy(0, doc);
comparators[i].setBottom(0);
}
}
public int compare(int compIDX, int doc) throws IOException {
return comparators[compIDX].compareBottom(doc);
}
public void updateDocHead(int doc) throws IOException {
for (FieldComparator comparator : comparators) {
comparator.copy(0, doc);
comparator.setBottom(0);
}
this.doc = doc + readerContext.docBase;
}
}
}
// AbstractAllGroupHeadsCollector optimized for ord fields and scores.
static class OrdScoreAllGroupHeadsCollector extends TermAllGroupHeadsCollector<OrdScoreAllGroupHeadsCollector.GroupHead> {
private final SentinelIntSet ordSet;
private final List<GroupHead> collectedGroups;
private final SortField[] fields;
private FieldCache.DocTermsIndex[] sortsIndex;
private Scorer scorer;
private GroupHead[] segmentGroupHeads;
OrdScoreAllGroupHeadsCollector(String groupField, Sort sortWithinGroup, int initialSize) {
super(groupField, sortWithinGroup.getSort().length);
ordSet = new SentinelIntSet(initialSize, -1);
collectedGroups = new ArrayList<GroupHead>(initialSize);
final SortField[] sortFields = sortWithinGroup.getSort();
fields = new SortField[sortFields.length];
sortsIndex = new FieldCache.DocTermsIndex[sortFields.length];
for (int i = 0; i < sortFields.length; i++) {
reversed[i] = sortFields[i].getReverse() ? -1 : 1;
fields[i] = sortFields[i];
}
}
protected Collection<GroupHead> getCollectedGroupHeads() {
return collectedGroups;
}
public void setScorer(Scorer scorer) throws IOException {
this.scorer = scorer;
}
protected void retrieveGroupHeadAndAddIfNotExist(int doc) throws IOException {
int key = groupIndex.getOrd(doc);
GroupHead groupHead;
if (!ordSet.exists(key)) {
ordSet.put(key);
BytesRef term = key == 0 ? null : groupIndex.getTerm(doc, new BytesRef());
groupHead = new GroupHead(doc, term);
collectedGroups.add(groupHead);
segmentGroupHeads[key] = groupHead;
temporalResult.stop = true;
} else {
temporalResult.stop = false;
groupHead = segmentGroupHeads[key];
}
temporalResult.groupHead = groupHead;
}
public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException {
this.readerContext = context;
groupIndex = FieldCache.DEFAULT.getTermsIndex(context.reader, groupField);
for (int i = 0; i < fields.length; i++) {
if (fields[i].getType() == SortField.Type.SCORE) {
continue;
}
sortsIndex[i] = FieldCache.DEFAULT.getTermsIndex(context.reader, fields[i].getField());
}
// Clear ordSet and fill it with previously encountered groups that can occur in the current segment.
ordSet.clear();
segmentGroupHeads = new GroupHead[groupIndex.numOrd()];
for (GroupHead collectedGroup : collectedGroups) {
int ord = groupIndex.binarySearchLookup(collectedGroup.groupValue, scratchBytesRef);
if (ord >= 0) {
ordSet.put(ord);
segmentGroupHeads[ord] = collectedGroup;
for (int i = 0; i < sortsIndex.length; i++) {
if (fields[i].getType() == SortField.Type.SCORE) {
continue;
}
collectedGroup.sortOrds[i] = sortsIndex[i].binarySearchLookup(collectedGroup.sortValues[i], scratchBytesRef);
}
}
}
}
class GroupHead extends AbstractAllGroupHeadsCollector.GroupHead<BytesRef> {
BytesRef[] sortValues;
int[] sortOrds;
float[] scores;
private GroupHead(int doc, BytesRef groupValue) throws IOException {
super(groupValue, doc + readerContext.docBase);
sortValues = new BytesRef[sortsIndex.length];
sortOrds = new int[sortsIndex.length];
scores = new float[sortsIndex.length];
for (int i = 0; i < sortsIndex.length; i++) {
if (fields[i].getType() == SortField.Type.SCORE) {
scores[i] = scorer.score();
} else {
sortValues[i] = sortsIndex[i].getTerm(doc, new BytesRef());
sortOrds[i] = sortsIndex[i].getOrd(doc);
}
}
}
public int compare(int compIDX, int doc) throws IOException {
if (fields[compIDX].getType() == SortField.Type.SCORE) {
float score = scorer.score();
if (scores[compIDX] < score) {
return 1;
} else if (scores[compIDX] > score) {
return -1;
}
return 0;
} else {
if (sortOrds[compIDX] < 0) {
// The current segment doesn't contain the sort value we encountered before. Therefore the ord is negative.
return sortValues[compIDX].compareTo(sortsIndex[compIDX].getTerm(doc, scratchBytesRef));
} else {
return sortOrds[compIDX] - sortsIndex[compIDX].getOrd(doc);
}
}
}
public void updateDocHead(int doc) throws IOException {
for (int i = 0; i < sortsIndex.length; i++) {
if (fields[i].getType() == SortField.Type.SCORE) {
scores[i] = scorer.score();
} else {
sortValues[i] = sortsIndex[i].getTerm(doc, sortValues[i]);
sortOrds[i] = sortsIndex[i].getOrd(doc);
}
}
this.doc = doc + readerContext.docBase;
}
}
}
// AbstractAllGroupHeadsCollector optimized for ord fields.
static class OrdAllGroupHeadsCollector extends TermAllGroupHeadsCollector<OrdAllGroupHeadsCollector.GroupHead> {
private final SentinelIntSet ordSet;
private final List<GroupHead> collectedGroups;
private final SortField[] fields;
private FieldCache.DocTermsIndex[] sortsIndex;
private GroupHead[] segmentGroupHeads;
OrdAllGroupHeadsCollector(String groupField, Sort sortWithinGroup, int initialSize) {
super(groupField, sortWithinGroup.getSort().length);
ordSet = new SentinelIntSet(initialSize, -1);
collectedGroups = new ArrayList<GroupHead>(initialSize);
final SortField[] sortFields = sortWithinGroup.getSort();
fields = new SortField[sortFields.length];
sortsIndex = new FieldCache.DocTermsIndex[sortFields.length];
for (int i = 0; i < sortFields.length; i++) {
reversed[i] = sortFields[i].getReverse() ? -1 : 1;
fields[i] = sortFields[i];
}
}
protected Collection<GroupHead> getCollectedGroupHeads() {
return collectedGroups;
}
public void setScorer(Scorer scorer) throws IOException {
}
protected void retrieveGroupHeadAndAddIfNotExist(int doc) throws IOException {
int key = groupIndex.getOrd(doc);
GroupHead groupHead;
if (!ordSet.exists(key)) {
ordSet.put(key);
BytesRef term = key == 0 ? null : groupIndex.getTerm(doc, new BytesRef());
groupHead = new GroupHead(doc, term);
collectedGroups.add(groupHead);
segmentGroupHeads[key] = groupHead;
temporalResult.stop = true;
} else {
temporalResult.stop = false;
groupHead = segmentGroupHeads[key];
}
temporalResult.groupHead = groupHead;
}
public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException {
this.readerContext = context;
groupIndex = FieldCache.DEFAULT.getTermsIndex(context.reader, groupField);
for (int i = 0; i < fields.length; i++) {
sortsIndex[i] = FieldCache.DEFAULT.getTermsIndex(context.reader, fields[i].getField());
}
// Clear ordSet and fill it with previously encountered groups that can occur in the current segment.
ordSet.clear();
segmentGroupHeads = new GroupHead[groupIndex.numOrd()];
for (GroupHead collectedGroup : collectedGroups) {
int groupOrd = groupIndex.binarySearchLookup(collectedGroup.groupValue, scratchBytesRef);
if (groupOrd >= 0) {
ordSet.put(groupOrd);
segmentGroupHeads[groupOrd] = collectedGroup;
for (int i = 0; i < sortsIndex.length; i++) {
collectedGroup.sortOrds[i] = sortsIndex[i].binarySearchLookup(collectedGroup.sortValues[i], scratchBytesRef);
}
}
}
}
class GroupHead extends AbstractAllGroupHeadsCollector.GroupHead<BytesRef> {
BytesRef[] sortValues;
int[] sortOrds;
private GroupHead(int doc, BytesRef groupValue) throws IOException {
super(groupValue, doc + readerContext.docBase);
sortValues = new BytesRef[sortsIndex.length];
sortOrds = new int[sortsIndex.length];
for (int i = 0; i < sortsIndex.length; i++) {
sortValues[i] = sortsIndex[i].getTerm(doc, new BytesRef());
sortOrds[i] = sortsIndex[i].getOrd(doc);
}
}
public int compare(int compIDX, int doc) throws IOException {
if (sortOrds[compIDX] < 0) {
// The current segment doesn't contain the sort value we encountered before. Therefore the ord is negative.
return sortValues[compIDX].compareTo(sortsIndex[compIDX].getTerm(doc, scratchBytesRef));
} else {
return sortOrds[compIDX] - sortsIndex[compIDX].getOrd(doc);
}
}
public void updateDocHead(int doc) throws IOException {
for (int i = 0; i < sortsIndex.length; i++) {
sortValues[i] = sortsIndex[i].getTerm(doc, sortValues[i]);
sortOrds[i] = sortsIndex[i].getOrd(doc);
}
this.doc = doc + readerContext.docBase;
}
}
}
// AbstractAllGroupHeadsCollector optimized for scores.
static class ScoreAllGroupHeadsCollector extends TermAllGroupHeadsCollector<ScoreAllGroupHeadsCollector.GroupHead> {
private final SentinelIntSet ordSet;
private final List<GroupHead> collectedGroups;
private final SortField[] fields;
private Scorer scorer;
private GroupHead[] segmentGroupHeads;
ScoreAllGroupHeadsCollector(String groupField, Sort sortWithinGroup, int initialSize) {
super(groupField, sortWithinGroup.getSort().length);
ordSet = new SentinelIntSet(initialSize, -1);
collectedGroups = new ArrayList<GroupHead>(initialSize);
final SortField[] sortFields = sortWithinGroup.getSort();
fields = new SortField[sortFields.length];
for (int i = 0; i < sortFields.length; i++) {
reversed[i] = sortFields[i].getReverse() ? -1 : 1;
fields[i] = sortFields[i];
}
}
protected Collection<GroupHead> getCollectedGroupHeads() {
return collectedGroups;
}
public void setScorer(Scorer scorer) throws IOException {
this.scorer = scorer;
}
protected void retrieveGroupHeadAndAddIfNotExist(int doc) throws IOException {
int key = groupIndex.getOrd(doc);
GroupHead groupHead;
if (!ordSet.exists(key)) {
ordSet.put(key);
BytesRef term = key == 0 ? null : groupIndex.getTerm(doc, new BytesRef());
groupHead = new GroupHead(doc, term);
collectedGroups.add(groupHead);
segmentGroupHeads[key] = groupHead;
temporalResult.stop = true;
} else {
temporalResult.stop = false;
groupHead = segmentGroupHeads[key];
}
temporalResult.groupHead = groupHead;
}
public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException {
this.readerContext = context;
groupIndex = FieldCache.DEFAULT.getTermsIndex(context.reader, groupField);
// Clear ordSet and fill it with previously encountered groups that can occur in the current segment.
ordSet.clear();
segmentGroupHeads = new GroupHead[groupIndex.numOrd()];
for (GroupHead collectedGroup : collectedGroups) {
int ord = groupIndex.binarySearchLookup(collectedGroup.groupValue, scratchBytesRef);
if (ord >= 0) {
ordSet.put(ord);
segmentGroupHeads[ord] = collectedGroup;
}
}
}
class GroupHead extends AbstractAllGroupHeadsCollector.GroupHead<BytesRef> {
float[] scores;
private GroupHead(int doc, BytesRef groupValue) throws IOException {
super(groupValue, doc + readerContext.docBase);
scores = new float[fields.length];
float score = scorer.score();
for (int i = 0; i < scores.length; i++) {
scores[i] = score;
}
}
public int compare(int compIDX, int doc) throws IOException {
float score = scorer.score();
if (scores[compIDX] < score) {
return 1;
} else if (scores[compIDX] > score) {
return -1;
}
return 0;
}
public void updateDocHead(int doc) throws IOException {
float score = scorer.score();
for (int i = 0; i < scores.length; i++) {
scores[i] = score;
}
this.doc = doc + readerContext.docBase;
}
}
}
}
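When the number of unique groups is expected to be large, the three-argument factory overload can size the internal SentinelIntSet and group list up front and avoid rehashing; a sketch, assuming a hypothetical "category" field with roughly fifty thousand unique values:

Sort sortWithinGroup = new Sort(new SortField("price", SortField.Type.STRING));
AbstractAllGroupHeadsCollector collector =
    TermAllGroupHeadsCollector.create("category", sortWithinGroup, 50000);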


@ -164,5 +164,20 @@ will be <code>null</code>, so if you need to present this value you'll
have to separately retrieve it (for example using stored
fields, <code>FieldCache</code>, etc.).
<p>Another collector is the <code>TermAllGroupHeadsCollector</code> that can be used to retrieve the most
relevant document per group, also known as the group heads. This can be useful in situations when one wants
to compute group based facets / statistics on the complete query result. The collector can be executed
during the first or second phase.</p>
<pre class="prettyprint">
AbstractAllGroupHeadsCollector c = TermAllGroupHeadsCollector.create(groupField, sortWithinGroup);
s.search(new TermQuery(new Term("content", searchTerm)), c);
// Return all group heads as an int array:
int[] groupHeadsArray = c.retrieveGroupHeads();
// Return all group heads as a FixedBitSet:
int maxDoc = s.maxDoc();
FixedBitSet groupHeadsBitSet = c.retrieveGroupHeads(maxDoc);
</pre>
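<p>Note that the doc ids returned by <code>retrieveGroupHeads</code> are top level doc ids: the
collectors add the segment's <code>docBase</code> when recording a group head, so the
<code>FixedBitSet</code> variant must be sized to the top level reader's <code>maxDoc</code>.</p>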
</body>
</html>


@ -0,0 +1,492 @@
package org.apache.lucene.search.grouping;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import java.io.IOException;
import java.util.*;
public class TermAllGroupHeadsCollectorTest extends LuceneTestCase {
public void testBasic() throws Exception {
final String groupField = "author";
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(
random,
dir,
newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
// 0
Document doc = new Document();
doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("content", "random text", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
w.addDocument(doc);
// 1
doc = new Document();
doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("content", "some more random text blob", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "2", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
w.addDocument(doc);
// 2
doc = new Document();
doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("content", "some more random textual data", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "3", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
w.addDocument(doc);
w.commit(); // To ensure a second segment
// 3
doc = new Document();
doc.add(new Field(groupField, "author2", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("content", "some random text", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "4", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
w.addDocument(doc);
// 4
doc = new Document();
doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("content", "some more random text", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "5", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
w.addDocument(doc);
// 5
doc = new Document();
doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("content", "random blob", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
w.addDocument(doc);
// 6 -- no author field
doc = new Document();
doc.add(new Field("content", "random word stuck in alot of other text", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
w.addDocument(doc);
// 7 -- no author field
doc = new Document();
doc.add(new Field("content", "random word stuck in alot of other text", Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("id", "7", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
w.addDocument(doc);
IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
w.close();
int maxDoc = indexSearcher.maxDoc();
Sort sortWithinGroup = new Sort(new SortField("id", SortField.Type.INT, true));
AbstractAllGroupHeadsCollector c1 = TermAllGroupHeadsCollector.create(groupField, sortWithinGroup);
indexSearcher.search(new TermQuery(new Term("content", "random")), c1);
assertTrue(arrayContains(new int[]{2, 3, 5, 7}, c1.retrieveGroupHeads()));
assertTrue(openBitSetContains(new int[]{2, 3, 5, 7}, c1.retrieveGroupHeads(maxDoc), maxDoc));
AbstractAllGroupHeadsCollector c2 = TermAllGroupHeadsCollector.create(groupField, sortWithinGroup);
indexSearcher.search(new TermQuery(new Term("content", "some")), c2);
assertTrue(arrayContains(new int[]{2, 3, 4}, c2.retrieveGroupHeads()));
assertTrue(openBitSetContains(new int[]{2, 3, 4}, c2.retrieveGroupHeads(maxDoc), maxDoc));
AbstractAllGroupHeadsCollector c3 = TermAllGroupHeadsCollector.create(groupField, sortWithinGroup);
indexSearcher.search(new TermQuery(new Term("content", "blob")), c3);
assertTrue(arrayContains(new int[]{1, 5}, c3.retrieveGroupHeads()));
assertTrue(openBitSetContains(new int[]{1, 5}, c3.retrieveGroupHeads(maxDoc), maxDoc));
// STRING sort type triggers a different implementation
Sort sortWithinGroup2 = new Sort(new SortField("id", SortField.Type.STRING, true));
AbstractAllGroupHeadsCollector c4 = TermAllGroupHeadsCollector.create(groupField, sortWithinGroup2);
indexSearcher.search(new TermQuery(new Term("content", "random")), c4);
assertTrue(arrayContains(new int[]{2, 3, 5, 7}, c4.retrieveGroupHeads()));
assertTrue(openBitSetContains(new int[]{2, 3, 5, 7}, c4.retrieveGroupHeads(maxDoc), maxDoc));
Sort sortWithinGroup3 = new Sort(new SortField("id", SortField.Type.STRING, false));
AbstractAllGroupHeadsCollector c5 = TermAllGroupHeadsCollector.create(groupField, sortWithinGroup3);
indexSearcher.search(new TermQuery(new Term("content", "random")), c5);
// 7 b/c higher doc id wins, even if the order of the field is not in reverse.
assertTrue(arrayContains(new int[]{0, 3, 4, 6}, c5.retrieveGroupHeads()));
assertTrue(openBitSetContains(new int[]{0, 3, 4, 6}, c5.retrieveGroupHeads(maxDoc), maxDoc));
indexSearcher.getIndexReader().close();
dir.close();
}
public void testRandom() throws Exception {
int numberOfRuns = _TestUtil.nextInt(random, 3, 6);
for (int iter = 0; iter < numberOfRuns; iter++) {
if (VERBOSE) {
System.out.println(String.format("TEST: iter=%d total=%d", iter, numberOfRuns));
}
final int numDocs = _TestUtil.nextInt(random, 100, 1000) * RANDOM_MULTIPLIER;
final int numGroups = _TestUtil.nextInt(random, 1, numDocs);
if (VERBOSE) {
System.out.println("TEST: numDocs=" + numDocs + " numGroups=" + numGroups);
}
final List<BytesRef> groups = new ArrayList<BytesRef>();
for (int i = 0; i < numGroups; i++) {
groups.add(new BytesRef(_TestUtil.randomRealisticUnicodeString(random)));
}
final String[] contentStrings = new String[_TestUtil.nextInt(random, 2, 20)];
if (VERBOSE) {
System.out.println("TEST: create fake content");
}
for (int contentIDX = 0; contentIDX < contentStrings.length; contentIDX++) {
final StringBuilder sb = new StringBuilder();
sb.append("real").append(random.nextInt(3)).append(' ');
final int fakeCount = random.nextInt(10);
for (int fakeIDX = 0; fakeIDX < fakeCount; fakeIDX++) {
sb.append("fake ");
}
contentStrings[contentIDX] = sb.toString();
if (VERBOSE) {
System.out.println(" content=" + sb.toString());
}
}
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(
random,
dir,
newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random)));
Document doc = new Document();
Document docNoGroup = new Document();
Field group = newField("group", "", Field.Index.NOT_ANALYZED);
doc.add(group);
Field sort1 = newField("sort1", "", Field.Index.NOT_ANALYZED);
doc.add(sort1);
docNoGroup.add(sort1);
Field sort2 = newField("sort2", "", Field.Index.NOT_ANALYZED);
doc.add(sort2);
docNoGroup.add(sort2);
Field sort3 = newField("sort3", "", Field.Index.NOT_ANALYZED);
doc.add(sort3);
docNoGroup.add(sort3);
Field content = newField("content", "", Field.Index.ANALYZED);
doc.add(content);
docNoGroup.add(content);
NumericField id = new NumericField("id");
doc.add(id);
docNoGroup.add(id);
final GroupDoc[] groupDocs = new GroupDoc[numDocs];
for (int i = 0; i < numDocs; i++) {
final BytesRef groupValue;
if (random.nextInt(24) == 17) {
// So we test the "doc doesn't have the group'd
// field" case:
groupValue = null;
} else {
groupValue = groups.get(random.nextInt(groups.size()));
}
final GroupDoc groupDoc = new GroupDoc(
i,
groupValue,
groups.get(random.nextInt(groups.size())),
groups.get(random.nextInt(groups.size())),
new BytesRef(String.format("%05d", i)),
contentStrings[random.nextInt(contentStrings.length)]
);
if (VERBOSE) {
System.out.println(" doc content=" + groupDoc.content + " id=" + i + " group=" + (groupDoc.group == null ? "null" : groupDoc.group.utf8ToString()) + " sort1=" + groupDoc.sort1.utf8ToString() + " sort2=" + groupDoc.sort2.utf8ToString() + " sort3=" + groupDoc.sort3.utf8ToString());
}
groupDocs[i] = groupDoc;
if (groupDoc.group != null) {
group.setValue(groupDoc.group.utf8ToString());
}
sort1.setValue(groupDoc.sort1.utf8ToString());
sort2.setValue(groupDoc.sort2.utf8ToString());
sort3.setValue(groupDoc.sort3.utf8ToString());
content.setValue(groupDoc.content);
id.setIntValue(groupDoc.id);
if (groupDoc.group == null) {
w.addDocument(docNoGroup);
} else {
w.addDocument(doc);
}
}
final IndexReader r = w.getReader();
w.close();
// NOTE: intentional but temporary field cache insanity!
final int[] docIdToFieldId = FieldCache.DEFAULT.getInts(r, "id");
final int[] fieldIdToDocID = new int[numDocs];
for (int i = 0; i < docIdToFieldId.length; i++) {
int fieldId = docIdToFieldId[i];
fieldIdToDocID[fieldId] = i;
}
try {
final IndexSearcher s = newSearcher(r);
for (int contentID = 0; contentID < 3; contentID++) {
final ScoreDoc[] hits = s.search(new TermQuery(new Term("content", "real" + contentID)), numDocs).scoreDocs;
for (ScoreDoc hit : hits) {
final GroupDoc gd = groupDocs[docIdToFieldId[hit.doc]];
assertTrue(gd.score == 0.0);
gd.score = hit.score;
int docId = gd.id;
assertEquals(docId, docIdToFieldId[hit.doc]);
}
}
for (GroupDoc gd : groupDocs) {
assertTrue(gd.score != 0.0);
}
for (int searchIter = 0; searchIter < 100; searchIter++) {
if (VERBOSE) {
System.out.println("TEST: searchIter=" + searchIter);
}
final String searchTerm = "real" + random.nextInt(3);
boolean sortByScoreOnly = random.nextBoolean();
Sort sortWithinGroup = getRandomSort(sortByScoreOnly);
AbstractAllGroupHeadsCollector allGroupHeadsCollector = TermAllGroupHeadsCollector.create("group", sortWithinGroup);
s.search(new TermQuery(new Term("content", searchTerm)), allGroupHeadsCollector);
int[] expectedGroupHeads = createExpectedGroupHeads(searchTerm, groupDocs, sortWithinGroup, sortByScoreOnly, fieldIdToDocID);
int[] actualGroupHeads = allGroupHeadsCollector.retrieveGroupHeads();
// The actual group heads contain Lucene doc ids. Map them back to our own id values.
for (int i = 0; i < actualGroupHeads.length; i++) {
actualGroupHeads[i] = docIdToFieldId[actualGroupHeads[i]];
}
// Sorting allows us to easily iterate over and compare the actual and expected results.
Arrays.sort(expectedGroupHeads);
Arrays.sort(actualGroupHeads);
if (VERBOSE) {
System.out.println("Collector: " + allGroupHeadsCollector.getClass().getSimpleName());
System.out.println("Sort within group: " + sortWithinGroup);
System.out.println("Num group: " + numGroups);
System.out.println("Num doc: " + numDocs);
System.out.println("\n=== Expected: \n");
for (int expectedDocId : expectedGroupHeads) {
GroupDoc expectedGroupDoc = groupDocs[expectedDocId];
String expectedGroup = expectedGroupDoc.group == null ? null : expectedGroupDoc.group.utf8ToString();
System.out.println(
String.format(
"Group:%10s score%5f Sort1:%10s Sort2:%10s Sort3:%10s doc:%5d",
expectedGroup, expectedGroupDoc.score, expectedGroupDoc.sort1.utf8ToString(),
expectedGroupDoc.sort2.utf8ToString(), expectedGroupDoc.sort3.utf8ToString(), expectedDocId
)
);
}
System.out.println("\n=== Actual: \n");
for (int actualDocId : actualGroupHeads) {
GroupDoc actualGroupDoc = groupDocs[actualDocId];
String actualGroup = actualGroupDoc.group == null ? null : actualGroupDoc.group.utf8ToString();
System.out.println(
String.format(
"Group:%10s score%5f Sort1:%10s Sort2:%10s Sort3:%10s doc:%5d",
actualGroup, actualGroupDoc.score, actualGroupDoc.sort1.utf8ToString(),
actualGroupDoc.sort2.utf8ToString(), actualGroupDoc.sort3.utf8ToString(), actualDocId
)
);
}
System.out.println("\n===================================================================================");
}
assertEquals(expectedGroupHeads.length, actualGroupHeads.length);
for (int i = 0; i < expectedGroupHeads.length; i++) {
assertEquals(expectedGroupHeads[i], actualGroupHeads[i]);
}
}
s.close();
} finally {
FieldCache.DEFAULT.purge(r);
}
r.close();
dir.close();
}
}
private boolean arrayContains(int[] expected, int[] actual) {
if (expected.length != actual.length) {
return false;
}
for (int e : expected) {
boolean found = false;
for (int a : actual) {
if (e == a) {
found = true;
break;
}
}
if (!found) {
return false;
}
}
return true;
}
private boolean openBitSetContains(int[] expectedDocs, FixedBitSet actual, int maxDoc) throws IOException {
if (expectedDocs.length != actual.cardinality()) {
return false;
}
FixedBitSet expected = new FixedBitSet(maxDoc);
for (int expectedDoc : expectedDocs) {
expected.set(expectedDoc);
}
int docId;
DocIdSetIterator iterator = expected.iterator();
while ((docId = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (!actual.get(docId)) {
return false;
}
}
return true;
}
private int[] createExpectedGroupHeads(String searchTerm, GroupDoc[] groupDocs, Sort docSort, boolean sortByScoreOnly, int[] fieldIdToDocID) throws IOException {
Map<BytesRef, List<GroupDoc>> groupHeads = new HashMap<BytesRef, List<GroupDoc>>();
for (GroupDoc groupDoc : groupDocs) {
if (!groupDoc.content.startsWith(searchTerm)) {
continue;
}
if (!groupHeads.containsKey(groupDoc.group)) {
List<GroupDoc> list = new ArrayList<GroupDoc>();
list.add(groupDoc);
groupHeads.put(groupDoc.group, list);
continue;
}
groupHeads.get(groupDoc.group).add(groupDoc);
}
int[] allGroupHeads = new int[groupHeads.size()];
int i = 0;
for (BytesRef groupValue : groupHeads.keySet()) {
List<GroupDoc> docs = groupHeads.get(groupValue);
Collections.sort(docs, getComparator(docSort, sortByScoreOnly, fieldIdToDocID));
allGroupHeads[i++] = docs.get(0).id;
}
return allGroupHeads;
}
private Sort getRandomSort(boolean scoreOnly) {
final List<SortField> sortFields = new ArrayList<SortField>();
if (random.nextInt(7) == 2 || scoreOnly) {
sortFields.add(SortField.FIELD_SCORE);
} else {
if (random.nextBoolean()) {
if (random.nextBoolean()) {
sortFields.add(new SortField("sort1", SortField.Type.STRING, random.nextBoolean()));
} else {
sortFields.add(new SortField("sort2", SortField.Type.STRING, random.nextBoolean()));
}
} else if (random.nextBoolean()) {
sortFields.add(new SortField("sort1", SortField.Type.STRING, random.nextBoolean()));
sortFields.add(new SortField("sort2", SortField.Type.STRING, random.nextBoolean()));
}
}
// Break ties:
if (random.nextBoolean() && !scoreOnly) {
sortFields.add(new SortField("sort3", SortField.Type.STRING));
} else if (!scoreOnly) {
sortFields.add(new SortField("id", SortField.Type.INT));
}
return new Sort(sortFields.toArray(new SortField[sortFields.size()]));
}
private Comparator<GroupDoc> getComparator(Sort sort, final boolean sortByScoreOnly, final int[] fieldIdToDocID) {
final SortField[] sortFields = sort.getSort();
return new Comparator<GroupDoc>() {
@Override
public int compare(GroupDoc d1, GroupDoc d2) {
for (SortField sf : sortFields) {
final int cmp;
if (sf.getType() == SortField.Type.SCORE) {
if (d1.score > d2.score) {
cmp = -1;
} else if (d1.score < d2.score) {
cmp = 1;
} else {
cmp = sortByScoreOnly ? fieldIdToDocID[d1.id] - fieldIdToDocID[d2.id] : 0;
}
} else if (sf.getField().equals("sort1")) {
cmp = d1.sort1.compareTo(d2.sort1);
} else if (sf.getField().equals("sort2")) {
cmp = d1.sort2.compareTo(d2.sort2);
} else if (sf.getField().equals("sort3")) {
cmp = d1.sort3.compareTo(d2.sort3);
} else {
assertEquals(sf.getField(), "id");
cmp = d1.id - d2.id;
}
if (cmp != 0) {
return sf.getReverse() ? -cmp : cmp;
}
}
// Our sort always fully tie breaks:
fail();
return 0;
}
};
}
private static class GroupDoc {
final int id;
final BytesRef group;
final BytesRef sort1;
final BytesRef sort2;
final BytesRef sort3;
// content must be "realN ..."
final String content;
float score;
public GroupDoc(int id, BytesRef group, BytesRef sort1, BytesRef sort2, BytesRef sort3, String content) {
this.id = id;
this.group = group;
this.sort1 = sort1;
this.sort2 = sort2;
this.sort3 = sort3;
this.content = content;
}
}
}