mirror of https://github.com/apache/lucene.git
LUCENE-3802: Support for grouped faceting.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1298144 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a57f29a367
commit
64f0ebe2b6
|
@ -72,6 +72,8 @@ New Features
|
||||||
start/endOffset, if offsets are indexed. (Alan Woodward via Mike
|
start/endOffset, if offsets are indexed. (Alan Woodward via Mike
|
||||||
McCandless)
|
McCandless)
|
||||||
|
|
||||||
|
* LUCENE-3802: Support for grouped faceting. (Martijn van Groningen)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
* LUCENE-2606: Changed RegexCapabilities interface to fix thread
|
* LUCENE-2606: Changed RegexCapabilities interface to fix thread
|
||||||
|
|
|
@ -216,6 +216,13 @@ public class DocTermOrds {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return The number of terms in this field
|
||||||
|
*/
|
||||||
|
public int numTerms() {
|
||||||
|
return numTermsInField;
|
||||||
|
}
|
||||||
|
|
||||||
/** Subclass can override this */
|
/** Subclass can override this */
|
||||||
protected void visitTerm(TermsEnum te, int termNum) throws IOException {
|
protected void visitTerm(TermsEnum te, int termNum) throws IOException {
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,224 @@
|
||||||
|
package org.apache.lucene.search.grouping;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.search.Collector;
|
||||||
|
import org.apache.lucene.search.Scorer;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Base class for computing grouped facets.
|
||||||
|
*
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
public abstract class AbstractGroupFacetCollector extends Collector {
|
||||||
|
|
||||||
|
protected final String groupField;
|
||||||
|
protected final String facetField;
|
||||||
|
protected final BytesRef facetPrefix;
|
||||||
|
|
||||||
|
protected AbstractGroupFacetCollector(String groupField, String facetField, BytesRef facetPrefix) {
|
||||||
|
this.groupField = groupField;
|
||||||
|
this.facetField = facetField;
|
||||||
|
this.facetPrefix = facetPrefix;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns grouped facet results that were computed over zero or more segments.
|
||||||
|
* Grouped facet counts are merged from zero or more segment results.
|
||||||
|
*
|
||||||
|
* @param size The total number of facets to include. This is typically offset + limit
|
||||||
|
* @param minCount The minimum count a facet entry should have to be included in the grouped facet result
|
||||||
|
* @param orderByCount Whether to sort the facet entries by facet entry count. If <code>false</code> then the facets
|
||||||
|
* are sorted lexicographically in ascending order.
|
||||||
|
* @return grouped facet results
|
||||||
|
* @throws IOException If I/O related errors occur during merging segment grouped facet counts.
|
||||||
|
*/
|
||||||
|
public abstract GroupedFacetResult mergeSegmentResults(int size, int minCount, boolean orderByCount) throws IOException;
|
||||||
|
|
||||||
|
public void setScorer(Scorer scorer) throws IOException {
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean acceptsDocsOutOfOrder() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The grouped facet result. Containing grouped facet entries, total count and total missing count.
|
||||||
|
*/
|
||||||
|
public static class GroupedFacetResult {
|
||||||
|
|
||||||
|
private final static Comparator<FacetEntry> orderByCountAndValue = new Comparator<FacetEntry>() {
|
||||||
|
|
||||||
|
public int compare(FacetEntry a, FacetEntry b) {
|
||||||
|
int cmp = b.count - a.count; // Highest count first!
|
||||||
|
if (cmp != 0) {
|
||||||
|
return cmp;
|
||||||
|
}
|
||||||
|
return a.value.compareTo(b.value);
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
private final static Comparator<FacetEntry> orderByValue = new Comparator<FacetEntry>() {
|
||||||
|
|
||||||
|
public int compare(FacetEntry a, FacetEntry b) {
|
||||||
|
return a.value.compareTo(b.value);
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
private final int maxSize;
|
||||||
|
private final NavigableSet<FacetEntry> facetEntries;
|
||||||
|
private final int totalMissingCount;
|
||||||
|
private final int totalCount;
|
||||||
|
|
||||||
|
private int currentMin;
|
||||||
|
|
||||||
|
public GroupedFacetResult(int size, int minCount, boolean orderByCount, int totalCount, int totalMissingCount) {
|
||||||
|
this.facetEntries = new TreeSet<FacetEntry>(orderByCount ? orderByCountAndValue : orderByValue);
|
||||||
|
this.totalMissingCount = totalMissingCount;
|
||||||
|
this.totalCount = totalCount;
|
||||||
|
maxSize = size;
|
||||||
|
currentMin = minCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addFacetCount(BytesRef facetValue, int count) {
|
||||||
|
if (count < currentMin) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
FacetEntry facetEntry = new FacetEntry(facetValue, count);
|
||||||
|
if (facetEntries.size() == maxSize) {
|
||||||
|
if (facetEntries.higher(facetEntry) == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
facetEntries.pollLast();
|
||||||
|
}
|
||||||
|
facetEntries.add(facetEntry);
|
||||||
|
|
||||||
|
if (facetEntries.size() == maxSize) {
|
||||||
|
currentMin = facetEntries.last().count;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a list of facet entries to be rendered based on the specified offset and limit.
|
||||||
|
* The facet entries are retrieved from the facet entries collected during merging.
|
||||||
|
*
|
||||||
|
* @param offset The offset in the collected facet entries during merging
|
||||||
|
* @param limit The number of facets to return starting from the offset.
|
||||||
|
* @return a list of facet entries to be rendered based on the specified offset and limit
|
||||||
|
*/
|
||||||
|
public List<FacetEntry> getFacetEntries(int offset, int limit) {
|
||||||
|
List<FacetEntry> entries = new LinkedList<FacetEntry>();
|
||||||
|
limit += offset;
|
||||||
|
|
||||||
|
int i = 0;
|
||||||
|
for (FacetEntry facetEntry : facetEntries) {
|
||||||
|
if (i < offset) {
|
||||||
|
i++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (i++ >= limit) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
entries.add(facetEntry);
|
||||||
|
}
|
||||||
|
return entries;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the sum of all facet entries counts.
|
||||||
|
*
|
||||||
|
* @return the sum of all facet entries counts
|
||||||
|
*/
|
||||||
|
public int getTotalCount() {
|
||||||
|
return totalCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of groups that didn't have a facet value.
|
||||||
|
*
|
||||||
|
* @return the number of groups that didn't have a facet value
|
||||||
|
*/
|
||||||
|
public int getTotalMissingCount() {
|
||||||
|
return totalMissingCount;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Represents a facet entry with a value and a count.
|
||||||
|
*/
|
||||||
|
public static class FacetEntry {
|
||||||
|
|
||||||
|
private final BytesRef value;
|
||||||
|
private final int count;
|
||||||
|
|
||||||
|
public FacetEntry(BytesRef value, int count) {
|
||||||
|
this.value = value;
|
||||||
|
this.count = count;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
if (this == o) return true;
|
||||||
|
if (o == null || getClass() != o.getClass()) return false;
|
||||||
|
|
||||||
|
FacetEntry that = (FacetEntry) o;
|
||||||
|
|
||||||
|
if (count != that.count) return false;
|
||||||
|
if (!value.equals(that.value)) return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
int result = value.hashCode();
|
||||||
|
result = 31 * result + count;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "FacetEntry{" +
|
||||||
|
"value=" + value.utf8ToString() +
|
||||||
|
", count=" + count +
|
||||||
|
'}';
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return The value of this facet entry
|
||||||
|
*/
|
||||||
|
public BytesRef getValue() {
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return The count (number of groups) of this facet entry.
|
||||||
|
*/
|
||||||
|
public int getCount() {
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,391 @@
|
||||||
|
package org.apache.lucene.search.grouping.term;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.index.AtomicReaderContext;
|
||||||
|
import org.apache.lucene.index.DocTermOrds;
|
||||||
|
import org.apache.lucene.index.TermsEnum;
|
||||||
|
import org.apache.lucene.search.FieldCache;
|
||||||
|
import org.apache.lucene.search.grouping.AbstractGroupFacetCollector;
|
||||||
|
import org.apache.lucene.util.*;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An implementation of {@link AbstractGroupFacetCollector} that computes grouped facets based on the indexed terms
|
||||||
|
* from the {@link FieldCache}.
|
||||||
|
*
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
public abstract class TermGroupFacetCollector extends AbstractGroupFacetCollector {
|
||||||
|
|
||||||
|
final List<GroupedFacetHit> groupedFacetHits;
|
||||||
|
final SentinelIntSet segmentGroupedFacetHits;
|
||||||
|
final List<SegmentResult> segmentResults;
|
||||||
|
final BytesRef spare = new BytesRef();
|
||||||
|
|
||||||
|
FieldCache.DocTermsIndex groupFieldTermsIndex;
|
||||||
|
int[] segmentFacetCounts;
|
||||||
|
int segmentTotalCount;
|
||||||
|
int startFacetOrd;
|
||||||
|
int endFacetOrd;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory method for creating the right implementation based on the fact whether the facet field contains
|
||||||
|
* multiple tokens per documents.
|
||||||
|
*
|
||||||
|
* @param groupField The group field
|
||||||
|
* @param facetField The facet field
|
||||||
|
* @param facetFieldMultivalued Whether the facet field has multiple tokens per document
|
||||||
|
* @param facetPrefix The facet prefix a facet entry should start with to be included.
|
||||||
|
* @param initialSize The initial allocation size of the internal int set and group facet list which should roughly
|
||||||
|
* match the total number of expected unique groups. Be aware that the heap usage is
|
||||||
|
* 4 bytes * initialSize.
|
||||||
|
* @return <code>TermGroupFacetCollector</code> implementation
|
||||||
|
*/
|
||||||
|
public static TermGroupFacetCollector createTermGroupFacetCollector(String groupField,
|
||||||
|
String facetField,
|
||||||
|
boolean facetFieldMultivalued,
|
||||||
|
BytesRef facetPrefix,
|
||||||
|
int initialSize) {
|
||||||
|
if (facetFieldMultivalued) {
|
||||||
|
return new MV(groupField, facetField, facetPrefix, initialSize);
|
||||||
|
} else {
|
||||||
|
return new SV(groupField, facetField, facetPrefix, initialSize);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TermGroupFacetCollector(String groupField, String facetField, BytesRef facetPrefix, int initialSize) {
|
||||||
|
super(groupField, facetField, facetPrefix);
|
||||||
|
groupedFacetHits = new ArrayList<GroupedFacetHit>(initialSize);
|
||||||
|
segmentGroupedFacetHits = new SentinelIntSet(initialSize, -1);
|
||||||
|
segmentResults = new ArrayList<SegmentResult>();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@inheritDoc}
|
||||||
|
*/
|
||||||
|
public GroupedFacetResult mergeSegmentResults(int size, int minCount, boolean orderByCount) throws IOException {
|
||||||
|
if (segmentFacetCounts != null) {
|
||||||
|
segmentResults.add(createSegmentResult());
|
||||||
|
segmentFacetCounts = null; // reset
|
||||||
|
}
|
||||||
|
|
||||||
|
int totalCount = 0;
|
||||||
|
int missingCount = 0;
|
||||||
|
SegmentResultPriorityQueue segments = new SegmentResultPriorityQueue(segmentResults.size());
|
||||||
|
for (SegmentResult segmentResult : segmentResults) {
|
||||||
|
missingCount += segmentResult.missing;
|
||||||
|
if (segmentResult.mergePos >= segmentResult.maxTermPos) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
totalCount += segmentResult.total;
|
||||||
|
segmentResult.initializeForMerge();
|
||||||
|
segments.add(segmentResult);
|
||||||
|
}
|
||||||
|
|
||||||
|
GroupedFacetResult facetResult = new GroupedFacetResult(size, minCount, orderByCount, totalCount, missingCount);
|
||||||
|
while (segments.size() > 0) {
|
||||||
|
SegmentResult segmentResult = segments.top();
|
||||||
|
BytesRef currentFacetValue = BytesRef.deepCopyOf(segmentResult.mergeTerm);
|
||||||
|
int count = 0;
|
||||||
|
|
||||||
|
do {
|
||||||
|
count += segmentResult.counts[segmentResult.mergePos++];
|
||||||
|
if (segmentResult.mergePos < segmentResult.maxTermPos) {
|
||||||
|
segmentResult.nextTerm();
|
||||||
|
segmentResult = segments.updateTop();
|
||||||
|
} else {
|
||||||
|
segments.pop();
|
||||||
|
segmentResult = segments.top();
|
||||||
|
if (segmentResult == null) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} while (currentFacetValue.equals(segmentResult.mergeTerm));
|
||||||
|
facetResult.addFacetCount(currentFacetValue, count);
|
||||||
|
}
|
||||||
|
return facetResult;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected abstract SegmentResult createSegmentResult();
|
||||||
|
|
||||||
|
// Implementation for single valued facet fields.
|
||||||
|
static class SV extends TermGroupFacetCollector {
|
||||||
|
|
||||||
|
private FieldCache.DocTermsIndex facetFieldTermsIndex;
|
||||||
|
|
||||||
|
SV(String groupField, String facetField, BytesRef facetPrefix, int initialSize) {
|
||||||
|
super(groupField, facetField, facetPrefix, initialSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void collect(int doc) throws IOException {
|
||||||
|
int facetOrd = facetFieldTermsIndex.getOrd(doc);
|
||||||
|
if (facetOrd < startFacetOrd || facetOrd >= endFacetOrd) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int groupOrd = groupFieldTermsIndex.getOrd(doc);
|
||||||
|
int segmentGroupedFacetsIndex = (groupOrd * facetFieldTermsIndex.numOrd()) + facetOrd;
|
||||||
|
if (segmentGroupedFacetHits.exists(segmentGroupedFacetsIndex)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
segmentTotalCount++;
|
||||||
|
segmentFacetCounts[facetOrd]++;
|
||||||
|
|
||||||
|
segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
|
||||||
|
groupedFacetHits.add(
|
||||||
|
new GroupedFacetHit(
|
||||||
|
groupOrd == 0 ? null : groupFieldTermsIndex.lookup(groupOrd, new BytesRef()),
|
||||||
|
facetOrd == 0 ? null : facetFieldTermsIndex.lookup(facetOrd, new BytesRef())
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setNextReader(AtomicReaderContext context) throws IOException {
|
||||||
|
if (segmentFacetCounts != null) {
|
||||||
|
segmentResults.add(createSegmentResult());
|
||||||
|
}
|
||||||
|
|
||||||
|
groupFieldTermsIndex = FieldCache.DEFAULT.getTermsIndex(context.reader(), groupField);
|
||||||
|
facetFieldTermsIndex = FieldCache.DEFAULT.getTermsIndex(context.reader(), facetField);
|
||||||
|
segmentFacetCounts = new int[facetFieldTermsIndex.numOrd()];
|
||||||
|
segmentTotalCount = 0;
|
||||||
|
|
||||||
|
segmentGroupedFacetHits.clear();
|
||||||
|
for (GroupedFacetHit groupedFacetHit : groupedFacetHits) {
|
||||||
|
int facetOrd = facetFieldTermsIndex.binarySearchLookup(groupedFacetHit.facetValue, spare);
|
||||||
|
if (facetOrd < 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
int groupOrd = groupFieldTermsIndex.binarySearchLookup(groupedFacetHit.groupValue, spare);
|
||||||
|
if (groupOrd < 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
int segmentGroupedFacetsIndex = (groupOrd * facetFieldTermsIndex.numOrd()) + facetOrd;
|
||||||
|
segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (facetPrefix != null) {
|
||||||
|
startFacetOrd = facetFieldTermsIndex.binarySearchLookup(facetPrefix, spare);
|
||||||
|
if (startFacetOrd < 0) {
|
||||||
|
// Points to the ord one higher than facetPrefix
|
||||||
|
startFacetOrd = -startFacetOrd - 1;
|
||||||
|
}
|
||||||
|
BytesRef facetEndPrefix = BytesRef.deepCopyOf(facetPrefix);
|
||||||
|
facetEndPrefix.append(UnicodeUtil.BIG_TERM);
|
||||||
|
endFacetOrd = facetFieldTermsIndex.binarySearchLookup(facetEndPrefix, spare);
|
||||||
|
endFacetOrd = -endFacetOrd - 1; // Points to the ord one higher than facetEndPrefix
|
||||||
|
} else {
|
||||||
|
startFacetOrd = 0;
|
||||||
|
endFacetOrd = facetFieldTermsIndex.numOrd();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected SegmentResult createSegmentResult() {
|
||||||
|
return new SegmentResult(segmentFacetCounts, segmentTotalCount, facetFieldTermsIndex.getTermsEnum(), startFacetOrd, endFacetOrd);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Implementation for multi valued facet fields.
|
||||||
|
static class MV extends TermGroupFacetCollector {
|
||||||
|
|
||||||
|
private DocTermOrds facetFieldDocTermOrds;
|
||||||
|
private TermsEnum facetOrdTermsEnum;
|
||||||
|
private DocTermOrds.TermOrdsIterator reuse;
|
||||||
|
|
||||||
|
MV(String groupField, String facetField, BytesRef facetPrefix, int initialSize) {
|
||||||
|
super(groupField, facetField, facetPrefix, initialSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void collect(int doc) throws IOException {
|
||||||
|
int groupOrd = groupFieldTermsIndex.getOrd(doc);
|
||||||
|
reuse = facetFieldDocTermOrds.lookup(doc, reuse);
|
||||||
|
int chunk;
|
||||||
|
boolean first = true;
|
||||||
|
int[] buffer = new int[5];
|
||||||
|
do {
|
||||||
|
chunk = reuse.read(buffer);
|
||||||
|
if (first && chunk == 0) {
|
||||||
|
chunk = 1;
|
||||||
|
buffer[0] = facetFieldDocTermOrds.numTerms(); // this facet ord is reserved for docs not containing facet field.
|
||||||
|
}
|
||||||
|
first = false;
|
||||||
|
|
||||||
|
for (int pos = 0; pos < chunk; pos++) {
|
||||||
|
int facetOrd = buffer[pos];
|
||||||
|
if (facetOrd < startFacetOrd || facetOrd >= endFacetOrd) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
int segmentGroupedFacetsIndex = (groupOrd * (facetFieldDocTermOrds.numTerms() + 1)) + facetOrd;
|
||||||
|
if (segmentGroupedFacetHits.exists(segmentGroupedFacetsIndex)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
segmentTotalCount++;
|
||||||
|
segmentFacetCounts[facetOrd]++;
|
||||||
|
|
||||||
|
segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
|
||||||
|
groupedFacetHits.add(
|
||||||
|
new GroupedFacetHit(
|
||||||
|
groupOrd == 0 ? null : groupFieldTermsIndex.lookup(groupOrd, new BytesRef()),
|
||||||
|
facetOrd == facetFieldDocTermOrds.numTerms() ? null : BytesRef.deepCopyOf(facetFieldDocTermOrds.lookupTerm(facetOrdTermsEnum, facetOrd))
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
} while (chunk >= buffer.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setNextReader(AtomicReaderContext context) throws IOException {
|
||||||
|
if (segmentFacetCounts != null) {
|
||||||
|
segmentResults.add(createSegmentResult());
|
||||||
|
}
|
||||||
|
|
||||||
|
reuse = null;
|
||||||
|
groupFieldTermsIndex = FieldCache.DEFAULT.getTermsIndex(context.reader(), groupField);
|
||||||
|
facetFieldDocTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), facetField);
|
||||||
|
facetOrdTermsEnum = facetFieldDocTermOrds.getOrdTermsEnum(context.reader());
|
||||||
|
// [facetFieldDocTermOrds.numTerms() + 1] for all possible facet values and docs not containing facet field
|
||||||
|
segmentFacetCounts = new int[facetFieldDocTermOrds.numTerms() + 1];
|
||||||
|
segmentTotalCount = 0;
|
||||||
|
|
||||||
|
segmentGroupedFacetHits.clear();
|
||||||
|
for (GroupedFacetHit groupedFacetHit : groupedFacetHits) {
|
||||||
|
int groupOrd = groupFieldTermsIndex.binarySearchLookup(groupedFacetHit.groupValue, spare);
|
||||||
|
if (groupOrd < 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
int facetOrd;
|
||||||
|
if (groupedFacetHit.facetValue != null) {
|
||||||
|
if (!facetOrdTermsEnum.seekExact(groupedFacetHit.facetValue, true)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
facetOrd = (int) facetOrdTermsEnum.ord();
|
||||||
|
} else {
|
||||||
|
facetOrd = facetFieldDocTermOrds.numTerms();
|
||||||
|
}
|
||||||
|
|
||||||
|
// (facetFieldDocTermOrds.numTerms() + 1) for all possible facet values and docs not containing facet field
|
||||||
|
int segmentGroupedFacetsIndex = (groupOrd * (facetFieldDocTermOrds.numTerms() + 1)) + facetOrd;
|
||||||
|
segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (facetPrefix != null) {
|
||||||
|
TermsEnum.SeekStatus seekStatus = facetOrdTermsEnum.seekCeil(facetPrefix, true);
|
||||||
|
if (seekStatus != TermsEnum.SeekStatus.END) {
|
||||||
|
startFacetOrd = (int) facetOrdTermsEnum.ord();
|
||||||
|
} else {
|
||||||
|
startFacetOrd = 0;
|
||||||
|
endFacetOrd = 0;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
BytesRef facetEndPrefix = BytesRef.deepCopyOf(facetPrefix);
|
||||||
|
facetEndPrefix.append(UnicodeUtil.BIG_TERM);
|
||||||
|
seekStatus = facetOrdTermsEnum.seekCeil(facetEndPrefix, true);
|
||||||
|
if (seekStatus != TermsEnum.SeekStatus.END) {
|
||||||
|
endFacetOrd = (int) facetOrdTermsEnum.ord();
|
||||||
|
} else {
|
||||||
|
endFacetOrd = facetFieldDocTermOrds.numTerms(); // Don't include null...
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
startFacetOrd = 0;
|
||||||
|
endFacetOrd = facetFieldDocTermOrds.numTerms() + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected SegmentResult createSegmentResult() {
|
||||||
|
return new SegmentResult(segmentFacetCounts, segmentTotalCount, facetFieldDocTermOrds.numTerms(), facetOrdTermsEnum, startFacetOrd, endFacetOrd);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
class SegmentResult {
|
||||||
|
|
||||||
|
final int[] counts;
|
||||||
|
final int total;
|
||||||
|
final int missing;
|
||||||
|
|
||||||
|
// Used for merging the segment results
|
||||||
|
BytesRef mergeTerm;
|
||||||
|
int mergePos;
|
||||||
|
final int maxTermPos;
|
||||||
|
final TermsEnum tenum;
|
||||||
|
|
||||||
|
SegmentResult(int[] counts, int total, TermsEnum tenum, int startFacetOrd, int endFacetOrd) {
|
||||||
|
this.counts = counts;
|
||||||
|
this.missing = counts[0];
|
||||||
|
this.total = total - missing;
|
||||||
|
this.tenum = tenum;
|
||||||
|
this.mergePos = startFacetOrd == 0 ? 1 : startFacetOrd;
|
||||||
|
this.maxTermPos = endFacetOrd;
|
||||||
|
}
|
||||||
|
|
||||||
|
SegmentResult(int[] counts, int total, int missingCountIndex, TermsEnum tenum, int startFacetOrd, int endFacetOrd) {
|
||||||
|
this.counts = counts;
|
||||||
|
this.missing = counts[missingCountIndex];
|
||||||
|
this.total = total - missing;
|
||||||
|
this.tenum = tenum;
|
||||||
|
this.mergePos = startFacetOrd;
|
||||||
|
if (endFacetOrd == missingCountIndex + 1) {
|
||||||
|
this.maxTermPos = missingCountIndex;
|
||||||
|
} else {
|
||||||
|
this.maxTermPos = endFacetOrd;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void initializeForMerge() throws IOException {
|
||||||
|
tenum.seekExact(mergePos);
|
||||||
|
mergeTerm = tenum.term();
|
||||||
|
}
|
||||||
|
|
||||||
|
void nextTerm() throws IOException {
|
||||||
|
mergeTerm = tenum.next();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
class GroupedFacetHit {
|
||||||
|
|
||||||
|
final BytesRef groupValue;
|
||||||
|
final BytesRef facetValue;
|
||||||
|
|
||||||
|
GroupedFacetHit(BytesRef groupValue, BytesRef facetValue) {
|
||||||
|
this.groupValue = groupValue;
|
||||||
|
this.facetValue = facetValue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class SegmentResultPriorityQueue extends PriorityQueue<SegmentResult> {
|
||||||
|
|
||||||
|
SegmentResultPriorityQueue(int maxSize) {
|
||||||
|
super(maxSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected boolean lessThan(SegmentResult a, SegmentResult b) {
|
||||||
|
return a.mergeTerm.compareTo(b.mergeTerm) < 0;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,52 @@
|
||||||
|
package org.apache.lucene.search.grouping;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.document.FieldType;
|
||||||
|
import org.apache.lucene.search.Sort;
|
||||||
|
import org.apache.lucene.search.SortField;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Random;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.fail;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Base class for grouping related tests.
|
||||||
|
*/
|
||||||
|
// TODO (MvG) : The grouping tests contain a lot of code duplication. Try to move the common code to this class..
|
||||||
|
public class AbstractGroupingTestCase extends LuceneTestCase {
|
||||||
|
|
||||||
|
protected String generateRandomNonEmptyString() {
|
||||||
|
String randomValue;
|
||||||
|
do {
|
||||||
|
// B/c of DV based impl we can't see the difference between an empty string and a null value.
|
||||||
|
// For that reason we don't generate empty string groups.
|
||||||
|
randomValue = _TestUtil.randomRealisticUnicodeString(random);
|
||||||
|
} while ("".equals(randomValue));
|
||||||
|
return randomValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,600 @@
|
||||||
|
package org.apache.lucene.search.grouping;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
|
import org.apache.lucene.document.*;
|
||||||
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
|
import org.apache.lucene.index.DocValues;
|
||||||
|
import org.apache.lucene.index.RandomIndexWriter;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||||
|
import org.apache.lucene.search.TermQuery;
|
||||||
|
import org.apache.lucene.search.grouping.term.TermGroupFacetCollector;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
public class TermGroupFacetCollectorTest extends AbstractGroupingTestCase {
|
||||||
|
|
||||||
|
public void testSimple() throws Exception {
|
||||||
|
final String groupField = "hotel";
|
||||||
|
FieldType customType = new FieldType();
|
||||||
|
customType.setStored(true);
|
||||||
|
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter w = new RandomIndexWriter(
|
||||||
|
random,
|
||||||
|
dir,
|
||||||
|
newIndexWriterConfig(TEST_VERSION_CURRENT,
|
||||||
|
new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
|
||||||
|
boolean canUseIDV = false;// Enable later... !"Lucene3x".equals(w.w.getConfig().getCodec().getName());
|
||||||
|
|
||||||
|
// 0
|
||||||
|
Document doc = new Document();
|
||||||
|
addGroupField(doc, groupField, "a", canUseIDV);
|
||||||
|
doc.add(new Field("airport", "ams", TextField.TYPE_UNSTORED));
|
||||||
|
doc.add(new Field("duration", "5", TextField.TYPE_UNSTORED));
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
// 1
|
||||||
|
doc = new Document();
|
||||||
|
addGroupField(doc, groupField, "a", canUseIDV);
|
||||||
|
doc.add(new Field("airport", "dus", TextField.TYPE_STORED));
|
||||||
|
doc.add(new Field("duration", "10", TextField.TYPE_UNSTORED));
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
// 2
|
||||||
|
doc = new Document();
|
||||||
|
addGroupField(doc, groupField, "b", canUseIDV);
|
||||||
|
doc.add(new Field("airport", "ams", TextField.TYPE_UNSTORED));
|
||||||
|
doc.add(new Field("duration", "10", TextField.TYPE_UNSTORED));
|
||||||
|
w.addDocument(doc);
|
||||||
|
w.commit(); // To ensure a second segment
|
||||||
|
|
||||||
|
// 3
|
||||||
|
doc = new Document();
|
||||||
|
addGroupField(doc, groupField, "b", canUseIDV);
|
||||||
|
doc.add(new Field("airport", "ams", TextField.TYPE_UNSTORED));
|
||||||
|
doc.add(new Field("duration", "5", TextField.TYPE_UNSTORED));
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
// 4
|
||||||
|
doc = new Document();
|
||||||
|
addGroupField(doc, groupField, "b", canUseIDV);
|
||||||
|
doc.add(new Field("airport", "ams", TextField.TYPE_UNSTORED));
|
||||||
|
doc.add(new Field("duration", "5", TextField.TYPE_UNSTORED));
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
|
||||||
|
TermGroupFacetCollector groupedAirportFacetCollector =
|
||||||
|
TermGroupFacetCollector.createTermGroupFacetCollector(groupField, "airport", false, null, 128);
|
||||||
|
indexSearcher.search(new MatchAllDocsQuery(), groupedAirportFacetCollector);
|
||||||
|
TermGroupFacetCollector.GroupedFacetResult airportResult = groupedAirportFacetCollector.mergeSegmentResults(10, 0, false);
|
||||||
|
assertEquals(3, airportResult.getTotalCount());
|
||||||
|
assertEquals(0, airportResult.getTotalMissingCount());
|
||||||
|
|
||||||
|
List<TermGroupFacetCollector.FacetEntry> entries = airportResult.getFacetEntries(0, 10);
|
||||||
|
assertEquals(2, entries.size());
|
||||||
|
assertEquals("ams", entries.get(0).getValue().utf8ToString());
|
||||||
|
assertEquals(2, entries.get(0).getCount());
|
||||||
|
assertEquals("dus", entries.get(1).getValue().utf8ToString());
|
||||||
|
assertEquals(1, entries.get(1).getCount());
|
||||||
|
|
||||||
|
|
||||||
|
TermGroupFacetCollector groupedDurationFacetCollector =
|
||||||
|
TermGroupFacetCollector.createTermGroupFacetCollector(groupField, "duration", false, null, 128);
|
||||||
|
indexSearcher.search(new MatchAllDocsQuery(), groupedDurationFacetCollector);
|
||||||
|
TermGroupFacetCollector.GroupedFacetResult durationResult = groupedDurationFacetCollector.mergeSegmentResults(10, 0, false);
|
||||||
|
assertEquals(4, durationResult.getTotalCount());
|
||||||
|
assertEquals(0, durationResult.getTotalMissingCount());
|
||||||
|
|
||||||
|
entries = durationResult.getFacetEntries(0, 10);
|
||||||
|
assertEquals(2, entries.size());
|
||||||
|
assertEquals("10", entries.get(0).getValue().utf8ToString());
|
||||||
|
assertEquals(2, entries.get(0).getCount());
|
||||||
|
assertEquals("5", entries.get(1).getValue().utf8ToString());
|
||||||
|
assertEquals(2, entries.get(1).getCount());
|
||||||
|
|
||||||
|
// 5
|
||||||
|
doc = new Document();
|
||||||
|
addGroupField(doc, groupField, "b", canUseIDV);
|
||||||
|
doc.add(new Field("duration", "5", TextField.TYPE_UNSTORED));
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
// 6
|
||||||
|
doc = new Document();
|
||||||
|
addGroupField(doc, groupField, "b", canUseIDV);
|
||||||
|
doc.add(new Field("airport", "bru", TextField.TYPE_UNSTORED));
|
||||||
|
doc.add(new Field("duration", "10", TextField.TYPE_UNSTORED));
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
// 7
|
||||||
|
doc = new Document();
|
||||||
|
addGroupField(doc, groupField, "b", canUseIDV);
|
||||||
|
doc.add(new Field("airport", "bru", TextField.TYPE_UNSTORED));
|
||||||
|
doc.add(new Field("duration", "15", TextField.TYPE_UNSTORED));
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
// 8
|
||||||
|
doc = new Document();
|
||||||
|
addGroupField(doc, groupField, "a", canUseIDV);
|
||||||
|
doc.add(new Field("airport", "bru", TextField.TYPE_UNSTORED));
|
||||||
|
doc.add(new Field("duration", "10", TextField.TYPE_UNSTORED));
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
indexSearcher.getIndexReader().close();
|
||||||
|
indexSearcher = new IndexSearcher(w.getReader());
|
||||||
|
groupedAirportFacetCollector = TermGroupFacetCollector.createTermGroupFacetCollector(groupField, "airport", true, null, 128);
|
||||||
|
indexSearcher.search(new MatchAllDocsQuery(), groupedAirportFacetCollector);
|
||||||
|
airportResult = groupedAirportFacetCollector.mergeSegmentResults(3, 0, true);
|
||||||
|
assertEquals(5, airportResult.getTotalCount());
|
||||||
|
assertEquals(1, airportResult.getTotalMissingCount());
|
||||||
|
|
||||||
|
entries = airportResult.getFacetEntries(1, 2);
|
||||||
|
assertEquals(2, entries.size());
|
||||||
|
assertEquals("bru", entries.get(0).getValue().utf8ToString());
|
||||||
|
assertEquals(2, entries.get(0).getCount());
|
||||||
|
assertEquals("dus", entries.get(1).getValue().utf8ToString());
|
||||||
|
assertEquals(1, entries.get(1).getCount());
|
||||||
|
|
||||||
|
groupedDurationFacetCollector = TermGroupFacetCollector.createTermGroupFacetCollector(groupField, "duration", false, null, 128);
|
||||||
|
indexSearcher.search(new MatchAllDocsQuery(), groupedDurationFacetCollector);
|
||||||
|
durationResult = groupedDurationFacetCollector.mergeSegmentResults(10, 2, true);
|
||||||
|
assertEquals(5, durationResult.getTotalCount());
|
||||||
|
assertEquals(0, durationResult.getTotalMissingCount());
|
||||||
|
|
||||||
|
entries = durationResult.getFacetEntries(1, 1);
|
||||||
|
assertEquals(1, entries.size());
|
||||||
|
assertEquals("5", entries.get(0).getValue().utf8ToString());
|
||||||
|
assertEquals(2, entries.get(0).getCount());
|
||||||
|
|
||||||
|
// 9
|
||||||
|
doc = new Document();
|
||||||
|
addGroupField(doc, groupField, "c", canUseIDV);
|
||||||
|
doc.add(new Field("airport", "bru", TextField.TYPE_UNSTORED));
|
||||||
|
doc.add(new Field("duration", "15", TextField.TYPE_UNSTORED));
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
// 10
|
||||||
|
doc = new Document();
|
||||||
|
addGroupField(doc, groupField, "c", canUseIDV);
|
||||||
|
doc.add(new Field("airport", "dus", TextField.TYPE_UNSTORED));
|
||||||
|
doc.add(new Field("duration", "10", TextField.TYPE_UNSTORED));
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
indexSearcher.getIndexReader().close();
|
||||||
|
indexSearcher = new IndexSearcher(w.getReader());
|
||||||
|
groupedAirportFacetCollector = TermGroupFacetCollector.createTermGroupFacetCollector(groupField, "airport", false, null, 128);
|
||||||
|
indexSearcher.search(new MatchAllDocsQuery(), groupedAirportFacetCollector);
|
||||||
|
airportResult = groupedAirportFacetCollector.mergeSegmentResults(10, 0, false);
|
||||||
|
assertEquals(7, airportResult.getTotalCount());
|
||||||
|
assertEquals(1, airportResult.getTotalMissingCount());
|
||||||
|
|
||||||
|
entries = airportResult.getFacetEntries(0, 10);
|
||||||
|
assertEquals(3, entries.size());
|
||||||
|
assertEquals("ams", entries.get(0).getValue().utf8ToString());
|
||||||
|
assertEquals(2, entries.get(0).getCount());
|
||||||
|
assertEquals("bru", entries.get(1).getValue().utf8ToString());
|
||||||
|
assertEquals(3, entries.get(1).getCount());
|
||||||
|
assertEquals("dus", entries.get(2).getValue().utf8ToString());
|
||||||
|
assertEquals(2, entries.get(2).getCount());
|
||||||
|
|
||||||
|
groupedDurationFacetCollector = TermGroupFacetCollector.createTermGroupFacetCollector(groupField, "duration", false, new BytesRef("1"), 128);
|
||||||
|
indexSearcher.search(new MatchAllDocsQuery(), groupedDurationFacetCollector);
|
||||||
|
durationResult = groupedDurationFacetCollector.mergeSegmentResults(10, 0, true);
|
||||||
|
assertEquals(5, durationResult.getTotalCount());
|
||||||
|
assertEquals(0, durationResult.getTotalMissingCount());
|
||||||
|
|
||||||
|
entries = durationResult.getFacetEntries(0, 10);
|
||||||
|
assertEquals(2, entries.size());
|
||||||
|
assertEquals("10", entries.get(0).getValue().utf8ToString());
|
||||||
|
assertEquals(3, entries.get(0).getCount());
|
||||||
|
assertEquals("15", entries.get(1).getValue().utf8ToString());
|
||||||
|
assertEquals(2, entries.get(1).getCount());
|
||||||
|
|
||||||
|
w.close();
|
||||||
|
indexSearcher.getIndexReader().close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addGroupField(Document doc, String groupField, String value, boolean canUseIDV) {
|
||||||
|
doc.add(new Field(groupField, value, TextField.TYPE_UNSTORED));
|
||||||
|
if (canUseIDV) {
|
||||||
|
doc.add(new DocValuesField(groupField, new BytesRef(value), DocValues.Type.BYTES_VAR_SORTED));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRandom() throws Exception {
|
||||||
|
int numberOfRuns = _TestUtil.nextInt(random, 3, 6);
|
||||||
|
for (int indexIter = 0; indexIter < numberOfRuns; indexIter++) {
|
||||||
|
boolean multipleFacetsPerDocument = random.nextBoolean();
|
||||||
|
IndexContext context = createIndexContext(multipleFacetsPerDocument);
|
||||||
|
final IndexSearcher searcher = newSearcher(context.indexReader);
|
||||||
|
|
||||||
|
for (int searchIter = 0; searchIter < 100; searchIter++) {
|
||||||
|
String searchTerm = context.contentStrings[random.nextInt(context.contentStrings.length)];
|
||||||
|
int limit = random.nextInt(context.facetValues.size());
|
||||||
|
int offset = random.nextInt(context.facetValues.size() - limit);
|
||||||
|
int size = offset + limit;
|
||||||
|
int minCount = random.nextBoolean() ? 0 : random.nextInt(1 + context.facetWithMostGroups / 10);
|
||||||
|
boolean orderByCount = random.nextBoolean();
|
||||||
|
String randomStr = getFromSet(context.facetValues, random.nextInt(context.facetValues.size()));
|
||||||
|
final String facetPrefix;
|
||||||
|
if (randomStr == null) {
|
||||||
|
facetPrefix = null;
|
||||||
|
} else {
|
||||||
|
int codePointLen = randomStr.codePointCount(0, randomStr.length());
|
||||||
|
int randomLen = random.nextInt(codePointLen);
|
||||||
|
if (codePointLen == randomLen - 1) {
|
||||||
|
facetPrefix = null;
|
||||||
|
} else {
|
||||||
|
int end = randomStr.offsetByCodePoints(0, randomLen);
|
||||||
|
facetPrefix = random.nextBoolean() ? null : randomStr.substring(end);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GroupedFacetResult expectedFacetResult = createExpectedFacetResult(searchTerm, context, offset, limit, minCount, orderByCount, facetPrefix);
|
||||||
|
TermGroupFacetCollector groupFacetCollector = createRandomCollector("group", "facet", facetPrefix, multipleFacetsPerDocument);
|
||||||
|
searcher.search(new TermQuery(new Term("content", searchTerm)), groupFacetCollector);
|
||||||
|
TermGroupFacetCollector.GroupedFacetResult actualFacetResult = groupFacetCollector.mergeSegmentResults(size, minCount, orderByCount);
|
||||||
|
|
||||||
|
List<TermGroupFacetCollector.FacetEntry> expectedFacetEntries = expectedFacetResult.getFacetEntries();
|
||||||
|
List<TermGroupFacetCollector.FacetEntry> actualFacetEntries = actualFacetResult.getFacetEntries(offset, limit);
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("Collector: " + groupFacetCollector.getClass().getSimpleName());
|
||||||
|
System.out.println("Num group: " + context.numGroups);
|
||||||
|
System.out.println("Num doc: " + context.numDocs);
|
||||||
|
System.out.println("Index iter: " + indexIter);
|
||||||
|
System.out.println("multipleFacetsPerDocument: " + multipleFacetsPerDocument);
|
||||||
|
System.out.println("Search iter: " + searchIter);
|
||||||
|
|
||||||
|
System.out.println("Search term: " + searchTerm);
|
||||||
|
System.out.println("Min count: " + minCount);
|
||||||
|
System.out.println("Facet offset: " + offset);
|
||||||
|
System.out.println("Facet limit: " + limit);
|
||||||
|
System.out.println("Facet prefix: " + facetPrefix);
|
||||||
|
System.out.println("Order by count: " + orderByCount);
|
||||||
|
|
||||||
|
System.out.println("\n=== Expected: \n");
|
||||||
|
System.out.println("Total count " + expectedFacetResult.getTotalCount());
|
||||||
|
System.out.println("Total missing count " + expectedFacetResult.getTotalMissingCount());
|
||||||
|
int counter = 1;
|
||||||
|
for (TermGroupFacetCollector.FacetEntry expectedFacetEntry : expectedFacetEntries) {
|
||||||
|
System.out.println(
|
||||||
|
String.format(
|
||||||
|
"%d. Expected facet value %s with count %d",
|
||||||
|
counter++, expectedFacetEntry.getValue().utf8ToString(), expectedFacetEntry.getCount()
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.println("\n=== Actual: \n");
|
||||||
|
System.out.println("Total count " + actualFacetResult.getTotalCount());
|
||||||
|
System.out.println("Total missing count " + actualFacetResult.getTotalMissingCount());
|
||||||
|
counter = 1;
|
||||||
|
for (TermGroupFacetCollector.FacetEntry actualFacetEntry : actualFacetEntries) {
|
||||||
|
System.out.println(
|
||||||
|
String.format(
|
||||||
|
"%d. Actual facet value %s with count %d",
|
||||||
|
counter++, actualFacetEntry.getValue().utf8ToString(), actualFacetEntry.getCount()
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
System.out.println("\n===================================================================================");
|
||||||
|
}
|
||||||
|
|
||||||
|
assertEquals(expectedFacetResult.getTotalCount(), actualFacetResult.getTotalCount());
|
||||||
|
assertEquals(expectedFacetResult.getTotalMissingCount(), actualFacetResult.getTotalMissingCount());
|
||||||
|
assertEquals(expectedFacetEntries.size(), actualFacetEntries.size());
|
||||||
|
for (int i = 0; i < expectedFacetEntries.size(); i++) {
|
||||||
|
TermGroupFacetCollector.FacetEntry expectedFacetEntry = expectedFacetEntries.get(i);
|
||||||
|
TermGroupFacetCollector.FacetEntry actualFacetEntry = actualFacetEntries.get(i);
|
||||||
|
assertEquals(expectedFacetEntry.getValue().utf8ToString() + " != " + actualFacetEntry.getValue().utf8ToString(), expectedFacetEntry.getValue(), actualFacetEntry.getValue());
|
||||||
|
assertEquals(expectedFacetEntry.getCount() + " != " + actualFacetEntry.getCount(), expectedFacetEntry.getCount(), actualFacetEntry.getCount());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
context.indexReader.close();
|
||||||
|
context.dir.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private IndexContext createIndexContext(boolean multipleFacetValuesPerDocument) throws IOException {
|
||||||
|
final int numDocs = _TestUtil.nextInt(random, 138, 1145) * RANDOM_MULTIPLIER;
|
||||||
|
final int numGroups = _TestUtil.nextInt(random, 1, numDocs / 4);
|
||||||
|
final int numFacets = _TestUtil.nextInt(random, 1, numDocs / 6);
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("TEST: numDocs=" + numDocs + " numGroups=" + numGroups);
|
||||||
|
}
|
||||||
|
|
||||||
|
final List<String> groups = new ArrayList<String>();
|
||||||
|
for (int i = 0; i < numGroups; i++) {
|
||||||
|
groups.add(generateRandomNonEmptyString());
|
||||||
|
}
|
||||||
|
final List<String> facetValues = new ArrayList<String>();
|
||||||
|
for (int i = 0; i < numFacets; i++) {
|
||||||
|
facetValues.add(generateRandomNonEmptyString());
|
||||||
|
}
|
||||||
|
final String[] contentBrs = new String[_TestUtil.nextInt(random, 2, 20)];
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println("TEST: create fake content");
|
||||||
|
}
|
||||||
|
for (int contentIDX = 0; contentIDX < contentBrs.length; contentIDX++) {
|
||||||
|
contentBrs[contentIDX] = generateRandomNonEmptyString();
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" content=" + contentBrs[contentIDX]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(
|
||||||
|
random,
|
||||||
|
dir,
|
||||||
|
newIndexWriterConfig(
|
||||||
|
TEST_VERSION_CURRENT,
|
||||||
|
new MockAnalyzer(random)
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
Document docNoGroup = new Document();
|
||||||
|
Document docNoFacet = new Document();
|
||||||
|
Document docNoGroupNoFacet = new Document();
|
||||||
|
Field group = newField("group", "", StringField.TYPE_UNSTORED);
|
||||||
|
doc.add(group);
|
||||||
|
docNoFacet.add(group);
|
||||||
|
Field[] facetFields = multipleFacetValuesPerDocument? new Field[2 + random.nextInt(6)] : new Field[1];
|
||||||
|
for (int i = 0; i < facetFields.length; i++) {
|
||||||
|
facetFields[i] = newField("facet", "", StringField.TYPE_UNSTORED);
|
||||||
|
doc.add(facetFields[i]);
|
||||||
|
docNoGroup.add(facetFields[i]);
|
||||||
|
}
|
||||||
|
Field content = newField("content", "", StringField.TYPE_UNSTORED);
|
||||||
|
doc.add(content);
|
||||||
|
docNoGroup.add(content);
|
||||||
|
docNoFacet.add(content);
|
||||||
|
docNoGroupNoFacet.add(content);
|
||||||
|
|
||||||
|
NavigableSet<String> uniqueFacetValues = new TreeSet<String>(new Comparator<String>() {
|
||||||
|
|
||||||
|
public int compare(String a, String b) {
|
||||||
|
if (a == b) {
|
||||||
|
return 0;
|
||||||
|
} else if (a == null) {
|
||||||
|
return -1;
|
||||||
|
} else if (b == null) {
|
||||||
|
return 1;
|
||||||
|
} else {
|
||||||
|
return a.compareTo(b);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
});
|
||||||
|
Map<String, Map<String, Set<String>>> searchTermToFacetToGroups = new HashMap<String, Map<String, Set<String>>>();
|
||||||
|
int facetWithMostGroups = 0;
|
||||||
|
for (int i = 0; i < numDocs; i++) {
|
||||||
|
final String groupValue;
|
||||||
|
if (random.nextInt(24) == 17) {
|
||||||
|
// So we test the "doc doesn't have the group'd
|
||||||
|
// field" case:
|
||||||
|
groupValue = null;
|
||||||
|
} else {
|
||||||
|
groupValue = groups.get(random.nextInt(groups.size()));
|
||||||
|
}
|
||||||
|
|
||||||
|
String contentStr = contentBrs[random.nextInt(contentBrs.length)];
|
||||||
|
if (!searchTermToFacetToGroups.containsKey(contentStr)) {
|
||||||
|
searchTermToFacetToGroups.put(contentStr, new HashMap<String, Set<String>>());
|
||||||
|
}
|
||||||
|
Map<String, Set<String>> facetToGroups = searchTermToFacetToGroups.get(contentStr);
|
||||||
|
|
||||||
|
List<String> facetVals = new ArrayList<String>();
|
||||||
|
if (random.nextInt(24) != 18) {
|
||||||
|
for (Field facetField : facetFields) {
|
||||||
|
String facetValue = facetValues.get(random.nextInt(facetValues.size()));
|
||||||
|
uniqueFacetValues.add(facetValue);
|
||||||
|
if (!facetToGroups.containsKey(facetValue)) {
|
||||||
|
facetToGroups.put(facetValue, new HashSet<String>());
|
||||||
|
}
|
||||||
|
Set<String> groupsInFacet = facetToGroups.get(facetValue);
|
||||||
|
groupsInFacet.add(groupValue);
|
||||||
|
if (groupsInFacet.size() > facetWithMostGroups) {
|
||||||
|
facetWithMostGroups = groupsInFacet.size();
|
||||||
|
}
|
||||||
|
facetField.setStringValue(facetValue);
|
||||||
|
facetVals.add(facetValue);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
uniqueFacetValues.add(null);
|
||||||
|
if (!facetToGroups.containsKey(null)) {
|
||||||
|
facetToGroups.put(null, new HashSet<String>());
|
||||||
|
}
|
||||||
|
Set<String> groupsInFacet = facetToGroups.get(null);
|
||||||
|
groupsInFacet.add(groupValue);
|
||||||
|
if (groupsInFacet.size() > facetWithMostGroups) {
|
||||||
|
facetWithMostGroups = groupsInFacet.size();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(" doc content=" + contentStr + " group=" + (groupValue == null ? "null" : groupValue) + " facetVals=" + facetVals);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (groupValue != null) {
|
||||||
|
group.setStringValue(groupValue);
|
||||||
|
}
|
||||||
|
content.setStringValue(contentStr);
|
||||||
|
if (groupValue == null && facetVals.isEmpty()) {
|
||||||
|
writer.addDocument(docNoGroupNoFacet);
|
||||||
|
} else if (facetVals.isEmpty()) {
|
||||||
|
writer.addDocument(docNoFacet);
|
||||||
|
} else if (groupValue == null) {
|
||||||
|
writer.addDocument(docNoGroup);
|
||||||
|
} else {
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
DirectoryReader reader = writer.getReader();
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
return new IndexContext(searchTermToFacetToGroups, reader, numDocs, dir, facetWithMostGroups, numGroups, contentBrs, uniqueFacetValues);
|
||||||
|
}
|
||||||
|
|
||||||
|
private GroupedFacetResult createExpectedFacetResult(String searchTerm, IndexContext context, int offset, int limit, int minCount, final boolean orderByCount, String facetPrefix) {
|
||||||
|
Map<String, Set<String>> facetGroups = context.searchTermToFacetGroups.get(searchTerm);
|
||||||
|
if (facetGroups == null) {
|
||||||
|
facetGroups = new HashMap<String, Set<String>>();
|
||||||
|
}
|
||||||
|
|
||||||
|
int totalCount = 0;
|
||||||
|
int totalMissCount = 0;
|
||||||
|
Set<String> facetValues;
|
||||||
|
if (facetPrefix != null) {
|
||||||
|
facetValues = new HashSet<String>();
|
||||||
|
for (String facetValue : context.facetValues) {
|
||||||
|
if (facetValue != null && facetValue.startsWith(facetPrefix)) {
|
||||||
|
facetValues.add(facetValue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
facetValues = context.facetValues;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<TermGroupFacetCollector.FacetEntry> entries = new ArrayList<TermGroupFacetCollector.FacetEntry>(facetGroups.size());
|
||||||
|
// also includes facets with count 0
|
||||||
|
for (String facetValue : facetValues) {
|
||||||
|
if (facetValue == null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
Set<String> groups = facetGroups.get(facetValue);
|
||||||
|
int count = groups != null ? groups.size() : 0;
|
||||||
|
if (count >= minCount) {
|
||||||
|
entries.add(new TermGroupFacetCollector.FacetEntry(new BytesRef(facetValue), count));
|
||||||
|
}
|
||||||
|
totalCount += count;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Only include null count when no facet prefix is specified
|
||||||
|
if (facetPrefix == null) {
|
||||||
|
Set<String> groups = facetGroups.get(null);
|
||||||
|
if (groups != null) {
|
||||||
|
totalMissCount = groups.size();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Collections.sort(entries, new Comparator<TermGroupFacetCollector.FacetEntry>() {
|
||||||
|
|
||||||
|
public int compare(TermGroupFacetCollector.FacetEntry a, TermGroupFacetCollector.FacetEntry b) {
|
||||||
|
if (orderByCount) {
|
||||||
|
int cmp = b.getCount() - a.getCount();
|
||||||
|
if (cmp != 0) {
|
||||||
|
return cmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return a.getValue().compareTo(b.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
|
int endOffset = offset + limit;
|
||||||
|
List<TermGroupFacetCollector.FacetEntry> entriesResult;
|
||||||
|
if (offset >= entries.size()) {
|
||||||
|
entriesResult = Collections.emptyList();
|
||||||
|
} else if (endOffset >= entries.size()) {
|
||||||
|
entriesResult = entries.subList(offset, entries.size());
|
||||||
|
} else {
|
||||||
|
entriesResult = entries.subList(offset, endOffset);
|
||||||
|
}
|
||||||
|
return new GroupedFacetResult(totalCount, totalMissCount, entriesResult);
|
||||||
|
}
|
||||||
|
|
||||||
|
private TermGroupFacetCollector createRandomCollector(String groupField, String facetField, String facetPrefix, boolean multipleFacetsPerDocument) {
|
||||||
|
BytesRef facetPrefixBR = facetPrefix == null ? null : new BytesRef(facetPrefix);
|
||||||
|
return TermGroupFacetCollector.createTermGroupFacetCollector(groupField, facetField, multipleFacetsPerDocument, facetPrefixBR, random.nextInt(1024));
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getFromSet(Set<String> set, int index) {
|
||||||
|
int currentIndex = 0;
|
||||||
|
for (String bytesRef : set) {
|
||||||
|
if (currentIndex++ == index) {
|
||||||
|
return bytesRef;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private class IndexContext {
|
||||||
|
|
||||||
|
final int numDocs;
|
||||||
|
final DirectoryReader indexReader;
|
||||||
|
final Map<String, Map<String, Set<String>>> searchTermToFacetGroups;
|
||||||
|
final NavigableSet<String> facetValues;
|
||||||
|
final Directory dir;
|
||||||
|
final int facetWithMostGroups;
|
||||||
|
final int numGroups;
|
||||||
|
final String[] contentStrings;
|
||||||
|
|
||||||
|
public IndexContext(Map<String, Map<String, Set<String>>> searchTermToFacetGroups, DirectoryReader r,
|
||||||
|
int numDocs, Directory dir, int facetWithMostGroups, int numGroups, String[] contentStrings, NavigableSet<String> facetValues) {
|
||||||
|
this.searchTermToFacetGroups = searchTermToFacetGroups;
|
||||||
|
this.indexReader = r;
|
||||||
|
this.numDocs = numDocs;
|
||||||
|
this.dir = dir;
|
||||||
|
this.facetWithMostGroups = facetWithMostGroups;
|
||||||
|
this.numGroups = numGroups;
|
||||||
|
this.contentStrings = contentStrings;
|
||||||
|
this.facetValues = facetValues;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class GroupedFacetResult {
|
||||||
|
|
||||||
|
final int totalCount;
|
||||||
|
final int totalMissingCount;
|
||||||
|
final List<TermGroupFacetCollector.FacetEntry> facetEntries;
|
||||||
|
|
||||||
|
private GroupedFacetResult(int totalCount, int totalMissingCount, List<TermGroupFacetCollector.FacetEntry> facetEntries) {
|
||||||
|
this.totalCount = totalCount;
|
||||||
|
this.totalMissingCount = totalMissingCount;
|
||||||
|
this.facetEntries = facetEntries;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getTotalCount() {
|
||||||
|
return totalCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getTotalMissingCount() {
|
||||||
|
return totalMissingCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<TermGroupFacetCollector.FacetEntry> getFacetEntries() {
|
||||||
|
return facetEntries;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue