LUCENE-3444: Added a second pass grouping collector that keeps track of distinct values for a specified field for the top N group.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303370 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Martijn van Groningen 2012-03-21 12:41:06 +00:00
parent a98decea1d
commit 1d642b3cd7
8 changed files with 1080 additions and 0 deletions

View File

@ -74,6 +74,9 @@ New Features
* LUCENE-3802, LUCENE-3856: Support for grouped faceting. (Martijn van Groningen)
* LUCENE-3444: Added a second pass grouping collector that keeps track of distinct
values for a specified field for the top N group. (Martijn van Groningen)
API Changes
* LUCENE-2606: Changed RegexCapabilities interface to fix thread

View File

@ -0,0 +1,58 @@
package org.apache.lucene.search.grouping;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;
import java.io.IOException;
import java.util.*;
/**
* A second pass grouping collector that keeps track of distinct values for a specified field for the top N group.
*
* @lucene.experimental
*/
public abstract class AbstractDistinctValuesCollector<GC extends AbstractDistinctValuesCollector.GroupCount<?>> extends Collector {
/**
* Returns all unique values for each top N group.
*
* @return all unique values for each top N group
*/
public abstract List<GC> getGroups();
public boolean acceptsDocsOutOfOrder() {
return true;
}
public void setScorer(Scorer scorer) throws IOException {
}
public abstract static class GroupCount<GROUP_VALUE_TYPE> {
public final GROUP_VALUE_TYPE groupValue;
public final Set<GROUP_VALUE_TYPE> uniqueValues;
public GroupCount(GROUP_VALUE_TYPE groupValue) {
this.groupValue = groupValue;
this.uniqueValues = new HashSet<GROUP_VALUE_TYPE>();
}
}
}

View File

@ -0,0 +1,297 @@
package org.apache.lucene.search.grouping.dv;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.search.grouping.AbstractDistinctValuesCollector;
import org.apache.lucene.search.grouping.SearchGroup;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SentinelIntSet;
import org.apache.lucene.index.DocValues.Type; // javadocs
import java.io.IOException;
import java.util.*;
/**
* Docvalues implementation of {@link org.apache.lucene.search.grouping.AbstractDistinctValuesCollector}.
*
* @lucene.experimental
*/
public abstract class DVDistinctValuesCollector<GC extends AbstractDistinctValuesCollector.GroupCount<?>> extends AbstractDistinctValuesCollector<GC> {
final String groupField;
final String countField;
final boolean diskResident;
final Type valueType;
DVDistinctValuesCollector(String groupField, String countField, boolean diskResident, Type valueType) {
this.groupField = groupField;
this.countField = countField;
this.diskResident = diskResident;
this.valueType = valueType;
}
/**
* Constructs a docvalues based implementation of {@link org.apache.lucene.search.grouping.AbstractDistinctValuesCollector} based on the specified
* type.
*
* @param groupField The field to group by
* @param countField The field to count distinct values for
* @param groups The top N groups, collected during the first phase search
* @param diskResident Whether the values to group and count by should be disk resident
* @param type The {@link Type} which is used to select a concrete implementation
* @return a docvalues based distinct count collector
*/
@SuppressWarnings("unchecked")
public static <T> DVDistinctValuesCollector<GroupCount<T>> create(String groupField, String countField, Collection<SearchGroup<T>> groups, boolean diskResident, Type type) {
switch (type) {
case VAR_INTS:
case FIXED_INTS_8:
case FIXED_INTS_16:
case FIXED_INTS_32:
case FIXED_INTS_64:
// Type erasure b/c otherwise we have inconvertible types...
return (DVDistinctValuesCollector) new NonSorted.Lng(groupField, countField, (Collection) groups, diskResident, type);
case FLOAT_32:
case FLOAT_64:
// Type erasure b/c otherwise we have inconvertible types...
return (DVDistinctValuesCollector) new NonSorted.Dbl(groupField, countField, (Collection) groups, diskResident, type);
case BYTES_FIXED_STRAIGHT:
case BYTES_FIXED_DEREF:
case BYTES_VAR_STRAIGHT:
case BYTES_VAR_DEREF:
// Type erasure b/c otherwise we have inconvertible types...
return (DVDistinctValuesCollector) new NonSorted.BR(groupField, countField, (Collection) groups, diskResident, type);
case BYTES_VAR_SORTED:
case BYTES_FIXED_SORTED:
// Type erasure b/c otherwise we have inconvertible types...
return (DVDistinctValuesCollector) new Sorted.BR(groupField, countField, (Collection) groups, diskResident, type);
default:
throw new IllegalArgumentException(String.format("ValueType %s not supported", type));
}
}
static abstract class NonSorted<K> extends DVDistinctValuesCollector<NonSorted.GroupCount> {
final Map<K, GroupCount> groupMap = new LinkedHashMap<K, GroupCount>();
DocValues.Source groupFieldSource;
DocValues.Source countFieldSource;
NonSorted(String groupField, String countField, boolean diskResident, Type valueType) {
super(groupField, countField, diskResident, valueType);
}
public List<GroupCount> getGroups() {
return new ArrayList<GroupCount>(groupMap.values());
}
public void setNextReader(AtomicReaderContext context) throws IOException {
groupFieldSource = retrieveSource(groupField, context);
countFieldSource = retrieveSource(countField, context);
}
private DocValues.Source retrieveSource(String fieldName, AtomicReaderContext context) throws IOException {
DocValues groupFieldDv = context.reader().docValues(fieldName);
if (groupFieldDv != null) {
return diskResident ? groupFieldDv.getDirectSource() : groupFieldDv.getSource();
} else {
return DocValues.getDefaultSource(valueType);
}
}
static class Dbl extends NonSorted<Double> {
Dbl(String groupField, String countField, Collection<SearchGroup<Double>> groups, boolean diskResident, Type valueType) {
super(groupField, countField, diskResident, valueType);
for (SearchGroup<Double> group : groups) {
groupMap.put(group.groupValue, new GroupCount(group.groupValue));
}
}
public void collect(int doc) throws IOException {
GroupCount groupCount = groupMap.get(groupFieldSource.getFloat(doc));
if (groupCount != null) {
groupCount.uniqueValues.add(countFieldSource.getFloat(doc));
}
}
}
static class Lng extends NonSorted<Long> {
Lng(String groupField, String countField, Collection<SearchGroup<Long>> groups, boolean diskResident, Type valueType) {
super(groupField, countField, diskResident, valueType);
for (SearchGroup<Long> group : groups) {
groupMap.put(group.groupValue, new GroupCount(group.groupValue));
}
}
public void collect(int doc) throws IOException {
GroupCount groupCount = groupMap.get(groupFieldSource.getInt(doc));
if (groupCount != null) {
groupCount.uniqueValues.add(countFieldSource.getInt(doc));
}
}
}
static class BR extends NonSorted<BytesRef> {
private final BytesRef spare = new BytesRef();
BR(String groupField, String countField, Collection<SearchGroup<BytesRef>> groups, boolean diskResident, Type valueType) {
super(groupField, countField, diskResident, valueType);
for (SearchGroup<BytesRef> group : groups) {
groupMap.put(group.groupValue, new GroupCount(group.groupValue));
}
}
public void collect(int doc) throws IOException {
GroupCount groupCount = groupMap.get(groupFieldSource.getBytes(doc, spare));
if (groupCount != null) {
BytesRef countValue = countFieldSource.getBytes(doc, spare);
if (!groupCount.uniqueValues.contains(countValue)) {
groupCount.uniqueValues.add(BytesRef.deepCopyOf(countValue));
}
}
}
}
static class GroupCount extends AbstractDistinctValuesCollector.GroupCount<Comparable<?>> {
GroupCount(Comparable<?> groupValue) {
super(groupValue);
}
}
}
static abstract class Sorted extends DVDistinctValuesCollector<Sorted.GroupCount> {
final SentinelIntSet ordSet;
final GroupCount groupCounts[];
final List<GroupCount> groups = new ArrayList<GroupCount>();
DocValues.SortedSource groupFieldSource;
DocValues.SortedSource countFieldSource;
Sorted(String groupField, String countField, int groupSize, boolean diskResident, Type valueType) {
super(groupField, countField, diskResident, valueType);
ordSet = new SentinelIntSet(groupSize, -1);
groupCounts = new GroupCount[ordSet.keys.length];
}
public List<GroupCount> getGroups() {
return groups;
}
public void setNextReader(AtomicReaderContext context) throws IOException {
groupFieldSource = retrieveSortedSource(groupField, context);
countFieldSource = retrieveSortedSource(countField, context);
ordSet.clear();
}
private DocValues.SortedSource retrieveSortedSource(String field, AtomicReaderContext context) throws IOException {
DocValues countFieldDv = context.reader().docValues(field);
if (countFieldDv != null) {
return diskResident ? countFieldDv.getDirectSource().asSortedSource() : countFieldDv.getSource().asSortedSource();
} else {
return DocValues.getDefaultSortedSource(valueType, context.reader().maxDoc());
}
}
static class BR extends Sorted {
final BytesRef spare = new BytesRef();
BR(String groupField, String countField, Collection<SearchGroup<BytesRef>> searchGroups, boolean diskResident, Type valueType) {
super(groupField, countField, searchGroups.size(), diskResident, valueType);
for (SearchGroup<BytesRef> group : searchGroups) {
this.groups.add(new GroupCount(group.groupValue));
}
}
public void collect(int doc) throws IOException {
int slot = ordSet.find(groupFieldSource.ord(doc));
if (slot < 0) {
return;
}
GroupCount gc = groupCounts[slot];
int countOrd = countFieldSource.ord(doc);
if (doesNotContainsOrd(countOrd, gc.ords)) {
gc.uniqueValues.add(countFieldSource.getByOrd(countOrd, new BytesRef()));
gc.ords = Arrays.copyOf(gc.ords, gc.ords.length + 1);
gc.ords[gc.ords.length - 1] = countOrd;
if (gc.ords.length > 1) {
Arrays.sort(gc.ords);
}
}
}
private boolean doesNotContainsOrd(int ord, int[] ords) {
if (ords.length == 0) {
return true;
} else if (ords.length == 1) {
return ord != ords[0];
}
return Arrays.binarySearch(ords, ord) < 0;
}
@Override
public void setNextReader(AtomicReaderContext context) throws IOException {
super.setNextReader(context);
for (GroupCount group : groups) {
int groupOrd = groupFieldSource.getOrdByValue((BytesRef) group.groupValue, spare);
if (groupOrd < 0) {
continue;
}
groupCounts[ordSet.put(groupOrd)] = group;
group.ords = new int[group.uniqueValues.size()];
Arrays.fill(group.ords, -1);
int i = 0;
for (Comparable<?> value : group.uniqueValues) {
int countOrd = countFieldSource.getOrdByValue((BytesRef) value, spare);
if (countOrd >= 0) {
group.ords[i++] = countOrd;
}
}
}
}
}
static class GroupCount extends AbstractDistinctValuesCollector.GroupCount<Comparable<?>> {
int[] ords;
GroupCount(Comparable<?> groupValue) {
super(groupValue);
}
}
}
}

View File

@ -0,0 +1,86 @@
package org.apache.lucene.search.grouping.function;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.grouping.AbstractDistinctValuesCollector;
import org.apache.lucene.search.grouping.SearchGroup;
import org.apache.lucene.util.mutable.MutableValue;
import java.io.IOException;
import java.util.*;
/**
* Function based implementation of {@link org.apache.lucene.search.grouping.AbstractDistinctValuesCollector}.
*
* @lucene.experimental
*/
public class FunctionDistinctValuesCollector extends AbstractDistinctValuesCollector<FunctionDistinctValuesCollector.GroupCount> {
private final Map<?, ?> vsContext;
private final ValueSource groupSource;
private final ValueSource countSource;
private final Map<MutableValue, GroupCount> groupMap;
private FunctionValues.ValueFiller groupFiller;
private FunctionValues.ValueFiller countFiller;
private MutableValue groupMval;
private MutableValue countMval;
public FunctionDistinctValuesCollector(Map<?, ?> vsContext, ValueSource groupSource, ValueSource countSource, Collection<SearchGroup<MutableValue>> groups) {
this.vsContext = vsContext;
this.groupSource = groupSource;
this.countSource = countSource;
groupMap = new LinkedHashMap<MutableValue, GroupCount>();
for (SearchGroup<MutableValue> group : groups) {
groupMap.put(group.groupValue, new GroupCount(group.groupValue));
}
}
public List<GroupCount> getGroups() {
return new ArrayList<GroupCount>(groupMap.values());
}
public void collect(int doc) throws IOException {
groupFiller.fillValue(doc);
GroupCount groupCount = groupMap.get(groupMval);
if (groupCount != null) {
countFiller.fillValue(doc);
groupCount.uniqueValues.add(countMval.duplicate());
}
}
public void setNextReader(AtomicReaderContext context) throws IOException {
FunctionValues values = groupSource.getValues(vsContext, context);
groupFiller = values.getValueFiller();
groupMval = groupFiller.getValue();
values = countSource.getValues(vsContext, context);
countFiller = values.getValueFiller();
countMval = countFiller.getValue();
}
static class GroupCount extends AbstractDistinctValuesCollector.GroupCount<MutableValue> {
GroupCount(MutableValue groupValue) {
super(groupValue);
}
}
}

View File

@ -0,0 +1,136 @@
package org.apache.lucene.search.grouping.term;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.FieldCache.DocTermsIndex; // javadocs
import org.apache.lucene.search.grouping.AbstractDistinctValuesCollector;
import org.apache.lucene.search.grouping.SearchGroup;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SentinelIntSet;
import java.io.IOException;
import java.util.*;
/**
* A term based implementation of {@link org.apache.lucene.search.grouping.AbstractDistinctValuesCollector} that relies
* on {@link DocTermsIndex} to count the distinct values per group.
*
* @lucene.experimental
*/
public class TermDistinctValuesCollector extends AbstractDistinctValuesCollector<TermDistinctValuesCollector.GroupCount> {
private final String groupField;
private final String countField;
private final List<GroupCount> groups;
private final SentinelIntSet ordSet;
private final GroupCount groupCounts[];
private final BytesRef spare = new BytesRef();
private FieldCache.DocTermsIndex groupFieldTermIndex;
private FieldCache.DocTermsIndex countFieldTermIndex;
/**
* Constructs {@link TermDistinctValuesCollector} instance.
*
* @param groupField The field to group by
* @param countField The field to count distinct values for
* @param groups The top N groups, collected during the first phase search
*/
public TermDistinctValuesCollector(String groupField, String countField, Collection<SearchGroup<BytesRef>> groups) {
this.groupField = groupField;
this.countField = countField;
this.groups = new ArrayList<GroupCount>(groups.size());
for (SearchGroup<BytesRef> group : groups) {
this.groups.add(new GroupCount(group.groupValue));
}
ordSet = new SentinelIntSet(groups.size(), -1);
groupCounts = new GroupCount[ordSet.keys.length];
}
public void collect(int doc) throws IOException {
int slot = ordSet.find(groupFieldTermIndex.getOrd(doc));
if (slot < 0) {
return;
}
GroupCount gc = groupCounts[slot];
int countOrd = countFieldTermIndex.getOrd(doc);
if (doesNotContainsOrd(countOrd, gc.ords)) {
if (countOrd == 0) {
gc.uniqueValues.add(null);
} else {
gc.uniqueValues.add(countFieldTermIndex.lookup(countOrd, new BytesRef()));
}
gc.ords = Arrays.copyOf(gc.ords, gc.ords.length + 1);
gc.ords[gc.ords.length - 1] = countOrd;
if (gc.ords.length > 1) {
Arrays.sort(gc.ords);
}
}
}
private boolean doesNotContainsOrd(int ord, int[] ords) {
if (ords.length == 0) {
return true;
} else if (ords.length == 1) {
return ord != ords[0];
}
return Arrays.binarySearch(ords, ord) < 0;
}
public List<GroupCount> getGroups() {
return groups;
}
public void setNextReader(AtomicReaderContext context) throws IOException {
groupFieldTermIndex = FieldCache.DEFAULT.getTermsIndex(context.reader(), groupField);
countFieldTermIndex = FieldCache.DEFAULT.getTermsIndex(context.reader(), countField);
ordSet.clear();
for (GroupCount group : groups) {
int groupOrd = group.groupValue == null ? 0 : groupFieldTermIndex.binarySearchLookup(group.groupValue, spare);
if (groupOrd < 0) {
continue;
}
groupCounts[ordSet.put(groupOrd)] = group;
group.ords = new int[group.uniqueValues.size()];
Arrays.fill(group.ords, -1);
int i = 0;
for (BytesRef value : group.uniqueValues) {
int countOrd = value == null ? 0 : countFieldTermIndex.binarySearchLookup(value, new BytesRef());
if (countOrd >= 0) {
group.ords[i++] = countOrd;
}
}
}
}
static class GroupCount extends AbstractDistinctValuesCollector.GroupCount<BytesRef> {
int[] ords;
GroupCount(BytesRef groupValue) {
super(groupValue);
}
}
}

View File

@ -25,6 +25,7 @@ import org.apache.lucene.util._TestUtil;
*/
// TODO (MvG) : The grouping tests contain a lot of code duplication. Try to move the common code to this class..
public abstract class AbstractGroupingTestCase extends LuceneTestCase {
protected String generateRandomNonEmptyString() {
String randomValue;
do {
@ -34,4 +35,5 @@ public abstract class AbstractGroupingTestCase extends LuceneTestCase {
} while ("".equals(randomValue));
return randomValue;
}
}

View File

@ -506,6 +506,7 @@ public class AllGroupHeadsCollectorTest extends LuceneTestCase {
};
}
@SuppressWarnings({"unchecked","rawtypes"})
private AbstractAllGroupHeadsCollector<?> createRandomCollector(String groupField, Sort sortWithinGroup, boolean canUseIDV, Type valueType) throws IOException {
AbstractAllGroupHeadsCollector<? extends AbstractAllGroupHeadsCollector.GroupHead> collector;
if (random.nextBoolean()) {

View File

@ -0,0 +1,497 @@
package org.apache.lucene.search.grouping;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.queries.function.valuesource.BytesRefFieldSource;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.grouping.dv.DVDistinctValuesCollector;
import org.apache.lucene.search.grouping.dv.DVFirstPassGroupingCollector;
import org.apache.lucene.search.grouping.function.FunctionDistinctValuesCollector;
import org.apache.lucene.search.grouping.function.FunctionFirstPassGroupingCollector;
import org.apache.lucene.search.grouping.term.TermDistinctValuesCollector;
import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.mutable.MutableValue;
import org.apache.lucene.util.mutable.MutableValueStr;
import java.io.IOException;
import java.util.*;
public class DistinctValuesCollectorTest extends AbstractGroupingTestCase {
private final static NullComparator nullComparator = new NullComparator();
private final String groupField = "author";
private final String countField = "publisher";
public void testSimple() throws Exception {
DocValues.Type[] dvTypes = new DocValues.Type[]{
DocValues.Type.VAR_INTS,
DocValues.Type.FLOAT_64,
DocValues.Type.BYTES_VAR_STRAIGHT,
DocValues.Type.BYTES_VAR_SORTED
};
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(
random,
dir,
newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
boolean canUseDV = !"Lucene3x".equals(w.w.getConfig().getCodec().getName());
DocValues.Type dvType = canUseDV ? dvTypes[random.nextInt(dvTypes.length)] : null;
Document doc = new Document();
addField(doc, groupField, "1", dvType);
addField(doc, countField, "1", dvType);
doc.add(new Field("content", "random text", TextField.TYPE_UNSTORED));
doc.add(new Field("id", "1", StringField.TYPE_UNSTORED));
w.addDocument(doc);
// 1
doc = new Document();
addField(doc, groupField, "1", dvType);
addField(doc, countField, "1", dvType);
doc.add(new Field("content", "some more random text blob", TextField.TYPE_UNSTORED));
doc.add(new Field("id", "2", StringField.TYPE_UNSTORED));
w.addDocument(doc);
// 2
doc = new Document();
addField(doc, groupField, "1", dvType);
addField(doc, countField, "2", dvType);
doc.add(new Field("content", "some more random textual data", TextField.TYPE_UNSTORED));
doc.add(new Field("id", "3", StringField.TYPE_UNSTORED));
w.addDocument(doc);
w.commit(); // To ensure a second segment
// 3
doc = new Document();
addField(doc, groupField, "2", dvType);
doc.add(new Field("content", "some random text", TextField.TYPE_UNSTORED));
doc.add(new Field("id", "4", StringField.TYPE_UNSTORED));
w.addDocument(doc);
// 4
doc = new Document();
addField(doc, groupField, "3", dvType);
addField(doc, countField, "1", dvType);
doc.add(new Field("content", "some more random text", TextField.TYPE_UNSTORED));
doc.add(new Field("id", "5", StringField.TYPE_UNSTORED));
w.addDocument(doc);
// 5
doc = new Document();
addField(doc, groupField, "3", dvType);
addField(doc, countField, "1", dvType);
doc.add(new Field("content", "random blob", TextField.TYPE_UNSTORED));
doc.add(new Field("id", "6", StringField.TYPE_UNSTORED));
w.addDocument(doc);
// 6 -- no author field
doc = new Document();
doc.add(new Field("content", "random word stuck in alot of other text", TextField.TYPE_STORED));
addField(doc, countField, "1", dvType);
doc.add(new Field("id", "6", StringField.TYPE_UNSTORED));
w.addDocument(doc);
IndexSearcher indexSearcher = newSearcher(w.getReader());
w.close();
Comparator<AbstractDistinctValuesCollector.GroupCount<Comparable<Object>>> cmp = new Comparator<AbstractDistinctValuesCollector.GroupCount<Comparable<Object>>>() {
public int compare(AbstractDistinctValuesCollector.GroupCount<Comparable<Object>> groupCount1, AbstractDistinctValuesCollector.GroupCount<Comparable<Object>> groupCount2) {
if (groupCount1.groupValue == null) {
if (groupCount2.groupValue == null) {
return 0;
}
return -1;
} else if (groupCount2.groupValue == null) {
return 1;
} else {
return groupCount1.groupValue.compareTo(groupCount2.groupValue);
}
}
};
// === Search for content:random
AbstractFirstPassGroupingCollector<Comparable<Object>> firstCollector = createRandomFirstPassCollector(dvType, new Sort(), groupField, 10);
indexSearcher.search(new TermQuery(new Term("content", "random")), firstCollector);
AbstractDistinctValuesCollector<? extends AbstractDistinctValuesCollector.GroupCount<Comparable<Object>>> distinctValuesCollector
= createDistinctCountCollector(firstCollector, groupField, countField, dvType);
indexSearcher.search(new TermQuery(new Term("content", "random")), distinctValuesCollector);
List<? extends AbstractDistinctValuesCollector.GroupCount<Comparable<Object>>> gcs = distinctValuesCollector.getGroups();
Collections.sort(gcs, cmp);
assertEquals(4, gcs.size());
compareNull(gcs.get(0).groupValue);
List<Comparable<?>> countValues = new ArrayList<Comparable<?>>(gcs.get(0).uniqueValues);
assertEquals(1, countValues.size());
compare("1", countValues.get(0));
compare("1", gcs.get(1).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(1).uniqueValues);
Collections.sort(countValues, nullComparator);
assertEquals(2, countValues.size());
compare("1", countValues.get(0));
compare("2", countValues.get(1));
compare("2", gcs.get(2).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(2).uniqueValues);
assertEquals(1, countValues.size());
compareNull(countValues.get(0));
compare("3", gcs.get(3).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(3).uniqueValues);
assertEquals(1, countValues.size());
compare("1", countValues.get(0));
// === Search for content:some
firstCollector = createRandomFirstPassCollector(dvType, new Sort(), groupField, 10);
indexSearcher.search(new TermQuery(new Term("content", "some")), firstCollector);
distinctValuesCollector = createDistinctCountCollector(firstCollector, groupField, countField, dvType);
indexSearcher.search(new TermQuery(new Term("content", "some")), distinctValuesCollector);
gcs = distinctValuesCollector.getGroups();
Collections.sort(gcs, cmp);
assertEquals(3, gcs.size());
compare("1", gcs.get(0).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(0).uniqueValues);
assertEquals(2, countValues.size());
Collections.sort(countValues, nullComparator);
compare("1", countValues.get(0));
compare("2", countValues.get(1));
compare("2", gcs.get(1).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(1).uniqueValues);
assertEquals(1, countValues.size());
compareNull(countValues.get(0));
compare("3", gcs.get(2).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(2).uniqueValues);
assertEquals(1, countValues.size());
compare("1", countValues.get(0));
// === Search for content:blob
firstCollector = createRandomFirstPassCollector(dvType, new Sort(), groupField, 10);
indexSearcher.search(new TermQuery(new Term("content", "blob")), firstCollector);
distinctValuesCollector = createDistinctCountCollector(firstCollector, groupField, countField, dvType);
indexSearcher.search(new TermQuery(new Term("content", "blob")), distinctValuesCollector);
gcs = distinctValuesCollector.getGroups();
Collections.sort(gcs, cmp);
assertEquals(2, gcs.size());
compare("1", gcs.get(0).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(0).uniqueValues);
// B/c the only one document matched with blob inside the author 1 group
assertEquals(1, countValues.size());
compare("1", countValues.get(0));
compare("3", gcs.get(1).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(1).uniqueValues);
assertEquals(1, countValues.size());
compare("1", countValues.get(0));
indexSearcher.getIndexReader().close();
dir.close();
}
public void testRandom() throws Exception {
int numberOfRuns = _TestUtil.nextInt(random, 3, 6);
for (int indexIter = 0; indexIter < numberOfRuns; indexIter++) {
IndexContext context = createIndexContext();
for (int searchIter = 0; searchIter < 100; searchIter++) {
final IndexSearcher searcher = newSearcher(context.indexReader);
boolean useDv = context.dvType != null && random.nextBoolean();
DocValues.Type dvType = useDv ? context.dvType : null;
String term = context.contentStrings[random.nextInt(context.contentStrings.length)];
Sort groupSort = new Sort(new SortField("id", SortField.Type.STRING));
int topN = 1 + random.nextInt(10);
List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> expectedResult = createExpectedResult(context, term, groupSort, topN);
AbstractFirstPassGroupingCollector<Comparable<?>> firstCollector = createRandomFirstPassCollector(dvType, groupSort, groupField, topN);
searcher.search(new TermQuery(new Term("content", term)), firstCollector);
AbstractDistinctValuesCollector<? extends AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> distinctValuesCollector
= createDistinctCountCollector(firstCollector, groupField, countField, dvType);
searcher.search(new TermQuery(new Term("content", term)), distinctValuesCollector);
@SuppressWarnings("unchecked")
List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> actualResult = (List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>>) distinctValuesCollector.getGroups();
if (VERBOSE) {
System.out.println("Index iter=" + indexIter);
System.out.println("Search iter=" + searchIter);
System.out.println("Collector class name=" + distinctValuesCollector.getClass().getName());
}
assertEquals(expectedResult.size(), actualResult.size());
for (int i = 0; i < expectedResult.size(); i++) {
AbstractDistinctValuesCollector.GroupCount<Comparable<?>> expected = expectedResult.get(i);
AbstractDistinctValuesCollector.GroupCount<Comparable<?>> actual = actualResult.get(i);
assertValues(expected.groupValue, actual.groupValue);
assertEquals(expected.uniqueValues.size(), actual.uniqueValues.size());
List<Comparable<?>> expectedUniqueValues = new ArrayList<Comparable<?>>(expected.uniqueValues);
Collections.sort(expectedUniqueValues, nullComparator);
List<Comparable<?>> actualUniqueValues = new ArrayList<Comparable<?>>(actual.uniqueValues);
Collections.sort(actualUniqueValues, nullComparator);
for (int j = 0; j < expected.uniqueValues.size(); j++) {
assertValues(expectedUniqueValues.get(j), actualUniqueValues.get(j));
}
}
}
context.indexReader.close();
context.directory.close();
}
}
private void assertValues(Object expected, Object actual) {
if (expected == null) {
compareNull(actual);
} else {
compare(((BytesRef) expected).utf8ToString(), actual);
}
}
private void compare(String expected, Object groupValue) {
if (BytesRef.class.isAssignableFrom(groupValue.getClass())) {
assertEquals(expected, ((BytesRef) groupValue).utf8ToString());
} else if (Double.class.isAssignableFrom(groupValue.getClass())) {
assertEquals(Double.parseDouble(expected), groupValue);
} else if (Long.class.isAssignableFrom(groupValue.getClass())) {
assertEquals(Long.parseLong(expected), groupValue);
} else if (MutableValue.class.isAssignableFrom(groupValue.getClass())) {
MutableValueStr mutableValue = new MutableValueStr();
mutableValue.value = new BytesRef(expected);
assertEquals(mutableValue, groupValue);
} else {
fail();
}
}
private void compareNull(Object groupValue) {
if (groupValue == null) {
return; // term based impl...
}
// DV based impls..
if (BytesRef.class.isAssignableFrom(groupValue.getClass())) {
assertEquals("", ((BytesRef) groupValue).utf8ToString());
} else if (Double.class.isAssignableFrom(groupValue.getClass())) {
assertEquals(0.0d, groupValue);
} else if (Long.class.isAssignableFrom(groupValue.getClass())) {
assertEquals(0L, groupValue);
// Function based impl
} else if (MutableValue.class.isAssignableFrom(groupValue.getClass())) {
assertFalse(((MutableValue) groupValue).exists());
} else {
fail();
}
}
private void addField(Document doc, String field, String value, DocValues.Type type) {
doc.add(new Field(field, value, StringField.TYPE_UNSTORED));
if (type == null) {
return;
}
DocValuesField valuesField = null;
switch (type) {
case VAR_INTS:
valuesField = new DocValuesField(field, Integer.parseInt(value), type);
break;
case FLOAT_64:
valuesField = new DocValuesField(field, Double.parseDouble(value), type);
break;
case BYTES_VAR_STRAIGHT:
case BYTES_VAR_SORTED:
valuesField = new DocValuesField(field, new BytesRef(value), type);
break;
}
doc.add(valuesField);
}
@SuppressWarnings({"unchecked","rawtypes"})
private <T extends Comparable> AbstractDistinctValuesCollector<AbstractDistinctValuesCollector.GroupCount<T>> createDistinctCountCollector(AbstractFirstPassGroupingCollector<T> firstPassGroupingCollector,
String groupField,
String countField,
DocValues.Type dvType) {
Collection<SearchGroup<T>> searchGroups = firstPassGroupingCollector.getTopGroups(0, false);
if (DVFirstPassGroupingCollector.class.isAssignableFrom(firstPassGroupingCollector.getClass())) {
boolean diskResident = random.nextBoolean();
return DVDistinctValuesCollector.create(groupField, countField, searchGroups, diskResident, dvType);
} else if (FunctionFirstPassGroupingCollector.class.isAssignableFrom(firstPassGroupingCollector.getClass())) {
return (AbstractDistinctValuesCollector) new FunctionDistinctValuesCollector(new HashMap<Object, Object>(), new BytesRefFieldSource(groupField), new BytesRefFieldSource(countField), (Collection) searchGroups);
} else {
return (AbstractDistinctValuesCollector) new TermDistinctValuesCollector(groupField, countField, (Collection) searchGroups);
}
}
@SuppressWarnings({"unchecked","rawtypes"})
private <T> AbstractFirstPassGroupingCollector<T> createRandomFirstPassCollector(DocValues.Type dvType, Sort groupSort, String groupField, int topNGroups) throws IOException {
if (dvType != null) {
if (random.nextBoolean()) {
boolean diskResident = random.nextBoolean();
return DVFirstPassGroupingCollector.create(groupSort, topNGroups, groupField, dvType, diskResident);
} else if (random.nextBoolean()) {
return (AbstractFirstPassGroupingCollector<T>) new FunctionFirstPassGroupingCollector(new BytesRefFieldSource(groupField), new HashMap<Object, Object>(), groupSort, topNGroups);
} else {
return (AbstractFirstPassGroupingCollector<T>) new TermFirstPassGroupingCollector(groupField, groupSort, topNGroups);
}
} else {
if (random.nextBoolean()) {
return (AbstractFirstPassGroupingCollector<T>) new FunctionFirstPassGroupingCollector(new BytesRefFieldSource(groupField), new HashMap<Object, Object>(), groupSort, topNGroups);
} else {
return (AbstractFirstPassGroupingCollector<T>) new TermFirstPassGroupingCollector(groupField, groupSort, topNGroups);
}
}
}
@SuppressWarnings({"unchecked","rawtypes"})
private List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> createExpectedResult(IndexContext context, String term, Sort groupSort, int topN) {
class GroupCount extends AbstractDistinctValuesCollector.GroupCount<BytesRef> {
GroupCount(BytesRef groupValue, Collection<BytesRef> uniqueValues) {
super(groupValue);
this.uniqueValues.addAll(uniqueValues);
}
}
List result = new ArrayList();
Map<String, Set<String>> groupCounts = context.searchTermToGroupCounts.get(term);
int i = 0;
for (String group : groupCounts.keySet()) {
if (topN <= i++) {
break;
}
Set<BytesRef> uniqueValues = new HashSet<BytesRef>();
for (String val : groupCounts.get(group)) {
uniqueValues.add(val != null ? new BytesRef(val) : null);
}
result.add(new GroupCount(group != null ? new BytesRef(group) : null, uniqueValues));
}
return result;
}
private IndexContext createIndexContext() throws Exception {
DocValues.Type[] dvTypes = new DocValues.Type[]{
DocValues.Type.BYTES_VAR_STRAIGHT,
DocValues.Type.BYTES_VAR_SORTED
};
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(
random,
dir,
newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())
);
boolean canUseDV = !"Lucene3x".equals(w.w.getConfig().getCodec().getName());
DocValues.Type dvType = canUseDV ? dvTypes[random.nextInt(dvTypes.length)] : null;
int numDocs = 86 + random.nextInt(1087) * RANDOM_MULTIPLIER;
String[] groupValues = new String[numDocs / 5];
String[] countValues = new String[numDocs / 10];
for (int i = 0; i < groupValues.length; i++) {
groupValues[i] = generateRandomNonEmptyString();
}
for (int i = 0; i < countValues.length; i++) {
countValues[i] = generateRandomNonEmptyString();
}
List<String> contentStrings = new ArrayList<String>();
Map<String, Map<String, Set<String>>> searchTermToGroupCounts = new HashMap<String, Map<String, Set<String>>>();
for (int i = 1; i <= numDocs; i++) {
String groupValue = random.nextInt(23) == 14 ? null : groupValues[random.nextInt(groupValues.length)];
String countValue = random.nextInt(21) == 13 ? null : countValues[random.nextInt(countValues.length)];
String content = "random" + random.nextInt(numDocs / 20);
Map<String, Set<String>> groupToCounts = searchTermToGroupCounts.get(content);
if (groupToCounts == null) {
// Groups sort always DOCID asc...
searchTermToGroupCounts.put(content, groupToCounts = new LinkedHashMap<String, Set<String>>());
contentStrings.add(content);
}
Set<String> countsVals = groupToCounts.get(groupValue);
if (countsVals == null) {
groupToCounts.put(groupValue, countsVals = new HashSet<String>());
}
countsVals.add(countValue);
Document doc = new Document();
doc.add(new Field("id", String.format("%09d", i), StringField.TYPE_UNSTORED));
if (groupValue != null) {
addField(doc, groupField, groupValue, dvType);
}
if (countValue != null) {
addField(doc, countField, countValue, dvType);
}
doc.add(new Field("content", content, TextField.TYPE_UNSTORED));
w.addDocument(doc);
}
DirectoryReader reader = w.getReader();
w.close();
return new IndexContext(dir, reader, dvType, searchTermToGroupCounts, contentStrings.toArray(new String[contentStrings.size()]));
}
private static class IndexContext {
final Directory directory;
final DirectoryReader indexReader;
final DocValues.Type dvType;
final Map<String, Map<String, Set<String>>> searchTermToGroupCounts;
final String[] contentStrings;
IndexContext(Directory directory, DirectoryReader indexReader, DocValues.Type dvType,
Map<String, Map<String, Set<String>>> searchTermToGroupCounts, String[] contentStrings) {
this.directory = directory;
this.indexReader = indexReader;
this.dvType = dvType;
this.searchTermToGroupCounts = searchTermToGroupCounts;
this.contentStrings = contentStrings;
}
}
private static class NullComparator implements Comparator<Comparable<?>> {
@SuppressWarnings({"unchecked","rawtypes"})
public int compare(Comparable a, Comparable b) {
if (a == b) {
return 0;
} else if (a == null) {
return -1;
} else if (b == null) {
return 1;
} else {
return a.compareTo(b);
}
}
}
}