diff --git a/lucene/src/java/org/apache/lucene/util/BytesRef.java b/lucene/src/java/org/apache/lucene/util/BytesRef.java index 6ad185f4ffa..8099722afe5 100644 --- a/lucene/src/java/org/apache/lucene/util/BytesRef.java +++ b/lucene/src/java/org/apache/lucene/util/BytesRef.java @@ -19,9 +19,6 @@ package org.apache.lucene.util; import java.util.Comparator; import java.io.UnsupportedEncodingException; -import java.io.ObjectInput; -import java.io.ObjectOutput; -import java.io.IOException; /** Represents byte[], as a slice (offset + length) into an * existing byte[]. @@ -192,6 +189,9 @@ public final class BytesRef implements Comparable { @Override public boolean equals(Object other) { + if (other == null) { + return false; + } return this.bytesEquals((BytesRef) other); } diff --git a/modules/build.xml b/modules/build.xml index 2608e3941e0..e2f9d6c0f09 100644 --- a/modules/build.xml +++ b/modules/build.xml @@ -24,6 +24,7 @@ + @@ -33,6 +34,7 @@ + @@ -42,6 +44,7 @@ + @@ -51,6 +54,7 @@ + @@ -61,6 +65,7 @@ + @@ -90,6 +95,7 @@ + diff --git a/modules/grouping/CHANGES.txt b/modules/grouping/CHANGES.txt new file mode 100644 index 00000000000..3c96651ddff --- /dev/null +++ b/modules/grouping/CHANGES.txt @@ -0,0 +1,8 @@ +Grouping Module Change Log + +======================= Trunk (not yet released) ======================= + +LUCENE-1421: create new grouping module, enabling search results to be +grouped by a single-valued indexed field. This module was factored +out of Solr's grouping implementation, except it cannot group by +function queries nor arbitrary queries. (Mike McCandless) diff --git a/modules/grouping/build.xml b/modules/grouping/build.xml new file mode 100644 index 00000000000..962eb81fabb --- /dev/null +++ b/modules/grouping/build.xml @@ -0,0 +1,13 @@ + + + + Collectors for grouping search results + + + + + + + + + diff --git a/modules/grouping/src/java/org/apache/lucene/search/grouping/CachingCollector.java b/modules/grouping/src/java/org/apache/lucene/search/grouping/CachingCollector.java new file mode 100644 index 00000000000..35facce993e --- /dev/null +++ b/modules/grouping/src/java/org/apache/lucene/search/grouping/CachingCollector.java @@ -0,0 +1,256 @@ +package org.apache.lucene.search.grouping; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * Caches all docs, and optionally also scores, coming from + * a search, and is then able to replay them to another + * collector. You specify the max RAM this class may use. 
+ * Once the collection is done, call {@link #isCached}. If
+ * this returns true, you can use {@link #replay} against a
+ * new collector. If it returns false, this means too much
+ * RAM was required and you must instead re-run the original
+ * search.
+ *
+ *

NOTE: this class consumes 4 (or 8 bytes, if + * scoring is cached) per collected document. If the result + * set is large this can easily be a very substantial amount + * of RAM! + * + * @lucene.experimental + */ +public class CachingCollector extends Collector { + + private static class SegStart { + public final AtomicReaderContext readerContext; + public final int end; + + public SegStart(AtomicReaderContext readerContext, int end) { + this.readerContext = readerContext; + this.end = end; + } + } + + // TODO: would be nice if a collector defined a + // needsScores() method so we can specialize / do checks + // up front: + private final Collector other; + private final int maxDocsToCache; + + private final Scorer cachedScorer; + private final List cachedDocs; + private final List cachedScores; + private final List cachedSegs = new ArrayList(); + + private Scorer scorer; + private int[] curDocs; + private float[] curScores; + private int upto; + private AtomicReaderContext lastReaderContext; + private float score; + private int base; + private int doc; + + public CachingCollector(Collector other, boolean cacheScores, double maxRAMMB) { + this.other = other; + if (cacheScores) { + cachedScorer = new Scorer(null) { + @Override + public float score() { + return score; + } + + @Override + public int advance(int target) { + throw new UnsupportedOperationException(); + } + + @Override + public int docID() { + return doc; + } + + @Override + public float freq() { + throw new UnsupportedOperationException(); + } + + @Override + public int nextDoc() { + throw new UnsupportedOperationException(); + } + }; + cachedScores = new ArrayList(); + curScores = new float[128]; + cachedScores.add(curScores); + } else { + cachedScorer = null; + cachedScores = null; + } + cachedDocs = new ArrayList(); + curDocs = new int[128]; + cachedDocs.add(curDocs); + + final int bytesPerDoc; + if (curScores != null) { + bytesPerDoc = RamUsageEstimator.NUM_BYTES_INT + RamUsageEstimator.NUM_BYTES_FLOAT; + } else { + bytesPerDoc = RamUsageEstimator.NUM_BYTES_INT; + } + maxDocsToCache = (int) ((maxRAMMB * 1024 * 1024)/bytesPerDoc); + } + + @Override + public void setScorer(Scorer scorer) throws IOException { + this.scorer = scorer; + other.setScorer(cachedScorer); + } + + @Override + public boolean acceptsDocsOutOfOrder() { + return other.acceptsDocsOutOfOrder(); + } + + @Override + public void collect(int doc) throws IOException { + + if (curDocs == null) { + // Cache was too large + if (curScores != null) { + score = scorer.score(); + } + this.doc = doc; + other.collect(doc); + return; + } + + if (upto == curDocs.length) { + base += upto; + final int nextLength; + // Max out at 512K arrays: + if (curDocs.length < 524288) { + nextLength = 8*curDocs.length; + } else { + nextLength = curDocs.length; + } + + if (base + nextLength > maxDocsToCache) { + // Too many docs to collect -- clear cache + curDocs = null; + if (curScores != null) { + score = scorer.score(); + } + this.doc = doc; + other.collect(doc); + cachedDocs.clear(); + cachedScores.clear(); + return; + } + curDocs = new int[nextLength]; + cachedDocs.add(curDocs); + if (curScores != null) { + curScores = new float[nextLength]; + cachedScores.add(curScores); + } + upto = 0; + } + curDocs[upto] = doc; + // TODO: maybe specialize private subclass so we don't + // null check per collect... 
+ if (curScores != null) { + score = curScores[upto] = scorer.score(); + } + upto++; + this.doc = doc; + other.collect(doc); + } + + public boolean isCached() { + return curDocs != null; + } + + @Override + public void setNextReader(AtomicReaderContext context) throws IOException { + other.setNextReader(context); + if (lastReaderContext != null) { + cachedSegs.add(new SegStart(lastReaderContext, base+upto)); + } + lastReaderContext = context; + } + + private final static int[] EMPTY_INT_ARRAY = new int[0]; + + @Override + public String toString() { + if (isCached()) { + return "CachingCollector (" + (base+upto) + " docs " + (curScores != null ? " & scores" : "") + " cached)"; + } else { + return "CachingCollector (cache was cleared)"; + } + } + + public void replay(Collector other) throws IOException { + if (!isCached()) { + throw new IllegalStateException("cannot replay: cache was cleared because too much RAM was required"); + } + //System.out.println("CC: replay totHits=" + (upto + base)); + if (lastReaderContext != null) { + cachedSegs.add(new SegStart(lastReaderContext, base+upto)); + lastReaderContext = null; + } + final int uptoSav = upto; + final int baseSav = base; + try { + upto = 0; + base = 0; + int chunkUpto = 0; + other.setScorer(cachedScorer); + curDocs = EMPTY_INT_ARRAY; + for(SegStart seg : cachedSegs) { + other.setNextReader(seg.readerContext); + while(base+upto < seg.end) { + if (upto == curDocs.length) { + base += curDocs.length; + curDocs = cachedDocs.get(chunkUpto); + if (curScores != null) { + curScores = cachedScores.get(chunkUpto); + } + chunkUpto++; + upto = 0; + } + if (curScores != null) { + score = curScores[upto]; + } + other.collect(curDocs[upto++]); + } + } + } finally { + upto = uptoSav; + base = baseSav; + } + } +} diff --git a/modules/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java b/modules/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java new file mode 100644 index 00000000000..1b2718980aa --- /dev/null +++ b/modules/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java @@ -0,0 +1,362 @@ +package org.apache.lucene.search.grouping; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashMap; +import java.util.TreeSet; + +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.FieldCache; +import org.apache.lucene.search.FieldComparator; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.util.BytesRef; + +/** FirstPassGroupingCollector is the first of two passes necessary + * to collected grouped hits. This pass gathers the top N sorted + * groups. + * + * @lucene.experimental + */ + +public class FirstPassGroupingCollector extends Collector { + + private final String groupField; + private final Sort groupSort; + private final FieldComparator[] comparators; + private final int[] reversed; + private final int topNGroups; + private final HashMap groupMap; + private final BytesRef scratchBytesRef = new BytesRef(); + private final int compIDXEnd; + + // Set once we reach topNGroups unique groups: + private TreeSet orderedGroups; + private int docBase; + private int spareSlot; + private FieldCache.DocTermsIndex index; + + /** + * Create the first pass collector. + * + * @param groupField The field used to group + * documents. This field must be single-valued and + * indexed (FieldCache is used to access its value + * per-document). + * @param groupSort The {@link Sort} used to sort the + * groups. The top sorted document within each group + * according to groupSort, determines how that group + * sorts against other groups. This must be non-null, + * ie, if you want to groupSort by relevance use + * Sort.RELEVANCE. + * @param topNGroups How many top groups to keep. + */ + public FirstPassGroupingCollector(String groupField, Sort groupSort, int topNGroups) throws IOException { + if (topNGroups < 1) { + throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")"); + } + + this.groupField = groupField; + // TODO: allow null groupSort to mean "by relevance", + // and specialize it? + this.groupSort = groupSort; + + this.topNGroups = topNGroups; + + final SortField[] sortFields = groupSort.getSort(); + comparators = new FieldComparator[sortFields.length]; + compIDXEnd = comparators.length - 1; + reversed = new int[sortFields.length]; + for (int i = 0; i < sortFields.length; i++) { + final SortField sortField = sortFields[i]; + + // use topNGroups + 1 so we have a spare slot to use for comparing (tracked by this.spareSlot): + comparators[i] = sortField.getComparator(topNGroups + 1, i); + reversed[i] = sortField.getReverse() ? -1 : 1; + } + + spareSlot = topNGroups; + groupMap = new HashMap(topNGroups); + } + + /** Returns top groups, starting from offset. This may + * return null, if no groups were collected, or if the + * number of unique groups collected is <= offset. 
*/ + public Collection getTopGroups(int groupOffset, boolean fillFields) { + + //System.out.println("FP.getTopGroups groupOffset=" + groupOffset + " fillFields=" + fillFields + " groupMap.size()=" + groupMap.size()); + + if (groupOffset < 0) { + throw new IllegalArgumentException("groupOffset must be >= 0 (got " + groupOffset + ")"); + } + + if (groupMap.size() <= groupOffset) { + return null; + } + + if (orderedGroups == null) { + buildSortedSet(); + } + + final Collection result = new ArrayList(); + int upto = 0; + final int sortFieldCount = groupSort.getSort().length; + for(CollectedSearchGroup group : orderedGroups) { + if (upto++ < groupOffset) { + continue; + } + //System.out.println(" group=" + (group.groupValue == null ? "null" : group.groupValue.utf8ToString())); + SearchGroup searchGroup = new SearchGroup(); + searchGroup.groupValue = group.groupValue; + if (fillFields) { + searchGroup.sortValues = new Comparable[sortFieldCount]; + for(int sortFieldIDX=0;sortFieldIDX 0) { + // Definitely competitive; set remaining comparators: + for (int compIDX2=compIDX+1; compIDX2 comparator = new Comparator() { + public int compare(CollectedSearchGroup o1, CollectedSearchGroup o2) { + for (int compIDX = 0;; compIDX++) { + FieldComparator fc = comparators[compIDX]; + final int c = reversed[compIDX] * fc.compare(o1.comparatorSlot, o2.comparatorSlot); + if (c != 0) { + return c; + } else if (compIDX == compIDXEnd) { + return o1.topDoc - o2.topDoc; + } + } + } + }; + + orderedGroups = new TreeSet(comparator); + orderedGroups.addAll(groupMap.values()); + assert orderedGroups.size() > 0; + + for (FieldComparator fc : comparators) { + fc.setBottom(orderedGroups.last().comparatorSlot); + } + } + + @Override + public boolean acceptsDocsOutOfOrder() { + return false; + } + + @Override + public void setNextReader(AtomicReaderContext readerContext) throws IOException { + docBase = readerContext.docBase; + index = FieldCache.DEFAULT.getTermsIndex(readerContext.reader, groupField); + + for (int i=0; i groupMap; + + private FieldCache.DocTermsIndex index; + private final String groupField; + private final int maxDocsPerGroup; + private final SentinelIntSet ordSet; + private final SearchGroupDocs[] groupDocs; + private final BytesRef spareBytesRef = new BytesRef(); + private final Collection groups; + private final Sort withinGroupSort; + private final Sort groupSort; + + private int totalHitCount; + private int totalGroupedHitCount; + + public SecondPassGroupingCollector(String groupField, Collection groups, Sort groupSort, Sort withinGroupSort, + int maxDocsPerGroup, boolean getScores, boolean getMaxScores, boolean fillSortFields) + throws IOException { + + //System.out.println("SP init"); + if (groups.size() == 0) { + throw new IllegalArgumentException("no groups to collect (groups.size() is 0)"); + } + + this.groupSort = groupSort; + this.withinGroupSort = withinGroupSort; + this.groups = groups; + this.groupField = groupField; + this.maxDocsPerGroup = maxDocsPerGroup; + + groupMap = new HashMap(groups.size()); + + for (SearchGroup group : groups) { + //System.out.println(" prep group=" + (group.groupValue == null ? 
"null" : group.groupValue.utf8ToString())); + final TopDocsCollector collector; + if (withinGroupSort == null) { + // Sort by score + collector = TopScoreDocCollector.create(maxDocsPerGroup, true); + } else { + // Sort by fields + collector = TopFieldCollector.create(withinGroupSort, maxDocsPerGroup, fillSortFields, getScores, getMaxScores, true); + } + groupMap.put(group.groupValue, + new SearchGroupDocs(group.groupValue, + collector)); + } + + ordSet = new SentinelIntSet(groupMap.size(), -1); + groupDocs = new SearchGroupDocs[ordSet.keys.length]; + } + + @Override + public void setScorer(Scorer scorer) throws IOException { + for (SearchGroupDocs group : groupMap.values()) { + group.collector.setScorer(scorer); + } + } + + @Override + public void collect(int doc) throws IOException { + final int slot = ordSet.find(index.getOrd(doc)); + //System.out.println("SP.collect doc=" + doc + " slot=" + slot); + totalHitCount++; + if (slot >= 0) { + totalGroupedHitCount++; + groupDocs[slot].collector.collect(doc); + } + } + + @Override + public void setNextReader(AtomicReaderContext readerContext) throws IOException { + //System.out.println("SP.setNextReader"); + for (SearchGroupDocs group : groupMap.values()) { + group.collector.setNextReader(readerContext); + } + index = FieldCache.DEFAULT.getTermsIndex(readerContext.reader, groupField); + + // Rebuild ordSet + ordSet.clear(); + for (SearchGroupDocs group : groupMap.values()) { + //System.out.println(" group=" + (group.groupValue == null ? "null" : group.groupValue.utf8ToString())); + int ord = group.groupValue == null ? 0 : index.binarySearchLookup(group.groupValue, spareBytesRef); + if (ord >= 0) { + groupDocs[ordSet.put(ord)] = group; + } + } + } + + @Override + public boolean acceptsDocsOutOfOrder() { + return false; + } + + public TopGroups getTopGroups(int withinGroupOffset) { + final GroupDocs[] groupDocsResult = new GroupDocs[groups.size()]; + + int groupIDX = 0; + for(SearchGroup group : groups) { + final SearchGroupDocs groupDocs = groupMap.get(group.groupValue); + final TopDocs topDocs = groupDocs.collector.topDocs(withinGroupOffset, maxDocsPerGroup); + groupDocsResult[groupIDX++] = new GroupDocs(topDocs.getMaxScore(), + topDocs.totalHits, + topDocs.scoreDocs, + groupDocs.groupValue, + group.sortValues); + } + + return new TopGroups(groupSort.getSort(), + withinGroupSort == null ? null : withinGroupSort.getSort(), + totalHitCount, totalGroupedHitCount, groupDocsResult); + } +} + + +// TODO: merge with SearchGroup or not? +// ad: don't need to build a new hashmap +// disad: blows up the size of SearchGroup if we need many of them, and couples implementations +class SearchGroupDocs { + public final BytesRef groupValue; + public final TopDocsCollector collector; + + public SearchGroupDocs(BytesRef groupValue, TopDocsCollector collector) { + this.groupValue = groupValue; + this.collector = collector; + } +} diff --git a/modules/grouping/src/java/org/apache/lucene/search/grouping/SentinelIntSet.java b/modules/grouping/src/java/org/apache/lucene/search/grouping/SentinelIntSet.java new file mode 100644 index 00000000000..21da977fc95 --- /dev/null +++ b/modules/grouping/src/java/org/apache/lucene/search/grouping/SentinelIntSet.java @@ -0,0 +1,116 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.grouping; + +import java.util.Arrays; + +/** A native int set where one value is reserved to mean "EMPTY" */ +class SentinelIntSet { + public int[] keys; + public int count; + public final int emptyVal; + public int rehashCount; // the count at which a rehash should be done + + public SentinelIntSet(int size, int emptyVal) { + this.emptyVal = emptyVal; + int tsize = Math.max(org.apache.lucene.util.BitUtil.nextHighestPowerOfTwo(size), 1); + rehashCount = tsize - (tsize>>2); + if (size >= rehashCount) { // should be able to hold "size" w/o rehashing + tsize <<= 1; + rehashCount = tsize - (tsize>>2); + } + keys = new int[tsize]; + if (emptyVal != 0) + clear(); + } + + public void clear() { + Arrays.fill(keys, emptyVal); + count = 0; + } + + public int hash(int key) { + return key; + } + + public int size() { return count; } + + /** returns the slot for this key */ + public int getSlot(int key) { + assert key != emptyVal; + int h = hash(key); + int s = h & (keys.length-1); + if (keys[s] == key || keys[s]== emptyVal) return s; + + int increment = (h>>7)|1; + do { + s = (s + increment) & (keys.length-1); + } while (keys[s] != key && keys[s] != emptyVal); + return s; + } + + /** returns the slot for this key, or -slot-1 if not found */ + public int find(int key) { + assert key != emptyVal; + int h = hash(key); + int s = h & (keys.length-1); + if (keys[s] == key) return s; + if (keys[s] == emptyVal) return -s-1; + + int increment = (h>>7)|1; + for(;;) { + s = (s + increment) & (keys.length-1); + if (keys[s] == key) return s; + if (keys[s] == emptyVal) return -s-1; + } + } + + public boolean exists(int key) { + return find(key) >= 0; + } + + public int put(int key) { + int s = find(key); + if (s < 0) { + if (count >= rehashCount) { + rehash(); + s = getSlot(key); + } else { + s = -s-1; + } + count++; + keys[s] = key; + } + return s; + } + + public void rehash() { + int newSize = keys.length << 1; + int[] oldKeys = keys; + keys = new int[newSize]; + if (emptyVal != 0) Arrays.fill(keys, emptyVal); + + for (int i=0; i>2); + } +} diff --git a/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java b/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java new file mode 100644 index 00000000000..9b0381b58fb --- /dev/null +++ b/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java @@ -0,0 +1,51 @@ +package org.apache.lucene.search.grouping; + +import org.apache.lucene.search.SortField; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Represents result returned by a grouping search. + * + * Note that we do not return the total number of unique + * groups; doing so would be costly. + * + * @lucene.experimental */ +public class TopGroups { + /** Number of documents matching the search */ + public final int totalHitCount; + + /** Number of documents grouped into the topN groups */ + public final int totalGroupedHitCount; + + /** Group results in groupSort order */ + public final GroupDocs[] groups; + + /** How groups are sorted against each other */ + public final SortField[] groupSort; + + /** How docs are sorted within each group */ + public final SortField[] withinGroupSort; + + public TopGroups(SortField[] groupSort, SortField[] withinGroupSort, int totalHitCount, int totalGroupedHitCount, GroupDocs[] groups) { + this.groupSort = groupSort; + this.withinGroupSort = withinGroupSort; + this.totalHitCount = totalHitCount; + this.totalGroupedHitCount = totalGroupedHitCount; + this.groups = groups; + } +} diff --git a/modules/grouping/src/java/overview.html b/modules/grouping/src/java/overview.html new file mode 100644 index 00000000000..b00413ae0be --- /dev/null +++ b/modules/grouping/src/java/overview.html @@ -0,0 +1,105 @@ + + + +

+This module enables search result grouping with Lucene, where hits
+with the same value in the specified single-valued group field are
+grouped together.  For example, if you group by the author
+field, then all documents with the same value in the author
+field fall into a single group.
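+For instance (a minimal sketch; the field names mirror the unit test
+added in this patch, and an open IndexWriter w is assumed):
+
+  Document doc = new Document();
+  doc.add(new Field("author", "author1", Field.Store.YES, Field.Index.ANALYZED));
+  doc.add(new Field("content", "random text", Field.Store.YES, Field.Index.ANALYZED));
+  w.addDocument(doc);
+  // ...every document indexed with author=author1 falls into one group.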

+
+

+Grouping requires a number of inputs:

+  - groupField: the single-valued indexed field to group by; documents
+    without this field fall into a single group with a null group value.
+  - groupSort: how the groups are sorted against one another.
+  - topNGroups and groupOffset: how many top groups to keep, and which
+    slice of them to retrieve (for paging).
+  - withinGroupSort (docSort below), docsPerGroup and docOffset: how
+    documents are sorted and sliced within each group.

+The implementation is two-pass: the first pass ({@link
+  org.apache.lucene.search.grouping.FirstPassGroupingCollector})
+  gathers the top groups, and the second pass ({@link
+  org.apache.lucene.search.grouping.SecondPassGroupingCollector})
+  gathers documents within those groups.  If the search is costly to
+  run you may want to use the {@link
+  org.apache.lucene.search.grouping.CachingCollector} class, which
+  caches hits and can (quickly) replay them for the second pass.  This
+  way you only run the query once, but you pay a RAM cost to (briefly)
+  hold all hits.  Results are returned as a {@link
+  org.apache.lucene.search.grouping.TopGroups} instance.

+
+

+Known limitations:

+  - The group field must be a single-valued indexed field; FieldCache is
+    used to access its value for each document.
+  - Unlike Solr's implementation, grouping by function queries or
+    arbitrary queries is not supported.
+  - The total number of unique groups matching the search is not
+    returned (computing it would be costly).

+Typical usage looks like this (using the {@link org.apache.lucene.search.grouping.CachingCollector}):

+
+
+  FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("author", groupSort, groupOffset+topNGroups);
+
+  boolean cacheScores = true;
+  double maxCacheRAMMB = 4.0;
+  CachingCollector cachedCollector = new CachingCollector(c1, cacheScores, maxCacheRAMMB);
+  s.search(new TermQuery(new Term("content", searchTerm)), cachedCollector);
+
+  boolean fillFields = true;
+  Collection<SearchGroup> topGroups = c1.getTopGroups(groupOffset, fillFields);
+
+  if (topGroups == null) {
+    // No groups matched
+    return;
+  }
+
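+  // docSort, docOffset and docsPerGroup are assumed to be declared
+  // earlier (like groupSort and groupOffset above): how docs are sorted
+  // within each group, and which slice of each group to retrieve.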
+  boolean getScores = true;
+  boolean getMaxScores = true;
+  SecondPassGroupingCollector c2 = new SecondPassGroupingCollector("author", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
+
+  if (cachedCollector.isCached()) {
+    // Cache fit within maxCacheRAMMB, so we can replay it:
+    cachedCollector.replay(c2);
+  } else {
+    // Cache was too large; must re-execute query:
+    s.search(new TermQuery(new Term("content", searchTerm)), c2);
+  }
+
+  TopGroups groupsResult = c2.getTopGroups(docOffset);
+
+  // Render groupsResult...
+
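+  // A sketch of one way to do that (GroupDocs field names are from this
+  // patch; the output format here is just illustrative):
+  for (GroupDocs groupDocs : groupsResult.groups) {
+    String groupName = groupDocs.groupValue == null ? "(null)" : groupDocs.groupValue.utf8ToString();
+    System.out.println("group=" + groupName + " totalHits=" + groupDocs.totalHits);
+    for (ScoreDoc scoreDoc : groupDocs.scoreDocs) {
+      System.out.println("  doc=" + scoreDoc.doc + " score=" + scoreDoc.score);
+    }
+  }
+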
+ + + diff --git a/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java b/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java new file mode 100644 index 00000000000..a36bf3d6b62 --- /dev/null +++ b/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java @@ -0,0 +1,539 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.grouping; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.FieldCache; +import org.apache.lucene.search.FieldDoc; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; + +// TODO +// - should test relevance sort too +// - test null +// - test ties +// - test compound sort + +public class TestGrouping extends LuceneTestCase { + + public void testBasic() throws Exception { + + final String groupField = "author"; + + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter( + random, + dir, + newIndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())); + // 0 + Document doc = new Document(); + doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("content", "random text", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NO)); + w.addDocument(doc); + + // 1 + doc = new Document(); + doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("content", "some more random text", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", "2", Field.Store.YES, Field.Index.NO)); + w.addDocument(doc); + + // 2 + doc = new Document(); + doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("content", "some more random textual data", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", 
"3", Field.Store.YES, Field.Index.NO)); + w.addDocument(doc); + + // 3 + doc = new Document(); + doc.add(new Field(groupField, "author2", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("content", "some random text", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", "4", Field.Store.YES, Field.Index.NO)); + w.addDocument(doc); + + // 4 + doc = new Document(); + doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("content", "some more random text", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", "5", Field.Store.YES, Field.Index.NO)); + w.addDocument(doc); + + // 5 + doc = new Document(); + doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("content", "random", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NO)); + w.addDocument(doc); + + // 6 -- no author field + doc = new Document(); + doc.add(new Field("content", "random word stuck in alot of other text", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NO)); + w.addDocument(doc); + + IndexSearcher indexSearcher = new IndexSearcher(w.getReader()); + w.close(); + + final Sort groupSort = Sort.RELEVANCE; + final FirstPassGroupingCollector c1 = new FirstPassGroupingCollector(groupField, groupSort, 10); + indexSearcher.search(new TermQuery(new Term("content", "random")), c1); + + final SecondPassGroupingCollector c2 = new SecondPassGroupingCollector(groupField, c1.getTopGroups(0, true), groupSort, null, 5, true, false, true); + indexSearcher.search(new TermQuery(new Term("content", "random")), c2); + + final TopGroups groups = c2.getTopGroups(0); + + assertEquals(7, groups.totalHitCount); + assertEquals(7, groups.totalGroupedHitCount); + assertEquals(4, groups.groups.length); + + // relevance order: 5, 0, 3, 4, 1, 2, 6 + + // the later a document is added the higher this docId + // value + GroupDocs group = groups.groups[0]; + assertEquals(new BytesRef("author3"), group.groupValue); + assertEquals(2, group.scoreDocs.length); + assertEquals(5, group.scoreDocs[0].doc); + assertEquals(4, group.scoreDocs[1].doc); + assertTrue(group.scoreDocs[0].score > group.scoreDocs[1].score); + + group = groups.groups[1]; + assertEquals(new BytesRef("author1"), group.groupValue); + assertEquals(3, group.scoreDocs.length); + assertEquals(0, group.scoreDocs[0].doc); + assertEquals(1, group.scoreDocs[1].doc); + assertEquals(2, group.scoreDocs[2].doc); + assertTrue(group.scoreDocs[0].score > group.scoreDocs[1].score); + assertTrue(group.scoreDocs[1].score > group.scoreDocs[2].score); + + group = groups.groups[2]; + assertEquals(new BytesRef("author2"), group.groupValue); + assertEquals(1, group.scoreDocs.length); + assertEquals(3, group.scoreDocs[0].doc); + + group = groups.groups[3]; + assertNull(group.groupValue); + assertEquals(1, group.scoreDocs.length); + assertEquals(6, group.scoreDocs[0].doc); + + indexSearcher.getIndexReader().close(); + dir.close(); + } + + private static class GroupDoc { + final int id; + final BytesRef group; + final BytesRef sort1; + final BytesRef sort2; + final String content; + + public GroupDoc(int id, BytesRef group, BytesRef sort1, BytesRef sort2, String content) { + this.id = id; + this.group = group; + this.sort1 = sort1; + this.sort2 = sort2; + this.content = content; + } + } + + private Sort getRandomSort() { + final List sortFields = new ArrayList(); + if 
(random.nextBoolean()) { + if (random.nextBoolean()) { + sortFields.add(new SortField("sort1", SortField.STRING, random.nextBoolean())); + } else { + sortFields.add(new SortField("sort2", SortField.STRING, random.nextBoolean())); + } + } else if (random.nextBoolean()) { + sortFields.add(new SortField("sort1", SortField.STRING, random.nextBoolean())); + sortFields.add(new SortField("sort2", SortField.STRING, random.nextBoolean())); + } + sortFields.add(new SortField("id", SortField.INT)); + return new Sort(sortFields.toArray(new SortField[sortFields.size()])); + } + + private Comparator getComparator(Sort sort) { + final SortField[] sortFields = sort.getSort(); + return new Comparator() { + public int compare(GroupDoc d1, GroupDoc d2) { + for(SortField sf : sortFields) { + final int cmp; + if (sf.getField().equals("sort1")) { + cmp = d1.sort1.compareTo(d2.sort1); + } else if (sf.getField().equals("sort2")) { + cmp = d1.sort2.compareTo(d2.sort2); + } else { + assertEquals(sf.getField(), "id"); + cmp = d1.id - d2.id; + } + if (cmp != 0) { + return sf.getReverse() ? -cmp : cmp; + } + } + // Our sort always fully tie breaks: + fail(); + return 0; + } + }; + } + + private Comparable[] fillFields(GroupDoc d, Sort sort) { + final SortField[] sortFields = sort.getSort(); + final Comparable[] fields = new Comparable[sortFields.length]; + for(int fieldIDX=0;fieldIDX groupSortComp = getComparator(groupSort); + + Arrays.sort(groupDocs, groupSortComp); + final HashMap> groups = new HashMap>(); + final List sortedGroups = new ArrayList(); + final List sortedGroupFields = new ArrayList(); + + int totalHitCount = 0; + + for(GroupDoc d : groupDocs) { + // TODO: would be better to filter by searchTerm before sorting! + if (!d.content.equals(searchTerm)) { + continue; + } + totalHitCount++; + List l = groups.get(d.group); + if (l == null) { + sortedGroups.add(d.group); + if (fillFields) { + sortedGroupFields.add(fillFields(d, groupSort)); + } + l = new ArrayList(); + groups.put(d.group, l); + } + l.add(d); + } + + if (groupOffset >= sortedGroups.size()) { + // slice is out of bounds + return null; + } + + final int limit = Math.min(groupOffset + topNGroups, groups.size()); + + final Comparator docSortComp = getComparator(docSort); + final GroupDocs[] result = new GroupDocs[limit-groupOffset]; + int totalGroupedHitCount = 0; + for(int idx=groupOffset;idx < limit;idx++) { + final BytesRef group = sortedGroups.get(idx); + final List docs = groups.get(group); + totalGroupedHitCount += docs.size(); + Collections.sort(docs, docSortComp); + final ScoreDoc[] hits; + if (docs.size() > docOffset) { + final int docIDXLimit = Math.min(docOffset + docsPerGroup, docs.size()); + hits = new ScoreDoc[docIDXLimit - docOffset]; + for(int docIDX=docOffset; docIDX < docIDXLimit; docIDX++) { + final GroupDoc d = docs.get(docIDX); + final FieldDoc fd; + if (fillFields) { + fd = new FieldDoc(d.id, 0.0f, fillFields(d, docSort)); + } else { + fd = new FieldDoc(d.id, 0.0f); + } + hits[docIDX-docOffset] = fd; + } + } else { + hits = new ScoreDoc[0]; + } + + result[idx-groupOffset] = new GroupDocs(0.0f, + docs.size(), + hits, + group, + fillFields ? 
sortedGroupFields.get(idx) : null); + } + + return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result); + } + + public void testRandom() throws Exception { + for(int iter=0;iter<3;iter++) { + + if (VERBOSE) { + System.out.println("TEST: iter=" + iter); + } + + final int numDocs = _TestUtil.nextInt(random, 100, 1000) * RANDOM_MULTIPLIER; + //final int numDocs = _TestUtil.nextInt(random, 5, 20); + + final int numGroups = _TestUtil.nextInt(random, 1, numDocs); + + if (VERBOSE) { + System.out.println("TEST: numDocs=" + numDocs + " numGroups=" + numGroups); + } + + final List groups = new ArrayList(); + for(int i=0;i topGroups = c1.getTopGroups(groupOffset, fillFields); + final TopGroups groupsResult; + + if (topGroups != null) { + + if (VERBOSE) { + System.out.println("TEST: topGroups"); + for (SearchGroup searchGroup : topGroups) { + System.out.println(" " + (searchGroup.groupValue == null ? "null" : searchGroup.groupValue.utf8ToString()) + ": " + Arrays.deepToString(searchGroup.sortValues)); + } + } + + final SecondPassGroupingCollector c2 = new SecondPassGroupingCollector("group", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields); + if (doCache) { + if (cCache.isCached()) { + if (VERBOSE) { + System.out.println("TEST: cache is intact"); + } + cCache.replay(c2); + } else { + if (VERBOSE) { + System.out.println("TEST: cache was too large"); + } + s.search(new TermQuery(new Term("content", searchTerm)), c2); + } + } else { + s.search(new TermQuery(new Term("content", searchTerm)), c2); + } + + groupsResult = c2.getTopGroups(docOffset); + } else { + groupsResult = null; + if (VERBOSE) { + System.out.println("TEST: no results"); + } + } + + final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset); + + try { + // NOTE: intentional but temporary field cache insanity! + assertEquals(FieldCache.DEFAULT.getInts(r, "id"), expectedGroups, groupsResult); + } finally { + FieldCache.DEFAULT.purge(r); + } + } + + r.close(); + dir.close(); + } + } + + private void assertEquals(int[] docIDtoID, TopGroups expected, TopGroups actual) { + if (expected == null) { + assertNull(actual); + return; + } + assertNotNull(actual); + + assertEquals(expected.groups.length, actual.groups.length); + assertEquals(expected.totalHitCount, actual.totalHitCount); + assertEquals(expected.totalGroupedHitCount, actual.totalGroupedHitCount); + + for(int groupIDX=0;groupIDX
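A note on SentinelIntSet (added above): it is a small open-addressing int
hash set, used by SecondPassGroupingCollector to map the FieldCache
ordinals of the requested groups to collector slots. A usage sketch (the
key values here are made up; -1 matches the sentinel the collector itself
passes):

    SentinelIntSet ordSet = new SentinelIntSet(16, -1);
    int slot = ordSet.put(42);           // returns the slot the key occupies
    boolean present = ordSet.exists(42); // true
    int missing = ordSet.find(7);        // negative (-slot-1) when absent
    ordSet.clear();                      // refills keys[] with the sentinel (-1)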