mirror of https://github.com/apache/lucene.git
LUCENE-1421: fixup javadocs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1103150 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8153b53c8b
commit
a8993ca817
|
@ -41,6 +41,9 @@ import org.apache.lucene.util.RamUsageEstimator;
|
|||
* set is large this can easily be a very substantial amount
|
||||
* of RAM!
|
||||
*
|
||||
* <p>See {@link org.apache.lucene.search.grouping} for more
|
||||
* details including a full code example.</p>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class CachingCollector extends Collector {
|
||||
|
|
|
@ -34,9 +34,12 @@ import org.apache.lucene.search.SortField;
|
|||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/** FirstPassGroupingCollector is the first of two passes necessary
|
||||
* to collected grouped hits. This pass gathers the top N sorted
|
||||
* to collect grouped hits. This pass gathers the top N sorted
|
||||
* groups.
|
||||
*
|
||||
* <p>See {@link org.apache.lucene.search.grouping} for more
|
||||
* details including a full code example.</p>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
||||
|
|
|
@ -33,7 +33,14 @@ import org.apache.lucene.search.TopScoreDocCollector;
|
|||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* See {@link FirstPassGroupingCollector}.
|
||||
* SecondPassGroupingCollector is the second of two passes
|
||||
* necessary to collect grouped docs. This pass gathers the
|
||||
* top N documents per top group computed from the
|
||||
* first pass.
|
||||
*
|
||||
* <p>See {@link org.apache.lucene.search.grouping} for more
|
||||
* details including a full code example.</p>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class SecondPassGroupingCollector extends Collector {
|
||||
|
|
|
@ -0,0 +1,105 @@
|
|||
<html>
|
||||
<body>
|
||||
|
||||
<p>This module enables search result grouping with Lucene, where hits
|
||||
with the same value in the specified single-valued group field are
|
||||
grouped together. For example, if you group by the <tt>author</tt>
|
||||
field, then all documents with the same value in the <tt>author</tt>
|
||||
field fall into a single group.</p>
|
||||
|
||||
<p>Grouping requires a number of inputs:</p>
|
||||
|
||||
<ul>
|
||||
<li> <tt>groupField</tt>: this is the field used for grouping.
|
||||
For example, if you use the <tt>author</tt> field then each
|
||||
group has all books by the same author. Documents that don't
|
||||
have this field are grouped under a single group with
|
||||
a <tt>null</tt> group value.
|
||||
|
||||
<li> <tt>groupSort</tt>: how the groups are sorted. For sorting
|
||||
purposes, each group is "represented" by the highest-sorted
|
||||
document according to the <tt>groupSort</tt> within it. For
|
||||
example, if you specify "price" (ascending) then the first group
|
||||
is the one with the lowest price book within it. Or if you
|
||||
specify relevance group sort, then the first group is the one
|
||||
containing the highest scoring book.
|
||||
|
||||
<li> <tt>topNGroups</tt>: how many top groups to keep. For
|
||||
example, 10 means the top 10 groups are computed.
|
||||
|
||||
<li> <tt>groupOffset</tt>: which "slice" of top groups you want to
|
||||
retrieve. For example, 3 means you'll get 7 groups back
|
||||
(assuming <tt>topNGroups</tt> is 10). This is useful for
|
||||
paging, where you might show 5 groups per page.
|
||||
|
||||
<li> <tt>withinGroupSort</tt>: how the documents within each group
|
||||
are sorted. This can be different from the group sort.
|
||||
|
||||
<li> <tt>maxDocsPerGroup</tt>: how many top documents within each
|
||||
group to keep.
|
||||
|
||||
<li> <tt>withinGroupOffset</tt>: which "slice" of top
|
||||
documents you want to retrieve from each group.
|
||||
|
||||
</ul>
|
||||
|
||||
<p>The implementation is two-pass: the first pass ({@link
|
||||
org.apache.lucene.search.grouping.FirstPassGroupingCollector})
|
||||
gathers the top groups, and the second pass ({@link
|
||||
org.apache.lucene.search.grouping.SecondPassGroupingCollector})
|
||||
gathers documents within those groups. If the search is costly to
|
||||
run you may want to use the {@link
|
||||
org.apache.lucene.search.grouping.CachingCollector} class, which
|
||||
caches hits and can (quickly) replay them for the second pass. This
|
||||
way you only run the query once, but you pay a RAM cost to (briefly)
|
||||
hold all hits. Results are returned as a {@link
|
||||
org.apache.lucene.search.grouping.TopGroups} instance.</p>
|
||||
|
||||
<p>Known limitations:</p>
|
||||
<ul>
|
||||
<li> The group field must be a single-valued indexed field.
|
||||
{@link org.apache.lucene.search.FieldCache} is used to load the {@link org.apache.lucene.search.FieldCache.DocTermsIndex} for this field.
|
||||
<li> Unlike Solr's implementation, this module cannot group by
|
||||
function query values nor by arbitrary queries.
|
||||
<li> Sharding is not directly supported, though is not too
|
||||
difficult, if you can merge the top groups and top documents per
|
||||
group yourself.
|
||||
</ul>
|
||||
|
||||
<p>Typical usage looks like this (using the {@link org.apache.lucene.search.grouping.CachingCollector}):</p>
|
||||
|
||||
<pre>
|
||||
FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("author", groupSort, groupOffset+topNGroups);
|
||||
|
||||
boolean cacheScores = true;
|
||||
double maxCacheRAMMB = 4.0;
|
||||
CachingCollector cachedCollector = new CachingCollector(c1, cacheScores, maxCacheRAMMB);
|
||||
s.search(new TermQuery(new Term("content", searchTerm)), cachedCollector);
|
||||
|
||||
Collection<SearchGroup> topGroups = c1.getTopGroups(groupOffset, fillFields);
|
||||
|
||||
if (topGroups == null) {
|
||||
// No groups matched
|
||||
return;
|
||||
}
|
||||
|
||||
boolean getScores = true;
|
||||
boolean getMaxScores = true;
|
||||
boolean fillFields = true;
|
||||
SecondPassGroupingCollector c2 = new SecondPassGroupingCollector("author", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
|
||||
|
||||
if (cachedCollector.isCached()) {
|
||||
// Cache fit within maxCacheRAMMB, so we can replay it:
|
||||
cachedCollector.replay(c2);
|
||||
} else {
|
||||
// Cache was too large; must re-execute query:
|
||||
s.search(new TermQuery(new Term("content", searchTerm)), c2);
|
||||
}
|
||||
|
||||
TopGroups groupsResult = c2.getTopGroups(docOffset);
|
||||
|
||||
// Render groupsResult...
|
||||
</pre>
|
||||
|
||||
</body>
|
||||
</html>
|
|
@ -1,105 +1,5 @@
|
|||
<html>
|
||||
<body>
|
||||
|
||||
<p>This module enables search result grouping with Lucene, where hits
|
||||
with the same value in the specified single-valued group field are
|
||||
grouped together. For example, if you group by the <tt>author</tt>
|
||||
field, then all documents with the same value in the <tt>author</tt>
|
||||
field fall into a single group.</p>
|
||||
|
||||
<p>Grouping requires a number of inputs:</p>
|
||||
|
||||
<ul>
|
||||
<li> <tt>groupField</tt>: this is the field used for grouping.
|
||||
For example, if you use the <tt>author</tt> field then each
|
||||
group has all books by the same author. Documents that don't
|
||||
have this field are grouped under a single group with
|
||||
a <tt>null</tt> group value.
|
||||
|
||||
<li> <tt>groupSort</tt>: how the groups are sorted. For sorting
|
||||
purposes, each group is "represented" by the highest-sorted
|
||||
document according to the <tt>groupSort</tt> within it. For
|
||||
example, if you specify "price" (ascending) then the first group
|
||||
is the one with the lowest price book within it. Or if you
|
||||
specify relevance group sort, then the first group is the one
|
||||
containing the highest scoring book.
|
||||
|
||||
<li> <tt>topNGroups</tt>: how many top groups to keep. For
|
||||
example, 10 means the top 10 groups are computed.
|
||||
|
||||
<li> <tt>groupOffset</tt>: which "slice" of top groups you want to
|
||||
retrieve. For example, 3 means you'll get 7 groups back
|
||||
(assuming <tt>topNGroups</tt> is 10). This is useful for
|
||||
paging, where you might show 5 groups per page.
|
||||
|
||||
<li> <tt>withinGroupSort</tt>: how the documents within each group
|
||||
are sorted. This can be different from the group sort.
|
||||
|
||||
<li> <tt>maxDocsPerGroup</tt>: how many top documents within each
|
||||
group to keep.
|
||||
|
||||
<li> <tt>withinGroupOffset</tt>: which "slice" of top
|
||||
documents you want to retrieve from each group.
|
||||
|
||||
</ul>
|
||||
|
||||
<p>The implementation is two-pass: the first pass ({@link
|
||||
org.apache.lucene.search.grouping.FirstPassGroupingCollector})
|
||||
gathers the top groups, and the second pass ({@link
|
||||
org.apache.lucene.search.grouping.SecondPassGroupingCollector})
|
||||
gathers documents within those groups. If the search is costly to
|
||||
run you may want to use the {@link
|
||||
org.apache.lucene.search.grouping.CachingCollector} class, which
|
||||
caches hits and can (quickly) replay them for the second pass. This
|
||||
way you only run the query once, but you pay a RAM cost to (briefly)
|
||||
hold all hits. Results are returned as a {@link
|
||||
org.apache.lucene.search.grouping.TopGroups} instance.</p>
|
||||
|
||||
<p>Known limitations:</p>
|
||||
<ul>
|
||||
<li> The group field must be a single-valued indexed field.
|
||||
{@link org.apache.lucene.search.FieldCache} is used to load the {@link org.apache.lucene.search.FieldCache.DocTermsIndex} for this field.
|
||||
<li> Unlike Solr's implementation, this module cannot group by
|
||||
function query values nor by arbitrary queries.
|
||||
<li> Sharding is not directly supported, though is not too
|
||||
difficult, if you can merge the top groups and top documents per
|
||||
group yourself.
|
||||
</ul>
|
||||
|
||||
<p>Typical usage looks like this (using the {@link org.apache.lucene.search.grouping.CachingCollector}):</p>
|
||||
|
||||
<pre>
|
||||
FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("author", groupSort, groupOffset+topNGroups);
|
||||
|
||||
boolean cacheScores = true;
|
||||
double maxCacheRAMMB = 4.0;
|
||||
CachingCollector cachedCollector = new CachingCollector(c1, cacheScores, maxCacheRAMMB);
|
||||
s.search(new TermQuery(new Term("content", searchTerm)), cachedCollector);
|
||||
|
||||
Collection<SearchGroup> topGroups = c1.getTopGroups(groupOffset, fillFields);
|
||||
|
||||
if (topGroups == null) {
|
||||
// No groups matched
|
||||
return;
|
||||
}
|
||||
|
||||
boolean getScores = true;
|
||||
boolean getMaxScores = true;
|
||||
boolean fillFields = true;
|
||||
SecondPassGroupingCollector c2 = new SecondPassGroupingCollector("author", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
|
||||
|
||||
if (cachedCollector.isCached()) {
|
||||
// Cache fit within maxCacheRAMMB, so we can replay it:
|
||||
cachedCollector.replay(c2);
|
||||
} else {
|
||||
// Cache was too large; must re-execute query:
|
||||
s.search(new TermQuery(new Term("content", searchTerm)), c2);
|
||||
}
|
||||
|
||||
TopGroups groupsResult = c2.getTopGroups(docOffset);
|
||||
|
||||
// Render groupsResult...
|
||||
</pre>
|
||||
|
||||
Lucene's grouping module
|
||||
</body>
|
||||
</html>
|
||||
|
|
Loading…
Reference in New Issue