diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 04b52168971..a9aa96824d3 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -412,7 +412,7 @@ New features
   bytes in RAM. (Mike McCandless)
 
 * LUCENE-1421, LUCENE-3102: added CachingCollector which allow you to cache
-  document IDs and scores encountered during the search, and "reply" them to
+  document IDs and scores encountered during the search, and "replay" them to
   another Collector. (Mike McCandless, Shai Erera)
 
 Optimizations
diff --git a/lucene/src/java/org/apache/lucene/search/CachingCollector.java b/lucene/src/java/org/apache/lucene/search/CachingCollector.java
index 18ea7f95787..9aa53ce79e7 100644
--- a/lucene/src/java/org/apache/lucene/search/CachingCollector.java
+++ b/lucene/src/java/org/apache/lucene/search/CachingCollector.java
@@ -308,6 +308,48 @@ public abstract class CachingCollector extends Collector {
   protected int base;
   protected int lastDocBase;
 
+  /**
+   * Creates a {@link CachingCollector} which does not wrap another collector.
+   * The cached documents and scores can later be {@link #replay(Collector)
+   * replayed}.
+   *
+   * @param acceptDocsOutOfOrder
+   *          whether documents are allowed to be collected out-of-order
+   */
+  public static CachingCollector create(final boolean acceptDocsOutOfOrder, boolean cacheScores, double maxRAMMB) {
+    Collector other = new Collector() {
+      @Override
+      public boolean acceptsDocsOutOfOrder() {
+        return acceptDocsOutOfOrder;
+      }
+
+      @Override
+      public void setScorer(Scorer scorer) throws IOException {}
+
+      @Override
+      public void collect(int doc) throws IOException {}
+
+      @Override
+      public void setNextReader(AtomicReaderContext context) throws IOException {}
+
+    };
+    return create(other, cacheScores, maxRAMMB);
+  }
+
+  /**
+   * Create a new {@link CachingCollector} that wraps the given collector and
+   * caches documents and scores up to the specified RAM threshold.
+   *
+   * @param other
+   *          the Collector to wrap and delegate calls to.
+   * @param cacheScores
+   *          whether to cache scores in addition to document IDs. Note that
+   *          this increases the RAM consumed per doc
+   * @param maxRAMMB
+   *          the maximum RAM in MB to consume for caching the documents and
+   *          scores. If the collector exceeds the threshold, no documents and
+   *          scores are cached.
+   */
   public static CachingCollector create(Collector other, boolean cacheScores, double maxRAMMB) {
     return cacheScores ? new ScoreCachingCollector(other, maxRAMMB) : new NoScoreCachingCollector(other, maxRAMMB);
   }
diff --git a/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java b/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java
index bb30d540570..e812f25d207 100755
--- a/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java
+++ b/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java
@@ -171,5 +171,18 @@ public class TestCachingCollector extends LuceneTestCase {
       assertFalse(cc.isCached());
     }
   }
+
+  public void testNoWrappedCollector() throws Exception {
+    for (boolean cacheScores : new boolean[] { false, true }) {
+      // create w/ null wrapped collector, and test that the methods work
+      CachingCollector cc = CachingCollector.create(true, cacheScores, 50 * ONE_BYTE);
+      cc.setNextReader(null);
+      cc.setScorer(new MockScorer());
+      cc.collect(0);
+
+      assertTrue(cc.isCached());
+      cc.replay(new NoOpCollector(true));
+    }
+  }
 
 }
diff --git a/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java b/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
index 92f9102c656..a2eaa57b78f 100644
--- a/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
+++ b/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
@@ -445,27 +445,54 @@ public class TestGrouping extends LuceneTestCase {
       final FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("group", groupSort, groupOffset+topNGroups);
       final CachingCollector cCache;
       final Collector c;
+
+      final boolean useWrappingCollector = random.nextBoolean();
+
       if (doCache) {
         final double maxCacheMB = random.nextDouble();
         if (VERBOSE) {
           System.out.println("TEST: maxCacheMB=" + maxCacheMB);
         }
 
-        if (doAllGroups) {
-          cCache = CachingCollector.create(c1, true, maxCacheMB);
-          c = MultiCollector.wrap(cCache, allGroupsCollector);
+        if (useWrappingCollector) {
+          if (doAllGroups) {
+            cCache = CachingCollector.create(c1, true, maxCacheMB);
+            c = MultiCollector.wrap(cCache, allGroupsCollector);
+          } else {
+            c = cCache = CachingCollector.create(c1, true, maxCacheMB);
+          }
         } else {
-          c = cCache = CachingCollector.create(c1, true, maxCacheMB);
+          // Collect only into cache, then replay multiple times:
+          c = cCache = CachingCollector.create(false, true, maxCacheMB);
         }
-      } else if (doAllGroups) {
-        c = MultiCollector.wrap(c1, allGroupsCollector);
-        cCache = null;
       } else {
-        c = c1;
         cCache = null;
+        if (doAllGroups) {
+          c = MultiCollector.wrap(c1, allGroupsCollector);
+        } else {
+          c = c1;
+        }
       }
+
       s.search(new TermQuery(new Term("content", searchTerm)), c);
+
+      if (doCache && !useWrappingCollector) {
+        if (cCache.isCached()) {
+          // Replay for first-pass grouping
+          cCache.replay(c1);
+          if (doAllGroups) {
+            // Replay for all groups:
+            cCache.replay(allGroupsCollector);
+          }
+        } else {
+          // Replay by re-running search:
+          s.search(new TermQuery(new Term("content", searchTerm)), c1);
+          if (doAllGroups) {
+            s.search(new TermQuery(new Term("content", searchTerm)), allGroupsCollector);
+          }
+        }
+      }
+
       final Collection<SearchGroup> topGroups = c1.getTopGroups(groupOffset, fillFields);
       final TopGroups groupsResult;
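
Usage sketch (not part of the patch): the new create(boolean, boolean, double) factory lets a search be cached once and then replayed into several collectors, mirroring the TestGrouping change above. The searcher, query, and downstream collector names below are placeholders; the fallback branch covers the case where the cache exceeded maxRAMMB and was discarded:

    // Cache doc IDs and scores during the initial search, without wrapping
    // another collector; accept out-of-order docs, budget 16 MB for the cache.
    CachingCollector cache = CachingCollector.create(true, true, 16.0);
    searcher.search(query, cache);

    if (cache.isCached()) {
      // Replay the cached doc IDs and scores into each consumer:
      cache.replay(firstPassCollector);
      cache.replay(allGroupsCollector);
    } else {
      // Cache overflowed maxRAMMB and was dropped; re-run the search instead:
      searcher.search(query, firstPassCollector);
      searcher.search(query, allGroupsCollector);
    }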