mirror of https://github.com/apache/lucene.git
LUCENE-10484: Add support for concurrent facets random sampling (#765)
This commit adds a new createManager static method to RandomSamplingFacetsCollector that allows users to perform random sampling concurrently. The returned collector manager is very similar to the existing FacetsCollectorManager, but it exposes a specialized reduced RandomSamplingFacetsCollector. This relates to [LUCENE-10002](https://issues.apache.org/jira/browse/LUCENE-10002): it allows users to use a collector manager instead of a collector when doing random sampling, in an effort to reduce usages of IndexSearcher#search(Query, Collector).
This commit is contained in:
parent
e7f9f2c50d
commit
7ed0f3d7ad
|
@ -33,6 +33,9 @@ Improvements
|
|||
* LUCENE-10416: Update Korean Dictionary to mecab-ko-dic-2.1.1-20180720 for Nori.
|
||||
(Uihyun Kim)
|
||||
|
||||
* LUCENE-10484: Add support for concurrent random sampling by calling
|
||||
RandomSamplingFacetsCollector#createManager. (Luca Cavanna)
|
||||
|
||||
Optimizations
|
||||
---------------------
|
||||
(No changes)
|
||||
|
|
|
@ -18,10 +18,12 @@ package org.apache.lucene.facet;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.facet.FacetsConfig.DimConfig;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.CollectorManager;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.util.BitDocIdSet;
|
||||
|
@ -255,4 +257,40 @@ public class RandomSamplingFacetsCollector extends FacetsCollector {
|
|||
public double getSamplingRate() {
|
||||
return samplingRate;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link CollectorManager} for concurrent random sampling through {@link
|
||||
* RandomSamplingFacetsCollector}
|
||||
*/
|
||||
public static CollectorManager<RandomSamplingFacetsCollector, RandomSamplingFacetsCollector>
|
||||
createManager(int sampleSize, long seed) {
|
||||
return new CollectorManager<>() {
|
||||
@Override
|
||||
public RandomSamplingFacetsCollector newCollector() {
|
||||
return new RandomSamplingFacetsCollector(sampleSize, seed);
|
||||
}
|
||||
|
||||
@Override
|
||||
public RandomSamplingFacetsCollector reduce(
|
||||
Collection<RandomSamplingFacetsCollector> collectors) {
|
||||
if (collectors == null || collectors.size() == 0) {
|
||||
return new RandomSamplingFacetsCollector(sampleSize, seed);
|
||||
}
|
||||
if (collectors.size() == 1) {
|
||||
return collectors.iterator().next();
|
||||
}
|
||||
return new ReducedRandomSamplingFacetsCollector(sampleSize, seed, collectors);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private static class ReducedRandomSamplingFacetsCollector extends RandomSamplingFacetsCollector {
|
||||
ReducedRandomSamplingFacetsCollector(
|
||||
int sampleSize, long seed, Collection<RandomSamplingFacetsCollector> facetsCollectors) {
|
||||
super(sampleSize, seed);
|
||||
facetsCollectors.forEach(
|
||||
facetsCollector ->
|
||||
getOriginalMatchingDocs().addAll(facetsCollector.getOriginalMatchingDocs()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,9 +27,9 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
|||
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
|
||||
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.CollectorManager;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MultiCollector;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.tests.index.RandomIndexWriter;
|
||||
|
@ -74,11 +74,11 @@ public class TestRandomSamplingFacetsCollector extends FacetTestCase {
|
|||
IOUtils.close(writer, taxoWriter);
|
||||
|
||||
// Test empty results
|
||||
RandomSamplingFacetsCollector collectRandomZeroResults =
|
||||
new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());
|
||||
|
||||
CollectorManager<RandomSamplingFacetsCollector, RandomSamplingFacetsCollector> fcm =
|
||||
RandomSamplingFacetsCollector.createManager(numDocs / 10, random.nextLong());
|
||||
// There should be no divisions by zero
|
||||
searcher.search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);
|
||||
RandomSamplingFacetsCollector collectRandomZeroResults =
|
||||
searcher.search(new TermQuery(new Term("EvenOdd", "NeverMatches")), fcm);
|
||||
|
||||
// There should be no divisions by zero and no null result
|
||||
assertNotNull(collectRandomZeroResults.getMatchingDocs());
|
||||
|
@ -93,13 +93,9 @@ public class TestRandomSamplingFacetsCollector extends FacetTestCase {
|
|||
// Use a query to select half of the documents.
|
||||
TermQuery query = new TermQuery(new Term("EvenOdd", "even"));
|
||||
|
||||
RandomSamplingFacetsCollector random10Percent =
|
||||
new RandomSamplingFacetsCollector(
|
||||
numDocs / 10, random.nextLong()); // 10% of total docs, 20% of the hits
|
||||
|
||||
FacetsCollector fc = new FacetsCollector();
|
||||
|
||||
searcher.search(query, MultiCollector.wrap(fc, random10Percent));
|
||||
// 10% of total docs, 20% of the hits
|
||||
fcm = RandomSamplingFacetsCollector.createManager(numDocs / 10, random.nextLong());
|
||||
RandomSamplingFacetsCollector random10Percent = searcher.search(query, fcm);
|
||||
|
||||
final List<MatchingDocs> matchingDocs = random10Percent.getMatchingDocs();
|
||||
|
||||
|
|
Loading…
Reference in New Issue