LUCENE-10484: Add support for concurrent facets random sampling (#765)

This commit adds a new createManager static method to RandomSamplingFacetsCollector that allows users to perform random sampling concurrently. The returned collector manager is very similar to the existing FacetsCollectorManager but it exposes a specialized reduced RandomSamplingFacetsCollector.

This relates to [LUCENE-10002](https://issues.apache.org/jira/browse/LUCENE-10002). It allows users to use a collector manager instead of a collector when doing random sampling, in an effort to reduce usages of IndexSearcher#search(Query, Collector).
This commit is contained in:
Luca Cavanna 2022-04-05 08:51:57 +02:00 committed by GitHub
parent e7f9f2c50d
commit 7ed0f3d7ad
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 49 additions and 12 deletions

View File

@ -33,6 +33,9 @@ Improvements
* LUCENE-10416: Update Korean Dictionary to mecab-ko-dic-2.1.1-20180720 for Nori.
(Uihyun Kim)
* LUCENE-10484: Add support for concurrent random sampling via
RandomSamplingFacetsCollector#createManager. (Luca Cavanna)
Optimizations
---------------------
(No changes)

View File

@ -18,10 +18,12 @@ package org.apache.lucene.facet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.lucene.facet.FacetsConfig.DimConfig;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.BitDocIdSet;
@ -255,4 +257,40 @@ public class RandomSamplingFacetsCollector extends FacetsCollector {
public double getSamplingRate() {
  // Plain accessor: hands back the samplingRate field unchanged.
  return this.samplingRate;
}
/**
 * Builds a {@link CollectorManager} for concurrent random sampling through {@link
 * RandomSamplingFacetsCollector}.
 *
 * <p>Every collector the manager hands out is constructed with the same {@code sampleSize} and
 * {@code seed}. On reduce, a {@code null} or empty input yields a fresh collector, a single
 * collector is returned as-is, and two or more collectors are combined into one reduced
 * collector.
 *
 * @param sampleSize sample size passed to each {@link RandomSamplingFacetsCollector} created
 * @param seed seed passed to each {@link RandomSamplingFacetsCollector} created
 * @return a manager that both creates and reduces to {@link RandomSamplingFacetsCollector}
 */
public static CollectorManager<RandomSamplingFacetsCollector, RandomSamplingFacetsCollector>
    createManager(int sampleSize, long seed) {
  return new CollectorManager<>() {
    @Override
    public RandomSamplingFacetsCollector newCollector() {
      return new RandomSamplingFacetsCollector(sampleSize, seed);
    }

    @Override
    public RandomSamplingFacetsCollector reduce(
        Collection<RandomSamplingFacetsCollector> collectors) {
      // Treat a missing collection exactly like an empty one.
      final int count = (collectors == null) ? 0 : collectors.size();
      if (count > 1) {
        // Several collectors ran: merge them into one reduced collector.
        return new ReducedRandomSamplingFacetsCollector(sampleSize, seed, collectors);
      }
      // Nothing to merge: reuse the only collector, or hand back a fresh one if none exists.
      return (count == 1) ? collectors.iterator().next() : newCollector();
    }
  };
}
/**
 * A {@link RandomSamplingFacetsCollector} whose original matching docs are the concatenation of
 * the original matching docs of several other collectors.
 */
private static class ReducedRandomSamplingFacetsCollector extends RandomSamplingFacetsCollector {

  /**
   * Creates the reduced collector and copies over the original matching docs of every given
   * collector, in the iteration order of {@code facetsCollectors}.
   *
   * @param sampleSize sample size forwarded to the superclass constructor
   * @param seed seed forwarded to the superclass constructor
   * @param facetsCollectors collectors whose original matching docs are appended to this one
   */
  ReducedRandomSamplingFacetsCollector(
      int sampleSize, long seed, Collection<RandomSamplingFacetsCollector> facetsCollectors) {
    super(sampleSize, seed);
    for (RandomSamplingFacetsCollector partial : facetsCollectors) {
      getOriginalMatchingDocs().addAll(partial.getOriginalMatchingDocs());
    }
  }
}
}

View File

@ -27,9 +27,9 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
@ -74,11 +74,11 @@ public class TestRandomSamplingFacetsCollector extends FacetTestCase {
IOUtils.close(writer, taxoWriter);
// Test empty results
RandomSamplingFacetsCollector collectRandomZeroResults =
new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());
CollectorManager<RandomSamplingFacetsCollector, RandomSamplingFacetsCollector> fcm =
RandomSamplingFacetsCollector.createManager(numDocs / 10, random.nextLong());
// There should be no divisions by zero
searcher.search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);
RandomSamplingFacetsCollector collectRandomZeroResults =
searcher.search(new TermQuery(new Term("EvenOdd", "NeverMatches")), fcm);
// There should be no divisions by zero and no null result
assertNotNull(collectRandomZeroResults.getMatchingDocs());
@ -93,13 +93,9 @@ public class TestRandomSamplingFacetsCollector extends FacetTestCase {
// Use a query to select half of the documents.
TermQuery query = new TermQuery(new Term("EvenOdd", "even"));
RandomSamplingFacetsCollector random10Percent =
new RandomSamplingFacetsCollector(
numDocs / 10, random.nextLong()); // 10% of total docs, 20% of the hits
FacetsCollector fc = new FacetsCollector();
searcher.search(query, MultiCollector.wrap(fc, random10Percent));
// 10% of total docs, 20% of the hits
fcm = RandomSamplingFacetsCollector.createManager(numDocs / 10, random.nextLong());
RandomSamplingFacetsCollector random10Percent = searcher.search(query, fcm);
final List<MatchingDocs> matchingDocs = random10Percent.getMatchingDocs();