mirror of https://github.com/apache/lucene.git
LUCENE-5015: Unexpected performance difference between SamplingAccumulator and StandardFacetAccumulator
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1487397 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a79ffdfeee
commit
d784679c84
|
@ -86,6 +86,10 @@ Changes in backwards compatibility policy
|
|||
DictionaryCompoundWordTokenFilter and HyphenationCompoundWordTokenFilter don't
|
||||
update offsets anymore. (Adrien Grand)
|
||||
|
||||
* LUCENE-5015: SamplingAccumulator no longer corrects the counts of the sampled
|
||||
categories. You should set TakmiSampleFixer on SamplingParams if required (but
|
||||
notice that this means slower search). (Rob Audenaerde, Gilad Barkai, Shai Erera)
|
||||
|
||||
Bug Fixes
|
||||
|
||||
* LUCENE-4997: Internal test framework's tests are sensitive to previous
|
||||
|
|
|
@ -3,6 +3,7 @@ package org.apache.lucene.facet.sampling;
|
|||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.facet.search.FacetResult;
|
||||
import org.apache.lucene.facet.search.FacetResultNode;
|
||||
import org.apache.lucene.facet.search.ScoredDocIDs;
|
||||
|
||||
/*
|
||||
|
@ -23,22 +24,50 @@ import org.apache.lucene.facet.search.ScoredDocIDs;
|
|||
*/
|
||||
|
||||
/**
|
||||
* Fixer of sample facet accumulation results
|
||||
* Fixer of sample facet accumulation results.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public interface SampleFixer {
|
||||
public abstract class SampleFixer {
|
||||
|
||||
/**
|
||||
* Alter the input result, fixing it to account for the sampling. This
|
||||
* implementation can compute accurate or estimated counts for the sampled facets.
|
||||
* For example, a faster correction could just multiply by a compensating factor.
|
||||
* implementation can compute accurate or estimated counts for the sampled
|
||||
* facets. For example, a faster correction could just multiply by a
|
||||
* compensating factor.
|
||||
*
|
||||
* @param origDocIds
|
||||
* full set of matching documents.
|
||||
* @param fres
|
||||
* sample result to be fixed.
|
||||
* @throws IOException If there is a low-level I/O error.
|
||||
* @throws IOException
|
||||
* If there is a low-level I/O error.
|
||||
*/
|
||||
public void fixResult(ScoredDocIDs origDocIds, FacetResult fres) throws IOException;
|
||||
public void fixResult(ScoredDocIDs origDocIds, FacetResult fres, double samplingRatio) throws IOException {
|
||||
FacetResultNode topRes = fres.getFacetResultNode();
|
||||
fixResultNode(topRes, origDocIds, samplingRatio);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fix result node count, and, recursively, fix all its children
|
||||
*
|
||||
* @param facetResNode
|
||||
* result node to be fixed
|
||||
* @param docIds
|
||||
* docids in effect
|
||||
* @throws IOException
|
||||
* If there is a low-level I/O error.
|
||||
*/
|
||||
protected void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio)
|
||||
throws IOException {
|
||||
singleNodeFix(facetResNode, docIds, samplingRatio);
|
||||
for (FacetResultNode frn : facetResNode.subResults) {
|
||||
fixResultNode(frn, docIds, samplingRatio);
|
||||
}
|
||||
}
|
||||
|
||||
/** Fix the given node's value. */
|
||||
protected abstract void singleNodeFix(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio)
|
||||
throws IOException;
|
||||
|
||||
}
|
|
@ -12,7 +12,6 @@ import org.apache.lucene.facet.search.FacetResult;
|
|||
import org.apache.lucene.facet.search.FacetResultNode;
|
||||
import org.apache.lucene.facet.search.ScoredDocIDs;
|
||||
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -110,16 +109,6 @@ public abstract class Sampler {
|
|||
protected abstract SampleResult createSample(ScoredDocIDs docids, int actualSize, int sampleSetSize)
|
||||
throws IOException;
|
||||
|
||||
/**
|
||||
* Get a fixer of sample facet accumulation results. Default implementation
|
||||
* returns a <code>TakmiSampleFixer</code> which is adequate only for
|
||||
* counting. For any other accumulator, provide a different fixer.
|
||||
*/
|
||||
public SampleFixer getSampleFixer(IndexReader indexReader, TaxonomyReader taxonomyReader,
|
||||
FacetSearchParams searchParams) {
|
||||
return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams);
|
||||
}
|
||||
|
||||
/**
|
||||
* Result of sample computation
|
||||
*/
|
||||
|
|
|
@ -79,7 +79,11 @@ public class SamplingAccumulator extends StandardFacetsAccumulator {
|
|||
public List<FacetResult> accumulate(ScoredDocIDs docids) throws IOException {
|
||||
// Replacing the original searchParams with the over-sampled
|
||||
FacetSearchParams original = searchParams;
|
||||
SampleFixer samplerFixer = sampler.samplingParams.getSampleFixer();
|
||||
final boolean shouldOversample = sampler.samplingParams.shouldOverSample();
|
||||
if (shouldOversample) {
|
||||
searchParams = sampler.overSampledSearchParams(original);
|
||||
}
|
||||
|
||||
List<FacetResult> sampleRes = super.accumulate(docids);
|
||||
|
||||
|
@ -87,13 +91,17 @@ public class SamplingAccumulator extends StandardFacetsAccumulator {
|
|||
for (FacetResult fres : sampleRes) {
|
||||
// for sure fres is not null because this is guaranteed by the delegee.
|
||||
PartitionsFacetResultsHandler frh = createFacetResultsHandler(fres.getFacetRequest());
|
||||
if (samplerFixer != null) {
|
||||
// fix the result of current request
|
||||
sampler.getSampleFixer(indexReader, taxonomyReader, searchParams).fixResult(docids, fres);
|
||||
samplerFixer.fixResult(docids, fres, samplingRatio);
|
||||
|
||||
fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any arranging it needs to
|
||||
|
||||
if (shouldOversample) {
|
||||
// Using the sampler to trim the extra (over-sampled) results
|
||||
fres = sampler.trimResult(fres);
|
||||
}
|
||||
}
|
||||
|
||||
// final labeling if allowed (because labeling is a costly operation)
|
||||
frh.labelResult(fres);
|
||||
|
|
|
@ -28,7 +28,7 @@ public class SamplingParams {
|
|||
* Default factor by which more results are requested over the sample set.
|
||||
* @see SamplingParams#getOversampleFactor()
|
||||
*/
|
||||
public static final double DEFAULT_OVERSAMPLE_FACTOR = 2d;
|
||||
public static final double DEFAULT_OVERSAMPLE_FACTOR = 1d;
|
||||
|
||||
/**
|
||||
* Default ratio between size of sample to original size of document set.
|
||||
|
@ -60,6 +60,8 @@ public class SamplingParams {
|
|||
private int samplingThreshold = DEFAULT_SAMPLING_THRESHOLD;
|
||||
private double oversampleFactor = DEFAULT_OVERSAMPLE_FACTOR;
|
||||
|
||||
private SampleFixer sampleFixer = null;
|
||||
|
||||
/**
|
||||
* Return the maxSampleSize.
|
||||
* In no case should the resulting sample size exceed this value.
|
||||
|
@ -166,4 +168,29 @@ public class SamplingParams {
|
|||
this.oversampleFactor = oversampleFactor;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return {@link SampleFixer} to be used while fixing the sampled results, if
|
||||
* <code>null</code> no fixing will be performed
|
||||
*/
|
||||
public SampleFixer getSampleFixer() {
|
||||
return sampleFixer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set a {@link SampleFixer} to be used while fixing the sampled results.
|
||||
* {@code null} means no fixing will be performed
|
||||
*/
|
||||
public void setSampleFixer(SampleFixer sampleFixer) {
|
||||
this.sampleFixer = sampleFixer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether over-sampling should be done. By default returns
|
||||
* {@code true} when {@link #getSampleFixer()} is not {@code null} and
|
||||
* {@link #getOversampleFactor()} > 1, {@code false} otherwise.
|
||||
*/
|
||||
public boolean shouldOverSample() {
|
||||
return sampleFixer != null && oversampleFactor > 1d;
|
||||
}
|
||||
|
||||
}
|
|
@ -52,29 +52,41 @@ public class SamplingWrapper extends StandardFacetsAccumulator {
|
|||
public List<FacetResult> accumulate(ScoredDocIDs docids) throws IOException {
|
||||
// Replacing the original searchParams with the over-sampled (and without statistics-compute)
|
||||
FacetSearchParams original = delegee.searchParams;
|
||||
boolean shouldOversample = sampler.samplingParams.shouldOverSample();
|
||||
|
||||
if (shouldOversample) {
|
||||
delegee.searchParams = sampler.overSampledSearchParams(original);
|
||||
}
|
||||
|
||||
SampleResult sampleSet = sampler.getSampleSet(docids);
|
||||
|
||||
List<FacetResult> sampleRes = delegee.accumulate(sampleSet.docids);
|
||||
|
||||
List<FacetResult> fixedRes = new ArrayList<FacetResult>();
|
||||
SampleFixer sampleFixer = sampler.samplingParams.getSampleFixer();
|
||||
|
||||
for (FacetResult fres : sampleRes) {
|
||||
// for sure fres is not null because this is guaranteed by the delegee.
|
||||
PartitionsFacetResultsHandler frh = createFacetResultsHandler(fres.getFacetRequest());
|
||||
if (sampleFixer != null) {
|
||||
// fix the result of current request
|
||||
sampler.getSampleFixer(indexReader, taxonomyReader, searchParams).fixResult(docids, fres);
|
||||
sampleFixer.fixResult(docids, fres, sampleSet.actualSampleRatio);
|
||||
fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any
|
||||
}
|
||||
|
||||
if (shouldOversample) {
|
||||
// Using the sampler to trim the extra (over-sampled) results
|
||||
fres = sampler.trimResult(fres);
|
||||
}
|
||||
|
||||
// final labeling if allowed (because labeling is a costly operation)
|
||||
frh.labelResult(fres);
|
||||
fixedRes.add(fres); // add to final results
|
||||
}
|
||||
|
||||
if (shouldOversample) {
|
||||
delegee.searchParams = original; // Back to original params
|
||||
}
|
||||
|
||||
return fixedRes;
|
||||
}
|
||||
|
|
|
@ -2,21 +2,19 @@ package org.apache.lucene.facet.sampling;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.util.Bits;
|
||||
|
||||
import org.apache.lucene.facet.params.FacetSearchParams;
|
||||
import org.apache.lucene.facet.search.DrillDownQuery;
|
||||
import org.apache.lucene.facet.search.FacetResult;
|
||||
import org.apache.lucene.facet.search.FacetResultNode;
|
||||
import org.apache.lucene.facet.search.ScoredDocIDs;
|
||||
import org.apache.lucene.facet.search.ScoredDocIDsIterator;
|
||||
import org.apache.lucene.facet.taxonomy.CategoryPath;
|
||||
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.util.Bits;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -36,16 +34,21 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
|||
*/
|
||||
|
||||
/**
|
||||
* Fix sampling results by counting the intersection between two lists: a
|
||||
* TermDocs (list of documents in a certain category) and a DocIdSetIterator
|
||||
* (list of documents matching the query).
|
||||
*
|
||||
* Fix sampling results by correct results, by counting the intersection between
|
||||
* two lists: a TermDocs (list of documents in a certain category) and a
|
||||
* DocIdSetIterator (list of documents matching the query).
|
||||
* <p>
|
||||
* This fixer is suitable for scenarios which prioritize accuracy over
|
||||
* performance.
|
||||
* <p>
|
||||
* <b>Note:</b> for statistically more accurate top-k selection, set
|
||||
* {@link SamplingParams#setOversampleFactor(double) oversampleFactor} to at
|
||||
* least 2, so that the top-k categories would have better chance of showing up
|
||||
* in the sampled top-cK results (see {@link SamplingParams#getOversampleFactor}
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
// TODO (Facet): implement also an estimated fixing by ratio (taking into
|
||||
// account "translation" of counts!)
|
||||
class TakmiSampleFixer implements SampleFixer {
|
||||
public class TakmiSampleFixer extends SampleFixer {
|
||||
|
||||
private TaxonomyReader taxonomyReader;
|
||||
private IndexReader indexReader;
|
||||
|
@ -59,26 +62,8 @@ class TakmiSampleFixer implements SampleFixer {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void fixResult(ScoredDocIDs origDocIds, FacetResult fres)
|
||||
throws IOException {
|
||||
FacetResultNode topRes = fres.getFacetResultNode();
|
||||
fixResultNode(topRes, origDocIds);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fix result node count, and, recursively, fix all its children
|
||||
*
|
||||
* @param facetResNode
|
||||
* result node to be fixed
|
||||
* @param docIds
|
||||
* docids in effect
|
||||
* @throws IOException If there is a low-level I/O error.
|
||||
*/
|
||||
private void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds) throws IOException {
|
||||
public void singleNodeFix(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio) throws IOException {
|
||||
recount(facetResNode, docIds);
|
||||
for (FacetResultNode frn : facetResNode.subResults) {
|
||||
fixResultNode(frn, docIds);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -179,4 +164,5 @@ class TakmiSampleFixer implements SampleFixer {
|
|||
}
|
||||
return false; // exhausted
|
||||
}
|
||||
|
||||
}
|
|
@ -94,7 +94,7 @@ public class StandardFacetsAccumulator extends FacetsAccumulator {
|
|||
|
||||
private Object accumulateGuard;
|
||||
|
||||
private double complementThreshold;
|
||||
private double complementThreshold = DEFAULT_COMPLEMENT_THRESHOLD;
|
||||
|
||||
public StandardFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader,
|
||||
TaxonomyReader taxonomyReader) {
|
||||
|
|
|
@ -94,7 +94,7 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
|
|||
for (int nTrial = 0; nTrial < RETRIES; nTrial++) {
|
||||
try {
|
||||
// complement with sampling!
|
||||
final Sampler sampler = createSampler(nTrial, useRandomSampler);
|
||||
final Sampler sampler = createSampler(nTrial, useRandomSampler, samplingSearchParams);
|
||||
|
||||
assertSampling(expectedResults, q, sampler, samplingSearchParams, false);
|
||||
assertSampling(expectedResults, q, sampler, samplingSearchParams, true);
|
||||
|
@ -128,14 +128,20 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
|
|||
return FacetsCollector.create(sfa);
|
||||
}
|
||||
|
||||
private Sampler createSampler(int nTrial, boolean useRandomSampler) {
|
||||
private Sampler createSampler(int nTrial, boolean useRandomSampler, FacetSearchParams sParams) {
|
||||
SamplingParams samplingParams = new SamplingParams();
|
||||
|
||||
/*
|
||||
* Set sampling to Exact fixing with TakmiSampleFixer as it is not easy to
|
||||
* validate results with amortized results.
|
||||
*/
|
||||
samplingParams.setSampleFixer(new TakmiSampleFixer(indexReader, taxoReader, sParams));
|
||||
|
||||
final double retryFactor = Math.pow(1.01, nTrial);
|
||||
samplingParams.setOversampleFactor(5.0 * retryFactor); // Oversampling
|
||||
samplingParams.setSampleRatio(0.8 * retryFactor);
|
||||
samplingParams.setMinSampleSize((int) (100 * retryFactor));
|
||||
samplingParams.setMaxSampleSize((int) (10000 * retryFactor));
|
||||
samplingParams.setOversampleFactor(5.0 * retryFactor);
|
||||
samplingParams.setSamplingThreshold(11000); //force sampling
|
||||
|
||||
Sampler sampler = useRandomSampler ?
|
||||
|
|
|
@ -0,0 +1,111 @@
|
|||
package org.apache.lucene.facet.sampling;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.facet.FacetTestBase;
|
||||
import org.apache.lucene.facet.params.FacetIndexingParams;
|
||||
import org.apache.lucene.facet.params.FacetSearchParams;
|
||||
import org.apache.lucene.facet.search.CountFacetRequest;
|
||||
import org.apache.lucene.facet.search.FacetResultNode;
|
||||
import org.apache.lucene.facet.search.FacetsCollector;
|
||||
import org.apache.lucene.facet.search.StandardFacetsAccumulator;
|
||||
import org.apache.lucene.facet.taxonomy.CategoryPath;
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
public class SamplerTest extends FacetTestBase {
|
||||
|
||||
private FacetIndexingParams fip;
|
||||
|
||||
@Override
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
fip = getFacetIndexingParams(Integer.MAX_VALUE);
|
||||
initIndex(fip);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int numDocsToIndex() {
|
||||
return 100;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<CategoryPath> getCategories(final int doc) {
|
||||
return new ArrayList<CategoryPath>() {
|
||||
{
|
||||
add(new CategoryPath("root", "a", Integer.toString(doc % 10)));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getContent(int doc) {
|
||||
return "";
|
||||
}
|
||||
|
||||
@Override
|
||||
@After
|
||||
public void tearDown() throws Exception {
|
||||
closeAll();
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
public void testDefaultFixer() throws Exception {
|
||||
RandomSampler randomSampler = new RandomSampler();
|
||||
SampleFixer fixer = randomSampler.samplingParams.getSampleFixer();
|
||||
assertEquals(null, fixer);
|
||||
}
|
||||
|
||||
public void testCustomFixer() throws Exception {
|
||||
SamplingParams sp = new SamplingParams();
|
||||
sp.setSampleFixer(new TakmiSampleFixer(null, null, null));
|
||||
assertEquals(TakmiSampleFixer.class, sp.getSampleFixer().getClass());
|
||||
}
|
||||
|
||||
public void testNoFixing() throws Exception {
|
||||
SamplingParams sp = new SamplingParams();
|
||||
sp.setMaxSampleSize(10);
|
||||
sp.setMinSampleSize(5);
|
||||
sp.setSampleRatio(0.01d);
|
||||
sp.setSamplingThreshold(50);
|
||||
sp.setOversampleFactor(5d);
|
||||
|
||||
assertNull("Fixer should be null as the test is for no-fixing",
|
||||
sp.getSampleFixer());
|
||||
FacetSearchParams fsp = new FacetSearchParams(fip, new CountFacetRequest(
|
||||
new CategoryPath("root", "a"), 1));
|
||||
SamplingAccumulator accumulator = new SamplingAccumulator(
|
||||
new RandomSampler(sp, random()), fsp, indexReader, taxoReader);
|
||||
|
||||
// Make sure no complements are in action
|
||||
accumulator
|
||||
.setComplementThreshold(StandardFacetsAccumulator.DISABLE_COMPLEMENT);
|
||||
|
||||
FacetsCollector fc = FacetsCollector.create(accumulator);
|
||||
|
||||
searcher.search(new MatchAllDocsQuery(), fc);
|
||||
FacetResultNode node = fc.getFacetResults().get(0).getFacetResultNode();
|
||||
|
||||
assertTrue(node.value < numDocsToIndex());
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue