LUCENE-5015: Unexpected performance difference between SamplingAccumulator and StandardFacetAccumulator

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1487397 13f79535-47bb-0310-9956-ffa450edef68
Shai Erera 2013-05-29 08:24:03 +00:00
parent a79ffdfeee
commit d784679c84
10 changed files with 245 additions and 73 deletions

CHANGES.txt

@@ -86,6 +86,10 @@ Changes in backwards compatibility policy
DictionaryCompoundWordTokenFilter and HyphenationCompoundWordTokenFilter don't
update offsets anymore. (Adrien Grand)
+* LUCENE-5015: SamplingAccumulator no longer corrects the counts of the sampled
+  categories. You should set TakmiSampleFixer on SamplingParams if required (but
+  notice that this means slower search). (Rob Audenaerde, Gilad Barkai, Shai Erera)
Bug Fixes
* LUCENE-4997: Internal test framework's tests are sensitive to previous
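As the entry above notes, exact count fixing is now opt-in. A minimal sketch of restoring the pre-change behavior, using only constructors that appear in this commit (the indexReader, taxoReader, and searchParams variables are assumed to exist in the caller, as is a java.util.Random):

SamplingParams params = new SamplingParams();
// Opt back in to exact (but slower) count fixing:
params.setSampleFixer(new TakmiSampleFixer(indexReader, taxoReader, searchParams));
Sampler sampler = new RandomSampler(params, new Random());
StandardFacetsAccumulator sfa = new SamplingAccumulator(sampler, searchParams, indexReader, taxoReader);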

SampleFixer.java

@@ -3,6 +3,7 @@ package org.apache.lucene.facet.sampling;
import java.io.IOException;
import org.apache.lucene.facet.search.FacetResult;
+import org.apache.lucene.facet.search.FacetResultNode;
import org.apache.lucene.facet.search.ScoredDocIDs;
/*
@@ -23,22 +24,50 @@ import org.apache.lucene.facet.search.ScoredDocIDs;
*/
/**
- * Fixer of sample facet accumulation results
+ * Fixer of sample facet accumulation results.
*
* @lucene.experimental
*/
-public interface SampleFixer {
+public abstract class SampleFixer {
/**
* Alter the input result, fixing it to account for the sampling. This
- * implementation can compute accurate or estimated counts for the sampled facets.
- * For example, a faster correction could just multiply by a compensating factor.
+ * implementation can compute accurate or estimated counts for the sampled
+ * facets. For example, a faster correction could just multiply by a
+ * compensating factor.
*
* @param origDocIds
* full set of matching documents.
* @param fres
* sample result to be fixed.
- * @throws IOException If there is a low-level I/O error.
+ * @throws IOException
+ *           If there is a low-level I/O error.
*/
-public void fixResult(ScoredDocIDs origDocIds, FacetResult fres) throws IOException;
+public void fixResult(ScoredDocIDs origDocIds, FacetResult fres, double samplingRatio) throws IOException {
+  FacetResultNode topRes = fres.getFacetResultNode();
+  fixResultNode(topRes, origDocIds, samplingRatio);
+}
+/**
+ * Fix result node count, and, recursively, fix all its children
+ *
+ * @param facetResNode
+ *          result node to be fixed
+ * @param docIds
+ *          docids in effect
+ * @throws IOException
+ *           If there is a low-level I/O error.
+ */
+protected void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio)
+    throws IOException {
+  singleNodeFix(facetResNode, docIds, samplingRatio);
+  for (FacetResultNode frn : facetResNode.subResults) {
+    fixResultNode(frn, docIds, samplingRatio);
+  }
+}
+
+/** Fix the given node's value. */
+protected abstract void singleNodeFix(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio)
+    throws IOException;
}
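The refactored class makes the "multiply by a compensating factor" correction mentioned in the javadoc a one-method subclass. A hypothetical sketch, not a class added by this commit (ScalingSampleFixer is an invented name):

import java.io.IOException;

import org.apache.lucene.facet.sampling.SampleFixer;
import org.apache.lucene.facet.search.FacetResultNode;
import org.apache.lucene.facet.search.ScoredDocIDs;

/** Illustrative only: scales each sampled count up by 1/samplingRatio. */
public class ScalingSampleFixer extends SampleFixer {
  @Override
  protected void singleNodeFix(FacetResultNode facetResNode, ScoredDocIDs docIds,
      double samplingRatio) throws IOException {
    // A sampled count covers roughly samplingRatio of the matching documents,
    // so dividing by the ratio estimates the count over the full document set.
    facetResNode.value /= samplingRatio;
  }
}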

Sampler.java

@@ -12,7 +12,6 @@ import org.apache.lucene.facet.search.FacetResult;
import org.apache.lucene.facet.search.FacetResultNode;
import org.apache.lucene.facet.search.ScoredDocIDs;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
-import org.apache.lucene.index.IndexReader;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -110,16 +109,6 @@ public abstract class Sampler {
protected abstract SampleResult createSample(ScoredDocIDs docids, int actualSize, int sampleSetSize)
throws IOException;
-/**
- * Get a fixer of sample facet accumulation results. Default implementation
- * returns a <code>TakmiSampleFixer</code> which is adequate only for
- * counting. For any other accumulator, provide a different fixer.
- */
-public SampleFixer getSampleFixer(IndexReader indexReader, TaxonomyReader taxonomyReader,
-    FacetSearchParams searchParams) {
-  return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams);
-}
/**
* Result of sample computation
*/

SamplingAccumulator.java

@@ -79,7 +79,11 @@ public class SamplingAccumulator extends StandardFacetsAccumulator {
public List<FacetResult> accumulate(ScoredDocIDs docids) throws IOException {
// Replacing the original searchParams with the over-sampled
FacetSearchParams original = searchParams;
-    searchParams = sampler.overSampledSearchParams(original);
+    SampleFixer samplerFixer = sampler.samplingParams.getSampleFixer();
+    final boolean shouldOversample = sampler.samplingParams.shouldOverSample();
+    if (shouldOversample) {
+      searchParams = sampler.overSampledSearchParams(original);
+    }
List<FacetResult> sampleRes = super.accumulate(docids);
@@ -87,14 +91,18 @@ public class SamplingAccumulator extends StandardFacetsAccumulator {
for (FacetResult fres : sampleRes) {
// for sure fres is not null because this is guaranteed by the delegee.
PartitionsFacetResultsHandler frh = createFacetResultsHandler(fres.getFacetRequest());
-      // fix the result of current request
-      sampler.getSampleFixer(indexReader, taxonomyReader, searchParams).fixResult(docids, fres);
+      if (samplerFixer != null) {
+        // fix the result of current request
+        samplerFixer.fixResult(docids, fres, samplingRatio);
+        fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any arranging it needs to
+        if (shouldOversample) {
+          // Using the sampler to trim the extra (over-sampled) results
+          fres = sampler.trimResult(fres);
+        }
+      }
-      fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any arranging it needs to
-      // Using the sampler to trim the extra (over-sampled) results
-      fres = sampler.trimResult(fres);
// final labeling if allowed (because labeling is a costly operation)
frh.labelResult(fres);
fixedRes.add(fres); // add to final results
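With no fixer configured, accumulate() now skips fixing and trimming entirely, so the returned values are raw sampled counts. A rough usage sketch of that default path, mirroring the new SamplerTest at the bottom of this commit (fsp, searcher, indexReader, and taxoReader are assumed to exist):

SamplingAccumulator acc = new SamplingAccumulator(
    new RandomSampler(new SamplingParams(), new Random()), fsp, indexReader, taxoReader);
FacetsCollector fc = FacetsCollector.create(acc);
searcher.search(new MatchAllDocsQuery(), fc);
// No SampleFixer was set, so values reflect the sample, not the full doc set:
List<FacetResult> results = fc.getFacetResults();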

SamplingParams.java

@@ -28,7 +28,7 @@ public class SamplingParams {
* Default factor by which more results are requested over the sample set.
* @see SamplingParams#getOversampleFactor()
*/
-public static final double DEFAULT_OVERSAMPLE_FACTOR = 2d;
+public static final double DEFAULT_OVERSAMPLE_FACTOR = 1d;
/**
* Default ratio between size of sample to original size of document set.
@@ -59,6 +59,8 @@ public class SamplingParams {
private double sampleRatio = DEFAULT_SAMPLE_RATIO;
private int samplingThreshold = DEFAULT_SAMPLING_THRESHOLD;
private double oversampleFactor = DEFAULT_OVERSAMPLE_FACTOR;
+  private SampleFixer sampleFixer = null;
/**
* Return the maxSampleSize.
@@ -166,4 +168,29 @@ public class SamplingParams {
this.oversampleFactor = oversampleFactor;
}
}
+  /**
+   * @return the {@link SampleFixer} used to fix the sampled results; if
+   *         {@code null}, no fixing is performed
+   */
+  public SampleFixer getSampleFixer() {
+    return sampleFixer;
+  }
+
+  /**
+   * Set a {@link SampleFixer} to be used for fixing the sampled results.
+   * {@code null} means no fixing will be performed.
+   */
+  public void setSampleFixer(SampleFixer sampleFixer) {
+    this.sampleFixer = sampleFixer;
+  }
+
+  /**
+   * Returns whether over-sampling should be done. By default returns
+   * {@code true} when {@link #getSampleFixer()} is not {@code null} and
+   * {@link #getOversampleFactor()} &gt; 1, {@code false} otherwise.
+   */
+  public boolean shouldOverSample() {
+    return sampleFixer != null && oversampleFactor > 1d;
+  }
}
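How the new knobs combine, per shouldOverSample() above (illustrative values; the TakmiSampleFixer constructor arguments are assumed to exist):

SamplingParams sp = new SamplingParams();
sp.shouldOverSample(); // false: no fixer set, and the default factor is now 1
sp.setSampleFixer(new TakmiSampleFixer(indexReader, taxoReader, searchParams));
sp.shouldOverSample(); // still false: the oversample factor must exceed 1
sp.setOversampleFactor(2d);
sp.shouldOverSample(); // true: fixer present and factor > 1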

SamplingWrapper.java

@@ -52,29 +52,41 @@ public class SamplingWrapper extends StandardFacetsAccumulator {
public List<FacetResult> accumulate(ScoredDocIDs docids) throws IOException {
// Replacing the original searchParams with the over-sampled (and without statistics-compute)
FacetSearchParams original = delegee.searchParams;
-    delegee.searchParams = sampler.overSampledSearchParams(original);
+    boolean shouldOversample = sampler.samplingParams.shouldOverSample();
+    if (shouldOversample) {
+      delegee.searchParams = sampler.overSampledSearchParams(original);
+    }
SampleResult sampleSet = sampler.getSampleSet(docids);
List<FacetResult> sampleRes = delegee.accumulate(sampleSet.docids);
List<FacetResult> fixedRes = new ArrayList<FacetResult>();
+    SampleFixer sampleFixer = sampler.samplingParams.getSampleFixer();
for (FacetResult fres : sampleRes) {
// for sure fres is not null because this is guaranteed by the delegee.
PartitionsFacetResultsHandler frh = createFacetResultsHandler(fres.getFacetRequest());
-      // fix the result of current request
-      sampler.getSampleFixer(indexReader, taxonomyReader, searchParams).fixResult(docids, fres);
-      fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any
+      if (sampleFixer != null) {
+        // fix the result of current request
+        sampleFixer.fixResult(docids, fres, sampleSet.actualSampleRatio);
+        fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any
+      }
-      // Using the sampler to trim the extra (over-sampled) results
-      fres = sampler.trimResult(fres);
+      if (shouldOversample) {
+        // Using the sampler to trim the extra (over-sampled) results
+        fres = sampler.trimResult(fres);
+      }
// final labeling if allowed (because labeling is a costly operation)
frh.labelResult(fres);
fixedRes.add(fres); // add to final results
}
-    delegee.searchParams = original; // Back to original params
+    if (shouldOversample) {
+      delegee.searchParams = original; // Back to original params
+    }
return fixedRes;
}

TakmiSampleFixer.java

@@ -2,21 +2,19 @@ package org.apache.lucene.facet.sampling;
import java.io.IOException;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.util.Bits;
import org.apache.lucene.facet.params.FacetSearchParams;
import org.apache.lucene.facet.search.DrillDownQuery;
import org.apache.lucene.facet.search.FacetResult;
import org.apache.lucene.facet.search.FacetResultNode;
import org.apache.lucene.facet.search.ScoredDocIDs;
import org.apache.lucene.facet.search.ScoredDocIDsIterator;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.Bits;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -36,16 +34,21 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
*/
/**
- * Fix sampling results by counting the intersection between two lists: a
- * TermDocs (list of documents in a certain category) and a DocIdSetIterator
- * (list of documents matching the query).
- *
+ * Fix sampling results by counting the intersection between two lists: a
+ * TermDocs (list of documents in a certain category) and a
+ * DocIdSetIterator (list of documents matching the query).
+ * <p>
+ * This fixer is suitable for scenarios which prioritize accuracy over
+ * performance.
+ * <p>
+ * <b>Note:</b> for statistically more accurate top-k selection, set
+ * {@link SamplingParams#setOversampleFactor(double) oversampleFactor} to at
+ * least 2, so that the top-k categories have a better chance of showing up
+ * in the sampled top-cK results (see
+ * {@link SamplingParams#getOversampleFactor()}).
*
* @lucene.experimental
*/
-// TODO (Facet): implement also an estimated fixing by ratio (taking into
-// account "translation" of counts!)
-class TakmiSampleFixer implements SampleFixer {
+public class TakmiSampleFixer extends SampleFixer {
private TaxonomyReader taxonomyReader;
private IndexReader indexReader;
@@ -59,28 +62,10 @@ class TakmiSampleFixer implements SampleFixer {
}
@Override
-  public void fixResult(ScoredDocIDs origDocIds, FacetResult fres)
-      throws IOException {
-    FacetResultNode topRes = fres.getFacetResultNode();
-    fixResultNode(topRes, origDocIds);
+  public void singleNodeFix(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio) throws IOException {
+    recount(facetResNode, docIds);
}
-  /**
-   * Fix result node count, and, recursively, fix all its children
-   *
-   * @param facetResNode
-   *          result node to be fixed
-   * @param docIds
-   *          docids in effect
-   * @throws IOException If there is a low-level I/O error.
-   */
-  private void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds) throws IOException {
-    recount(facetResNode, docIds);
-    for (FacetResultNode frn : facetResNode.subResults) {
-      fixResultNode(frn, docIds);
-    }
-  }
/**
* Internal utility: recount for a facet result node
*
@@ -179,4 +164,5 @@ class TakmiSampleFixer implements SampleFixer {
}
return false; // exhausted
}
}

StandardFacetsAccumulator.java

@@ -94,7 +94,7 @@ public class StandardFacetsAccumulator extends FacetsAccumulator {
private Object accumulateGuard;
-  private double complementThreshold;
+  private double complementThreshold = DEFAULT_COMPLEMENT_THRESHOLD;
public StandardFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader,
TaxonomyReader taxonomyReader) {

BaseSampleTestTopK.java

@@ -94,7 +94,7 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
for (int nTrial = 0; nTrial < RETRIES; nTrial++) {
try {
// complement with sampling!
-        final Sampler sampler = createSampler(nTrial, useRandomSampler);
+        final Sampler sampler = createSampler(nTrial, useRandomSampler, samplingSearchParams);
assertSampling(expectedResults, q, sampler, samplingSearchParams, false);
assertSampling(expectedResults, q, sampler, samplingSearchParams, true);
@@ -128,14 +128,20 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
return FacetsCollector.create(sfa);
}
-  private Sampler createSampler(int nTrial, boolean useRandomSampler) {
+  private Sampler createSampler(int nTrial, boolean useRandomSampler, FacetSearchParams sParams) {
SamplingParams samplingParams = new SamplingParams();
+    /*
+     * Set sampling to Exact fixing with TakmiSampleFixer as it is not easy to
+     * validate results with amortized results.
+     */
+    samplingParams.setSampleFixer(new TakmiSampleFixer(indexReader, taxoReader, sParams));
final double retryFactor = Math.pow(1.01, nTrial);
-    samplingParams.setOversampleFactor(5.0 * retryFactor); // Oversampling
samplingParams.setSampleRatio(0.8 * retryFactor);
samplingParams.setMinSampleSize((int) (100 * retryFactor));
samplingParams.setMaxSampleSize((int) (10000 * retryFactor));
+    samplingParams.setOversampleFactor(5.0 * retryFactor);
+    samplingParams.setSamplingThreshold(11000); //force sampling
Sampler sampler = useRandomSampler ?

SamplerTest.java

@@ -0,0 +1,111 @@
package org.apache.lucene.facet.sampling;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.facet.FacetTestBase;
import org.apache.lucene.facet.params.FacetIndexingParams;
import org.apache.lucene.facet.params.FacetSearchParams;
import org.apache.lucene.facet.search.CountFacetRequest;
import org.apache.lucene.facet.search.FacetResultNode;
import org.apache.lucene.facet.search.FacetsCollector;
import org.apache.lucene.facet.search.StandardFacetsAccumulator;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.junit.After;
import org.junit.Before;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class SamplerTest extends FacetTestBase {
private FacetIndexingParams fip;
@Override
@Before
public void setUp() throws Exception {
super.setUp();
fip = getFacetIndexingParams(Integer.MAX_VALUE);
initIndex(fip);
}
@Override
protected int numDocsToIndex() {
return 100;
}
@Override
protected List<CategoryPath> getCategories(final int doc) {
return new ArrayList<CategoryPath>() {
{
add(new CategoryPath("root", "a", Integer.toString(doc % 10)));
}
};
}
@Override
protected String getContent(int doc) {
return "";
}
@Override
@After
public void tearDown() throws Exception {
closeAll();
super.tearDown();
}
public void testDefaultFixer() throws Exception {
RandomSampler randomSampler = new RandomSampler();
SampleFixer fixer = randomSampler.samplingParams.getSampleFixer();
assertEquals(null, fixer);
}
public void testCustomFixer() throws Exception {
SamplingParams sp = new SamplingParams();
sp.setSampleFixer(new TakmiSampleFixer(null, null, null));
assertEquals(TakmiSampleFixer.class, sp.getSampleFixer().getClass());
}
public void testNoFixing() throws Exception {
SamplingParams sp = new SamplingParams();
sp.setMaxSampleSize(10);
sp.setMinSampleSize(5);
sp.setSampleRatio(0.01d);
sp.setSamplingThreshold(50);
sp.setOversampleFactor(5d);
assertNull("Fixer should be null as the test is for no-fixing",
sp.getSampleFixer());
FacetSearchParams fsp = new FacetSearchParams(fip, new CountFacetRequest(
new CategoryPath("root", "a"), 1));
SamplingAccumulator accumulator = new SamplingAccumulator(
new RandomSampler(sp, random()), fsp, indexReader, taxoReader);
// Make sure no complements are in action
accumulator
.setComplementThreshold(StandardFacetsAccumulator.DISABLE_COMPLEMENT);
FacetsCollector fc = FacetsCollector.create(accumulator);
searcher.search(new MatchAllDocsQuery(), fc);
FacetResultNode node = fc.getFacetResults().get(0).getFacetResultNode();
assertTrue(node.value < numDocsToIndex());
}
}