LUCENE-5015: Unexpected performance difference between SamplingAccumulator and StandardFacetAccumulator

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1487397 13f79535-47bb-0310-9956-ffa450edef68
2013-05-29 08:24:03 +00:00 · 2013-05-29 08:24:03 +00:00 · d784679c84
parent a79ffdfeee
commit d784679c84
10 changed files with 245 additions and 73 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -86,6 +86,10 @@ Changes in backwards compatibility policy
  DictionaryCompoundWordTokenFilter and HyphenationCompoundWordTokenFilter don't
  update offsets anymore. (Adrien Grand)

+* LUCENE-5015: SamplingAccumulator no longer corrects the counts of the sampled
+  categories. Set TakmiSampleFixer on SamplingParams if corrected counts are needed
+  (note that this slows down search). (Rob Audenaerde, Gilad Barkai, Shai Erera)
+
 Bug Fixes

 * LUCENE-4997: Internal test framework's tests are sensitive to previous 
--- a/lucene/facet/src/java/org/apache/lucene/facet/sampling/SampleFixer.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sampling/SampleFixer.java
@ -3,6 +3,7 @@ package org.apache.lucene.facet.sampling;
 import java.io.IOException;

 import org.apache.lucene.facet.search.FacetResult;
+import org.apache.lucene.facet.search.FacetResultNode;
 import org.apache.lucene.facet.search.ScoredDocIDs;

 /*
@ -23,22 +24,50 @@ import org.apache.lucene.facet.search.ScoredDocIDs;
 */

 /**
- * Fixer of sample facet accumulation results
+ * Fixer of sample facet accumulation results.
 * 
 * @lucene.experimental
 */
-public interface SampleFixer {
+public abstract class SampleFixer {
  
  /**
   * Alter the input result, fixing it to account for the sampling. This
-   * implementation can compute accurate or estimated counts for the sampled facets. 
-   * For example, a faster correction could just multiply by a compensating factor.
+   * implementation can compute accurate or estimated counts for the sampled
+   * facets. For example, a faster correction could just multiply by a
+   * compensating factor.
   * 
   * @param origDocIds
   *          full set of matching documents.
   * @param fres
   *          sample result to be fixed.
-   * @throws IOException If there is a low-level I/O error.
+   * @param samplingRatio the sampling ratio that produced the sampled result.
+   * @throws IOException If there is a low-level I/O error.
   */
-  public void fixResult(ScoredDocIDs origDocIds, FacetResult fres) throws IOException; 
+  public void fixResult(ScoredDocIDs origDocIds, FacetResult fres, double samplingRatio) throws IOException {
+    FacetResultNode topRes = fres.getFacetResultNode();
+    fixResultNode(topRes, origDocIds, samplingRatio);
+  }
+  
+  /**
+   * Fix result node count, and, recursively, fix all its children
+   * 
+   * @param facetResNode
+   *          result node to be fixed
+   * @param docIds docids in effect
+   * @param samplingRatio the sampling ratio that produced the sampled result
+   * @throws IOException
+   *           If there is a low-level I/O error.
+   */
+  protected void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio) 
+      throws IOException {
+    singleNodeFix(facetResNode, docIds, samplingRatio);
+    for (FacetResultNode frn : facetResNode.subResults) {
+      fixResultNode(frn, docIds, samplingRatio);
+    }
+  }
+  
+  /** Fix the given node's value. */
+  protected abstract void singleNodeFix(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio) 
+      throws IOException;
+  
 }
--- a/lucene/facet/src/java/org/apache/lucene/facet/sampling/Sampler.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sampling/Sampler.java
@ -12,7 +12,6 @@ import org.apache.lucene.facet.search.FacetResult;
 import org.apache.lucene.facet.search.FacetResultNode;
 import org.apache.lucene.facet.search.ScoredDocIDs;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
-import org.apache.lucene.index.IndexReader;

 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
@ -110,16 +109,6 @@ public abstract class Sampler {
  protected abstract SampleResult createSample(ScoredDocIDs docids, int actualSize, int sampleSetSize) 
      throws IOException;

-  /**
-   * Get a fixer of sample facet accumulation results. Default implementation
-   * returns a <code>TakmiSampleFixer</code> which is adequate only for
-   * counting. For any other accumulator, provide a different fixer.
-   */
-  public SampleFixer getSampleFixer(IndexReader indexReader, TaxonomyReader taxonomyReader,
-      FacetSearchParams searchParams) {
-    return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams);
-  }
-  
  /**
   * Result of sample computation
   */
--- a/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingAccumulator.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingAccumulator.java
@ -79,7 +79,11 @@ public class SamplingAccumulator extends StandardFacetsAccumulator {
  public List<FacetResult> accumulate(ScoredDocIDs docids) throws IOException {
    // Replacing the original searchParams with the over-sampled
    FacetSearchParams original = searchParams;
-    searchParams = sampler.overSampledSearchParams(original);
+    SampleFixer samplerFixer = sampler.samplingParams.getSampleFixer();
+    final boolean shouldOversample = sampler.samplingParams.shouldOverSample();
+    if (shouldOversample) {
+      searchParams = sampler.overSampledSearchParams(original);
+    }
    
    List<FacetResult> sampleRes = super.accumulate(docids);
    
@ -87,14 +91,18 @@ public class SamplingAccumulator extends StandardFacetsAccumulator {
    for (FacetResult fres : sampleRes) {
      // for sure fres is not null because this is guaranteed by the delegee.
      PartitionsFacetResultsHandler frh = createFacetResultsHandler(fres.getFacetRequest());
-      // fix the result of current request
-      sampler.getSampleFixer(indexReader, taxonomyReader, searchParams).fixResult(docids, fres);
+      if (samplerFixer != null) {
+        // fix the result of current request
+        samplerFixer.fixResult(docids, fres, samplingRatio);
+        
+        fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any arranging it needs to
+
+        if (shouldOversample) {
+          // Using the sampler to trim the extra (over-sampled) results
+          fres = sampler.trimResult(fres);
+        }
+      }
      
-      fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any arranging it needs to
-
-      // Using the sampler to trim the extra (over-sampled) results
-      fres = sampler.trimResult(fres);
-
      // final labeling if allowed (because labeling is a costly operation)
      frh.labelResult(fres);
      fixedRes.add(fres); // add to final results
--- a/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingParams.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingParams.java
@ -28,7 +28,7 @@ public class SamplingParams {
   * Default factor by which more results are requested over the sample set.
   * @see SamplingParams#getOversampleFactor()
   */
-  public static final double DEFAULT_OVERSAMPLE_FACTOR = 2d;
+  public static final double DEFAULT_OVERSAMPLE_FACTOR = 1d;
  
  /**
   * Default ratio between size of sample to original size of document set.
@ -59,6 +59,8 @@ public class SamplingParams {
  private double sampleRatio = DEFAULT_SAMPLE_RATIO;
  private int samplingThreshold = DEFAULT_SAMPLING_THRESHOLD;
  private double oversampleFactor = DEFAULT_OVERSAMPLE_FACTOR;
+
+  private SampleFixer sampleFixer = null;
  
  /**
   * Return the maxSampleSize.
@ -166,4 +168,29 @@ public class SamplingParams {
    this.oversampleFactor = oversampleFactor;
  }

-}
+  /**
+   * @return {@link SampleFixer} to be used while fixing the sampled results, if
+   *         <code>null</code> no fixing will be performed
+   */
+  public SampleFixer getSampleFixer() {
+    return sampleFixer;
+  }
+
+  /**
+   * Set a {@link SampleFixer} to be used while fixing the sampled results.
+   * {@code null} means no fixing will be performed
+   */
+  public void setSampleFixer(SampleFixer sampleFixer) {
+    this.sampleFixer = sampleFixer;
+  }
+
+  /**
+   * Returns whether over-sampling should be done. By default returns
+   * {@code true} when {@link #getSampleFixer()} is not {@code null} and
+   * {@link #getOversampleFactor()} &gt; 1, {@code false} otherwise.
+   */
+  public boolean shouldOverSample() {
+    return sampleFixer != null && oversampleFactor > 1d;
+  }
+  
+}
--- a/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingWrapper.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingWrapper.java
@ -52,29 +52,41 @@ public class SamplingWrapper extends StandardFacetsAccumulator {
  public List<FacetResult> accumulate(ScoredDocIDs docids) throws IOException {
    // Replacing the original searchParams with the over-sampled (and without statistics-compute)
    FacetSearchParams original = delegee.searchParams;
-    delegee.searchParams = sampler.overSampledSearchParams(original);
+    boolean shouldOversample = sampler.samplingParams.shouldOverSample();
+   
+    if (shouldOversample) {
+      delegee.searchParams = sampler.overSampledSearchParams(original);
+    }
    
    SampleResult sampleSet = sampler.getSampleSet(docids);

    List<FacetResult> sampleRes = delegee.accumulate(sampleSet.docids);

    List<FacetResult> fixedRes = new ArrayList<FacetResult>();
+    SampleFixer sampleFixer = sampler.samplingParams.getSampleFixer();
+    
    for (FacetResult fres : sampleRes) {
      // for sure fres is not null because this is guaranteed by the delegee.
      PartitionsFacetResultsHandler frh = createFacetResultsHandler(fres.getFacetRequest());
-      // fix the result of current request
-      sampler.getSampleFixer(indexReader, taxonomyReader, searchParams).fixResult(docids, fres); 
-      fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any
+      if (sampleFixer != null) {
+        // fix the result of current request
+        sampleFixer.fixResult(docids, fres, sampleSet.actualSampleRatio); 
+        fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any
+      }
      
-      // Using the sampler to trim the extra (over-sampled) results
-      fres = sampler.trimResult(fres);
+      if (shouldOversample) {
+        // Using the sampler to trim the extra (over-sampled) results
+        fres = sampler.trimResult(fres);
+      }
      
      // final labeling if allowed (because labeling is a costly operation)
      frh.labelResult(fres);
      fixedRes.add(fres); // add to final results
    }

-    delegee.searchParams = original; // Back to original params
+    if (shouldOversample) {
+      delegee.searchParams = original; // Back to original params
+    }
    
    return fixedRes; 
  }
--- a/lucene/facet/src/java/org/apache/lucene/facet/sampling/TakmiSampleFixer.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sampling/TakmiSampleFixer.java
@ -2,21 +2,19 @@ package org.apache.lucene.facet.sampling;

 import java.io.IOException;

-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.util.Bits;
-
 import org.apache.lucene.facet.params.FacetSearchParams;
 import org.apache.lucene.facet.search.DrillDownQuery;
-import org.apache.lucene.facet.search.FacetResult;
 import org.apache.lucene.facet.search.FacetResultNode;
 import org.apache.lucene.facet.search.ScoredDocIDs;
 import org.apache.lucene.facet.search.ScoredDocIDsIterator;
 import org.apache.lucene.facet.taxonomy.CategoryPath;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.Bits;

 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
@ -36,16 +34,21 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
 */

 /**
- * Fix sampling results by counting the intersection between two lists: a
- * TermDocs (list of documents in a certain category) and a DocIdSetIterator
- * (list of documents matching the query).
- * 
+ * Fix sampling results with exact counts, by counting the intersection between
+ * two lists: a TermDocs (list of documents in a certain category) and a
+ * DocIdSetIterator (list of documents matching the query).
+ * <p>
+ * This fixer is suitable for scenarios which prioritize accuracy over
+ * performance. 
+ * <p>
+ * <b>Note:</b> for statistically more accurate top-k selection, set
+ * {@link SamplingParams#setOversampleFactor(double) oversampleFactor} to at
+ * least 2, so that the top-k categories would have better chance of showing up
+ * in the sampled top-cK results (see {@link SamplingParams#getOversampleFactor}).
 * 
 * @lucene.experimental
 */
-// TODO (Facet): implement also an estimated fixing by ratio (taking into
-// account "translation" of counts!)
-class TakmiSampleFixer implements SampleFixer {
+public class TakmiSampleFixer extends SampleFixer {
  
  private TaxonomyReader taxonomyReader;
  private IndexReader indexReader;
@ -59,28 +62,10 @@ class TakmiSampleFixer implements SampleFixer {
  }

  @Override
-  public void fixResult(ScoredDocIDs origDocIds, FacetResult fres)
-      throws IOException {
-    FacetResultNode topRes = fres.getFacetResultNode();
-    fixResultNode(topRes, origDocIds);
+  public void singleNodeFix(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio) throws IOException {
+    recount(facetResNode, docIds);
  }
  
-  /**
-   * Fix result node count, and, recursively, fix all its children
-   * 
-   * @param facetResNode
-   *          result node to be fixed
-   * @param docIds
-   *          docids in effect
-   * @throws IOException If there is a low-level I/O error.
-   */
-  private void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds) throws IOException {
-    recount(facetResNode, docIds);
-    for (FacetResultNode frn : facetResNode.subResults) {
-      fixResultNode(frn, docIds);
-    }
-  }
-
  /**
   * Internal utility: recount for a facet result node
   * 
@ -179,4 +164,5 @@ class TakmiSampleFixer implements SampleFixer {
    }
    return false; // exhausted
  }
+
 }
--- a/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java
@ -94,7 +94,7 @@ public class StandardFacetsAccumulator extends FacetsAccumulator {

  private Object accumulateGuard;

-  private double complementThreshold;
+  private double complementThreshold = DEFAULT_COMPLEMENT_THRESHOLD;
  
  public StandardFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader, 
      TaxonomyReader taxonomyReader) {
--- a/lucene/facet/src/test/org/apache/lucene/facet/sampling/BaseSampleTestTopK.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/sampling/BaseSampleTestTopK.java
@ -94,7 +94,7 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
        for (int nTrial = 0; nTrial < RETRIES; nTrial++) {
          try {
            // complement with sampling!
-            final Sampler sampler = createSampler(nTrial, useRandomSampler);
+            final Sampler sampler = createSampler(nTrial, useRandomSampler, samplingSearchParams);
            
            assertSampling(expectedResults, q, sampler, samplingSearchParams, false);
            assertSampling(expectedResults, q, sampler, samplingSearchParams, true);
@ -128,14 +128,20 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
    return FacetsCollector.create(sfa);
  }
  
-  private Sampler createSampler(int nTrial, boolean useRandomSampler) {
+  private Sampler createSampler(int nTrial, boolean useRandomSampler, FacetSearchParams sParams) {
    SamplingParams samplingParams = new SamplingParams();
    
+    /*
+     * Use exact fixing with TakmiSampleFixer, as it is not easy to validate
+     * the expected results against amortized (estimated) counts.
+     */
+    samplingParams.setSampleFixer(new TakmiSampleFixer(indexReader, taxoReader, sParams));
+        
    final double retryFactor = Math.pow(1.01, nTrial);
+    samplingParams.setOversampleFactor(5.0 * retryFactor); // Oversampling 
    samplingParams.setSampleRatio(0.8 * retryFactor);
    samplingParams.setMinSampleSize((int) (100 * retryFactor));
    samplingParams.setMaxSampleSize((int) (10000 * retryFactor));
-    samplingParams.setOversampleFactor(5.0 * retryFactor);
    samplingParams.setSamplingThreshold(11000); //force sampling

    Sampler sampler = useRandomSampler ? 
--- a/lucene/facet/src/test/org/apache/lucene/facet/sampling/SamplerTest.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/sampling/SamplerTest.java
@ -0,0 +1,111 @@
+package org.apache.lucene.facet.sampling;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.facet.FacetTestBase;
+import org.apache.lucene.facet.params.FacetIndexingParams;
+import org.apache.lucene.facet.params.FacetSearchParams;
+import org.apache.lucene.facet.search.CountFacetRequest;
+import org.apache.lucene.facet.search.FacetResultNode;
+import org.apache.lucene.facet.search.FacetsCollector;
+import org.apache.lucene.facet.search.StandardFacetsAccumulator;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.junit.After;
+import org.junit.Before;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class SamplerTest extends FacetTestBase {
+  
+  private FacetIndexingParams fip;
+  
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+    fip = getFacetIndexingParams(Integer.MAX_VALUE);
+    initIndex(fip);
+  }
+  
+  @Override
+  protected int numDocsToIndex() {
+    return 100;
+  }
+  
+  @Override
+  protected List<CategoryPath> getCategories(final int doc) {
+    return new ArrayList<CategoryPath>() {
+      {
+        add(new CategoryPath("root", "a", Integer.toString(doc % 10)));
+      }
+    };
+  }
+  
+  @Override
+  protected String getContent(int doc) {
+    return "";
+  }
+  
+  @Override
+  @After
+  public void tearDown() throws Exception {
+    closeAll();
+    super.tearDown();
+  }
+  
+  public void testDefaultFixer() throws Exception {
+    RandomSampler randomSampler = new RandomSampler();
+    SampleFixer fixer = randomSampler.samplingParams.getSampleFixer();
+    assertEquals(null, fixer);
+  }
+  
+  public void testCustomFixer() throws Exception {
+    SamplingParams sp = new SamplingParams();
+    sp.setSampleFixer(new TakmiSampleFixer(null, null, null));
+    assertEquals(TakmiSampleFixer.class, sp.getSampleFixer().getClass());
+  }
+  
+  public void testNoFixing() throws Exception {
+    SamplingParams sp = new SamplingParams();
+    sp.setMaxSampleSize(10);
+    sp.setMinSampleSize(5);
+    sp.setSampleRatio(0.01d);
+    sp.setSamplingThreshold(50);
+    sp.setOversampleFactor(5d);
+    
+    assertNull("Fixer should be null as the test is for no-fixing",
+        sp.getSampleFixer());
+    FacetSearchParams fsp = new FacetSearchParams(fip, new CountFacetRequest(
+        new CategoryPath("root", "a"), 1));
+    SamplingAccumulator accumulator = new SamplingAccumulator(
+        new RandomSampler(sp, random()), fsp, indexReader, taxoReader);
+    
+    // Make sure no complements are in action
+    accumulator
+        .setComplementThreshold(StandardFacetsAccumulator.DISABLE_COMPLEMENT);
+    
+    FacetsCollector fc = FacetsCollector.create(accumulator);
+    
+    searcher.search(new MatchAllDocsQuery(), fc);
+    FacetResultNode node = fc.getFacetResults().get(0).getFacetResultNode();
+    
+    assertTrue(node.value < numDocsToIndex());
+  }
+}