mirror of https://github.com/apache/lucene.git
Improve random sampling test
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1636821 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2c8de2f35c
commit
5b00b2e981
|
@ -1,5 +1,6 @@
|
|||
package org.apache.lucene.facet;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
|
@ -12,6 +13,7 @@ import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
|
|||
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MultiCollector;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
|
@ -37,29 +39,40 @@ import org.apache.lucene.util.IOUtils;
|
|||
|
||||
public class TestRandomSamplingFacetsCollector extends FacetTestCase {
|
||||
|
||||
// The first 50 chi-square value for p-value=0.05, taken from:
|
||||
// http://en.wikibooks.org/wiki/Engineering_Tables/Chi-Squared_Distibution
|
||||
private static final float[] CHI_SQUARE_VALUES = new float[] {0.0f, 3.841f,
|
||||
5.991f, 7.815f, 9.488f, 11.07f, 12.592f, 14.067f, 15.507f, 16.919f,
|
||||
18.307f, 19.675f, 21.026f, 22.362f, 23.685f, 24.996f, 26.296f, 27.587f,
|
||||
28.869f, 30.144f, 31.41f, 32.671f, 33.924f, 35.172f, 36.415f, 37.652f,
|
||||
38.885f, 40.113f, 41.337f, 42.557f, 43.773f, 44.985f, 46.194f, 47.4f,
|
||||
48.602f, 49.802f, 50.998f, 52.192f, 53.384f, 54.572f, 55.758f, 56.942f,
|
||||
58.124f, 59.304f, 60.481f, 61.656f, 62.83f, 64.001f, 65.171f, 66.339f,
|
||||
67.505f};
|
||||
|
||||
public void testRandomSampling() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
Directory taxoDir = newDirectory();
|
||||
|
||||
Random random = random();
|
||||
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random, dir);
|
||||
|
||||
FacetsConfig config = new FacetsConfig();
|
||||
|
||||
final int numCategories = 10;
|
||||
int numDocs = atLeast(10000);
|
||||
for (int i = 0; i < numDocs; i++) {
|
||||
Document doc = new Document();
|
||||
doc.add(new StringField("EvenOdd", (i % 2 == 0) ? "even" : "odd", Store.NO));
|
||||
doc.add(new FacetField("iMod10", String.valueOf(i % 10)));
|
||||
doc.add(new FacetField("iMod10", Integer.toString(i % numCategories)));
|
||||
writer.addDocument(config.build(taxoWriter, doc));
|
||||
}
|
||||
Random random = random();
|
||||
|
||||
// NRT open
|
||||
IndexSearcher searcher = newSearcher(writer.getReader());
|
||||
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
|
||||
writer.close();
|
||||
IOUtils.close(taxoWriter);
|
||||
IOUtils.close(writer, taxoWriter);
|
||||
|
||||
// Test empty results
|
||||
RandomSamplingFacetsCollector collectRandomZeroResults = new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());
|
||||
|
@ -80,61 +93,55 @@ public class TestRandomSamplingFacetsCollector extends FacetTestCase {
|
|||
// Use a query to select half of the documents.
|
||||
TermQuery query = new TermQuery(new Term("EvenOdd", "even"));
|
||||
|
||||
// there will be 5 facet values (0, 2, 4, 6 and 8), as only the even (i %
|
||||
// 10) are hits.
|
||||
// there is a REAL small chance that one of the 5 values will be missed when
|
||||
// sampling.
|
||||
// but is that 0.8 (chance not to take a value) ^ 2000 * 5 (any can be
|
||||
// missing) ~ 10^-193
|
||||
// so that is probably not going to happen.
|
||||
int maxNumChildren = 5;
|
||||
|
||||
RandomSamplingFacetsCollector random100Percent = new RandomSamplingFacetsCollector(numDocs, random.nextLong()); // no sampling
|
||||
RandomSamplingFacetsCollector random10Percent = new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong()); // 10 % of total docs, 20% of the hits
|
||||
RandomSamplingFacetsCollector random10Percent = new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong()); // 10% of total docs, 20% of the hits
|
||||
|
||||
FacetsCollector fc = new FacetsCollector();
|
||||
|
||||
searcher.search(query, MultiCollector.wrap(fc, random100Percent, random10Percent));
|
||||
searcher.search(query, MultiCollector.wrap(fc, random10Percent));
|
||||
|
||||
FastTaxonomyFacetCounts random10FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
|
||||
FastTaxonomyFacetCounts random100FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random100Percent);
|
||||
FastTaxonomyFacetCounts exactFacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, fc);
|
||||
|
||||
FacetResult random10Result = random10Percent.amortizeFacetCounts(random10FacetCounts.getTopChildren(10, "iMod10"), config, searcher);
|
||||
FacetResult random100Result = random100FacetCounts.getTopChildren(10, "iMod10");
|
||||
FacetResult exactResult = exactFacetCounts.getTopChildren(10, "iMod10");
|
||||
|
||||
assertEquals(random100Result, exactResult);
|
||||
|
||||
// we should have five children, but there is a small chance we have less.
|
||||
// (see above).
|
||||
assertTrue(random10Result.childCount <= maxNumChildren);
|
||||
// there should be one child at least.
|
||||
assertTrue(random10Result.childCount >= 1);
|
||||
|
||||
// now calculate some statistics to determine if the sampled result is 'ok'.
|
||||
// because random sampling is used, the results will vary each time.
|
||||
int sum = 0;
|
||||
for (LabelAndValue lav : random10Result.labelValues) {
|
||||
sum += lav.value.intValue();
|
||||
final List<MatchingDocs> matchingDocs = random10Percent.getMatchingDocs();
|
||||
|
||||
// count the total hits and sampled docs, also store the number of sampled
|
||||
// docs per segment
|
||||
int totalSampledDocs = 0, totalHits = 0;
|
||||
int[] numSampledDocs = new int[matchingDocs.size()];
|
||||
// System.out.println("numSegments=" + numSampledDocs.length);
|
||||
for (int i = 0; i < numSampledDocs.length; i++) {
|
||||
MatchingDocs md = matchingDocs.get(i);
|
||||
final DocIdSetIterator iter = md.bits.iterator();
|
||||
while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) ++numSampledDocs[i];
|
||||
totalSampledDocs += numSampledDocs[i];
|
||||
totalHits += md.totalHits;
|
||||
}
|
||||
float mu = (float) sum / (float) maxNumChildren;
|
||||
|
||||
float variance = 0;
|
||||
for (LabelAndValue lav : random10Result.labelValues) {
|
||||
variance += Math.pow((mu - lav.value.intValue()), 2);
|
||||
// compute the chi-square value for the sampled documents' distribution
|
||||
float chi_square = 0;
|
||||
for (int i = 0; i < numSampledDocs.length; i++) {
|
||||
MatchingDocs md = matchingDocs.get(i);
|
||||
float ei = (float) md.totalHits / totalHits;
|
||||
if (ei > 0.0f) {
|
||||
float oi = (float) numSampledDocs[i] / totalSampledDocs;
|
||||
chi_square += (Math.pow(ei - oi, 2) / ei);
|
||||
}
|
||||
}
|
||||
variance = variance / maxNumChildren;
|
||||
float sigma = (float) Math.sqrt(variance);
|
||||
|
||||
// we query only half the documents and have 5 categories. The average
|
||||
// number of docs in a category will thus be the total divided by 5*2
|
||||
float targetMu = numDocs / (5.0f * 2.0f);
|
||||
// Verify that the chi-square value isn't too big. According to
|
||||
// http://en.wikipedia.org/wiki/Chi-squared_distribution#Table_of_.CF.872_value_vs_p-value,
|
||||
// we basically verify that there is a really small chance of hitting a very
|
||||
// bad sample (p-value < 0.05), for n-degrees of freedom. The number 'n' depends
|
||||
// on the number of segments.
|
||||
assertTrue("chisquare not statistically significant enough: " + chi_square, chi_square < CHI_SQUARE_VALUES[numSampledDocs.length]);
|
||||
|
||||
// the average should be in the range and the standard deviation should not
|
||||
// be too great
|
||||
assertTrue(sigma < 200);
|
||||
assertTrue(targetMu - 3 * sigma < mu && mu < targetMu + 3 * sigma);
|
||||
// Test amortized counts - should be 5X the sampled count, but maximum numDocs/10
|
||||
final FastTaxonomyFacetCounts random10FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
|
||||
final FacetResult random10Result = random10FacetCounts.getTopChildren(10, "iMod10");
|
||||
final FacetResult amortized10Result = random10Percent.amortizeFacetCounts(random10Result, config, searcher);
|
||||
for (int i = 0; i < amortized10Result.labelValues.length; i++) {
|
||||
LabelAndValue amortized = amortized10Result.labelValues[i];
|
||||
LabelAndValue sampled = random10Result.labelValues[i];
|
||||
// since numDocs may not divide by 10 exactly, allow for some slack in the amortized count
|
||||
assertEquals(amortized.value.floatValue(), Math.min(5 * sampled.value.floatValue(), numDocs / 10.f), 1.0);
|
||||
}
|
||||
|
||||
IOUtils.close(searcher.getIndexReader(), taxoReader, dir, taxoDir);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue