diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/AdaptiveFacetsAccumulator.java b/modules/facet/src/java/org/apache/lucene/facet/search/AdaptiveFacetsAccumulator.java index 81c6c4e0f79..3c41787fe5a 100644 --- a/modules/facet/src/java/org/apache/lucene/facet/search/AdaptiveFacetsAccumulator.java +++ b/modules/facet/src/java/org/apache/lucene/facet/search/AdaptiveFacetsAccumulator.java @@ -2,12 +2,15 @@ package org.apache.lucene.facet.search; import java.io.IOException; import java.util.List; +import java.util.Random; import org.apache.lucene.index.IndexReader; import org.apache.lucene.facet.search.params.FacetSearchParams; import org.apache.lucene.facet.search.results.FacetResult; import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.search.sampling.RandomSampler; +import org.apache.lucene.facet.search.sampling.RepeatableSampler; import org.apache.lucene.facet.search.sampling.Sampler; import org.apache.lucene.facet.search.sampling.SamplingAccumulator; import org.apache.lucene.facet.taxonomy.TaxonomyReader; @@ -44,7 +47,7 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader; */ public final class AdaptiveFacetsAccumulator extends StandardFacetsAccumulator { - private Sampler sampler = new Sampler(); + private Sampler sampler = new RandomSampler(); /** * Create an {@link AdaptiveFacetsAccumulator} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/sampling/RandomSampler.java b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/RandomSampler.java new file mode 100644 index 00000000000..da9aa256645 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/RandomSampler.java @@ -0,0 +1,71 @@ +package org.apache.lucene.facet.search.sampling; + +import java.io.IOException; +import java.util.Random; + +import org.apache.lucene.facet.search.ScoredDocIDs; +import org.apache.lucene.facet.search.ScoredDocIDsIterator; +import org.apache.lucene.facet.util.ScoredDocIdsUtils; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Simple random sampler + */ +public class RandomSampler extends Sampler { + + private final Random random; + + public RandomSampler() { + super(); + this.random = new Random(); + } + + public RandomSampler(SamplingParams params, Random random) throws IllegalArgumentException { + super(params); + this.random = random; + } + + @Override + protected SampleResult createSample(ScoredDocIDs docids, int actualSize, int sampleSetSize) throws IOException { + final int[] sample = new int[sampleSetSize]; + final int maxStep = (actualSize * 2 ) / sampleSetSize; //floor + int remaining = actualSize; + ScoredDocIDsIterator it = docids.iterator(); + int i = 0; + // select sample docs with random skipStep, make sure to leave sufficient #docs for selection after last skip + while (i(sampleSetSize-maxStep-i)) { + int skipStep = 1 + random.nextInt(maxStep); + // Skip over 'skipStep' documents + for (int j=0; jsampleSize values from the first collectionSize * locations of collection, chosen using @@ -57,10 +72,10 @@ public class RandomSample { * @return An array of values chosen from the collection. * @see Algorithm#TRAVERSAL */ - public static int[] repeatableSample(ScoredDocIDs collection, + private static int[] repeatableSample(ScoredDocIDs collection, int collectionSize, int sampleSize) throws IOException { - return RandomSample.repeatableSample(collection, collectionSize, + return repeatableSample(collection, collectionSize, sampleSize, Algorithm.HASHING, Sorted.NO); } @@ -75,7 +90,7 @@ public class RandomSample { * Sorted.NO to return them in essentially random order. * @return An array of values chosen from the collection. */ - public static int[] repeatableSample(ScoredDocIDs collection, + private static int[] repeatableSample(ScoredDocIDs collection, int collectionSize, int sampleSize, Algorithm algorithm, Sorted sorted) throws IOException { @@ -91,16 +106,16 @@ public class RandomSample { int[] sample = new int[sampleSize]; long[] times = new long[4]; if (algorithm == Algorithm.TRAVERSAL) { - RandomSample.sample1(collection, collectionSize, sample, times); + sample1(collection, collectionSize, sample, times); } else if (algorithm == Algorithm.HASHING) { - RandomSample.sample2(collection, collectionSize, sample, times); + sample2(collection, collectionSize, sample, times); } else { throw new IllegalArgumentException("Invalid algorithm selection"); } if (sorted == Sorted.YES) { Arrays.sort(sample); } - if (RandomSample.returnTimings) { + if (returnTimings) { times[3] = System.currentTimeMillis(); if (logger.isLoggable(Level.FINEST)) { logger.finest("Times: " + (times[1] - times[0]) + "ms, " @@ -133,13 +148,13 @@ public class RandomSample { private static void sample1(ScoredDocIDs collection, int collectionSize, int[] sample, long[] times) throws IOException { ScoredDocIDsIterator it = collection.iterator(); - if (RandomSample.returnTimings) { + if (returnTimings) { times[0] = System.currentTimeMillis(); } int sampleSize = sample.length; - int prime = RandomSample.findGoodStepSize(collectionSize, sampleSize); + int prime = findGoodStepSize(collectionSize, sampleSize); int mod = prime % collectionSize; - if (RandomSample.returnTimings) { + if (returnTimings) { times[1] = System.currentTimeMillis(); } int sampleCount = 0; @@ -158,10 +173,10 @@ public class RandomSample { } sample[sampleCount++] = it.getDocID(); } - if (RandomSample.returnTimings) { + if (returnTimings) { times[2] = System.currentTimeMillis(); } - } // end RandomSample.sample1() + } /** * Returns a value which will allow the caller to walk @@ -187,10 +202,10 @@ public class RandomSample { i = collectionSize / sampleSize; } do { - i = RandomSample.findNextPrimeAfter(i); + i = findNextPrimeAfter(i); } while (collectionSize % i == 0); return i; - } // end RandomSample.findGoodStepSize() + } /** * Returns the first prime number that is larger than n. @@ -199,10 +214,10 @@ public class RandomSample { */ private static int findNextPrimeAfter(int n) { n += (n % 2 == 0) ? 1 : 2; // next odd - foundFactor: for (;; n += 2) { + foundFactor: for (;; n += 2) { //TODO labels??!! int sri = (int) (Math.sqrt(n)); - for (int primeIndex = 0; primeIndex < RandomSample.N_PRIMES; primeIndex++) { - int p = RandomSample.primes[primeIndex]; + for (int primeIndex = 0; primeIndex < N_PRIMES; primeIndex++) { + int p = primes[primeIndex]; if (p > sri) { return n; } @@ -210,7 +225,7 @@ public class RandomSample { continue foundFactor; } } - for (int p = RandomSample.primes[RandomSample.N_PRIMES - 1] + 2;; p += 2) { + for (int p = primes[N_PRIMES - 1] + 2;; p += 2) { if (p > sri) { return n; } @@ -219,70 +234,17 @@ public class RandomSample { } } } - } // end RandomSample.findNextPrimeAfter() - - /** - * Divides the values in collection into numSubranges - * subranges from minValue to maxValue and returns the - * number of values in each subrange. (For testing the flatness of distribution of - * a sample.) - * @param collection The collection of values to be counted. - * @param range The number of possible values. - * @param numSubranges How many intervals to divide the value range into. - */ - private static int[] countsBySubrange(int[] collection, int range, int numSubranges) { - int[] counts = new int[numSubranges]; - Arrays.fill(counts, 0); - int numInSubrange = range / numSubranges; - for (int j = 0; j < collection.length; j++) { - counts[collection[j] / numInSubrange]++; - } - return counts; - } // end RandomSample.countsBySubrange() - - /** - * Factors value into primes. - */ - public static int[] factor(long value) { - ArrayList list = new ArrayList(); - while (value > 1 && value % 2 == 0) { - list.add(2); - value /= 2; - } - long sqrt = Math.round(Math.sqrt(value)); - for (int pIndex = 0, lim = RandomSample.primes.length; pIndex < lim; pIndex++) { - int p = RandomSample.primes[pIndex]; - if (p >= sqrt) { - break; - } - while (value % p == 0) { - list.add(p); - value /= p; - sqrt = Math.round(Math.sqrt(value)); - } - } - if (list.size() == 0 || value > Integer.MAX_VALUE) { - throw new RuntimeException("Prime or too large to factor: "+value); - } - if ((int)value > 1) { - list.add((int)value); - } - int[] factors = new int[list.size()]; - for (int j = 0; j < factors.length; j++) { - factors[j] = list.get(j).intValue(); - } - return factors; - } // end RandomSample.factor() + } /** * The first N_PRIMES primes, after 2. */ private static final int N_PRIMES = 4000; - private static int[] primes = new int[RandomSample.N_PRIMES]; + private static int[] primes = new int[N_PRIMES]; static { - RandomSample.primes[0] = 3; - for (int count = 1; count < RandomSample.N_PRIMES; count++) { - primes[count] = RandomSample.findNextPrimeAfter(primes[count - 1]); + primes[0] = 3; + for (int count = 1; count < N_PRIMES; count++) { + primes[count] = findNextPrimeAfter(primes[count - 1]); } } @@ -307,7 +269,7 @@ public class RandomSample { */ private static void sample2(ScoredDocIDs collection, int collectionSize, int[] sample, long[] times) throws IOException { - if (RandomSample.returnTimings) { + if (returnTimings) { times[0] = System.currentTimeMillis(); } int sampleSize = sample.length; @@ -320,7 +282,7 @@ public class RandomSample { while (it.next()) { pq.insertWithReuse((int)(it.getDocID() * PHI_32) & 0x7FFFFFFF); } - if (RandomSample.returnTimings) { + if (returnTimings) { times[1] = System.currentTimeMillis(); } /* @@ -330,10 +292,10 @@ public class RandomSample { for (int si = 0; si < sampleSize; si++) { sample[si] = (int)(((IntPriorityQueue.MI)(heap[si+1])).value * PHI_32I) & 0x7FFFFFFF; } - if (RandomSample.returnTimings) { + if (returnTimings) { times[2] = System.currentTimeMillis(); } - } // end RandomSample.sample2() + } /** * A bounded priority queue for Integers, to retain a specified number of @@ -358,7 +320,7 @@ public class RandomSample { } this.mi.value = intval; this.mi = (MI)this.insertWithOverflow(this.mi); - } // end IntPriorityQueue.insertWithReuse() + } /** * Returns the underlying data structure for faster access. Extracting elements @@ -386,19 +348,19 @@ public class RandomSample { private static class MI { MI() { } public int value; - } // end class RandomSample.IntPriorityQueue.MI + } /** * The mutable integer instance for reuse after first overflow. */ private MI mi; - } // end class RandomSample.IntPriorityQueue + } /** * For specifying which sampling algorithm to use. */ - public static class Algorithm { + private enum Algorithm { /** * Specifies a methodical traversal algorithm, which is guaranteed to span the collection @@ -410,7 +372,7 @@ public class RandomSample { // TODO (Facet): This one produces a bimodal distribution (very flat around // each peak!) for collection size 10M and sample sizes 10k and 10544. // Figure out why. - public static final Algorithm TRAVERSAL = new Algorithm("Traversal"); + TRAVERSAL, /** * Specifies a Fibonacci-style hash algorithm (see Knuth, S&S), which generates a less @@ -418,68 +380,24 @@ public class RandomSample { * but requires a bounded priority queue the size of the sample, and creates an object * containing a sampled value and its hash, for every element in the full set. */ - public static final Algorithm HASHING = new Algorithm("Hashing"); - - /** - * Constructs an instance of an algorithm. - * @param name An ID for printing. - */ - private Algorithm(String name) { - this.name = name; - } - - /** - * Prints this algorithm's name. - */ - @Override - public String toString() { - return this.name; - } - - /** - * The name of this algorithm, for printing. - */ - private String name; - - } // end class RandomSample.Algorithm + HASHING + } /** * For specifying whether to sort the sample. */ - public static class Sorted { + private enum Sorted { /** - * Specifies sorting the resulting sample before returning. + * Sort resulting sample before returning. */ - public static final Sorted YES = new Sorted("sorted"); + YES, /** - * Specifies not sorting the resulting sample. + *Do not sort the resulting sample. */ - public static final Sorted NO = new Sorted("unsorted"); - - /** - * Constructs an instance of a "sorted" selector. - * @param name An ID for printing. - */ - private Sorted(String name) { - this.name = name; - } - - /** - * Prints this selector's name. - */ - @Override - public String toString() { - return this.name; - } - - /** - * The name of this selector, for printing. - */ - private String name; - - } // end class RandomSample.Sorted + NO + } /** * Magic number 1: prime closest to phi, in 32 bits. @@ -496,143 +414,4 @@ public class RandomSample { */ private static boolean returnTimings = false; - /** - * Self-test. - */ - public static void main(String[] args) throws Exception { - RandomSample.returnTimings = true; - /* - * Create an array of sequential integers, from which samples will be taken. - */ - final int COLLECTION_SIZE = 10 * 1000 * 1000; - ScoredDocIDs collection = createAllScoredDocs(COLLECTION_SIZE); - - /* - * Factor PHI. - * - int[] factors = RandomSample.factor(PHI_32); - System.out.print("Factors of PHI_32: "); - for (int k : factors) { - System.out.print(k+", "); - } - System.out.println(""); - - * Verify inverse relationship of PHI & phi. - * - boolean inverseValid = true; - for (int j = 0; j < Integer.MAX_VALUE; j++) { - int k = (int)(j * PHI_32) & 0x7FFFFFFF; - int m = (int)(k * PHI_32I) & 0X7FFFFFFF; - if (j != m) { - System.out.println("Inverse not valid for "+j); - inverseValid = false; - } - } - System.out.println("Inverse valid? "+inverseValid); - */ - /* - * Take samples of various sizes from the full set, verify no duplicates, - * check flatness. - */ - int[] sampleSizes = { - 10, 57, 100, 333, 1000, 2154, 10000 - }; - Algorithm[] algorithms = { Algorithm.HASHING, Algorithm.TRAVERSAL }; - for (int sampleSize : sampleSizes) { - for (Algorithm algorithm : algorithms) { - System.out.println("Sample size " + sampleSize - + ", algorithm " + algorithm + "..."); - /* - * Take the sample. - */ - int[] sample = RandomSample.repeatableSample( - collection, COLLECTION_SIZE, sampleSize, algorithm, Sorted.YES); - /* - * Check for duplicates. - */ - boolean noDups = true; - for (int j = 0; j < sampleSize - 1; j++) { - if (sample[j] == sample[j + 1]) { - System.out.println("Duplicate value " - + sample[j] + " at " + j + ", " - + (j + 1)); - noDups = false; - break; - } - } - if (noDups) { - System.out.println("No duplicates."); - } - if (algorithm == Algorithm.HASHING) { - System.out.print("Hashed sample, up to 100 of "+sampleSize+": "); - int lim = Math.min(100, sampleSize); - for (int k = 0; k < lim; k++) { - System.out.print(sample[k]+", "); - } - System.out.println(""); - } - /* - * Check flatness of distribution in sample. - */ - final int N_INTERVALS = 100; - int[] counts = RandomSample.countsBySubrange(sample, COLLECTION_SIZE, N_INTERVALS); - int minCount = Integer.MAX_VALUE; - int maxCount = Integer.MIN_VALUE; - int avgCount = 0; - for (int j = 0; j < N_INTERVALS; j++) { - int count = counts[j]; - if (count < minCount) { - minCount = count; - } - if (count > maxCount) { - maxCount = count; - } - avgCount += count; - } - avgCount /= N_INTERVALS; - System.out.println("Min, max, avg: "+minCount+", "+maxCount+", "+avgCount); - - if (((double)minCount - avgCount)/avgCount < -0.05 && (minCount - avgCount) < -5) { - System.out.println("Not flat enough."); - } else if (((double)maxCount - avgCount)/avgCount > 0.05 && (maxCount - avgCount) > 5) { - System.out.println("Not flat enough."); - } else { - System.out.println("Flat enough."); - } - if (sampleSize == 10544 && algorithm == Algorithm.TRAVERSAL) { - System.out.print("Counts of interest: "); - for (int j = 0; j < N_INTERVALS; j++) { - System.out.print(counts[j]+", "); - } - System.out.println(""); - } - } - } - System.out.println("Last prime is " - + RandomSample.primes[RandomSample.N_PRIMES - 1]); - } - - private static ScoredDocIDs createAllScoredDocs(final int COLLECTION_SIZE) - throws CorruptIndexException, LockObtainFailedException, IOException { - ScoredDocIDs collection; - - IndexReader reader = null; - Directory ramDir = new RAMDirectory(); - try { - IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_30, new KeywordAnalyzer())); - for (int i = 0; i < COLLECTION_SIZE; i++) { - writer.addDocument(new Document()); - } - writer.commit(); - writer.close(); - reader = IndexReader.open(ramDir); - collection = ScoredDocIdsUtils.createAllDocsScoredDocIDs(reader); - } finally { - if (reader != null) { - reader.close(); - } - ramDir.close(); - } - return collection; - } -} // end class RandomSample +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/sampling/Sampler.java b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/Sampler.java index debebeafd58..0f660eb2118 100644 --- a/modules/facet/src/java/org/apache/lucene/facet/search/sampling/Sampler.java +++ b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/Sampler.java @@ -1,8 +1,6 @@ package org.apache.lucene.facet.search.sampling; import java.io.IOException; -import java.util.logging.Level; -import java.util.logging.Logger; import org.apache.lucene.index.IndexReader; @@ -15,8 +13,6 @@ import org.apache.lucene.facet.search.results.FacetResult; import org.apache.lucene.facet.search.results.FacetResultNode; import org.apache.lucene.facet.search.results.MutableFacetResultNode; import org.apache.lucene.facet.taxonomy.TaxonomyReader; -import org.apache.lucene.facet.util.RandomSample; -import org.apache.lucene.facet.util.ScoredDocIdsUtils; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -48,11 +44,9 @@ import org.apache.lucene.facet.util.ScoredDocIdsUtils; * * @lucene.experimental */ -public class Sampler { +public abstract class Sampler { - private static final Logger logger = Logger.getLogger(Sampler.class.getName()); - - private final SamplingParams samplingParams; + protected final SamplingParams samplingParams; /** * Construct with {@link SamplingParams} @@ -103,25 +97,19 @@ public class Sampler { sampleSetSize = Math.max(sampleSetSize, samplingParams.getMinSampleSize()); sampleSetSize = Math.min(sampleSetSize, samplingParams.getMaxSampleSize()); - int[] sampleSet = null; - try { - sampleSet = RandomSample.repeatableSample(docids, actualSize, - sampleSetSize); - } catch (IOException e) { - if (logger.isLoggable(Level.WARNING)) { - logger.log(Level.WARNING, "sampling failed: "+e.getMessage()+" - falling back to no sampling!", e); - } - return new SampleResult(docids, 1d); - } - - ScoredDocIDs sampled = ScoredDocIdsUtils.createScoredDocIDsSubset(docids, - sampleSet); - if (logger.isLoggable(Level.FINEST)) { - logger.finest("******************** " + sampled.size()); - } - return new SampleResult(sampled, sampled.size()/(double)docids.size()); + return createSample(docids, actualSize, sampleSetSize); } + /** + * Create and return a sample of the input set + * @param docids input set out of which a sample is to be created + * @param actualSize original size of set, prior to sampling + * @param sampleSetSize required size of sample set + * @return sample of the input set in the required size + */ + protected abstract SampleResult createSample(ScoredDocIDs docids, int actualSize, + int sampleSetSize) throws IOException; + /** * Get a fixer of sample facet accumulation results. Default implementation * returns a TakmiSampleFixer which is adequate only for diff --git a/modules/facet/src/test/org/apache/lucene/facet/FacetTestBase.java b/modules/facet/src/test/org/apache/lucene/facet/FacetTestBase.java index 1e216837e5f..d57853988cf 100644 --- a/modules/facet/src/test/org/apache/lucene/facet/FacetTestBase.java +++ b/modules/facet/src/test/org/apache/lucene/facet/FacetTestBase.java @@ -313,7 +313,7 @@ public abstract class FacetTestBase extends LuceneTestCase { System.err.println("Results are not the same!"); System.err.println("Expected:\n" + expectedResults); System.err.println("Actual" + actualResults); - fail("Results are not the same!"); + throw new NotSameResultError(); } } @@ -325,4 +325,12 @@ public abstract class FacetTestBase extends LuceneTestCase { } return sb.toString().replaceAll("Residue:.*.0", "").replaceAll("Num valid Descendants.*", ""); } + + /** Special Error class for ability to ignore only this error and retry... */ + public static class NotSameResultError extends Error { + public NotSameResultError() { + super("Results are not the same!"); + } + } + } diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/sampling/BaseSampleTestTopK.java b/modules/facet/src/test/org/apache/lucene/facet/search/sampling/BaseSampleTestTopK.java index 4d6ec056737..4367ea992b7 100644 --- a/modules/facet/src/test/org/apache/lucene/facet/search/sampling/BaseSampleTestTopK.java +++ b/modules/facet/src/test/org/apache/lucene/facet/search/sampling/BaseSampleTestTopK.java @@ -2,6 +2,7 @@ package org.apache.lucene.facet.search.sampling; import java.io.IOException; import java.util.List; +import java.util.Random; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; @@ -41,7 +42,7 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK { protected static final int K = 2; /** since there is a chance that this test would fail even if the code is correct, retry the sampling */ - protected static final int RETRIES = 4; + protected static final int RETRIES = 10; protected abstract FacetsAccumulator getSamplingAccumulator(Sampler sampler, TaxonomyReader taxoReader, IndexReader indexReader, @@ -53,51 +54,54 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK { * is performed. The results are compared to non-sampled ones. */ public void testCountUsingSamping() throws Exception, IOException { + boolean useRandomSampler = random.nextBoolean(); for (int partitionSize : partitionSizes) { - initIndex(partitionSize); - - // Get all of the documents and run the query, then do different - // facet counts and compare to control - Query q = new TermQuery(new Term(CONTENT_FIELD, BETA)); // 90% of the docs - ScoredDocIdCollector docCollector = ScoredDocIdCollector.create(searcher.maxDoc(), false); - - FacetSearchParams expectedSearchParams = searchParamsWithRequests(K, partitionSize); - FacetsCollector fc = new FacetsCollector(expectedSearchParams, indexReader, taxoReader); - - searcher.search(q, MultiCollector.wrap(docCollector, fc)); - - List expectedResults = fc.getFacetResults(); - - // complement with sampling! - final Sampler sampler = createSampler(docCollector.getScoredDocIDs()); - - FacetSearchParams samplingSearchParams = searchParamsWithRequests(K, partitionSize); - - assertSampling(expectedResults, q, sampler, samplingSearchParams, false); - assertSampling(expectedResults, q, sampler, samplingSearchParams, true); - - closeAll(); + try { + initIndex(partitionSize); + // Get all of the documents and run the query, then do different + // facet counts and compare to control + Query q = new TermQuery(new Term(CONTENT_FIELD, BETA)); // 90% of the docs + ScoredDocIdCollector docCollector = ScoredDocIdCollector.create(searcher.maxDoc(), false); + + FacetSearchParams expectedSearchParams = searchParamsWithRequests(K, partitionSize); + FacetsCollector fc = new FacetsCollector(expectedSearchParams, indexReader, taxoReader); + + searcher.search(q, MultiCollector.wrap(docCollector, fc)); + + List expectedResults = fc.getFacetResults(); + + FacetSearchParams samplingSearchParams = searchParamsWithRequests(K, partitionSize); + + // try several times in case of failure, because the test has a chance to fail + // if the top K facets are not sufficiently common with the sample set + for (int nTrial=0; nTrial=RETRIES-1) { + throw e; // no more retries allowed, must fail + } + } + } + } finally { + closeAll(); + } } } private void assertSampling(List expected, Query q, Sampler sampler, FacetSearchParams params, boolean complement) throws Exception { - // try several times in case of failure, because the test has a chance to fail - // if the top K facets are not sufficiently common with the sample set - for (int n=RETRIES; n>0; n--) { - FacetsCollector samplingFC = samplingCollector(false, sampler, params); - - searcher.search(q, samplingFC); - List sampledResults = samplingFC.getFacetResults(); - - try { - assertSameResults(expected, sampledResults); - break; // succeeded - } catch (Exception e) { - if (n<=1) { // otherwise try again - throw e; - } - } - } + FacetsCollector samplingFC = samplingCollector(complement, sampler, params); + + searcher.search(q, samplingFC); + List sampledResults = samplingFC.getFacetResults(); + + assertSameResults(expected, sampledResults); } private FacetsCollector samplingCollector( @@ -117,14 +121,19 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK { return samplingFC; } - private Sampler createSampler(ScoredDocIDs scoredDocIDs) { + private Sampler createSampler(int nTrial, ScoredDocIDs scoredDocIDs, boolean useRandomSampler) { SamplingParams samplingParams = new SamplingParams(); - samplingParams.setSampleRatio(0.8); - samplingParams.setMinSampleSize(100); - samplingParams.setMaxSampleSize(10000); + + final double retryFactor = Math.pow(1.01, nTrial); + samplingParams.setSampleRatio(0.8 * retryFactor); + samplingParams.setMinSampleSize((int) (100 * retryFactor)); + samplingParams.setMaxSampleSize((int) (10000 * retryFactor)); + samplingParams.setOversampleFactor(5.0 * retryFactor); + samplingParams.setSampingThreshold(11000); //force sampling - samplingParams.setOversampleFactor(5.0); - Sampler sampler = new Sampler(samplingParams); + Sampler sampler = useRandomSampler ? + new RandomSampler(samplingParams, new Random(random.nextLong())) : + new RepeatableSampler(samplingParams); assertTrue("must enable sampling for this test!",sampler.shouldSample(scoredDocIDs)); return sampler; }