mirror of https://github.com/apache/lucene.git

LUCENE-3501: random sampler was not random (and so facet SamplingWrapperTest occasionally failed)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1181760 13f79535-47bb-0310-9956-ffa450edef68

parent b438b265aa
commit bd067ee329
AdaptiveFacetsAccumulator.java

@@ -2,12 +2,15 @@ package org.apache.lucene.facet.search;

 import java.io.IOException;
 import java.util.List;
 import java.util.Random;

 import org.apache.lucene.index.IndexReader;

 import org.apache.lucene.facet.search.params.FacetSearchParams;
 import org.apache.lucene.facet.search.results.FacetResult;
 import org.apache.lucene.facet.search.results.FacetResultNode;
+import org.apache.lucene.facet.search.sampling.RandomSampler;
+import org.apache.lucene.facet.search.sampling.RepeatableSampler;
 import org.apache.lucene.facet.search.sampling.Sampler;
 import org.apache.lucene.facet.search.sampling.SamplingAccumulator;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;

@@ -44,7 +47,7 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
  */
 public final class AdaptiveFacetsAccumulator extends StandardFacetsAccumulator {

-  private Sampler sampler = new Sampler();
+  private Sampler sampler = new RandomSampler();

   /**
    * Create an {@link AdaptiveFacetsAccumulator}
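The one-line change above is the heart of the fix for the accumulator: AdaptiveFacetsAccumulator used to instantiate the base Sampler, whose sampling (delegated to RandomSample.repeatableSample, see below) was deterministic across runs; it now defaults to the new RandomSampler, while the old deterministic behavior survives as RepeatableSampler. A minimal sketch of choosing between the two explicitly, using only constructors visible in this commit (SamplingParams defaults assumed):

// --- illustration only, not part of the commit ---
import java.util.Random;
import org.apache.lucene.facet.search.sampling.RandomSampler;
import org.apache.lucene.facet.search.sampling.RepeatableSampler;
import org.apache.lucene.facet.search.sampling.Sampler;
import org.apache.lucene.facet.search.sampling.SamplingParams;

public class SamplerChoiceDemo {
  public static void main(String[] args) {
    SamplingParams params = new SamplingParams();
    Sampler nondeterministic = new RandomSampler(params, new Random()); // differs run to run
    Sampler deterministic = new RepeatableSampler(params);              // same sample every run
    System.out.println(nondeterministic + " vs " + deterministic);
  }
}
// --- end illustration ---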
RandomSampler.java (new file)

@@ -0,0 +1,71 @@
package org.apache.lucene.facet.search.sampling;

import java.io.IOException;
import java.util.Random;

import org.apache.lucene.facet.search.ScoredDocIDs;
import org.apache.lucene.facet.search.ScoredDocIDsIterator;
import org.apache.lucene.facet.util.ScoredDocIdsUtils;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Simple random sampler
 */
public class RandomSampler extends Sampler {

  private final Random random;

  public RandomSampler() {
    super();
    this.random = new Random();
  }

  public RandomSampler(SamplingParams params, Random random) throws IllegalArgumentException {
    super(params);
    this.random = random;
  }

  @Override
  protected SampleResult createSample(ScoredDocIDs docids, int actualSize, int sampleSetSize) throws IOException {
    final int[] sample = new int[sampleSetSize];
    final int maxStep = (actualSize * 2 ) / sampleSetSize; //floor
    int remaining = actualSize;
    ScoredDocIDsIterator it = docids.iterator();
    int i = 0;
    // select sample docs with random skipStep, make sure to leave sufficient #docs for selection after last skip
    while (i<sample.length && remaining>(sampleSetSize-maxStep-i)) {
      int skipStep = 1 + random.nextInt(maxStep);
      // Skip over 'skipStep' documents
      for (int j=0; j<skipStep; j++) {
        it.next();
        -- remaining;
      }
      sample[i++] = it.getDocID();
    }
    // Add leftover documents to the sample set
    while (i<sample.length) {
      it.next();
      sample[i++] = it.getDocID();
    }
    ScoredDocIDs sampleRes = ScoredDocIdsUtils.createScoredDocIDsSubset(docids, sample);
    SampleResult res = new SampleResult(sampleRes, sampleSetSize/(double)actualSize);
    return res;
  }

}
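How createSample walks the collection: each pick skips a uniformly random 1..maxStep documents, with maxStep = floor(2 * actualSize / sampleSetSize), so the expected skip is roughly actualSize / sampleSetSize and the walk tends to span the whole collection; a second loop then fills any leftover slots with consecutive documents. The sketch below replays the same two-phase selection over a plain int[] so it can be run in isolation. Note one deliberate deviation: it uses a slightly more conservative phase-1 guard (stop random skipping once even a maximal skip could leave fewer documents than empty slots), because that invariant is easy to verify in a standalone demo; everything else mirrors the committed loop.

// --- illustration only, not part of the commit ---
import java.util.Random;

public class SkipStepDemo {

  static int[] sample(int[] docs, int sampleSetSize, Random random) {
    final int actualSize = docs.length;
    final int[] sample = new int[sampleSetSize];
    final int maxStep = (actualSize * 2) / sampleSetSize; // floor, as in createSample()
    int remaining = actualSize;
    int pos = -1; // index of the last consumed doc
    int i = 0;
    // phase 1: random skips, while even a maximal skip leaves one doc per empty slot
    while (i < sample.length && remaining - maxStep >= sample.length - i) {
      int skipStep = 1 + random.nextInt(maxStep);
      pos += skipStep;      // "skip over skipStep documents"
      remaining -= skipStep;
      sample[i++] = docs[pos];
    }
    // phase 2: fill leftover slots with consecutive documents
    while (i < sample.length) {
      sample[i++] = docs[++pos];
    }
    return sample;
  }

  public static void main(String[] args) {
    int[] docs = new int[1000];
    for (int d = 0; d < docs.length; d++) docs[d] = d;
    int[] s = sample(docs, 100, new Random(42));
    System.out.println("picked " + s.length + " docs; first: " + s[0] + ", last: " + s[99]);
  }
}
// --- end illustration ---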
RandomSample.java → RepeatableSampler.java (moved from org.apache.lucene.facet.util to org.apache.lucene.facet.search.sampling)

@@ -1,25 +1,15 @@
-package org.apache.lucene.facet.util;
+package org.apache.lucene.facet.search.sampling;

 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.logging.Level;
 import java.util.logging.Logger;

-import org.apache.lucene.analysis.core.KeywordAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.index.CorruptIndexException;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.LockObtainFailedException;
-import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.PriorityQueue;
-import org.apache.lucene.util.Version;

 import org.apache.lucene.facet.search.ScoredDocIDs;
 import org.apache.lucene.facet.search.ScoredDocIDsIterator;
+import org.apache.lucene.facet.util.ScoredDocIdsUtils;

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -40,13 +30,38 @@ import org.apache.lucene.facet.search.ScoredDocIDsIterator;

 /**
  * Take random samples of large collections.
  *
  * @lucene.experimental
  */
-public class RandomSample {
+public class RepeatableSampler extends Sampler {

-  private static final Logger logger = Logger.getLogger(RandomSample.class.getName());
+  private static final Logger logger = Logger.getLogger(RepeatableSampler.class.getName());

+  public RepeatableSampler(SamplingParams params) {
+    super(params);
+  }
+
+  @Override
+  protected SampleResult createSample(ScoredDocIDs docids, int actualSize,
+      int sampleSetSize) throws IOException {
+    int[] sampleSet = null;
+    try {
+      sampleSet = repeatableSample(docids, actualSize,
+          sampleSetSize);
+    } catch (IOException e) {
+      if (logger.isLoggable(Level.WARNING)) {
+        logger.log(Level.WARNING, "sampling failed: "+e.getMessage()+" - falling back to no sampling!", e);
+      }
+      return new SampleResult(docids, 1d);
+    }
+
+    ScoredDocIDs sampled = ScoredDocIdsUtils.createScoredDocIDsSubset(docids,
+        sampleSet);
+    if (logger.isLoggable(Level.FINEST)) {
+      logger.finest("******************** " + sampled.size());
+    }
+    return new SampleResult(sampled, sampled.size()/(double)docids.size());
+  }

   /**
    * Returns <code>sampleSize</code> values from the first <code>collectionSize</code>
    * locations of <code>collection</code>, chosen using
@@ -57,10 +72,10 @@ public class RandomSample {
    * @return An array of values chosen from the collection.
    * @see Algorithm#TRAVERSAL
    */
-  public static int[] repeatableSample(ScoredDocIDs collection,
+  private static int[] repeatableSample(ScoredDocIDs collection,
       int collectionSize, int sampleSize)
       throws IOException {
-    return RandomSample.repeatableSample(collection, collectionSize,
+    return repeatableSample(collection, collectionSize,
         sampleSize, Algorithm.HASHING, Sorted.NO);
   }

@@ -75,7 +90,7 @@ public class RandomSample {
    * Sorted.NO to return them in essentially random order.
    * @return An array of values chosen from the collection.
    */
-  public static int[] repeatableSample(ScoredDocIDs collection,
+  private static int[] repeatableSample(ScoredDocIDs collection,
       int collectionSize, int sampleSize,
       Algorithm algorithm, Sorted sorted)
       throws IOException {
@@ -91,16 +106,16 @@ public class RandomSample {
     int[] sample = new int[sampleSize];
     long[] times = new long[4];
     if (algorithm == Algorithm.TRAVERSAL) {
-      RandomSample.sample1(collection, collectionSize, sample, times);
+      sample1(collection, collectionSize, sample, times);
     } else if (algorithm == Algorithm.HASHING) {
-      RandomSample.sample2(collection, collectionSize, sample, times);
+      sample2(collection, collectionSize, sample, times);
     } else {
       throw new IllegalArgumentException("Invalid algorithm selection");
     }
     if (sorted == Sorted.YES) {
       Arrays.sort(sample);
     }
-    if (RandomSample.returnTimings) {
+    if (returnTimings) {
       times[3] = System.currentTimeMillis();
       if (logger.isLoggable(Level.FINEST)) {
         logger.finest("Times: " + (times[1] - times[0]) + "ms, "
@@ -133,13 +148,13 @@ public class RandomSample {
   private static void sample1(ScoredDocIDs collection, int collectionSize, int[] sample, long[] times)
       throws IOException {
     ScoredDocIDsIterator it = collection.iterator();
-    if (RandomSample.returnTimings) {
+    if (returnTimings) {
       times[0] = System.currentTimeMillis();
     }
     int sampleSize = sample.length;
-    int prime = RandomSample.findGoodStepSize(collectionSize, sampleSize);
+    int prime = findGoodStepSize(collectionSize, sampleSize);
     int mod = prime % collectionSize;
-    if (RandomSample.returnTimings) {
+    if (returnTimings) {
       times[1] = System.currentTimeMillis();
     }
     int sampleCount = 0;
@@ -158,10 +173,10 @@ public class RandomSample {
       }
       sample[sampleCount++] = it.getDocID();
     }
-    if (RandomSample.returnTimings) {
+    if (returnTimings) {
       times[2] = System.currentTimeMillis();
     }
-  } // end RandomSample.sample1()
+  }

   /**
    * Returns a value which will allow the caller to walk
@@ -187,10 +202,10 @@ public class RandomSample {
       i = collectionSize / sampleSize;
     }
     do {
-      i = RandomSample.findNextPrimeAfter(i);
+      i = findNextPrimeAfter(i);
     } while (collectionSize % i == 0);
     return i;
-  } // end RandomSample.findGoodStepSize()
+  }

   /**
    * Returns the first prime number that is larger than <code>n</code>.
@@ -199,10 +214,10 @@ public class RandomSample {
    */
   private static int findNextPrimeAfter(int n) {
     n += (n % 2 == 0) ? 1 : 2; // next odd
-    foundFactor: for (;; n += 2) {
+    foundFactor: for (;; n += 2) { //TODO labels??!!
       int sri = (int) (Math.sqrt(n));
-      for (int primeIndex = 0; primeIndex < RandomSample.N_PRIMES; primeIndex++) {
-        int p = RandomSample.primes[primeIndex];
+      for (int primeIndex = 0; primeIndex < N_PRIMES; primeIndex++) {
+        int p = primes[primeIndex];
         if (p > sri) {
           return n;
         }
@@ -210,7 +225,7 @@ public class RandomSample {
           continue foundFactor;
         }
       }
-      for (int p = RandomSample.primes[RandomSample.N_PRIMES - 1] + 2;; p += 2) {
+      for (int p = primes[N_PRIMES - 1] + 2;; p += 2) {
         if (p > sri) {
           return n;
         }
@@ -219,70 +234,17 @@ public class RandomSample {
       }
     }
   }
-  } // end RandomSample.findNextPrimeAfter()
-
-  /**
-   * Divides the values in <code>collection</code> into <code>numSubranges</code>
-   * subranges from <code>minValue</code> to <code>maxValue</code> and returns the
-   * number of values in each subrange. (For testing the flatness of distribution of
-   * a sample.)
-   * @param collection The collection of values to be counted.
-   * @param range The number of possible values.
-   * @param numSubranges How many intervals to divide the value range into.
-   */
-  private static int[] countsBySubrange(int[] collection, int range, int numSubranges) {
-    int[] counts = new int[numSubranges];
-    Arrays.fill(counts, 0);
-    int numInSubrange = range / numSubranges;
-    for (int j = 0; j < collection.length; j++) {
-      counts[collection[j] / numInSubrange]++;
-    }
-    return counts;
-  } // end RandomSample.countsBySubrange()
-
-  /**
-   * Factors <code>value</code> into primes.
-   */
-  public static int[] factor(long value) {
-    ArrayList<Integer> list = new ArrayList<Integer>();
-    while (value > 1 && value % 2 == 0) {
-      list.add(2);
-      value /= 2;
-    }
-    long sqrt = Math.round(Math.sqrt(value));
-    for (int pIndex = 0, lim = RandomSample.primes.length; pIndex < lim; pIndex++) {
-      int p = RandomSample.primes[pIndex];
-      if (p >= sqrt) {
-        break;
-      }
-      while (value % p == 0) {
-        list.add(p);
-        value /= p;
-        sqrt = Math.round(Math.sqrt(value));
-      }
-    }
-    if (list.size() == 0 || value > Integer.MAX_VALUE) {
-      throw new RuntimeException("Prime or too large to factor: "+value);
-    }
-    if ((int)value > 1) {
-      list.add((int)value);
-    }
-    int[] factors = new int[list.size()];
-    for (int j = 0; j < factors.length; j++) {
-      factors[j] = list.get(j).intValue();
-    }
-    return factors;
-  } // end RandomSample.factor()
+  }

   /**
    * The first N_PRIMES primes, after 2.
    */
   private static final int N_PRIMES = 4000;
-  private static int[] primes = new int[RandomSample.N_PRIMES];
+  private static int[] primes = new int[N_PRIMES];
   static {
-    RandomSample.primes[0] = 3;
-    for (int count = 1; count < RandomSample.N_PRIMES; count++) {
-      primes[count] = RandomSample.findNextPrimeAfter(primes[count - 1]);
+    primes[0] = 3;
+    for (int count = 1; count < N_PRIMES; count++) {
+      primes[count] = findNextPrimeAfter(primes[count - 1]);
     }
   }
@@ -307,7 +269,7 @@ public class RandomSample {
    */
   private static void sample2(ScoredDocIDs collection, int collectionSize, int[] sample, long[] times)
       throws IOException {
-    if (RandomSample.returnTimings) {
+    if (returnTimings) {
       times[0] = System.currentTimeMillis();
     }
     int sampleSize = sample.length;
@@ -320,7 +282,7 @@ public class RandomSample {
     while (it.next()) {
       pq.insertWithReuse((int)(it.getDocID() * PHI_32) & 0x7FFFFFFF);
     }
-    if (RandomSample.returnTimings) {
+    if (returnTimings) {
       times[1] = System.currentTimeMillis();
     }
     /*
@@ -330,10 +292,10 @@ public class RandomSample {
     for (int si = 0; si < sampleSize; si++) {
       sample[si] = (int)(((IntPriorityQueue.MI)(heap[si+1])).value * PHI_32I) & 0x7FFFFFFF;
     }
-    if (RandomSample.returnTimings) {
+    if (returnTimings) {
       times[2] = System.currentTimeMillis();
     }
-  } // end RandomSample.sample2()
+  }

   /**
    * A bounded priority queue for Integers, to retain a specified number of
@@ -358,7 +320,7 @@ public class RandomSample {
       }
       this.mi.value = intval;
       this.mi = (MI)this.insertWithOverflow(this.mi);
-    } // end IntPriorityQueue.insertWithReuse()
+    }

     /**
      * Returns the underlying data structure for faster access. Extracting elements
@@ -386,19 +348,19 @@ public class RandomSample {
     private static class MI {
       MI() { }
       public int value;
-    } // end class RandomSample.IntPriorityQueue.MI
+    }

     /**
      * The mutable integer instance for reuse after first overflow.
      */
     private MI mi;

-  } // end class RandomSample.IntPriorityQueue
+  }

   /**
    * For specifying which sampling algorithm to use.
    */
-  public static class Algorithm {
+  private enum Algorithm {

     /**
      * Specifies a methodical traversal algorithm, which is guaranteed to span the collection
@@ -410,7 +372,7 @@ public class RandomSample {
     // TODO (Facet): This one produces a bimodal distribution (very flat around
     // each peak!) for collection size 10M and sample sizes 10k and 10544.
     // Figure out why.
-    public static final Algorithm TRAVERSAL = new Algorithm("Traversal");
+    TRAVERSAL,

     /**
      * Specifies a Fibonacci-style hash algorithm (see Knuth, S&S), which generates a less
@@ -418,68 +380,24 @@ public class RandomSample {
      * but requires a bounded priority queue the size of the sample, and creates an object
      * containing a sampled value and its hash, for every element in the full set.
      */
-    public static final Algorithm HASHING = new Algorithm("Hashing");
-
-    /**
-     * Constructs an instance of an algorithm.
-     * @param name An ID for printing.
-     */
-    private Algorithm(String name) {
-      this.name = name;
-    }
-
-    /**
-     * Prints this algorithm's name.
-     */
-    @Override
-    public String toString() {
-      return this.name;
-    }
-
-    /**
-     * The name of this algorithm, for printing.
-     */
-    private String name;
-
-  } // end class RandomSample.Algorithm
+    HASHING
+  }

   /**
    * For specifying whether to sort the sample.
    */
-  public static class Sorted {
+  private enum Sorted {

     /**
-     * Specifies sorting the resulting sample before returning.
+     * Sort resulting sample before returning.
      */
-    public static final Sorted YES = new Sorted("sorted");
+    YES,

     /**
-     * Specifies not sorting the resulting sample.
+     * Do not sort the resulting sample.
      */
-    public static final Sorted NO = new Sorted("unsorted");
-
-    /**
-     * Constructs an instance of a "sorted" selector.
-     * @param name An ID for printing.
-     */
-    private Sorted(String name) {
-      this.name = name;
-    }
-
-    /**
-     * Prints this selector's name.
-     */
-    @Override
-    public String toString() {
-      return this.name;
-    }
-
-    /**
-     * The name of this selector, for printing.
-     */
-    private String name;
-
-  } // end class RandomSample.Sorted
+    NO
+  }

   /**
    * Magic number 1: prime closest to phi, in 32 bits.
@@ -496,143 +414,4 @@ public class RandomSample {
    */
   private static boolean returnTimings = false;

-  /**
-   * Self-test.
-   */
-  public static void main(String[] args) throws Exception {
-    RandomSample.returnTimings = true;
-    /*
-     * Create an array of sequential integers, from which samples will be taken.
-     */
-    final int COLLECTION_SIZE = 10 * 1000 * 1000;
-    ScoredDocIDs collection = createAllScoredDocs(COLLECTION_SIZE);
-
-    /*
-     * Factor PHI.
-     *
-    int[] factors = RandomSample.factor(PHI_32);
-    System.out.print("Factors of PHI_32: ");
-    for (int k : factors) {
-      System.out.print(k+", ");
-    }
-    System.out.println("");
-
-     * Verify inverse relationship of PHI & phi.
-     *
-    boolean inverseValid = true;
-    for (int j = 0; j < Integer.MAX_VALUE; j++) {
-      int k = (int)(j * PHI_32) & 0x7FFFFFFF;
-      int m = (int)(k * PHI_32I) & 0X7FFFFFFF;
-      if (j != m) {
-        System.out.println("Inverse not valid for "+j);
-        inverseValid = false;
-      }
-    }
-    System.out.println("Inverse valid? "+inverseValid);
-    */
-    /*
-     * Take samples of various sizes from the full set, verify no duplicates,
-     * check flatness.
-     */
-    int[] sampleSizes = {
-      10, 57, 100, 333, 1000, 2154, 10000
-    };
-    Algorithm[] algorithms = { Algorithm.HASHING, Algorithm.TRAVERSAL };
-    for (int sampleSize : sampleSizes) {
-      for (Algorithm algorithm : algorithms) {
-        System.out.println("Sample size " + sampleSize
-            + ", algorithm " + algorithm + "...");
-        /*
-         * Take the sample.
-         */
-        int[] sample = RandomSample.repeatableSample(
-            collection, COLLECTION_SIZE, sampleSize, algorithm, Sorted.YES);
-        /*
-         * Check for duplicates.
-         */
-        boolean noDups = true;
-        for (int j = 0; j < sampleSize - 1; j++) {
-          if (sample[j] == sample[j + 1]) {
-            System.out.println("Duplicate value "
-                + sample[j] + " at " + j + ", "
-                + (j + 1));
-            noDups = false;
-            break;
-          }
-        }
-        if (noDups) {
-          System.out.println("No duplicates.");
-        }
-        if (algorithm == Algorithm.HASHING) {
-          System.out.print("Hashed sample, up to 100 of "+sampleSize+": ");
-          int lim = Math.min(100, sampleSize);
-          for (int k = 0; k < lim; k++) {
-            System.out.print(sample[k]+", ");
-          }
-          System.out.println("");
-        }
-        /*
-         * Check flatness of distribution in sample.
-         */
-        final int N_INTERVALS = 100;
-        int[] counts = RandomSample.countsBySubrange(sample, COLLECTION_SIZE, N_INTERVALS);
-        int minCount = Integer.MAX_VALUE;
-        int maxCount = Integer.MIN_VALUE;
-        int avgCount = 0;
-        for (int j = 0; j < N_INTERVALS; j++) {
-          int count = counts[j];
-          if (count < minCount) {
-            minCount = count;
-          }
-          if (count > maxCount) {
-            maxCount = count;
-          }
-          avgCount += count;
-        }
-        avgCount /= N_INTERVALS;
-        System.out.println("Min, max, avg: "+minCount+", "+maxCount+", "+avgCount);
-
-        if (((double)minCount - avgCount)/avgCount < -0.05 && (minCount - avgCount) < -5) {
-          System.out.println("Not flat enough.");
-        } else if (((double)maxCount - avgCount)/avgCount > 0.05 && (maxCount - avgCount) > 5) {
-          System.out.println("Not flat enough.");
-        } else {
-          System.out.println("Flat enough.");
-        }
-        if (sampleSize == 10544 && algorithm == Algorithm.TRAVERSAL) {
-          System.out.print("Counts of interest: ");
-          for (int j = 0; j < N_INTERVALS; j++) {
-            System.out.print(counts[j]+", ");
-          }
-          System.out.println("");
-        }
-      }
-    }
-    System.out.println("Last prime is "
-        + RandomSample.primes[RandomSample.N_PRIMES - 1]);
-  }
-
-  private static ScoredDocIDs createAllScoredDocs(final int COLLECTION_SIZE)
-      throws CorruptIndexException, LockObtainFailedException, IOException {
-    ScoredDocIDs collection;
-
-    IndexReader reader = null;
-    Directory ramDir = new RAMDirectory();
-    try {
-      IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_30, new KeywordAnalyzer()));
-      for (int i = 0; i < COLLECTION_SIZE; i++) {
-        writer.addDocument(new Document());
-      }
-      writer.commit();
-      writer.close();
-      reader = IndexReader.open(ramDir);
-      collection = ScoredDocIdsUtils.createAllDocsScoredDocIDs(reader);
-    } finally {
-      if (reader != null) {
-        reader.close();
-      }
-      ramDir.close();
-    }
-    return collection;
-  }
-} // end class RandomSample
+}
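The HASHING path retained above (sample2) deserves a word: each docID is multiplied by PHI_32 (per its javadoc, the "prime closest to phi, in 32 bits"; the constant itself falls outside these hunks), the sign bit is masked off, and a bounded priority queue retains sampleSize of the hashed values; multiplying the survivors by the inverse constant PHI_32I recovers the original docIDs. Because the hash is a fixed, invertible shuffle of the 32-bit integers, the same collection always yields the same sample — repeatable, yet well spread. Below is a self-contained sketch of the same keep-k-extreme-hashes idea; the multiplier here is Knuth's golden-ratio constant 0x9E3779B9 (an assumption — the committed prime is merely near that value), and the original value is carried alongside its hash instead of being recovered via the modular inverse.

// --- illustration only, not part of the commit ---
import java.util.PriorityQueue;

public class HashSampleDemo {

  private static final int PHI = 0x9E3779B9; // ~2^32/phi; stand-in for PHI_32

  static int[] sample(int[] docIDs, int k) {
    // max-heap ordered by hash: the largest kept hash is evicted first,
    // so the queue ends up holding the k smallest hashes
    PriorityQueue<long[]> heap =
        new PriorityQueue<>(k, (a, b) -> Long.compare(b[0], a[0]));
    for (int doc : docIDs) {
      long h = (doc * PHI) & 0x7FFFFFFF; // mask the sign bit, as in sample2()
      if (heap.size() < k) {
        heap.add(new long[] { h, doc });
      } else if (h < heap.peek()[0]) {
        heap.poll();
        heap.add(new long[] { h, doc });
      }
    }
    int[] out = new int[heap.size()];
    int i = 0;
    for (long[] e : heap) {
      out[i++] = (int) e[1];
    }
    return out;
  }

  public static void main(String[] args) {
    int[] docs = new int[10000];
    for (int d = 0; d < docs.length; d++) docs[d] = d;
    // same input, same sample: repeatable by construction
    System.out.println(java.util.Arrays.equals(sample(docs, 10), sample(docs, 10)));
  }
}
// --- end illustration ---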
Sampler.java

@@ -1,8 +1,6 @@
 package org.apache.lucene.facet.search.sampling;

 import java.io.IOException;
 import java.util.logging.Level;
 import java.util.logging.Logger;

 import org.apache.lucene.index.IndexReader;

@@ -15,8 +13,6 @@ import org.apache.lucene.facet.search.results.FacetResult;
 import org.apache.lucene.facet.search.results.FacetResultNode;
 import org.apache.lucene.facet.search.results.MutableFacetResultNode;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
-import org.apache.lucene.facet.util.RandomSample;
-import org.apache.lucene.facet.util.ScoredDocIdsUtils;

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more

@@ -48,11 +44,9 @@ import org.apache.lucene.facet.util.ScoredDocIdsUtils;
  *
  * @lucene.experimental
  */
-public class Sampler {
+public abstract class Sampler {

   private static final Logger logger = Logger.getLogger(Sampler.class.getName());

-  private final SamplingParams samplingParams;
+  protected final SamplingParams samplingParams;

   /**
    * Construct with {@link SamplingParams}

@@ -103,25 +97,19 @@ public class Sampler {
     sampleSetSize = Math.max(sampleSetSize, samplingParams.getMinSampleSize());
     sampleSetSize = Math.min(sampleSetSize, samplingParams.getMaxSampleSize());

-    int[] sampleSet = null;
-    try {
-      sampleSet = RandomSample.repeatableSample(docids, actualSize,
-          sampleSetSize);
-    } catch (IOException e) {
-      if (logger.isLoggable(Level.WARNING)) {
-        logger.log(Level.WARNING, "sampling failed: "+e.getMessage()+" - falling back to no sampling!", e);
-      }
-      return new SampleResult(docids, 1d);
-    }
-
-    ScoredDocIDs sampled = ScoredDocIdsUtils.createScoredDocIDsSubset(docids,
-        sampleSet);
-    if (logger.isLoggable(Level.FINEST)) {
-      logger.finest("******************** " + sampled.size());
-    }
-    return new SampleResult(sampled, sampled.size()/(double)docids.size());
+    return createSample(docids, actualSize, sampleSetSize);
   }

+  /**
+   * Create and return a sample of the input set
+   * @param docids input set out of which a sample is to be created
+   * @param actualSize original size of set, prior to sampling
+   * @param sampleSetSize required size of sample set
+   * @return sample of the input set in the required size
+   */
+  protected abstract SampleResult createSample(ScoredDocIDs docids, int actualSize,
+      int sampleSetSize) throws IOException;

   /**
    * Get a fixer of sample facet accumulation results. Default implementation
    * returns a <code>TakmiSampleFixer</code> which is adequate only for
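With Sampler now abstract, the sampling strategy becomes a template method: sample() clamps the requested size against SamplingParams and delegates to createSample(), so a new strategy implements only that one hook. A hypothetical subclass (not in the commit) that deterministically keeps every n-th document, using only types visible in this diff:

// --- illustration only, not part of the commit ---
package org.apache.lucene.facet.search.sampling;

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.facet.search.ScoredDocIDs;
import org.apache.lucene.facet.search.ScoredDocIDsIterator;
import org.apache.lucene.facet.util.ScoredDocIdsUtils;

public class EveryNthSampler extends Sampler {

  public EveryNthSampler(SamplingParams params) {
    super(params);
  }

  @Override
  protected SampleResult createSample(ScoredDocIDs docids, int actualSize, int sampleSetSize)
      throws IOException {
    final int step = Math.max(1, actualSize / sampleSetSize);
    final int[] sample = new int[sampleSetSize];
    ScoredDocIDsIterator it = docids.iterator();
    int picked = 0;
    // keep every step-th document until the sample is full or docs run out
    for (int seen = 0; picked < sample.length && it.next(); seen++) {
      if (seen % step == 0) {
        sample[picked++] = it.getDocID();
      }
    }
    // if the collection ran out early, trim the sample to what was picked
    int[] kept = (picked == sample.length) ? sample : Arrays.copyOf(sample, picked);
    ScoredDocIDs sampled = ScoredDocIdsUtils.createScoredDocIDsSubset(docids, kept);
    return new SampleResult(sampled, sampled.size() / (double) docids.size());
  }
}
// --- end illustration ---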
FacetTestBase.java

@@ -313,7 +313,7 @@ public abstract class FacetTestBase extends LuceneTestCase {
       System.err.println("Results are not the same!");
       System.err.println("Expected:\n" + expectedResults);
       System.err.println("Actual" + actualResults);
-      fail("Results are not the same!");
+      throw new NotSameResultError();
     }
   }

@@ -325,4 +325,12 @@ public abstract class FacetTestBase extends LuceneTestCase {
     }
     return sb.toString().replaceAll("Residue:.*.0", "").replaceAll("Num valid Descendants.*", "");
   }

+  /** Special Error class for ability to ignore only this error and retry... */
+  public static class NotSameResultError extends Error {
+    public NotSameResultError() {
+      super("Results are not the same!");
+    }
+  }

 }
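Why a dedicated Error type: the old code called fail(), which throws a generic AssertionError that a retry loop cannot tell apart from any other assertion failure. Throwing NotSameResultError instead lets BaseSampleTestTopK (next file) catch exactly this mismatch, retry with slightly looser sampling parameters, and still fail loudly on any other error.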
BaseSampleTestTopK.java

@@ -2,6 +2,7 @@ package org.apache.lucene.facet.search.sampling;

 import java.io.IOException;
 import java.util.List;
+import java.util.Random;

 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;

@@ -41,7 +42,7 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
   protected static final int K = 2;

   /** since there is a chance that this test would fail even if the code is correct, retry the sampling */
-  protected static final int RETRIES = 4;
+  protected static final int RETRIES = 10;

   protected abstract FacetsAccumulator getSamplingAccumulator(Sampler sampler,
       TaxonomyReader taxoReader, IndexReader indexReader,

@@ -53,51 +54,54 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
    * is performed. The results are compared to non-sampled ones.
    */
   public void testCountUsingSamping() throws Exception, IOException {
+    boolean useRandomSampler = random.nextBoolean();
     for (int partitionSize : partitionSizes) {
-      initIndex(partitionSize);
-
-      // Get all of the documents and run the query, then do different
-      // facet counts and compare to control
-      Query q = new TermQuery(new Term(CONTENT_FIELD, BETA)); // 90% of the docs
-      ScoredDocIdCollector docCollector = ScoredDocIdCollector.create(searcher.maxDoc(), false);
-
-      FacetSearchParams expectedSearchParams = searchParamsWithRequests(K, partitionSize);
-      FacetsCollector fc = new FacetsCollector(expectedSearchParams, indexReader, taxoReader);
-
-      searcher.search(q, MultiCollector.wrap(docCollector, fc));
-
-      List<FacetResult> expectedResults = fc.getFacetResults();
-
-      // complement with sampling!
-      final Sampler sampler = createSampler(docCollector.getScoredDocIDs());
-
-      FacetSearchParams samplingSearchParams = searchParamsWithRequests(K, partitionSize);
-
-      assertSampling(expectedResults, q, sampler, samplingSearchParams, false);
-      assertSampling(expectedResults, q, sampler, samplingSearchParams, true);
-
-      closeAll();
+      try {
+        initIndex(partitionSize);
+        // Get all of the documents and run the query, then do different
+        // facet counts and compare to control
+        Query q = new TermQuery(new Term(CONTENT_FIELD, BETA)); // 90% of the docs
+        ScoredDocIdCollector docCollector = ScoredDocIdCollector.create(searcher.maxDoc(), false);
+
+        FacetSearchParams expectedSearchParams = searchParamsWithRequests(K, partitionSize);
+        FacetsCollector fc = new FacetsCollector(expectedSearchParams, indexReader, taxoReader);
+
+        searcher.search(q, MultiCollector.wrap(docCollector, fc));
+
+        List<FacetResult> expectedResults = fc.getFacetResults();
+
+        FacetSearchParams samplingSearchParams = searchParamsWithRequests(K, partitionSize);
+
+        // try several times in case of failure, because the test has a chance to fail
+        // if the top K facets are not sufficiently common with the sample set
+        for (int nTrial=0; nTrial<RETRIES; nTrial++) {
+          try {
+            // complement with sampling!
+            final Sampler sampler = createSampler(nTrial, docCollector.getScoredDocIDs(), useRandomSampler);
+
+            assertSampling(expectedResults, q, sampler, samplingSearchParams, false);
+            assertSampling(expectedResults, q, sampler, samplingSearchParams, true);
+
+            break; // succeeded
+          } catch (NotSameResultError e) {
+            if (nTrial>=RETRIES-1) {
+              throw e; // no more retries allowed, must fail
+            }
+          }
+        }
+      } finally {
+        closeAll();
+      }
     }
   }

   private void assertSampling(List<FacetResult> expected, Query q, Sampler sampler, FacetSearchParams params, boolean complement) throws Exception {
-    // try several times in case of failure, because the test has a chance to fail
-    // if the top K facets are not sufficiently common with the sample set
-    for (int n=RETRIES; n>0; n--) {
-      FacetsCollector samplingFC = samplingCollector(false, sampler, params);
-
-      searcher.search(q, samplingFC);
-      List<FacetResult> sampledResults = samplingFC.getFacetResults();
-
-      try {
-        assertSameResults(expected, sampledResults);
-        break; // succeeded
-      } catch (Exception e) {
-        if (n<=1) { // otherwise try again
-          throw e;
-        }
-      }
-    }
+    FacetsCollector samplingFC = samplingCollector(complement, sampler, params);
+
+    searcher.search(q, samplingFC);
+    List<FacetResult> sampledResults = samplingFC.getFacetResults();
+
+    assertSameResults(expected, sampledResults);
   }

   private FacetsCollector samplingCollector(

@@ -117,14 +121,19 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
     return samplingFC;
   }

-  private Sampler createSampler(ScoredDocIDs scoredDocIDs) {
+  private Sampler createSampler(int nTrial, ScoredDocIDs scoredDocIDs, boolean useRandomSampler) {
     SamplingParams samplingParams = new SamplingParams();
-    samplingParams.setSampleRatio(0.8);
-    samplingParams.setMinSampleSize(100);
-    samplingParams.setMaxSampleSize(10000);
+
+    final double retryFactor = Math.pow(1.01, nTrial);
+    samplingParams.setSampleRatio(0.8 * retryFactor);
+    samplingParams.setMinSampleSize((int) (100 * retryFactor));
+    samplingParams.setMaxSampleSize((int) (10000 * retryFactor));
+    samplingParams.setOversampleFactor(5.0 * retryFactor);

     samplingParams.setSampingThreshold(11000); //force sampling
-    samplingParams.setOversampleFactor(5.0);
-    Sampler sampler = new Sampler(samplingParams);
+    Sampler sampler = useRandomSampler ?
+        new RandomSampler(samplingParams, new Random(random.nextLong())) :
+        new RepeatableSampler(samplingParams);
     assertTrue("must enable sampling for this test!",sampler.shouldSample(scoredDocIDs));
     return sampler;
   }
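The retry loop loosens the sampling a little on each attempt: retryFactor = 1.01^nTrial, so with RETRIES = 10 the final attempt samples only about 9-10% more than the first — enough to nudge an unlucky sample toward the expected top-K without really changing what is being tested. A quick look at the growth (illustration only):

// --- illustration only, not part of the commit ---
public class RetryFactorDemo {
  public static void main(String[] args) {
    for (int nTrial = 0; nTrial < 10; nTrial++) {      // RETRIES = 10
      double retryFactor = Math.pow(1.01, nTrial);     // as in createSampler()
      System.out.printf("trial %d: sampleRatio=%.4f minSample=%d maxSample=%d%n",
          nTrial, 0.8 * retryFactor, (int) (100 * retryFactor), (int) (10000 * retryFactor));
    }
  }
}
// --- end illustration ---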