mirror of https://github.com/apache/lucene.git
LUCENE-3269: speed up top-k sampling tests
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1143122 13f79535-47bb-0310-9956-ffa450edef68
parent 06a3778905
commit 6e25bef3ef
@@ -21,6 +21,7 @@ import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.RandomIndexWriter;
@@ -138,7 +139,7 @@ public abstract class FacetTestBase extends LuceneTestCase {
taxoDir = newDirectory();
}

RandomIndexWriter iw = new RandomIndexWriter(random, indexDir, newIndexWriterConfig(TEST_VERSION_CURRENT, getAnalyzer()));
RandomIndexWriter iw = new RandomIndexWriter(random, indexDir, getIndexWriterConfig(getAnalyzer()));
TaxonomyWriter taxo = new LuceneTaxonomyWriter(taxoDir, OpenMode.CREATE);

populateIndex(iw, taxo, getFacetIndexingParams(partitionSize));
@@ -154,6 +155,11 @@ public abstract class FacetTestBase extends LuceneTestCase {
indexReader = IndexReader.open(indexDir);
searcher = newSearcher(indexReader);
}

/** Returns indexing params for the main index */
protected IndexWriterConfig getIndexWriterConfig(Analyzer analyzer) {
return newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
}

/** Returns a default facet indexing params */
protected FacetIndexingParams getFacetIndexingParams(final int partSize) {
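The hunks above replace the hard-wired newIndexWriterConfig(TEST_VERSION_CURRENT, getAnalyzer()) call in initIndex with an overridable getIndexWriterConfig(Analyzer) hook, so a subclass can tune the writer that builds the shared test index without copying the indexing code. A minimal sketch of the idea; the subclass name and the RAM-buffer tweak are illustrative only, not part of the commit:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.facet.FacetTestBase;
    import org.apache.lucene.index.IndexWriterConfig;

    // Hypothetical subclass: initIndex() now picks up whatever config this hook returns.
    public abstract class RamFlushFacetTest extends FacetTestBase {
      @Override
      protected IndexWriterConfig getIndexWriterConfig(Analyzer analyzer) {
        // start from the base config and flush by RAM usage instead of the defaults
        return super.getIndexWriterConfig(analyzer).setRAMBufferSizeMB(1.0);
      }
    }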
@@ -6,8 +6,11 @@ import java.util.List;

import org.apache.lucene.DocumentBuilder.DocumentBuilderException;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.util._TestUtil;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.facet.FacetTestBase;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.search.params.CountFacetRequest;
@@ -105,4 +108,9 @@ public abstract class BaseTestTopK extends FacetTestBase {
protected int numDocsToIndex() {
return 20000;
}

@Override
protected IndexWriterConfig getIndexWriterConfig(Analyzer analyzer) {
return super.getIndexWriterConfig(analyzer).setMaxBufferedDocs(_TestUtil.nextInt(random, 500, 10000));
}
}
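BaseTestTopK plugs into that hook: each run now indexes its 20000 documents (numDocsToIndex) with a maxBufferedDocs value drawn from [500, 10000], so the segment structure of the test index varies from run to run instead of always flushing at the same points. Illustrative arithmetic only, using the numbers from the hunks above:

    // Sketch, ignoring RandomIndexWriter's own random flushes and merges:
    int maxBuffered = _TestUtil.nextInt(random, 500, 10000); // both bounds inclusive
    int flushes = (int) Math.ceil(20000.0 / maxBuffered);    // roughly 2 .. 40 flushes per 20000-doc run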
@@ -32,41 +32,6 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
*/

public class TestTopKResultsHandlerRandom extends BaseTestTopK {

/**
* Try out faceted search in it's most basic form (no sampling nor complement
* that is). In this test lots (and lots..) of randomly generated data is
* being indexed, and later on an "over-all" faceted search is performed. The
* results are checked against the DF of each facet by itself
*/
@Test
public void testCountsComplementDisabled() throws Exception {
doTestCounts(false);
}

private void doTestCounts(boolean doComplement) throws Exception,
IOException, IllegalAccessException, InstantiationException {
for (int partitionSize : partitionSizes) {
initIndex(partitionSize);

List<FacetResult> facetResults = countFacets(partitionSize, 100000, doComplement);
assertCountsAndCardinality(facetCountsTruth(), facetResults);

closeAll();
}
}

/**
* Try out faceted search with complements. In this test lots (and lots..) of
* randomly generated data is being indexed, and later on, a "beta" faceted
* search is performed - retrieving ~90% of the documents so complements takes
* place in here. The results are checked against the a regular (a.k.a
* no-complement, no-sampling) faceted search with the same parameters.
*/
@Test
public void testCountsComplementEnforced() throws Exception {
doTestCounts(true);
}

private List<FacetResult> countFacets(int partitionSize, int numResults, final boolean doComplement)
throws IOException, IllegalAccessException, InstantiationException {
@@ -97,6 +62,25 @@ public class TestTopKResultsHandlerRandom extends BaseTestTopK {
for (int partitionSize : partitionSizes) {
initIndex(partitionSize);

/*
* Try out faceted search in it's most basic form (no sampling nor complement
* that is). In this test lots (and lots..) of randomly generated data is
* being indexed, and later on an "over-all" faceted search is performed. The
* results are checked against the DF of each facet by itself
*/
List<FacetResult> facetResults = countFacets(partitionSize, 100000, false);
assertCountsAndCardinality(facetCountsTruth(), facetResults);

/*
* Try out faceted search with complements. In this test lots (and lots..) of
* randomly generated data is being indexed, and later on, a "beta" faceted
* search is performed - retrieving ~90% of the documents so complements takes
* place in here. The results are checked against the a regular (a.k.a
* no-complement, no-sampling) faceted search with the same parameters.
*/
facetResults = countFacets(partitionSize, 100000, true);
assertCountsAndCardinality(facetCountsTruth(), facetResults);

List<FacetResult> allFacetResults = countFacets(partitionSize, 100000, false);

HashMap<String,Integer> all = new HashMap<String,Integer>();
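The speed-up in TestTopKResultsHandlerRandom comes from folding the two @Test methods into a single pass: instead of doTestCounts(false) and doTestCounts(true) each looping over every partition size and rebuilding the index, the surviving test (second hunk) builds the index once per partition size and runs both the plain and the complement counts against the same index. Roughly, based on the added lines (the enclosing method and the checks that follow it are outside the hunk and assumed here):

    for (int partitionSize : partitionSizes) {
      initIndex(partitionSize);                       // one index build per partition size

      // complement disabled
      List<FacetResult> facetResults = countFacets(partitionSize, 100000, false);
      assertCountsAndCardinality(facetCountsTruth(), facetResults);

      // complement enforced, against the very same index
      facetResults = countFacets(partitionSize, 100000, true);
      assertCountsAndCardinality(facetCountsTruth(), facetResults);

      // ... the method's existing checks (allFacetResults and friends) continue here ...
    }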
@@ -7,7 +7,6 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.junit.Test;

import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.facet.search.BaseTestTopK;
@@ -48,22 +47,12 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
TaxonomyReader taxoReader, IndexReader indexReader,
FacetSearchParams searchParams);

@Test
public void testCountUsingComplementSampling() throws Exception {
doTestWithSamping(true);
}

@Test
public void testCountUsingSampling() throws Exception {
doTestWithSamping(false);
}

/**
* Try out faceted search with sampling enabled and complements either disabled or enforced
* Lots of randomly generated data is being indexed, and later on a "90% docs" faceted search
* is performed. The results are compared to non-sampled ones.
*/
private void doTestWithSamping(boolean complement) throws Exception, IOException {
public void testCountUsingSamping() throws Exception, IOException {
for (int partitionSize : partitionSizes) {
initIndex(partitionSize);

@@ -84,24 +73,30 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {

FacetSearchParams samplingSearchParams = searchParamsWithRequests(K, partitionSize);

// try several times in case of failure, because the test has a chance to fail
// if the top K facets are not sufficiently common with the sample set
for (int n=RETRIES; n>0; n--) {
FacetsCollector samplingFC = samplingCollector(complement, sampler, samplingSearchParams);

searcher.search(q, samplingFC);
List<FacetResult> sampledResults = samplingFC.getFacetResults();

try {
assertSameResults(expectedResults, sampledResults);
break; // succeeded
} catch (Exception e) {
if (n<=1) { // otherwise try again
throw e;
}
assertSampling(expectedResults, q, sampler, samplingSearchParams, false);
assertSampling(expectedResults, q, sampler, samplingSearchParams, true);

closeAll();
}
}

private void assertSampling(List<FacetResult> expected, Query q, Sampler sampler, FacetSearchParams params, boolean complement) throws Exception {
// try several times in case of failure, because the test has a chance to fail
// if the top K facets are not sufficiently common with the sample set
for (int n=RETRIES; n>0; n--) {
FacetsCollector samplingFC = samplingCollector(false, sampler, params);

searcher.search(q, samplingFC);
List<FacetResult> sampledResults = samplingFC.getFacetResults();

try {
assertSameResults(expected, sampledResults);
break; // succeeded
} catch (Exception e) {
if (n<=1) { // otherwise try again
throw e;
}
}
closeAll();
}
}
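BaseSampleTestTopK gets the same treatment: testCountUsingComplementSampling and testCountUsingSampling are collapsed into one testCountUsingSamping that indexes once per partition size and exercises both modes through a new assertSampling helper. Because sampling is probabilistic, the helper keeps the old retry idiom: it re-runs the sampled search up to RETRIES times and only rethrows the assertion failure on the final attempt. Pieced together from the last hunk, the helper reads roughly like this (indentation and the surrounding class context are assumptions):

    private void assertSampling(List<FacetResult> expected, Query q, Sampler sampler,
        FacetSearchParams params, boolean complement) throws Exception {
      // try several times in case of failure, because the test has a chance to fail
      // if the top K facets are not sufficiently common with the sample set
      for (int n = RETRIES; n > 0; n--) {
        FacetsCollector samplingFC = samplingCollector(false, sampler, params); // the hunk shows a literal false here, not the complement parameter

        searcher.search(q, samplingFC);
        List<FacetResult> sampledResults = samplingFC.getFacetResults();

        try {
          assertSameResults(expected, sampledResults);
          break; // succeeded
        } catch (Exception e) {
          if (n <= 1) { // on the last retry, give up and surface the failure
            throw e;
          }
        }
      }
    }

The caller then shrinks to two lines per partition size, assertSampling(..., false) and assertSampling(..., true), followed by closeAll().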
@@ -36,7 +36,7 @@ public class TestCompactLabelToOrdinal extends LuceneTestCase {

CompactLabelToOrdinal compact = new CompactLabelToOrdinal(2000000, 0.15f, 3);

final int n = 100 * 1000;
final int n = atLeast(10 * 1000);
final int numUniqueValues = 50 * 1000;

String[] uniqueValues = new String[numUniqueValues];
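Finally, TestCompactLabelToOrdinal stops hard-coding 100,000 iterations. atLeast(10 * 1000) is LuceneTestCase's helper that returns some value of at least the given bound and grows it when the test multiplier / nightly mode is raised, so a default run does roughly a tenth of the old work while stress runs can still exceed it. A sketch of the pattern (only the first line comes from the hunk; the loop body is hypothetical):

    final int n = atLeast(10 * 1000); // >= 10,000 by default, scaled up under the test multiplier / nightly runs
    for (int i = 0; i < n; i++) {
      // feed the i-th randomly chosen label to CompactLabelToOrdinal and check its ordinal ...
    }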