LUCENE-3269: speed up top-k sampling tests

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1143122 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-07-05 16:13:15 +00:00
parent 06a3778905
commit 6e25bef3ef
5 changed files with 58 additions and 65 deletions

View File

@ -21,6 +21,7 @@ import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.RandomIndexWriter;
@ -138,7 +139,7 @@ public abstract class FacetTestBase extends LuceneTestCase {
taxoDir = newDirectory();
}
RandomIndexWriter iw = new RandomIndexWriter(random, indexDir, newIndexWriterConfig(TEST_VERSION_CURRENT, getAnalyzer()));
RandomIndexWriter iw = new RandomIndexWriter(random, indexDir, getIndexWriterConfig(getAnalyzer()));
TaxonomyWriter taxo = new LuceneTaxonomyWriter(taxoDir, OpenMode.CREATE);
populateIndex(iw, taxo, getFacetIndexingParams(partitionSize));
@ -154,6 +155,11 @@ public abstract class FacetTestBase extends LuceneTestCase {
indexReader = IndexReader.open(indexDir);
searcher = newSearcher(indexReader);
}
/** Returns indexing params for the main index */
protected IndexWriterConfig getIndexWriterConfig(Analyzer analyzer) {
return newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
}
/** Returns a default facet indexing params */
protected FacetIndexingParams getFacetIndexingParams(final int partSize) {

View File

@ -6,8 +6,11 @@ import java.util.List;
import org.apache.lucene.DocumentBuilder.DocumentBuilderException;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.facet.FacetTestBase;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.search.params.CountFacetRequest;
@ -105,4 +108,9 @@ public abstract class BaseTestTopK extends FacetTestBase {
protected int numDocsToIndex() {
return 20000;
}
@Override
protected IndexWriterConfig getIndexWriterConfig(Analyzer analyzer) {
return super.getIndexWriterConfig(analyzer).setMaxBufferedDocs(_TestUtil.nextInt(random, 500, 10000));
}
}

View File

@ -32,41 +32,6 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
*/
public class TestTopKResultsHandlerRandom extends BaseTestTopK {
/**
* Try out faceted search in its most basic form (no sampling nor complement
* that is). In this test lots (and lots..) of randomly generated data is
* being indexed, and later on an "over-all" faceted search is performed. The
* results are checked against the DF of each facet by itself
*/
@Test
public void testCountsComplementDisabled() throws Exception {
doTestCounts(false);
}
private void doTestCounts(boolean doComplement) throws Exception,
IOException, IllegalAccessException, InstantiationException {
for (int partitionSize : partitionSizes) {
initIndex(partitionSize);
List<FacetResult> facetResults = countFacets(partitionSize, 100000, doComplement);
assertCountsAndCardinality(facetCountsTruth(), facetResults);
closeAll();
}
}
/**
* Try out faceted search with complements. In this test lots (and lots..) of
* randomly generated data is being indexed, and later on, a "beta" faceted
* search is performed - retrieving ~90% of the documents so complements takes
place in here. The results are checked against a regular (a.k.a
* no-complement, no-sampling) faceted search with the same parameters.
*/
@Test
public void testCountsComplementEnforced() throws Exception {
doTestCounts(true);
}
private List<FacetResult> countFacets(int partitionSize, int numResults, final boolean doComplement)
throws IOException, IllegalAccessException, InstantiationException {
@ -97,6 +62,25 @@ public class TestTopKResultsHandlerRandom extends BaseTestTopK {
for (int partitionSize : partitionSizes) {
initIndex(partitionSize);
/*
* Try out faceted search in its most basic form (no sampling nor complement
* that is). In this test lots (and lots..) of randomly generated data is
* being indexed, and later on an "over-all" faceted search is performed. The
* results are checked against the DF of each facet by itself
*/
List<FacetResult> facetResults = countFacets(partitionSize, 100000, false);
assertCountsAndCardinality(facetCountsTruth(), facetResults);
/*
* Try out faceted search with complements. In this test lots (and lots..) of
* randomly generated data is being indexed, and later on, a "beta" faceted
* search is performed - retrieving ~90% of the documents so complements takes
* place in here. The results are checked against a regular (a.k.a
* no-complement, no-sampling) faceted search with the same parameters.
*/
facetResults = countFacets(partitionSize, 100000, true);
assertCountsAndCardinality(facetCountsTruth(), facetResults);
List<FacetResult> allFacetResults = countFacets(partitionSize, 100000, false);
HashMap<String,Integer> all = new HashMap<String,Integer>();

View File

@ -7,7 +7,6 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.junit.Test;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.facet.search.BaseTestTopK;
@ -48,22 +47,12 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
TaxonomyReader taxoReader, IndexReader indexReader,
FacetSearchParams searchParams);
@Test
public void testCountUsingComplementSampling() throws Exception {
doTestWithSamping(true);
}
@Test
public void testCountUsingSampling() throws Exception {
doTestWithSamping(false);
}
/**
* Try out faceted search with sampling enabled and complements either disabled or enforced
* Lots of randomly generated data is being indexed, and later on a "90% docs" faceted search
* is performed. The results are compared to non-sampled ones.
*/
private void doTestWithSamping(boolean complement) throws Exception, IOException {
public void testCountUsingSamping() throws Exception, IOException {
for (int partitionSize : partitionSizes) {
initIndex(partitionSize);
@ -84,24 +73,30 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
FacetSearchParams samplingSearchParams = searchParamsWithRequests(K, partitionSize);
// try several times in case of failure, because the test has a chance to fail
// if the top K facets are not sufficiently common with the sample set
for (int n=RETRIES; n>0; n--) {
FacetsCollector samplingFC = samplingCollector(complement, sampler, samplingSearchParams);
searcher.search(q, samplingFC);
List<FacetResult> sampledResults = samplingFC.getFacetResults();
try {
assertSameResults(expectedResults, sampledResults);
break; // succeeded
} catch (Exception e) {
if (n<=1) { // otherwise try again
throw e;
}
assertSampling(expectedResults, q, sampler, samplingSearchParams, false);
assertSampling(expectedResults, q, sampler, samplingSearchParams, true);
closeAll();
}
}
private void assertSampling(List<FacetResult> expected, Query q, Sampler sampler, FacetSearchParams params, boolean complement) throws Exception {
// try several times in case of failure, because the test has a chance to fail
// if the top K facets are not sufficiently common with the sample set
for (int n=RETRIES; n>0; n--) {
FacetsCollector samplingFC = samplingCollector(false, sampler, params);
searcher.search(q, samplingFC);
List<FacetResult> sampledResults = samplingFC.getFacetResults();
try {
assertSameResults(expected, sampledResults);
break; // succeeded
} catch (Exception e) {
if (n<=1) { // otherwise try again
throw e;
}
}
closeAll();
}
}

View File

@ -36,7 +36,7 @@ public class TestCompactLabelToOrdinal extends LuceneTestCase {
CompactLabelToOrdinal compact = new CompactLabelToOrdinal(2000000, 0.15f, 3);
final int n = 100 * 1000;
final int n = atLeast(10 * 1000);
final int numUniqueValues = 50 * 1000;
String[] uniqueValues = new String[numUniqueValues];