diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index a7a4899792d..8c6d238e33d 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -115,6 +115,10 @@ New Features
 * LUCENE-4072: Add ICUNormalizer2CharFilter, which lets you do unicode normalization
   with offset correction before the tokenizer. (David Goldfarb, Ippei UKAI via Robert Muir)
 
+* LUCENE-5476: Add RandomSamplingFacetsCollector for computing facets on a sampled
+  set of matching hits, in cases where there are millions of hits.
+  (Rob Audenaerde, Gilad Barkai, Shai Erera)
+
 API Changes
 
 * LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/RandomSamplingFacetsCollector.java b/lucene/facet/src/java/org/apache/lucene/facet/RandomSamplingFacetsCollector.java
new file mode 100644
index 00000000000..58c0696477c
--- /dev/null
+++ b/lucene/facet/src/java/org/apache/lucene/facet/RandomSamplingFacetsCollector.java
@@ -0,0 +1,264 @@
+package org.apache.lucene.facet;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.facet.FacetsConfig.DimConfig;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.util.FixedBitSet;
+
+/**
+ * Collects hits for subsequent faceting, using sampling if needed. Once you've
+ * run a search and collected hits into this collector, instantiate one of the
+ * {@link Facets} subclasses to do the facet counting. Note that this collector
+ * does not collect the scores of matching docs (i.e.
+ * {@link FacetsCollector.MatchingDocs#scores} is {@code null}).
+ * <p>
+ * If you require the original set of hits, you can call
+ * {@link #getOriginalMatchingDocs()}. Also, since the counts of the top facets
+ * are based on the sampled set, you can amortize the counts by calling
+ * {@link #amortizeFacetCounts}.
+ */
+public class RandomSamplingFacetsCollector extends FacetsCollector {
+
+  /**
+   * Faster alternative to java.util.Random, inspired by
+   * http://dmurphy747.wordpress.com/2011/03/23/xorshift-vs-random-performance-in-java/
+   * <p>
+   * Has a period of 2^64-1
+   */
+  private static class XORShift64Random {
+
+    private long x;
+
+    /** Creates a xorshift random generator using the provided seed */
+    public XORShift64Random(long seed) {
+      x = seed == 0 ? 0xdeadbeef : seed;
+    }
+
+    /** Get the next random long value */
+    public long randomLong() {
+      x ^= (x << 21);
+      x ^= (x >>> 35);
+      x ^= (x << 4);
+      return x;
+    }
+
+    /** Get the next random int, between 0 (inclusive) and n (exclusive) */
+    public int nextInt(int n) {
+      int res = (int) (randomLong() % n);
+      return (res < 0) ? -res : res;
+    }
+
+  }
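+  // (Editor's note: the following descriptive comments are not in the original
+  // patch.) XORShift64Random trades statistical quality for speed: three
+  // shift/xor steps per call and no synchronization. The 2^64-1 period holds
+  // only for non-zero state, which is why a zero seed is remapped to
+  // 0xdeadbeef above. nextInt folds the signed remainder into [0, n), so 0 is
+  // roughly half as likely as the other values; a negligible bias for sampling.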
+
+  private final static int NOT_CALCULATED = -1;
+
+  private final int sampleSize;
+  private final XORShift64Random random;
+
+  private double samplingRate;
+  private List<MatchingDocs> sampledDocs;
+  private int totalHits = NOT_CALCULATED;
+  private int leftoverBin = NOT_CALCULATED;
+  private int leftoverIndex = NOT_CALCULATED;
+
+  /**
+   * Constructor with the given sample size and default seed.
+   *
+   * @see #RandomSamplingFacetsCollector(int, long)
+   */
+  public RandomSamplingFacetsCollector(int sampleSize) {
+    this(sampleSize, 0);
+  }
+
+  /**
+   * Constructor with the given sample size and seed.
+   *
+   * @param sampleSize
+   *          The preferred sample size. If the number of hits is greater than
+   *          the size, sampling will be done using a sample ratio of sampling
+   *          size / totalN. For example: 1000 hits, sample size = 10 results
+   *          in a samplingRatio of 0.01. If the number of hits is lower, no
+   *          sampling is done at all.
+   * @param seed
+   *          The random seed. If {@code 0} then a seed will be chosen for you.
+   */
+  public RandomSamplingFacetsCollector(int sampleSize, long seed) {
+    super(false);
+    this.sampleSize = sampleSize;
+    this.random = new XORShift64Random(seed);
+    this.sampledDocs = null;
+  }
+
+  /**
+   * Returns the sampled list of the matching documents. Note that a
+   * {@link FacetsCollector.MatchingDocs} instance is returned per segment, even
+   * if no hits from that segment are included in the sampled set.
+   * <p>
+   * Note: One or more of the MatchingDocs might be empty (not containing any
+   * hits) as a result of sampling.
+   * <p>
+   * Note: {@code MatchingDocs.totalHits} is copied from the original
+   * MatchingDocs; scores is set to {@code null}.
+   */
+  @Override
+  public List<MatchingDocs> getMatchingDocs() {
+    List<MatchingDocs> matchingDocs = super.getMatchingDocs();
+
+    if (totalHits == NOT_CALCULATED) {
+      totalHits = 0;
+      for (MatchingDocs md : matchingDocs) {
+        totalHits += md.totalHits;
+      }
+    }
+
+    if (totalHits <= sampleSize) {
+      return matchingDocs;
+    }
+
+    if (sampledDocs == null) {
+      samplingRate = (1.0 * sampleSize) / totalHits;
+      sampledDocs = createSampledDocs(matchingDocs);
+    }
+    return sampledDocs;
+  }
+
+  /** Returns the original matching documents. */
+  public List<MatchingDocs> getOriginalMatchingDocs() {
+    return super.getMatchingDocs();
+  }
+
+  /** Create a sampled copy of the matching documents list. */
+  private List<MatchingDocs> createSampledDocs(List<MatchingDocs> matchingDocsList) {
+    List<MatchingDocs> sampledDocsList = new ArrayList<MatchingDocs>(matchingDocsList.size());
+    for (MatchingDocs docs : matchingDocsList) {
+      sampledDocsList.add(createSample(docs));
+    }
+    return sampledDocsList;
+  }
+
+  /** Create a sample of the given hits. */
+  private MatchingDocs createSample(MatchingDocs docs) {
+    int maxdoc = docs.context.reader().maxDoc();
+
+    // TODO: we could try the WAH8DocIdSet here as well, as the results will be sparse
+    FixedBitSet sampleDocs = new FixedBitSet(maxdoc);
+
+    int binSize = (int) (1.0 / samplingRate);
+
+    try {
+      int counter = 0;
+      int limit, randomIndex;
+      if (leftoverBin != NOT_CALCULATED) {
+        limit = leftoverBin;
+        // leftoverIndex is either NOT_CALCULATED, which means we already
+        // sampled from that bin, or the next document to sample
+        randomIndex = leftoverIndex;
+      } else {
+        limit = binSize;
+        randomIndex = random.nextInt(binSize);
+      }
+      final DocIdSetIterator it = docs.bits.iterator();
+      for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+        if (counter == randomIndex) {
+          sampleDocs.set(doc);
+        }
+        counter++;
+        if (counter >= limit) {
+          counter = 0;
+          limit = binSize;
+          randomIndex = random.nextInt(binSize);
+        }
+      }
+
+      if (counter == 0) {
+        // we either exhausted the bin and the iterator at the same time, or
+        // this segment had no results. in the latter case we might want to
+        // carry leftover to the next segment as is, but that complicates the
+        // code and doesn't seem so important.
+        leftoverBin = leftoverIndex = NOT_CALCULATED;
+      } else {
+        leftoverBin = limit - counter;
+        if (randomIndex > counter) {
+          // the document to sample is in the next bin
+          leftoverIndex = randomIndex - counter;
+        } else if (randomIndex < counter) {
+          // we sampled a document from the bin, so just skip over remaining
+          // documents in the bin in the next segment.
+          leftoverIndex = NOT_CALCULATED;
+        }
+      }
+
+      return new MatchingDocs(docs.context, sampleDocs, docs.totalHits, null);
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Note: if you use a counting {@link Facets} implementation, you can amortize the
+   * sampled counts by calling this method. Uses the {@link FacetsConfig} and
+   * the {@link IndexSearcher} to determine the upper bound for each facet value.
+   */
+  public FacetResult amortizeFacetCounts(FacetResult res, FacetsConfig config, IndexSearcher searcher) throws IOException {
+    if (res == null || totalHits <= sampleSize) {
+      return res;
+    }
+
+    LabelAndValue[] fixedLabelValues = new LabelAndValue[res.labelValues.length];
+    IndexReader reader = searcher.getIndexReader();
+    DimConfig dimConfig = config.getDimConfig(res.dim);
+
+    // +2 to prepend dimension, append child label
+    String[] childPath = new String[res.path.length + 2];
+    childPath[0] = res.dim;
+
+    System.arraycopy(res.path, 0, childPath, 1, res.path.length); // reuse
+
+    for (int i = 0; i < res.labelValues.length; i++) {
+      childPath[res.path.length + 1] = res.labelValues[i].label;
+      String fullPath = FacetsConfig.pathToString(childPath, childPath.length);
+      int max = reader.docFreq(new Term(dimConfig.indexFieldName, fullPath));
+      int correctedCount = (int) (res.labelValues[i].value.doubleValue() / samplingRate);
+      correctedCount = Math.min(max, correctedCount);
+      fixedLabelValues[i] = new LabelAndValue(res.labelValues[i].label, correctedCount);
+    }
+
+    // cap the total count on the total number of non-deleted documents in the reader
+    int correctedTotalCount = res.value.intValue();
+    if (correctedTotalCount > 0) {
+      correctedTotalCount = Math.min(reader.numDocs(), (int) (res.value.doubleValue() / samplingRate));
+    }
+
+    return new FacetResult(res.dim, res.path, correctedTotalCount, fixedLabelValues, res.childCount);
+  }
+
+  /** Returns the sampling rate that was used. */
+  public double getSamplingRate() {
+    return samplingRate;
+  }
+
+}
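For context, here is a minimal usage sketch of the new collector (not part of the patch). The searcher, taxoReader, config and query arguments, the sample size of 10000, and the "Author" dimension are all illustrative assumptions; the calls themselves are the API added or used by this change.

    import java.io.IOException;
    import org.apache.lucene.facet.FacetResult;
    import org.apache.lucene.facet.Facets;
    import org.apache.lucene.facet.FacetsConfig;
    import org.apache.lucene.facet.RandomSamplingFacetsCollector;
    import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts;
    import org.apache.lucene.facet.taxonomy.TaxonomyReader;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;

    public class SampledFacetsExample {
      public static FacetResult sampledFacets(IndexSearcher searcher, TaxonomyReader taxoReader,
          FacetsConfig config, Query query) throws IOException {
        // Keep roughly 10,000 hits; if fewer documents match, no sampling is done.
        RandomSamplingFacetsCollector sampling = new RandomSamplingFacetsCollector(10000);
        searcher.search(query, sampling);

        // Facet counting runs over the sampled subset of the hits only.
        Facets facets = new FastTaxonomyFacetCounts(taxoReader, config, sampling);
        FacetResult sampled = facets.getTopChildren(10, "Author");

        // Scale the sampled counts back up; each label is capped by its docFreq.
        return sampling.amortizeFacetCounts(sampled, config, searcher);
      }
    }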
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestRandomSamplingFacetsCollector.java b/lucene/facet/src/test/org/apache/lucene/facet/TestRandomSamplingFacetsCollector.java
new file mode 100644
index 00000000000..ce283a0a59d
--- /dev/null
+++ b/lucene/facet/src/test/org/apache/lucene/facet/TestRandomSamplingFacetsCollector.java
@@ -0,0 +1,141 @@
+package org.apache.lucene.facet;
+
+import java.util.Random;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.facet.FacetsCollector.MatchingDocs;
+import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MultiCollector;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.IOUtils;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestRandomSamplingFacetsCollector extends FacetTestCase {
+
+  public void testRandomSampling() throws Exception {
+    Directory dir = newDirectory();
+    Directory taxoDir = newDirectory();
+
+    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+
+    FacetsConfig config = new FacetsConfig();
+
+    int numDocs = atLeast(10000);
+    for (int i = 0; i < numDocs; i++) {
+      Document doc = new Document();
+      doc.add(new StringField("EvenOdd", (i % 2 == 0) ? "even" : "odd", Store.NO));
+      doc.add(new FacetField("iMod10", String.valueOf(i % 10)));
+      writer.addDocument(config.build(taxoWriter, doc));
+    }
+    Random random = random();
+
+    // NRT open
+    IndexSearcher searcher = newSearcher(writer.getReader());
+    TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
+    IOUtils.close(writer, taxoWriter);
+
+    // Test empty results
+    RandomSamplingFacetsCollector collectRandomZeroResults = new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());
+
+    // There should be no divisions by zero
+    searcher.search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);
+
+    // There should be no divisions by zero and no null result
+    assertNotNull(collectRandomZeroResults.getMatchingDocs());
+
+    // There should be no results at all
+    for (MatchingDocs doc : collectRandomZeroResults.getMatchingDocs()) {
+      assertEquals(0, doc.totalHits);
+    }
+
+    // Now start searching and retrieve results.
+
+    // Use a query to select half of the documents.
+    TermQuery query = new TermQuery(new Term("EvenOdd", "even"));
+
+    // there will be 5 facet values (0, 2, 4, 6 and 8), as only the even
+    // (i % 10) values are hits. there is a really small chance that one of
+    // the 5 values will be missed when sampling, but that chance is about
+    // 0.8 (the chance of not taking a given value) ^ 2000, times 5 (any of
+    // the values could be missing) ~ 10^-193, so that is not going to happen.
+    int maxNumChildren = 5;
+
+    RandomSamplingFacetsCollector random100Percent = new RandomSamplingFacetsCollector(numDocs, random.nextLong()); // no sampling
+    RandomSamplingFacetsCollector random10Percent = new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong()); // 10% of total docs, 20% of the hits
+
+    FacetsCollector fc = new FacetsCollector();
+
+    searcher.search(query, MultiCollector.wrap(fc, random100Percent, random10Percent));
+
+    FastTaxonomyFacetCounts random10FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
+    FastTaxonomyFacetCounts random100FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random100Percent);
+    FastTaxonomyFacetCounts exactFacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, fc);
+
+    FacetResult random10Result = random10Percent.amortizeFacetCounts(random10FacetCounts.getTopChildren(10, "iMod10"), config, searcher);
+    FacetResult random100Result = random100FacetCounts.getTopChildren(10, "iMod10");
+    FacetResult exactResult = exactFacetCounts.getTopChildren(10, "iMod10");
+
+    assertEquals(random100Result, exactResult);
+
+    // we should have five children, but there is a small chance we have fewer
+    // (see above).
+    assertTrue(random10Result.childCount <= maxNumChildren);
+    // there should be at least one child.
+    assertTrue(random10Result.childCount >= 1);
+
+    // now calculate some statistics to determine if the sampled result is 'ok'.
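+    // (Editor's note, illustrative arithmetic, not in the original patch:
+    // with numDocs = 10,000 the query matches 5,000 docs, ~1,000 of which are
+    // sampled, i.e. a sampling rate of 0.2. Each of the 5 labels then gets
+    // ~200 sampled hits, which amortization scales back up to roughly
+    // 200 / 0.2 = 1,000 = targetMu below.)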
+    // because random sampling is used, the results will vary each time.
+    int sum = 0;
+    for (LabelAndValue lav : random10Result.labelValues) {
+      sum += lav.value.intValue();
+    }
+    float mu = (float) sum / (float) maxNumChildren;
+
+    float variance = 0;
+    for (LabelAndValue lav : random10Result.labelValues) {
+      variance += Math.pow((mu - lav.value.intValue()), 2);
+    }
+    variance = variance / maxNumChildren;
+    float sigma = (float) Math.sqrt(variance);
+
+    // we query only half the documents and have 5 categories. The average
+    // number of docs in a category will thus be the total divided by 5*2.
+    float targetMu = numDocs / (5.0f * 2.0f);
+
+    // the average should be in the range, and the standard deviation should
+    // not be too great.
+    assertTrue(sigma < 200);
+    assertTrue(targetMu - 3 * sigma < mu && mu < targetMu + 3 * sigma);
+
+    IOUtils.close(searcher.getIndexReader(), taxoReader, dir, taxoDir);
+  }
+
+}
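To make the amortization step easier to check by hand, here is a small self-contained sketch of its arithmetic (not part of the patch; all numbers are made up for illustration): a sampled count is divided by the sampling rate and then capped by the label's docFreq, exactly as amortizeFacetCounts does above.

    public class AmortizeDemo {
      public static void main(String[] args) {
        double samplingRate = 0.01; // e.g. 10,000 docs sampled out of 1,000,000 hits
        int sampledCount = 95;      // hits counted for one facet label within the sample
        int docFreq = 9200;         // docs in the index that actually contain this label

        int correctedCount = (int) (sampledCount / samplingRate); // 95 / 0.01 = 9500
        correctedCount = Math.min(docFreq, correctedCount);       // capped to 9200
        System.out.println("amortized count: " + correctedCount); // prints 9200
      }
    }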