LUCENE-4683: Change Aggregator and CategoryListIterator to be per-segment

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1432890 13f79535-47bb-0310-9956-ffa450edef68
Shai Erera 2013-01-14 12:03:11 +00:00
parent 3552167217
commit 93b0a15183
37 changed files with 553 additions and 359 deletions

View File

@ -122,6 +122,10 @@ Changes in backwards compatibility policy
result, few other classes such as Aggregator and CategoryListIterator were
changed to handle bulk category ordinals. (Shai Erera)
* LUCENE-4683: CategoryListIterator and Aggregator are now per-segment. As such,
their implementations no longer take a top-level IndexReader in the constructor,
but rather implement setNextReader(). (Shai Erera)
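For calling code, the migration looks roughly like the following sketch (hedged; clp, partition and indexReader are illustrative names, not part of this commit):

// Before: the iterator was bound to a top-level reader at construction
//   CategoryListIterator cli = clp.createCategoryListIterator(indexReader, partition);
//   cli.init();
// After: construct without a reader, then re-bind per segment
CategoryListIterator cli = clp.createCategoryListIterator(partition);
for (AtomicReaderContext context : indexReader.leaves()) {
  if (cli.setNextReader(context)) {
    // getOrdinals() calls now take segment-local doc IDs
  }
}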
New Features
* LUCENE-4226: New experimental StoredFieldsFormat that compresses chunks of

View File

@ -38,7 +38,7 @@ import org.apache.lucene.search.DocIdSetIterator;
public final class FixedBitSet extends DocIdSet implements Bits {
private final long[] bits;
private int numBits;
private final int numBits;
/** returns the number of 64 bit words it would take to hold numBits */
public static int bits2words(int numBits) {

View File

@ -3,7 +3,7 @@ package org.apache.lucene.facet.associations;
import java.io.IOException;
import org.apache.lucene.facet.search.PayloadIterator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.BytesRef;
@ -46,12 +46,21 @@ public abstract class AssociationsPayloadIterator<T extends CategoryAssociation>
* It is assumed that all association values can be deserialized with the
* given {@link CategoryAssociation}.
*/
public AssociationsPayloadIterator(IndexReader reader, String field, T association) throws IOException {
pi = new PayloadIterator(reader, new Term(field, association.getCategoryListID()));
hasAssociations = pi.init();
public AssociationsPayloadIterator(String field, T association) throws IOException {
pi = new PayloadIterator(new Term(field, association.getCategoryListID()));
this.association = association;
}
/**
* Sets the {@link AtomicReaderContext} for which {@link #setNextDoc(int)}
* calls will be made. Returns true iff any of this reader's documents has
* values for the association given to the constructor.
*/
public final boolean setNextReader(AtomicReaderContext context) throws IOException {
hasAssociations = pi.setNextReader(context);
return hasAssociations;
}
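A usage sketch of the new per-segment flow (the field name "$assoc" and the reader variable are illustrative assumptions):

FloatAssociationsPayloadIterator it =
    new FloatAssociationsPayloadIterator("$assoc", new CategoryFloatAssociation());
for (AtomicReaderContext context : reader.leaves()) {
  if (!it.setNextReader(context)) {
    continue; // this segment has no associations
  }
  for (int doc = 0; doc < context.reader().maxDoc(); doc++) {
    if (it.setNextDoc(doc)) { // doc is segment-local
      // read the association value of each of this document's ordinals
    }
  }
}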
/**
* Skip to the requested document. Returns true iff the document has category
* association values and they were read successfully. Associations are

View File

@ -2,7 +2,6 @@ package org.apache.lucene.facet.associations;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.collections.IntToFloatMap;
/*
@ -31,9 +30,8 @@ public class FloatAssociationsPayloadIterator extends AssociationsPayloadIterato
private final IntToFloatMap ordinalAssociations = new IntToFloatMap();
public FloatAssociationsPayloadIterator(IndexReader reader, String field, CategoryFloatAssociation association)
throws IOException {
super(reader, field, association);
public FloatAssociationsPayloadIterator(String field, CategoryFloatAssociation association) throws IOException {
super(field, association);
}
@Override

View File

@ -2,7 +2,6 @@ package org.apache.lucene.facet.associations;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.collections.IntToIntMap;
/*
@ -31,9 +30,8 @@ public class IntAssociationsPayloadIterator extends AssociationsPayloadIterator<
private final IntToIntMap ordinalAssociations = new IntToIntMap();
public IntAssociationsPayloadIterator(IndexReader reader, String field, CategoryIntAssociation association)
throws IOException {
super(reader, field, association);
public IntAssociationsPayloadIterator(String field, CategoryIntAssociation association) throws IOException {
super(field, association);
}
@Override

View File

@ -3,13 +3,10 @@ package org.apache.lucene.facet.index.params;
import java.io.IOException;
import java.io.Serializable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.facet.search.CategoryListIterator;
import org.apache.lucene.facet.search.PayloadCategoryListIteraor;
import org.apache.lucene.facet.search.TotalFacetCounts;
import org.apache.lucene.facet.util.PartitionsUtils;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.encoding.DGapIntEncoder;
import org.apache.lucene.util.encoding.IntDecoder;
import org.apache.lucene.util.encoding.IntEncoder;
@ -98,11 +95,6 @@ public class CategoryListParams implements Serializable {
return new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())));
}
/**
* Equality is defined by the 'term' that defines this category list.
* Sub-classes should override this method if a more complex calculation
* is needed to ensure equality.
*/
@Override
public boolean equals(Object o) {
if (o == this) {
@ -121,29 +113,16 @@ public class CategoryListParams implements Serializable {
return this.term.equals(other.term);
}
/**
* Hashcode is similar to {@link #equals(Object)}, in that it uses
* the term that defines this category list to derive the hashcode.
* Subclasses need to ensure that equality/hashcode is correctly defined,
* or there could be side-effects in the {@link TotalFacetCounts} caching
* mechanism (as the filename for a Total Facet Counts array cache
* is dependent on the hashCode, so it should consistently return the same
* hash for identity).
*/
@Override
public int hashCode() {
return this.hashCode;
}
/**
* Create the category list iterator for the specified partition.
*/
public CategoryListIterator createCategoryListIterator(IndexReader reader,
int partition) throws IOException {
/** Create the {@link CategoryListIterator} for the specified partition. */
public CategoryListIterator createCategoryListIterator(int partition) throws IOException {
String categoryListTermStr = PartitionsUtils.partitionName(this, partition);
Term payloadTerm = new Term(term.field(), categoryListTermStr);
return new PayloadCategoryListIteraor(reader, payloadTerm,
createEncoder().createMatchingDecoder());
return new PayloadCategoryListIteraor(payloadTerm, createEncoder().createMatchingDecoder());
}
}
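Since iterators now hash and compare by Term alone (the reader component was dropped, see PayloadCategoryListIteraor below), two iterators created for the same category list and partition collapse into a single map entry, which is what the accumulator's map of iterators to aggregators relies on. A small sketch of the assumed behavior (variable names are illustrative):

Map<CategoryListIterator, Aggregator> categoryLists = new HashMap<CategoryListIterator, Aggregator>();
CategoryListIterator cli1 = clp.createCategoryListIterator(0);
CategoryListIterator cli2 = clp.createCategoryListIterator(0); // same term, same partition
categoryLists.put(cli1, someAggregator);
categoryLists.put(cli2, otherAggregator); // replaces the first value: cli1.equals(cli2)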

View File

@ -50,7 +50,7 @@ public final class AdaptiveFacetsAccumulator extends StandardFacetsAccumulator {
* Create an {@link AdaptiveFacetsAccumulator}
* @see StandardFacetsAccumulator#StandardFacetsAccumulator(FacetSearchParams, IndexReader, TaxonomyReader)
*/
public AdaptiveFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader,
public AdaptiveFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader,
TaxonomyReader taxonomyReader) {
super(searchParams, indexReader, taxonomyReader);
}

View File

@ -2,6 +2,7 @@ package org.apache.lucene.facet.search;
import java.io.IOException;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.IntsRef;
/*
@ -23,6 +24,8 @@ import org.apache.lucene.util.IntsRef;
/**
* An interface for obtaining the category ordinals of documents.
* {@link #getOrdinals(int, IntsRef)} calls are done with document IDs that are
* local to the reader given to {@link #setNextReader(AtomicReaderContext)}.
* <p>
* <b>NOTE:</b> instances of this class serve as keys in a map, and therefore you should
* implement {@code equals()} and {@code hashCode()} for proper behavior.
@ -32,19 +35,20 @@ import org.apache.lucene.util.IntsRef;
public interface CategoryListIterator {
/**
* Initializes the iterator. This method must be called before any calls to
* {@link #getOrdinals(int, IntsRef)}, and its return value indicates whether there are
* any relevant documents for this iterator.
* Sets the {@link AtomicReaderContext} for which
* {@link #getOrdinals(int, IntsRef)} calls will be made. Returns true iff any
* of the documents in this reader have category ordinals. This method must be
* called before any calls to {@link #getOrdinals(int, IntsRef)}.
*/
public boolean init() throws IOException;
public boolean setNextReader(AtomicReaderContext context) throws IOException;
/**
* Stores the category ordinals of the given document ID in the given
* {@link IntsRef}, starting at position 0 up to {@link IntsRef#length}. Grows
* the {@link IntsRef} if it is not large enough.
*
* <p>
* <b>NOTE:</b> if the requested document does not category ordinals
* <b>NOTE:</b> if the requested document does not have category ordinals
* associated with it, {@link IntsRef#length} is set to zero.
*/
public void getOrdinals(int docID, IntsRef ints) throws IOException;
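A consumption sketch of the new contract, assuming an open IndexReader named reader and some CategoryListIterator cli (names are illustrative; this mirrors the test code later in this commit):

IntsRef ordinals = new IntsRef(32);
for (AtomicReaderContext context : reader.leaves()) {
  if (!cli.setNextReader(context)) {
    continue; // no documents in this segment have category ordinals
  }
  final int maxDoc = context.reader().maxDoc();
  for (int doc = 0; doc < maxDoc; doc++) { // doc IDs are segment-local
    cli.getOrdinals(doc, ordinals);
    for (int i = 0; i < ordinals.length; i++) { // length == 0: doc has no categories
      int ordinal = ordinals.ints[i];
      // process ordinal
    }
  }
}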

View File

@ -2,7 +2,7 @@ package org.apache.lucene.facet.search;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
@ -34,17 +34,15 @@ import org.apache.lucene.util.encoding.IntDecoder;
public class PayloadCategoryListIteraor implements CategoryListIterator {
private final IntDecoder decoder;
private final IndexReader indexReader;
private final Term term;
private final PayloadIterator pi;
private final int hashCode;
public PayloadCategoryListIteraor(IndexReader indexReader, Term term, IntDecoder decoder) throws IOException {
pi = new PayloadIterator(indexReader, term);
public PayloadCategoryListIteraor(Term term, IntDecoder decoder) throws IOException {
pi = new PayloadIterator(term);
this.decoder = decoder;
hashCode = indexReader.hashCode() ^ term.hashCode();
hashCode = term.hashCode();
this.term = term;
this.indexReader = indexReader;
}
@Override
@ -58,7 +56,7 @@ public class PayloadCategoryListIteraor implements CategoryListIterator {
}
// Hash codes are the same, check equals() to avoid cases of hash-collisions.
return indexReader.equals(that.indexReader) && term.equals(that.term);
return term.equals(that.term);
}
@Override
@ -67,8 +65,8 @@ public class PayloadCategoryListIteraor implements CategoryListIterator {
}
@Override
public boolean init() throws IOException {
return pi.init();
public boolean setNextReader(AtomicReaderContext context) throws IOException {
return pi.setNextReader(context);
}
@Override

View File

@ -1,12 +1,10 @@
package org.apache.lucene.facet.search;
import java.io.IOException;
import java.util.Iterator;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
@ -42,99 +40,75 @@ import org.apache.lucene.util.BytesRef;
*/
public class PayloadIterator {
protected BytesRef data;
private TermsEnum reuseTE;
private DocsAndPositionsEnum currentDPE;
private DocsAndPositionsEnum dpe;
private boolean hasMore;
private int curDocID, curDocBase;
private int curDocID;
private final Iterator<AtomicReaderContext> leaves;
private final Term term;
public PayloadIterator(IndexReader indexReader, Term term) throws IOException {
leaves = indexReader.leaves().iterator();
public PayloadIterator(Term term) throws IOException {
this.term = term;
}
private void nextSegment() throws IOException {
/**
* Sets the {@link AtomicReaderContext} for which {@link #getPayload(int)}
* calls will be made. Returns true iff this reader has payload for any of the
* documents belonging to the {@link Term} given to the constructor.
*/
public boolean setNextReader(AtomicReaderContext context) throws IOException {
hasMore = false;
while (leaves.hasNext()) {
AtomicReaderContext ctx = leaves.next();
curDocBase = ctx.docBase;
Fields fields = ctx.reader().fields();
if (fields != null) {
Terms terms = fields.terms(term.field());
if (terms != null) {
reuseTE = terms.iterator(reuseTE);
if (reuseTE.seekExact(term.bytes(), true)) {
// this class is usually used to iterate on whatever a Query matched
// if it didn't match deleted documents, we won't receive them. if it
// did, we should iterate on them too, therefore we pass liveDocs=null
currentDPE = reuseTE.docsAndPositions(null, currentDPE, DocsAndPositionsEnum.FLAG_PAYLOADS);
if (currentDPE != null && (curDocID = currentDPE.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
hasMore = true;
break;
}
Fields fields = context.reader().fields();
if (fields != null) {
Terms terms = fields.terms(term.field());
if (terms != null) {
reuseTE = terms.iterator(reuseTE);
if (reuseTE.seekExact(term.bytes(), true)) {
// this class is usually used to iterate on whatever a Query matched
// if it didn't match deleted documents, we won't receive them. if it
// did, we should iterate on them too, therefore we pass liveDocs=null
dpe = reuseTE.docsAndPositions(null, dpe, DocsAndPositionsEnum.FLAG_PAYLOADS);
if (dpe != null && (curDocID = dpe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
hasMore = true;
}
}
}
}
return hasMore;
}
/**
* Initialize the iterator. Should be done before the first call to
* {@link #getPayload(int)}. Returns {@code false} if no category list is
* found, or the category list has no documents.
*/
public boolean init() throws IOException {
nextSegment();
return hasMore;
}
/**
* Returns the {@link BytesRef payload} of the given document, or {@code null}
* if the document does not exist, there are no more documents in the posting
* list, or the document exists but has no payload. You should call
* {@link #init()} before the first call to this method.
* list, or the document exists but has no payload. The given document IDs
* are treated as local to the reader given to
* {@link #setNextReader(AtomicReaderContext)}.
*/
public BytesRef getPayload(int docID) throws IOException {
if (!hasMore) {
return null;
}
// re-basing docId->localDocID is done fewer times than currentDoc->globalDoc
int localDocID = docID - curDocBase;
if (curDocID > localDocID) {
if (curDocID > docID) {
// document does not exist
return null;
}
if (curDocID < localDocID) {
// look for the document either in that segment, or others
while (hasMore && (curDocID = currentDPE.advance(localDocID)) == DocIdSetIterator.NO_MORE_DOCS) {
nextSegment(); // also updates curDocID
localDocID = docID - curDocBase;
// nextSegment advances to nextDoc, so check if we still need to advance
if (curDocID >= localDocID) {
break;
if (curDocID < docID) {
curDocID = dpe.advance(docID);
if (curDocID != docID) { // requested document does not have a payload
if (curDocID == DocIdSetIterator.NO_MORE_DOCS) { // no more docs in this reader
hasMore = false;
}
}
// we break from the above loop when:
// 1. we iterated over all segments (hasMore=false)
// 2. current segment advanced to a doc, either requested or higher
if (!hasMore || curDocID != localDocID) {
return null;
}
}
// we're on the document
assert currentDPE.freq() == 1 : "expecting freq=1 (got " + currentDPE.freq() + ") term=" + term + " doc=" + (curDocID + curDocBase);
int pos = currentDPE.nextPosition();
assert pos != -1 : "no positions for term=" + term + " doc=" + (curDocID + curDocBase);
return currentDPE.getPayload();
assert dpe.freq() == 1 : "expecting freq=1 (got " + dpe.freq() + ") term=" + term + " doc=" + curDocID;
int pos = dpe.nextPosition();
assert pos != -1 : "no positions for term=" + term + " doc=" + curDocID;
return dpe.getPayload();
}
}
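A usage sketch under the new contract (the term text is an illustrative assumption; within a segment, doc IDs should be passed in increasing order, since the iterator only advances forward):

PayloadIterator pi = new PayloadIterator(new Term("$facets", "$categories"));
for (AtomicReaderContext context : reader.leaves()) {
  if (!pi.setNextReader(context)) {
    continue; // the term has no payloads in this segment
  }
  for (int doc = 0; doc < context.reader().maxDoc(); doc++) {
    BytesRef payload = pi.getPayload(doc); // doc is segment-local
    if (payload != null) {
      // decode payload.bytes[payload.offset .. payload.offset + payload.length)
    }
  }
}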

View File

@ -62,7 +62,7 @@ public abstract class ScoredDocIdCollector extends Collector {
}
@Override
public ScoredDocIDsIterator scoredDocIdsIterator() {
protected ScoredDocIDsIterator scoredDocIdsIterator() {
return new ScoredDocIDsIterator() {
private DocIdSetIterator docIdsIter = docIds.iterator();
@ -129,7 +129,7 @@ public abstract class ScoredDocIdCollector extends Collector {
}
@Override
public ScoredDocIDsIterator scoredDocIdsIterator() {
protected ScoredDocIDsIterator scoredDocIdsIterator() {
return new ScoredDocIDsIterator() {
private DocIdSetIterator docIdsIter = docIds.iterator();
@ -189,8 +189,7 @@ public abstract class ScoredDocIdCollector extends Collector {
* do not require scoring, it is better to set it to <i>false</i>.
*/
public static ScoredDocIdCollector create(int maxDoc, boolean enableScoring) {
return enableScoring ? new ScoringDocIdCollector(maxDoc)
: new NonScoringDocIdCollector(maxDoc);
return enableScoring ? new ScoringDocIdCollector(maxDoc) : new NonScoringDocIdCollector(maxDoc);
}
private ScoredDocIdCollector(int maxDoc) {
@ -198,13 +197,14 @@ public abstract class ScoredDocIdCollector extends Collector {
docIds = new FixedBitSet(maxDoc);
}
protected abstract ScoredDocIDsIterator scoredDocIdsIterator() throws IOException;
/** Returns the default score used when scoring is disabled. */
public abstract float getDefaultScore();
/** Set the default score. Only applicable if scoring is disabled. */
public abstract void setDefaultScore(float defaultScore);
public abstract ScoredDocIDsIterator scoredDocIdsIterator() throws IOException;
public ScoredDocIDs getScoredDocIDs() {
return new ScoredDocIDs() {

View File

@ -4,22 +4,23 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.facet.search.aggregator.Aggregator;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.params.FacetRequest;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.IntermediateFacetResult;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.util.PartitionsUtils;
import org.apache.lucene.facet.util.ScoredDocIdsUtils;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -179,11 +180,11 @@ public class StandardFacetsAccumulator extends FacetsAccumulator {
List<FacetResult> res = new ArrayList<FacetResult>();
for (FacetRequest fr : searchParams.getFacetRequests()) {
FacetResultsHandler frHndlr = fr.createFacetResultsHandler(taxonomyReader);
IntermediateFacetResult tmpResult = fr2tmpRes.get(fr);
IntermediateFacetResult tmpResult = fr2tmpRes.get(fr);
if (tmpResult == null) {
continue; // do not add a null to the list.
}
FacetResult facetRes = frHndlr.renderFacetResult(tmpResult);
FacetResult facetRes = frHndlr.renderFacetResult(tmpResult);
// final labeling if allowed (because labeling is a costly operation)
if (isAllowLabeling()) {
frHndlr.labelResult(facetRes);
@ -213,18 +214,15 @@ public class StandardFacetsAccumulator extends FacetsAccumulator {
/** Check if it is worthwhile to use complements */
protected boolean shouldComplement(ScoredDocIDs docids) {
return
mayComplement() &&
(docids.size() > indexReader.numDocs() * getComplementThreshold()) ;
return mayComplement() && (docids.size() > indexReader.numDocs() * getComplementThreshold());
}
/**
* Iterate over the documents for this partition and fill the facet arrays with the correct
* count/complement count/value.
* @throws IOException If there is a low-level I/O error.
*/
private final void fillArraysForPartition(ScoredDocIDs docids,
FacetArrays facetArrays, int partition) throws IOException {
private final void fillArraysForPartition(ScoredDocIDs docids, FacetArrays facetArrays, int partition)
throws IOException {
if (isUsingComplements) {
initArraysByTotalCounts(facetArrays, partition, docids.size());
@ -236,27 +234,41 @@ public class StandardFacetsAccumulator extends FacetsAccumulator {
IntsRef ordinals = new IntsRef(32); // a reasonable start capacity for most common apps
for (Entry<CategoryListIterator, Aggregator> entry : categoryLists.entrySet()) {
CategoryListIterator categoryList = entry.getKey();
if (!categoryList.init()) {
continue;
}
Aggregator categorator = entry.getValue();
ScoredDocIDsIterator iterator = docids.iterator();
final ScoredDocIDsIterator iterator = docids.iterator();
final CategoryListIterator categoryListIter = entry.getKey();
final Aggregator aggregator = entry.getValue();
Iterator<AtomicReaderContext> contexts = indexReader.leaves().iterator();
AtomicReaderContext current = null;
int maxDoc = -1;
while (iterator.next()) {
int docID = iterator.getDocID();
categoryList.getOrdinals(docID, ordinals);
if (ordinals.length == 0) {
continue;
while (docID >= maxDoc) { // find the segment which contains this document
if (!contexts.hasNext()) {
throw new RuntimeException("ScoredDocIDs contains documents outside this reader's segments!?");
}
current = contexts.next();
maxDoc = current.docBase + current.reader().maxDoc();
if (docID < maxDoc) { // segment has docs, check if it has categories
boolean validSegment = categoryListIter.setNextReader(current);
validSegment &= aggregator.setNextReader(current);
if (!validSegment) { // if the categoryList or aggregator says it's an invalid segment, skip all its docs
while (docID < maxDoc && iterator.next()) {
docID = iterator.getDocID();
}
}
}
}
categorator.aggregate(docID, iterator.getScore(), ordinals);
docID -= current.docBase;
categoryListIter.getOrdinals(docID, ordinals);
if (ordinals.length == 0) {
continue; // document does not have category ordinals
}
aggregator.aggregate(docID, iterator.getScore(), ordinals);
}
}
}
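Stripped of the validity checks, the segment-advancing pattern the loop above implements reduces to the following sketch (an illustrative restatement; increasingGlobalDocIDs stands for the matched doc IDs, an int[] in increasing order):

Iterator<AtomicReaderContext> contexts = indexReader.leaves().iterator();
AtomicReaderContext current = null;
int maxDoc = -1;
for (int globalDoc : increasingGlobalDocIDs) {
  while (globalDoc >= maxDoc) { // advance to the segment containing globalDoc
    current = contexts.next();
    maxDoc = current.docBase + current.reader().maxDoc();
    // re-bind the CategoryListIterator and Aggregator here via setNextReader(current)
  }
  int localDoc = globalDoc - current.docBase; // feed this to getOrdinals()/aggregate()
}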
/**
* Init arrays for partition by total counts, optionally applying a factor
*/
/** Init arrays for partition by total counts, optionally applying a factor */
private final void initArraysByTotalCounts(FacetArrays facetArrays, int partition, int nAccumulatedDocs) {
int[] intArray = facetArrays.getIntArray();
totalFacetCounts.fillTotalCountsForPartition(intArray, partition);
@ -302,10 +314,9 @@ public class StandardFacetsAccumulator extends FacetsAccumulator {
for (FacetRequest facetRequest : searchParams.getFacetRequests()) {
Aggregator categoryAggregator = facetRequest.createAggregator(
isUsingComplements, facetArrays, indexReader, taxonomyReader);
isUsingComplements, facetArrays, taxonomyReader);
CategoryListIterator cli =
facetRequest.createCategoryListIterator(indexReader, taxonomyReader, searchParams, partition);
CategoryListIterator cli = facetRequest.createCategoryListIterator(taxonomyReader, searchParams, partition);
// get the aggregator
Aggregator old = categoryLists.put(cli, categoryAggregator);

View File

@ -170,7 +170,7 @@ public class TotalFacetCounts {
Aggregator aggregator = new CountingAggregator(counts[partition]);
HashMap<CategoryListIterator, Aggregator> map = new HashMap<CategoryListIterator, Aggregator>();
for (CategoryListParams clp: facetIndexingParams.getAllCategoryListParams()) {
final CategoryListIterator cli = clIteraor(clCache, clp, indexReader, partition);
final CategoryListIterator cli = clIteraor(clCache, clp, partition);
map.put(cli, aggregator);
}
return map;
@ -181,14 +181,14 @@ public class TotalFacetCounts {
return new TotalFacetCounts(taxonomy, facetIndexingParams, counts, CreationType.Computed);
}
static CategoryListIterator clIteraor(CategoryListCache clCache, CategoryListParams clp,
IndexReader indexReader, int partition) throws IOException {
static CategoryListIterator clIteraor(CategoryListCache clCache, CategoryListParams clp, int partition)
throws IOException {
if (clCache != null) {
CategoryListData cld = clCache.get(clp);
if (cld != null) {
return cld.iterator(partition);
}
}
return clp.createCategoryListIterator(indexReader, partition);
return clp.createCategoryListIterator(partition);
}
}

View File

@ -2,6 +2,7 @@ package org.apache.lucene.facet.search.aggregator;
import java.io.IOException;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.IntsRef;
/*
@ -22,21 +23,22 @@ import org.apache.lucene.util.IntsRef;
*/
/**
* An Aggregator is the analogue of Lucene's Collector (see
* {@link org.apache.lucene.search.Collector}), for processing the categories
* belonging to a certain document. The Aggregator is responsible for doing
* whatever it wishes with the categories it is fed, e.g., counting the number
* of times that each category appears, or performing some computation on their
* association values.
* <P>
* Much of the function of an Aggregator implementation is not described by this
* interface. This includes the constructor and getter methods to retrieve the
* results of the aggregation.
* Aggregates the categories of documents given to
* {@link #aggregate(int, float, IntsRef)}. Note that the document IDs are local
* to the reader given to {@link #setNextReader(AtomicReaderContext)}.
*
* @lucene.experimental
*/
public interface Aggregator {
/**
* Sets the {@link AtomicReaderContext} for which
* {@link #aggregate(int, float, IntsRef)} calls will be made. If this method
* returns false, {@link #aggregate(int, float, IntsRef)} should not be called
* for this reader.
*/
public boolean setNextReader(AtomicReaderContext context) throws IOException;
/**
* Aggregate the ordinals of the given document ID (and its score). The given
* ordinals offset is always zero.
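Under the new interface, a minimal implementation might look like this sketch (illustrative, not part of this commit; the shipped CountingAggregator below is the real counting implementation):

import java.io.IOException;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.IntsRef;

public class SimpleCountingAggregator implements Aggregator {
  private final int[] counts; // indexed by category ordinal

  public SimpleCountingAggregator(int[] counts) {
    this.counts = counts;
  }

  @Override
  public boolean setNextReader(AtomicReaderContext context) throws IOException {
    return true; // no per-segment state to rebuild; always willing to aggregate
  }

  @Override
  public void aggregate(int docID, float score, IntsRef ordinals) throws IOException {
    // docID is segment-local and unused here; only the ordinals matter for counting
    for (int i = 0; i < ordinals.length; i++) {
      counts[ordinals.ints[i]]++;
    }
  }
}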

View File

@ -2,6 +2,7 @@ package org.apache.lucene.facet.search.aggregator;
import java.io.IOException;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.IntsRef;
/*
@ -57,4 +58,9 @@ public class CountingAggregator implements Aggregator {
return counterArray == null ? 0 : counterArray.hashCode();
}
@Override
public boolean setNextReader(AtomicReaderContext context) throws IOException {
return true;
}
}

View File

@ -2,6 +2,7 @@ package org.apache.lucene.facet.search.aggregator;
import java.io.IOException;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.IntsRef;
/*
@ -58,4 +59,9 @@ public class ScoringAggregator implements Aggregator {
return hashCode;
}
@Override
public boolean setNextReader(AtomicReaderContext context) throws IOException {
return true;
}
}

View File

@ -6,7 +6,7 @@ import org.apache.lucene.facet.associations.CategoryFloatAssociation;
import org.apache.lucene.facet.associations.FloatAssociationsPayloadIterator;
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.search.aggregator.Aggregator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.collections.IntToFloatMap;
@ -39,13 +39,13 @@ public class AssociationFloatSumAggregator implements Aggregator {
protected final float[] sumArray;
protected final FloatAssociationsPayloadIterator associations;
public AssociationFloatSumAggregator(IndexReader reader, float[] sumArray) throws IOException {
this(CategoryListParams.DEFAULT_TERM.field(), reader, sumArray);
public AssociationFloatSumAggregator(float[] sumArray) throws IOException {
this(CategoryListParams.DEFAULT_TERM.field(), sumArray);
}
public AssociationFloatSumAggregator(String field, IndexReader reader, float[] sumArray) throws IOException {
public AssociationFloatSumAggregator(String field, float[] sumArray) throws IOException {
this.field = field;
associations = new FloatAssociationsPayloadIterator(reader, field, new CategoryFloatAssociation());
associations = new FloatAssociationsPayloadIterator(field, new CategoryFloatAssociation());
this.sumArray = sumArray;
}
@ -76,4 +76,9 @@ public class AssociationFloatSumAggregator implements Aggregator {
return field.hashCode();
}
@Override
public boolean setNextReader(AtomicReaderContext context) throws IOException {
return associations.setNextReader(context);
}
}

View File

@ -6,7 +6,7 @@ import org.apache.lucene.facet.associations.CategoryIntAssociation;
import org.apache.lucene.facet.associations.IntAssociationsPayloadIterator;
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.search.aggregator.Aggregator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.collections.IntToIntMap;
@ -39,13 +39,13 @@ public class AssociationIntSumAggregator implements Aggregator {
protected final int[] sumArray;
protected final IntAssociationsPayloadIterator associations;
public AssociationIntSumAggregator(IndexReader reader, int[] sumArray) throws IOException {
this(CategoryListParams.DEFAULT_TERM.field(), reader, sumArray);
public AssociationIntSumAggregator(int[] sumArray) throws IOException {
this(CategoryListParams.DEFAULT_TERM.field(), sumArray);
}
public AssociationIntSumAggregator(String field, IndexReader reader, int[] sumArray) throws IOException {
public AssociationIntSumAggregator(String field, int[] sumArray) throws IOException {
this.field = field;
associations = new IntAssociationsPayloadIterator(reader, field, new CategoryIntAssociation());
associations = new IntAssociationsPayloadIterator(field, new CategoryIntAssociation());
this.sumArray = sumArray;
}
@ -76,4 +76,9 @@ public class AssociationIntSumAggregator implements Aggregator {
return field.hashCode();
}
@Override
public boolean setNextReader(AtomicReaderContext context) throws IOException {
return associations.setNextReader(context);
}
}

View File

@ -6,6 +6,7 @@ import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.search.CategoryListIterator;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.IntsRef;
@ -56,25 +57,30 @@ public class CategoryListData {
}
/** Compute category list data to cache for faster iteration. */
CategoryListData(IndexReader reader, TaxonomyReader taxo,
FacetIndexingParams iparams, CategoryListParams clp) throws IOException {
CategoryListData(IndexReader reader, TaxonomyReader taxo, FacetIndexingParams iparams, CategoryListParams clp)
throws IOException {
final int maxDoc = reader.maxDoc();
int[][][]dpf = new int[maxDoc][][];
int[][][]dpf = new int[reader.maxDoc()][][];
int numPartitions = (int)Math.ceil(taxo.getSize()/(double)iparams.getPartitionSize());
IntsRef ordinals = new IntsRef(32);
for (int part = 0; part < numPartitions; part++) {
CategoryListIterator cli = clp.createCategoryListIterator(reader, part);
if (cli.init()) {
for (int doc = 0; doc < maxDoc; doc++) {
cli.getOrdinals(doc, ordinals);
if (ordinals.length > 0) {
if (dpf[doc] == null) {
dpf[doc] = new int[numPartitions][];
}
dpf[doc][part] = new int[ordinals.length];
for (int i = 0; i < ordinals.length; i++) {
dpf[doc][part][i] = ordinals.ints[i];
for (AtomicReaderContext context : reader.leaves()) {
CategoryListIterator cli = clp.createCategoryListIterator(part);
if (cli.setNextReader(context)) {
final int maxDoc = context.reader().maxDoc();
for (int i = 0; i < maxDoc; i++) {
cli.getOrdinals(i, ordinals);
if (ordinals.length > 0) {
int doc = i + context.docBase;
if (dpf[doc] == null) {
dpf[doc] = new int[numPartitions][];
}
if (dpf[doc][part] == null) {
dpf[doc][part] = new int[ordinals.length];
}
for (int j = 0; j < ordinals.length; j++) {
dpf[doc][part][j] = ordinals.ints[j];
}
}
}
}
@ -93,6 +99,7 @@ public class CategoryListData {
/** Internal: category list iterator over uncompressed category info in RAM */
private static class RAMCategoryListIterator implements CategoryListIterator {
private int docBase;
private final int part;
private final int[][][] dpc;
@ -102,13 +109,15 @@ public class CategoryListData {
}
@Override
public boolean init() throws IOException {
public boolean setNextReader(AtomicReaderContext context) throws IOException {
docBase = context.docBase;
return dpc != null && dpc.length > part;
}
@Override
public void getOrdinals(int docID, IntsRef ints) throws IOException {
ints.length = 0;
docID += docBase;
if (dpc.length > docID && dpc[docID] != null && dpc[docID][part] != null) {
if (ints.ints.length < dpc[docID][part].length) {
ints.grow(dpc[docID][part].length);

View File

@ -1,7 +1,5 @@
package org.apache.lucene.facet.search.params;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.facet.search.FacetArrays;
import org.apache.lucene.facet.search.aggregator.Aggregator;
import org.apache.lucene.facet.search.aggregator.ComplementCountingAggregator;
@ -47,8 +45,7 @@ public class CountFacetRequest extends FacetRequest {
}
@Override
public Aggregator createAggregator(boolean useComplements,
FacetArrays arrays, IndexReader reader, TaxonomyReader taxonomy) {
public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy) {
// we rely on that, if needed, result is cleared by arrays!
int[] a = arrays.getIntArray();
if (useComplements) {

View File

@ -2,8 +2,6 @@ package org.apache.lucene.facet.search.params;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.search.CategoryListIterator;
import org.apache.lucene.facet.search.FacetArrays;
@ -11,8 +9,8 @@ import org.apache.lucene.facet.search.FacetResultsHandler;
import org.apache.lucene.facet.search.TopKFacetResultsHandler;
import org.apache.lucene.facet.search.TopKInEachNodeHandler;
import org.apache.lucene.facet.search.aggregator.Aggregator;
import org.apache.lucene.facet.search.cache.CategoryListData;
import org.apache.lucene.facet.search.cache.CategoryListCache;
import org.apache.lucene.facet.search.cache.CategoryListData;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
@ -314,33 +312,29 @@ public abstract class FacetRequest implements Cloneable {
* computation.
* @param arrays
* provider for facet arrays in use for current computation.
* @param indexReader
* index reader in effect.
* @param taxonomy
* reader of taxonomy in effect.
* @throws IOException If there is a low-level I/O error.
*/
public abstract Aggregator createAggregator(boolean useComplements,
FacetArrays arrays, IndexReader indexReader,
TaxonomyReader taxonomy) throws IOException;
public abstract Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy)
throws IOException;
/**
* Create the category list iterator for the specified partition.
* If a non null cache is provided which contains the required data,
* use it for the iteration.
* Create the category list iterator for the specified partition. If a
* non-null cache is provided which contains the required data, use it for
* the iteration.
*/
public CategoryListIterator createCategoryListIterator(IndexReader reader,
TaxonomyReader taxo, FacetSearchParams sParams, int partition)
public CategoryListIterator createCategoryListIterator(TaxonomyReader taxo, FacetSearchParams sParams, int partition)
throws IOException {
CategoryListCache clCache = sParams.getCategoryListCache();
CategoryListParams clParams = sParams.getFacetIndexingParams().getCategoryListParams(categoryPath);
if (clCache!=null) {
if (clCache != null) {
CategoryListData clData = clCache.get(clParams);
if (clData!=null) {
if (clData != null) {
return clData.iterator(partition);
}
}
return clParams.createCategoryListIterator(reader, partition);
return clParams.createCategoryListIterator(partition);
}
/**

View File

@ -1,7 +1,5 @@
package org.apache.lucene.facet.search.params;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.facet.search.FacetArrays;
import org.apache.lucene.facet.search.aggregator.Aggregator;
import org.apache.lucene.facet.search.aggregator.ScoringAggregator;
@ -38,9 +36,7 @@ public class ScoreFacetRequest extends FacetRequest {
}
@Override
public Aggregator createAggregator(boolean useComplements,
FacetArrays arrays, IndexReader reader,
TaxonomyReader taxonomy) {
public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy) {
assert !useComplements : "complements are not supported by this FacetRequest";
return new ScoringAggregator(arrays.getFloatArray());
}

View File

@ -2,8 +2,6 @@ package org.apache.lucene.facet.search.params.associations;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.facet.search.FacetArrays;
import org.apache.lucene.facet.search.aggregator.Aggregator;
import org.apache.lucene.facet.search.aggregator.associations.AssociationFloatSumAggregator;
@ -45,10 +43,10 @@ public class AssociationFloatSumFacetRequest extends FacetRequest {
}
@Override
public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, IndexReader reader,
TaxonomyReader taxonomy) throws IOException {
public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy)
throws IOException {
assert !useComplements : "complements are not supported by this FacetRequest";
return new AssociationFloatSumAggregator(reader, arrays.getFloatArray());
return new AssociationFloatSumAggregator(arrays.getFloatArray());
}
@Override

View File

@ -2,8 +2,6 @@ package org.apache.lucene.facet.search.params.associations;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.facet.search.FacetArrays;
import org.apache.lucene.facet.search.aggregator.Aggregator;
import org.apache.lucene.facet.search.aggregator.associations.AssociationIntSumAggregator;
@ -45,10 +43,10 @@ public class AssociationIntSumFacetRequest extends FacetRequest {
}
@Override
public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, IndexReader reader,
TaxonomyReader taxonomy) throws IOException {
public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy)
throws IOException {
assert !useComplements : "complements are not supported by this FacetRequest";
return new AssociationIntSumAggregator(reader, arrays.getIntArray());
return new AssociationIntSumAggregator(arrays.getIntArray());
}
@Override

View File

@ -60,6 +60,7 @@ public abstract class Sampler {
/**
* Construct with certain {@link SamplingParams}
*
* @param params sampling params in effect
* @throws IllegalArgumentException if the provided SamplingParams are not valid
*/
@ -110,16 +111,15 @@ public abstract class Sampler {
* @param sampleSetSize required size of sample set
* @return sample of the input set in the required size
*/
protected abstract SampleResult createSample(ScoredDocIDs docids, int actualSize,
int sampleSetSize) throws IOException;
protected abstract SampleResult createSample(ScoredDocIDs docids, int actualSize, int sampleSetSize)
throws IOException;
/**
* Get a fixer of sample facet accumulation results. Default implementation
* returns a <code>TakmiSampleFixer</code> which is adequate only for
* counting. For any other accumulator, provide a different fixer.
*/
public SampleFixer getSampleFixer(
IndexReader indexReader, TaxonomyReader taxonomyReader,
public SampleFixer getSampleFixer(IndexReader indexReader, TaxonomyReader taxonomyReader,
FacetSearchParams searchParams) {
return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams);
}
@ -161,10 +161,10 @@ public abstract class Sampler {
OverSampledFacetRequest sampledFreq = null;
try {
sampledFreq = (OverSampledFacetRequest)facetResult.getFacetRequest();
sampledFreq = (OverSampledFacetRequest) facetResult.getFacetRequest();
} catch (ClassCastException e) {
throw new IllegalArgumentException(
"It is only valid to call this method with result obtained for a" +
"It is only valid to call this method with result obtained for a " +
"facet request created through sampler.overSamlpingSearchParams()",
e);
}
@ -215,19 +215,15 @@ public abstract class Sampler {
}
@Override
public CategoryListIterator createCategoryListIterator(IndexReader reader,
TaxonomyReader taxo, FacetSearchParams sParams, int partition)
throws IOException {
return orig.createCategoryListIterator(reader, taxo, sParams, partition);
public CategoryListIterator createCategoryListIterator(TaxonomyReader taxo, FacetSearchParams sParams,
int partition) throws IOException {
return orig.createCategoryListIterator(taxo, sParams, partition);
}
@Override
public Aggregator createAggregator(boolean useComplements,
FacetArrays arrays, IndexReader indexReader,
TaxonomyReader taxonomy) throws IOException {
return orig.createAggregator(useComplements, arrays, indexReader,
taxonomy);
public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy)
throws IOException {
return orig.createAggregator(useComplements, arrays, taxonomy);
}
@Override
@ -245,4 +241,5 @@ public abstract class Sampler {
return orig.supportsComplements();
}
}
}

View File

@ -91,8 +91,7 @@ class TakmiSampleFixer implements SampleFixer {
* full set of matching documents.
* @throws IOException If there is a low-level I/O error.
*/
private void recount(FacetResultNode fresNode, ScoredDocIDs docIds)
throws IOException {
private void recount(FacetResultNode fresNode, ScoredDocIDs docIds) throws IOException {
// TODO (Facet): change from void to return the new, smaller docSet, and use
// that for the children, as this will make their intersection ops faster.
// can do this only when the new set is "sufficiently" smaller.
@ -109,8 +108,7 @@ class TakmiSampleFixer implements SampleFixer {
Bits liveDocs = MultiFields.getLiveDocs(indexReader);
int updatedCount = countIntersection(MultiFields.getTermDocsEnum(indexReader, liveDocs,
drillDownTerm.field(), drillDownTerm.bytes(),
0),
docIds.iterator());
0), docIds.iterator());
fresNode.setValue(updatedCount);
}
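countIntersection() itself is outside this hunk; a plausible sketch of such a count over two ascending doc-ID streams follows (an assumption about its behavior, not the committed code):

// Hypothetical helper: count how many docs appear in both enumerations.
static int countIntersectionSketch(DocsEnum docs, ScoredDocIDsIterator it) throws IOException {
  if (docs == null) {
    return 0; // term does not exist
  }
  int count = 0;
  int pDoc = docs.nextDoc();
  while (it.next() && pDoc != DocIdSetIterator.NO_MORE_DOCS) {
    int doc = it.getDocID();
    if (pDoc < doc) {
      pDoc = docs.advance(doc); // catch the postings up to the candidate doc
    }
    if (pDoc == doc) {
      count++;
    }
  }
  return count;
}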

View File

@ -5,6 +5,7 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.facet.search.CategoryListIterator;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.IntsRef;
/*
@ -42,9 +43,10 @@ public class MultiCategoryListIterator implements CategoryListIterator {
}
@Override
public boolean init() throws IOException {
public boolean setNextReader(AtomicReaderContext context) throws IOException {
validIterators.clear();
for (CategoryListIterator cli : iterators) {
if (cli.init()) {
if (cli.setNextReader(context)) {
validIterators.add(cli);
}
}

View File

@ -3,17 +3,18 @@ package org.apache.lucene.facet.util;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.facet.search.ScoredDocIDs;
import org.apache.lucene.facet.search.ScoredDocIDsIterator;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.OpenBitSetDISI;
import org.apache.lucene.facet.search.ScoredDocIDs;
import org.apache.lucene.facet.search.ScoredDocIDsIterator;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -49,48 +50,57 @@ public class ScoredDocIdsUtils {
* @param reader holding the number of documents & information about deletions.
*/
public final static ScoredDocIDs getComplementSet(final ScoredDocIDs docids, final IndexReader reader)
throws IOException {
throws IOException {
final int maxDoc = reader.maxDoc();
DocIdSet docIdSet = docids.getDocIDs();
final OpenBitSet complement;
if (docIdSet instanceof OpenBitSet) {
final FixedBitSet complement;
if (docIdSet instanceof FixedBitSet) {
// That is the most common case, if ScoredDocIdsCollector was used.
complement = ((OpenBitSet) docIdSet).clone();
complement = ((FixedBitSet) docIdSet).clone();
} else {
complement = new OpenBitSetDISI(docIdSet.iterator(), maxDoc);
complement = new FixedBitSet(maxDoc);
DocIdSetIterator iter = docIdSet.iterator();
int doc;
while ((doc = iter.nextDoc()) < maxDoc) {
complement.set(doc);
}
}
complement.flip(0, maxDoc);
// Remove all Deletions from the complement set
clearDeleted(reader, complement);
return createScoredDocIds(complement, maxDoc);
}
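Note that the while ((doc = iter.nextDoc()) < maxDoc) form above terminates only because DocIdSetIterator.NO_MORE_DOCS is Integer.MAX_VALUE; an equivalent, more explicit form of the complement construction:

FixedBitSet complement = new FixedBitSet(maxDoc);
DocIdSetIterator iter = docIdSet.iterator();
for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) {
  complement.set(doc);
}
complement.flip(0, maxDoc); // complement now marks exactly the docs not in docIdSet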
/**
* Clear all deleted documents from a given open-bit-set according to a given reader
*/
private static void clearDeleted(final IndexReader reader,
final OpenBitSet set) throws IOException {
/** Clear all deleted documents from a given bit set according to a given reader */
private static void clearDeleted(final IndexReader reader, final FixedBitSet set) throws IOException {
// If there are no deleted docs
if (!reader.hasDeletions()) {
return; // return immediately
}
Bits bits = MultiFields.getLiveDocs(reader);
DocIdSetIterator it = set.iterator();
int doc = DocIdSetIterator.NO_MORE_DOCS;
while ((doc = it.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (!bits.get(doc)) {
set.fastClear(doc);
int doc = it.nextDoc();
for (AtomicReaderContext context : reader.leaves()) {
AtomicReader r = context.reader();
final int maxDoc = r.maxDoc() + context.docBase;
if (doc >= maxDoc) { // skip this segment
continue;
}
if (!r.hasDeletions()) { // skip all docs that belong to this reader as it has no deletions
while ((doc = it.nextDoc()) < maxDoc) {}
continue;
}
Bits liveDocs = r.getLiveDocs();
do {
if (!liveDocs.get(doc - context.docBase)) {
set.clear(doc);
}
} while ((doc = it.nextDoc()) < maxDoc);
}
}
/**
* Create a subset of an existing ScoredDocIDs object.
*
@ -274,8 +284,7 @@ public class ScoredDocIdsUtils {
if (target <= next) {
target = next + 1;
}
return next = target >= maxDoc ? NO_MORE_DOCS
: target;
return next = target >= maxDoc ? NO_MORE_DOCS : target;
}
@Override
@ -420,4 +429,5 @@ public class ScoredDocIdsUtils {
}
}
}
}

View File

@ -317,8 +317,7 @@ public abstract class FacetTestBase extends LuceneTestCase {
}
/** Validate results equality */
protected static void assertSameResults(List<FacetResult> expected,
List<FacetResult> actual) {
protected static void assertSameResults(List<FacetResult> expected, List<FacetResult> actual) {
String expectedResults = resStringValueOnly(expected);
String actualResults = resStringValueOnly(actual);
if (!expectedResults.equals(actualResults)) {

View File

@ -29,12 +29,11 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
public class AdaptiveAccumulatorTest extends BaseSampleTestTopK {
@Override
protected FacetsAccumulator getSamplingAccumulator(Sampler sampler,
TaxonomyReader taxoReader, IndexReader indexReader,
FacetSearchParams searchParams) {
AdaptiveFacetsAccumulator res = new AdaptiveFacetsAccumulator(searchParams,
indexReader, taxoReader);
protected FacetsAccumulator getSamplingAccumulator(Sampler sampler, TaxonomyReader taxoReader,
IndexReader indexReader, FacetSearchParams searchParams) {
AdaptiveFacetsAccumulator res = new AdaptiveFacetsAccumulator(searchParams, indexReader, taxoReader);
res.setSampler(sampler);
return res;
}
}

View File

@ -14,6 +14,7 @@ import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
@ -106,30 +107,31 @@ public class CategoryListIteratorTest extends LuceneTestCase {
IndexReader reader = writer.getReader();
writer.close();
IntsRef ordinals = new IntsRef();
CategoryListIterator cli = new PayloadCategoryListIteraor(reader, new Term("f","1"), encoder.createMatchingDecoder());
cli.init();
int totalCategories = 0;
for (int i = 0; i < data.length; i++) {
Set<Integer> values = new HashSet<Integer>();
for (int j = 0; j < data[i].length; j++) {
values.add(data[i].ints[j]);
IntsRef ordinals = new IntsRef();
CategoryListIterator cli = new PayloadCategoryListIteraor(new Term("f","1"), encoder.createMatchingDecoder());
for (AtomicReaderContext context : reader.leaves()) {
cli.setNextReader(context);
int maxDoc = context.reader().maxDoc();
int dataIdx = context.docBase;
for (int doc = 0; doc < maxDoc; doc++, dataIdx++) {
Set<Integer> values = new HashSet<Integer>();
for (int j = 0; j < data[dataIdx].length; j++) {
values.add(data[dataIdx].ints[j]);
}
cli.getOrdinals(doc, ordinals);
assertTrue("no ordinals for document " + doc, ordinals.length > 0);
for (int j = 0; j < ordinals.length; j++) {
assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j]));
}
totalCategories += ordinals.length;
}
cli.getOrdinals(i, ordinals);
assertTrue("no ordinals for document " + i, ordinals.length > 0);
for (int j = 0; j < ordinals.length; j++) {
assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j]));
}
totalCategories += ordinals.length;
}
assertEquals("Missing categories!",10,totalCategories);
assertEquals("Missing categories!", 10, totalCategories);
reader.close();
dir.close();
}
/**
* Test that a document with no payloads does not confuse the payload decoder.
*/
@Test
public void testPayloadIteratorWithInvalidDoc() throws Exception {
Directory dir = newDirectory();
@ -160,24 +162,28 @@ public class CategoryListIteratorTest extends LuceneTestCase {
IndexReader reader = writer.getReader();
writer.close();
IntsRef ordinals = new IntsRef();
CategoryListIterator cli = new PayloadCategoryListIteraor(reader, new Term("f","1"), encoder.createMatchingDecoder());
assertTrue("Failed to initialize payload iterator", cli.init());
int totalCategories = 0;
for (int i = 0; i < data.length; i++) {
Set<Integer> values = new HashSet<Integer>();
for (int j = 0; j < data[i].length; j++) {
values.add(data[i].ints[j]);
}
cli.getOrdinals(i, ordinals);
if (i == 0) {
assertTrue("document 0 must have a payload", ordinals.length > 0);
for (int j = 0; j < ordinals.length; j++) {
assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j]));
IntsRef ordinals = new IntsRef();
CategoryListIterator cli = new PayloadCategoryListIteraor(new Term("f","1"), encoder.createMatchingDecoder());
for (AtomicReaderContext context : reader.leaves()) {
cli.setNextReader(context);
int maxDoc = context.reader().maxDoc();
int dataIdx = context.docBase;
for (int doc = 0; doc < maxDoc; doc++, dataIdx++) {
Set<Integer> values = new HashSet<Integer>();
for (int j = 0; j < data[dataIdx].length; j++) {
values.add(data[dataIdx].ints[j]);
}
cli.getOrdinals(doc, ordinals);
if (dataIdx == 0) {
assertTrue("document 0 must have a payload", ordinals.length > 0);
for (int j = 0; j < ordinals.length; j++) {
assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j]));
}
totalCategories += ordinals.length;
} else {
assertTrue("only document 0 should have a payload", ordinals.length == 0);
}
totalCategories += ordinals.length;
} else {
assertTrue("only document 0 should have a payload", ordinals.length == 0);
}
}
assertEquals("Wrong number of total categories!", 2, totalCategories);

View File

@ -22,6 +22,7 @@ import org.apache.lucene.facet.search.params.FacetRequest;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.index.AtomicReaderContext;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -132,8 +133,8 @@ public class TestCategoryListCache extends FacetTestBase {
}
}
@Override
public boolean init() throws IOException {
return it.init();
public boolean setNextReader(AtomicReaderContext context) throws IOException {
return it.setNextReader(context);
}
};
}

View File

@ -0,0 +1,128 @@
package org.apache.lucene.facet.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.facet.index.FacetFields;
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.search.params.CountFacetRequest;
import org.apache.lucene.facet.search.params.FacetRequest;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.facet.util.AssertingCategoryListIterator;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TestStandardFacetsAccumulator extends LuceneTestCase {
private void indexTwoDocs(IndexWriter indexWriter, FacetFields facetFields, boolean withContent) throws Exception {
for (int i = 0; i < 2; i++) {
Document doc = new Document();
if (withContent) {
doc.add(new StringField("f", "a", Store.NO));
}
if (facetFields != null) {
facetFields.addFields(doc, Collections.singletonList(new CategoryPath("A", Integer.toString(i))));
}
indexWriter.addDocument(doc);
}
indexWriter.commit();
}
@Test
public void testSegmentsWithoutCategoriesOrResults() throws Exception {
// tests the accumulator when there are segments with no results
Directory indexDir = newDirectory();
Directory taxoDir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwc.setMergePolicy(NoMergePolicy.COMPOUND_FILES); // prevent merges
IndexWriter indexWriter = new IndexWriter(indexDir, iwc);
FacetIndexingParams fip = new FacetIndexingParams(new CategoryListParams() {
@Override
public CategoryListIterator createCategoryListIterator(int partition) throws IOException {
return new AssertingCategoryListIterator(super.createCategoryListIterator(partition));
}
});
TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
FacetFields facetFields = new FacetFields(taxoWriter, fip);
indexTwoDocs(indexWriter, facetFields, false); // 1st segment, no content, with categories
indexTwoDocs(indexWriter, null, true); // 2nd segment, with content, no categories
indexTwoDocs(indexWriter, facetFields, true); // 3rd segment ok
indexTwoDocs(indexWriter, null, false); // 4th segment, no content, no categories
indexTwoDocs(indexWriter, null, true); // 5th segment, with content, no categories
indexTwoDocs(indexWriter, facetFields, true); // 6th segment, with content, with categories
IOUtils.close(indexWriter, taxoWriter);
DirectoryReader indexReader = DirectoryReader.open(indexDir);
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
// search for "f:a"; only segments 3 and 6 have both matching content and categories
Query q = new TermQuery(new Term("f", "a"));
ArrayList<FacetRequest> requests = new ArrayList<FacetRequest>(1);
CountFacetRequest countNoComplements = new CountFacetRequest(new CategoryPath("A"), 10) {
@Override
public boolean supportsComplements() {
return false; // disable complements
}
};
requests.add(countNoComplements);
FacetSearchParams fsp = new FacetSearchParams(requests, fip);
FacetsCollector fc = new FacetsCollector(fsp, indexReader, taxoReader);
indexSearcher.search(q, fc);
List<FacetResult> results = fc.getFacetResults();
assertEquals("received too many facet results", 1, results.size());
FacetResultNode frn = results.get(0).getFacetResultNode();
assertEquals("wrong weight for \"A\"", 4, (int) frn.getValue());
assertEquals("wrong number of children", 2, frn.getNumSubResults());
for (FacetResultNode node : frn.getSubResults()) {
assertEquals("wrong weight for child " + node.getLabel(), 2, (int) node.getValue());
}
IOUtils.close(indexReader, taxoReader);
IOUtils.close(indexDir, taxoDir);
}
}
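The contract this test exercises is easiest to see as a bare accumulation loop. A minimal sketch, assuming cli is a CategoryListIterator such as the asserting one installed via fip above, and simplified to visit every doc rather than only query matches (counts is a hypothetical flat per-ordinal array, not the real accumulator machinery):

int[] counts = new int[taxoReader.getSize()];
IntsRef ordinals = new IntsRef();
for (AtomicReaderContext context : indexReader.leaves()) {
  if (!cli.setNextReader(context)) {
    continue; // e.g. the 2nd, 4th and 5th segments above: no categories indexed
  }
  int maxDoc = context.reader().maxDoc();
  for (int doc = 0; doc < maxDoc; doc++) {
    cli.getOrdinals(doc, ordinals);
    for (int j = 0; j < ordinals.length; j++) {
      counts[ordinals.ints[j]]++; // tally each ordinal the doc carries
    }
  }
}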

View File

@@ -17,6 +17,7 @@ import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.facet.util.MultiCategoryListIterator;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
@@ -100,21 +101,24 @@ public class MultiCategoryListIteratorTest extends LuceneTestCase {
clCache.loadAndRegister(clp, indexReader, taxoReader, indexingParams);
iterators[i] = clCache.get(clp).iterator(0); // no partitions
} else {
iterators[i] = new PayloadCategoryListIteraor(indexReader, clp.getTerm(), decoder);
iterators[i] = new PayloadCategoryListIteraor(clp.getTerm(), decoder);
}
}
MultiCategoryListIterator cli = new MultiCategoryListIterator(iterators);
assertTrue("failed to init multi-iterator", cli.init());
IntsRef ordinals = new IntsRef();
int maxDoc = indexReader.maxDoc();
for (int i = 0; i < maxDoc; i++) {
cli.getOrdinals(i, ordinals);
assertTrue("document " + i + " does not have categories", ordinals.length > 0);
for (int j = 0; j < ordinals.length; j++) {
CategoryPath cp = taxoReader.getPath(ordinals.ints[j]);
assertNotNull("ordinal " + ordinals.ints[j] + " not found in taxonomy", cp);
if (cp.length == 2) {
assertEquals("invalid category for document " + i, i, Integer.parseInt(cp.components[1]));
for (AtomicReaderContext context : indexReader.leaves()) {
assertTrue("failed to init multi-iterator", cli.setNextReader(context));
IntsRef ordinals = new IntsRef();
final int maxDoc = context.reader().maxDoc();
for (int i = 0; i < maxDoc; i++) {
cli.getOrdinals(i, ordinals);
assertTrue("document " + i + " does not have categories", ordinals.length > 0);
for (int j = 0; j < ordinals.length; j++) {
CategoryPath cp = taxoReader.getPath(ordinals.ints[j]);
assertNotNull("ordinal " + ordinals.ints[j] + " not found in taxonomy", cp);
if (cp.length == 2) {
int globalDoc = i + context.docBase;
assertEquals("invalid category for document " + globalDoc, globalDoc, Integer.parseInt(cp.components[1]));
}
}
}
}
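The docBase addition in the updated loop is the standard translation from segment-local doc IDs back to the top-level doc ID space. A minimal sketch, assuming an open composite indexReader:

for (AtomicReaderContext context : indexReader.leaves()) {
  int docBase = context.docBase; // offset of this segment's first doc in the composite reader
  for (int localDoc = 0; localDoc < context.reader().maxDoc(); localDoc++) {
    int globalDoc = docBase + localDoc; // the ID to use against indexReader itself
  }
}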

View File

@@ -59,9 +59,8 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
return res;
}
protected abstract FacetsAccumulator getSamplingAccumulator(Sampler sampler,
TaxonomyReader taxoReader, IndexReader indexReader,
FacetSearchParams searchParams);
protected abstract FacetsAccumulator getSamplingAccumulator(Sampler sampler, TaxonomyReader taxoReader,
IndexReader indexReader, FacetSearchParams searchParams);
/**
* Try out faceted search with sampling enabled and complements either disabled or enforced
@@ -89,7 +88,7 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
// try several times in case of failure, because the test has a chance to fail
// if the top K facets are not sufficiently common with the sample set
for (int nTrial=0; nTrial<RETRIES; nTrial++) {
for (int nTrial = 0; nTrial < RETRIES; nTrial++) {
try {
// complement with sampling!
final Sampler sampler = createSampler(nTrial, docCollector.getScoredDocIDs(), useRandomSampler);
@@ -99,7 +98,7 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
break; // succeeded
} catch (NotSameResultError e) {
if (nTrial>=RETRIES-1) {
if (nTrial >= RETRIES - 1) {
throw e; // no more retries allowed, must fail
}
}
@@ -119,14 +118,11 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
assertSameResults(expected, sampledResults);
}
private FacetsCollector samplingCollector(
final boolean complement,
final Sampler sampler,
private FacetsCollector samplingCollector(final boolean complement, final Sampler sampler,
FacetSearchParams samplingSearchParams) {
FacetsCollector samplingFC = new FacetsCollector(samplingSearchParams, indexReader, taxoReader) {
@Override
protected FacetsAccumulator initFacetsAccumulator(
FacetSearchParams facetSearchParams, IndexReader indexReader,
protected FacetsAccumulator initFacetsAccumulator(FacetSearchParams facetSearchParams, IndexReader indexReader,
TaxonomyReader taxonomyReader) {
FacetsAccumulator acc = getSamplingAccumulator(sampler, taxonomyReader, indexReader, facetSearchParams);
acc.setComplementThreshold(complement ? FacetsAccumulator.FORCE_COMPLEMENT : FacetsAccumulator.DISABLE_COMPLEMENT);
@@ -144,12 +140,13 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
samplingParams.setMinSampleSize((int) (100 * retryFactor));
samplingParams.setMaxSampleSize((int) (10000 * retryFactor));
samplingParams.setOversampleFactor(5.0 * retryFactor);
samplingParams.setSamplingThreshold(11000); //force sampling
Sampler sampler = useRandomSampler ?
new RandomSampler(samplingParams, new Random(random().nextLong())) :
new RepeatableSampler(samplingParams);
assertTrue("must enable sampling for this test!",sampler.shouldSample(scoredDocIDs));
return sampler;
}
}
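For reference, the sampler configuration exercised by createSampler boils down to the following sketch (the retryFactor scaling is dropped for clarity, useRandomSampler is the flag from the surrounding method, and the comments are a reading of the parameters rather than authoritative documentation):

SamplingParams samplingParams = new SamplingParams();
samplingParams.setMinSampleSize(100);       // lower bound on the sample size
samplingParams.setMaxSampleSize(10000);     // upper bound on the sample size
samplingParams.setOversampleFactor(5.0);    // oversample to compensate for sampling error
samplingParams.setSamplingThreshold(11000); // sample only once the result set exceeds this size
Sampler sampler = useRandomSampler
    ? new RandomSampler(samplingParams, new Random(random().nextLong()))
    : new RepeatableSampler(samplingParams);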

View File

@@ -0,0 +1,65 @@
package org.apache.lucene.facet.util;
import java.io.IOException;
import org.apache.lucene.facet.search.CategoryListIterator;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A {@link CategoryListIterator} which asserts that
* {@link #getOrdinals(int, IntsRef)} is not called before
* {@link #setNextReader(AtomicReaderContext)} and that if
* {@link #setNextReader(AtomicReaderContext)} returns false,
* {@link #getOrdinals(int, IntsRef)} isn't called.
*/
public class AssertingCategoryListIterator implements CategoryListIterator {
private final CategoryListIterator delegate;
private boolean setNextReaderCalled = false;
private boolean validSegment = false;
private int maxDoc;
public AssertingCategoryListIterator(CategoryListIterator delegate) {
this.delegate = delegate;
}
@Override
public boolean setNextReader(AtomicReaderContext context) throws IOException {
setNextReaderCalled = true;
maxDoc = context.reader().maxDoc();
return validSegment = delegate.setNextReader(context);
}
@Override
public void getOrdinals(int docID, IntsRef ints) throws IOException {
if (!setNextReaderCalled) {
throw new RuntimeException("should not call getOrdinals without setNextReader first");
}
if (!validSegment) {
throw new RuntimeException("should not call getOrdinals if setNextReader returned false");
}
if (docID >= maxDoc) {
throw new RuntimeException("docID is larger than current maxDoc; forgot to call setNextReader?");
}
delegate.getOrdinals(docID, ints);
}
}
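A short sketch of the misuse cases this wrapper turns into hard failures (delegate and leaf are hypothetical here, e.g. a PayloadCategoryListIteraor and one entry of reader.leaves()):

CategoryListIterator cli = new AssertingCategoryListIterator(delegate);
IntsRef ints = new IntsRef();
// cli.getOrdinals(0, ints);  // would throw: setNextReader has not been called yet
if (cli.setNextReader(leaf)) {
  cli.getOrdinals(0, ints);   // OK: segment-local docID within [0, maxDoc)
  // cli.getOrdinals(leaf.reader().maxDoc(), ints); // would throw: docID beyond this segment
} else {
  // cli.getOrdinals(0, ints); // would throw: this segment has no category data
}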

View File

@@ -9,6 +9,9 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.facet.search.ScoredDocIDs;
import org.apache.lucene.facet.search.ScoredDocIDsIterator;
import org.apache.lucene.facet.search.ScoredDocIdCollector;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
@@ -21,14 +24,9 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.OpenBitSetDISI;
import org.junit.Test;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.facet.search.ScoredDocIDs;
import org.apache.lucene.facet.search.ScoredDocIDsIterator;
import org.apache.lucene.facet.search.ScoredDocIdCollector;
import org.junit.Test;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -52,21 +50,21 @@ public class TestScoredDocIDsUtils extends LuceneTestCase {
@Test
public void testComplementIterator() throws Exception {
final int n = atLeast(10000);
final OpenBitSet bits = new OpenBitSet(n);
for (int i = 0; i < 5 * n; i++) {
bits.flip(random().nextInt(n));
final FixedBitSet bits = new FixedBitSet(n);
Random random = random();
for (int i = 0; i < n; i++) {
int idx = random.nextInt(n);
bits.flip(idx, idx + 1);
}
OpenBitSet verify = new OpenBitSet(n);
verify.or(bits);
FixedBitSet verify = new FixedBitSet(bits);
ScoredDocIDs scoredDocIDs = ScoredDocIdsUtils.createScoredDocIds(bits, n);
Directory dir = newDirectory();
IndexReader reader = createReaderWithNDocs(random(), n, dir);
IndexReader reader = createReaderWithNDocs(random, n, dir);
try {
assertEquals(n - verify.cardinality(), ScoredDocIdsUtils.getComplementSet(scoredDocIDs,
reader).size());
assertEquals(n - verify.cardinality(), ScoredDocIdsUtils.getComplementSet(scoredDocIDs, reader).size());
} finally {
reader.close();
dir.close();
@@ -147,7 +145,7 @@ public class TestScoredDocIDsUtils extends LuceneTestCase {
searcher.search(q, collector);
ScoredDocIDs scoredDocIds = collector.getScoredDocIDs();
OpenBitSet resultSet = new OpenBitSetDISI(scoredDocIds.getDocIDs().iterator(), reader.maxDoc());
FixedBitSet resultSet = (FixedBitSet) scoredDocIds.getDocIDs();
// Getting the complement set of the query result
ScoredDocIDs complementSet = ScoredDocIdsUtils.getComplementSet(scoredDocIds, reader);
@@ -164,12 +162,11 @@ public class TestScoredDocIDsUtils extends LuceneTestCase {
assertFalse(
"Complement-Set must not contain deleted documents (doc="+docNum+")",
live != null && !live.get(docNum));
assertNull(
"Complement-Set must not contain docs from the original set (doc="+ docNum+")",
assertNull("Complement-Set must not contain docs from the original set (doc="+ docNum+")",
reader.document(docNum).getField("del"));
assertFalse(
"Complement-Set must not contain docs from the original set (doc="+docNum+")",
resultSet.fastGet(docNum));
resultSet.get(docNum));
}
} finally {
reader.close();