mirror of https://github.com/apache/lucene.git
LUCENE-4683: Change Aggregator and CategoryListIterator to be per-segment
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1432890 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 3552167217
commit 93b0a15183
@@ -122,6 +122,10 @@ Changes in backwards compatibility policy
   result, few other classes such as Aggregator and CategoryListIterator were
   changed to handle bulk category ordinals. (Shai Erera)
 
+* LUCENE-4683: CategoryListIterator and Aggregator are now per-segment. As such
+  their implementations no longer take a top-level IndexReader in the constructor
+  but rather implement a setNextReader. (Shai Erera)
+
 New Features
 
 * LUCENE-4226: New experimental StoredFieldsFormat that compresses chunks of
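Taken together, the migration this commit asks of callers looks roughly like the sketch below (assuming an IndexReader named indexReader and a CategoryListIterator cli obtained from CategoryListParams; the pattern mirrors the test changes at the bottom of this commit):

    // Per-segment iteration after LUCENE-4683: drive the iterator one
    // AtomicReaderContext at a time; doc IDs passed to getOrdinals() are
    // local to the current segment. Assumes the usual imports
    // (AtomicReaderContext, IntsRef).
    IntsRef ordinals = new IntsRef(32);
    for (AtomicReaderContext context : indexReader.leaves()) {
      if (!cli.setNextReader(context)) {
        continue; // this segment has no category data for this iterator
      }
      int maxDoc = context.reader().maxDoc();
      for (int doc = 0; doc < maxDoc; doc++) {
        cli.getOrdinals(doc, ordinals);
        // ordinals.length == 0 means the document has no category ordinals
      }
    }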
@@ -38,7 +38,7 @@ import org.apache.lucene.search.DocIdSetIterator;
 
 public final class FixedBitSet extends DocIdSet implements Bits {
   private final long[] bits;
-  private int numBits;
+  private final int numBits;
 
   /** returns the number of 64 bit words it would take to hold numBits */
   public static int bits2words(int numBits) {
@@ -3,7 +3,7 @@ package org.apache.lucene.facet.associations;
 import java.io.IOException;
 
 import org.apache.lucene.facet.search.PayloadIterator;
-import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.util.BytesRef;
@@ -46,12 +46,21 @@ public abstract class AssociationsPayloadIterator<T extends CategoryAssociation>
    * It is assumed that all association values can be deserialized with the
    * given {@link CategoryAssociation}.
    */
-  public AssociationsPayloadIterator(IndexReader reader, String field, T association) throws IOException {
-    pi = new PayloadIterator(reader, new Term(field, association.getCategoryListID()));
-    hasAssociations = pi.init();
+  public AssociationsPayloadIterator(String field, T association) throws IOException {
+    pi = new PayloadIterator(new Term(field, association.getCategoryListID()));
     this.association = association;
   }
 
+  /**
+   * Sets the {@link AtomicReaderContext} for which {@link #setNextDoc(int)}
+   * calls will be made. Returns true iff this reader has associations for any
+   * of the documents belonging to the association given to the constructor.
+   */
+  public final boolean setNextReader(AtomicReaderContext context) throws IOException {
+    hasAssociations = pi.setNextReader(context);
+    return hasAssociations;
+  }
+
   /**
    * Skip to the requested document. Returns true iff the document has category
    * association values and they were read successfully. Associations are
@@ -2,7 +2,6 @@ package org.apache.lucene.facet.associations;
 
 import java.io.IOException;
 
-import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.util.collections.IntToFloatMap;
 
 /*
@@ -31,9 +30,8 @@ public class FloatAssociationsPayloadIterator extends AssociationsPayloadIterator
 
   private final IntToFloatMap ordinalAssociations = new IntToFloatMap();
 
-  public FloatAssociationsPayloadIterator(IndexReader reader, String field, CategoryFloatAssociation association)
-      throws IOException {
-    super(reader, field, association);
+  public FloatAssociationsPayloadIterator(String field, CategoryFloatAssociation association) throws IOException {
+    super(field, association);
   }
 
   @Override
@@ -2,7 +2,6 @@ package org.apache.lucene.facet.associations;
 
 import java.io.IOException;
 
-import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.util.collections.IntToIntMap;
 
 /*
@@ -31,9 +30,8 @@ public class IntAssociationsPayloadIterator extends AssociationsPayloadIterator<
 
   private final IntToIntMap ordinalAssociations = new IntToIntMap();
 
-  public IntAssociationsPayloadIterator(IndexReader reader, String field, CategoryIntAssociation association)
-      throws IOException {
-    super(reader, field, association);
+  public IntAssociationsPayloadIterator(String field, CategoryIntAssociation association) throws IOException {
+    super(field, association);
   }
 
   @Override
@@ -3,13 +3,10 @@ package org.apache.lucene.facet.index.params;
 import java.io.IOException;
 import java.io.Serializable;
 
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
-
 import org.apache.lucene.facet.search.CategoryListIterator;
 import org.apache.lucene.facet.search.PayloadCategoryListIteraor;
 import org.apache.lucene.facet.search.TotalFacetCounts;
 import org.apache.lucene.facet.util.PartitionsUtils;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.util.encoding.DGapIntEncoder;
 import org.apache.lucene.util.encoding.IntDecoder;
 import org.apache.lucene.util.encoding.IntEncoder;
@@ -98,11 +95,6 @@ public class CategoryListParams implements Serializable {
     return new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())));
   }
 
-  /**
-   * Equality is defined by the 'term' that defines this category list.
-   * Sub-classes should override this method if a more complex calculation
-   * is needed to ensure equality.
-   */
   @Override
   public boolean equals(Object o) {
     if (o == this) {
@@ -121,29 +113,16 @@ public class CategoryListParams implements Serializable {
     return this.term.equals(other.term);
   }
 
-  /**
-   * Hashcode is similar to {@link #equals(Object)}, in that it uses
-   * the term that defines this category list to derive the hashcode.
-   * Subclasses need to ensure that equality/hashcode is correctly defined,
-   * or there could be side-effects in the {@link TotalFacetCounts} caching
-   * mechanism (as the filename for a Total Facet Counts array cache
-   * is dependent on the hashCode, so it should consistently return the same
-   * hash for identity).
-   */
   @Override
   public int hashCode() {
     return this.hashCode;
   }
 
-  /**
-   * Create the category list iterator for the specified partition.
-   */
-  public CategoryListIterator createCategoryListIterator(IndexReader reader,
-      int partition) throws IOException {
+  /** Create the {@link CategoryListIterator} for the specified partition. */
+  public CategoryListIterator createCategoryListIterator(int partition) throws IOException {
     String categoryListTermStr = PartitionsUtils.partitionName(this, partition);
     Term payloadTerm = new Term(term.field(), categoryListTermStr);
-    return new PayloadCategoryListIteraor(reader, payloadTerm,
-        createEncoder().createMatchingDecoder());
+    return new PayloadCategoryListIteraor(payloadTerm, createEncoder().createMatchingDecoder());
   }
 
 }
@@ -50,7 +50,7 @@ public final class AdaptiveFacetsAccumulator extends StandardFacetsAccumulator {
    * Create an {@link AdaptiveFacetsAccumulator}
    * @see StandardFacetsAccumulator#StandardFacetsAccumulator(FacetSearchParams, IndexReader, TaxonomyReader)
    */
-  public AdaptiveFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader,
+  public AdaptiveFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader,
       TaxonomyReader taxonomyReader) {
     super(searchParams, indexReader, taxonomyReader);
   }
@@ -2,6 +2,7 @@ package org.apache.lucene.facet.search;
 
 import java.io.IOException;
 
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.util.IntsRef;
 
 /*
@@ -23,6 +24,8 @@ import org.apache.lucene.util.IntsRef;
 
 /**
  * An interface for obtaining the category ordinals of documents.
+ * {@link #getOrdinals(int, IntsRef)} calls are done with document IDs that are
+ * local to the reader given to {@link #setNextReader(AtomicReaderContext)}.
  * <p>
  * <b>NOTE:</b> this class operates as a key to a map, and therefore you should
  * implement {@code equals()} and {@code hashCode()} for proper behavior.
@@ -32,19 +35,20 @@ import org.apache.lucene.util.IntsRef;
 public interface CategoryListIterator {
 
   /**
-   * Initializes the iterator. This method must be called before any calls to
-   * {@link #getOrdinals(int, IntsRef)}, and its return value indicates whether there are
-   * any relevant documents for this iterator.
+   * Sets the {@link AtomicReaderContext} for which
+   * {@link #getOrdinals(int, IntsRef)} calls will be made. Returns true iff any
+   * of the documents in this reader have category ordinals. This method must be
+   * called before any calls to {@link #getOrdinals(int, IntsRef)}.
    */
-  public boolean init() throws IOException;
+  public boolean setNextReader(AtomicReaderContext context) throws IOException;
 
   /**
    * Stores the category ordinals of the given document ID in the given
    * {@link IntsRef}, starting at position 0 upto {@link IntsRef#length}. Grows
    * the {@link IntsRef} if it is not large enough.
    *
    * <p>
-   * <b>NOTE:</b> if the requested document does not category ordinals
+   * <b>NOTE:</b> if the requested document does not have category ordinals
    * associated with it, {@link IntsRef#length} is set to zero.
    */
   public void getOrdinals(int docID, IntsRef ints) throws IOException;
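For contrast with the removed init() contract, a bare-bones implementation shape follows (a sketch only; the class name is hypothetical, but the docBase bookkeeping matches the RAMCategoryListIterator change later in this commit):

    import java.io.IOException;
    import org.apache.lucene.index.AtomicReaderContext;
    import org.apache.lucene.util.IntsRef;

    // Sketch: a CategoryListIterator over ordinals cached per global doc ID.
    class CachedCategoryListIterator implements CategoryListIterator {
      private final int[][] ordinalsByGlobalDoc; // hypothetical pre-built cache
      private int docBase;

      CachedCategoryListIterator(int[][] ordinalsByGlobalDoc) {
        this.ordinalsByGlobalDoc = ordinalsByGlobalDoc;
      }

      @Override
      public boolean setNextReader(AtomicReaderContext context) throws IOException {
        docBase = context.docBase; // remember where this segment starts
        return ordinalsByGlobalDoc != null;
      }

      @Override
      public void getOrdinals(int docID, IntsRef ints) throws IOException {
        ints.length = 0;
        int[] ords = ordinalsByGlobalDoc[docID + docBase]; // rebase local -> global
        if (ords != null) {
          if (ints.ints.length < ords.length) {
            ints.grow(ords.length);
          }
          for (int i = 0; i < ords.length; i++) {
            ints.ints[ints.length++] = ords[i];
          }
        }
      }
    }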
@@ -2,7 +2,7 @@ package org.apache.lucene.facet.search;
 
 import java.io.IOException;
 
-import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IntsRef;
@@ -34,17 +34,15 @@ import org.apache.lucene.util.encoding.IntDecoder;
 public class PayloadCategoryListIteraor implements CategoryListIterator {
 
   private final IntDecoder decoder;
-  private final IndexReader indexReader;
   private final Term term;
   private final PayloadIterator pi;
   private final int hashCode;
 
-  public PayloadCategoryListIteraor(IndexReader indexReader, Term term, IntDecoder decoder) throws IOException {
-    pi = new PayloadIterator(indexReader, term);
+  public PayloadCategoryListIteraor(Term term, IntDecoder decoder) throws IOException {
+    pi = new PayloadIterator(term);
     this.decoder = decoder;
-    hashCode = indexReader.hashCode() ^ term.hashCode();
+    hashCode = term.hashCode();
     this.term = term;
-    this.indexReader = indexReader;
   }
 
   @Override
@@ -58,7 +56,7 @@ public class PayloadCategoryListIteraor implements CategoryListIterator {
     }
 
     // Hash codes are the same, check equals() to avoid cases of hash-collisions.
-    return indexReader.equals(that.indexReader) && term.equals(that.term);
+    return term.equals(that.term);
   }
 
   @Override
@@ -67,8 +65,8 @@ public class PayloadCategoryListIteraor implements CategoryListIterator {
   }
 
   @Override
-  public boolean init() throws IOException {
-    return pi.init();
+  public boolean setNextReader(AtomicReaderContext context) throws IOException {
+    return pi.setNextReader(context);
   }
 
   @Override
@@ -1,12 +1,10 @@
 package org.apache.lucene.facet.search;
 
 import java.io.IOException;
-import java.util.Iterator;
 
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
@@ -42,99 +40,75 @@ import org.apache.lucene.util.BytesRef;
  */
 public class PayloadIterator {
 
   protected BytesRef data;
 
   private TermsEnum reuseTE;
-  private DocsAndPositionsEnum currentDPE;
+  private DocsAndPositionsEnum dpe;
   private boolean hasMore;
-  private int curDocID, curDocBase;
+  private int curDocID;
 
-  private final Iterator<AtomicReaderContext> leaves;
   private final Term term;
 
-  public PayloadIterator(IndexReader indexReader, Term term) throws IOException {
-    leaves = indexReader.leaves().iterator();
+  public PayloadIterator(Term term) throws IOException {
     this.term = term;
   }
 
-  private void nextSegment() throws IOException {
+  /**
+   * Sets the {@link AtomicReaderContext} for which {@link #getPayload(int)}
+   * calls will be made. Returns true iff this reader has payload for any of the
+   * documents belonging to the {@link Term} given to the constructor.
+   */
+  public boolean setNextReader(AtomicReaderContext context) throws IOException {
     hasMore = false;
-    while (leaves.hasNext()) {
-      AtomicReaderContext ctx = leaves.next();
-      curDocBase = ctx.docBase;
-      Fields fields = ctx.reader().fields();
-      if (fields != null) {
-        Terms terms = fields.terms(term.field());
-        if (terms != null) {
-          reuseTE = terms.iterator(reuseTE);
-          if (reuseTE.seekExact(term.bytes(), true)) {
-            // this class is usually used to iterate on whatever a Query matched
-            // if it didn't match deleted documents, we won't receive them. if it
-            // did, we should iterate on them too, therefore we pass liveDocs=null
-            currentDPE = reuseTE.docsAndPositions(null, currentDPE, DocsAndPositionsEnum.FLAG_PAYLOADS);
-            if (currentDPE != null && (curDocID = currentDPE.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
-              hasMore = true;
-              break;
-            }
-          }
-        }
-      }
-    }
-  }
+    Fields fields = context.reader().fields();
+    if (fields != null) {
+      Terms terms = fields.terms(term.field());
+      if (terms != null) {
+        reuseTE = terms.iterator(reuseTE);
+        if (reuseTE.seekExact(term.bytes(), true)) {
+          // this class is usually used to iterate on whatever a Query matched
+          // if it didn't match deleted documents, we won't receive them. if it
+          // did, we should iterate on them too, therefore we pass liveDocs=null
+          dpe = reuseTE.docsAndPositions(null, dpe, DocsAndPositionsEnum.FLAG_PAYLOADS);
+          if (dpe != null && (curDocID = dpe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
+            hasMore = true;
+          }
+        }
+      }
+    }
+    return hasMore;
+  }
 
-  /**
-   * Initialize the iterator. Should be done before the first call to
-   * {@link #getPayload(int)}. Returns {@code false} if no category list is
-   * found, or the category list has no documents.
-   */
-  public boolean init() throws IOException {
-    nextSegment();
-    return hasMore;
-  }
-
   /**
    * Returns the {@link BytesRef payload} of the given document, or {@code null}
    * if the document does not exist, there are no more documents in the posting
-   * list, or the document exists but has not payload. You should call
-   * {@link #init()} before the first call to this method.
+   * list, or the document exists but has not payload. The given document IDs
+   * are treated as local to the reader given to
+   * {@link #setNextReader(AtomicReaderContext)}.
    */
   public BytesRef getPayload(int docID) throws IOException {
     if (!hasMore) {
       return null;
     }
 
-    // re-basing docId->localDocID is done fewer times than currentDoc->globalDoc
-    int localDocID = docID - curDocBase;
-
-    if (curDocID > localDocID) {
+    if (curDocID > docID) {
       // document does not exist
       return null;
     }
 
-    if (curDocID < localDocID) {
-      // look for the document either in that segment, or others
-      while (hasMore && (curDocID = currentDPE.advance(localDocID)) == DocIdSetIterator.NO_MORE_DOCS) {
-        nextSegment(); // also updates curDocID
-        localDocID = docID - curDocBase;
-        // nextSegment advances to nextDoc, so check if we still need to advance
-        if (curDocID >= localDocID) {
-          break;
-        }
-      }
-
-      // we break from the above loop when:
-      // 1. we iterated over all segments (hasMore=false)
-      // 2. current segment advanced to a doc, either requested or higher
-      if (!hasMore || curDocID != localDocID) {
-        return null;
-      }
+    if (curDocID < docID) {
+      curDocID = dpe.advance(docID);
+      if (curDocID != docID) { // requested document does not have a payload
+        if (curDocID == DocIdSetIterator.NO_MORE_DOCS) { // no more docs in this reader
+          hasMore = false;
+        }
+        return null;
+      }
     }
 
     // we're on the document
-    assert currentDPE.freq() == 1 : "expecting freq=1 (got " + currentDPE.freq() + ") term=" + term + " doc=" + (curDocID + curDocBase);
-    int pos = currentDPE.nextPosition();
-    assert pos != -1 : "no positions for term=" + term + " doc=" + (curDocID + curDocBase);
-    return currentDPE.getPayload();
+    assert dpe.freq() == 1 : "expecting freq=1 (got " + dpe.freq() + ") term=" + term + " doc=" + curDocID;
+    int pos = dpe.nextPosition();
+    assert pos != -1 : "no positions for term=" + term + " doc=" + curDocID;
+    return dpe.getPayload();
   }
 
 }
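The simplification above is that getPayload(int) now receives segment-local doc IDs, so the old curDocBase re-basing disappears from this class and moves to whoever drives the iteration. The two ID spaces differ only by the segment's starting offset:

    // Converting between global and segment-local doc IDs:
    int localDocID = globalDocID - context.docBase;   // global -> local
    int backToGlobal = localDocID + context.docBase;  // local -> global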
@@ -62,7 +62,7 @@ public abstract class ScoredDocIdCollector extends Collector {
   }
 
   @Override
-  public ScoredDocIDsIterator scoredDocIdsIterator() {
+  protected ScoredDocIDsIterator scoredDocIdsIterator() {
     return new ScoredDocIDsIterator() {
 
       private DocIdSetIterator docIdsIter = docIds.iterator();
@@ -129,7 +129,7 @@ public abstract class ScoredDocIdCollector extends Collector {
   }
 
   @Override
-  public ScoredDocIDsIterator scoredDocIdsIterator() {
+  protected ScoredDocIDsIterator scoredDocIdsIterator() {
     return new ScoredDocIDsIterator() {
 
       private DocIdSetIterator docIdsIter = docIds.iterator();
@@ -189,8 +189,7 @@ public abstract class ScoredDocIdCollector extends Collector {
    * do not require scoring, it is better to set it to <i>false</i>.
    */
   public static ScoredDocIdCollector create(int maxDoc, boolean enableScoring) {
-    return enableScoring ? new ScoringDocIdCollector(maxDoc)
-        : new NonScoringDocIdCollector(maxDoc);
+    return enableScoring ? new ScoringDocIdCollector(maxDoc) : new NonScoringDocIdCollector(maxDoc);
   }
 
   private ScoredDocIdCollector(int maxDoc) {
@@ -198,13 +197,14 @@ public abstract class ScoredDocIdCollector extends Collector {
     docIds = new FixedBitSet(maxDoc);
   }
 
+  protected abstract ScoredDocIDsIterator scoredDocIdsIterator() throws IOException;
+
   /** Returns the default score used when scoring is disabled. */
   public abstract float getDefaultScore();
 
   /** Set the default score. Only applicable if scoring is disabled. */
   public abstract void setDefaultScore(float defaultScore);
 
-  public abstract ScoredDocIDsIterator scoredDocIdsIterator() throws IOException;
-
   public ScoredDocIDs getScoredDocIDs() {
     return new ScoredDocIDs() {
@@ -4,22 +4,23 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map.Entry;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.util.IntsRef;
-
 import org.apache.lucene.facet.search.aggregator.Aggregator;
-import org.apache.lucene.facet.search.params.FacetSearchParams;
 import org.apache.lucene.facet.search.params.FacetRequest;
+import org.apache.lucene.facet.search.params.FacetSearchParams;
 import org.apache.lucene.facet.search.results.FacetResult;
 import org.apache.lucene.facet.search.results.IntermediateFacetResult;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
 import org.apache.lucene.facet.util.PartitionsUtils;
 import org.apache.lucene.facet.util.ScoredDocIdsUtils;
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.util.IntsRef;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -179,11 +180,11 @@ public class StandardFacetsAccumulator extends FacetsAccumulator {
     List<FacetResult> res = new ArrayList<FacetResult>();
     for (FacetRequest fr : searchParams.getFacetRequests()) {
       FacetResultsHandler frHndlr = fr.createFacetResultsHandler(taxonomyReader);
-      IntermediateFacetResult tmpResult = fr2tmpRes.get(fr);
+      IntermediateFacetResult tmpResult = fr2tmpRes.get(fr);
       if (tmpResult == null) {
         continue; // do not add a null to the list.
       }
-      FacetResult facetRes = frHndlr.renderFacetResult(tmpResult);
+      FacetResult facetRes = frHndlr.renderFacetResult(tmpResult);
       // final labeling if allowed (because labeling is a costly operation)
       if (isAllowLabeling()) {
         frHndlr.labelResult(facetRes);
@@ -213,18 +214,15 @@ public class StandardFacetsAccumulator extends FacetsAccumulator {
 
   /** Check if it is worth to use complements */
   protected boolean shouldComplement(ScoredDocIDs docids) {
-    return
-      mayComplement() &&
-      (docids.size() > indexReader.numDocs() * getComplementThreshold()) ;
+    return mayComplement() && (docids.size() > indexReader.numDocs() * getComplementThreshold()) ;
   }
 
   /**
    * Iterate over the documents for this partition and fill the facet arrays with the correct
    * count/complement count/value.
    * @throws IOException If there is a low-level I/O error.
    */
-  private final void fillArraysForPartition(ScoredDocIDs docids,
-      FacetArrays facetArrays, int partition) throws IOException {
+  private final void fillArraysForPartition(ScoredDocIDs docids, FacetArrays facetArrays, int partition)
+      throws IOException {
 
     if (isUsingComplements) {
       initArraysByTotalCounts(facetArrays, partition, docids.size());
@@ -236,27 +234,41 @@ public class StandardFacetsAccumulator extends FacetsAccumulator {
 
     IntsRef ordinals = new IntsRef(32); // a reasonable start capacity for most common apps
     for (Entry<CategoryListIterator, Aggregator> entry : categoryLists.entrySet()) {
-      CategoryListIterator categoryList = entry.getKey();
-      if (!categoryList.init()) {
-        continue;
-      }
-
-      Aggregator categorator = entry.getValue();
-      ScoredDocIDsIterator iterator = docids.iterator();
+      final ScoredDocIDsIterator iterator = docids.iterator();
+      final CategoryListIterator categoryListIter = entry.getKey();
+      final Aggregator aggregator = entry.getValue();
+      Iterator<AtomicReaderContext> contexts = indexReader.leaves().iterator();
+      AtomicReaderContext current = null;
+      int maxDoc = -1;
       while (iterator.next()) {
         int docID = iterator.getDocID();
-        categoryList.getOrdinals(docID, ordinals);
-        if (ordinals.length == 0) {
-          continue;
+        while (docID >= maxDoc) { // find the segment which contains this document
+          if (!contexts.hasNext()) {
+            throw new RuntimeException("ScoredDocIDs contains documents outside this reader's segments !?");
+          }
+          current = contexts.next();
+          maxDoc = current.docBase + current.reader().maxDoc();
+          if (docID < maxDoc) { // segment has docs, check if it has categories
+            boolean validSegment = categoryListIter.setNextReader(current);
+            validSegment &= aggregator.setNextReader(current);
+            if (!validSegment) { // if categoryList or aggregtor say it's an invalid segment, skip all docs
+              while (docID < maxDoc && iterator.next()) {
+                docID = iterator.getDocID();
+              }
+            }
+          }
         }
-        categorator.aggregate(docID, iterator.getScore(), ordinals);
+        docID -= current.docBase;
+        categoryListIter.getOrdinals(docID, ordinals);
+        if (ordinals.length == 0) {
+          continue; // document does not have category ordinals
+        }
+        aggregator.aggregate(docID, iterator.getScore(), ordinals);
       }
     }
   }
 
-  /**
-   * Init arrays for partition by total counts, optionally applying a factor
-   */
+  /** Init arrays for partition by total counts, optionally applying a factor */
   private final void initArraysByTotalCounts(FacetArrays facetArrays, int partition, int nAccumulatedDocs) {
     int[] intArray = facetArrays.getIntArray();
     totalFacetCounts.fillTotalCountsForPartition(intArray, partition);
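The rewritten loop walks the globally-sorted doc ID stream and hops segments on the fly. A worked example of the arithmetic, with two segments of 100 and 50 documents (the segment sizes are made up for illustration):

    // segment 0: docBase = 0,   reader().maxDoc() = 100 -> maxDoc = 100
    // segment 1: docBase = 100, reader().maxDoc() = 50  -> maxDoc = 150
    // For global docID 137:
    //   137 >= -1  -> advance to segment 0; 137 >= 100 -> advance to segment 1
    //   137 < 150  -> setNextReader(segment 1) on both iterator and aggregator
    //   docID -= current.docBase  -> 137 - 100 = 37 (segment-local)
    //   getOrdinals(37, ordinals); aggregate(37, score, ordinals);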
@@ -302,10 +314,9 @@ public class StandardFacetsAccumulator extends FacetsAccumulator {
 
     for (FacetRequest facetRequest : searchParams.getFacetRequests()) {
       Aggregator categoryAggregator = facetRequest.createAggregator(
-        isUsingComplements, facetArrays, indexReader, taxonomyReader);
+        isUsingComplements, facetArrays, taxonomyReader);
 
-      CategoryListIterator cli =
-        facetRequest.createCategoryListIterator(indexReader, taxonomyReader, searchParams, partition);
+      CategoryListIterator cli = facetRequest.createCategoryListIterator(taxonomyReader, searchParams, partition);
 
       // get the aggregator
       Aggregator old = categoryLists.put(cli, categoryAggregator);
@@ -170,7 +170,7 @@ public class TotalFacetCounts {
     Aggregator aggregator = new CountingAggregator(counts[partition]);
     HashMap<CategoryListIterator, Aggregator> map = new HashMap<CategoryListIterator, Aggregator>();
     for (CategoryListParams clp: facetIndexingParams.getAllCategoryListParams()) {
-      final CategoryListIterator cli = clIteraor(clCache, clp, indexReader, partition);
+      final CategoryListIterator cli = clIteraor(clCache, clp, partition);
       map.put(cli, aggregator);
     }
     return map;
@@ -181,14 +181,14 @@ public class TotalFacetCounts {
     return new TotalFacetCounts(taxonomy, facetIndexingParams, counts, CreationType.Computed);
   }
 
-  static CategoryListIterator clIteraor(CategoryListCache clCache, CategoryListParams clp,
-      IndexReader indexReader, int partition) throws IOException {
+  static CategoryListIterator clIteraor(CategoryListCache clCache, CategoryListParams clp, int partition)
+      throws IOException {
     if (clCache != null) {
       CategoryListData cld = clCache.get(clp);
       if (cld != null) {
         return cld.iterator(partition);
       }
     }
-    return clp.createCategoryListIterator(indexReader, partition);
+    return clp.createCategoryListIterator(partition);
   }
 }
@@ -2,6 +2,7 @@ package org.apache.lucene.facet.search.aggregator;
 
 import java.io.IOException;
 
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.util.IntsRef;
 
 /*
@@ -22,21 +23,22 @@ import org.apache.lucene.util.IntsRef;
  */
 
 /**
- * An Aggregator is the analogue of Lucene's Collector (see
- * {@link org.apache.lucene.search.Collector}), for processing the categories
- * belonging to a certain document. The Aggregator is responsible for doing
- * whatever it wishes with the categories it is fed, e.g., counting the number
- * of times that each category appears, or performing some computation on their
- * association values.
- * <P>
- * Much of the function of an Aggregator implementation is not described by this
- * interface. This includes the constructor and getter methods to retrieve the
- * results of the aggregation.
+ * Aggregates the categories of documents given to
+ * {@link #aggregate(int, float, IntsRef)}. Note that the document IDs are local
+ * to the reader given to {@link #setNextReader(AtomicReaderContext)}.
  *
  * @lucene.experimental
  */
 public interface Aggregator {
 
+  /**
+   * Sets the {@link AtomicReaderContext} for which
+   * {@link #aggregate(int, float, IntsRef)} calls will be made. If this method
+   * returns false, {@link #aggregate(int, float, IntsRef)} should not be called
+   * for this reader.
+   */
+  public boolean setNextReader(AtomicReaderContext context) throws IOException;
+
   /**
    * Aggregate the ordinals of the given document ID (and its score). The given
    * ordinals offset is always zero.
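A minimal implementation of the new contract, for reference (a hedged sketch; the CountingAggregator and ScoringAggregator hunks below return true from setNextReader() in exactly this way because they keep no per-segment state):

    import java.io.IOException;
    import org.apache.lucene.index.AtomicReaderContext;
    import org.apache.lucene.util.IntsRef;

    // Sketch: an Aggregator that counts category occurrences. It keeps no
    // per-segment state, so setNextReader() only reports that the segment
    // is relevant.
    public class SimpleCountingAggregator implements Aggregator {
      private final int[] counts; // indexed by category ordinal

      public SimpleCountingAggregator(int[] counts) {
        this.counts = counts;
      }

      @Override
      public boolean setNextReader(AtomicReaderContext context) throws IOException {
        return true; // nothing segment-specific to set up
      }

      @Override
      public void aggregate(int docID, float score, IntsRef ordinals) throws IOException {
        for (int i = 0; i < ordinals.length; i++) {
          counts[ordinals.ints[i]]++;
        }
      }
    }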
@@ -2,6 +2,7 @@ package org.apache.lucene.facet.search.aggregator;
 
 import java.io.IOException;
 
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.util.IntsRef;
 
 /*
@@ -57,4 +58,9 @@ public class CountingAggregator implements Aggregator {
     return counterArray == null ? 0 : counterArray.hashCode();
   }
 
+  @Override
+  public boolean setNextReader(AtomicReaderContext context) throws IOException {
+    return true;
+  }
+
 }
@@ -2,6 +2,7 @@ package org.apache.lucene.facet.search.aggregator;
 
 import java.io.IOException;
 
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.util.IntsRef;
 
 /*
@@ -58,4 +59,9 @@ public class ScoringAggregator implements Aggregator {
     return hashCode;
   }
 
+  @Override
+  public boolean setNextReader(AtomicReaderContext context) throws IOException {
+    return true;
+  }
+
 }
@@ -6,7 +6,7 @@ import org.apache.lucene.facet.associations.CategoryFloatAssociation;
 import org.apache.lucene.facet.associations.FloatAssociationsPayloadIterator;
 import org.apache.lucene.facet.index.params.CategoryListParams;
 import org.apache.lucene.facet.search.aggregator.Aggregator;
-import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.collections.IntToFloatMap;
@@ -39,13 +39,13 @@ public class AssociationFloatSumAggregator implements Aggregator {
   protected final float[] sumArray;
   protected final FloatAssociationsPayloadIterator associations;
 
-  public AssociationFloatSumAggregator(IndexReader reader, float[] sumArray) throws IOException {
-    this(CategoryListParams.DEFAULT_TERM.field(), reader, sumArray);
+  public AssociationFloatSumAggregator(float[] sumArray) throws IOException {
+    this(CategoryListParams.DEFAULT_TERM.field(), sumArray);
   }
 
-  public AssociationFloatSumAggregator(String field, IndexReader reader, float[] sumArray) throws IOException {
+  public AssociationFloatSumAggregator(String field, float[] sumArray) throws IOException {
     this.field = field;
-    associations = new FloatAssociationsPayloadIterator(reader, field, new CategoryFloatAssociation());
+    associations = new FloatAssociationsPayloadIterator(field, new CategoryFloatAssociation());
     this.sumArray = sumArray;
   }
@@ -76,4 +76,9 @@ public class AssociationFloatSumAggregator implements Aggregator {
     return field.hashCode();
   }
 
+  @Override
+  public boolean setNextReader(AtomicReaderContext context) throws IOException {
+    return associations.setNextReader(context);
+  }
+
 }
@@ -6,7 +6,7 @@ import org.apache.lucene.facet.associations.CategoryIntAssociation;
 import org.apache.lucene.facet.associations.IntAssociationsPayloadIterator;
 import org.apache.lucene.facet.index.params.CategoryListParams;
 import org.apache.lucene.facet.search.aggregator.Aggregator;
-import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.collections.IntToIntMap;
@@ -39,13 +39,13 @@ public class AssociationIntSumAggregator implements Aggregator {
   protected final int[] sumArray;
   protected final IntAssociationsPayloadIterator associations;
 
-  public AssociationIntSumAggregator(IndexReader reader, int[] sumArray) throws IOException {
-    this(CategoryListParams.DEFAULT_TERM.field(), reader, sumArray);
+  public AssociationIntSumAggregator(int[] sumArray) throws IOException {
+    this(CategoryListParams.DEFAULT_TERM.field(), sumArray);
   }
 
-  public AssociationIntSumAggregator(String field, IndexReader reader, int[] sumArray) throws IOException {
+  public AssociationIntSumAggregator(String field, int[] sumArray) throws IOException {
     this.field = field;
-    associations = new IntAssociationsPayloadIterator(reader, field, new CategoryIntAssociation());
+    associations = new IntAssociationsPayloadIterator(field, new CategoryIntAssociation());
     this.sumArray = sumArray;
   }
@@ -76,4 +76,9 @@ public class AssociationIntSumAggregator implements Aggregator {
     return field.hashCode();
   }
 
+  @Override
+  public boolean setNextReader(AtomicReaderContext context) throws IOException {
+    return associations.setNextReader(context);
+  }
+
 }
@@ -6,6 +6,7 @@ import org.apache.lucene.facet.index.params.CategoryListParams;
 import org.apache.lucene.facet.index.params.FacetIndexingParams;
 import org.apache.lucene.facet.search.CategoryListIterator;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.util.IntsRef;
@@ -56,25 +57,30 @@ public class CategoryListData {
   }
 
   /** Compute category list data for caching for faster iteration. */
-  CategoryListData(IndexReader reader, TaxonomyReader taxo,
-      FacetIndexingParams iparams, CategoryListParams clp) throws IOException {
+  CategoryListData(IndexReader reader, TaxonomyReader taxo, FacetIndexingParams iparams, CategoryListParams clp)
+      throws IOException {
 
-    final int maxDoc = reader.maxDoc();
-    int[][][]dpf = new int[maxDoc][][];
+    int[][][]dpf = new int[reader.maxDoc()][][];
     int numPartitions = (int)Math.ceil(taxo.getSize()/(double)iparams.getPartitionSize());
     IntsRef ordinals = new IntsRef(32);
     for (int part = 0; part < numPartitions; part++) {
-      CategoryListIterator cli = clp.createCategoryListIterator(reader, part);
-      if (cli.init()) {
-        for (int doc = 0; doc < maxDoc; doc++) {
-          cli.getOrdinals(doc, ordinals);
-          if (ordinals.length > 0) {
-            if (dpf[doc] == null) {
-              dpf[doc] = new int[numPartitions][];
-            }
-            dpf[doc][part] = new int[ordinals.length];
-            for (int i = 0; i < ordinals.length; i++) {
-              dpf[doc][part][i] = ordinals.ints[i];
+      for (AtomicReaderContext context : reader.leaves()) {
+        CategoryListIterator cli = clp.createCategoryListIterator(part);
+        if (cli.setNextReader(context)) {
+          final int maxDoc = context.reader().maxDoc();
+          for (int i = 0; i < maxDoc; i++) {
+            cli.getOrdinals(i, ordinals);
+            if (ordinals.length > 0) {
+              int doc = i + context.docBase;
+              if (dpf[doc] == null) {
+                dpf[doc] = new int[numPartitions][];
+              }
+              if (dpf[doc][part] == null) {
+                dpf[doc][part] = new int[ordinals.length];
+              }
+              for (int j = 0; j < ordinals.length; j++) {
+                dpf[doc][part][j] = ordinals.ints[j];
+              }
             }
           }
         }
       }
     }
@@ -93,6 +99,7 @@ public class CategoryListData {
   /** Internal: category list iterator over uncompressed category info in RAM */
   private static class RAMCategoryListIterator implements CategoryListIterator {
 
+    private int docBase;
     private final int part;
     private final int[][][] dpc;
 
|
|||
}
|
||||
|
||||
@Override
|
||||
public boolean init() throws IOException {
|
||||
public boolean setNextReader(AtomicReaderContext context) throws IOException {
|
||||
docBase = context.docBase;
|
||||
return dpc != null && dpc.length > part;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void getOrdinals(int docID, IntsRef ints) throws IOException {
|
||||
ints.length = 0;
|
||||
docID += docBase;
|
||||
if (dpc.length > docID && dpc[docID] != null && dpc[docID][part] != null) {
|
||||
if (ints.ints.length < dpc[docID][part].length) {
|
||||
ints.grow(dpc[docID][part].length);
|
||||
|
|
|
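Note how the cache stays keyed by global doc IDs while the iterator API is segment-local: setNextReader() only records the segment's docBase, and getOrdinals() re-bases before indexing the array. For example:

    // dpc is indexed by global doc ID: with docBase = 100, a call to
    // getOrdinals(37, ints) reads dpc[137][part] (null if no categories).
    int globalDoc = docID + docBase; // e.g. 37 + 100 = 137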
@@ -1,7 +1,5 @@
 package org.apache.lucene.facet.search.params;
 
-import org.apache.lucene.index.IndexReader;
-
 import org.apache.lucene.facet.search.FacetArrays;
 import org.apache.lucene.facet.search.aggregator.Aggregator;
 import org.apache.lucene.facet.search.aggregator.ComplementCountingAggregator;
@@ -47,8 +45,7 @@ public class CountFacetRequest extends FacetRequest {
   }
 
   @Override
-  public Aggregator createAggregator(boolean useComplements,
-      FacetArrays arrays, IndexReader reader, TaxonomyReader taxonomy) {
+  public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy) {
     // we rely on that, if needed, result is cleared by arrays!
     int[] a = arrays.getIntArray();
     if (useComplements) {
@@ -2,8 +2,6 @@ package org.apache.lucene.facet.search.params;
 
 import java.io.IOException;
 
-import org.apache.lucene.index.IndexReader;
-
 import org.apache.lucene.facet.index.params.CategoryListParams;
 import org.apache.lucene.facet.search.CategoryListIterator;
 import org.apache.lucene.facet.search.FacetArrays;
@@ -11,8 +9,8 @@ import org.apache.lucene.facet.search.FacetResultsHandler;
 import org.apache.lucene.facet.search.TopKFacetResultsHandler;
 import org.apache.lucene.facet.search.TopKInEachNodeHandler;
 import org.apache.lucene.facet.search.aggregator.Aggregator;
-import org.apache.lucene.facet.search.cache.CategoryListData;
 import org.apache.lucene.facet.search.cache.CategoryListCache;
+import org.apache.lucene.facet.search.cache.CategoryListData;
 import org.apache.lucene.facet.taxonomy.CategoryPath;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
@@ -314,33 +312,29 @@ public abstract class FacetRequest implements Cloneable {
    * computation.
    * @param arrays
    *          provider for facet arrays in use for current computation.
-   * @param indexReader
-   *          index reader in effect.
    * @param taxonomy
    *          reader of taxonomy in effect.
    * @throws IOException If there is a low-level I/O error.
    */
-  public abstract Aggregator createAggregator(boolean useComplements,
-      FacetArrays arrays, IndexReader indexReader,
-      TaxonomyReader taxonomy) throws IOException;
+  public abstract Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy)
+      throws IOException;
 
   /**
-   * Create the category list iterator for the specified partition.
-   * If a non null cache is provided which contains the required data,
-   * use it for the iteration.
+   * Create the category list iterator for the specified partition. If a non
+   * null cache is provided which contains the required data, use it for the
+   * iteration.
    */
-  public CategoryListIterator createCategoryListIterator(IndexReader reader,
-      TaxonomyReader taxo, FacetSearchParams sParams, int partition)
+  public CategoryListIterator createCategoryListIterator(TaxonomyReader taxo, FacetSearchParams sParams, int partition)
       throws IOException {
     CategoryListCache clCache = sParams.getCategoryListCache();
     CategoryListParams clParams = sParams.getFacetIndexingParams().getCategoryListParams(categoryPath);
-    if (clCache!=null) {
+    if (clCache != null) {
       CategoryListData clData = clCache.get(clParams);
-      if (clData!=null) {
+      if (clData != null) {
         return clData.iterator(partition);
       }
     }
-    return clParams.createCategoryListIterator(reader, partition);
+    return clParams.createCategoryListIterator(partition);
   }
 
   /**
@@ -1,7 +1,5 @@
 package org.apache.lucene.facet.search.params;
 
-import org.apache.lucene.index.IndexReader;
-
 import org.apache.lucene.facet.search.FacetArrays;
 import org.apache.lucene.facet.search.aggregator.Aggregator;
 import org.apache.lucene.facet.search.aggregator.ScoringAggregator;
@@ -38,9 +36,7 @@ public class ScoreFacetRequest extends FacetRequest {
   }
 
   @Override
-  public Aggregator createAggregator(boolean useComplements,
-      FacetArrays arrays, IndexReader reader,
-      TaxonomyReader taxonomy) {
+  public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy) {
     assert !useComplements : "complements are not supported by this FacetRequest";
     return new ScoringAggregator(arrays.getFloatArray());
   }
@@ -2,8 +2,6 @@ package org.apache.lucene.facet.search.params.associations;
 
 import java.io.IOException;
 
-import org.apache.lucene.index.IndexReader;
-
 import org.apache.lucene.facet.search.FacetArrays;
 import org.apache.lucene.facet.search.aggregator.Aggregator;
 import org.apache.lucene.facet.search.aggregator.associations.AssociationFloatSumAggregator;
@@ -45,10 +43,10 @@ public class AssociationFloatSumFacetRequest extends FacetRequest {
   }
 
   @Override
-  public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, IndexReader reader,
-      TaxonomyReader taxonomy) throws IOException {
+  public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy)
+      throws IOException {
     assert !useComplements : "complements are not supported by this FacetRequest";
-    return new AssociationFloatSumAggregator(reader, arrays.getFloatArray());
+    return new AssociationFloatSumAggregator(arrays.getFloatArray());
   }
 
   @Override
@@ -2,8 +2,6 @@ package org.apache.lucene.facet.search.params.associations;
 
 import java.io.IOException;
 
-import org.apache.lucene.index.IndexReader;
-
 import org.apache.lucene.facet.search.FacetArrays;
 import org.apache.lucene.facet.search.aggregator.Aggregator;
 import org.apache.lucene.facet.search.aggregator.associations.AssociationIntSumAggregator;
@@ -45,10 +43,10 @@ public class AssociationIntSumFacetRequest extends FacetRequest {
   }
 
   @Override
-  public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, IndexReader reader,
-      TaxonomyReader taxonomy) throws IOException {
+  public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy)
+      throws IOException {
     assert !useComplements : "complements are not supported by this FacetRequest";
-    return new AssociationIntSumAggregator(reader, arrays.getIntArray());
+    return new AssociationIntSumAggregator(arrays.getIntArray());
   }
 
   @Override
@@ -60,6 +60,7 @@ public abstract class Sampler {
 
   /**
    * Construct with certain {@link SamplingParams}
+   *
    * @param params sampling params in effect
    * @throws IllegalArgumentException if the provided SamplingParams are not valid
    */
@@ -110,16 +111,15 @@ public abstract class Sampler {
    * @param sampleSetSize required size of sample set
    * @return sample of the input set in the required size
    */
-  protected abstract SampleResult createSample(ScoredDocIDs docids, int actualSize,
-      int sampleSetSize) throws IOException;
+  protected abstract SampleResult createSample(ScoredDocIDs docids, int actualSize, int sampleSetSize)
+      throws IOException;
 
   /**
    * Get a fixer of sample facet accumulation results. Default implementation
    * returns a <code>TakmiSampleFixer</code> which is adequate only for
    * counting. For any other accumulator, provide a different fixer.
    */
-  public SampleFixer getSampleFixer(
-      IndexReader indexReader, TaxonomyReader taxonomyReader,
+  public SampleFixer getSampleFixer(IndexReader indexReader, TaxonomyReader taxonomyReader,
       FacetSearchParams searchParams) {
     return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams);
   }
@@ -161,10 +161,10 @@ public abstract class Sampler {
     OverSampledFacetRequest sampledFreq = null;
 
     try {
-      sampledFreq = (OverSampledFacetRequest)facetResult.getFacetRequest();
+      sampledFreq = (OverSampledFacetRequest) facetResult.getFacetRequest();
     } catch (ClassCastException e) {
       throw new IllegalArgumentException(
-          "It is only valid to call this method with result obtained for a" +
+          "It is only valid to call this method with result obtained for a " +
           "facet request created through sampler.overSamlpingSearchParams()",
           e);
     }
@@ -215,19 +215,15 @@ public abstract class Sampler {
     }
 
     @Override
-    public CategoryListIterator createCategoryListIterator(IndexReader reader,
-        TaxonomyReader taxo, FacetSearchParams sParams, int partition)
-        throws IOException {
-      return orig.createCategoryListIterator(reader, taxo, sParams, partition);
+    public CategoryListIterator createCategoryListIterator(TaxonomyReader taxo, FacetSearchParams sParams,
+        int partition) throws IOException {
+      return orig.createCategoryListIterator(taxo, sParams, partition);
     }
 
-
     @Override
-    public Aggregator createAggregator(boolean useComplements,
-        FacetArrays arrays, IndexReader indexReader,
-        TaxonomyReader taxonomy) throws IOException {
-      return orig.createAggregator(useComplements, arrays, indexReader,
-          taxonomy);
+    public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy)
+        throws IOException {
+      return orig.createAggregator(useComplements, arrays, taxonomy);
     }
 
     @Override
@@ -245,4 +241,5 @@ public abstract class Sampler {
       return orig.supportsComplements();
     }
   }
+
 }
@@ -91,8 +91,7 @@ class TakmiSampleFixer implements SampleFixer {
    * full set of matching documents.
    * @throws IOException If there is a low-level I/O error.
    */
-  private void recount(FacetResultNode fresNode, ScoredDocIDs docIds)
-      throws IOException {
+  private void recount(FacetResultNode fresNode, ScoredDocIDs docIds) throws IOException {
    // TODO (Facet): change from void to return the new, smaller docSet, and use
    // that for the children, as this will make their intersection ops faster.
    // can do this only when the new set is "sufficiently" smaller.
@@ -109,8 +108,7 @@ class TakmiSampleFixer implements SampleFixer {
     Bits liveDocs = MultiFields.getLiveDocs(indexReader);
     int updatedCount = countIntersection(MultiFields.getTermDocsEnum(indexReader, liveDocs,
                                                                      drillDownTerm.field(), drillDownTerm.bytes(),
-                                                                     0),
-                                         docIds.iterator());
+                                                                     0), docIds.iterator());
 
     fresNode.setValue(updatedCount);
   }
@@ -5,6 +5,7 @@ import java.util.ArrayList;
 import java.util.List;
 
 import org.apache.lucene.facet.search.CategoryListIterator;
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.util.IntsRef;
 
 /*
@@ -42,9 +43,10 @@ public class MultiCategoryListIterator implements CategoryListIterator {
   }
 
   @Override
-  public boolean init() throws IOException {
+  public boolean setNextReader(AtomicReaderContext context) throws IOException {
     validIterators.clear();
     for (CategoryListIterator cli : iterators) {
-      if (cli.init()) {
+      if (cli.setNextReader(context)) {
         validIterators.add(cli);
       }
     }
@@ -3,17 +3,18 @@ package org.apache.lucene.facet.util;
 import java.io.IOException;
 import java.util.Arrays;
 
+import org.apache.lucene.facet.search.ScoredDocIDs;
+import org.apache.lucene.facet.search.ScoredDocIDsIterator;
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.OpenBitSet;
+import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.OpenBitSetDISI;
 
-import org.apache.lucene.facet.search.ScoredDocIDs;
-import org.apache.lucene.facet.search.ScoredDocIDsIterator;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -49,48 +50,57 @@ public class ScoredDocIdsUtils {
    * @param reader holding the number of documents & information about deletions.
    */
   public final static ScoredDocIDs getComplementSet(final ScoredDocIDs docids, final IndexReader reader)
-    throws IOException {
+      throws IOException {
     final int maxDoc = reader.maxDoc();
 
     DocIdSet docIdSet = docids.getDocIDs();
-    final OpenBitSet complement;
-    if (docIdSet instanceof OpenBitSet) {
+    final FixedBitSet complement;
+    if (docIdSet instanceof FixedBitSet) {
       // That is the most common case, if ScoredDocIdsCollector was used.
-      complement = ((OpenBitSet) docIdSet).clone();
+      complement = ((FixedBitSet) docIdSet).clone();
     } else {
-      complement = new OpenBitSetDISI(docIdSet.iterator(), maxDoc);
+      complement = new FixedBitSet(maxDoc);
+      DocIdSetIterator iter = docIdSet.iterator();
+      int doc;
+      while ((doc = iter.nextDoc()) < maxDoc) {
+        complement.set(doc);
+      }
     }
 
     complement.flip(0, maxDoc);
 
     // Remove all Deletions from the complement set
     clearDeleted(reader, complement);
 
     return createScoredDocIds(complement, maxDoc);
   }
 
-  /**
-   * Clear all deleted documents from a given open-bit-set according to a given reader
-   */
-  private static void clearDeleted(final IndexReader reader,
-      final OpenBitSet set) throws IOException {
-
+  /** Clear all deleted documents from a given open-bit-set according to a given reader */
+  private static void clearDeleted(final IndexReader reader, final FixedBitSet set) throws IOException {
     // If there are no deleted docs
     if (!reader.hasDeletions()) {
       return; // return immediately
     }
-
-    Bits bits = MultiFields.getLiveDocs(reader);
-
     DocIdSetIterator it = set.iterator();
-    int doc = DocIdSetIterator.NO_MORE_DOCS;
-    while ((doc = it.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
-      if (!bits.get(doc)) {
-        set.fastClear(doc);
-      }
-    }
+    int doc = it.nextDoc();
+    for (AtomicReaderContext context : reader.leaves()) {
+      AtomicReader r = context.reader();
+      final int maxDoc = r.maxDoc() + context.docBase;
+      if (doc >= maxDoc) { // skip this segment
+        continue;
+      }
+      if (!r.hasDeletions()) { // skip all docs that belong to this reader as it has no deletions
+        while ((doc = it.nextDoc()) < maxDoc) {}
+        continue;
+      }
+      Bits liveDocs = r.getLiveDocs();
+      do {
+        if (!liveDocs.get(doc - context.docBase)) {
+          set.clear(doc);
+        }
+      } while ((doc = it.nextDoc()) < maxDoc);
    }
  }
 
   /**
    * Create a subset of an existing ScoredDocIDs object.
    *
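clearDeleted() now walks the bit-set iterator and the segment list in lockstep instead of consulting one MultiFields live-docs view over the whole index. The per-document check it performs is equivalent to this standalone sketch (a hypothetical helper, not part of the commit):

    import java.io.IOException;
    import org.apache.lucene.index.AtomicReaderContext;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.util.Bits;

    // Sketch: is a global doc ID live? Find its segment, then test that
    // segment's liveDocs with the re-based (segment-local) ID.
    static boolean isLive(IndexReader reader, int globalDoc) throws IOException {
      for (AtomicReaderContext context : reader.leaves()) {
        int maxDoc = context.docBase + context.reader().maxDoc();
        if (globalDoc < maxDoc) {
          Bits liveDocs = context.reader().getLiveDocs();
          // a null liveDocs means the segment has no deletions
          return liveDocs == null || liveDocs.get(globalDoc - context.docBase);
        }
      }
      return false; // beyond the last segment
    }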
@@ -274,8 +284,7 @@ public class ScoredDocIdsUtils {
       if (target <= next) {
         target = next + 1;
       }
-      return next = target >= maxDoc ? NO_MORE_DOCS
-          : target;
+      return next = target >= maxDoc ? NO_MORE_DOCS : target;
     }
 
     @Override
@@ -420,4 +429,5 @@ public class ScoredDocIdsUtils {
       }
     }
   }
+
 }
@@ -317,8 +317,7 @@ public abstract class FacetTestBase extends LuceneTestCase {
   }
 
   /** Validate results equality */
-  protected static void assertSameResults(List<FacetResult> expected,
-      List<FacetResult> actual) {
+  protected static void assertSameResults(List<FacetResult> expected, List<FacetResult> actual) {
     String expectedResults = resStringValueOnly(expected);
     String actualResults = resStringValueOnly(actual);
     if (!expectedResults.equals(actualResults)) {
@@ -29,12 +29,11 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
 public class AdaptiveAccumulatorTest extends BaseSampleTestTopK {
 
   @Override
-  protected FacetsAccumulator getSamplingAccumulator(Sampler sampler,
-      TaxonomyReader taxoReader, IndexReader indexReader,
-      FacetSearchParams searchParams) {
-    AdaptiveFacetsAccumulator res = new AdaptiveFacetsAccumulator(searchParams,
-        indexReader, taxoReader);
+  protected FacetsAccumulator getSamplingAccumulator(Sampler sampler, TaxonomyReader taxoReader,
+      IndexReader indexReader, FacetSearchParams searchParams) {
+    AdaptiveFacetsAccumulator res = new AdaptiveFacetsAccumulator(searchParams, indexReader, taxoReader);
     res.setSampler(sampler);
     return res;
   }
 
 }
@@ -14,6 +14,7 @@ import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
@@ -106,30 +107,31 @@ public class CategoryListIteratorTest extends LuceneTestCase {
    IndexReader reader = writer.getReader();
    writer.close();

-   IntsRef ordinals = new IntsRef();
-   CategoryListIterator cli = new PayloadCategoryListIteraor(reader, new Term("f","1"), encoder.createMatchingDecoder());
-   cli.init();
-   int totalCategories = 0;
-   for (int i = 0; i < data.length; i++) {
-     Set<Integer> values = new HashSet<Integer>();
-     for (int j = 0; j < data[i].length; j++) {
-       values.add(data[i].ints[j]);
-     }
-     cli.getOrdinals(i, ordinals);
-     assertTrue("no ordinals for document " + i, ordinals.length > 0);
-     for (int j = 0; j < ordinals.length; j++) {
-       assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j]));
-     }
-     totalCategories += ordinals.length;
-   }
-   assertEquals("Missing categories!",10,totalCategories);
+   IntsRef ordinals = new IntsRef();
+   CategoryListIterator cli = new PayloadCategoryListIteraor(new Term("f","1"), encoder.createMatchingDecoder());
+   int totalCategories = 0;
+   for (AtomicReaderContext context : reader.leaves()) {
+     cli.setNextReader(context);
+     int maxDoc = context.reader().maxDoc();
+     int dataIdx = context.docBase;
+     for (int doc = 0; doc < maxDoc; doc++, dataIdx++) {
+       Set<Integer> values = new HashSet<Integer>();
+       for (int j = 0; j < data[dataIdx].length; j++) {
+         values.add(data[dataIdx].ints[j]);
+       }
+       cli.getOrdinals(doc, ordinals);
+       assertTrue("no ordinals for document " + doc, ordinals.length > 0);
+       for (int j = 0; j < ordinals.length; j++) {
+         assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j]));
+       }
+       totalCategories += ordinals.length;
+     }
+   }
+   assertEquals("Missing categories!", 10, totalCategories);
    reader.close();
    dir.close();
  }

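The rewritten test above exercises the new contract end to end. As a compact summary, and assuming CategoryListIterator exposes exactly the two methods this commit defines (setNextReader(AtomicReaderContext) and getOrdinals(int, IntsRef)), a generic consumer now looks like this sketch (helper name invented):

import java.io.IOException;
import org.apache.lucene.facet.search.CategoryListIterator;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.IntsRef;

// Hypothetical helper showing the per-segment consumption pattern:
// setNextReader() once per segment, then getOrdinals() per local doc ID.
public class PerSegmentSketch {
  public static int countOrdinals(IndexReader reader, CategoryListIterator cli) throws IOException {
    int total = 0;
    IntsRef ordinals = new IntsRef();
    for (AtomicReaderContext context : reader.leaves()) {
      if (!cli.setNextReader(context)) {
        continue; // this segment has no category data for the iterator
      }
      int maxDoc = context.reader().maxDoc();
      for (int doc = 0; doc < maxDoc; doc++) { // doc is segment-local
        cli.getOrdinals(doc, ordinals);
        total += ordinals.length;
      }
    }
    return total;
  }
}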
  /**
   * Test that a document with no payloads does not confuse the payload decoder.
   */
  @Test
  public void testPayloadIteratorWithInvalidDoc() throws Exception {
    Directory dir = newDirectory();

@@ -160,24 +162,28 @@ public class CategoryListIteratorTest extends LuceneTestCase {
    IndexReader reader = writer.getReader();
    writer.close();

-   IntsRef ordinals = new IntsRef();
-   CategoryListIterator cli = new PayloadCategoryListIteraor(reader, new Term("f","1"), encoder.createMatchingDecoder());
-   assertTrue("Failed to initialize payload iterator", cli.init());
-   int totalCategories = 0;
-   for (int i = 0; i < data.length; i++) {
-     Set<Integer> values = new HashSet<Integer>();
-     for (int j = 0; j < data[i].length; j++) {
-       values.add(data[i].ints[j]);
-     }
-     cli.getOrdinals(i, ordinals);
-     if (i == 0) {
-       assertTrue("document 0 must have a payload", ordinals.length > 0);
-       for (int j = 0; j < ordinals.length; j++) {
-         assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j]));
-       }
-       totalCategories += ordinals.length;
-     } else {
-       assertTrue("only document 0 should have a payload", ordinals.length == 0);
-     }
-   }
+   IntsRef ordinals = new IntsRef();
+   CategoryListIterator cli = new PayloadCategoryListIteraor(new Term("f","1"), encoder.createMatchingDecoder());
+   int totalCategories = 0;
+   for (AtomicReaderContext context : reader.leaves()) {
+     cli.setNextReader(context);
+     int maxDoc = context.reader().maxDoc();
+     int dataIdx = context.docBase;
+     for (int doc = 0; doc < maxDoc; doc++, dataIdx++) {
+       Set<Integer> values = new HashSet<Integer>();
+       for (int j = 0; j < data[dataIdx].length; j++) {
+         values.add(data[dataIdx].ints[j]);
+       }
+       cli.getOrdinals(doc, ordinals);
+       if (dataIdx == 0) {
+         assertTrue("document 0 must have a payload", ordinals.length > 0);
+         for (int j = 0; j < ordinals.length; j++) {
+           assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j]));
+         }
+         totalCategories += ordinals.length;
+       } else {
+         assertTrue("only document 0 should have a payload", ordinals.length == 0);
+       }
+     }
+   }
    assertEquals("Wrong number of total categories!", 2, totalCategories);

@@ -22,6 +22,7 @@ import org.apache.lucene.facet.search.params.FacetRequest;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.index.AtomicReaderContext;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more

@@ -132,8 +133,8 @@ public class TestCategoryListCache extends FacetTestBase {
      }
    }
    @Override
-   public boolean init() throws IOException {
-     return it.init();
+   public boolean setNextReader(AtomicReaderContext context) throws IOException {
+     return it.setNextReader(context);
    }
  };
}

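The cache test above only forwards the new call to a wrapped iterator. A minimal pass-through implementation of the per-segment interface (illustrative only, not part of this commit; it assumes the interface declares exactly the two methods shown throughout this diff) would be:

import java.io.IOException;
import org.apache.lucene.facet.search.CategoryListIterator;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.IntsRef;

// Illustrative pass-through wrapper; real wrappers (caches, asserting
// decorators) add behavior around these two calls.
public class ForwardingCategoryListIterator implements CategoryListIterator {

  private final CategoryListIterator delegate;

  public ForwardingCategoryListIterator(CategoryListIterator delegate) {
    this.delegate = delegate;
  }

  @Override
  public boolean setNextReader(AtomicReaderContext context) throws IOException {
    return delegate.setNextReader(context); // false means: skip this segment
  }

  @Override
  public void getOrdinals(int docID, IntsRef ints) throws IOException {
    delegate.getOrdinals(docID, ints); // docID is segment-local
  }
}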
@@ -0,0 +1,128 @@
package org.apache.lucene.facet.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.facet.index.FacetFields;
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.search.params.CountFacetRequest;
import org.apache.lucene.facet.search.params.FacetRequest;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.facet.util.AssertingCategoryListIterator;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

public class TestStandardFacetsAccumulator extends LuceneTestCase {

  private void indexTwoDocs(IndexWriter indexWriter, FacetFields facetFields, boolean withContent) throws Exception {
    for (int i = 0; i < 2; i++) {
      Document doc = new Document();
      if (withContent) {
        doc.add(new StringField("f", "a", Store.NO));
      }
      if (facetFields != null) {
        facetFields.addFields(doc, Collections.singletonList(new CategoryPath("A", Integer.toString(i))));
      }
      indexWriter.addDocument(doc);
    }

    indexWriter.commit();
  }

  @Test
  public void testSegmentsWithoutCategoriesOrResults() throws Exception {
    // tests the accumulator when there are segments with no results
    Directory indexDir = newDirectory();
    Directory taxoDir = newDirectory();

    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
    iwc.setMergePolicy(NoMergePolicy.COMPOUND_FILES); // prevent merges
    IndexWriter indexWriter = new IndexWriter(indexDir, iwc);
    FacetIndexingParams fip = new FacetIndexingParams(new CategoryListParams() {
      @Override
      public CategoryListIterator createCategoryListIterator(int partition) throws IOException {
        return new AssertingCategoryListIterator(super.createCategoryListIterator(partition));
      }
    });
    TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
    FacetFields facetFields = new FacetFields(taxoWriter, fip);
    indexTwoDocs(indexWriter, facetFields, false); // 1st segment, no content, with categories
    indexTwoDocs(indexWriter, null, true);         // 2nd segment, with content, no categories
    indexTwoDocs(indexWriter, facetFields, true);  // 3rd segment ok
    indexTwoDocs(indexWriter, null, false);        // 4th segment, no content, no categories
    indexTwoDocs(indexWriter, null, true);         // 5th segment, with content, no categories
    indexTwoDocs(indexWriter, facetFields, true);  // 6th segment, with content, with categories
    IOUtils.close(indexWriter, taxoWriter);

    DirectoryReader indexReader = DirectoryReader.open(indexDir);
    TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);

    // search for "f:a", only segments 1 and 3 should match results
    Query q = new TermQuery(new Term("f", "a"));
    ArrayList<FacetRequest> requests = new ArrayList<FacetRequest>(1);
    CountFacetRequest countNoComplements = new CountFacetRequest(new CategoryPath("A"), 10) {
      @Override
      public boolean supportsComplements() {
        return false; // disable complements
      }
    };
    requests.add(countNoComplements);
    FacetSearchParams fsp = new FacetSearchParams(requests, fip);
    FacetsCollector fc = new FacetsCollector(fsp, indexReader, taxoReader);
    indexSearcher.search(q, fc);
    List<FacetResult> results = fc.getFacetResults();
    assertEquals("received too many facet results", 1, results.size());
    FacetResultNode frn = results.get(0).getFacetResultNode();
    assertEquals("wrong weight for \"A\"", 4, (int) frn.getValue());
    assertEquals("wrong number of children", 2, frn.getNumSubResults());
    for (FacetResultNode node : frn.getSubResults()) {
      assertEquals("wrong weight for child " + node.getLabel(), 2, (int) node.getValue());
    }
    IOUtils.close(indexReader, taxoReader);

    IOUtils.close(indexDir, taxoDir);
  }

}

@@ -17,6 +17,7 @@ import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.facet.util.MultiCategoryListIterator;
+import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;

@@ -100,21 +101,24 @@ public class MultiCategoryListIteratorTest extends LuceneTestCase {
        clCache.loadAndRegister(clp, indexReader, taxoReader, indexingParams);
        iterators[i] = clCache.get(clp).iterator(0); // no partitions
      } else {
-       iterators[i] = new PayloadCategoryListIteraor(indexReader, clp.getTerm(), decoder);
+       iterators[i] = new PayloadCategoryListIteraor(clp.getTerm(), decoder);
      }
    }
    MultiCategoryListIterator cli = new MultiCategoryListIterator(iterators);
-   assertTrue("failed to init multi-iterator", cli.init());
-   IntsRef ordinals = new IntsRef();
-   int maxDoc = indexReader.maxDoc();
-   for (int i = 0; i < maxDoc; i++) {
-     cli.getOrdinals(i, ordinals);
-     assertTrue("document " + i + " does not have categories", ordinals.length > 0);
-     for (int j = 0; j < ordinals.length; j++) {
-       CategoryPath cp = taxoReader.getPath(ordinals.ints[j]);
-       assertNotNull("ordinal " + ordinals.ints[j] + " not found in taxonomy", cp);
-       if (cp.length == 2) {
-         assertEquals("invalid category for document " + i, i, Integer.parseInt(cp.components[1]));
+   for (AtomicReaderContext context : indexReader.leaves()) {
+     assertTrue("failed to init multi-iterator", cli.setNextReader(context));
+     IntsRef ordinals = new IntsRef();
+     final int maxDoc = context.reader().maxDoc();
+     for (int i = 0; i < maxDoc; i++) {
+       cli.getOrdinals(i, ordinals);
+       assertTrue("document " + i + " does not have categories", ordinals.length > 0);
+       for (int j = 0; j < ordinals.length; j++) {
+         CategoryPath cp = taxoReader.getPath(ordinals.ints[j]);
+         assertNotNull("ordinal " + ordinals.ints[j] + " not found in taxonomy", cp);
+         if (cp.length == 2) {
+           int globalDoc = i + context.docBase;
+           assertEquals("invalid category for document " + globalDoc, globalDoc, Integer.parseInt(cp.components[1]));
+         }
        }
      }
    }

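The updated loop above resolves each ordinal back to its CategoryPath before comparing labels, after mapping the segment-local doc ID to a global one. Factored out as a hypothetical helper (the name is invented; the API calls are the ones the test itself uses):

import java.io.IOException;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.util.IntsRef;

// Hypothetical validation helper: resolve each ordinal to its CategoryPath
// and fail fast on any ordinal the taxonomy does not know about.
public class OrdinalCheckSketch {
  public static void checkOrdinals(TaxonomyReader taxoReader, IntsRef ordinals) throws IOException {
    for (int j = 0; j < ordinals.length; j++) {
      CategoryPath cp = taxoReader.getPath(ordinals.ints[j]);
      if (cp == null) {
        throw new IllegalStateException("ordinal " + ordinals.ints[j] + " not found in taxonomy");
      }
    }
  }
}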
@@ -59,9 +59,8 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
    return res;
  }

- protected abstract FacetsAccumulator getSamplingAccumulator(Sampler sampler,
-     TaxonomyReader taxoReader, IndexReader indexReader,
-     FacetSearchParams searchParams);
+ protected abstract FacetsAccumulator getSamplingAccumulator(Sampler sampler, TaxonomyReader taxoReader,
+     IndexReader indexReader, FacetSearchParams searchParams);

  /**
   * Try out faceted search with sampling enabled and complements either disabled or enforced

@@ -89,7 +88,7 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
    // try several times in case of failure, because the test has a chance to fail
    // if the top K facets are not sufficiently common with the sample set
-   for (int nTrial=0; nTrial<RETRIES; nTrial++) {
+   for (int nTrial = 0; nTrial < RETRIES; nTrial++) {
      try {
        // complement with sampling!
        final Sampler sampler = createSampler(nTrial, docCollector.getScoredDocIDs(), useRandomSampler);

@@ -99,7 +98,7 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
        break; // succeeded
      } catch (NotSameResultError e) {
-       if (nTrial>=RETRIES-1) {
+       if (nTrial >= RETRIES - 1) {
          throw e; // no more retries allowed, must fail
        }
      }

@@ -119,14 +118,11 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
    assertSameResults(expected, sampledResults);
  }

- private FacetsCollector samplingCollector(
-     final boolean complement,
-     final Sampler sampler,
+ private FacetsCollector samplingCollector(final boolean complement, final Sampler sampler,
      FacetSearchParams samplingSearchParams) {
    FacetsCollector samplingFC = new FacetsCollector(samplingSearchParams, indexReader, taxoReader) {
      @Override
-     protected FacetsAccumulator initFacetsAccumulator(
-         FacetSearchParams facetSearchParams, IndexReader indexReader,
+     protected FacetsAccumulator initFacetsAccumulator(FacetSearchParams facetSearchParams, IndexReader indexReader,
          TaxonomyReader taxonomyReader) {
        FacetsAccumulator acc = getSamplingAccumulator(sampler, taxonomyReader, indexReader, facetSearchParams);
        acc.setComplementThreshold(complement ? FacetsAccumulator.FORCE_COMPLEMENT : FacetsAccumulator.DISABLE_COMPLEMENT);

@@ -144,12 +140,13 @@ public abstract class BaseSampleTestTopK extends BaseTestTopK {
    samplingParams.setMinSampleSize((int) (100 * retryFactor));
    samplingParams.setMaxSampleSize((int) (10000 * retryFactor));
    samplingParams.setOversampleFactor(5.0 * retryFactor);
-   samplingParams.setSamplingThreshold(11000); //force sampling

+   samplingParams.setSamplingThreshold(11000); //force sampling
    Sampler sampler = useRandomSampler ?
        new RandomSampler(samplingParams, new Random(random().nextLong())) :
        new RepeatableSampler(samplingParams);
    assertTrue("must enable sampling for this test!",sampler.shouldSample(scoredDocIDs));
    return sampler;
  }

}

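createSampler above pins sampling on by setting the threshold below the expected matching-doc count. Condensed into a hypothetical factory (the values are copied from the test; the factory name and the import paths are assumptions based on the 4.x facet sampling package, not something this commit defines):

import java.util.Random;
import org.apache.lucene.facet.search.sampling.RandomSampler;
import org.apache.lucene.facet.search.sampling.Sampler;
import org.apache.lucene.facet.search.sampling.SamplingParams;

// Hypothetical factory mirroring the test's setup: any result set larger
// than the threshold gets sampled rather than counted exhaustively.
public class SamplerSketch {
  public static Sampler newSampler(long seed) {
    SamplingParams params = new SamplingParams();
    params.setMinSampleSize(100);
    params.setMaxSampleSize(10000);
    params.setOversampleFactor(5.0);
    params.setSamplingThreshold(11000); // force sampling for sets above this size
    return new RandomSampler(params, new Random(seed));
  }
}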
@@ -0,0 +1,65 @@
package org.apache.lucene.facet.util;

import java.io.IOException;

import org.apache.lucene.facet.search.CategoryListIterator;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.IntsRef;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * A {@link CategoryListIterator} which asserts that
 * {@link #getOrdinals(int, IntsRef)} is not called before
 * {@link #setNextReader(AtomicReaderContext)}, and that if
 * {@link #setNextReader(AtomicReaderContext)} returns false,
 * {@link #getOrdinals(int, IntsRef)} isn't called.
 */
public class AssertingCategoryListIterator implements CategoryListIterator {

  private final CategoryListIterator delegate;
  private boolean setNextReaderCalled = false;
  private boolean validSegment = false;
  private int maxDoc;

  public AssertingCategoryListIterator(CategoryListIterator delegate) {
    this.delegate = delegate;
  }

  @Override
  public boolean setNextReader(AtomicReaderContext context) throws IOException {
    setNextReaderCalled = true;
    maxDoc = context.reader().maxDoc();
    return validSegment = delegate.setNextReader(context);
  }

  @Override
  public void getOrdinals(int docID, IntsRef ints) throws IOException {
    if (!setNextReaderCalled) {
      throw new RuntimeException("should not call getOrdinals without setNextReader first");
    }
    if (!validSegment) {
      throw new RuntimeException("should not call getOrdinals if setNextReader returned false");
    }
    if (docID >= maxDoc) {
      throw new RuntimeException("docID is larger than current maxDoc; forgot to call setNextReader?");
    }
    delegate.getOrdinals(docID, ints);
  }

}

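As the new TestStandardFacetsAccumulator earlier in this commit shows, this wrapper is installed by overriding CategoryListParams.createCategoryListIterator. Repeated here as a compact usage sketch (the enclosing class name is invented; the override itself is verbatim from the test):

import java.io.IOException;
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.search.CategoryListIterator;
import org.apache.lucene.facet.util.AssertingCategoryListIterator;

// Usage sketch taken from TestStandardFacetsAccumulator in this commit:
// every iterator the faceting code creates gets wrapped with assertions.
public class AssertingParamsSketch {
  public static FacetIndexingParams assertingParams() {
    return new FacetIndexingParams(new CategoryListParams() {
      @Override
      public CategoryListIterator createCategoryListIterator(int partition) throws IOException {
        return new AssertingCategoryListIterator(super.createCategoryListIterator(partition));
      }
    });
  }
}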
@@ -9,6 +9,9 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
+import org.apache.lucene.facet.search.ScoredDocIDs;
+import org.apache.lucene.facet.search.ScoredDocIDsIterator;
+import org.apache.lucene.facet.search.ScoredDocIdCollector;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;

@@ -21,14 +24,9 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.OpenBitSet;
-import org.apache.lucene.util.OpenBitSetDISI;
-import org.junit.Test;
-
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.facet.search.ScoredDocIDs;
-import org.apache.lucene.facet.search.ScoredDocIDsIterator;
-import org.apache.lucene.facet.search.ScoredDocIdCollector;
import org.junit.Test;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more

@@ -52,21 +50,21 @@ public class TestScoredDocIDsUtils extends LuceneTestCase {
  @Test
  public void testComplementIterator() throws Exception {
    final int n = atLeast(10000);
-   final OpenBitSet bits = new OpenBitSet(n);
-   for (int i = 0; i < 5 * n; i++) {
-     bits.flip(random().nextInt(n));
+   final FixedBitSet bits = new FixedBitSet(n);
+   Random random = random();
+   for (int i = 0; i < n; i++) {
+     int idx = random.nextInt(n);
+     bits.flip(idx, idx + 1);
    }

-   OpenBitSet verify = new OpenBitSet(n);
-   verify.or(bits);
+   FixedBitSet verify = new FixedBitSet(bits);

    ScoredDocIDs scoredDocIDs = ScoredDocIdsUtils.createScoredDocIds(bits, n);

    Directory dir = newDirectory();
-   IndexReader reader = createReaderWithNDocs(random(), n, dir);
+   IndexReader reader = createReaderWithNDocs(random, n, dir);
    try {
-     assertEquals(n - verify.cardinality(), ScoredDocIdsUtils.getComplementSet(scoredDocIDs,
-         reader).size());
+     assertEquals(n - verify.cardinality(), ScoredDocIdsUtils.getComplementSet(scoredDocIDs, reader).size());
    } finally {
      reader.close();
      dir.close();
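The updated test replaces OpenBitSet with FixedBitSet, toggling bits through the range flip and snapshotting via the copy constructor. A tiny self-contained sketch of exactly those two calls (class name and sizes are arbitrary, not from the commit):

import java.util.Random;
import org.apache.lucene.util.FixedBitSet;

// Demonstrates the two FixedBitSet operations the updated test relies on:
// flip(start, end) over a one-bit range, and the copy constructor.
public class BitsetSketch {
  public static void main(String[] args) {
    int n = 1000;
    FixedBitSet bits = new FixedBitSet(n);
    Random random = new Random(42);
    for (int i = 0; i < n; i++) {
      int idx = random.nextInt(n);
      bits.flip(idx, idx + 1); // toggle a single bit via the range variant
    }
    FixedBitSet verify = new FixedBitSet(bits); // deep copy, same length
    System.out.println(bits.cardinality() == verify.cardinality()); // true
  }
}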
@@ -147,7 +145,7 @@ public class TestScoredDocIDsUtils extends LuceneTestCase {
    searcher.search(q, collector);

    ScoredDocIDs scoredDocIds = collector.getScoredDocIDs();
-   OpenBitSet resultSet = new OpenBitSetDISI(scoredDocIds.getDocIDs().iterator(), reader.maxDoc());
+   FixedBitSet resultSet = (FixedBitSet) scoredDocIds.getDocIDs();

    // Getting the complement set of the query result
    ScoredDocIDs complementSet = ScoredDocIdsUtils.getComplementSet(scoredDocIds, reader);

@@ -164,12 +162,11 @@ public class TestScoredDocIDsUtils extends LuceneTestCase {
        assertFalse(
            "Complement-Set must not contain deleted documents (doc="+docNum+")",
            live != null && !live.get(docNum));
-       assertNull(
-           "Complement-Set must not contain docs from the original set (doc="+ docNum+")",
+       assertNull("Complement-Set must not contain docs from the original set (doc="+ docNum+")",
            reader.document(docNum).getField("del"));
        assertFalse(
            "Complement-Set must not contain docs from the original set (doc="+docNum+")",
-           resultSet.fastGet(docNum));
+           resultSet.get(docNum));
      }
    } finally {
      reader.close();