+ * Create an index and add to it sample documents and facets.
+ * @throws Exception
+ * on error (no detailed exception handling here, for sample
+ * simplicity)
+ */
+ public static void index(IndexWriter iw, LuceneTaxonomyWriter taxo,
+ FacetIndexingParams iParams, String[] docTitles,
+ String[] docTexts, CategoryPath[][] cPaths) throws Exception {
+
+ // loop over sample documents
+ int nDocsAdded = 0;
+ int nFacetsAdded = 0;
+ for (int docNum = 0; docNum < docTexts.length; docNum++) {
+ List
+ * The idea is that implementations implement {@link #build(Document)},
+ * which adds to the given Document whatever {@link Field}s it wants to add. A
+ * DocumentBuilder is also allowed to inspect or change existing Fields in the
+ * Document, if it wishes to.
+ *
+ * Implementations should normally have a constructor with parameters which
+ * determine what {@link #build(Document)} will add to doc.
+ * The interface defines a builder pattern, which allows applications to invoke
+ * several document builders in the following way:
+ *
+ *
+ * A category enhancement can contribute to the index in two possible ways:
+ *
+ * NOTE: The returned array is copied; it is recommended to allocate
+ * a new one each time.
+ *
+ * The bytes generated by this method are the input of
+ * {@link #extractCategoryTokenData(byte[], int, int)}.
+ *
+ * @param categoryAttribute
+ * The attribute of the category.
+ * @return The bytes to be added to the category token payload for this
+ * enhancement.
+ */
+ byte[] getCategoryTokenBytes(CategoryAttribute categoryAttribute);
+
+ /**
+ * Get the data of this enhancement from a category token payload.
+ *
+ * The input bytes for this method are generated in
+ * {@link #getCategoryTokenBytes(CategoryAttribute)}.
+ *
+ * @param buffer
+ * The payload buffer.
+ * @param offset
+ * The offset of this enhancement's data in the buffer.
+ * @param length
+ * The length of this enhancement's data (bytes).
+ * @return An Object containing the data.
+ */
+ Object extractCategoryTokenData(byte[] buffer, int offset, int length);
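+ // A minimal round-trip sketch (hypothetical enhancement, not part of this
+ // patch): whatever getCategoryTokenBytes() writes must be readable back by
+ // extractCategoryTokenData(), e.g. a single int per category:
+ //
+ //   public byte[] getCategoryTokenBytes(CategoryAttribute att) {
+ //     return java.nio.ByteBuffer.allocate(4).putInt(42).array();
+ //   }
+ //
+ //   public Object extractCategoryTokenData(byte[] buffer, int offset, int length) {
+ //     return java.nio.ByteBuffer.wrap(buffer, offset, length).getInt();
+ //   }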
+
+ /**
+ * Declarative method to indicate whether this enhancement generates a
+ * separate category list.
+ *
+ * @return {@code true} if this enhancement generates a category list,
+ * else {@code false}.
+ */
+ boolean generatesCategoryList();
+
+ /**
+ * Returns the text of this enhancement's category list term.
+ *
+ * @return The text of this enhancement's category list term.
+ */
+ String getCategoryListTermText();
+
+ /**
+ * Get the {@link CategoryListTokenizer} which generates the category list
+ * for this enhancement. If {@link #generatesCategoryList()} returns
+ * {@code false} this method will not be called.
+ *
+ * @param tokenizer
+ * The input stream containing categories.
+ * @param indexingParams
+ * The indexing params to use.
+ * @param taxonomyWriter
+ * The taxonomy to add categories and get their ordinals.
+ * @return A {@link CategoryListTokenizer} generating the category list for
+ * this enhancement, with {@code tokenizer} as its input.
+ */
+ CategoryListTokenizer getCategoryListTokenizer(TokenStream tokenizer,
+ EnhancementsIndexingParams indexingParams,
+ TaxonomyWriter taxonomyWriter);
+
+ /**
+ * Get a {@link CategoryProperty} class to be retained when creating
+ * {@link CategoryParentsStream}.
+ *
+ * @return the {@link CategoryProperty} class to be retained when creating
+ * {@link CategoryParentsStream}, or {@code null} if there is no
+ * such property.
+ */
+ Class<? extends CategoryProperty> getRetainableProperty();
+
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/EnhancementsCategoryTokenizer.java b/modules/facet/src/java/org/apache/lucene/facet/enhancements/EnhancementsCategoryTokenizer.java
new file mode 100644
index 00000000000..9d401bcd32e
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/EnhancementsCategoryTokenizer.java
@@ -0,0 +1,121 @@
+package org.apache.lucene.facet.enhancements;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenStream;
+
+import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams;
+import org.apache.lucene.facet.index.streaming.CategoryTokenizer;
+import org.apache.lucene.util.Vint8;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A tokenizer which adds to each category token payload according to the
+ * {@link CategoryEnhancement}s defined in the given
+ * {@link EnhancementsIndexingParams}.
+ *
+ * @lucene.experimental
+ */
+public class EnhancementsCategoryTokenizer extends CategoryTokenizer {
+
+ /**
+ * The data buffer used for the payload instance.
+ */
+ protected byte[] payloadBytes;
+
+ /**
+ * The category enhancements to handle.
+ */
+ protected List
+ * This class leaves to extending classes the definition of the
+ * {@link #merge(CategoryProperty)} policy for the integer associations.
+ *
+ * Note: The association value is added both to a special category list
+ * and to the category tokens.
+ *
+ * @see AssociationEnhancement
+ * @lucene.experimental
+ */
+public abstract class AssociationProperty implements CategoryProperty {
+
+ protected long association = Integer.MAX_VALUE + 1L;
+
+ /**
+ * Construct an {@link AssociationProperty}.
+ *
+ * @param value
+ * The association value.
+ */
+ public AssociationProperty(int value) {
+ this.association = value;
+ }
+
+ /**
+ * Returns the association value.
+ *
+ * @return The association value.
+ */
+ public int getAssociation() {
+ return (int) association;
+ }
+
+ /**
+ * Returns whether this attribute has been set (not all categories have an
+ * association).
+ */
+ public boolean hasBeenSet() {
+ return this.association <= Integer.MAX_VALUE;
+ }
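+ // Illustrative note (not part of the original patch): the sentinel
+ // Integer.MAX_VALUE + 1L == 2147483648L lies one above any valid int
+ // association value, so an unset property fails the test above, while
+ // every value actually set via the int constructor passes it.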
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName() + ": " + association;
+ }
+
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationsPayloadIterator.java b/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationsPayloadIterator.java
new file mode 100644
index 00000000000..bae9a410d64
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationsPayloadIterator.java
@@ -0,0 +1,235 @@
+package org.apache.lucene.facet.enhancements.association;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+
+import org.apache.lucene.facet.index.params.CategoryListParams;
+import org.apache.lucene.facet.search.PayloadIntDecodingIterator;
+import org.apache.lucene.util.collections.IntIterator;
+import org.apache.lucene.util.collections.IntToIntMap;
+import org.apache.lucene.util.encoding.SimpleIntDecoder;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Allows easy iteration over the associations payload, decoding and breaking it
+ * to (ordinal, value) pairs, stored in a hash.
+ *
+ * @lucene.experimental
+ */
+public class AssociationsPayloadIterator {
+
+ /**
+ * Default Term for associations
+ */
+ public static final Term ASSOCIATION_POSTING_TERM = new Term(
+ CategoryListParams.DEFAULT_TERM.field(),
+ AssociationEnhancement.CATEGORY_LIST_TERM_TEXT);
+
+ /**
+ * Hash map from ordinals to the associated int value
+ */
+ private IntToIntMap ordinalToAssociationMap;
+
+ /**
+ * An inner payload decoder which actually goes through the posting and
+ * decodes the ints representing the ordinals and the values
+ */
+ private PayloadIntDecodingIterator associationPayloadIter;
+
+ /**
+ * Marking whether there are associations (at all) in the given index
+ */
+ private boolean hasAssociations = false;
+
+ /**
+ * The long-special-value returned for ordinals which have no associated int
+ * value. It is outside the int range of values, making it a valid marker.
+ */
+ public final static long NO_ASSOCIATION = Integer.MAX_VALUE + 1L;
+
+ /**
+ * Construct a new association-iterator, initializing the inner payload
+ * iterator with the supplied term, and checking whether there are any
+ * associations within the given index.
+ *
+ * @param reader
+ * a reader containing the postings to be iterated
+ * @param field
+ * the field containing the relevant associations list term
+ */
+ public AssociationsPayloadIterator(IndexReader reader, String field)
+ throws IOException {
+ // Initialize the payloadDecodingIterator
+ associationPayloadIter = new PayloadIntDecodingIterator(
+ reader,
+ // TODO (Facet): should consolidate with AssociationListTokenizer which
+ // uses AssociationEnhancement.getCatTermText()
+ new Term(field, AssociationEnhancement.CATEGORY_LIST_TERM_TEXT),
+ new SimpleIntDecoder());
+
+ // Check whether there are any associations
+ hasAssociations = associationPayloadIter.init();
+
+ ordinalToAssociationMap = new IntToIntMap();
+ }
+
+ /**
+ * Skip to the given document, fetching its associations and populating the
+ * map.
+ *
+ * @param docId
+ * document id to be skipped to
+ * @return true if the document contains associations and they were fetched
+ * correctly. false otherwise.
+ * @throws IOException
+ * on error
+ */
+ public boolean setNextDoc(int docId) throws IOException {
+ ordinalToAssociationMap.clear();
+ boolean docContainsAssociations = false;
+ try {
+ docContainsAssociations = fetchAssociations(docId);
+ } catch (IOException e) {
+ IOException ioe = new IOException(
+ "An error occurred while reading a document's associations payload (docId="
+ + docId + ")");
+ ioe.initCause(e);
+ throw ioe;
+ }
+
+ return docContainsAssociations;
+ }
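+ // A hypothetical usage sketch (field name and ordinal are assumed;
+ // getAssociation(ordinal) is the per-ordinal accessor described below):
+ //
+ //   AssociationsPayloadIterator it =
+ //       new AssociationsPayloadIterator(reader, "$facets");
+ //   for (int doc = 0; doc < reader.maxDoc(); doc++) {
+ //     if (it.setNextDoc(doc)) {
+ //       long v = it.getAssociation(ordinal);
+ //       if (v != AssociationsPayloadIterator.NO_ASSOCIATION) {
+ //         int value = (int) v; // the document's association for ordinal
+ //       }
+ //     }
+ //   }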
+
+ /**
+ * Get int association value for the given ordinal. A {@link org.apache.lucene.facet.enhancements.CategoryEnhancement CategoryEnhancement}
+(which can correspond to a
+{@link org.apache.lucene.facet.index.attributes.CategoryProperty CategoryProperty})
+can contribute to the index in two possible ways:
+
+ * Note that both {@link #setCategories(Iterable)} and
+ * {@link #setCategoryPaths(Iterable)} return this
+ * {@link CategoryDocumentBuilder}, allowing the following pattern: {@code new
+ * CategoryDocumentBuilder(taxonomy,
+ * params).setCategories(categories).build(doc)}.
+ *
+ * @lucene.experimental
+ */
+public class CategoryDocumentBuilder implements DocumentBuilder {
+
+ /**
+ * A {@link TaxonomyWriter} for adding categories and retrieving their
+ * ordinals.
+ */
+ protected final TaxonomyWriter taxonomyWriter;
+
+ /**
+ * Parameters to be used when indexing categories.
+ */
+ protected final FacetIndexingParams indexingParams;
+
+ /**
+ * A list of fields which is filled at ancestors' construction and used
+ * during {@link CategoryDocumentBuilder#build(Document)}.
+ */
+ protected final ArrayList
+ * For re-mapping the ordinals before you merge the indexes, do the following:
+ *
+ *
+ * NOTE: while the second example looks simpler, IndexWriter may trigger
+ * a long merge due to addIndexes. The first example avoids this possibly
+ * unneeded merge, and can also be done separately (e.g. on another node)
+ * before the index is merged.
+ *
+ * @lucene.experimental
+ */
+public class FacetsPayloadProcessorProvider extends PayloadProcessorProvider {
+
+ private final Directory workDir;
+
+ private final DirPayloadProcessor dirProcessor;
+
+ /**
+ * Construct FacetsPayloadProcessorProvider with FacetIndexingParams
+ *
+ * @param dir the {@link Directory} containing the segments to update
+ * @param ordinalMap an array mapping previous facets ordinals to new ones
+ * @param indexingParams the facets indexing parameters
+ */
+ public FacetsPayloadProcessorProvider(Directory dir, int[] ordinalMap,
+ FacetIndexingParams indexingParams) {
+ workDir = dir;
+ dirProcessor = new FacetsDirPayloadProcessor(indexingParams, ordinalMap);
+ }
+
+ @Override
+ public DirPayloadProcessor getDirProcessor(Directory dir) throws IOException {
+ if (workDir != dir) {
+ return null;
+ }
+ return dirProcessor;
+ }
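+ // Usage sketch (hypothetical; the exact IndexWriter hook for installing a
+ // PayloadProcessorProvider is assumed here):
+ //
+ //   destWriter.setPayloadProcessorProvider(
+ //       new FacetsPayloadProcessorProvider(sourceDir, ordinalMap, params));
+ //   destWriter.addIndexes(sourceDir);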
+
+ public static class FacetsDirPayloadProcessor extends DirPayloadProcessor {
+
+ private final Map
+ * Ordinal properties are added internally during processing of category
+ * streams, and it is recommended not to use them externally.
+ *
+ * @lucene.experimental
+ */
+public class OrdinalProperty implements CategoryProperty {
+
+ protected int ordinal = -1;
+
+ public int getOrdinal() {
+ return ordinal;
+ }
+
+ public boolean hasBeenSet() {
+ return this.ordinal >= 0;
+ }
+
+ public void setOrdinal(int value) {
+ this.ordinal = value;
+ }
+
+ public void clear() {
+ this.ordinal = -1;
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (other == this) {
+ return true;
+ }
+ if (!(other instanceof OrdinalProperty)) {
+ return false;
+ }
+ OrdinalProperty o = (OrdinalProperty) other;
+ return o.ordinal == this.ordinal;
+ }
+
+ @Override
+ public int hashCode() {
+ return this.ordinal;
+ }
+
+ public void merge(CategoryProperty other) {
+ throw new UnsupportedOperationException(
+ "Merging ordinal attributes is prohibited");
+ }
+
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/attributes/package.html b/modules/facet/src/java/org/apache/lucene/facet/index/attributes/package.html
new file mode 100644
index 00000000000..8964fafa652
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/index/attributes/package.html
@@ -0,0 +1,13 @@
+
+
+ * Default implementation creates a new Sorting(Unique(DGap)) encoder.
+ * Uniqueness in this regard means when the same category appears twice in a
+ * document, only one appearance is encoded. This affects facet
+ * counting results.
+ *
+ * Some possible considerations when overriding may be:
+ *
+ * Getters for partition-size, {@link OrdinalPolicy} and
+ * {@link PathPolicy} are all final, and so the proper way to modify them when
+ * extending this class is through {@link #fixedPartitionSize()},
+ * {@link #fixedOrdinalPolicy()} or {@link #fixedPathPolicy()} accordingly.
+ *
+ * @lucene.experimental
+ */
+public class DefaultFacetIndexingParams implements FacetIndexingParams {
+
+ /**
+ * delimiter between categories in a path, e.g. Products FACET_DELIM
+ * Consumer FACET_DELIM Tv. This should be a character not found in any path
+ * component.
+ */
+ public static final char DEFAULT_FACET_DELIM_CHAR = '\uF749';
+
+ private final CategoryListParams clpParams;
+ private final OrdinalPolicy ordinalPolicy;
+ private final PathPolicy pathPolicy;
+ private final int partitionSize;
+
+ public DefaultFacetIndexingParams() {
+ this(new CategoryListParams());
+ }
+
+ public DefaultFacetIndexingParams(CategoryListParams categoryListParams) {
+ clpParams = categoryListParams;
+ ordinalPolicy = fixedOrdinalPolicy();
+ pathPolicy = fixedPathPolicy();
+ partitionSize = fixedPartitionSize();
+ }
+
+ public CategoryListParams getCategoryListParams(CategoryPath category) {
+ return clpParams;
+ }
+
+ public int drillDownTermText(CategoryPath path, char[] buffer) {
+ return path.copyToCharArray(buffer, 0, -1, getFacetDelimChar());
+ }
+
+ /**
+ * "fixed" partition size.
+ * @see #getPartitionSize()
+ */
+ protected int fixedPartitionSize() {
+ return Integer.MAX_VALUE;
+ }
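+ // A hypothetical subclass, shown for illustration: the partition size is
+ // customized by overriding fixedPartitionSize() rather than via a setter.
+ //
+ //   public class SmallPartitionIndexingParams extends DefaultFacetIndexingParams {
+ //     @Override
+ //     protected int fixedPartitionSize() {
+ //       return 100000; // assumed example value
+ //     }
+ //   }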
+
+ /**
+ * "fixed" ordinal policy.
+ * @see #getOrdinalPolicy()
+ */
+ protected OrdinalPolicy fixedOrdinalPolicy() {
+ return new DefaultOrdinalPolicy();
+ }
+
+ /**
+ * "fixed" path policy.
+ * @see #getPathPolicy()
+ */
+ protected PathPolicy fixedPathPolicy() {
+ return new DefaultPathPolicy();
+ }
+
+ public final int getPartitionSize() {
+ return partitionSize;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see
+ * org.apache.lucene.facet.index.params.FacetIndexingParams#getAllCategoryListParams
+ * ()
+ */
+ public Iterable
+ * If non-default parameters were used during indexing, the same parameters
+ * must also be passed during faceted search. This requirement is analogous
+ * to the requirement during search to know which fields were indexed, and which
+ * Analyzer was used on the text.
+ *
+ * @lucene.experimental
+ */
+public interface FacetIndexingParams extends Serializable {
+
+ /**
+ * The name of the category-list to put this category in, or null if this
+ * category should not be aggregatable.
+ *
+ * By default, all categories are written to the same category list, but
+ * applications which know in advance that in some situations only parts
+ * of the category hierarchy need to be counted can divide the categories
+ * into two or more different category lists.
+ *
+ * If null is returned for a category, it means that this category should
+ * not appear in any category list, and thus counts for it cannot be
+ * aggregated. This category can still be used for drill-down, even though
+ * the count for it is not known.
+ */
+ public CategoryListParams getCategoryListParams(CategoryPath category);
+
+ /**
+ * Return info about all category lists in the index.
+ *
+ * @see #getCategoryListParams(CategoryPath)
+ */
+ public Iterable
+ * Note: Make sure
+ * A 'dimension' is defined as the first or "zero-th" component in a
+ * CategoryPath. For example, if a CategoryPath is defined as
+ * "/Author/American/Mark Twain", then the dimension is "Author".
+ *
+ * This class also uses the 'default' CategoryListParams (as specified by
+ * {@link CategoryListParams#CategoryListParams()} when
+ * {@link #getCategoryListParams(CategoryPath)} is called for a CategoryPath
+ * whose dimension component has not been specifically defined.
+ *
+ * @lucene.experimental
+ */
+public class PerDimensionIndexingParams extends DefaultFacetIndexingParams {
+
+ // "Root" or "first component" of a Category Path maps to a
+ // CategoryListParams
+ private final Map
+ * A CategoryAttributesStream object can be reused for producing more than one
+ * stream. To do that, the user should cause the underlying
+ * Iterable
+ * Note: Sampling accumulation (Accumulation over a sampled-set of the results),
+ * does not guarantee accurate values for
+ * {@link FacetResult#getNumValidDescendants()} &
+ * {@link FacetResultNode#getResidue()}.
+ *
+ * @lucene.experimental
+ */
+public final class AdaptiveFacetsAccumulator extends StandardFacetsAccumulator {
+
+ private Sampler sampler = new Sampler();
+
+ /**
+ * Create an {@link AdaptiveFacetsAccumulator}
+ * @see StandardFacetsAccumulator#StandardFacetsAccumulator(FacetSearchParams, IndexReader, TaxonomyReader)
+ */
+ public AdaptiveFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader,
+ TaxonomyReader taxonomyReader) {
+ super(searchParams, indexReader, taxonomyReader);
+ }
+
+ /**
+ * Create an {@link AdaptiveFacetsAccumulator}
+ * @see StandardFacetsAccumulator#StandardFacetsAccumulator(FacetSearchParams, IndexReader, TaxonomyReader,
+ * IntArrayAllocator, FloatArrayAllocator)
+ */
+ public AdaptiveFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader,
+ TaxonomyReader taxonomyReader, IntArrayAllocator intArrayAllocator,
+ FloatArrayAllocator floatArrayAllocator) {
+ super(searchParams, indexReader, taxonomyReader, intArrayAllocator, floatArrayAllocator);
+ }
+
+ /**
+ * Set the sampler.
+ * @param sampler sampler to set
+ */
+ public void setSampler(Sampler sampler) {
+ this.sampler = sampler;
+ }
+
+ @Override
+ public List
+ * NOTE:
+ *
+ * The facet results handler provided by the {@link FacetRequest} to
+ * a {@link FacetsAccumulator}.
+ *
+ * First it is used by {@link FacetsAccumulator} to obtain a temporary
+ * facet result for each partition and to merge results of several partitions.
+ *
+ * Later the accumulator invokes the handler to render the results, creating
+ * {@link FacetResult} objects.
+ *
+ * Finally, the accumulator invokes the handler to label the final results.
+ *
+ * @lucene.experimental
+ */
+public abstract class FacetResultsHandler {
+
+ /** Taxonomy for which facets are handled */
+ protected final TaxonomyReader taxonomyReader;
+
+ /**
+ * Facet request served by this handler.
+ */
+ protected final FacetRequest facetRequest;
+
+ /**
+ * Create a faceted search handler.
+ * @param taxonomyReader See {@link #getTaxonomyReader()}.
+ * @param facetRequest See {@link #getFacetRequest()}.
+ */
+ public FacetResultsHandler(TaxonomyReader taxonomyReader,
+ FacetRequest facetRequest) {
+ this.taxonomyReader = taxonomyReader;
+ this.facetRequest = facetRequest;
+ }
+
+ /**
+ * Fetch results of a single partition, given facet arrays for that partition,
+ * and based on the matching documents and faceted search parameters.
+ *
+ * @param arrays
+ * facet arrays for the certain partition
+ * @param offset
+ * offset in input arrays where partition starts
+ * @return temporary facet result, potentially, to be passed back to
+ * this result handler for merging, or null in case that
+ * constructor parameter,
+ * Possible use case: a sampling facets accumulator invoked another
+ * facets accumulator on a sample set of documents, obtained
+ * rendered facet results, fixed their counts, and now it is needed
+ * to sort the results differently according to the fixed counts.
+ * @param facetResult result to be rearranged.
+ * @see FacetResultNode#setValue(double)
+ */
+ public abstract FacetResult rearrangeFacetResult(FacetResult facetResult);
+
+ /**
+ * Label results according to settings in {@link FacetRequest},
+ * such as {@link FacetRequest#getNumLabel()}.
+ * Usually invoked by {@link FacetsAccumulator#accumulate(ScoredDocIDs)}
+ * @param facetResult facet result to be labeled.
+ * @throws IOException on error
+ */
+ public abstract void labelResult (FacetResult facetResult) throws IOException;
+
+ /** Return taxonomy reader used for current facets accumulation operation. */
+ public final TaxonomyReader getTaxonomyReader() {
+ return this.taxonomyReader;
+ }
+
+ /** Return the facet request served by this handler. */
+ public final FacetRequest getFacetRequest() {
+ return this.facetRequest;
+ }
+
+ /**
+ * Check if the given facet arrays contain the partition which contains the given ordinal.
+ *
+ * @param ordinal
+ * checked facet
+ * @param facetArrays
+ * facet arrays for the certain partition
+ * @param offset
+ * offset in input arrays where partition starts
+ */
+ protected boolean isSelfPartition (int ordinal, FacetArrays facetArrays, int offset) {
+ int partitionSize = facetArrays.getArraysLength();
+ return ordinal / partitionSize == offset / partitionSize;
+ }
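+ // Illustrative check (hypothetical numbers): with a partition size of 1000,
+ // ordinal 2500 and offset 2000 both fall in partition 2, so the method
+ // returns true; ordinal 500 with offset 2000 returns false.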
+
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/FacetsAccumulator.java b/modules/facet/src/java/org/apache/lucene/facet/search/FacetsAccumulator.java
new file mode 100644
index 00000000000..b707de6e187
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/FacetsAccumulator.java
@@ -0,0 +1,153 @@
+package org.apache.lucene.facet.search;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.index.IndexReader;
+
+import org.apache.lucene.facet.search.params.FacetSearchParams;
+import org.apache.lucene.facet.search.params.FacetRequest;
+import org.apache.lucene.facet.search.results.FacetResult;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Driver for accumulating facets of faceted search requests over given
+ * documents.
+ *
+ * @lucene.experimental
+ */
+public abstract class FacetsAccumulator {
+
+ /**
+ * Default threshold for using the complements optimization.
+ * If accumulating facets for a document set larger than this ratio of the index size,
+ * the complement optimization is applied.
+ * @see #setComplementThreshold(double) for more info on the complements optimization.
+ */
+ public static final double DEFAULT_COMPLEMENT_THRESHOLD = 0.6;
+
+ /**
+ * Passing this to {@link #setComplementThreshold(double)} will disable using complement optimization.
+ */
+ public static final double DISABLE_COMPLEMENT = Double.POSITIVE_INFINITY; // > 1 actually
+
+ /**
+ * Passing this to {@link #setComplementThreshold(double)} will force using complement optimization.
+ */
+ public static final double FORCE_COMPLEMENT = 0; // <=0
+
+ private double complementThreshold = DEFAULT_COMPLEMENT_THRESHOLD;
+
+ protected final TaxonomyReader taxonomyReader;
+ protected final IndexReader indexReader;
+ protected FacetSearchParams searchParams;
+
+ private boolean allowLabeling = true;
+
+ public FacetsAccumulator(FacetSearchParams searchParams,
+ IndexReader indexReader,
+ TaxonomyReader taxonomyReader) {
+ this.indexReader = indexReader;
+ this.taxonomyReader = taxonomyReader;
+ this.searchParams = searchParams;
+ }
+
+ /**
+ * Accumulate facets over given documents, according to facet requests in effect.
+ * @param docids documents (and their scores) for which facets are accumulated.
+ * @return accumulated facets.
+ * @throws IOException on error.
+ */
+ // internal API note: it was considered to move the docids into the constructor as well,
+ // but this prevents nice extension capabilities, especially in the way that
+ // Sampling Accumulator works with the (any) delegated accumulator.
+ public abstract List
+ * Note that this optimization is only available when searching an index
+ * whose {@link IndexReader} implements both
+ * {@link IndexReader#directory()} and {@link IndexReader#getVersion()};
+ * otherwise the optimization is silently disabled regardless of
+ * the complement threshold settings.
+ *
+ * For the default settings see {@link #DEFAULT_COMPLEMENT_THRESHOLD}.
+ *
+ * To force complements in all cases, pass {@link #FORCE_COMPLEMENT}.
+ * This is mostly useful for testing purposes, as forcing complements when only
+ * a tiny fraction of the available documents match the query does not make sense and
+ * would incur performance degradation.
+ *
+ * To disable complements pass {@link #DISABLE_COMPLEMENT}.
+ * @param complementThreshold the complement threshold to set
+ */
+ public void setComplementThreshold(double complementThreshold) {
+ this.complementThreshold = complementThreshold;
+ }
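+ // Usage sketch (hypothetical accumulator instance):
+ //
+ //   accumulator.setComplementThreshold(FacetsAccumulator.DISABLE_COMPLEMENT); // never complement
+ //   accumulator.setComplementThreshold(0.3); // complement when matches exceed 30% of the index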
+
+ /**
+ * Check if labeling is allowed for this accumulator.
+ *
+ * By default labeling is allowed.
+ * This allows one accumulator to invoke other accumulators for accumulation
+ * but keep the responsibility of labeling to itself.
+ * This might be handy since labeling is a costly operation.
+ * @return true if labeling is allowed for this accumulator
+ * @see #setAllowLabeling(boolean)
+ */
+ protected boolean isAllowLabeling() {
+ return allowLabeling;
+ }
+
+ /**
+ * Set whether labeling is allowed for this accumulator.
+ * @param allowLabeling new setting for allow labeling
+ * @see #isAllowLabeling()
+ */
+ protected void setAllowLabeling(boolean allowLabeling) {
+ this.allowLabeling = allowLabeling;
+ }
+
+ /** Check if all requests are complementable. */
+ protected boolean mayComplement() {
+ for (FacetRequest freq:searchParams.getFacetRequests()) {
+ if (!freq.supportsComplements()) {
+ return false;
+ }
+ }
+ return true;
+ }
+}
\ No newline at end of file
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/FacetsCollector.java b/modules/facet/src/java/org/apache/lucene/facet/search/FacetsCollector.java
new file mode 100644
index 00000000000..af3a57a62a7
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/FacetsCollector.java
@@ -0,0 +1,137 @@
+package org.apache.lucene.facet.search;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.Scorer;
+
+import org.apache.lucene.facet.search.params.FacetRequest;
+import org.apache.lucene.facet.search.params.FacetSearchParams;
+import org.apache.lucene.facet.search.results.FacetResult;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Collector for facet accumulation.
+ *
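+ * A usage sketch (hypothetical; the names of the search-time objects are
+ * assumed):
+ * <pre>
+ * FacetsCollector fc = new FacetsCollector(facetSearchParams, indexReader, taxonomyReader);
+ * searcher.search(query, fc);
+ * List&lt;FacetResult&gt; results = fc.getFacetResults();
+ * </pre>
+ *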
+ * @lucene.experimental
+ */
+public class FacetsCollector extends Collector {
+
+ protected final FacetsAccumulator facetsAccumulator;
+ private ScoredDocIdCollector scoreDocIdCollector;
+ private List
+ * A FloatArrayAllocator is thread-safe.
+ *
+ * @lucene.experimental
+ */
+public final class FloatArrayAllocator extends TemporaryObjectAllocator<float[]> {
+
+ // The length of the arrays returned by create().
+ private final int size;
+
+ /**
+ * Construct an allocator for float arrays of length size,
+ * keeping around a pool of up to maxArrays old arrays.
+ * Note that the pool size only restricts the number of arrays that hang
+ * around when not needed, but not the maximum number of arrays
+ * that are allocated when actually in use: If a number of concurrent
+ * threads ask for an allocation, all of them will get a counter array,
+ * even if their number is greater than maxArrays. If an application wants
+ * to limit the number of concurrent threads making allocations, it needs
+ * to do so on its own - for example by blocking new threads until the
+ * existing ones have finished.
+ *
+ * In particular, when maxArrays=0, this object behaves as a trivial
+ * allocator, always allocating a new array and never reusing an old one.
+ */
+ public FloatArrayAllocator(int size, int maxArrays) {
+ super(maxArrays);
+ this.size = size;
+ }
+
+ @Override
+ public float[] create() {
+ return new float[size];
+ }
+
+ @Override
+ public void clear(float[] array) {
+ Arrays.fill(array, 0);
+ }
+
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/Heap.java b/modules/facet/src/java/org/apache/lucene/facet/search/Heap.java
new file mode 100644
index 00000000000..8dc5ccef449
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/Heap.java
@@ -0,0 +1,56 @@
+package org.apache.lucene.facet.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Declares an interface for heap (and heap-alike) structures,
+ * handling a given type T.
+ *
+ * @lucene.experimental
+ */
+public interface Heap<T> {
+ * An IntArrayAllocator is thread-safe.
+ *
+ * @lucene.experimental
+ */
+public final class IntArrayAllocator extends TemporaryObjectAllocator<int[]> {
+
+ // The length of the arrays returned by create().
+ private final int length;
+
+ /**
+ * Construct an allocator for int arrays of length length,
+ * keeping around a pool of up to maxArrays old arrays.
+ * Note that the pool size only restricts the number of arrays that hang
+ * around when not needed, but not the maximum number of arrays
+ * that are allocated when actually in use: If a number of concurrent
+ * threads ask for an allocation, all of them will get a counter array,
+ * even if their number is greater than maxArrays. If an application wants
+ * to limit the number of concurrent threads making allocations, it needs
+ * to do so on its own - for example by blocking new threads until the
+ * existing ones have finished.
+ *
+ * In particular, when maxArrays=0, this object behaves as a trivial
+ * allocator, always allocating a new array and never reusing an old one.
+ */
+ public IntArrayAllocator(int length, int maxArrays) {
+ super(maxArrays);
+ this.length = length;
+ }
+
+ @Override
+ public int[] create() {
+ return new int[length];
+ }
+
+ @Override
+ public void clear(int[] array) {
+ Arrays.fill(array, 0);
+ }
+
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java b/modules/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java
new file mode 100644
index 00000000000..87b6e1f2c62
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java
@@ -0,0 +1,117 @@
+package org.apache.lucene.facet.search;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+
+import org.apache.lucene.util.UnsafeByteArrayInputStream;
+import org.apache.lucene.util.encoding.IntDecoder;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A payload deserializer comes with its own working space (buffer). One needs to
+ * define the {@link IndexReader} and {@link Term} in which the payload resides.
+ * The iterator then consumes the payload information of each document and
+ * decodes it into categories. A typical use case of this class is:
+ *
+ *
+ * Users should call this method with increasing docIds, and implementations
+ * can assume that this is the case.
+ */
+ public boolean setdoc(int docId) throws IOException {
+ if (!hasMore) {
+ return false;
+ }
+
+ if (tp.docID() > docId) {
+ return false;
+ }
+
+ // making sure we have the requested document
+ if (tp.docID() < docId) {
+ // Skipping to requested document
+ if (tp.advance(docId) == DocIdSetIterator.NO_MORE_DOCS) {
+ this.hasMore = false;
+ return false;
+ }
+
+ // If document not found (skipped too far)
+ if (tp.docID() != docId) {
+ return false;
+ }
+ }
+
+ // Prepare for payload extraction
+ tp.nextPosition();
+
+ // TODO: fix bug in SepCodec and then remove this check (the null check should be enough)
+ if (!tp.hasPayload()) {
+ return false;
+ }
+
+ BytesRef br = tp.getPayload();
+
+ if (br == null || br.length == 0) {
+ return false;
+ }
+
+ this.payloadLength = br.length;
+
+ if (this.payloadLength > this.buffer.length) {
+ // Growing if necessary.
+ this.buffer = new byte[this.payloadLength * 2 + 1];
+ }
+ // Loading the payload
+ System.arraycopy(br.bytes, br.offset, this.buffer, 0, payloadLength);
+ return true;
+ }
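+ // Usage sketch tying together setdoc(), getBuffer() and getPayloadLength()
+ // (hypothetical caller):
+ //
+ //   if (it.setdoc(docId)) {
+ //     byte[] buf = it.getBuffer();
+ //     int len = it.getPayloadLength();
+ //     // decode ints from buf[0..len)
+ //   }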
+
+ /**
+ * Get the buffer with the content of the last read payload.
+ */
+ public byte[] getBuffer() {
+ return buffer;
+ }
+
+ /**
+ * Get the length of the last read payload.
+ */
+ public int getPayloadLength() {
+ return payloadLength;
+ }
+
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/SamplingWrapper.java b/modules/facet/src/java/org/apache/lucene/facet/search/SamplingWrapper.java
new file mode 100644
index 00000000000..61a09b4d607
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/SamplingWrapper.java
@@ -0,0 +1,118 @@
+package org.apache.lucene.facet.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.facet.search.params.FacetSearchParams;
+import org.apache.lucene.facet.search.results.FacetResult;
+import org.apache.lucene.facet.search.results.FacetResultNode;
+import org.apache.lucene.facet.search.sampling.Sampler;
+import org.apache.lucene.facet.search.sampling.Sampler.SampleResult;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Wrap any Facets Accumulator with sampling.
+ *
+ * Note: Sampling accumulation (Accumulation over a sampled-set of the results),
+ * does not guarantee accurate values for
+ * {@link FacetResult#getNumValidDescendants()} &
+ * {@link FacetResultNode#getResidue()}.
+ *
+ * @lucene.experimental
+ */
+public class SamplingWrapper extends FacetsAccumulator {
+
+ private FacetsAccumulator delegee;
+ private Sampler sampler;
+
+ public SamplingWrapper(FacetsAccumulator delegee, Sampler sampler) {
+ super(delegee.searchParams, delegee.indexReader, delegee.taxonomyReader);
+ this.delegee = delegee;
+ this.sampler = sampler;
+ }
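+ // Usage sketch (hypothetical):
+ //
+ //   FacetsAccumulator base = new StandardFacetsAccumulator(params, reader, taxo);
+ //   FacetsAccumulator sampled = new SamplingWrapper(base, new Sampler());
+ //   List<FacetResult> results = sampled.accumulate(docids);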
+
+ @Override
+ public List
+ * Why partitions? Because if there are, say, 100M categories out of which
+ * only the top K are required, we must first compute values for all 100M categories
+ * (going over all documents) and only then can we select the top K.
+ * This is made easier on memory by working in partitions of distinct categories:
+ * once the values for a partition are found, we take the top K for that
+ * partition and work on the next partition, then merge the top K of both,
+ * and so forth, thereby computing the top K with RAM needs for the size of
+ * a single partition rather than for the size of all the 100M categories.
+ *
+ * Decision on partitions size is done at indexing time, and the facet information
+ * for each partition is maintained separately.
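+ *
+ * For example (hypothetical numbers): with 100M categories and a partition
+ * size of 1M, there are 100 partitions, and accumulation needs counter
+ * arrays of length 1M rather than 100M.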
+ *
+ * Implementation detail: Since the facet information of each partition is
+ * maintained in a separate "category list", we can be more efficient
+ * at search time, because only the facet info for a single partition
+ * needs to be read while processing that partition.
+ *
+ * @lucene.experimental
+ */
+public class StandardFacetsAccumulator extends FacetsAccumulator {
+
+ private static final Logger logger = Logger.getLogger(StandardFacetsAccumulator.class.getName());
+
+ protected final IntArrayAllocator intArrayAllocator;
+ protected final FloatArrayAllocator floatArrayAllocator;
+
+ protected int partitionSize;
+ protected int maxPartitions;
+ protected boolean isUsingComplements;
+
+ private TotalFacetCounts totalFacetCounts;
+
+ private Object accumulateGuard;
+
+ public StandardFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader,
+ TaxonomyReader taxonomyReader, IntArrayAllocator intArrayAllocator,
+ FloatArrayAllocator floatArrayAllocator) {
+
+ super(searchParams,indexReader,taxonomyReader);
+ int realPartitionSize = intArrayAllocator == null || floatArrayAllocator == null
+ ? PartitionsUtils.partitionSize(searchParams, taxonomyReader) : -1; // -1 if not needed.
+ this.intArrayAllocator = intArrayAllocator != null
+ ? intArrayAllocator
+ // create a default one if null was provided
+ : new IntArrayAllocator(realPartitionSize, 1);
+ this.floatArrayAllocator = floatArrayAllocator != null
+ ? floatArrayAllocator
+ // create a default one if null provided
+ : new FloatArrayAllocator(realPartitionSize, 1);
+ // can only be computed later when docids size is known
+ isUsingComplements = false;
+ partitionSize = PartitionsUtils.partitionSize(searchParams, taxonomyReader);
+ maxPartitions = (int) Math.ceil(this.taxonomyReader.getSize() / (double) partitionSize);
+ accumulateGuard = new Object();
+ }
+
+ public StandardFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader,
+ TaxonomyReader taxonomyReader) {
+
+ this(searchParams, indexReader, taxonomyReader, null, null);
+ }
+
+ @Override
+ public List
+ * Allows overriding the set of documents to accumulate for. Invoked just
+ * before actual accumulating starts. From this point that set of documents
+ * remains unmodified. Default implementation just returns the input
+ * unchanged.
+ *
+ * @param docids
+ * candidate documents to accumulate for
+ * @return actual documents to accumulate for
+ */
+ protected ScoredDocIDs actualDocsToAccumulate(ScoredDocIDs docids) throws IOException {
+ return docids;
+ }
+
+ /** Check if it is worth to use complements */
+ protected boolean shouldComplement(ScoredDocIDs docids) {
+ return mayComplement()
+ && (docids.size() > indexReader.numDocs() * getComplementThreshold());
+ }
+
+ /**
+ * Iterate over the documents for this partition and fill the facet arrays with the correct
+ * count/complement count/value.
+ * @param docids the documents to iterate over
+ * @param facetArrays the facet arrays to fill
+ * @param partition the partition to process
+ * @throws IOException on error
+ */
+ private final void fillArraysForPartition(ScoredDocIDs docids,
+ FacetArrays facetArrays, int partition) throws IOException {
+
+ if (isUsingComplements) {
+ initArraysByTotalCounts(facetArrays, partition, docids.size());
+ } else {
+ facetArrays.free(); // to get a cleared array for this partition
+ }
+
+ HashMap
+ * If two CategoryListIterators are served by the same aggregator, a single
+ * aggregator is returned for both.
+ *
+ * NOTE: If a given category list iterator is needed with two different
+ * aggregators (e.g. counting and association), an exception is thrown, as this
+ * functionality is not supported at this time.
+ */
+ protected HashMap
+ * This technique is useful for temporary counter arrays in faceted search
+ * (see {@link FacetsAccumulator}), which can be reused across searches instead
+ * of being allocated afresh on every search.
+ *
+ * A TemporaryObjectAllocator is thread-safe.
+ *
+ * @lucene.experimental
+ */
+public abstract class TemporaryObjectAllocator<T> {
+
+ private final java.util.concurrent.ConcurrentLinkedQueue<T> pool =
+ new java.util.concurrent.ConcurrentLinkedQueue<T>();
+
+ private final int maxObjects;
+
+ /**
+ * Construct an allocator of objects which can keep around a pool of up to
+ * maxObjects old objects.
+ * Note that the pool size only restricts the number of objects that hang
+ * around when not needed, but not the maximum number of objects
+ * that are allocated when actually in use: If a number of concurrent
+ * threads ask for an allocation, all of them will get an object, even if
+ * their number is greater than maxObjects. If an application wants to
+ * limit the number of concurrent threads making allocations, it needs to
+ * do so on its own - for example by blocking new threads until the
+ * existing ones have finished. If more than maxObjects are freed, only
+ * maxObjects of them will be kept in the pool - the rest will not and
+ * will eventually be garbage-collected by Java.
+ *
+ * In particular, when maxObjects=0, this object behaves as a trivial
+ * allocator, always allocating a new array and never reusing an old one.
+ */
+ public TemporaryObjectAllocator(int maxObjects) {
+ this.maxObjects = maxObjects;
+ }
+
+ /**
+ * Subclasses must override this method to actually create a new object
+ * of the desired type.
+ *
+ */
+ protected abstract T create();
+
+ /**
+ * Subclasses must override this method to clear an existing object of
+ * the desired type, to prepare it for reuse. Note that objects will be
+ * cleared just before reuse (on allocation), not when freed.
+ */
+ protected abstract void clear(T object);
+
+ /**
+ * Allocate a new object. If there's a previously allocated object in our
+ * pool, we return it immediately. Otherwise, a new object is allocated.
+ *
+ * Don't forget to call {@link #free(Object)} when you're done with the object,
+ * to return it to the pool. If you don't, memory is not leaked,
+ * but the pool will remain empty and a new object will be allocated each
+ * time (just like the maxObjects=0 case).
+ */
+ public final T allocate() {
+ T object = pool.poll();
+ if (object==null) {
+ return create();
+ }
+ clear(object);
+ return object;
+ }
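+ // Typical usage sketch (assuming a concrete subclass such as IntArrayAllocator):
+ //
+ //   IntArrayAllocator alloc = new IntArrayAllocator(1000, 4);
+ //   int[] counters = alloc.allocate();
+ //   try {
+ //     // ... use counters ...
+ //   } finally {
+ //     alloc.free(counters);
+ //   }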
+
+ /**
+ * Return a no-longer-needed object back to the pool. If we already have
+ * enough objects in the pool (maxObjects as specified in the constructor),
+ * the object will not be saved, and Java will eventually garbage collect
+ * it.
+ *
+ * In particular, when maxObjects=0, the given object is never saved and
+ * free does nothing.
+ */
+ public final void free(T object) {
+ if (pool.size() < maxObjects && object != null) {
+ pool.add(object);
+ }
+ }
+
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/TopKFacetResultsHandler.java b/modules/facet/src/java/org/apache/lucene/facet/search/TopKFacetResultsHandler.java
new file mode 100644
index 00000000000..43df368c661
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/TopKFacetResultsHandler.java
@@ -0,0 +1,292 @@
+package org.apache.lucene.facet.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.lucene.facet.search.params.FacetRequest;
+import org.apache.lucene.facet.search.results.FacetResult;
+import org.apache.lucene.facet.search.results.FacetResultNode;
+import org.apache.lucene.facet.search.results.MutableFacetResultNode;
+import org.apache.lucene.facet.search.results.IntermediateFacetResult;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader.ChildrenArrays;
+import org.apache.lucene.facet.util.ResultSortUtils;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Generate Top-K results for a particular FacetRequest.
+ *
+ * K is global (among all results) and is defined by {@link FacetRequest#getNumResults()}.
+ *
+ * Note: Values of 0 (zero) are ignored by this results handler.
+ *
+ * @lucene.experimental
+ */
+public class TopKFacetResultsHandler extends FacetResultsHandler {
+
+ /**
+ * Construct top-K results handler.
+ * @param taxonomyReader taxonomy reader
+ * @param facetRequest facet request being served
+ */
+ public TopKFacetResultsHandler(TaxonomyReader taxonomyReader,
+ FacetRequest facetRequest) {
+ super(taxonomyReader, facetRequest);
+ }
+
+ // fetch top K for specific partition.
+ @Override
+ public IntermediateFacetResult fetchPartitionResult(FacetArrays facetArrays, int offset)
+ throws IOException {
+ TopKFacetResult res = null;
+ int ordinal = taxonomyReader.getOrdinal(facetRequest.getCategoryPath());
+ if (ordinal != TaxonomyReader.INVALID_ORDINAL) {
+ double value = 0;
+ if (isSelfPartition(ordinal, facetArrays, offset)) {
+ int partitionSize = facetArrays.getArraysLength();
+ value = facetRequest.getValueOf(facetArrays, ordinal % partitionSize);
+ }
+
+ // TODO (Facet): should initial value of "residue" depend on aggregator if not sum?
+ MutableFacetResultNode parentResultNode =
+ new MutableFacetResultNode(ordinal, value);
+
+ Heap
+ * Because the number of selected children of each node is restricted,
+ * and not the overall number of nodes in the {@link FacetResult}, facets not selected
+ * into the {@link FacetResult} might have better values or ordinals (typically,
+ * higher counts) than facets that are selected into the {@link FacetResult}.
+ *
+ * The generated {@link FacetResult} also provides
+ * {@link FacetResult#getNumValidDescendants()}, which returns the total number of facets
+ * that are descendants of the root node, no deeper than {@link FacetRequest#getDepth()}, and
+ * which have a valid value. The root node itself is not counted here.
+ * Valid value is determined by the {@link FacetResultsHandler}.
+ * {@link TopKInEachNodeHandler} defines valid as != 0.
+ *
+ * NOTE: this code relies on the assumption that {@link TaxonomyReader#INVALID_ORDINAL} == -1, a smaller
+ * value than any valid ordinal.
+ *
+ * @lucene.experimental
+ */
+public class TopKInEachNodeHandler extends FacetResultsHandler {
+
+ public TopKInEachNodeHandler(TaxonomyReader taxonomyReader,
+ FacetRequest facetRequest) {
+ super(taxonomyReader, facetRequest);
+ }
+
+ /**
+ * Recursively explore all facets that can be potentially included in the
+ * {@link FacetResult} to be generated, and that belong to the given
+ * partition, so that values can be examined and collected. For each such
+ * node, gather its top K ({@link FacetRequest#getNumResults()}) children
+ * among its children that are encountered in the given particular partition
+ * (aka current counting list).
+ *
+ * @return {@link IntermediateFacetResult} consisting of
+ * {@link IntToObjectMap} that maps potential
+ * {@link FacetResult} nodes to their top K children encountered in
+ * the current partition. Note that the mapped potential tree nodes
+ * need not belong to the given partition, only the top K children
+ * mapped to. The aim is to identify nodes that are certainly excluded
+ * from the {@link FacetResult} to be eventually (after going through
+ * all the partitions) returned by this handler, because they have K
+ * better siblings, already identified in this partition. For the
+ * identified excluded nodes, we only count the number of their
+ * descendants in the subtree (to be included in
+ * {@link FacetResult#getNumValidDescendants()}), but not bother with
+ * selecting top K in these generations, which, by definition, are,
+ * too, excluded from the FacetResult tree.
+ * @param arrays the already filled in count array, potentially only covering
+ * one partition: the ordinals ranging from
+ * @param offset to
+ * Implementation notes: Synchronization considerations and the interaction between lruKeys and cache:
+ *
+ * If new size is smaller than current size, the cache is appropriately trimmed.
+ *
+ * Minimal size is 1, so passing zero or negative size would result in size of 1.
+ * @param size new size to set
+ */
+ public void setCacheSize(int size) {
+ if (size < 1) size = 1;
+ int origSize = maxCacheSize;
+ maxCacheSize = size;
+ if (maxCacheSize < origSize) { // need to trim only if the cache was reduced
+ trimCache();
+ }
+ }
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java
new file mode 100644
index 00000000000..45f15ca4a09
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java
@@ -0,0 +1,51 @@
+package org.apache.lucene.facet.search.aggregator;
+
+import java.io.IOException;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * An Aggregator is the analogue of Lucene's Collector (see
+ * {@link org.apache.lucene.search.Collector}), for processing the categories
+ * belonging to a certain document. The Aggregator is responsible for doing
+ * whatever it wishes with the categories it is fed, e.g., counting the number
+ * of times that each category appears, or performing some computation on their
+ * association values.
+ *
+ * Much of the function of an Aggregator implementation is not described by this
+ * interface. This includes the constructor and getter methods to retrieve the
+ * results of the aggregation.
+ *
+ * @lucene.experimental
+ */
+public interface Aggregator {
+
+ /**
+ * Specify the document (and its score in the search) that the following
+ * {@link #aggregate(int)} calls will pertain to.
+ */
+ void setNextDoc(int docid, float score) throws IOException;
+
+ /**
+ * Collect (and do whatever an implementation deems appropriate) the
+ * category given by its ordinal. This category belongs to a document
+ * given earlier by {@link #setNextDoc(int, float)}.
+ */
+ void aggregate(int ordinal);
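+ // Illustrative driver loop (hypothetical caller; names assumed):
+ //
+ //   aggregator.setNextDoc(docId, score);
+ //   for (int ordinal : categoryOrdinalsOfDoc) {
+ //     aggregator.aggregate(ordinal);
+ //   }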
+
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/ComplementCountingAggregator.java b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/ComplementCountingAggregator.java
new file mode 100644
index 00000000000..eab1eb38cc9
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/ComplementCountingAggregator.java
@@ -0,0 +1,37 @@
+package org.apache.lucene.facet.search.aggregator;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A {@link CountingAggregator} used during complement counting.
+ *
+ * @lucene.experimental
+ */
+public class ComplementCountingAggregator extends CountingAggregator {
+
+ public ComplementCountingAggregator(int[] counterArray) {
+ super(counterArray);
+ }
+
+ @Override
+ public void aggregate(int ordinal) {
+ assert counterArray[ordinal] != 0 : "complement aggregation: count is about to become negative for ordinal " + ordinal;
+ --counterArray[ordinal];
+ }
+
+}
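
The complement optimization behind this aggregator: when most documents match, it is cheaper to start from precomputed counts over all documents and subtract the categories of the non-matching documents, which is why aggregate() decrements and asserts that a count never goes negative. A minimal editorial sketch of such a driver loop (the totals array and the per-document ordinal lookup are assumptions of the sketch):

    static void complementCount(int[] totalCounts, int[] nonMatchingDocs,
        int[][] docOrdinals) {
      ComplementCountingAggregator agg =
          new ComplementCountingAggregator(totalCounts);
      for (int doc : nonMatchingDocs) {
        agg.setNextDoc(doc, 0f); // score is ignored by counting aggregators
        for (int ord : docOrdinals[doc]) {
          agg.aggregate(ord);    // decrements totalCounts[ord]
        }
      }
      // totalCounts now holds counts for the matching documents only.
    }
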
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java
new file mode 100644
index 00000000000..d3569a42556
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java
@@ -0,0 +1,59 @@
+package org.apache.lucene.facet.search.aggregator;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A CountingAggregator updates a counter array with the size of the whole
+ * taxonomy, counting the number of times each category appears in the given set
+ * of documents.
+ *
+ * @lucene.experimental
+ */
+public class CountingAggregator implements Aggregator {
+
+ protected int[] counterArray;
+
+ public CountingAggregator(int[] counterArray) {
+ this.counterArray = counterArray;
+ }
+
+ public void aggregate(int ordinal) {
+ ++counterArray[ordinal];
+ }
+
+ public void setNextDoc(int docid, float score) {
+ // There's nothing for us to do here, since this aggregator only ever
+ // increments the count by 1 and ignores the score.
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null || obj.getClass() != this.getClass()) {
+ return false;
+ }
+ CountingAggregator that = (CountingAggregator) obj;
+ return that.counterArray == this.counterArray;
+ }
+
+ @Override
+ public int hashCode() {
+ // Identity-based, to match the reference comparison done in equals().
+ return counterArray == null ? 0 : counterArray.hashCode();
+ }
+}
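
A typical (editorial, simplified) driver for this aggregator; in the real accumulator the per-document ordinals come from a category list iterator, which this sketch replaces with a plain array:

    static int[] countCategories(int taxonomySize, int[] matchingDocs,
        int[][] docOrdinals) {
      int[] counts = new int[taxonomySize]; // one counter per category ordinal
      CountingAggregator agg = new CountingAggregator(counts);
      for (int doc : matchingDocs) {
        agg.setNextDoc(doc, 1.0f); // score is ignored by this aggregator
        for (int ord : docOrdinals[doc]) {
          agg.aggregate(ord);      // counts[ord]++
        }
      }
      return counts;
    }
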
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java
new file mode 100644
index 00000000000..6b1843c22fb
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java
@@ -0,0 +1,58 @@
+package org.apache.lucene.facet.search.aggregator;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * An {@link Aggregator} which updates the weight of a category according to the
+ * scores of the documents it was found in.
+ *
+ * @lucene.experimental
+ */
+public class ScoringAggregator implements Aggregator {
+
+ private final float[] scoreArray;
+ private float score;
+ private final int hashCode;
+
+ public ScoringAggregator(float[] scoreArray) {
+ this.scoreArray = scoreArray;
+ this.hashCode = scoreArray == null ? 0 : scoreArray.hashCode();
+ }
+
+ public void aggregate(int ordinal) {
+ scoreArray[ordinal] += score;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null || obj.getClass() != this.getClass()) {
+ return false;
+ }
+ ScoringAggregator that = (ScoringAggregator) obj;
+ return that.scoreArray == this.scoreArray;
+ }
+
+ @Override
+ public int hashCode() {
+ return hashCode;
+ }
+
+ public void setNextDoc(int docid, float score) {
+ this.score = score;
+ }
+}
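
Note that equals()/hashCode() here are identity-based on the backing array: two aggregators writing into the same array compare equal, which lets the accumulator de-duplicate aggregators across facet requests that share arrays. A small editorial illustration:

    static void demoAggregatorEquality() {
      float[] shared = new float[16];
      ScoringAggregator a = new ScoringAggregator(shared);
      ScoringAggregator b = new ScoringAggregator(shared);
      assert a.equals(b);                                      // same backing array
      assert !a.equals(new ScoringAggregator(new float[16])); // different array
    }
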
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/association/AssociationFloatSumAggregator.java b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/association/AssociationFloatSumAggregator.java
new file mode 100644
index 00000000000..ab20ffbf9c9
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/association/AssociationFloatSumAggregator.java
@@ -0,0 +1,74 @@
+package org.apache.lucene.facet.search.aggregator.association;
+
+import java.io.IOException;
+
+import org.apache.lucene.facet.enhancements.association.AssociationsPayloadIterator;
+import org.apache.lucene.facet.index.params.CategoryListParams;
+import org.apache.lucene.facet.search.aggregator.Aggregator;
+import org.apache.lucene.index.IndexReader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * An {@link Aggregator} which updates the weight of a category by summing the
+ * float association values it finds for it in every document.
+ *
+ * @lucene.experimental
+ */
+public class AssociationFloatSumAggregator implements Aggregator {
+
+ protected final String field;
+ protected final float[] sumArray;
+ protected final AssociationsPayloadIterator associationsPayloadIterator;
+
+ public AssociationFloatSumAggregator(IndexReader reader, float[] sumArray) throws IOException {
+ this(CategoryListParams.DEFAULT_TERM.field(), reader, sumArray);
+ }
+
+ public AssociationFloatSumAggregator(String field, IndexReader reader, float[] sumArray) throws IOException {
+ this.field = field;
+ associationsPayloadIterator = new AssociationsPayloadIterator(reader, field);
+ this.sumArray = sumArray;
+ }
+
+ public void aggregate(int ordinal) {
+ long association = associationsPayloadIterator.getAssociation(ordinal);
+ if (association != AssociationsPayloadIterator.NO_ASSOCIATION) {
+ sumArray[ordinal] += Float.intBitsToFloat((int) association);
+ }
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null || obj.getClass() != this.getClass()) {
+ return false;
+ }
+ AssociationFloatSumAggregator that = (AssociationFloatSumAggregator) obj;
+ return that.field.equals(field) && that.sumArray == sumArray;
+ }
+
+ @Override
+ public int hashCode() {
+ return field.hashCode();
+ }
+
+ public void setNextDoc(int docid, float score) throws IOException {
+ associationsPayloadIterator.setNextDoc(docid);
+ }
+
+}
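
The cast-then-reinterpret in aggregate() works because a float association is stored in the payload as its raw IEEE 754 bits, widened into the long returned by the iterator. A small editorial sketch of the round trip (the encoding side is assumed to mirror this):

    static long encodeFloatAssociation(float value) {
      // Store the raw bits, widened to a long without sign extension.
      return Float.floatToIntBits(value) & 0xFFFFFFFFL;
    }

    static float decodeFloatAssociation(long association) {
      // Narrow back to int and reinterpret the bits as a float.
      return Float.intBitsToFloat((int) association);
    }
    // decodeFloatAssociation(encodeFloatAssociation(0.5f)) == 0.5f
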
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/association/AssociationIntSumAggregator.java b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/association/AssociationIntSumAggregator.java
new file mode 100644
index 00000000000..7452aabf430
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/association/AssociationIntSumAggregator.java
@@ -0,0 +1,74 @@
+package org.apache.lucene.facet.search.aggregator.association;
+
+import java.io.IOException;
+
+import org.apache.lucene.facet.enhancements.association.AssociationsPayloadIterator;
+import org.apache.lucene.facet.index.params.CategoryListParams;
+import org.apache.lucene.facet.search.aggregator.Aggregator;
+import org.apache.lucene.index.IndexReader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * An {@link Aggregator} which updates the weight of a category by summing the
+ * integer association values it finds for it in every document.
+ *
+ * @lucene.experimental
+ */
+public class AssociationIntSumAggregator implements Aggregator {
+
+ protected final String field;
+ protected final int[] sumArray;
+ protected final AssociationsPayloadIterator associationsPayloadIterator;
+
+ public AssociationIntSumAggregator(IndexReader reader, int[] sumArray) throws IOException {
+ this(CategoryListParams.DEFAULT_TERM.field(), reader, sumArray);
+ }
+
+ public AssociationIntSumAggregator(String field, IndexReader reader, int[] sumArray) throws IOException {
+ this.field = field;
+ associationsPayloadIterator = new AssociationsPayloadIterator(reader, field);
+ this.sumArray = sumArray;
+ }
+
+ public void aggregate(int ordinal) {
+ long association = associationsPayloadIterator.getAssociation(ordinal);
+ if (association != AssociationsPayloadIterator.NO_ASSOCIATION) {
+ sumArray[ordinal] += association;
+ }
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null || obj.getClass() != this.getClass()) {
+ return false;
+ }
+ AssociationIntSumAggregator that = (AssociationIntSumAggregator) obj;
+ return that.field.equals(field) && that.sumArray == sumArray;
+ }
+
+ @Override
+ public int hashCode() {
+ return field.hashCode();
+ }
+
+ public void setNextDoc(int docid, float score) throws IOException {
+ associationsPayloadIterator.setNextDoc(docid);
+ }
+
+}
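
Usage mirrors the float variant, minus the bit reinterpretation. An editorial sketch of summing integer associations over matching documents (the per-document ordinal lookup stands in for the real category list iteration):

    static int[] sumIntAssociations(IndexReader reader, String field,
        int taxonomySize, int[] matchingDocs, int[][] docOrdinals)
        throws IOException {
      int[] sums = new int[taxonomySize];
      AssociationIntSumAggregator agg =
          new AssociationIntSumAggregator(field, reader, sums);
      for (int doc : matchingDocs) {
        agg.setNextDoc(doc, 1.0f); // positions the payload iterator on doc
        for (int ord : docOrdinals[doc]) {
          agg.aggregate(ord);      // sums[ord] += association of (doc, ord)
        }
      }
      return sums;
    }
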
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/package.html b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/package.html
new file mode 100644
index 00000000000..baa8f958b98
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/package.html
@@ -0,0 +1,12 @@
+
+
+ * Speeds up facets accumulation when more RAM is available.
+ *
+ * Note that this will consume more memory: one int (4 bytes) for each category
+ * of each document.
+ *
+ * Note: at the moment this class is insensitive to updates of the index, and,
+ * in particular, does not make use of Lucene's ability to refresh a single
+ * segment.
+ *
+ * See {@link CategoryListCache#register(CategoryListParams, CategoryListData)}
+ * and
+ * {@link CategoryListCache#loadAndRegister(CategoryListParams, IndexReader, TaxonomyReader, FacetIndexingParams)}.
+ *
+ * @lucene.experimental
+ */
+public class CategoryListData {
+
+ // TODO (Facet): experiment with different orders - p-d-c vs. current d-p-c.
+ private transient volatile int[][][] docPartitionCategories;
+
+ /**
+ * Empty constructor for extensions with modified computation of the data.
+ */
+ protected CategoryListData() {
+ }
+
+ /**
+ * Compute category list data for caching for faster iteration.
+ */
+ CategoryListData(IndexReader reader, TaxonomyReader taxo,
+ FacetIndexingParams iparams, CategoryListParams clp) throws IOException {
+
+ final int maxDoc = reader.maxDoc();
+ int[][][] dpf = new int[maxDoc][][];
+ int numPartitions = (int) Math.ceil(taxo.getSize() / (double) iparams.getPartitionSize());
+ IntArray docCategories = new IntArray();
+ for (int part = 0; part < numPartitions; part++) {
+
+ We now describe the simpler interfaces.
+ There are mainly three interfaces for faceted search:
+
+ * The facet request additionally defines what information should
+ * be computed within the facet results, and whether and how the results
+ * should be ordered, etc.
+ *
+ * An example facet request is to look at all sub-categories of "Author", and
+ * return the 10 with the highest counts (sorted by decreasing count).
+ *
+ * @lucene.experimental
+ */
+public abstract class FacetRequest implements Cloneable {
+
+ /**
+ * Default depth for facets accumulation.
+ * @see #getDepth()
+ */
+ public static final int DEFAULT_DEPTH = 1;
+
+ /**
+ * Default sort mode.
+ * @see #getSortBy()
+ */
+ public static final SortBy DEFAULT_SORT_BY = SortBy.VALUE;
+
+ /**
+ * Default result mode.
+ * @see #getResultMode()
+ */
+ public static final ResultMode DEFAULT_RESULT_MODE = ResultMode.GLOBAL_FLAT;
+
+ private final CategoryPath categoryPath;
+ private final int numResults;
+ private int numLabel;
+ private int depth;
+ private SortOrder sortOrder;
+ private SortBy sortBy;
+
+ /**
+ * Computed at construction, this hashCode is based on the two final members
+ * {@link CategoryPath} and numResults.
+ */
+ private final int hashCode;
+
+ private ResultMode resultMode = DEFAULT_RESULT_MODE;
+
+ /**
+ * Initialize the request with a given path and a requested number of top
+ * results.
+ *
+ * NOTE: if numResults is Integer.MAX_VALUE, all sub-categories are
+ * returned, with no limit.
+ *
+ * NOTE: it is assumed that the given {@link CategoryPath} is not
+ * modified after construction of this object. Otherwise, some things may not
+ * function properly, e.g. {@link #hashCode()}.
+ *
+ * @throws IllegalArgumentException if numResults is ≤ 0
+ */
+ public FacetRequest(CategoryPath path, int numResults) {
+ if (numResults <= 0) {
+ throw new IllegalArgumentException("num results must be a positive (>0) number: " + numResults);
+ }
+ if (path == null) {
+ throw new IllegalArgumentException("category path cannot be null!");
+ }
+ categoryPath = path;
+ this.numResults = numResults;
+ numLabel = numResults;
+ depth = DEFAULT_DEPTH;
+ sortBy = DEFAULT_SORT_BY;
+ sortOrder = SortOrder.DESCENDING;
+
+ hashCode = categoryPath.hashCode() ^ this.numResults;
+ }
+
+ @Override
+ public Object clone() throws CloneNotSupportedException {
+ // Overridden to make it public
+ return super.clone();
+ }
+
+ public void setNumLabel(int numLabel) {
+ this.numLabel = numLabel;
+ }
+
+ public void setDepth(int depth) {
+ this.depth = depth;
+ }
+
+ public void setSortOrder(SortOrder sortOrder) {
+ this.sortOrder = sortOrder;
+ }
+
+ public void setSortBy(SortBy sortBy) {
+ this.sortBy = sortBy;
+ }
+
+ /**
+ * The root category of this facet request. The categories that are returned
+ * as a result of this request will all be descendants of this root.
+ *
+ * NOTE: you should not modify the returned {@link CategoryPath}, or
+ * otherwise some methods may not work properly, e.g. {@link #hashCode()}.
+ */
+ public final CategoryPath getCategoryPath() {
+ return categoryPath;
+ }
+
+ /**
+ * How deeply to look under the given category. If the depth is 0,
+ * only the category itself is counted. If the depth is 1, its immediate
+ * children are also counted, and so on. If the depth is Integer.MAX_VALUE,
+ * all the category's descendants are counted.
+ */
+ public final int getDepth() {
+ return depth;
+ }
+
+ /**
+ * If getNumLabel() is smaller than getNumResults(), only that number of
+ * top results will have their category paths (labels) computed.
+ * The purpose of this parameter is to avoid having to run the whole
+ * faceted search again when the user asks for more values for the facet:
+ * the application can ask (getNumResults()) for more values than it needs
+ * to show, but keep getNumLabel() at only the number it wants to
+ * immediately show. The slow-down caused by finding more values is
+ * negligible, because the slowest part - finding the categories' paths -
+ * is avoided.
+ *
+ * Depending on the {@link #getResultMode() result mode},
+ * this limit is applied globally or per results node.
+ * In the global mode, if this limit is 3,
+ * only the 3 top results would be labeled.
+ * In the per-node mode, if this limit is 3,
+ * the 3 top children of {@link #getCategoryPath() the target category} would
+ * be labeled, as well as the 3 top children of each of them, and so forth,
+ * until the depth defined by {@link #getDepth()}.
+ * @see #getResultMode()
+ */
+ public final int getNumLabel() {
+ return numLabel;
+ }
+
+ /**
+ * The number of sub-categories to return (at most).
+ * If fewer sub-categories exist, only the existing ones are returned.
+ *
+ * If Integer.MAX_VALUE is specified, all
+ * sub-categories are returned.
+ *
+ * Depending on the {@link #getResultMode() result mode},
+ * this limit is applied globally or per results node.
+ * In the global mode, if this limit is 3,
+ * only 3 top results would be computed.
+ * In the per-node mode, if this limit is 3,
+ * 3 top children of {@link #getCategoryPath() the target category} would be returned,
+ * as well as 3 top children of each of them, and so forth, until the depth defined
+ * by {@link #getDepth()}.
+ * @see #getResultMode()
+ */
+ public final int getNumResults() {
+ return numResults;
+ }
+
+ /**
+ * Sort options for facet results.
+ */
+ public enum SortBy {
+ /** sort by category ordinal within the taxonomy */
+ ORDINAL,
+
+ /** sort by computed category value */
+ VALUE
+ }
+
+ /** Specifies how the results should be sorted. */
+ public final SortBy getSortBy() {
+ return sortBy;
+ }
+
+ /** Requested sort order for the results. */
+ public enum SortOrder { ASCENDING, DESCENDING }
+
+ /** Return the requested order of results. */
+ public final SortOrder getSortOrder() {
+ return sortOrder;
+ }
+
+ @Override
+ public String toString() {
+ return categoryPath.toString() + " nRes=" + numResults + " nLbl=" + numLabel;
+ }
+
+ /**
+ * Creates a new {@link FacetResultsHandler} that matches the request logic
+ * and current settings, such as {@link #getDepth() depth},
+ * {@link #getResultMode() result mode}, etc., as well as the passed-in
+ * {@link TaxonomyReader}.
+ *
+ * @param taxonomyReader taxonomy reader is needed e.g. for knowing the
+ * taxonomy size.
+ */
+ public FacetResultsHandler createFacetResultsHandler(TaxonomyReader taxonomyReader) {
+ try {
+ if (resultMode == ResultMode.PER_NODE_IN_TREE) {
+ return new TopKInEachNodeHandler(taxonomyReader, (FacetRequest) clone());
+ }
+ return new TopKFacetResultsHandler(taxonomyReader, (FacetRequest) clone());
+ } catch (CloneNotSupportedException e) {
+ // Shouldn't happen since we implement Cloneable. If it does happen, it is
+ // probably because the class was changed to not implement Cloneable
+ // anymore.
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * The manner in which the request's limits, such as
+ * {@link #getNumLabel()} and
+ * {@link #getNumResults()}, are applied to the results structure.
+ */
+ public enum ResultMode {
+ /** Limits are applied per node, and the result has a full tree structure. */
+ PER_NODE_IN_TREE,
+
+ /** Limits are applied globally, on total number of results, and the result has a flat structure. */
+ GLOBAL_FLAT
+ }
+
+ /** Return the requested result mode. */
+ public final ResultMode getResultMode() {
+ return resultMode;
+ }
+
+ /**
+ * @param resultMode the resultMode to set
+ * @see #getResultMode()
+ */
+ public void setResultMode(ResultMode resultMode) {
+ this.resultMode = resultMode;
+ }
+
+ @Override
+ public int hashCode() {
+ return hashCode;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (o instanceof FacetRequest) {
+ FacetRequest that = (FacetRequest)o;
+ return that.hashCode == this.hashCode &&
+ that.categoryPath.equals(this.categoryPath) &&
+ that.numResults == this.numResults &&
+ that.depth == this.depth &&
+ that.resultMode == this.resultMode &&
+ that.numLabel == this.numLabel;
+ }
+ return false;
+ }
+
+ /**
+ * Create an aggregator for this facet request. Aggregator action depends on
+ * request definition. For a count request, it will usually increment the
+ * count for that facet.
+ *
+ * @param useComplements
+ * whether the complements optimization is being used for current
+ * computation.
+ * @param arrays
+ * provider for facet arrays in use for current computation.
+ * @param indexReader
+ * index reader in effect.
+ * @param taxonomy
+ * reader of taxonomy in effect.
+ * @throws IOException
+ */
+ public abstract Aggregator createAggregator(boolean useComplements,
+ FacetArrays arrays, IndexReader indexReader,
+ TaxonomyReader taxonomy) throws IOException;
+
+ /**
+ * Create the category list iterator for the specified partition.
+ * If a non-null cache is provided which contains the required data,
+ * it is used for the iteration.
+ */
+ public CategoryListIterator createCategoryListIterator(IndexReader reader,
+ TaxonomyReader taxo, FacetSearchParams sParams, int partition)
+ throws IOException {
+ CategoryListCache clCache = sParams.getClCache();
+ CategoryListParams clParams = sParams.getFacetIndexingParams().getCategoryListParams(categoryPath);
+ if (clCache != null) {
+ CategoryListData clData = clCache.get(clParams);
+ if (clData != null) {
+ return clData.iterator(partition);
+ }
+ }
+ return clParams.createCategoryListIterator(reader, partition);
+ }
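+
+ // Illustrative usage (an editorial sketch, not part of the original patch;
+ // CountFacetRequest is the counting subclass assumed here):
+ //   FacetRequest fr = new CountFacetRequest(new CategoryPath("Author"), 10);
+ //   fr.setNumLabel(5);                        // label only the top 5
+ //   fr.setResultMode(ResultMode.GLOBAL_FLAT); // flat top-10 list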
+
+ /**
+ * Return the value of a category used for facets computations for this
+ * request. For a count request this would be the count for that facet, i.e.
+ * an integer number; but for other requests this can be the result of a more
+ * complex operation, and the result can be any double precision number.
+ * Having this method return a generically named value with double
+ * precision allows a more compact API and shared code for handling counts
+ * and other requests (such as for associations), avoiding code duplication.
+ *
+ * @param arrays
+ * provider for facet arrays in use for current computation.
+ * @param idx
+ * an index into the count arrays now in effect in
+ *
+ * The contained facet requests define for which facets information should be gathered.
+ *
+ * The contained faceted indexing parameters provide the required info on how
+ * to read and interpret the underlying faceted information in the search index.
+ *
+ * @lucene.experimental
+ */
+public class FacetSearchParams {
+
+ protected final FacetIndexingParams indexingParams;
+ protected final List<FacetRequest> facetRequests;
+ * Use with caution: loading a label for a result is costly, performance-wise.
+ * Therefore force loading of labels only when they are really needed.
+ * @param taxonomyReader taxonomy reader for forcing (lazy) labeling of this result.
+ * @throws IOException on error
+ * @see FacetRequest#getNumLabel()
+ */
+ public CategoryPath getLabel(TaxonomyReader taxonomyReader) throws IOException;
+
+ /**
+ * Value of this result - usually either count or a value derived from some
+ * computing on the association of it.
+ */
+ public double getValue();
+
+ /**
+ * Value of screened out sub results.
+ *
+ * If only a part of the valid results is returned, e.g. because the top K were
+ * requested, this provides info on "what else is there under this result node".
+ */
+ public double getResidue();
+
+ /**
+ * Contained sub results.
+ * These are either child facets, if a tree result was requested, or simply
+ * descendants, in case a tree result was not requested. In the first case,
+ * all returned nodes are both descendants of this node in the taxonomy and
+ * siblings of each other in the taxonomy.
+ * In the latter case they are only guaranteed to be descendants of
+ * this node in the taxonomy.
+ */
+ public Iterable<? extends FacetResultNode> getSubResults();
+
+ /**
+ * Number of sub results
+ */
+ public int getNumSubResults();
+
+ /**
+ * Expert: Set a new value for this result node.
+ *
+ * Allows modifying the value of this facet node.
+ * Used, for example, to tune a sampled value, e.g. by
+ * {@link SampleFixer#fixResult(org.apache.lucene.facet.search.ScoredDocIDs, FacetResult)}
+ * @param value the new value to set
+ * @see #getValue()
+ * @see FacetResultsHandler#rearrangeFacetResult(FacetResult)
+ */
+ public void setValue(double value);
+
+}
\ No newline at end of file
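
Consumers typically walk the sub-result tree recursively and resolve labels lazily; a minimal editorial sketch (the indentation-based printing is illustrative, not part of the patch):

    static void printResults(FacetResultNode node, TaxonomyReader taxo,
        int depth) throws IOException {
      StringBuilder line = new StringBuilder();
      for (int i = 0; i < depth; i++) {
        line.append("  ");
      }
      // getLabel() lazily resolves the category path from the taxonomy.
      line.append(node.getLabel(taxo)).append(": ").append(node.getValue());
      System.out.println(line);
      for (FacetResultNode child : node.getSubResults()) {
        printResults(child, taxo, depth + 1);
      }
    }
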
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/results/IntermediateFacetResult.java b/modules/facet/src/java/org/apache/lucene/facet/search/results/IntermediateFacetResult.java
new file mode 100644
index 00000000000..100256b3e24
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/results/IntermediateFacetResult.java
@@ -0,0 +1,41 @@
+package org.apache.lucene.facet.search.results;
+
+import org.apache.lucene.facet.search.FacetResultsHandler;
+import org.apache.lucene.facet.search.params.FacetRequest;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Intermediate {@link FacetResult} of faceted search.
+ *
+ * This is an empty interface on purpose.
+ *
+ * It allows {@link FacetResultsHandler} to return intermediate result objects
+ * that only it knows how to interpret, and so the handler has maximal freedom
+ * in defining what an intermediate result is, depending on its specific logic.
+ *
+ * @lucene.experimental
+ */
+public interface IntermediateFacetResult {
+
+ /**
+ * Facet request for which this temporary result was created.
+ */
+ FacetRequest getFacetRequest();
+
+}
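
For example, a handler might carry partial per-ordinal values between partition passes in a private implementation; a hypothetical sketch (class and field names are illustrative only):

    class PartialCounts implements IntermediateFacetResult {
      private final FacetRequest request;
      final int[] partialCounts; // merged by the owning handler across partitions

      PartialCounts(FacetRequest request, int[] partialCounts) {
        this.request = request;
        this.partialCounts = partialCounts;
      }

      public FacetRequest getFacetRequest() {
        return request;
      }
    }
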
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/results/MutableFacetResultNode.java b/modules/facet/src/java/org/apache/lucene/facet/search/results/MutableFacetResultNode.java
new file mode 100644
index 00000000000..92dcecb3f9e
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/results/MutableFacetResultNode.java
@@ -0,0 +1,344 @@
+package org.apache.lucene.facet.search.results;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Mutable implementation of the result of a faceted search for a certain taxonomy node.
+ *
+ * @lucene.experimental
+ */
+public class MutableFacetResultNode implements FacetResultNode {
+
+ /**
+ * Empty sub results to be returned when there are no results.
+ * We never return null, so that code using this can remain simpler.
+ */
+ private static final ArrayList<FacetResultNode> EMPTY_SUB_RESULTS = new ArrayList<FacetResultNode>();
+
+ /**
+ * Reset a facet result node: set a new ordinal and value, and clear the
+ * existing label, residue, and sub results.
+ *
+ * Used at the population of facet results, not intended for regular use by
+ * applications.
+ *
+ * @param ordinal
+ * ordinal in the taxonomy of the category of this result.
+ * @param value
+ * value of this result.
+ */
+ public void reset(int ordinal, double value) {
+ this.ordinal = ordinal;
+ this.value = value;
+ if (subResults != null) {
+ subResults.clear();
+ }
+ label = null;
+ residue = 0;
+ }
+
+ /**
+ * Create a Facet Result Node.
+ *
+ * @param ordinal
+ * ordinal in the taxonomy of the category of this result.
+ * @param value
+ * value of this result.
+ * @param residue
+ * Value of screened out sub results.
+ * @param label
+ * label of the category path of this result.
+ * @param subResults
+ * - sub results, usually descendants, sometimes child results, of
+ * this result - depending on the request.
+ */
+ public MutableFacetResultNode(int ordinal, double value, double residue,
+ CategoryPath label, List<FacetResultNode> subResults) {
+
+ /**
+ * Trim the sub results to a given size.
+ *
+ * Note: Although the {@link #getResidue()} is not guaranteed to be
+ * accurate, it is worth fixing it, as much as possible, by taking into
+ * account the trimmed sub-nodes.
+ */
+ public void trimSubResults(int size) {
+ if (subResults == null || subResults.size() == 0) {
+ return;
+ }
+
+ ArrayList<FacetResultNode> trimmed = new ArrayList<FacetResultNode>(size);
+
+ /**
+ * Turn a {@link FacetResultNode} into a {@link MutableFacetResultNode}.
+ *
+ * In case the input result node is already of the implementation
+ * class, only casting is done, but in any case we pay the price
+ * of an "instanceof" check.
+ * @param frn facet result node to be turned into an implementation class object
+ */
+ public static MutableFacetResultNode toImpl(FacetResultNode frn) {
+ if (frn instanceof MutableFacetResultNode) {
+ return (MutableFacetResultNode) frn;
+ }
+ return new MutableFacetResultNode(frn, true);
+ }
+
+}
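
A short editorial sketch of how a results handler might use this class: convert an arbitrary node to the mutable implementation, then trim its sub results to the top 3:

    static FacetResultNode keepTop3(FacetResultNode node) {
      MutableFacetResultNode impl = MutableFacetResultNode.toImpl(node);
      impl.trimSubResults(3); // keep only the 3 top sub results
      return impl;
    }
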
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/results/package.html b/modules/facet/src/java/org/apache/lucene/facet/search/results/package.html
new file mode 100644
index 00000000000..2006bf13571
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/results/package.html
@@ -0,0 +1,18 @@
+
+
+ * The Sampler uses TAKMI style counting to provide a 'best guess' top-K result
+ * set of the facets accumulated.
+ *
+ * Note: sampling accumulation (accumulation over a sampled set of the results)
+ * does not guarantee accurate values for
+ * {@link FacetResult#getNumValidDescendants()} &
+ * {@link FacetResultNode#getResidue()}.
+ *
+ * @lucene.experimental
+ */
+public class Sampler {
+
+ private static final Logger logger = Logger.getLogger(Sampler.class.getName());
+
+ private final SamplingParams samplingParams;
+
+ /**
+ * Construct with default {@link SamplingParams}.
+ */
+ public Sampler() {
+ this(new SamplingParams());
+ }
+
+ /**
+ * Construct with the given {@link SamplingParams}.
+ * @param params sampling params in effect
+ * @throws IllegalArgumentException if the provided SamplingParams are not valid
+ */
+ public Sampler(SamplingParams params) throws IllegalArgumentException {
+ if (!params.validate()) {
+ throw new IllegalArgumentException("The provided SamplingParams are not valid!!");
+ }
+ this.samplingParams = params;
+ }
+
+ /**
+ * Check if this sampler would sample the input docIds, i.e. whether their
+ * number exceeds the sampling threshold.
+ */
+ public boolean shouldSample(ScoredDocIDs docIds) {
+ return docIds.size() > samplingParams.getSamplingThreshold();
+ }
+
+ /**
+ * Compute a sample set out of the input set, based on the {@link SamplingParams#getSampleRatio()}
+ * in effect. Subclasses can override to alter how the sample set is
+ * computed.
+ *
+ * If the input set is of size smaller than {@link SamplingParams#getMinSampleSize()},
+ * the input set is returned (no sampling takes place).
+ *
+ * Other than that, the returned set size will not be larger than {@link SamplingParams#getMaxSampleSize()}
+ * nor smaller than {@link SamplingParams#getMinSampleSize()}.
+ * @param docids
+ * full set of matching documents out of which a sample is needed.
+ */
+ public SampleResult getSampleSet(ScoredDocIDs docids) throws IOException {
+ if (!shouldSample(docids)) {
+ return new SampleResult(docids, 1d);
+ }
+
+ int actualSize = docids.size();
+ int sampleSetSize = (int) (actualSize * samplingParams.getSampleRatio());
+ sampleSetSize = Math.max(sampleSetSize, samplingParams.getMinSampleSize());
+ sampleSetSize = Math.min(sampleSetSize, samplingParams.getMaxSampleSize());
+
+ int[] sampleSet = null;
+ try {
+ sampleSet = RandomSample.repeatableSample(docids, actualSize,
+ sampleSetSize);
+ } catch (IOException e) {
+ if (logger.isLoggable(Level.WARNING)) {
+ logger.log(Level.WARNING, "sampling failed: "+e.getMessage()+" - falling back to no sampling!", e);
+ }
+ return new SampleResult(docids, 1d);
+ }
+
+ ScoredDocIDs sampled = ScoredDocIdsUtils.createScoredDocIDsSubset(docids,
+ sampleSet);
+ if (logger.isLoggable(Level.FINEST)) {
+ logger.finest("******************** " + sampled.size());
+ }
+ return new SampleResult(sampled, sampled.size()/(double)docids.size());
+ }
+
+ /**
+ * Get a fixer of sample facet accumulation results. The default
+ * implementation returns a {@link TakmiSampleFixer}.
+ * Note two major differences between this class and {@link SamplingWrapper}:
+ *
+ * Note: sampling accumulation (accumulation over a sampled set of the results)
+ * does not guarantee accurate values for
+ * {@link FacetResult#getNumValidDescendants()} &
+ * {@link FacetResultNode#getResidue()}.
+ *
+ * @see Sampler
+ * @lucene.experimental
+ */
+public class SamplingAccumulator extends StandardFacetsAccumulator {
+
+ private double samplingRatio = -1d;
+ private final Sampler sampler;
+
+ /**
+ * Construct with a given sampler, search params, and custom array allocators.
+ */
+ public SamplingAccumulator(
+ Sampler sampler,
+ FacetSearchParams searchParams,
+ IndexReader indexReader, TaxonomyReader taxonomyReader,
+ IntArrayAllocator intArrayAllocator,
+ FloatArrayAllocator floatArrayAllocator) {
+ super(searchParams, indexReader, taxonomyReader, intArrayAllocator,
+ floatArrayAllocator);
+ this.sampler = sampler;
+ }
+
+ /**
+ * Construct with a given sampler and search params, using default array allocators.
+ */
+ public SamplingAccumulator(
+ Sampler sampler,
+ FacetSearchParams searchParams,
+ IndexReader indexReader, TaxonomyReader taxonomyReader) {
+ super(searchParams, indexReader, taxonomyReader);
+ this.sampler = sampler;
+ }
+
+ @Override
+ public List<FacetResult> accumulate(ScoredDocIDs docids) throws IOException {
+ * CategoryPath is designed to reduce the number of object allocations, in two
+ * ways: First, it keeps the components internally in two arrays, rather than
+ * keeping individual strings. Second, it allows reusing the same CategoryPath
+ * object (which can be clear()ed and have new components add()ed again) and
+ * reusing add()'s parameter (which can be a reusable object, not just a string).
+ *
+ * @lucene.experimental
+ */
+public class CategoryPath implements Serializable, Cloneable, Comparable<CategoryPath> {
+
+ private char[] chars;
+ private short[] ends;
+ private short ncomponents;
+
+ /**
+ * Add the given component to the end of the path.
+ *
+ * Note that when a String object is passed to this method, a reference to
+ * it is not saved (rather, its content is copied), so the original String
+ * object can be gc'ed. To reduce the number of garbage objects, you
+ * can pass a mutable CharBuffer instead of an immutable String to this
+ * method.
+ */
+ public void add(CharSequence component) {
+ // Set the new end, increasing the "ends" array sizes if necessary:
+ if (ncomponents >= ends.length) {
+ short[] newends = new short[(ends.length + 1) * 2];
+ System.arraycopy(ends, 0, newends, 0, ends.length);
+ ends = newends;
+ }
+ short prevend = (ncomponents == 0) ? 0 : ends[ncomponents - 1];
+ int cmplen = component.length();
+ ends[ncomponents] = (short) (prevend + cmplen);
+
+ // Copy the new component's characters, increasing the "chars" array
+ // sizes if necessary:
+ if (ends[ncomponents] > chars.length) {
+ char[] newchars = new char[ends[ncomponents] * 2];
+ System.arraycopy(chars, 0, newchars, 0, chars.length);
+ chars = newchars;
+ }
+ for (int i = 0; i < cmplen; i++) {
+ chars[prevend++] = component.charAt(i);
+ }
+
+ ncomponents++;
+ }
+
+ /**
+ * Empty the CategoryPath object, so that it has zero components. The
+ * capacity of the object (see {@link #capacityChars()} and
+ * {@link #capacityComponents()}) is not reduced, so that the object can be
+ * reused without frequent reallocations.
+ */
+ public void clear() {
+ ncomponents = 0;
+ }
+
+ /**
+ * Build a string representation of the path, with its components separated
+ * by the given delimiter character. The resulting string is appended to a
+ * given Appendable, e.g., a StringBuilder, CharBuffer or Writer.
+ *
+ * Note that the two cases of zero components and one component with zero
+ * length produce indistinguishable results (both of them append nothing).
+ * This is normally not a problem, because components should not normally
+ * have zero lengths.
+ *
+ * An IOException can be thrown if the given Appendable's append() throws
+ * this exception.
+ */
+ public void appendTo(Appendable out, char delimiter) throws IOException {
+ if (ncomponents == 0) {
+ return; // just append nothing...
+ }
+ for (int i = 0; i < ends[0]; i++) {
+ out.append(chars[i]);
+ }
+ for (int j = 1; j < ncomponents; j++) {
+ out.append(delimiter);
+ for (int i = ends[j - 1]; i < ends[j]; i++) {
+ out.append(chars[i]);
+ }
+ }
+ }
+
+ /**
+ * Like {@link #appendTo(Appendable, char)}, but takes only a prefix of the
+ * path, rather than the whole path.
+ *
+ * If the given prefix length is negative or bigger than the path's actual
+ * length, the whole path is taken.
+ */
+ public void appendTo(Appendable out, char delimiter, int prefixLen)
+ throws IOException {
+ if (prefixLen < 0 || prefixLen > ncomponents) {
+ prefixLen = ncomponents;
+ }
+ if (prefixLen == 0) {
+ return; // just append nothing...
+ }
+ for (int i = 0; i < ends[0]; i++) {
+ out.append(chars[i]);
+ }
+ for (int j = 1; j < prefixLen; j++) {
+ out.append(delimiter);
+ for (int i = ends[j - 1]; i < ends[j]; i++) {
+ out.append(chars[i]);
+ }
+ }
+ }
+
+ /**
+ * like {@link #appendTo(Appendable, char)}, but takes only a part of the
+ * path, rather than the whole path.
+ *
+ *
+ * Note that the two cases of zero components and one component with zero
+ * length produce indistinguishable results (both of them return an empty
+ * string). This is normally not a problem, because components should not
+ * normally have zero lengths.
+ */
+ public String toString(char delimiter) {
+ if (ncomponents == 0) {
+ return "";
+ }
+ StringBuilder sb = new StringBuilder(ends[ncomponents - 1]
+ + (ncomponents - 1));
+ try {
+ this.appendTo(sb, delimiter);
+ } catch (IOException e) {
+ // can't happen, because StringBuilder.append() never actually
+ // throws an exception!
+ }
+ return sb.toString();
+ }
+
+ /**
+ * This method, an override of {@link Object#toString()},
+ * is provided to allow simple printing of a CategoryPath, for debugging
+ * purposes. When possible, it is recommended to avoid using it, and rather,
+ * if you want to output the path with its components separated by a
+ * delimiter character, specify the delimiter explicitly, with
+ * {@link #toString(char)}.
+ */
+ @Override
+ public String toString() {
+ return toString('/');
+ }
+
+ /**
+ * Like {@link #toString(char)}, but takes only a prefix with a given number
+ * of components, rather than the whole path.
+ *
+ * If the given length is negative or bigger than the path's actual length,
+ * the whole path is taken.
+ */
+ public String toString(char delimiter, int prefixLen) {
+ if (prefixLen < 0 || prefixLen > ncomponents) {
+ prefixLen = ncomponents;
+ }
+ if (prefixLen == 0) {
+ return "";
+ }
+ StringBuilder sb = new StringBuilder(ends[prefixLen - 1]
+ + (prefixLen - 1));
+ try {
+ this.appendTo(sb, delimiter, prefixLen);
+ } catch (IOException e) {
+ // can't happen, because sb.append() never actually throws an
+ // exception
+ }
+ return sb.toString();
+ }
+
+ /**
+ * Like {@link #toString(char)}, but takes only a part of the path, rather
+ * than the whole path.
+ */
+
+ /**
+ * Copy a given number of path components, separated by a given separator
+ * character, into a given character array.
+ *
+ * This method returns the number of characters written to the array.
+ *
+ * @param outputBuffer
+ * The destination character array.
+ * @param outputBufferStart
+ * The first location to write in the output array.
+ * @param numberOfComponentsToCopy
+ * The number of path components to write to the destination
+ * buffer.
+ * @param separatorChar
+ * The separator inserted between every pair of path components
+ * in the output buffer.
+ * @see #charsNeededForFullPath()
+ */
+ public int copyToCharArray(char[] outputBuffer, int outputBufferStart,
+ int numberOfComponentsToCopy, char separatorChar) {
+ if (numberOfComponentsToCopy == 0) {
+ return 0;
+ }
+ if (numberOfComponentsToCopy < 0
+ || numberOfComponentsToCopy > ncomponents) {
+ numberOfComponentsToCopy = ncomponents;
+ }
+ int outputBufferInitialStart = outputBufferStart; // for calculating
+ // chars copied.
+ int sourceStart = 0;
+ int sourceLength = ends[0];
+ for (int component = 0; component < numberOfComponentsToCopy; component++) {
+ if (component > 0) {
+ sourceStart = ends[component - 1];
+ sourceLength = ends[component] - sourceStart;
+ outputBuffer[outputBufferStart++] = separatorChar;
+ }
+ System.arraycopy(chars, sourceStart, outputBuffer,
+ outputBufferStart, sourceLength);
+ outputBufferStart += sourceLength;
+ }
+ return outputBufferStart - outputBufferInitialStart;
+ }
+
+ /**
+ * Returns the number of characters required to represent this entire
+ * category path, if written using
+ * {@link #copyToCharArray(char[], int, int, char)} or
+ * {@link #appendTo(Appendable, char)}. This includes the number of
+ * characters in all the components, plus the number of separators between
+ * them (each one character in the aforementioned methods).
+ */
+ public int charsNeededForFullPath() {
+ if (ncomponents == 0) {
+ return 0;
+ }
+ return ends[ncomponents - 1] + ncomponents - 1;
+ }
+
+ /**
+ * Construct a new CategoryPath object, given a single string with
+ * components separated by a given delimiter character.
+ *
+ * The initial capacity of the constructed object will be exactly what is
+ * needed to hold the given path. This fact is convenient when creating a
+ * temporary object that will not be reused later.
+ */
+ public CategoryPath(String pathString, char delimiter) {
+ if (pathString.length() == 0) {
+ ncomponents = 0;
+ chars = new char[0];
+ ends = new short[0];
+ return;
+ }
+
+ // This constructor is often used for creating a temporary object
+ // (one which will not be reused to hold multiple paths), so we want
+ // to do our best to allocate exactly the needed size - not less (to
+ // avoid reallocation) and not more (so as not to waste space).
+ // To do this, we unfortunately need to make an additional pass on the
+ // given string:
+ int nparts = 1;
+ for (int i = pathString.indexOf(delimiter); i >= 0; i = pathString
+ .indexOf(delimiter, i + 1)) {
+ nparts++;
+ }
+
+ ends = new short[nparts];
+ chars = new char[pathString.length() - nparts + 1];
+ ncomponents = 0;
+
+ add(pathString, delimiter);
+ }
+
+ /**
+ * Add the given components to the end of the path. The components are given
+ * in a single string, separated by a given delimiter character. If the
+ * given string is empty, it is assumed to refer to the root (empty)
+ * category, and nothing is added to the path (rather than adding a single
+ * empty component).
+ *
+ * Note that when a String object is passed to this method, a reference to
+ * it is not saved (rather, its content is copied), so the original String
+ * object can be gc'ed. To reduce the number of garbage objects, you
+ * can pass a mutable CharBuffer instead of an immutable String to this
+ * method.
+ */
+ public void add(CharSequence pathString, char delimiter) {
+ int len = pathString.length();
+ if (len == 0) {
+ return; // assume root category meant, so add nothing.
+ }
+ short pos = (ncomponents == 0) ? 0 : ends[ncomponents - 1];
+ for (int i = 0; i < len; i++) {
+ char c = pathString.charAt(i);
+ if (c == delimiter) {
+ if (ncomponents >= ends.length) {
+ short[] newends = new short[(ends.length + 1) * 2];
+ System.arraycopy(ends, 0, newends, 0, ends.length);
+ ends = newends;
+ }
+ ends[ncomponents++] = pos;
+ } else {
+ if (pos >= chars.length) {
+ char[] newchars = new char[(chars.length + 1) * 2];
+ System.arraycopy(chars, 0, newchars, 0, chars.length);
+ chars = newchars;
+ }
+ chars[pos++] = c;
+ }
+ }
+
+ // Don't forget to count the last component!
+ if (ncomponents >= ends.length) {
+ short[] newends = new short[(ends.length + 1) * 2];
+ System.arraycopy(ends, 0, newends, 0, ends.length);
+ ends = newends;
+ }
+ ends[ncomponents++] = pos;
+ }
+
+ /**
+ * Construct a new CategoryPath object, copying an existing path given as an
+ * array of strings.
+ *
+ * The new object occupies exactly the space it needs, without any spare
+ * capacity. This is the expected behavior in the typical use case, where
+ * this constructor is used to create a temporary object which is never
+ * reused.
+ */
+ public CategoryPath(CharSequence... components) {
+ this.ncomponents = (short) components.length;
+ this.ends = new short[ncomponents];
+ if (ncomponents > 0) {
+ this.ends[0] = (short) components[0].length();
+ for (int i = 1; i < ncomponents; i++) {
+ this.ends[i] = (short) (this.ends[i - 1] + components[i]
+ .length());
+ }
+ this.chars = new char[this.ends[ncomponents - 1]];
+ CharSequence cs = components[0];
+ if (cs instanceof String) {
+ ((String) cs).getChars(0, cs.length(), this.chars, 0);
+ } else {
+ for (int j = 0, k = cs.length(); j < k; j++) {
+ this.chars[j] = cs.charAt(j);
+ }
+ }
+ for (int i = 1; i < ncomponents; i++) {
+ cs = components[i];
+ int offset = this.ends[i - 1];
+ if (cs instanceof String) {
+ ((String) cs).getChars(0, cs.length(), this.chars, offset);
+ } else {
+ for (int j = 0, k = cs.length(); j < k; j++) {
+ this.chars[j + offset] = cs.charAt(j);
+ }
+ }
+ }
+ } else {
+ this.chars = new char[0];
+ }
+ }
+
+ /**
+ * Construct a new CategoryPath object, copying the path given in an
+ * existing CategoryPath object.
+ *
+ * This copy-constructor is handy when you need to save a reference to a
+ * CategoryPath (e.g., when it serves as a key to a hash-table), but cannot
+ * save a reference to the original object because its contents can be
+ * changed later by the user. Copying the contents into a new object is a
+ * solution.
+ *
+ * This constructor does not copy the capacity (spare buffer size)
+ * of the existing CategoryPath. Rather, the new object occupies exactly the
+ * space it needs, without any spare. This is the expected behavior in the
+ * typical use case outlined in the previous paragraph.
+ */
+ public CategoryPath(CategoryPath existing) {
+ ncomponents = existing.ncomponents;
+ if (ncomponents == 0) {
+ chars = new char[0];
+ ends = new short[0];
+ return;
+ }
+
+ chars = new char[existing.ends[ncomponents - 1]];
+ System.arraycopy(existing.chars, 0, chars, 0, chars.length);
+ ends = new short[ncomponents];
+ System.arraycopy(existing.ends, 0, ends, 0, ends.length);
+ }
+
+ /**
+ * Construct a new CategoryPath object, copying a prefix with the given
+ * number of components of the path given in an existing CategoryPath
+ * object.
+ *
+ * If the given length is negative or bigger than the given path's actual
+ * length, the full path is taken.
+ *
+ * This constructor is often convenient for creating a temporary object with
+ * a path's prefix, but this practice is wasteful, and therefore
+ * inadvisable. Rather, the application should be written in a way that
+ * allows considering only a prefix of a given path, without needing to make
+ * a copy of that path.
+ */
+ public CategoryPath(CategoryPath existing, int prefixLen) {
+ if (prefixLen < 0 || prefixLen > existing.ncomponents) {
+ ncomponents = existing.ncomponents;
+ } else {
+ ncomponents = (short) prefixLen;
+ }
+ if (ncomponents == 0) {
+ chars = new char[0];
+ ends = new short[0];
+ return;
+ }
+
+ chars = new char[existing.ends[ncomponents - 1]];
+ System.arraycopy(existing.chars, 0, chars, 0, chars.length);
+ ends = new short[ncomponents];
+ System.arraycopy(existing.ends, 0, ends, 0, ends.length);
+ }
+
+ @Override
+ public Object clone() {
+ return new CategoryPath(this);
+ }
+
+ /**
+ * Compare the given CategoryPath to another one. For two category paths to
+ * be considered equal, only the path they contain needs to be identical. The
+ * unused capacity of the objects is not considered in the comparison.
+ */
+ @Override
+ public boolean equals(Object obj) {
+ if (obj instanceof CategoryPath) {
+ CategoryPath other = (CategoryPath) obj;
+ if (other.ncomponents != this.ncomponents) {
+ return false;
+ }
+ // Unfortunately, Arrays.equals() can only compare entire arrays,
+ // and in our case we potentially have unused parts of the arrays
+ // that must not be compared... I wish that some future version
+ // of Java would add offset and length parameters to Arrays.equals
+ // (sort of like System.arraycopy()'s parameters).
+ if (ncomponents == 0) {
+ return true; // nothing to compare...
+ }
+ for (int i = 0; i < ncomponents; i++) {
+ if (this.ends[i] != other.ends[i]) {
+ return false;
+ }
+ }
+ int len = ends[ncomponents - 1];
+ for (int i = 0; i < len; i++) {
+ if (this.chars[i] != other.chars[i]) {
+ return false;
+ }
+ }
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Test whether this object is a descendant of another CategoryPath. This is
+ * true if the other CategoryPath is a prefix of this one.
+ */
+ public boolean isDescendantOf(CategoryPath other) {
+ if (this.ncomponents < other.ncomponents) {
+ return false;
+ }
+ int j = 0;
+ for (int i = 0; i < other.ncomponents; i++) {
+ if (ends[i] != other.ends[i]) {
+ return false;
+ }
+ for (; j < ends[i]; j++) {
+ if (this.chars[j] != other.chars[j]) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Calculate a hashCode for this path, used when a CategoryPath serves as a
+ * hash-table key. If two objects are equal(), their hashCodes need to be
+ * equal, so like in equals(), hashCode does not consider unused portions of
+ * the internal buffers in its calculation.
+ *
+ * The hash function used is modeled after Java's String.hashCode() - a
+ * simple multiplicative hash function with the multiplier 31. The same hash
+ * function also appeared in Kernighan & Ritchie's second edition of
+ * "The C Programming Language" (1988).
+ */
+ @Override
+ public int hashCode() {
+ if (ncomponents == 0) {
+ return 0;
+ }
+ int hash = ncomponents;
+ // Unfortunately, Arrays.hashCode() can only calculate a hash code
+ // for an entire array, and in our case we potentially have unused
+ // parts of the arrays that must be ignored, so we must use our own loop
+ // over the characters. I wish that some future version of Java will
+ // add offset and length parameters to Arrays.hashCode (sort of like
+ // System.arraycopy()'s parameters).
+ for (int i = 0; i < ncomponents; i++) {
+ hash = hash * 31 + ends[i];
+ }
+ int len = ends[ncomponents - 1];
+ for (int i = 0; i < len; i++) {
+ hash = hash * 31 + chars[i];
+ }
+ return hash;
+ }
+
+ /**
+ * Like {@link #hashCode()}, but find the hash function of a prefix with the
+ * given number of components, rather than of the entire path.
+ */
+ public int hashCode(int prefixLen) {
+ if (prefixLen < 0 || prefixLen > ncomponents) {
+ prefixLen = ncomponents;
+ }
+ if (prefixLen == 0) {
+ return 0;
+ }
+ int hash = prefixLen;
+ for (int i = 0; i < prefixLen; i++) {
+ hash = hash * 31 + ends[i];
+ }
+ int len = ends[prefixLen - 1];
+ for (int i = 0; i < len; i++) {
+ hash = hash * 31 + chars[i];
+ }
+ return hash;
+ }
+
+ /**
+ * Calculate a 64-bit hash function for this path. Unlike
+ * {@link #hashCode()}, this method is not part of the Java standard, and is
+ * only used if explicitly called by the user.
+ *
+ * If two objects are equal(), their hash codes need to be equal, so like in
+ * {@link #equals(Object)}, longHashCode does not consider unused portions
+ * of the internal buffers in its calculation.
+ *
+ * The hash function used is a simple multiplicative hash function, with the
+ * multiplier 65599. While Java's standard multiplier 31 (used in
+ * {@link #hashCode()}) gives a good distribution for ASCII strings, it
+ * turns out that for foreign-language strings (with 16-bit characters) it
+ * gives too many collisions, and a bigger multiplier produces fewer
+ * collisions in this case.
+ */
+ public long longHashCode() {
+ if (ncomponents == 0) {
+ return 0;
+ }
+ long hash = ncomponents;
+ for (int i = 0; i < ncomponents; i++) {
+ hash = hash * 65599 + ends[i];
+ }
+ int len = ends[ncomponents - 1];
+ for (int i = 0; i < len; i++) {
+ hash = hash * 65599 + chars[i];
+ }
+ return hash;
+ }
+
+ /**
+ * Like {@link #longHashCode()}, but find the hash function of a prefix with
+ * the given number of components, rather than of the entire path.
+ */
+ public long longHashCode(int prefixLen) {
+ if (prefixLen < 0 || prefixLen > ncomponents) {
+ prefixLen = ncomponents;
+ }
+ if (prefixLen == 0) {
+ return 0;
+ }
+ long hash = prefixLen;
+ for (int i = 0; i < prefixLen; i++) {
+ hash = hash * 65599 + ends[i];
+ }
+ int len = ends[prefixLen - 1];
+ for (int i = 0; i < len; i++) {
+ hash = hash * 65599 + chars[i];
+ }
+ return hash;
+ }
+
+ /**
+ * Write out a serialized (as a character sequence) representation of the
+ * path to a given Appendable (e.g., a StringBuilder, CharBuffer, Writer, or
+ * something similar).
+ *
+ * This method may throw an IOException if the given Appendable throws this
+ * exception while appending.
+ */
+ public void serializeAppendTo(Appendable out) throws IOException {
+ // Note that we use the fact that ncomponents and ends[] are shorts,
+ // so we can write them as chars:
+ out.append((char) ncomponents);
+ if (ncomponents == 0) {
+ return;
+ }
+ for (int i = 0; i < ncomponents; i++) {
+ out.append((char) ends[i]);
+ }
+ int usedchars = ends[ncomponents - 1];
+ for (int i = 0; i < usedchars; i++) {
+ out.append(chars[i]);
+ }
+ }
+
+ /**
+ * Just like {@link #serializeAppendTo(Appendable)}, but writes only a
+ * prefix of the CategoryPath.
+ */
+ public void serializeAppendTo(int prefixLen, Appendable out)
+ throws IOException {
+ if (prefixLen < 0 || prefixLen > ncomponents) {
+ prefixLen = ncomponents;
+ }
+ // Note that we use the fact that ncomponents and ends[] are shorts,
+ // so we can write them as chars:
+ out.append((char) prefixLen);
+ if (prefixLen == 0) {
+ return;
+ }
+ for (int i = 0; i < prefixLen; i++) {
+ out.append((char) ends[i]);
+ }
+ int usedchars = ends[prefixLen - 1];
+ for (int i = 0; i < usedchars; i++) {
+ out.append(chars[i]);
+ }
+ }
+
+ /**
+ * Set a CategoryPath from a character-sequence representation written by
+ * {@link #serializeAppendTo(Appendable)}.
+ *
+ * Reading starts at the given offset into the given character sequence, and
+ * the offset right after the end of this path is returned.
+ */
+ public int setFromSerialized(CharSequence buffer, int offset) {
+ ncomponents = (short) buffer.charAt(offset++);
+ if (ncomponents == 0) {
+ return offset;
+ }
+
+ if (ncomponents >= ends.length) {
+ ends = new short[Math.max(ends.length * 2, ncomponents)];
+ }
+ for (int i = 0; i < ncomponents; i++) {
+ ends[i] = (short) buffer.charAt(offset++);
+ }
+
+ int usedchars = ends[ncomponents - 1];
+ if (usedchars > chars.length) {
+ chars = new char[Math.max(chars.length * 2, usedchars)];
+ }
+ for (int i = 0; i < usedchars; i++) {
+ chars[i] = buffer.charAt(offset++);
+ }
+
+ return offset;
+ }
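+
+ // Illustrative round-trip (an editorial sketch, not part of the original
+ // patch; uses the (String, char) constructor defined earlier in this class):
+ //   CategoryPath p = new CategoryPath("a/b/c", '/');
+ //   StringBuilder sb = new StringBuilder();
+ //   p.serializeAppendTo(sb);
+ //   CategoryPath q = new CategoryPath("", '/');
+ //   q.setFromSerialized(sb, 0); // q now equals p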
+
+ /**
+ * Check whether the current path is identical to the one serialized (with
+ * {@link #serializeAppendTo(Appendable)}) in the given buffer, at the given
+ * offset.
+ */
+ public boolean equalsToSerialized(CharSequence buffer, int offset) {
+ int n = (short) buffer.charAt(offset++);
+ if (ncomponents != n) {
+ return false;
+ }
+ if (ncomponents == 0) {
+ return true;
+ }
+ for (int i = 0; i < ncomponents; i++) {
+ if (ends[i] != (short) buffer.charAt(offset++)) {
+ return false;
+ }
+ }
+ int usedchars = ends[ncomponents - 1];
+ for (int i = 0; i < usedchars; i++) {
+ if (chars[i] != buffer.charAt(offset++)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Just like {@link #equalsToSerialized(CharSequence, int)}, but compare to
+ * a prefix of the CategoryPath, instead of the whole CategoryPath.
+ */
+ public boolean equalsToSerialized(int prefixLen, CharSequence buffer,
+ int offset) {
+ if (prefixLen < 0 || prefixLen > ncomponents) {
+ prefixLen = ncomponents;
+ }
+ int n = (short) buffer.charAt(offset++);
+ if (prefixLen != n) {
+ return false;
+ }
+ if (prefixLen == 0) {
+ return true;
+ }
+ for (int i = 0; i < prefixLen; i++) {
+ if (ends[i] != (short) buffer.charAt(offset++)) {
+ return false;
+ }
+ }
+ int usedchars = ends[prefixLen - 1];
+ for (int i = 0; i < usedchars; i++) {
+ if (chars[i] != buffer.charAt(offset++)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * This method calculates the hash value of a path that has been written to
+ * (using {@link #serializeAppendTo(Appendable)}) a character buffer. It is
+ * guaranteed that the value returned is identical to that which
+ * {@link #hashCode()} would have produced for the original object before it
+ * was serialized.
+ */
+ public static int hashCodeOfSerialized(CharSequence buffer, int offset) {
+ // Note: the algorithm here must be identical to that of hashCode(),
+ // in order that they produce identical results!
+ int ncomponents = (short) buffer.charAt(offset++);
+ if (ncomponents == 0) {
+ return 0;
+ }
+ int hash = ncomponents;
+ for (int i = 0; i < ncomponents; i++) {
+ hash = hash * 31 + buffer.charAt(offset++);
+ }
+ int len = buffer.charAt(offset - 1);
+ for (int i = 0; i < len; i++) {
+ hash = hash * 31 + buffer.charAt(offset++);
+ }
+ return hash;
+ }
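Continuing the sketch above: the serialized form can be compared and hashed in place, without materializing a CategoryPath, which is what makes it usable as a hash-table key.

    boolean same = cp.equalsToSerialized(buf, 0); // true for the buffer above
    // By the guarantee documented above, these two values are identical:
    int h1 = cp.hashCode();
    int h2 = CategoryPath.hashCodeOfSerialized(buf, 0);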
+
+ /**
+ * Serializes the content of this CategoryPath to a byte stream, using UTF-8
+ * encoding to convert characters to bytes, and treating the ends as 16-bit
+ * characters.
+ *
+ * @param osw
+ * The output byte stream.
+ * @throws IOException
+ * If there are encoding errors.
+ */
+ // TODO (Facet): consolidate all de/serialize method names to
+ // serialize() and unserialize()
+ public void serializeToStreamWriter(OutputStreamWriter osw)
+ throws IOException {
+ osw.write(this.ncomponents);
+ if (this.ncomponents <= 0) {
+ return;
+ }
+ for (int j = 0; j < this.ncomponents; j++) {
+ osw.write(this.ends[j]);
+ }
+ osw.write(this.chars, 0, this.ends[this.ncomponents - 1]);
+ }
+
+ /**
+ * Deserializes the content of this CategoryPath from a byte stream, using
+ * UTF-8 decoding to convert bytes back to characters, and treating the ends
+ * as 16-bit characters.
+ *
+ * @param isr
+ * The input stream.
+ * @throws IOException
+ * If there are encoding errors.
+ */
+ public void deserializeFromStreamReader(InputStreamReader isr)
+ throws IOException {
+ this.ncomponents = (short) isr.read();
+ if (this.ncomponents <= 0) {
+ return;
+ }
+ if (this.ends == null || this.ends.length < this.ncomponents) {
+ this.ends = new short[this.ncomponents];
+ }
+ for (int j = 0; j < this.ncomponents; j++) {
+ this.ends[j] = (short) isr.read();
+ }
+ if (this.chars == null
+ || this.ends[this.ncomponents - 1] > chars.length) {
+ this.chars = new char[this.ends[this.ncomponents - 1]];
+ }
+ isr.read(this.chars, 0, this.ends[this.ncomponents - 1]);
+ }
+
+ private void writeObject(java.io.ObjectOutputStream out)
+ throws IOException {
+ OutputStreamWriter osw = new OutputStreamWriter(out, "UTF-8");
+ this.serializeToStreamWriter(osw);
+ osw.flush();
+ }
+
+ private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException {
+ InputStreamReader isr = new InputStreamReader(in, "UTF-8");
+ this.deserializeFromStreamReader(isr);
+ }
+
+ /**
+ * Compares this CategoryPath with the other CategoryPath for lexicographic
+ * order.
+ * Returns a negative integer, zero, or a positive integer as this
+ * CategoryPath lexicographically precedes, is equal to, or follows
+ * the other CategoryPath.
+ */
+ public int compareTo(CategoryPath other) {
+ int minlength = (this.length() < other.length()) ? this.length() : other.length();
+ int ch = 0;
+ for (int co = 0 ; co < minlength; co++) {
+ if (this.ends[co] <= other.ends[co]) {
+ for ( ; ch < this.ends[co] ; ch++) {
+ if (this.chars[ch] != other.chars[ch]) {
+ return this.chars[ch] - other.chars[ch];
+ }
+ }
+ if (this.ends[co] < other.ends[co]) {
+ return -1;
+ }
+ } else /* this.ends[co] > other.ends[co] */ {
+ for ( ; ch < other.ends[co] ; ch++) {
+ if (this.chars[ch] != other.chars[ch]) {
+ return this.chars[ch] - other.chars[ch];
+ }
+ }
+ return +1;
+ }
+ }
+ // one is a prefix of the other
+ return this.length() - other.length();
+ }
+}
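A small illustration of the compareTo() contract above (hypothetical driver code): a path that is a proper prefix of another sorts first.

    CategoryPath a = new CategoryPath("author");
    CategoryPath b = new CategoryPath("author", "mark twain");
    // a.compareTo(b) < 0 and b.compareTo(a) > 0: a proper prefix sorts first.
    // Paths differing in a character compare by that character, e.g.
    // new CategoryPath("author", "kafka") precedes b.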
diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyReader.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyReader.java
new file mode 100644
index 00000000000..2d9649f399f
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyReader.java
@@ -0,0 +1,274 @@
+package org.apache.lucene.facet.taxonomy;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Map;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * TaxonomyReader is the read-only interface with which the faceted-search
+ * library uses the taxonomy during search time.
+ *
+ * A TaxonomyReader holds a list of categories. Each category has a serial
+ * number which we call an "ordinal", and a hierarchical "path" name: the
+ * ordinal is an integer starting at 0 and growing as categories are added;
+ * the path is a CategoryPath object giving the category's position in the
+ * hierarchy.
+ *
+ * An implementation must allow multiple readers to be active concurrently
+ * with a single writer. Readers follow so-called "point in time" semantics,
+ * i.e., a TaxonomyReader object will only see taxonomy entries which were
+ * available at the time it was created. What the writer writes is only
+ * available to (new) readers after the writer's commit() is called.
+ *
+ * In faceted search, two separate indices are used: the main Lucene index,
+ * and the taxonomy. Because the main index refers to the categories listed
+ * in the taxonomy, it is important to open the taxonomy *after* opening the
+ * main index, and it is also necessary to refresh() the taxonomy after
+ * reopen()ing the main index.
+ *
+ * This order is important, otherwise it would be possible for the main index
+ * to refer to a category which is not yet visible in the old snapshot of
+ * the taxonomy. Note that it is indeed fine for the taxonomy to be opened
+ * after the main index - even a long time after. The reason is that once
+ * a category is added to the taxonomy, it can never be changed or deleted,
+ * so there is no danger of a "too new" taxonomy being inconsistent with
+ * an older index.
+ *
+ * @lucene.experimental
+ */
+public interface TaxonomyReader extends Closeable {
+
+ /**
+ * The root category (the category with the empty path) always has the
+ * ordinal 0, to which we give a name ROOT_ORDINAL.
+ * getOrdinal() of an empty path will always return ROOT_ORDINAL, and
+ * getCategory(ROOT_ORDINAL) will return the empty path.
+ */
+ public final static int ROOT_ORDINAL = 0;
+
+ /**
+ * Ordinals are always non-negative, so a negative ordinal can be used to
+ * signify an error. Methods here return INVALID_ORDINAL (-1) in this case.
+ */
+ public final static int INVALID_ORDINAL = -1;
+
+ /**
+ * getOrdinal() returns the ordinal of the category given as a path.
+ * The ordinal is the category's serial number, an integer which starts
+ * with 0 and grows as more categories are added (note that once a category
+ * is added, it can never be deleted).
+ *
+ * If the given category wasn't found in the taxonomy, INVALID_ORDINAL is
+ * returned.
+ */
+ public int getOrdinal(CategoryPath categoryPath) throws IOException;
+
+ /**
+ * getPath() returns the path name of the category with the given
+ * ordinal. The path is returned as a new CategoryPath object - to
+ * reuse an existing object, use {@link #getPath(int, CategoryPath)}.
+ *
+ * A null is returned if a category with the given ordinal does not exist.
+ */
+ public CategoryPath getPath(int ordinal) throws IOException;
+
+ /**
+ * getPath() returns the path name of the category with the given
+ * ordinal. The path is written to the given CategoryPath object (which
+ * is cleared first).
+ *
+ * If a category with the given ordinal does not exist, the given
+ * CategoryPath object is not modified, and the method returns
+ * {@code false}. Otherwise it returns {@code true}.
+ */
+ public boolean getPath(int ordinal, CategoryPath result) throws IOException;
+
+ /**
+ * refresh() re-reads the current contents of the taxonomy, so that this
+ * reader object will be able to see newly added categories that were
+ * written to the taxonomy since it was opened or last refreshed.
+ *
+ * If there were no changes since this instance was opened or last refreshed,
+ * then this call does nothing. Note, however, that this is still a relatively
+ * slow method (as it needs to verify whether there have been any changes on
+ * disk to the taxonomy), so it should not be called too often needlessly. In
+ * faceted search, the taxonomy reader's refresh() should be called only after
+ * a reopen() of the main index.
+ *
+ * It should be noted that refresh() is similar in purpose to
+ * IndexReader.reopen(), but the two methods behave differently. refresh()
+ * refreshes the existing TaxonomyReader object, rather than opening a new one
+ * in addition to the old one as reopen() does. The reason is that in a
+ * taxonomy, one can only add new categories and cannot modify or delete
+ * existing categories; therefore, there is no reason to keep an old snapshot
+ * of the taxonomy open - refreshing the taxonomy to the newest data and using
+ * this new snapshot in all threads (whether new or old) is fine. This saves
+ * us needing to keep multiple copies of the taxonomy open in memory.
+ */
+ public void refresh() throws IOException;
+
+ /**
+ * getParent() returns the ordinal of the parent category of the category
+ * with the given ordinal.
+ *
+ * When a category is specified as a path name, finding the path of its
+ * parent is as trivial as dropping the last component of the path.
+ * getParent() is functionally equivalent to calling getPath() on the
+ * given ordinal, dropping the last component of the path, and then calling
+ * getOrdinal() to get an ordinal back. However, implementations are
+ * expected to provide a much more efficient implementation:
+ *
+ * getParent() should be a very quick method, as it is used during the
+ * facet aggregation process in faceted search. Implementations will most
+ * likely want to serve replies to this method from a pre-filled cache.
+ *
+ * If the given ordinal is the ROOT_ORDINAL, an INVALID_ORDINAL is returned.
+ * If the given ordinal is a top-level category, the ROOT_ORDINAL is returned.
+ * If an invalid ordinal is given (negative or beyond the last available
+ * ordinal), an ArrayIndexOutOfBoundsException is thrown. However, it is
+ * expected that getParent will only be called for ordinals which are
+ * already known to be in the taxonomy.
+ */
+ public int getParent(int ordinal) throws IOException;
+
+ /**
+ * getParentArray() returns an int array of size getSize() listing the
+ * ordinal of the parent category of each category in the taxonomy.
+ *
+ * The caller can hold on to the array it got indefinitely - it is
+ * guaranteed that no-one else will modify it. The other side of the
+ * same coin is that the caller must treat the array it got as read-only
+ * and not modify it, because other callers might have gotten the
+ * same array too (and getParent() calls might be answered from the
+ * same array).
+ *
+ * If you use getParentArray() instead of getParent(), remember that
+ * the array you got is (naturally) not modified after a refresh(),
+ * so you should always call getParentArray() again after a refresh().
+ *
+ * This method's function is similar to allocating an array of size
+ * getSize() and filling it with getParent() calls, but implementations
+ * are encouraged to implement it much more efficiently, with O(1)
+ * complexity. This can be done, for example, by the implementation
+ * already keeping the parents in an array, and just returning this
+ * array (without any allocation or copying) when requested.
+ */
+ public int[] getParentArray() throws IOException;
+
+ /**
+ * Equivalent representations of the taxonomy's parent info,
+ * used internally for efficient computation of facet results:
+ * "youngest child" and "oldest sibling"
+ */
+ public static interface ChildrenArrays {
+ /**
+ * getYoungestChildArray() returns an int array of size getSize()
+ * listing the ordinal of the youngest (highest numbered) child
+ * category of each category in the taxonomy. The value for a leaf
+ * category (a category without children) is
+ * {@code INVALID_ORDINAL}.
+ */
+ public int[] getYoungestChildArray();
+
+ /**
+ * getOlderSiblingArray() returns an int array of size getSize()
+ * listing for each category the ordinal of its immediate older
+ * sibling (the sibling in the taxonomy tree with the highest ordinal
+ * below that of the given ordinal). The value for a category with no
+ * older sibling is {@code INVALID_ORDINAL}.
+ */
+ public int[] getOlderSiblingArray();
+ }
+
+ /**
+ * getChildrenArrays() returns a {@link ChildrenArrays} object which can
+ * be used together with {@link #getParentArray()} to efficiently
+ * enumerate the children of any given category.
+ *
+ * The caller can hold on to the object it got indefinitely - it is
+ * guaranteed that no-one else will modify it. The other side of the
+ * same coin is that the caller must treat the object which it got (and
+ * the arrays it contains) as read-only and not modify it, because
+ * other callers might have gotten the same object too.
+ *
+ * Implementations should have O(getSize()) time for the first call or
+ * after a refresh(), but O(1) time for further calls. In neither case
+ * should there be a need to read new data from disk. These guarantees
+ * are most likely achieved by calculating this object (based on the
+ * getParentArray()) when first needed, and later (if the taxonomy was not
+ * refreshed) returning the same object (without any allocation or copying)
+ * when requested.
+ *
+ * The reason we have one method returning one object, rather than two
+ * methods returning two arrays, is to avoid race conditions in a multi-
+ * threaded application: We want to avoid the possibility of returning one
+ * new array and one old array, as those could not be used together.
+ */
+ public ChildrenArrays getChildrenArrays();
+
+ /**
+ * Retrieve user committed data.
+ * @see TaxonomyWriter#commit(Map)
+ */
+ public Map<String, String> getCommitUserData();
+
+ /**
+ * getSize() returns the number of categories in the taxonomy.
+ *
+ * Because categories are numbered consecutively starting with 0, it
+ * means the taxonomy contains ordinals 0 through getSize()-1.
+ *
+ * Note that the number returned by getSize() is often slightly higher
+ * than the number of categories inserted into the taxonomy; This is
+ * because when a category is added to the taxonomy, its ancestors
+ * are also added automatically (including the root, which always gets
+ * ordinal 0).
+ */
+ public int getSize();
+
+}
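A search-time sketch of this interface (it assumes the LuceneTaxonomyReader implementation added later in this patch, and a hypothetical Directory taxoDir holding the taxonomy):

    TaxonomyReader tr = new LuceneTaxonomyReader(taxoDir);
    int ord = tr.getOrdinal(new CategoryPath("author", "mark twain"));
    if (ord != TaxonomyReader.INVALID_ORDINAL) {
      // Walk from the category up to (but excluding) the root:
      for (int p = ord; p != TaxonomyReader.ROOT_ORDINAL; p = tr.getParent(p)) {
        System.out.println(tr.getPath(p).toString('/'));
      }
    }
    tr.close();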
diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyWriter.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyWriter.java
new file mode 100644
index 00000000000..6cc9ca862dd
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyWriter.java
@@ -0,0 +1,134 @@
+package org.apache.lucene.facet.taxonomy;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.index.IndexWriter;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * TaxonomyWriter is the interface which the faceted-search library uses
+ * to dynamically build the taxonomy at indexing time.
+ *
+ * Notes about concurrent access to the taxonomy:
+ *
+ * An implementation must allow multiple readers and a single writer to be
+ * active concurrently. Readers follow so-called "point in time" semantics,
+ * i.e., a reader object will only see taxonomy entries which were available
+ * at the time it was created. What the writer writes is only available to
+ * (new) readers after the writer's commit() is called.
+ *
+ * Faceted search keeps two indices - namely Lucene's main index, and this
+ * taxonomy index. When one or more readers are active concurrently with the
+ * writer, care must be taken to avoid an inconsistency between the state of
+ * these two indices: When writing to the indices, the taxonomy must always
+ * be committed to disk *before* the main index, because the main index
+ * refers to categories listed in the taxonomy.
+ * Such control can best be achieved by turning off the main index's
+ * "autocommit" feature, and explicitly calling commit() for both indices
+ * (first for the taxonomy, then for the main index).
+ * In old versions of Lucene (2.2 or earlier), when autocommit could not be
+ * turned off, a more complicated solution needed to be used. E.g., use
+ * some sort of (possibly inter-process) locking to ensure that a reader
+ * is being opened only right after both indices have been flushed (and
+ * before anything else is written to them).
+ *
+ * @lucene.experimental
+ */
+public interface TaxonomyWriter extends Closeable {
+
+ /**
+ * addCategory() adds a category with a given path name to the taxonomy,
+ * and returns its ordinal. If the category was already present in
+ * the taxonomy, its existing ordinal is returned.
+ *
+ * Before adding a category, addCategory() makes sure that all its
+ * ancestor categories exist in the taxonomy as well. As result, the
+ * ordinal of a category is guaranteed to be smaller then the ordinal of
+ * any of its descendants.
+ */
+ public int addCategory(CategoryPath categoryPath) throws IOException;
+
+ /**
+ * Calling commit() ensures that all the categories written so far are
+ * visible to a reader that is opened (or reopened) after that call.
+ * When the index is closed(), commit() is also implicitly done.
+ */
+ public void commit() throws IOException;
+
+ /**
+ * Like commit(), but also store properties with the index. These properties
+ * are retrievable by {@link TaxonomyReader#getCommitUserData}.
+ * See {@link IndexWriter#commit(Map)}.
+ */
+ public void commit(Map<String, String> commitUserData) throws IOException;
+
+ /**
+ * getParent() returns the ordinal of the parent category of the category
+ * with the given ordinal.
+ *
+ * When a category is specified as a path name, finding the path of its
+ * parent is as trivial as dropping the last component of the path.
+ * getParent() is functionally equivalent to calling getPath() on the
+ * given ordinal, dropping the last component of the path, and then calling
+ * getOrdinal() to get an ordinal back.
+ *
+ * If the given ordinal is the ROOT_ORDINAL, an INVALID_ORDINAL is returned.
+ * If the given ordinal is a top-level category, the ROOT_ORDINAL is returned.
+ * If an invalid ordinal is given (negative or beyond the last available
+ * ordinal), an ArrayIndexOutOfBoundsException is thrown. However, it is
+ * expected that getParent will only be called for ordinals which are
+ * already known to be in the taxonomy.
+ *
+ * TODO (Facet): instead of a getParent(ordinal) method, consider having a
+ * getCategory(categorypath, prefixlen) which is similar to addCategory
+ * except it doesn't add new categories; This method can be used to get
+ * the ordinals of all prefixes of the given category, and it can use
+ * exactly the same code and cache used by addCategory() so it means less code.
+ */
+ public int getParent(int ordinal) throws IOException;
+
+ /**
+ * getSize() returns the number of categories in the taxonomy.
+ *
+ * Because categories are numbered consecutively starting with 0, it
+ * means the taxonomy contains ordinals 0 through getSize()-1.
+ *
+ * Note that the number returned by getSize() is often slightly higher
+ * than the number of categories inserted into the taxonomy; This is
+ * because when a category is added to the taxonomy, its ancestors
+ * are also added automatically (including the root, which always gets
+ * ordinal 0).
+ */
+ public int getSize();
+
+}
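An indexing-time sketch of this interface (it assumes the LuceneTaxonomyWriter implementation added later in this patch, and a hypothetical Directory taxoDir):

    TaxonomyWriter tw = new LuceneTaxonomyWriter(taxoDir);
    int authorOrd = tw.addCategory(new CategoryPath("author"));
    int twainOrd = tw.addCategory(new CategoryPath("author", "mark twain"));
    // Ancestors are added first, so authorOrd < twainOrd is guaranteed.
    tw.commit(); // commit the taxonomy before committing the main index
    tw.close();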
diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/Consts.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/Consts.java
new file mode 100644
index 00000000000..9c53c48f6c7
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/Consts.java
@@ -0,0 +1,58 @@
+package org.apache.lucene.facet.taxonomy.lucene;
+
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.FieldSelectorResult;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @lucene.experimental
+ */
+abstract class Consts {
+
+ static final String FULL = "$full_path$";
+ static final String FIELD_PAYLOADS = "$payloads$";
+ static final String PAYLOAD_PARENT = "p";
+ static final char[] PAYLOAD_PARENT_CHARS = PAYLOAD_PARENT.toCharArray();
+
+ /**
+ * The following is a "field selector", an object which tells Lucene to
+ * extract only a single field rather than a whole document.
+ */
+ public static final FieldSelector fullPathSelector = new FieldSelector() {
+ public FieldSelectorResult accept(String fieldName) {
+ if (fieldName.equals(FULL)) {
+ return FieldSelectorResult.LOAD_AND_BREAK;
+ }
+ return FieldSelectorResult.NO_LOAD;
+ }
+ };
+
+ /**
+ * Delimiter used for creating the full path of a category from the list of
+ * its labels from root. It is forbidden for labels to contain this
+ * character.
+ *
+ * Originally, we used \uFFFE, officially a Unicode "noncharacter" (an
+ * invalid Unicode character) for this purpose. Recently, we switched to the
+ * "private-use" character \uF749.
+ */
+ //static final char DEFAULT_DELIMITER = '\uFFFE';
+ static final char DEFAULT_DELIMITER = '\uF749';
+
+}
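For illustration only (Consts is package-private, so this is merely a sketch of the on-disk convention): the full-path term of a category is simply its labels joined with the delimiter.

    // ["author", "mark twain"] is stored in the $full_path$ field as:
    String fullPathTerm = "author" + Consts.DEFAULT_DELIMITER + "mark twain";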
diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/LuceneTaxonomyReader.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/LuceneTaxonomyReader.java
new file mode 100644
index 00000000000..9bcae6e6a17
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/LuceneTaxonomyReader.java
@@ -0,0 +1,569 @@
+package org.apache.lucene.facet.taxonomy.lucene;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.collections.LRUHashMap;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * LuceneTaxonomyReader is a {@link TaxonomyReader} which retrieves stored
+ * taxonomy information from a separate Lucene index. By using a Lucene index,
+ * rather than some specialized file format, we get for "free" its correctness
+ * (especially regarding concurrency), and the ability to save it on any
+ * implementation of Directory (and not just the file system).
+ *
+ * Reading from the on-disk index on every method call is too slow, so this
+ * implementation employs caching: Some methods cache recent requests and
+ * their results, while other methods prefetch all the data into memory
+ * and then provide answers directly from in-memory tables. See the
+ * documentation of individual methods for comments on their performance.
+ *
+ * @lucene.experimental
+ */
+public class LuceneTaxonomyReader implements TaxonomyReader {
+
+ private static final Logger logger = Logger.getLogger(LuceneTaxonomyReader.class.getName());
+
+ private IndexReader indexReader;
+
+ // The following lock is used to allow multiple threads to read from the
+ // index concurrently, while having them block during the very short
+ // critical moment of refresh() (see comments below). Note, however, that
+ // we only read from the index when we don't have the entry in our cache,
+ // and the caches are locked separately.
+ private ReadWriteLock indexReaderLock = new ReentrantReadWriteLock();
+
+ // The following are the limited-size LRU caches used to cache the latest
+ // results from getOrdinal() and getCategoryCache().
+ // Because LRUHashMap is not thread-safe, we need to synchronize on this
+ // object when using it. Unfortunately, this is not optimal under heavy
+ // contention because it means that while one thread is using the cache
+ // (reading or modifying) others are blocked from using it - or even
+ // starting to do benign things like calculating the hash function. A more
+ // efficient approach would be to use a non-locking (as much as possible)
+ // concurrent solution, along the lines of java.util.concurrent.ConcurrentHashMap
+ // but with LRU semantics.
+ // However, even in the current sub-optimal implementation we do not make
+ // the mistake of locking out readers while waiting for disk in a cache
+ // miss - below, we do not hold cache lock while reading missing data from
+ // disk.
+ private final LRUHashMap<String, Integer> getOrdinalCache;
+ private final LRUHashMap<Integer, String> getCategoryCache;
+
+ // getParent() needs to be extremely efficient, so the parent ordinals
+ // are prefetched from the index into one large in-memory array (see
+ // ParentArray).
+ private ParentArray parentArray;
+
+ private char delimiter = Consts.DEFAULT_DELIMITER;
+
+ /**
+ * Open for reading a taxonomy stored in the given {@link Directory}.
+ */
+ public LuceneTaxonomyReader(Directory directory)
+ throws CorruptIndexException, IOException {
+ this.indexReader = openIndexReader(directory);
+ // Default cache sizes; they can be changed with setCacheSize().
+ getOrdinalCache = new LRUHashMap<String, Integer>(4000);
+ getCategoryCache = new LRUHashMap<Integer, String>(4000);
+ parentArray = new ParentArray();
+ parentArray.refresh(indexReader);
+ }
+
+ protected IndexReader openIndexReader(Directory directory)
+ throws CorruptIndexException, IOException {
+ return IndexReader.open(directory);
+ }
+
+ /**
+ * setCacheSize controls the maximum allowed size of each of the caches
+ * used by {@link #getPath(int)} and {@link #getOrdinal(CategoryPath)}.
+ *
+ * Currently, if the given size is smaller than the current size of
+ * a cache, it will not shrink, and will rather be limited to its current
+ * size.
+ * @param size the new maximum cache size, in number of entries.
+ */
+ public void setCacheSize(int size) {
+ synchronized(getCategoryCache) {
+ getCategoryCache.setMaxSize(size);
+ }
+ synchronized(getOrdinalCache) {
+ getOrdinalCache.setMaxSize(size);
+ }
+ }
+
+ /**
+ * setDelimiter changes the character that the taxonomy uses in its
+ * internal storage as a delimiter between category components. Do not
+ * use this method unless you really know what you are doing.
+ *
+ * If you do use this method, make sure you call it before any other
+ * methods that actually query the taxonomy. Moreover, make sure you
+ * always pass the same delimiter for all LuceneTaxonomyWriter and
+ * LuceneTaxonomyReader objects you create.
+ */
+ public void setDelimiter(char delimiter) {
+ this.delimiter = delimiter;
+ }
+
+ public int getOrdinal(CategoryPath categoryPath) throws IOException {
+ if (categoryPath.length()==0) {
+ return ROOT_ORDINAL;
+ }
+ String path = categoryPath.toString(delimiter);
+
+ // First try to find the answer in the LRU cache:
+ synchronized(getOrdinalCache) {
+ Integer res = getOrdinalCache.get(path);
+ if (res!=null) {
+ return res.intValue();
+ }
+ }
+
+ // If we're still here, we have a cache miss. We need to fetch the
+ // value from disk, and then also put it in the cache:
+ int ret = TaxonomyReader.INVALID_ORDINAL;
+ try {
+ indexReaderLock.readLock().lock();
+ // TODO (Facet): avoid Multi*?
+ Bits deletedDocs = MultiFields.getDeletedDocs(indexReader);
+ DocsEnum docs = MultiFields.getTermDocsEnum(indexReader, deletedDocs, Consts.FULL, new BytesRef(path));
+ if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+ ret = docs.docID();
+ }
+ } finally {
+ indexReaderLock.readLock().unlock();
+ }
+
+ // Put the new value in the cache. Note that it is possible that while
+ // we were doing the above fetching (without the cache locked), some
+ // other thread already added the same category to the cache. We do
+ // not care about this possibility, as LRUCache replaces previous values
+ // of the same keys (it doesn't store duplicates).
+ synchronized(getOrdinalCache) {
+ // GB: new Integer(int); creates a new object each and every time.
+ // Integer.valueOf(int) might not (See JavaDoc).
+ getOrdinalCache.put(path, Integer.valueOf(ret));
+ }
+
+ return ret;
+ }
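The locking discipline above generalizes to any LRU-backed lookup; a condensed sketch follows (lruCache and loadFromIndex are hypothetical stand-ins for the cache field and the disk read):

    Integer cached;
    synchronized (lruCache) { cached = lruCache.get(key); } // short critical section
    if (cached == null) {
      int value = loadFromIndex(key); // disk I/O with no cache lock held
      synchronized (lruCache) { lruCache.put(key, value); } // duplicates just overwrite
      return value;
    }
    return cached.intValue();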
+
+ public CategoryPath getPath(int ordinal) throws CorruptIndexException, IOException {
+ // TODO (Facet): Currently, the LRU cache we use (getCategoryCache) holds
+ // strings with delimiters, not CategoryPath objects, so even if
+ // we have a cache hit, we need to process the string and build a new
+ // CategoryPath object every time. What is preventing us from putting
+ // the actual CategoryPath object in the cache is the fact that these
+ // objects are mutable. So we should create an immutable (read-only)
+ // interface that CategoryPath implements, and this method should
+ // return this interface, not the writable CategoryPath.
+ String label = getLabel(ordinal);
+ if (label==null) {
+ return null;
+ }
+ return new CategoryPath(label, delimiter);
+ }
+
+ public boolean getPath(int ordinal, CategoryPath result) throws CorruptIndexException, IOException {
+ String label = getLabel(ordinal);
+ if (label==null) {
+ return false;
+ }
+ result.clear();
+ result.add(label, delimiter);
+ return true;
+ }
+
+ private String getLabel(int catID) throws CorruptIndexException, IOException {
+ // First try to find the answer in the LRU cache. It is very
+ // unfortunate that we need to allocate an Integer object here -
+ // it would have been better if we used a hash table specifically
+ // designed for int keys...
+ // GB: new Integer(int); creates a new object each and every time.
+ // Integer.valueOf(int) might not (See JavaDoc).
+ Integer catIDInteger = Integer.valueOf(catID);
+
+ synchronized(getCategoryCache) {
+ String res = getCategoryCache.get(catIDInteger);
+ if (res!=null) {
+ return res;
+ }
+ }
+
+ // If we're still here, we have a cache miss. We need to fetch the
+ // value from disk, and then also put it in the cache:
+ String ret;
+ try {
+ indexReaderLock.readLock().lock();
+ // The taxonomy API dictates that if we get an invalid category
+ // ID, we should return null. If we don't check this here, we
+ // may get some sort of exception from the document() call below.
+ // NOTE: Currently, we *do not* cache this return value; There
+ // isn't much point to do so, because checking the validity of
+ // the docid doesn't require disk access - just comparing with
+ // the number indexReader.maxDoc().
+ if (catID<0 || catID>=indexReader.maxDoc()) {
+ return null;
+ }
+ ret = indexReader.document(catID, Consts.fullPathSelector)
+ .get(Consts.FULL);
+ } finally {
+ indexReaderLock.readLock().unlock();
+ }
+ // Put the new value in the cache. Note that it is possible that while
+ // we were doing the above fetching (without the cache locked), some
+ // other thread already added the same category to the cache. We do
+ // not care about this possibility, as LRUCache replaces previous
+ // values of the same keys (it doesn't store duplicates).
+ synchronized (getCategoryCache) {
+ getCategoryCache.put(catIDInteger, ret);
+ }
+
+ return ret;
+ }
+
+ public int getParent(int ordinal) {
+ // Note how we don't need to hold the read lock to do the following,
+ // because the array reference is volatile, ensuring the correct
+ // visibility and ordering: if we get the new reference, the new
+ // data is also visible to this thread.
+ return getParentArray()[ordinal];
+ }
+
+ /**
+ * getParentArray() returns an int array of size getSize() listing the
+ * ordinal of the parent category of each category in the taxonomy.
+ *
+ * The caller can hold on to the array it got indefinitely - it is
+ * guaranteed that no-one else will modify it. The other side of the
+ * same coin is that the caller must treat the array it got as read-only
+ * and not modify it, because other callers might have gotten the
+ * same array too, and getParent() calls are also answered from the
+ * same array.
+ *
+ * The getParentArray() call is extremely efficient, merely returning
+ * a reference to an array that already exists. For a caller that plans
+ * to call getParent() for many categories, using getParentArray() and
+ * the array it returns is a somewhat faster approach because it avoids
+ * the overhead of method calls and volatile dereferencing.
+ *
+ * If you use getParentArray() instead of getParent(), remember that
+ * the array you got is (naturally) not modified after a refresh(),
+ * so you should always call getParentArray() again after a refresh().
+ */
+
+ public int[] getParentArray() {
+ // Note how we don't need to hold the read lock to do the following,
+ // because the array reference is volatile, ensuring the correct
+ // visibility and ordering: if we get the new reference, the new
+ // data is also visible to this thread.
+ return parentArray.getArray();
+ }
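The idiom relied on here, reduced to its essentials (a hypothetical class, not the patch's ParentArray): readers dereference a volatile array reference without locking, and a single writer swaps in a grown copy.

    class PublishedArray {
      private volatile int[] data = new int[0];
      int get(int i) { return data[i]; } // readers take no lock
      void publish(int[] grown) { data = grown; } // single writer swaps the reference
    }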
+
+ // Note that refresh() is synchronized (it is the only synchronized
+ // method in this class) to ensure that it never gets called concurrently
+ // with itself.
+ public synchronized void refresh() throws IOException {
+ /*
+ * Since refresh() can be a lengthy operation, it is very important that we
+ * avoid locking out all readers for its duration. This is why we don't hold
+ * the indexReaderLock write lock for the entire duration of this method. In
+ * fact, it is enough to hold it only during a single assignment! Other
+ * comments in this method will explain this.
+ */
+
+ // note that the lengthy operation indexReader.reopen() does not
+ // modify the reader, so we can do it without holding a lock. We can
+ // safely read indexReader without holding the write lock, because
+ // no other thread can be writing at this time (this method is the
+ // only possible writer, and it is "synchronized" to avoid this case).
+ IndexReader r2 = indexReader.reopen();
+ if (indexReader != r2) {
+ IndexReader oldreader = indexReader;
+ // we can close the old searcher, but need to synchronize this
+ // so that we don't close it in the middle that another routine
+ // is reading from it.
+ indexReaderLock.writeLock().lock();
+ indexReader = r2;
+ indexReaderLock.writeLock().unlock();
+ // We can close the old reader, but need to be certain that we
+ // don't close it while another method is reading from it.
+ // Luckily, we can be certain of that even without putting the
+ // oldreader.close() in the locked section. The reason is that
+ // after lock() succeeded above, we know that all existing readers
+ // had finished (this is what a read-write lock ensures). New
+ // readers, starting after the unlock() we just did, already got
+ // the new indexReader we set above. So nobody can be possibly
+ // using the old indexReader, and we can close it:
+ oldreader.close();
+
+ // We prefetch some of the arrays to make requests much faster.
+ // Let's refresh these prefetched arrays; this refresh is made much
+ // more efficient by assuming that it is enough to read the values
+ // for new categories (old categories could not have been changed
+ // or deleted).
+ // Note that this is done without the write lock being held,
+ // which means that it is possible that during a refresh(), a
+ // reader will have some methods (like getOrdinal and getCategory)
+ // return fresh information, while getParent()
+ // (whose data is only now being prefetched) still returns older
+ // information.
+ // We consider this to be acceptable. The important thing,
+ // however, is that refreshPrefetchArrays() itself writes to
+ // the arrays in a correct manner (see discussion there)
+ parentArray.refresh(indexReader);
+
+ // Remove any INVALID_ORDINAL values from the ordinal cache,
+ // because it is possible those are now answered by the new data!
+ Iterator<Entry<String, Integer>> it = getOrdinalCache.entrySet().iterator();
+ while (it.hasNext()) {
+ Entry<String, Integer> entry = it.next();
+ if (entry.getValue().intValue() == INVALID_ORDINAL) {
+ it.remove();
+ }
+ }
+ }
+ }
+
+ public void close() throws IOException {
+ indexReader.close();
+ }
+
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/LuceneTaxonomyWriter.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/LuceneTaxonomyWriter.java
new file mode 100644
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/LuceneTaxonomyWriter.java
+package org.apache.lucene.facet.taxonomy.lucene;
+
+/**
+ * LuceneTaxonomyWriter is a {@link TaxonomyWriter} which uses a Lucene index
+ * to store the taxonomy information on disk.
+ *
+ * In addition to the permanently-stored Lucene index, efficiency dictates that
+ * we also keep an in-memory cache of recently seen or all
+ * categories, so that we do not need to go back to disk for every category
+ * addition to see which ordinal this category already has, if any. A
+ * {@link TaxonomyWriterCache} object determines the specific caching algorithm
+ * used.
+ *
+ * This class offers some hooks for extending classes to control the
+ * {@link IndexWriter} instance that is used. See {@link #openLuceneIndex} and
+ * {@link #closeLuceneIndex()}.
+ *
+ * @lucene.experimental
+ */
+public class LuceneTaxonomyWriter implements TaxonomyWriter {
+
+ protected IndexWriter indexWriter;
+ private int nextID;
+ private char delimiter = Consts.DEFAULT_DELIMITER;
+ private SinglePositionTokenStream parentStream = new SinglePositionTokenStream(Consts.PAYLOAD_PARENT);
+ private Field parentStreamField;
+ private Field fullPathField;
+
+ private TaxonomyWriterCache cache;
+ /**
+ * We call the cache "complete" if we know that every category in our
+ * taxonomy is in the cache. When the cache is not complete, and
+ * we can't find a category in the cache, we still need to look for it
+ * in the on-disk index; therefore, when the cache is not complete, we
+ * need to open a "reader" to the taxonomy index.
+ * The cache becomes incomplete if it was never filled with the existing
+ * categories, or if a put() to the cache ever returned true (meaning
+ * that some of the cached data was cleared).
+ */
+ private boolean cacheIsComplete;
+ private IndexReader reader;
+ private int cacheMisses;
+
+ /**
+ * setDelimiter changes the character that the taxonomy uses in its internal
+ * storage as a delimiter between category components. Do not use this
+ * method unless you really know what you are doing. It has nothing to do
+ * with whatever character the application may be using to represent
+ * categories for its own use.
+ *
+ * If you do use this method, make sure you call it before any other methods
+ * that actually query the taxonomy. Moreover, make sure you always pass
+ * the same delimiter for all LuceneTaxonomyWriter and LuceneTaxonomyReader
+ * objects you create for the same directory.
+ */
+ public void setDelimiter(char delimiter) {
+ this.delimiter = delimiter;
+ }
+
+ /**
+ * Forcibly unlocks the taxonomy in the named directory.
+ *
+ * Caution: this should only be used by failure recovery code, when it is
+ * known that no other process nor thread is in fact currently accessing
+ * this taxonomy.
+ *
+ * This method is unnecessary if your {@link Directory} uses a
+ * {@link NativeFSLockFactory} instead of the default
+ * {@link SimpleFSLockFactory}. When the "native" lock is used, a lock
+ * does not stay behind forever when the process using it dies.
+ */
+ public static void unlock(Directory directory) throws IOException {
+ IndexWriter.unlock(directory);
+ }
+
+ /**
+ * Construct a Taxonomy writer.
+ *
+ * @param directory
+ * The {@link Directory} in which to store the taxonomy. Note that
+ * the taxonomy is written directly to that directory (not to a
+ * subdirectory of it).
+ * @param openMode
+ * Specifies how to open a taxonomy for writing: {@code APPEND}
+ * means open an existing index for append (failing if the index does
+ * not yet exist), {@code CREATE} means create a new index (first
+ * deleting the old one if it already existed), and
+ * {@code CREATE_OR_APPEND} appends to an existing index if there
+ * is one, otherwise it creates a new index.
+ */
+ public LuceneTaxonomyWriter(Directory directory, OpenMode openMode,
+ TaxonomyWriterCache cache)
+ throws CorruptIndexException, LockObtainFailedException, IOException {
+ openLuceneIndex(directory, openMode);
+ reader = null;
+ parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream);
+ fullPathField = new Field(Consts.FULL, "",
+ Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
+ nextID = indexWriter.maxDoc();
+ if (cache == null) {
+ cache = defaultTaxonomyWriterCache();
+ }
+ this.cache = cache;
+ if (nextID == 0) {
+ cacheIsComplete = true;
+ // Make sure that the taxonomy always contains the root category
+ // with category id 0.
+ addCategory(new CategoryPath());
+ } else {
+ // There are already categories in the index, which we have not
+ // yet read into the cache, so the cache is incomplete.
+ cacheIsComplete = false;
+ }
+ }
+
+ /**
+ * Defines the default {@link TaxonomyWriterCache} to use in constructors
+ * which do not specify one.
+ *
+ * The current default is {@link Cl2oTaxonomyWriterCache} constructed
+ * with the parameters (1024, 0.15f, 3), i.e., the entire taxonomy is
+ * cached in memory while building it.
+ */
+ public static TaxonomyWriterCache defaultTaxonomyWriterCache() {
+ return new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
+ }
+
+ // convenience constructors:
+
+ public LuceneTaxonomyWriter(Directory d)
+ throws CorruptIndexException, LockObtainFailedException,
+ IOException {
+ this(d, OpenMode.CREATE_OR_APPEND);
+ }
+
+ /**
+ * Frees used resources as well as closes the underlying {@link IndexWriter},
+ * which commits whatever changes made to it to the underlying
+ * {@link Directory}.
+ */
+ public synchronized void close() throws CorruptIndexException, IOException {
+ closeLuceneIndex();
+ closeResources();
+ }
+
+ /**
+ * Returns the number of memory bytes used by the cache.
+ * @return Number of cache bytes in memory, for CL2O only; zero otherwise.
+ */
+ public int getCacheMemoryUsage() {
+ if (this.cache == null || !(this.cache instanceof Cl2oTaxonomyWriterCache)) {
+ return 0;
+ }
+ return ((Cl2oTaxonomyWriterCache)this.cache).getMemoryUsage();
+ }
+
+ /**
+ * A hook for extending classes to close additional resources that were used.
+ * The default implementation closes the {@link IndexReader} as well as the
+ * {@link TaxonomyWriterCache} instances that were used.
+ */
+ protected synchronized void closeResources() throws IOException {
+ if (reader != null) {
+ reader.close();
+ reader = null;
+ }
+ if (cache != null) {
+ cache.close();
+ cache = null;
+ }
+ }
+
+ /**
+ * Add a new category into the index (and the cache), and return its new
+ * ordinal.
+ *
+ * Actually, we might also need to add some of the category's ancestors
+ * before we can add the category itself (while keeping the invariant that a
+ * parent is always added to the taxonomy before its child). We do this by
+ * recursion.
+ */
+ private int internalAddCategory(CategoryPath categoryPath, int length)
+ throws CorruptIndexException, IOException {
+
+ // Find our parent's ordinal (recursively adding the parent category
+ // to the taxonomy if it's not already there). Then add the parent
+ // ordinal as payloads (rather than a stored field; payloads can be
+ // more efficiently read into memory in bulk by LuceneTaxonomyReader)
+ int parent;
+ if (length > 1) {
+ parent = findCategory(categoryPath, length - 1);
+ if (parent < 0) {
+ parent = internalAddCategory(categoryPath, length - 1);
+ }
+ } else if (length == 1) {
+ parent = TaxonomyReader.ROOT_ORDINAL;
+ } else {
+ parent = TaxonomyReader.INVALID_ORDINAL;
+ }
+ int id = addCategoryDocument(categoryPath, length, parent);
+
+ return id;
+ }
+
+ // Note that the methods calling addCategoryDocument() are synchronized,
+ // so this method is effectively synchronized as well, but we'll add
+ // synchronized to be on the safe side, and we can reuse class-local objects
+ // instead of allocating them every time
+ protected synchronized int addCategoryDocument(CategoryPath categoryPath,
+ int length, int parent)
+ throws CorruptIndexException, IOException {
+ // Before Lucene 2.9, position increments >=0 were supported, so we
+ // added 1 to parent to allow the parent -1 (the parent of the root).
+ // Unfortunately, starting with Lucene 2.9, after LUCENE-1542, this is
+ // no longer enough, since 0 is not encoded consistently either (see
+ // comment in SinglePositionTokenStream). But because we must be
+ // backward-compatible with existing indexes, we can't just fix what
+ // we write here (e.g., to write parent+2), and need to do a workaround
+ // in the reader (which knows that anyway only category 0 has a parent
+ // -1).
+ parentStream.set(parent+1);
+ Document d = new Document();
+ d.add(parentStreamField);
+
+ fullPathField.setValue(categoryPath.toString(delimiter, length));
+ d.add(fullPathField);
+
+ // Note that we do not pass an Analyzer here because the fields that are
+ // added to the Document are untokenized or contain their own TokenStream.
+ // Therefore the IndexWriter's Analyzer has no effect.
+ indexWriter.addDocument(d);
+ int id = nextID++;
+
+ addToCache(categoryPath, length, id);
+
+ // also add to the parent array
+ getParentArray().add(id, parent);
+
+ return id;
+ }
+
+ private static class SinglePositionTokenStream extends TokenStream {
+ private CharTermAttribute termAtt;
+ private PositionIncrementAttribute posIncrAtt;
+ private boolean returned;
+ public SinglePositionTokenStream(String word) {
+ termAtt = addAttribute(CharTermAttribute.class);
+ posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ termAtt.setEmpty().append(word);
+ returned = true;
+ }
+ /**
+ * Set the value we want to keep, as the position increment.
+ * Note that when TermPositions.nextPosition() is later used to
+ * retrieve this value, val-1 will be returned, not val.
+ *
+ * IMPORTANT NOTE: Before Lucene 2.9, val>=0 were safe (for val==0,
+ * the retrieved position would be -1). But starting with Lucene 2.9,
+ * this unfortunately changed, and only val>0 are safe. val=0 can
+ * still be used, but don't count on the value you retrieve later
+ * (it could be 0 or -1, depending on circumstances or versions).
+ * This change is described in Lucene's JIRA: LUCENE-1542.
+ */
+ public void set(int val) {
+ posIncrAtt.setPositionIncrement(val);
+ returned = false;
+ }
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (returned) {
+ return false;
+ }
+ returned = true;
+ return true;
+ }
+ }
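The +1 workaround in concrete numbers (a sketch using the parentStream field above): to record parent ordinal 7,

    int parent = 7;
    parentStream.set(parent + 1); // a position increment of 8 is stored
    // TermPositions.nextPosition() at read time returns 8 - 1 == parent.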
+
+ private void addToCache(CategoryPath categoryPath, int id)
+ throws CorruptIndexException, IOException {
+ if (cache.put(categoryPath, id)) {
+ // If cache.put() returned true, it means the cache was limited in
+ // size, became full, so parts of it had to be cleared.
+ // Unfortunately we don't know which part was cleared - it is
+ // possible that a relatively-new category that hasn't yet been
+ // committed to disk (and therefore isn't yet visible in our
+ // "reader") was deleted from the cache, and therefore we must
+ // now refresh the reader.
+ // Because this is a slow operation, cache implementations are
+ // expected not to delete entries one-by-one but rather in bulk
+ // (LruTaxonomyWriterCache removes the oldest 2/3 of its entries).
+ refreshReader();
+ cacheIsComplete = false;
+ }
+ }
+
+ private void addToCache(CategoryPath categoryPath, int prefixLen, int id)
+ throws CorruptIndexException, IOException {
+ if (cache.put(categoryPath, prefixLen, id)) {
+ refreshReader();
+ cacheIsComplete = false;
+ }
+ }
+
+ private synchronized void refreshReader() throws IOException {
+ if (reader != null) {
+ IndexReader r2 = reader.reopen();
+ if (reader != r2) {
+ reader.close();
+ reader = r2;
+ }
+ }
+ }
+
+ /**
+ * Calling commit() ensures that all the categories written so far are
+ * visible to a reader that is opened (or reopened) after that call.
+ * When the index is closed(), commit() is also implicitly done.
+ * See {@link TaxonomyWriter#commit()}
+ */
+ public synchronized void commit() throws CorruptIndexException, IOException {
+ indexWriter.commit();
+ refreshReader();
+ }
+
+ /**
+ * Like commit(), but also store properties with the index. These properties
+ * are retrievable by {@link LuceneTaxonomyReader#getCommitUserData}.
+ * See {@link TaxonomyWriter#commit(Map)}.
+ */
+ public synchronized void commit(Map<String, String> commitUserData)
+ throws CorruptIndexException, IOException {
+ indexWriter.commit(commitUserData);
+ refreshReader();
+ }
+
+ /**
+ * getSize() returns the number of categories in the taxonomy.
+ *
+ * Because categories are numbered consecutively starting with 0, it means
+ * the taxonomy contains ordinals 0 through getSize()-1.
+ *
+ * Note that the number returned by getSize() is often slightly higher than
+ * the number of categories inserted into the taxonomy; This is because when
+ * a category is added to the taxonomy, its ancestors are also added
+ * automatically (including the root, which always gets ordinal 0).
+ */
+ public synchronized int getSize() {
+ return indexWriter.maxDoc();
+ }
+
+ private boolean alreadyCalledFillCache = false;
+
+ /**
+ * Set the number of cache misses before an attempt is made to read the
+ * entire taxonomy into the in-memory cache.
+ *
+ * LuceneTaxonomyWriter holds an in-memory cache of recently seen
+ * categories to speed up operation. On each cache-miss, the on-disk index
+ * needs to be consulted. When an existing taxonomy is opened, a lot of
+ * slow disk reads like that are needed until the cache is filled, so it
+ * is more efficient to read the entire taxonomy into memory at once.
+ * We do this complete read after a certain number (defined by this method)
+ * of cache misses.
+ *
+ * If the number is set to {@code 0}, the entire taxonomy is read into the
+ * cache on first use, without fetching individual categories first.
+ *
+ * Note that if the memory cache of choice is limited in size, and cannot
+ * hold the entire content of the on-disk taxonomy, then it is never
+ * read in its entirety into the cache, regardless of the setting of this
+ * method.
+ */
+ public void setCacheMissesUntilFill(int i) {
+ cacheMissesUntilFill = i;
+ }
+ private int cacheMissesUntilFill = 11;
+
+ private boolean perhapsFillCache() throws IOException {
+ // Note: we assume that we're only called when cacheIsComplete==false.
+ // TODO (Facet): parametrize this criterion:
+ if (cacheMisses < cacheMissesUntilFill) {
+ return false;
+ }
+ // If the cache was already filled (or we decided not to fill it because
+ // there was no room), there is no sense in trying it again.
+ if (alreadyCalledFillCache) {
+ return false;
+ }
+ alreadyCalledFillCache = true;
+ // TODO (Facet): we should probably completely clear the cache before starting
+ // to read it?
+ if (reader == null) {
+ reader = openReader();
+ }
+
+ if (!cache.hasRoom(reader.numDocs())) {
+ return false;
+ }
+
+ CategoryPath cp = new CategoryPath();
+ Terms terms = MultiFields.getTerms(reader, Consts.FULL);
+ // The check is done here to avoid checking it on every iteration of the
+ // below loop. A null term will be returned if there are no terms in the
+ // lexicon, or after the Consts.FULL term. However while the loop is
+ // executed we're safe, because we only iterate as long as there are next()
+ // terms.
+ if (terms != null) {
+ TermsEnum termsEnum = terms.iterator();
+ Bits deletedDocs = MultiFields.getDeletedDocs(reader);
+ DocsEnum docsEnum = null;
+ while (termsEnum.next() != null) {
+ BytesRef t = termsEnum.term();
+ // Since we guarantee uniqueness of categories, each term has exactly
+ // one document. Also, since we do not allow removing categories (and
+ // hence documents), there are no deletions in the index. Therefore, it
+ // is sufficient to call next(), and then doc(), exactly once with no
+ // 'validation' checks.
+ docsEnum = termsEnum.docs(deletedDocs, docsEnum);
+ docsEnum.nextDoc();
+ cp.clear();
+ // TODO (Facet): avoid String creation/use bytes?
+ cp.add(t.utf8ToString(), delimiter);
+ cache.put(cp, docsEnum.docID());
+ }
+ }
+
+ cacheIsComplete = true;
+ // No sense to keep the reader open - we will not need to read from it
+ // if everything is in the cache.
+ reader.close();
+ reader = null;
+ return true;
+ }
+
+ // TODO (Facet): synchronization of some sort?
+ private ParentArray parentArray;
+ private ParentArray getParentArray() throws IOException {
+ if (parentArray==null) {
+ if (reader == null) {
+ reader = openReader();
+ }
+ parentArray = new ParentArray();
+ parentArray.refresh(reader);
+ }
+ return parentArray;
+ }
+ public int getParent(int ordinal) throws IOException {
+ // Note: the following if() just enforces that a user can never ask
+ // for the parent of a nonexistent category - even if the parent array
+ // was allocated bigger than it really needs to be.
+ if (ordinal >= getSize()) {
+ throw new ArrayIndexOutOfBoundsException();
+ }
+ return getParentArray().getArray()[ordinal];
+ }
+
+ /**
+ * Take all the categories of one or more given taxonomies, and add them to
+ * the main taxonomy (this), if they are not already there.
+ *
+ * Additionally, fill a mapping for each of the added taxonomies,
+ * mapping its ordinals to the ordinals in the enlarged main taxonomy.
+ * These mappings are saved into an array of OrdinalMap objects given by the
+ * user, one for each of the given taxonomies (not including "this", the main
+ * taxonomy). Often the first of these will be a MemoryOrdinalMap and the
+ * others will be a DiskOrdinalMap - see discussion in {@link OrdinalMap}.
+ *
+ * Note that the taxonomies to be added are given as Directory objects,
+ * not opened TaxonomyReader/TaxonomyWriter objects, so if any of them are
+ * currently managed by an open TaxonomyWriter, make sure to commit() (or
+ * close()) it first. The main taxonomy (this) is an open TaxonomyWriter,
+ * and does not need to be commit()ed before this call.
+ */
+ public void addTaxonomies(Directory[] taxonomies, OrdinalMap[] ordinalMaps) throws IOException {
+ // To prevent us stepping on the rest of this class's decisions on when
+ // to open a reader, and when not, we'll be opening a new reader instead
+ // of using the existing "reader" object:
+ IndexReader mainreader = openReader();
+ // TODO (Facet): can this then go segment-by-segment and avoid MultiDocsEnum etc?
+ Terms terms = MultiFields.getTerms(mainreader, Consts.FULL);
+ assert terms != null; // TODO (Facet): explicit check / throw exception?
+ TermsEnum mainte = terms.iterator();
+ DocsEnum mainde = null;
+
+ IndexReader[] otherreaders = new IndexReader[taxonomies.length];
+ TermsEnum[] othertes = new TermsEnum[taxonomies.length];
+ DocsEnum[] otherdocsEnum = new DocsEnum[taxonomies.length]; // just for reuse
+ for (int i=0; i<taxonomies.length; i++) {
+ otherreaders[i] = IndexReader.open(taxonomies[i]);
+ Terms otherTerms = MultiFields.getTerms(otherreaders[i], Consts.FULL);
+ assert otherTerms != null;
+ othertes[i] = otherTerms.iterator();
+ ordinalMaps[i].setSize(otherreaders[i].numDocs());
+ }
+ // Walk the full-path terms of all the taxonomies in sorted order; add
+ // each category that is missing from the main taxonomy, and record the
+ // old-ordinal to new-ordinal mapping in the corresponding OrdinalMap.
+ }
+
+ /**
+ * Mapping from old ordinals to new ordinals, used when merging taxonomies
+ * (see {@link #addTaxonomies(Directory[], OrdinalMap[])}).
+ *
+ * There exist two implementations of OrdinalMap: MemoryOrdinalMap and
+ * DiskOrdinalMap. As their names suggest, the former keeps the map in
+ * memory and the latter in a temporary disk file. Because these maps will
+ * later be needed one by one (to remap the counting lists), not all at the
+ * same time, it is recommended to put the first taxonomy's map in memory,
+ * and all the rest on disk (later to be automatically read into memory one
+ * by one, when needed).
+ */
+ public static interface OrdinalMap {
+ /**
+ * Set the size of the map. This MUST be called before addMapping().
+ * It is assumed (but not verified) that addMapping() will then be
+ * called exactly 'size' times, with different origOrdinals between 0
+ * and size-1.
+ */
+ public void setSize(int size) throws IOException;
+ public void addMapping(int origOrdinal, int newOrdinal) throws IOException;
+ /**
+ * Call addDone() to say that all addMapping() have been done.
+ * In some implementations this might free some resources.
+ */
+ public void addDone() throws IOException;
+ /**
+ * Return the map from the taxonomy's original (consecutive) ordinals
+ * to the new taxonomy's ordinals. If the map has to be read from disk
+ * and ordered appropriately, it is done when getMap() is called.
+ * getMap() should only be called once, and only when the map is actually
+ * needed. Calling it will also free all resources that the map might
+ * be holding (such as temporary disk space), other than the returned int[].
+ */
+ public int[] getMap() throws IOException;
+ }
+
+ /**
+ * {@link OrdinalMap} maintained in memory
+ */
+ public static final class MemoryOrdinalMap implements OrdinalMap {
+ int[] map;
+ public void setSize(int taxonomySize) {
+ map = new int[taxonomySize];
+ }
+ public void addMapping(int origOrdinal, int newOrdinal) {
+ map[origOrdinal] = newOrdinal;
+ }
+ public void addDone() { /* nothing to do */ }
+ public int[] getMap() {
+ return map;
+ }
+ }
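A merging sketch per the addTaxonomies() contract above (hypothetical Directories mainDir, taxoDir1 and taxoDir2; the first map is held in memory, the rest spilled to disk):

    LuceneTaxonomyWriter main = new LuceneTaxonomyWriter(mainDir);
    OrdinalMap[] maps = new OrdinalMap[] {
      new MemoryOrdinalMap(),
      new DiskOrdinalMap(new File("taxo2.map"))
    };
    main.addTaxonomies(new Directory[] { taxoDir1, taxoDir2 }, maps);
    int[] remap = maps[0].getMap(); // old ordinal -> ordinal in the merged taxonomy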
+
+ /**
+ * {@link OrdinalMap} maintained on file system
+ */
+ public static final class DiskOrdinalMap implements OrdinalMap {
+ File tmpfile;
+ DataOutputStream out;
+
+ public DiskOrdinalMap(File tmpfile) throws FileNotFoundException {
+ this.tmpfile = tmpfile;
+ out = new DataOutputStream(new BufferedOutputStream(
+ new FileOutputStream(tmpfile)));
+ }
+
+ public void addMapping(int origOrdinal, int newOrdinal) throws IOException {
+ out.writeInt(origOrdinal);
+ out.writeInt(newOrdinal);
+ }
+
+ public void setSize(int taxonomySize) throws IOException {
+ out.writeInt(taxonomySize);
+ }
+
+ public void addDone() throws IOException {
+ if (out!=null) {
+ out.close();
+ out = null;
+ }
+ }
+
+ int[] map = null;
+
+ public int[] getMap() throws IOException {
+ if (map!=null) {
+ return map;
+ }
+ addDone(); // in case this wasn't previously called
+ DataInputStream in = new DataInputStream(new BufferedInputStream(
+ new FileInputStream(tmpfile)));
+ map = new int[in.readInt()];
+ // NOTE: The current code assumes here that the map is complete,
+ // i.e., every ordinal gets one and exactly one value. Otherwise,
+ // we may run into an EOF here, or vice versa, not read everything.
+ for (int i=0; i<map.length; i++) {
+ int origOrdinal = in.readInt();
+ int newOrdinal = in.readInt();
+ map[origOrdinal] = newOrdinal;
+ }
+ in.close();
+ // The temporary file is no longer needed.
+ if (!tmpfile.delete()) {
+ tmpfile.deleteOnExit();
+ }
+ return map;
+ }
+ }
+
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/TaxonomyWriterCache.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/TaxonomyWriterCache.java
new file mode 100644
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/TaxonomyWriterCache.java
+package org.apache.lucene.facet.taxonomy.writercache;
+
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+
+/**
+ * TaxonomyWriterCache is a relatively simple interface for a cache of
+ * category->ordinal mappings, used by taxonomy writer implementations.
+ *
+ * It basically has put() methods for adding a mapping, and get() for looking
+ * a mapping up in the cache. The cache does not guarantee to hold
+ * everything that has been put into it, and might in fact selectively
+ * delete some of the mappings (e.g., the ones least recently used).
+ * This means that if get() returns a negative response, it does not
+ * necessarily mean that the category doesn't exist - just that it is not
+ * in the cache. The caller can only infer that the category doesn't exist
+ * if it knows the cache to be complete (because all the categories were
+ * loaded into the cache, and since then no put() returned true).
+ * When an implementation does delete mappings, it should
+ * clear out large parts of the cache at once, because
+ * the user will typically need to work hard to recover from every cache
+ * cleanup (see {@link #put(CategoryPath, int)}'s return value).
+ *
+ * @lucene.experimental
+ */
+public interface TaxonomyWriterCache {
+
+ /**
+ * Let go of whatever resources the cache is holding. After a close(),
+ * this object can no longer be used.
+ */
+ public void close();
+
+ /**
+ * Lookup a category in the cache, returning its ordinal, or a negative
+ * number if the category is not in the cache.
+ *
+ * It is up to the caller to remember what a negative response means:
+ * If the caller knows the cache is complete (it was initially
+ * fed with all the categories, and since then put() never returned true)
+ * it means the category does not exist. Otherwise, the category might
+ * still exist, but just be missing from the cache.
+ */
+ public int get(CategoryPath categoryPath);
+
+ /**
+ * Like {@link #get(CategoryPath)}, but for a given prefix of the
+ * category path.
+ *
+ * If the given length is negative or bigger than the path's actual
+ * length, the full path is taken.
+ */
+ public int get(CategoryPath categoryPath, int length);
+
+ /**
+ * Add a category to the cache, with the given ordinal as the value.
+ *
+ * If the implementation keeps only a partial cache (e.g., an LRU cache)
+ * and finds that its cache is full, it should clear up part of the cache
+ * and return {@code true}.
+ * The reason why the caller needs to know if part of the cache was
+ * cleared is that in that case it will have to commit its on-disk index
+ * (so that all the latest category additions can be searched on disk, if
+ * we can't rely on the cache to contain them).
+ *
+ * Ordinals should be non-negative. Currently there is no defined way to
+ * specify that a cache should remember a category does NOT exist.
+ * It doesn't really matter, because normally the next thing we do after
+ * finding that a category does not exist is to add it.
+ */
+ public boolean put(CategoryPath categoryPath, int ordinal);
+
+ /**
+ * Like {@link #put(CategoryPath, int)}, but for a given prefix of the
+ * category path.
+ *
+ * If the given length is negative or bigger than the path's actual
+ * length, the full path is taken.
+ */
+ public boolean put(CategoryPath categoryPath, int prefixLen, int ordinal);
+
+ /**
+ * Sometimes the cache is either unlimited in size, or limited by a very
+ * big size, and in that case when we add a lot of categories it might
+ * make sense to pre-load the cache with all the existing categories.
+ * However, this pre-load does not make sense when the allowed cache
+ * size is small. The hasRoom() method allows differentiating between
+ * these cases.
+ *
+ * After hasRoom(n) returned {@code true}, the following n put() calls
+ * should be able to execute without clearing up the cache.
+ */
+ public boolean hasRoom(int numberOfEntries);
+
+}
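
A sketch of the caller-side protocol the put()/get() contract above implies;
the helper, its candidateOrdinal parameter, and the omitted commit call are
hypothetical:

  static int getOrAdd(TaxonomyWriterCache cache, CategoryPath path,
      int candidateOrdinal) {
    int ordinal = cache.get(path);
    if (ordinal >= 0) {
      return ordinal; // cache hit
    }
    // Cache miss; assuming a complete cache, the category is new.
    if (cache.put(path, candidateOrdinal)) {
      // put() returned true: part of the cache was cleared, so the caller
      // must commit its on-disk index before relying on future misses
      // (commit call omitted in this sketch).
    }
    return candidateOrdinal;
  }
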
+ * Since the HashArrays don't handle collisions, a {@link CollisionMap} is used
+ * to store the colliding labels.
+ *
+ * This data structure grows by adding a new HashArray whenever the number of
+ * collisions in the {@link CollisionMap} exceeds {@code loadFactor} *
+ * {@link #getMaxOrdinal()}. Growing also includes reinserting all colliding
+ * labels into the HashArrays to possibly reduce the number of collisions.
+ *
+ * For setting the {@code loadFactor} see
+ * {@link #CompactLabelToOrdinal(int, float, int)}.
+ *
+ *
+ * This data structure has a much lower memory footprint (~30%) compared to a
+ * Java HashMap<String, Integer>.
+ */
+
+Simple faceted indexing and search sample
+
+ A simple faceted example, showing how to:
+
+
+
+ For more complex examples see the other sample code packages.
+
+
\ No newline at end of file
diff --git a/modules/facet/src/java/org/apache/lucene/DocumentBuilder.java b/modules/facet/src/java/org/apache/lucene/DocumentBuilder.java
new file mode 100644
index 00000000000..26cee4b2f2e
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/DocumentBuilder.java
@@ -0,0 +1,77 @@
+package org.apache.lucene;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * An interface which standardizes the process of building an indexable
+ * {@link Document}.
+ *
+ * To allow reuse of the DocumentBuilder object, implementations are also
+ * encouraged to have a setter method, which remembers its parameters just like
+ * the constructor. This setter method cannot be described in this interface,
+ * because it will take different parameters in each implementation.
+ *
+ * builder1.build(builder2.build(builder3.build(new Document())));
+ *
+ *
+ * @lucene.experimental
+ */
+public interface DocumentBuilder {
+
+ /** An exception thrown from {@link DocumentBuilder}'s build(). */
+ public static class DocumentBuilderException extends Exception {
+
+ public DocumentBuilderException() {
+ super();
+ }
+
+ public DocumentBuilderException(String message) {
+ super(message);
+ }
+
+ public DocumentBuilderException(String message, Throwable cause) {
+ super(message, cause);
+ }
+
+ public DocumentBuilderException(Throwable cause) {
+ super(cause);
+ }
+
+ }
+
+ /**
+ * Adds to the given document whatever {@link Field}s the implementation needs
+ * to add. Returns the document instance to allow chaining calls.
+ */
+ public Document build(Document doc) throws DocumentBuilderException;
+
+}
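
A hypothetical implementation, only to show the chaining contract;
KeywordFieldBuilder and its parameters are made up:

  public class KeywordFieldBuilder implements DocumentBuilder {
    private final String name;
    private final String value;

    public KeywordFieldBuilder(String name, String value) {
      this.name = name;
      this.value = value;
    }

    public Document build(Document doc) {
      // add one non-analyzed field, then return doc to allow chaining
      doc.add(new Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED));
      return doc;
    }
  }

Builders then compose exactly as in the javadoc sample, e.g.
new KeywordFieldBuilder("a", "1").build(new KeywordFieldBuilder("b", "2").build(new Document())).
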
diff --git a/modules/facet/src/java/org/apache/lucene/facet/FacetException.java b/modules/facet/src/java/org/apache/lucene/facet/FacetException.java
new file mode 100644
index 00000000000..a03797ca091
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/FacetException.java
@@ -0,0 +1,46 @@
+package org.apache.lucene.facet;
+
+import java.io.IOException;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A parent class for exceptions thrown by the Facets code.
+ *
+ * @lucene.experimental
+ */
+public class FacetException extends IOException {
+
+ public FacetException() {
+ super();
+ }
+
+ public FacetException(String message) {
+ super(message);
+ }
+
+ public FacetException(String message, Throwable cause) {
+ super(message);
+ initCause(cause);
+ }
+
+ public FacetException(Throwable cause) {
+ initCause(cause);
+ }
+
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/CategoryEnhancement.java b/modules/facet/src/java/org/apache/lucene/facet/enhancements/CategoryEnhancement.java
new file mode 100644
index 00000000000..fced0cf7393
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/CategoryEnhancement.java
@@ -0,0 +1,127 @@
+package org.apache.lucene.facet.enhancements;
+
+import org.apache.lucene.analysis.TokenStream;
+
+import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams;
+import org.apache.lucene.facet.index.attributes.CategoryAttribute;
+import org.apache.lucene.facet.index.attributes.CategoryProperty;
+import org.apache.lucene.facet.index.streaming.CategoryListTokenizer;
+import org.apache.lucene.facet.index.streaming.CategoryParentsStream;
+import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * This interface allows easy addition of enhanced category features. Usually, a
+ * {@link CategoryEnhancement} will correspond to a {@link CategoryProperty}.
+ *
+ *
+ *
+ * @lucene.experimental
+ */
+public interface CategoryEnhancement {
+
+ /**
+ * Get the bytes to be added to the category token payload for this
+ * enhancement.
+ *
+ * This method is a cheaper replacement for a call to
+ * {@code instanceof}. It has two HashSets - one for classes which are
+ * an extension to {@link AssociationProperty} and one for the classes which are not. Whenever a
+ * property class is introduced:
+ *
+ *
+ *
+ * NOTE: 'instanceof' is only called once per Class (not per instance) of a
+ * property, and as there are few properties (currently 4 concrete
+ * implementations) the two sets remain rather small.
+ */
+ public static boolean isAssociationProperty(Class<? extends CategoryProperty> clazz) {
+ if (ASSOCIATION_PROPERTY_CLASSES.contains(clazz)) {
+ return true;
+ }
+
+ if (NON_ASSOCIATION_PROPERTY_CLASSES.contains(clazz)) {
+ return false;
+ }
+
+ if (AssociationProperty.class.isAssignableFrom(clazz)) {
+ ASSOCIATION_PROPERTY_CLASSES.add(clazz);
+ return true;
+ }
+
+ NON_ASSOCIATION_PROPERTY_CLASSES.add(clazz);
+ return false;
+ }
+
+ public boolean generatesCategoryList() {
+ return true;
+ }
+
+ public String getCategoryListTermText() {
+ return CATEGORY_LIST_TERM_TEXT;
+ }
+
+ public CategoryListTokenizer getCategoryListTokenizer(
+ TokenStream tokenizer, EnhancementsIndexingParams indexingParams,
+ TaxonomyWriter taxonomyWriter) {
+ return new AssociationListTokenizer(tokenizer, indexingParams, this);
+ }
+
+ public byte[] getCategoryTokenBytes(CategoryAttribute categoryAttribute) {
+
+ AssociationProperty property = getAssociationProperty(categoryAttribute);
+
+ if (property == null) {
+ return null;
+ }
+
+ int association = property.getAssociation();
+ int bytesNeeded = Vint8.bytesNeeded(association);
+ byte[] buffer = new byte[bytesNeeded];
+ Vint8.encode(association, buffer, 0);
+ return buffer;
+ }
+
+ public static AssociationProperty getAssociationProperty(
+ CategoryAttribute categoryAttribute) {
+ AssociationProperty property = null;
+ Set
+ * The return value is an int cast as a long if the ordinal has an
+ * associated value. Otherwise the returned value is
+ * {@link #NO_ASSOCIATION}, which is a 'pure long' value (i.e., not in the
+ * int range of values).
+ *
+ * @param ordinal
+ * for which the association value is requested
+ * @return the associated int value (encapsulated in a long) if the ordinal
+ * had an associated value, or {@link #NO_ASSOCIATION} otherwise
+ */
+ public long getAssociation(int ordinal) {
+ if (ordinalToAssociationMap.containsKey(ordinal)) {
+ return ordinalToAssociationMap.get(ordinal);
+ }
+
+ return NO_ASSOCIATION;
+ }
+
+ /**
+ * Get an iterator over the ordinals which have an association for the
+ * document set by {@link #setNextDoc(int)}.
+ */
+ public IntIterator getAssociatedOrdinals() {
+ return ordinalToAssociationMap.keyIterator();
+ }
+
+ /**
+ * Skips to the given docId, getting the values in pairs of (ordinal, value)
+ * and populating the map
+ *
+ * @param docId
+ * document id owning the associations
+ * @return true if associations were fetched successfully, false otherwise
+ * @throws IOException
+ * on error
+ */
+ private boolean fetchAssociations(int docId) throws IOException {
+ // No associations at all? don't bother trying to seek the docID in the
+ // posting
+ if (!hasAssociations) {
+ return false;
+ }
+
+ // No associations for this document? Well, nothing to decode then;
+ // return false
+ if (!associationPayloadIter.skipTo(docId)) {
+ return false;
+ }
+
+ // loop over all the values decoded from the payload in pairs.
+ for (;;) {
+ // Get the ordinal
+ long ordinal = associationPayloadIter.nextCategory();
+
+ // if no ordinal - it's the end of data, break the loop
+ if (ordinal > Integer.MAX_VALUE) {
+ break;
+ }
+
+ // get the associated value
+ long association = associationPayloadIter.nextCategory();
+ // If we're at this step - it means we have an ordinal, do we have
+ // an association for it?
+ if (association > Integer.MAX_VALUE) {
+ // No association!!! A Broken Pair!! PANIC!
+ throw new IOException(
+ "ERROR! Associations should come in pairs of (ordinal, value), yet this payload has an odd number of values! (docId="
+ + docId + ")");
+ }
+ // Populate the map with the given ordinal and association pair
+ ordinalToAssociationMap.put((int) ordinal, (int) association);
+ }
+
+ return true;
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime
+ * result
+ + ((associationPayloadIter == null) ? 0
+ : associationPayloadIter.hashCode());
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+
+ if (obj == null) {
+ return false;
+ }
+
+ if (getClass() != obj.getClass()) {
+ return false;
+ }
+
+ AssociationsPayloadIterator other = (AssociationsPayloadIterator) obj;
+ if (associationPayloadIter == null) {
+ if (other.associationPayloadIter != null) {
+ return false;
+ }
+ } else if (!associationPayloadIter.equals(other.associationPayloadIter)) {
+ return false;
+ }
+ return true;
+ }
+
+}
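
A sketch of consuming this iterator; setNextDoc(int) is the method the
javadoc above refers to (assumed to decode the document's pairs like the
private fetchAssociations(int) it delegates to), NO_ASSOCIATION is assumed to
be a constant on the class, and IntIterator is assumed to mirror
java.util.Iterator with hasNext()/next():

  static void printAssociations(AssociationsPayloadIterator it, int docId)
      throws IOException {
    it.setNextDoc(docId); // decodes this document's (ordinal, value) pairs
    IntIterator ordinals = it.getAssociatedOrdinals();
    while (ordinals.hasNext()) {
      int ordinal = ordinals.next();
      long value = it.getAssociation(ordinal);
      if (value != AssociationsPayloadIterator.NO_ASSOCIATION) {
        System.out.println(ordinal + " -> " + (int) value);
      }
    }
  }
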
diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/package.html b/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/package.html
new file mode 100644
index 00000000000..7c19bb32452
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/package.html
@@ -0,0 +1,13 @@
+
+Association category enhancements
+
+A {@link org.apache.lucene.facet.enhancements.CategoryEnhancement CategoryEnhancement}
+for adding associations data to the index (categories with
+{@link org.apache.lucene.facet.enhancements.association.AssociationProperty AssociationProperty}'s).
+
+
+
\ No newline at end of file
diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/package.html b/modules/facet/src/java/org/apache/lucene/facet/enhancements/package.html
new file mode 100644
index 00000000000..f8515111dd2
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/package.html
@@ -0,0 +1,32 @@
+
+
+Enhanced category features
+
+Mechanisms for addition of enhanced category features.
+
+
+
+
+
diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/params/DefaultEnhancementsIndexingParams.java b/modules/facet/src/java/org/apache/lucene/facet/enhancements/params/DefaultEnhancementsIndexingParams.java
new file mode 100644
index 00000000000..57d4580055c
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/params/DefaultEnhancementsIndexingParams.java
@@ -0,0 +1,98 @@
+package org.apache.lucene.facet.enhancements.params;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.facet.enhancements.CategoryEnhancement;
+import org.apache.lucene.facet.index.attributes.CategoryProperty;
+import org.apache.lucene.facet.index.params.CategoryListParams;
+import org.apache.lucene.facet.index.params.PerDimensionIndexingParams;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Default implementation of {@link EnhancementsIndexingParams}
+ *
+ * @lucene.experimental
+ */
+public class DefaultEnhancementsIndexingParams extends
+ PerDimensionIndexingParams implements EnhancementsIndexingParams {
+
+ private List<CategoryEnhancement> enhancedCategories;
+
+Enhanced category features
+
+{@link org.apache.lucene.facet.index.params.FacetIndexingParams FacetIndexingParams}
+used by
+{@link org.apache.lucene.facet.enhancements.EnhancementsDocumentBuilder EnhancementsDocumentBuilder}
+for adding
+{@link org.apache.lucene.facet.enhancements.CategoryEnhancement CategoryEnhancement}'s
+to the indexing parameters, and accessing them during indexing and search.
+
+
+
\ No newline at end of file
diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/CategoryContainer.java b/modules/facet/src/java/org/apache/lucene/facet/index/CategoryContainer.java
new file mode 100644
index 00000000000..16336e14eac
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/index/CategoryContainer.java
@@ -0,0 +1,282 @@
+package org.apache.lucene.facet.index;
+
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.util.Attribute;
+
+import org.apache.lucene.facet.FacetException;
+import org.apache.lucene.facet.index.attributes.CategoryAttribute;
+import org.apache.lucene.facet.index.attributes.CategoryAttributeImpl;
+import org.apache.lucene.facet.index.attributes.CategoryProperty;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A container to add categories which are to be introduced to
+ * {@link CategoryDocumentBuilder#setCategories(Iterable)}. Categories can be
+ * added with Properties.
+ *
+ * @lucene.experimental
+ */
+public class CategoryContainer implements Iterable<CategoryAttribute>, Serializable {
+
+ /**
+ * NOTE: {@link CategoryProperty}s are {@link Serializable}, but do not
+ * assume that Lucene's {@link Attribute}s are as well
+ * @throws IOException
+ */
+ protected void serializeCategoryAttribute(ObjectOutputStream out,
+ CategoryAttribute ca) throws IOException {
+ out.writeObject(ca.getCategoryPath());
+ Set<Class<? extends CategoryProperty>> propertyClasses = ca.getPropertyClasses();
+
+ /**
+ * Construction could be done with either a given {@link FacetIndexingParams} or
+ * the default implementation {@link DefaultFacetIndexingParams}.
+ * A CategoryDocumentBuilder can be reused by repeatedly setting the categories
+ * and building the document. Categories are provided either as
+ * {@link CategoryAttribute} elements through {@link #setCategories(Iterable)},
+ * or as {@link CategoryPath} elements through
+ * {@link #setCategoryPaths(Iterable)}.
+ *
+ * See:
+ * {@link #CategoryDocumentBuilder(TaxonomyWriter, FacetIndexingParams)}
+ *
+ * @param taxonomyWriter
+ * to which new categories will be added, as well as translating
+ * known categories to ordinals
+ * @throws IOException
+ *
+ */
+ public CategoryDocumentBuilder(TaxonomyWriter taxonomyWriter)
+ throws IOException {
+ this(taxonomyWriter, new DefaultFacetIndexingParams());
+ }
+
+ /**
+ * Creating a facets document builder with a given facet indexing parameters
+ * object.
+ *
+ * @param taxonomyWriter
+ * to which new categories will be added, as well as translating
+ * known categories to ordinals
+ * @param params
+ * holds all parameters the indexing process should use such as
+ * category-list parameters
+ * @throws IOException
+ */
+ public CategoryDocumentBuilder(TaxonomyWriter taxonomyWriter,
+ FacetIndexingParams params) throws IOException {
+ this.taxonomyWriter = taxonomyWriter;
+ this.indexingParams = params;
+ this.categoriesMap = new HashMap<String, List<CategoryAttribute>>();
+ * // merge the old taxonomy with the new one.
+ * OrdinalMap map = LuceneTaxonomyWriter.addTaxonomies();
+ * int[] ordmap = map.getMap();
+ *
+ * // re-map the ordinals on the old directory.
+ * Directory oldDir;
+ * FacetsPayloadProcessorProvider fppp = new FacetsPayloadProcessorProvider(
+ * oldDir, ordmap);
+ * IndexWriterConfig conf = new IndexWriterConfig(VER, ANALYZER);
+ * conf.setMergePolicy(new ForceOptimizeMergePolicy());
+ * IndexWriter writer = new IndexWriter(oldDir, conf);
+ * writer.setPayloadProcessorProvider(fppp);
+ * writer.optimize();
+ * writer.close();
+ *
+ * // merge that directory with the new index.
+ * IndexWriter newWriter; // opened on the 'new' Directory
+ * newWriter.addIndexes(oldDir);
+ * newWriter.commit();
+ *
+ *
+ * For re-mapping the ordinals during index merge, do the following:
+ *
+ *
+ * // merge the old taxonomy with the new one.
+ * OrdinalMap map = LuceneTaxonomyWriter.addTaxonomies();
+ * int[] ordmap = map.getMap();
+ *
+ * // Add the index and re-map ordinals on the go
+ * IndexReader r = IndexReader.open(oldDir);
+ * IndexWriterConfig conf = new IndexWriterConfig(VER, ANALYZER);
+ * IndexWriter writer = new IndexWriter(newDir, conf);
+ * writer.setPayloadProcessorProvider(fppp);
+ * writer.addIndexes(r);
+ * writer.commit();
+ *
+ *
+ * Adding multiple properties of the same class is forbidden.
+ *
+ * @param property
+ * The property to add.
+ * @throws UnsupportedOperationException
+ * When attempting to add a property of a class that was added
+ * before and merge is prohibited.
+ */
+ public void addProperty(CategoryProperty property)
+ throws UnsupportedOperationException;
+
+ /**
+ * Get a property of a certain property class.
+ *
+ * @param propertyClass
+ * The required property class.
+ * @return The property of the given class, or null if no such property
+ * exists.
+ */
+ public CategoryProperty getProperty(
+ Class<? extends CategoryProperty> propertyClass);
+
+ /**
+ * Get a property of one of given property classes.
+ *
+ * @param propertyClasses
+ * The property classes.
+ * @return A property matching one of the given classes, or null if no such
+ * property exists.
+ */
+ public CategoryProperty getProperty(
+ Collection<Class<? extends CategoryProperty>> propertyClasses);
+
+Category attributes and their properties for indexing
+
+Attributes for a {@link org.apache.lucene.facet.taxonomy.CategoryPath category},
+possibly containing
+{@link org.apache.lucene.facet.index.attributes.CategoryProperty category property}'s.
+
+
+
\ No newline at end of file
diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/DefaultOrdinalPolicy.java b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/DefaultOrdinalPolicy.java
new file mode 100644
index 00000000000..95de238c841
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/DefaultOrdinalPolicy.java
@@ -0,0 +1,43 @@
+package org.apache.lucene.facet.index.categorypolicy;
+
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * This class filters out the ROOT category ID. For more information see
+ * {@link OrdinalPolicy}.
+ *
+ * @lucene.experimental
+ */
+public class DefaultOrdinalPolicy implements OrdinalPolicy {
+
+ /**
+ * Filters out (returns false) ordinals less than or equal to
+ * {@link TaxonomyReader#ROOT_ORDINAL}; returns true otherwise.
+ */
+ public boolean shouldAdd(int ordinal) {
+ return ordinal > TaxonomyReader.ROOT_ORDINAL;
+ }
+
+ /**
+ * Implemented as NO-OP as the default is not taxonomy dependent
+ */
+ public void init(TaxonomyWriter taxonomyWriter) { }
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/DefaultPathPolicy.java b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/DefaultPathPolicy.java
new file mode 100644
index 00000000000..2fb172dad57
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/DefaultPathPolicy.java
@@ -0,0 +1,38 @@
+package org.apache.lucene.facet.index.categorypolicy;
+
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * This class filters out the ROOT category path. For more information see
+ * {@link PathPolicy}.
+ *
+ * @lucene.experimental
+ */
+public class DefaultPathPolicy implements PathPolicy {
+
+ /**
+ * Filters out (returns false) the empty (ROOT) CategoryPath; returns
+ * true for any path with at least one component.
+ */
+ public boolean shouldAdd(CategoryPath categoryPath) {
+ return categoryPath.length() > 0;
+ }
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/NonTopLevelOrdinalPolicy.java b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/NonTopLevelOrdinalPolicy.java
new file mode 100644
index 00000000000..ee4c6fb89f5
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/NonTopLevelOrdinalPolicy.java
@@ -0,0 +1,71 @@
+package org.apache.lucene.facet.index.categorypolicy;
+
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Filter out any "top level" category ordinals, as checked by
+ * {@link #shouldAdd(int)}.
+ *
+ * @lucene.experimental
+ */
+public class NonTopLevelOrdinalPolicy implements OrdinalPolicy {
+
+ /**
+ * The taxonomyWriter with which the given ordinals' parent is determined.
+ */
+ private TaxonomyWriter taxonomyWriter;
+
+ /**
+ * Constructs a new non-top-level ordinal filter. A {@link TaxonomyWriter}
+ * must later be provided through {@link #init(TaxonomyWriter)}.
+ *
+ */
+ public NonTopLevelOrdinalPolicy() {
+ this.taxonomyWriter = null;
+ }
+
+ /**
+ * @param taxonomyWriter
+ * A relevant taxonomyWriter object, with which ordinals sent to
+ * {@link #shouldAdd(int)} are examined.
+ */
+ public void init(TaxonomyWriter taxonomyWriter) {
+ this.taxonomyWriter = taxonomyWriter;
+ }
+
+ /**
+ * Filters out ordinals which are ROOT or whose parent is ROOT. Determining
+ * whether a parent is ROOT requires
+ * {@link TaxonomyWriter#getParent(int)}.
+ */
+ public boolean shouldAdd(int ordinal) {
+ if (ordinal > TaxonomyReader.ROOT_ORDINAL) {
+ try {
+ if (this.taxonomyWriter.getParent(ordinal) > TaxonomyReader.ROOT_ORDINAL) {
+ return true;
+ }
+ } catch (Exception e) {
+ return false;
+ }
+ }
+ return false;
+ }
+
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/NonTopLevelPathPolicy.java b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/NonTopLevelPathPolicy.java
new file mode 100644
index 00000000000..768c0b20cb5
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/NonTopLevelPathPolicy.java
@@ -0,0 +1,43 @@
+package org.apache.lucene.facet.index.categorypolicy;
+
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * This class filters out the ROOT category and its direct descendants. For
+ * more information see {@link PathPolicy}.
+ *
+ * @lucene.experimental
+ */
+public class NonTopLevelPathPolicy implements PathPolicy {
+
+ /**
+ * The shortest path length delivered is two components (root + one child).
+ */
+ public final int DEFAULT_MINIMAL_SUBPATH_LENGTH = 2;
+
+ /**
+ * Filters out (returns false) CategoryPaths shorter than
+ * {@link #DEFAULT_MINIMAL_SUBPATH_LENGTH}; returns true otherwise.
+ */
+ public boolean shouldAdd(CategoryPath categoryPath) {
+ return categoryPath.length() >= DEFAULT_MINIMAL_SUBPATH_LENGTH;
+ }
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/OrdinalPolicy.java b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/OrdinalPolicy.java
new file mode 100644
index 00000000000..b300a28cfb2
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/OrdinalPolicy.java
@@ -0,0 +1,56 @@
+package org.apache.lucene.facet.index.categorypolicy;
+
+import java.io.Serializable;
+
+import org.apache.lucene.facet.index.streaming.CategoryParentsStream;
+import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Filtering category ordinals in {@link CategoryParentsStream}, where a given
+ * category ordinal is added to the stream, and then its parents are added
+ * one after the other using {@link TaxonomyWriter#getParent(int)}.
+ * That loop should have a stop point - the default approach (excluding the
+ * ROOT) is implemented in {@link DefaultOrdinalPolicy}.
+ *
+ * @lucene.experimental
+ */
+public interface OrdinalPolicy extends Serializable {
+
+ /**
+ * Check whether a given category ordinal should be added to the stream.
+ *
+ * @param ordinal
+ * A given category ordinal which is to be tested for stream
+ * addition.
+ * @return {@code true} if the category should be added,
+ * {@code false} otherwise.
+ */
+ public abstract boolean shouldAdd(int ordinal);
+
+ /**
+ * Initialize the policy with a TaxonomyWriter. This method can be
+ * implemented as a no-op if the ordinal policy is not taxonomy dependent.
+ *
+ * @param taxonomyWriter
+ * A relevant taxonomyWriter object, with which ordinals sent to
+ * {@link #shouldAdd(int)} are examined.
+ */
+ public abstract void init(TaxonomyWriter taxonomyWriter);
+}
\ No newline at end of file
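
For illustration, a policy equivalent to DefaultOrdinalPolicy written
against this interface (the class name is made up):

  public class KeepAllButRootOrdinalPolicy implements OrdinalPolicy {

    public boolean shouldAdd(int ordinal) {
      // stop the parent-climbing loop once ROOT is reached
      return ordinal > TaxonomyReader.ROOT_ORDINAL;
    }

    public void init(TaxonomyWriter taxonomyWriter) {
      // no-op: this policy is not taxonomy dependent
    }
  }
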
diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/PathPolicy.java b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/PathPolicy.java
new file mode 100644
index 00000000000..9f49f502b08
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/PathPolicy.java
@@ -0,0 +1,47 @@
+package org.apache.lucene.facet.index.categorypolicy;
+
+import java.io.Serializable;
+
+import org.apache.lucene.facet.index.streaming.CategoryParentsStream;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Filtering category paths in {@link CategoryParentsStream}, where a given
+ * category is added to the stream, and then all its parents are
+ * added one after the other by successively removing the last component.
+ * That loop should have a stop point - the default approach (excluding the
+ * ROOT) is implemented in {@link DefaultPathPolicy}.
+ *
+ * @lucene.experimental
+ */
+public interface PathPolicy extends Serializable {
+
+ /**
+ * Check whether a given category path should be added to the stream.
+ *
+ * @param categoryPath
+ * A given category path which is to be tested for stream
+ * addition.
+ * @return {@code true} if the category path should be added,
+ * {@code false} otherwise.
+ */
+ public abstract boolean shouldAdd(CategoryPath categoryPath);
+
+}
\ No newline at end of file
diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/package.html b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/package.html
new file mode 100644
index 00000000000..b95117ef93a
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/package.html
@@ -0,0 +1,21 @@
+
+
+Policies for indexing categories
+
+There are two kinds of policies:
+<ul>
+  <li>Path policies filter out category paths.</li>
+  <li>Ordinal policies filter out category ordinals.</li>
+</ul>
+Policies are "consulted" during indexing, for deciding whether a category should
+be added to the index or not. The two kinds of policies can be used for different purposes.
+For example, path policies dictate which categories can participate in a drill-down operation,
+while ordinal policies affect which categories can be accumulated (e.g. counted).
+
+
+
+
\ No newline at end of file
diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/package.html b/modules/facet/src/java/org/apache/lucene/facet/index/package.html
new file mode 100644
index 00000000000..18c67078124
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/index/package.html
@@ -0,0 +1,15 @@
+
+
+Indexing of document categories
+
+Attachment of
+{@link org.apache.lucene.facet.taxonomy.CategoryPath CategoryPath}'s
+or {@link org.apache.lucene.facet.index.attributes.CategoryAttribute CategoryAttribute}'s
+to a given document using a
+{@link org.apache.lucene.facet.taxonomy.TaxonomyWriter Taxonomy}.
+
+
+
\ No newline at end of file
diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java b/modules/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java
new file mode 100644
index 00000000000..4ec1a142f4d
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java
@@ -0,0 +1,149 @@
+package org.apache.lucene.facet.index.params;
+
+import java.io.IOException;
+import java.io.Serializable;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+
+import org.apache.lucene.facet.search.CategoryListIterator;
+import org.apache.lucene.facet.search.PayloadIntDecodingIterator;
+import org.apache.lucene.facet.search.TotalFacetCounts;
+import org.apache.lucene.facet.util.PartitionsUtils;
+import org.apache.lucene.util.encoding.DGapIntEncoder;
+import org.apache.lucene.util.encoding.IntDecoder;
+import org.apache.lucene.util.encoding.IntEncoder;
+import org.apache.lucene.util.encoding.SortingIntEncoder;
+import org.apache.lucene.util.encoding.UniqueValuesIntEncoder;
+import org.apache.lucene.util.encoding.VInt8IntEncoder;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Contains parameters for a category list *
+ *
+ * @lucene.experimental
+ */
+public class CategoryListParams implements Serializable {
+
+ /** The default term used to store the facets information. */
+ public static final Term DEFAULT_TERM = new Term("$facets", "$fulltree$");
+
+ private final Term term;
+
+ private final int hashCode;
+
+ /**
+ * Constructs a default category list parameters object, using
+ * {@link #DEFAULT_TERM}.
+ */
+ public CategoryListParams() {
+ this(DEFAULT_TERM);
+ }
+
+ /**
+ * Constructs a category list parameters object, using the given {@link Term}.
+ * @param term whose payload holds the category-list.
+ */
+ public CategoryListParams(Term term) {
+ this.term = term;
+ // Pre-compute the hashCode because these objects are immutable. Saves
+ // some time on the comparisons later.
+ this.hashCode = term.hashCode();
+ }
+
+ /**
+ * A {@link Term} whose payload holds the category-list.
+ */
+ public final Term getTerm() {
+ return term;
+ }
+
+ /**
+ * Allows overriding how categories are encoded and decoded. A matching
+ * {@link IntDecoder} is provided by the {@link IntEncoder}.
+ *
+ *
+ * In any event when changing this value make sure you know what you are
+ * doing, and test the results - e.g. counts, if the application is about
+ * counting facets.
+ */
+ public IntEncoder createEncoder() {
+ return new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())));
+ }
+
+ /**
+ * Equality is defined by the 'term' that defines this category list.
+ * Sub-classes should override this method if a more complex calculation
+ * is needed to ensure equality.
+ */
+ @Override
+ public boolean equals(Object o) {
+ if (o == this) {
+ return true;
+ }
+ if (!(o instanceof CategoryListParams)) {
+ return false;
+ }
+ CategoryListParams other = (CategoryListParams) o;
+ if (this.hashCode != other.hashCode) {
+ return false;
+ }
+ // The above hashcodes might equal each other in the case of a collision,
+ // so at this point only direct term equality testing will settle
+ // the equality test.
+ return this.term.equals(other.term);
+ }
+
+ /**
+ * Hashcode is similar to {@link #equals(Object)}, in that it uses
+ * the term that defines this category list to derive the hashcode.
+ * Subclasses need to ensure that equality/hashcode is correctly defined,
+ * or there could be side-effects in the {@link TotalFacetCounts} caching
+ * mechanism (as the filename for a Total Facet Counts array cache
+ * is dependent on the hashCode, so it should consistently return the same
+ * hash for identity).
+ */
+ @Override
+ public int hashCode() {
+ return this.hashCode;
+ }
+
+ /**
+ * Create the category list iterator for the specified partition.
+ */
+ public CategoryListIterator createCategoryListIterator(IndexReader reader,
+ int partition) throws IOException {
+ String categoryListTermStr = PartitionsUtils.partitionName(this, partition);
+ Term payloadTerm = new Term(term.field(), categoryListTermStr);
+ return new PayloadIntDecodingIterator(reader, payloadTerm,
+ createEncoder().createMatchingDecoder());
+ }
+
+}
\ No newline at end of file
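
A sketch of swapping the final encoder in the chain via a subclass;
FourFlagsIntEncoder is assumed to be one of the encoders available in
org.apache.lucene.util.encoding:

  public class FourFlagsCategoryListParams extends CategoryListParams {
    @Override
    public IntEncoder createEncoder() {
      // same sort/dedupe/d-gap pipeline, different final encoding
      return new SortingIntEncoder(
          new UniqueValuesIntEncoder(
              new DGapIntEncoder(new FourFlagsIntEncoder())));
    }
  }

Per the equals()/hashCode() notes above, a subclass that changes the encoding
for the same term should also refine equality, since the TotalFacetCounts
cache keys off it.
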
diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/params/DefaultFacetIndexingParams.java b/modules/facet/src/java/org/apache/lucene/facet/index/params/DefaultFacetIndexingParams.java
new file mode 100644
index 00000000000..557b9180e6b
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/index/params/DefaultFacetIndexingParams.java
@@ -0,0 +1,196 @@
+package org.apache.lucene.facet.index.params;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.facet.index.categorypolicy.DefaultOrdinalPolicy;
+import org.apache.lucene.facet.index.categorypolicy.DefaultPathPolicy;
+import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy;
+import org.apache.lucene.facet.index.categorypolicy.PathPolicy;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Default implementation for {@link FacetIndexingParams}.
+ * Make sure {@code buffer} is large enough.
+ * @see CategoryPath#charsNeededForFullPath()
+ */
+ public int drillDownTermText(CategoryPath path, char[] buffer);
+
+ /**
+ * Get the partition size.
+ * Same value should be used during the life time of an index.
+ * At search time this value is compared with actual taxonomy size and their minimum is used.
+ */
+ public int getPartitionSize();
+
+ /**
+ * Get the policy for indexing category paths,
+ * used for deciding how "high" to climb in taxonomy
+ * from a category when ingesting its category paths.
+ */
+ public PathPolicy getPathPolicy();
+
+ /**
+ * Get the policy for indexing category ordinals,
+ * used for deciding how "high" to climb in taxonomy
+ * from a category when ingesting its ordinals
+ */
+ public OrdinalPolicy getOrdinalPolicy();
+
+ /**
+ * Get the delimiter character used internally for drill-down terms
+ */
+ public char getFacetDelimChar();
+}
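
The two drill-down calls combine as follows, the same pattern DrillDown.term()
uses later in this patch (assuming the int return value of drillDownTermText
is the number of chars written):

  static String drillDownText(FacetIndexingParams iParams, CategoryPath path) {
    char[] buffer = new char[path.charsNeededForFullPath()];
    int numChars = iParams.drillDownTermText(path, buffer);
    return new String(buffer, 0, numChars);
  }
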
diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/params/FacetParamsMissingPropertyException.java b/modules/facet/src/java/org/apache/lucene/facet/index/params/FacetParamsMissingPropertyException.java
new file mode 100644
index 00000000000..adb8181e509
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/index/params/FacetParamsMissingPropertyException.java
@@ -0,0 +1,32 @@
+package org.apache.lucene.facet.index.params;
+
+import org.apache.lucene.facet.FacetException;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Thrown when the facets params are missing a property.
+ *
+ * @lucene.experimental
+ */
+public class FacetParamsMissingPropertyException extends FacetException {
+
+ public FacetParamsMissingPropertyException(String key) {
+ super("Property with key \"" + key + "\" not found");
+ }
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/params/PerDimensionIndexingParams.java b/modules/facet/src/java/org/apache/lucene/facet/index/params/PerDimensionIndexingParams.java
new file mode 100644
index 00000000000..1df4c0e7540
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/index/params/PerDimensionIndexingParams.java
@@ -0,0 +1,105 @@
+package org.apache.lucene.facet.index.params;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A FacetIndexingParams that utilizes different category lists, defined by the
+ * dimension specified CategoryPaths (see
+ * {@link PerDimensionIndexingParams#addCategoryListParams(CategoryPath, CategoryListParams)}).
+
+Indexing-time specifications for handling facets
+
+Parameters on how facets are to be written to the index,
+such as which fields and terms are used to refer to the facets posting list.
+
+
+
\ No newline at end of file
diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CategoryAttributesStream.java b/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CategoryAttributesStream.java
new file mode 100644
index 00000000000..a869219c378
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CategoryAttributesStream.java
@@ -0,0 +1,81 @@
+package org.apache.lucene.facet.index.streaming;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.analysis.TokenStream;
+
+import org.apache.lucene.facet.index.attributes.CategoryAttribute;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * An attribute stream built from an {@link Iterable} of
+ * {@link CategoryAttribute}. This stream should then be passed through several
+ * filters (see {@link CategoryParentsStream}, {@link CategoryListTokenizer} and
+ * {@link CategoryTokenizer}) until a token stream is produced that can be
+ * indexed by Lucene.
+ *
+ * By default, category properties are removed when creating parents of a
+ * certain category. However, it is possible to retain certain property types
+ * using {@link #addRetainableProperty(Class)}.
+ *
+ * @lucene.experimental
+ */
+public class CategoryParentsStream extends TokenFilter {
+
+ /**
+ * A {@link TaxonomyWriter} for adding categories and retrieving their
+ * ordinals.
+ */
+ protected TaxonomyWriter taxonomyWriter;
+
+ /** An attribute containing all data related to the category */
+ protected CategoryAttribute categoryAttribute;
+
+ /** A category property containing the category ordinal */
+ protected OrdinalProperty ordinalProperty;
+
+ /**
+ * A set of property classes that are to be retained when creating a parent
+ * token.
+ */
+ private Set<Class<? extends CategoryProperty>> retainableProperties;
+
+ /**
+ * It is possible to retain properties of certain types in the parent tokens,
+ * using {@link #addRetainableProperty(Class)}.
+ */
+ protected void clearCategoryProperties() {
+ if (this.retainableProperties == null
+ || this.retainableProperties.isEmpty()) {
+ this.categoryAttribute.clearProperties();
+ } else {
+ List<Class<? extends CategoryProperty>> propertyClasses;
+
+Expert: attributes streaming definition for indexing facets
+
+Streaming of facet attributes is a low-level interface for indexing facets with Lucene.
+There are two types of category related streams:
+
+
+
+
+
\ No newline at end of file
diff --git a/modules/facet/src/java/org/apache/lucene/facet/package.html b/modules/facet/src/java/org/apache/lucene/facet/package.html
new file mode 100644
index 00000000000..494f27fe13e
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/package.html
@@ -0,0 +1,8 @@
+
+
+
+ *
+ *
+ * @lucene.experimental
+ */
+public interface CategoryListIterator {
+
+ /**
+ * Initializes the iterator. This method must be called before any calls to
+ * {@link #skipTo(int)}, and its return value indicates whether there are
+ * any relevant documents for this iterator. If it returns false, any call
+ * to {@link #skipTo(int)} will return false as well.
+ * In addition, implementations of hashCode() and equals() must be provided.
+ *
+ * NOTE: calling this method twice may result in skipping over
+ * documents for some implementations. Also, calling it again after all
+ * documents were consumed may yield unexpected behavior.
+ */
+ public boolean init() throws IOException;
+
+ /**
+ * Skips forward to document docId. Returns true iff this document exists
+ * and has any categories. This method must be called before calling
+ * {@link #nextCategory()} for a particular document.
+ * NOTE: Users should call this method with increasing docIds, and
+ * implementations can assume that this is the case.
+ */
+ public boolean skipTo(int docId) throws IOException;
+
+ /**
+ * Returns the next category for the current document that is set through
+ * {@link #skipTo(int)}, or a number higher than {@link Integer#MAX_VALUE}.
+ * No assumptions can be made on the order of the categories.
+ */
+ public long nextCategory() throws IOException;
+
+}
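
The iteration protocol, spelled out: init() once per iterator, skipTo() with
increasing docIds, then drain nextCategory() until the sentinel value. A
minimal sketch:

  static void dumpCategories(CategoryListIterator cli, int maxDoc)
      throws IOException {
    if (!cli.init()) {
      return; // no relevant documents for this iterator
    }
    for (int docId = 0; docId < maxDoc; docId++) { // increasing, as required
      if (!cli.skipTo(docId)) {
        continue; // document absent or has no categories
      }
      long category;
      while ((category = cli.nextCategory()) <= Integer.MAX_VALUE) {
        System.out.println(docId + " -> ordinal " + (int) category);
      }
    }
  }
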
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/DrillDown.java b/modules/facet/src/java/org/apache/lucene/facet/search/DrillDown.java
new file mode 100644
index 00000000000..823a7292feb
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/DrillDown.java
@@ -0,0 +1,110 @@
+package org.apache.lucene.facet.search;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.BooleanClause.Occur;
+
+import org.apache.lucene.facet.index.params.CategoryListParams;
+import org.apache.lucene.facet.index.params.FacetIndexingParams;
+import org.apache.lucene.facet.search.params.FacetSearchParams;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Creation of drill down term or query.
+ *
+ * @lucene.experimental
+ */
+public final class DrillDown {
+
+ /**
+ * @see #term(FacetIndexingParams, CategoryPath)
+ */
+ public static final Term term(FacetSearchParams sParams, CategoryPath path) {
+ return term(sParams.getFacetIndexingParams(), path);
+ }
+
+ /**
+ * Return a term for drilling down into a category.
+ */
+ public static final Term term(FacetIndexingParams iParams, CategoryPath path) {
+ CategoryListParams clp = iParams.getCategoryListParams(path);
+ char[] buffer = new char[path.charsNeededForFullPath()];
+ iParams.drillDownTermText(path, buffer);
+ return new Term(clp.getTerm().field(), String.valueOf(buffer));
+ }
+
+ /**
+ * Return a query for drilling down into all given categories (AND).
+ * @see #term(FacetSearchParams, CategoryPath)
+ * @see #query(FacetSearchParams, Query, CategoryPath...)
+ */
+ public static final Query query(FacetIndexingParams iParams, CategoryPath... paths) {
+ if (paths==null || paths.length==0) {
+ throw new IllegalArgumentException("Empty category path not allowed for drill down query!");
+ }
+ if (paths.length==1) {
+ return new TermQuery(term(iParams, paths[0]));
+ }
+ BooleanQuery res = new BooleanQuery();
+ for (CategoryPath cp : paths) {
+ res.add(new TermQuery(term(iParams, cp)), Occur.MUST);
+ }
+ return res;
+ }
+
+ /**
+ * Return a query for drilling down into all given categories (AND).
+ * @see #term(FacetSearchParams, CategoryPath)
+ * @see #query(FacetSearchParams, Query, CategoryPath...)
+ */
+ public static final Query query(FacetSearchParams sParams, CategoryPath... paths) {
+ return query(sParams.getFacetIndexingParams(), paths);
+ }
+
+ /**
+ * Turn a base query into a drilling-down query for all given category paths (AND).
+ * @see #query(FacetIndexingParams, CategoryPath...)
+ */
+ public static final Query query(FacetIndexingParams iParams, Query baseQuery, CategoryPath... paths) {
+ BooleanQuery res = new BooleanQuery();
+ res.add(baseQuery, Occur.MUST);
+ res.add(query(iParams, paths), Occur.MUST);
+ return res;
+ }
+
+ /**
+ * Turn a base query into a drilling-down query for all given category paths (AND).
+ * @see #query(FacetSearchParams, CategoryPath...)
+ */
+ public static final Query query(FacetSearchParams sParams, Query baseQuery, CategoryPath... paths) {
+ return query(sParams.getFacetIndexingParams(), baseQuery, paths);
+ }
+
+ /**
+ * Turn a base query into a drilling-down query using the default {@link FacetSearchParams}
+ * @see #query(FacetSearchParams, Query, CategoryPath...)
+ */
+ public static final Query query(Query baseQuery, CategoryPath... paths) {
+ return query(new FacetSearchParams(), baseQuery, paths);
+ }
+
+}
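As a usage sketch (the field name and category values are invented for illustration, and CategoryPath's varargs String constructor is assumed), drilling down ANDs a base query with the category's drill-down term:

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TermQuery;

    import org.apache.lucene.facet.search.DrillDown;
    import org.apache.lucene.facet.taxonomy.CategoryPath;

    class DrillDownExample {
      static Query drillToAuthor() {
        Query baseQuery = new TermQuery(new Term("content", "river"));
        CategoryPath author = new CategoryPath("Author", "Mark Twain");
        // a BooleanQuery requiring both the base query and the drill-down term
        return DrillDown.query(baseQuery, author);
      }
    }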
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/FacetArrays.java b/modules/facet/src/java/org/apache/lucene/facet/search/FacetArrays.java
new file mode 100644
index 00000000000..cf954ade935
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/FacetArrays.java
@@ -0,0 +1,91 @@
+package org.apache.lucene.facet.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Provider of arrays used for facet operations such as counting.
+ *
+ * @lucene.experimental
+ */
+public class FacetArrays {
+
+ private int[] intArray;
+ private float[] floatArray;
+ private IntArrayAllocator intArrayAllocator;
+ private FloatArrayAllocator floatArrayAllocator;
+ private int arraysLength;
+
+ /**
+ * Create a FacetArrays with certain array allocators.
+ * @param intArrayAllocator allocator for int arrays.
+ * @param floatArrayAllocator allocator for float arrays.
+ */
+ public FacetArrays(IntArrayAllocator intArrayAllocator,
+ FloatArrayAllocator floatArrayAllocator) {
+ this.intArrayAllocator = intArrayAllocator;
+ this.floatArrayAllocator = floatArrayAllocator;
+ }
+
+ /**
+ * Notify allocators that they can free arrays allocated
+ * on behalf of this FacetArrays object.
+ */
+ public void free() {
+ if (intArrayAllocator!=null) {
+ intArrayAllocator.free(intArray);
+ // Should give up handle to the array now
+ // that it is freed.
+ intArray = null;
+ }
+ if (floatArrayAllocator!=null) {
+ floatArrayAllocator.free(floatArray);
+ // Should give up handle to the array now
+ // that it is freed.
+ floatArray = null;
+ }
+ arraysLength = 0;
+ }
+
+ /**
+ * Obtain an int array, e.g. for facet counting.
+ */
+ public int[] getIntArray() {
+ if (intArray == null) {
+ intArray = intArrayAllocator.allocate();
+ arraysLength = intArray.length;
+ }
+ return intArray;
+ }
+
+ /** Obtain a float array, e.g. for evaluating facet association values. */
+ public float[] getFloatArray() {
+ if (floatArray == null) {
+ floatArray = floatArrayAllocator.allocate();
+ arraysLength = floatArray.length;
+ }
+ return floatArray;
+ }
+
+ /**
+ * Return the length of the arrays
+ */
+ public int getArraysLength() {
+ return arraysLength;
+ }
+
+}
\ No newline at end of file
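A sketch of the intended lifecycle, assuming the allocator constructors take (length, maxArrays), as their javadoc later in the patch suggests:

    import org.apache.lucene.facet.search.FacetArrays;
    import org.apache.lucene.facet.search.FloatArrayAllocator;
    import org.apache.lucene.facet.search.IntArrayAllocator;

    class FacetArraysExample {
      static void countWithReusableArrays(int taxonomySize) {
        // pool up to 10 old arrays of length taxonomySize in each allocator
        FacetArrays arrays = new FacetArrays(
            new IntArrayAllocator(taxonomySize, 10),
            new FloatArrayAllocator(taxonomySize, 10));
        int[] counts = arrays.getIntArray(); // allocated (or reused) on first access
        // ... accumulate counts ...
        arrays.free(); // hand the arrays back to the pools for the next search
      }
    }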
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/FacetResultsHandler.java b/modules/facet/src/java/org/apache/lucene/facet/search/FacetResultsHandler.java
new file mode 100644
index 00000000000..10ea4847009
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/FacetResultsHandler.java
@@ -0,0 +1,161 @@
+package org.apache.lucene.facet.search;
+
+import java.io.IOException;
+
+import org.apache.lucene.facet.search.params.FacetRequest;
+import org.apache.lucene.facet.search.results.FacetResult;
+import org.apache.lucene.facet.search.results.FacetResultNode;
+import org.apache.lucene.facet.search.results.IntermediateFacetResult;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Handler for facet results.
+ *
+ * @lucene.experimental
+ */
+
+ /**
+ * Fetch the facet results of a single partition.
+ *
+ * @param arrays facet arrays for the partition
+ * @param offset offset in the input arrays where the partition starts
+ * @throws IllegalArgumentException if the facetRequest requests an
+ * illegal FacetResult, e.g., a root node category path that
+ * does not exist in the constructor parameter taxonomyReader.
+ * @throws IOException
+ * on error
+ */
+ public abstract IntermediateFacetResult fetchPartitionResult(FacetArrays arrays, int offset) throws IOException;
+
+ /**
+ * Merge results of several facet partitions. Logic of the merge is undefined
+ * and open for interpretations. For example, a merge implementation could
+ * keep top K results. Passed {@link IntermediateFacetResult} must be ones
+ * that were created by this handler otherwise a {@link ClassCastException} is
+ * thrown. In addition, all passed {@link IntermediateFacetResult} must have
+ * the same {@link FacetRequest} otherwise an {@link IllegalArgumentException}
+ * is thrown.
+ *
+ * @param tmpResults one or more temporary results created by this
+ * handler.
+ * @return temporary facet result that represents the union, as specified by
+ * this handler, of the input temporary facet results.
+ * @throws IOException on error.
+ * @throws ClassCastException if the temporary result passed was not created
+ * by this handler
+ * @throws IllegalArgumentException if the passed facetResults do not
+ * have the same {@link FacetRequest}
+ * @see IntermediateFacetResult#getFacetRequest()
+ */
+ public abstract IntermediateFacetResult mergeResults(IntermediateFacetResult... tmpResults)
+ throws IOException, ClassCastException, IllegalArgumentException;
+
+ /**
+ * Create a facet result from the temporary result.
+ * @param tmpResult temporary result to be rendered as a {@link FacetResult}
+ * @throws IOException on error.
+ */
+ public abstract FacetResult renderFacetResult(IntermediateFacetResult tmpResult) throws IOException ;
+
+ /**
+ * Perform any rearrangement as required on a facet result that has changed
+ * after it was rendered.
+ */
+
+ /**
+ * Get and remove the top of the Heap.
+ * NOTE: Once {@link #pop()} is called no other {@link #add(Object)} or
+ * {@link #insertWithOverflow(Object)} should be called.
+ */
+ public T pop();
+
+ /** Get (but not remove) the top of the Heap. */
+ public T top();
+
+ /**
+ * Insert a new value, returning the overflowed object.
+ * NOTE: This method should not be called after invoking {@link #pop()}
+ */
+ public T insertWithOverflow(T value);
+
+ /**
+ * Add a new value to the heap, return the new top().
+ * Some implementations may choose to not implement this functionality.
+ * In such a case {@code null} should be returned.
+ * NOTE: This method should not be called after invoking {@link #pop()}
+ */
+ public T add(T frn);
+
+ /** Clear the heap */
+ public void clear();
+
+ /** Return the number of objects currently in the heap */
+ public int size();
+}
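A draining sketch of the contract above, assuming the interface is declared as Heap&lt;T&gt; in this package: once pop() starts, the code makes no further add() or insertWithOverflow() calls.

    import java.util.ArrayList;
    import java.util.List;

    class HeapDrain {
      static <T> List<T> drain(Heap<T> heap) {
        List<T> ordered = new ArrayList<T>(heap.size());
        while (heap.size() > 0) {
          ordered.add(heap.pop()); // each pop yields the current top
        }
        heap.clear();
        return ordered;
      }
    }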
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/IntArrayAllocator.java b/modules/facet/src/java/org/apache/lucene/facet/search/IntArrayAllocator.java
new file mode 100644
index 00000000000..6b03d1cfd02
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/IntArrayAllocator.java
@@ -0,0 +1,68 @@
+package org.apache.lucene.facet.search;
+
+import java.util.Arrays;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * An IntArrayAllocator is an object which manages counter array objects
+ * of a certain length. These counter arrays are needed temporarily during
+ * faceted search (see {@link FacetsAccumulator}) and can be reused across searches
+ * instead of being allocated afresh on every search.
+ * Arrays are allocated for a certain fixed length, keeping around a pool of
+ * up to maxArrays old arrays.
+ *
+ * A typical use case of a payload-decoding category list iterator is:
+ * <pre>
+ * IndexReader reader = [open your reader];
+ * Term t = new Term("field", "where-payload-exists");
+ * CategoryListIterator cli = new PayloadIntDecodingIterator(reader, t);
+ * if (!cli.init()) {
+ * // it means there are no payloads / documents associated with that term.
+ * // Usually a sanity check. However, init() must be called.
+ * }
+ * DocIdSetIterator disi = [you usually iterate on something else, such as a Scorer];
+ * int doc;
+ * while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
+ * cli.skipTo(doc);
+ * long category;
+ * while ((category = cli.nextCategory()) < Integer.MAX_VALUE) {
+ * }
+ * }
+ * </pre>
+ *
+ * @lucene.experimental
+ */
+public class PayloadIntDecodingIterator implements CategoryListIterator {
+
+ private final UnsafeByteArrayInputStream ubais;
+ private final IntDecoder decoder;
+
+ private final IndexReader indexReader;
+ private final Term term;
+ private final PayloadIterator pi;
+ private final int hashCode;
+
+ public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder)
+ throws IOException {
+ this(indexReader, term, decoder, new byte[1024]);
+ }
+
+ public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder,
+ byte[] buffer) throws IOException {
+ pi = new PayloadIterator(indexReader, term, buffer);
+ ubais = new UnsafeByteArrayInputStream();
+ this.decoder = decoder;
+ hashCode = indexReader.hashCode() ^ term.hashCode();
+ this.term = term;
+ this.indexReader = indexReader;
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (!(other instanceof PayloadIntDecodingIterator)) {
+ return false;
+ }
+ PayloadIntDecodingIterator that = (PayloadIntDecodingIterator) other;
+ if (hashCode != that.hashCode) {
+ return false;
+ }
+
+ // Hash codes are the same, check equals() to avoid cases of hash-collisions.
+ return indexReader.equals(that.indexReader) && term.equals(that.term);
+ }
+
+ @Override
+ public int hashCode() {
+ return hashCode;
+ }
+
+ public boolean init() throws IOException {
+ return pi.init();
+ }
+
+ public long nextCategory() throws IOException {
+ return decoder.decode();
+ }
+
+ public boolean skipTo(int docId) throws IOException {
+ if (!pi.setdoc(docId)) {
+ return false;
+ }
+
+ // Initializing the decoding mechanism with the new payload data
+ ubais.reInit(pi.getBuffer(), 0, pi.getPayloadLength());
+ decoder.reInit(ubais);
+ return true;
+ }
+
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java b/modules/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java
new file mode 100644
index 00000000000..c9aaf3c332d
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java
@@ -0,0 +1,138 @@
+package org.apache.lucene.facet.search;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A utility class for iterating through a posting list of a given term and
+ * retrieving the payload of the first occurrence in every document. Comes with
+ * its own working space (buffer).
+ *
+ * @lucene.experimental
+ */
+public class PayloadIterator {
+
+ protected byte[] buffer;
+ protected int payloadLength;
+
+ DocsAndPositionsEnum tp;
+
+ private boolean hasMore;
+
+ public PayloadIterator(IndexReader indexReader, Term term)
+ throws IOException {
+ this(indexReader, term, new byte[1024]);
+ }
+
+ public PayloadIterator(IndexReader indexReader, Term term, byte[] buffer)
+ throws IOException {
+ this.buffer = buffer;
+ // TODO (Facet): avoid Multi*?
+ Bits deletedDocs = MultiFields.getDeletedDocs(indexReader);
+ this.tp = MultiFields.getTermPositionsEnum(indexReader, deletedDocs, term.field(), term.bytes());
+ }
+
+ /**
+ * (Re)initialize the iterator. Should be done before the first call to
+ * {@link #setdoc(int)}. Returns false if there is no category list found
+ * (in which case setdoc() will never return true).
+ */
+ public boolean init() throws IOException {
+ hasMore = tp != null && tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS;
+ return hasMore;
+ }
+
+ /**
+ * Skip forward to document docId. Return true if this document exists and
+ * has any payload.
+ */
+
+ /**
+ * Fetch the facet results of a single partition, covering ordinals from
+ * offset up to offset + the length of the count arrays
+ * within arrays (exclusive).
+ *
+ * @throws IOException in case
+ * {@link TaxonomyReader#getOrdinal(org.apache.lucene.facet.taxonomy.CategoryPath)}
+ * does.
+ * @see FacetResultsHandler#fetchPartitionResult(FacetArrays, int)
+ */
+ @Override
+ public IntermediateFacetResult fetchPartitionResult(FacetArrays arrays, int offset) throws IOException {
+
+ // get the root of the result tree to be returned, and the depth of that result tree
+ // (depth means number of node levels excluding the root).
+ int rootNode = this.taxonomyReader.getOrdinal(this.facetRequest.getCategoryPath());
+ if (rootNode == TaxonomyReader.INVALID_ORDINAL) {
+ return null;
+ }
+
+ int K = Math.min(facetRequest.getNumResults(),taxonomyReader.getSize()); // number of best results in each node
+
+ // this will grow into the returned IntermediateFacetResult
+ IntToObjectMap
+
+ /**
+ * Count ordinal, whose depth is currentDepth, and all its descendants
+ * down to maxDepth (inclusive): those descendants whose value in the count
+ * arrays, arrays, is != 0. The count arrays cover only the current
+ * partition, from offset, to (exclusive) endOffset. It is assumed that
+ * ordinal < endOffset; otherwise neither ordinal, nor any of its
+ * descendants, resides in the current partition. ordinal < offset is
+ * allowed, as ordinal's descendants might be >= offset.
+ *
+ * @param ordinal a facet ordinal.
+ * @param youngestChild mapping a given ordinal to its youngest child in the taxonomy (of largest ordinal number),
+ * or to -1 if it has no children.
+ * @param olderSibling mapping a given ordinal to its older sibling, or to -1
+ * @param arrays values for the ordinals in the given partition
+ * @param offset the first (smallest) ordinal in the given partition
+ * @param partitionSize number of ordinals in the given partition
+ * @param endOffset one larger than the largest ordinal that belong to this partition
+ * @param currentDepth the depth of ordinal in the taxonomy tree (relative to the root node of the facetRequest)
+ * @param maxDepth maximal depth of descendants to be considered here (measured relative to rootnode of the
+ * facetRequest).
+ *
+ * @return the number of nodes, from ordinal down through its descendants, of depth <= maxDepth,
+ * which reside in the current partition and whose value != 0
+ */
+ private int countOnly(int ordinal, int[] youngestChild, int[] olderSibling,
+ FacetArrays arrays, int partitionSize, int offset,
+ int endOffset, int currentDepth, int maxDepth) {
+ int ret = 0;
+ if (offset <= ordinal) {
+ // ordinal belongs to the current partition
+ if (0 != facetRequest.getValueOf(arrays, ordinal % partitionSize)) {
+ ret++;
+ }
+ }
+ // now consider children of ordinal, if not too deep
+ if (currentDepth >= maxDepth) {
+ return ret;
+ }
+
+ int yc = youngestChild[ordinal];
+ while (yc >= endOffset) {
+ yc = olderSibling[yc];
+ }
+ while (yc > TaxonomyReader.INVALID_ORDINAL) { // assuming this is -1, smaller than any legal ordinal
+ ret += countOnly (yc, youngestChild, olderSibling, arrays,
+ partitionSize, offset, endOffset, currentDepth+1, maxDepth);
+ yc = olderSibling[yc];
+ }
+ return ret;
+ }
+
+ /**
+ * Merge several partitions' {@link IntermediateFacetResult}-s into one of the
+ * same format
+ *
+ * @see FacetResultsHandler#mergeResults(IntermediateFacetResult...)
+ */
+ @Override
+ public IntermediateFacetResult mergeResults(IntermediateFacetResult... tmpResults)
+ throws ClassCastException, IllegalArgumentException {
+
+ if (tmpResults.length == 0) {
+ return null;
+ }
+
+ int i=0;
+ // skip over null tmpResults
+ for (; (i < tmpResults.length)&&(tmpResults[i] == null); i++) {}
+ if (i == tmpResults.length) {
+ // all inputs are null
+ return null;
+ }
+
+ // i points to the first non-null input
+ int K = this.facetRequest.getNumResults(); // number of best result in each node
+ IntermediateFacetResultWithHash tmpToReturn = (IntermediateFacetResultWithHash)tmpResults[i++];
+
+ // now loop over the rest of tmpResults and merge each into tmpToReturn
+ for ( ; i < tmpResults.length; i++) {
+ IntermediateFacetResultWithHash tfr = (IntermediateFacetResultWithHash)tmpResults[i];
+ tmpToReturn.totalNumOfFacetsConsidered += tfr.totalNumOfFacetsConsidered;
+ if (tfr.isRootNodeIncluded) {
+ tmpToReturn.isRootNodeIncluded = true;
+ tmpToReturn.rootNodeValue = tfr.rootNodeValue;
+ }
+ // now merge the HashMap of tfr into this of tmpToReturn
+ IntToObjectMap
+
+/**
+ * The total facet counts are maintained as an array of arrays of integers,
+ * where a separate array is kept for each partition.
+ *
+ * @lucene.experimental
+ */
+public class TotalFacetCounts {
+
+ /** total facet counts per partition: totalCounts[partition][ordinal%partitionLength] */
+ private int[][] totalCounts = null;
+
+ private final TaxonomyReader taxonomy;
+ private final FacetIndexingParams facetIndexingParams;
+
+ private final static AtomicInteger atomicGen4Test = new AtomicInteger(1);
+ /** Creation type for test purposes */
+ enum CreationType { Computed, Loaded } // for testing
+ final int gen4test;
+ final CreationType createType4test;
+
+ /**
+ * Construct from already computed or loaded counts, for the given taxonomy
+ * and facet indexing params.
+ */
+ private TotalFacetCounts (TaxonomyReader taxonomy, FacetIndexingParams facetIndexingParams,
+ int[][] counts, CreationType createType4Test) throws IOException, LockObtainFailedException {
+ this.taxonomy = taxonomy;
+ this.facetIndexingParams = facetIndexingParams;
+ this.totalCounts = counts;
+ this.createType4test = createType4Test;
+ this.gen4test = atomicGen4Test.incrementAndGet();
+ }
+
+ /**
+ * Fill a partition's array with the TotalCountsArray values.
+ * @param partitionArray array to fill
+ * @param partition number of required partition
+ */
+ public void fillTotalCountsForPartition(int[] partitionArray, int partition) {
+ int partitionSize = partitionArray.length;
+ int[] countArray = totalCounts[partition];
+ if (countArray == null) {
+ countArray = new int[partitionSize];
+ totalCounts[partition] = countArray;
+ }
+ int length = Math.min(partitionSize, countArray.length);
+ System.arraycopy(countArray, 0, partitionArray, 0, length);
+ }
+
+ /**
+ * Return the total count of an input category
+ * @param ordinal ordinal of category whose total count is required
+ */
+ public int getTotalCount(int ordinal) {
+ int partition = PartitionsUtils.partitionNumber(facetIndexingParams,ordinal);
+ int offset = ordinal % PartitionsUtils.partitionSize(facetIndexingParams, taxonomy);
+ return totalCounts[partition][offset];
+ }
+
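A quick worked instance of the arithmetic above, with an illustrative partitionSize of 1000 (assuming partitionNumber is ordinal / partitionSize, consistent with the modulo offset): ordinal 2503 falls in partition 2503 / 1000 = 2, at offset 2503 % 1000 = 503, i.e. it is read from totalCounts[2][503].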
+ static TotalFacetCounts loadFromFile(File inputFile, TaxonomyReader taxonomy,
+ FacetIndexingParams facetIndexingParams) throws IOException {
+ DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(inputFile)));
+ try {
+ int[][] counts = new int[dis.readInt()][];
+ for (int i=0; i
+
+ /**
+ * @see #markRecentlyUsed(TFCKey)
+ */
+ private ConcurrentHashMap
+
+ /** Mark key as recently used. */
+ private void markRecentlyUsed(TFCKey key) {
+ lruKeys.remove(key);
+ lruKeys.add(key);
+ }
+
+ private synchronized void trimCache() {
+ // loop until cache is of desired size.
+ while (cache.size()>maxCacheSize ) {
+ TFCKey key = lruKeys.poll();
+ if (key==null) { //defensive
+ // it is defensive since lruKeys presumably covers the cache keys
+ key = cache.keys().nextElement();
+ }
+ // remove this element. Note that an attempt to remove with the same key again is a no-op,
+ // which gracefully handles the possible race in markRecentlyUsed().
+ cache.remove(key);
+ }
+ }
+
+ /**
+ * compute TFC and cache it, after verifying it was not just added - for this
+ * matter this method is synchronized, which is not too bad, because there is
+ * lots of work done in the computations.
+ */
+ private synchronized TotalFacetCounts computeAndCache(TFCKey key, CategoryListCache clCache) throws IOException {
+ TotalFacetCounts tfc = cache.get(key);
+ if (tfc == null) {
+ tfc = TotalFacetCounts.compute(key.indexReader, key.taxonomy, key.facetIndexingParams, clCache);
+ lruKeys.add(key);
+ cache.put(key,tfc);
+ trimCache();
+ }
+ return tfc;
+ }
+
+ /**
+ * Load {@link TotalFacetCounts} matching input parameters from the provided outputFile
+ * and add them into the cache for the provided indexReader, taxonomy, and facetIndexingParams.
+ * If a {@link TotalFacetCounts} for these parameters already exists in the cache, it will be
+ * replaced by the loaded one.
+ * @param inputFile file from which to read the data
+ * @param indexReader the documents index
+ * @param taxonomy the taxonomy index
+ * @param facetIndexingParams the facet indexing parameters
+ * @throws IOException on error
+ * @see #store(File, IndexReader, TaxonomyReader, FacetIndexingParams, CategoryListCache)
+ */
+ public synchronized void load(File inputFile, IndexReader indexReader, TaxonomyReader taxonomy,
+ FacetIndexingParams facetIndexingParams) throws IOException {
+ if (!inputFile.isFile() || !inputFile.exists() || !inputFile.canRead()) {
+ throw new IllegalArgumentException("Exepecting an existing readable file: "+inputFile);
+ }
+ TFCKey key = new TFCKey(indexReader, taxonomy, facetIndexingParams);
+ TotalFacetCounts tfc = TotalFacetCounts.loadFromFile(inputFile, taxonomy, facetIndexingParams);
+ cache.put(key,tfc);
+ trimCache();
+ markRecentlyUsed(key);
+ }
+
+ /**
+ * Store the {@link TotalFacetCounts} matching input parameters into the provided outputFile,
+ * making them available for a later call to {@link #load(File, IndexReader, TaxonomyReader, FacetIndexingParams)}.
+ * If these {@link TotalFacetCounts} are available in the cache, they are used. But if they are
+ * not in the cache, this call will first compute them (which will also add them to the cache).
+ * @param outputFile file to store in.
+ * @param indexReader the documents index
+ * @param taxonomy the taxonomy index
+ * @param facetIndexingParams the facet indexing parameters
+ * @param clCache category list cache for faster computation, can be null
+ * @throws IOException on error
+ * @see #load(File, IndexReader, TaxonomyReader, FacetIndexingParams)
+ * @see #getTotalCounts(IndexReader, TaxonomyReader, FacetIndexingParams, CategoryListCache)
+ */
+ public void store(File outputFile, IndexReader indexReader, TaxonomyReader taxonomy,
+ FacetIndexingParams facetIndexingParams, CategoryListCache clCache) throws IOException {
+ File parentFile = outputFile.getParentFile();
+ if (
+ ( outputFile.exists() && (!outputFile.isFile() || !outputFile.canWrite())) ||
+ (!outputFile.exists() && (!parentFile.isDirectory() || !parentFile.canWrite()))
+ ) {
+ throw new IllegalArgumentException("Exepecting a writable file: "+outputFile);
+ }
+ TotalFacetCounts tfc = getTotalCounts(indexReader, taxonomy, facetIndexingParams, clCache);
+ TotalFacetCounts.storeToFile(outputFile, tfc);
+ }
+
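A store/load round trip might look like the sketch below; the singleton accessor (getSingleton() here) is an assumption, since that part of the class is elided from this excerpt.

    class TfcPersistence {
      static void persistAndReload(java.io.File countsFile,
          org.apache.lucene.index.IndexReader reader,
          org.apache.lucene.facet.taxonomy.TaxonomyReader taxo,
          org.apache.lucene.facet.index.params.FacetIndexingParams iParams)
          throws java.io.IOException {
        TotalFacetCountsCache cache = TotalFacetCountsCache.getSingleton(); // assumed accessor
        cache.store(countsFile, reader, taxo, iParams, null); // computes and caches if absent
        cache.load(countsFile, reader, taxo, iParams);        // e.g. after a restart
      }
    }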
+ private static class TFCKey {
+ final IndexReader indexReader;
+ final TaxonomyReader taxonomy;
+ private final Iterable
+ *
+ * Aggregating Facets during Faceted Search
+
+ A facet aggregator is the facet-world parallel of Lucene's Collector:
+ while a Collector collects matching documents,
+ an aggregator aggregates the facets of a matching document.
+
+
\ No newline at end of file
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListCache.java b/modules/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListCache.java
new file mode 100644
index 00000000000..2acc218307f
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListCache.java
@@ -0,0 +1,61 @@
+package org.apache.lucene.facet.search.cache;
+
+import java.io.IOException;
+import java.util.HashMap;
+
+import org.apache.lucene.index.IndexReader;
+
+import org.apache.lucene.facet.index.params.CategoryListParams;
+import org.apache.lucene.facet.index.params.FacetIndexingParams;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Cache for {@link CategoryListData}, per {@link CategoryListParams}.
+ *
+ * @lucene.experimental
+ */
+public class CategoryListCache {
+
+ private HashMap
+
+ Faceted Search API
+
+ The faceted search API offers several interfaces: simple, top-level ones
+ adequate for most users, and advanced, more complicated ones for the more
+ advanced users.
+
+
+
+
+
+
\ No newline at end of file
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java b/modules/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java
new file mode 100644
index 00000000000..099ed021562
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java
@@ -0,0 +1,75 @@
+package org.apache.lucene.facet.search.params;
+
+import org.apache.lucene.index.IndexReader;
+
+import org.apache.lucene.facet.search.FacetArrays;
+import org.apache.lucene.facet.search.aggregator.Aggregator;
+import org.apache.lucene.facet.search.aggregator.ComplementCountingAggregator;
+import org.apache.lucene.facet.search.aggregator.CountingAggregator;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Facet request for counting facets.
+ *
+ * @lucene.experimental
+ */
+public class CountFacetRequest extends FacetRequest {
+
+ /**
+ * Create a count facet request for a given node in the taxonomy.
+ *
+ * @param path category path of the category of interest.
+ * @param num number of child categories for which count info is requested.
+ * The default implementation will find the top categories;
+ * this behavior can be overridden by overriding
+ * {@link #createFacetResultsHandler(TaxonomyReader)}.
+ */
+ public CountFacetRequest(CategoryPath path, int num) {
+ super(path, num);
+ }
+
+ @Override
+ public Aggregator createAggregator(boolean useComplements,
+ FacetArrays arrays, IndexReader reader,
+ TaxonomyReader taxonomy) {
+ // we rely on the arrays being cleared by FacetArrays, if needed!
+ int[] a = arrays.getIntArray();
+ if (useComplements) {
+ return new ComplementCountingAggregator(a);
+ }
+ return new CountingAggregator(a);
+ }
+
+ @Override
+ public double getValueOf(FacetArrays arrays, int ordinal) {
+ return arrays.getIntArray()[ordinal];
+ }
+
+ @Override
+ public boolean supportsComplements() {
+ return true;
+ }
+
+ @Override
+ public boolean requireDocumentScore() {
+ return false;
+ }
+}
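For instance, requesting counts for the top 10 children of an assumed "Author" facet (a sketch; the accumulator that consumes these params is elided from this excerpt):

    import org.apache.lucene.facet.search.params.CountFacetRequest;
    import org.apache.lucene.facet.search.params.FacetSearchParams;
    import org.apache.lucene.facet.taxonomy.CategoryPath;

    class CountRequestExample {
      static FacetSearchParams authorCounts() {
        FacetSearchParams fsp = new FacetSearchParams();
        fsp.addFacetRequest(new CountFacetRequest(new CategoryPath("Author"), 10));
        return fsp;
      }
    }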
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java b/modules/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java
new file mode 100644
index 00000000000..7366c5c0a95
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java
@@ -0,0 +1,377 @@
+package org.apache.lucene.facet.search.params;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+
+import org.apache.lucene.facet.index.params.CategoryListParams;
+import org.apache.lucene.facet.search.CategoryListIterator;
+import org.apache.lucene.facet.search.FacetArrays;
+import org.apache.lucene.facet.search.FacetResultsHandler;
+import org.apache.lucene.facet.search.TopKFacetResultsHandler;
+import org.apache.lucene.facet.search.TopKInEachNodeHandler;
+import org.apache.lucene.facet.search.aggregator.Aggregator;
+import org.apache.lucene.facet.search.cache.CategoryListData;
+import org.apache.lucene.facet.search.cache.CategoryListCache;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Request to accumulate facet information for a specified facet and possibly
+ * also some of its descendants, up to a specified depth.
+ *
+ * @lucene.experimental
+ */
+
+ /**
+ * Computed at construction, this hashCode is based on the final members
+ * {@link CategoryPath} and numResults
+ */
+ private final int hashCode;
+
+ private ResultMode resultMode = DEFAULT_RESULT_MODE;
+
+ /**
+ * Initialize the request with a given path, and a requested number of facet
+ * results. By default, all returned results are labeled; to alter this
+ * default see {@link #setNumLabel(int)}.
+ *
+ * NOTE: if numResults is given as Integer.MAX_VALUE
+ * then all the facet results are returned, without any limit.
+ *
+ * TODO (Facet): add AUTO_EXPAND option
+ */
+ public final int getDepth() {
+ return depth;
+ }
+
+ /**
+ * Return the value of the category at the given index in the result
+ * arrays. E.g., for ordinal number n, with a partition of size
+ * partitionSize now covering n, getValueOf would be invoked with idx
+ * being n % partitionSize.
+ */
+ public abstract double getValueOf(FacetArrays arrays, int idx);
+
+ /**
+ * Indicates whether this facet request is eligible for applying the complements optimization.
+ */
+ public boolean supportsComplements() {
+ return false; // by default: no
+ }
+
+ /** Indicates whether the results of this request depend on each result document's score */
+ public abstract boolean requireDocumentScore();
+
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/params/FacetSearchParams.java b/modules/facet/src/java/org/apache/lucene/facet/search/params/FacetSearchParams.java
new file mode 100644
index 00000000000..99a6bd4dc28
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/params/FacetSearchParams.java
@@ -0,0 +1,130 @@
+package org.apache.lucene.facet.search.params;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams;
+import org.apache.lucene.facet.index.params.FacetIndexingParams;
+import org.apache.lucene.facet.search.cache.CategoryListCache;
+import org.apache.lucene.facet.search.results.FacetResult;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Faceted search parameters indicate the facets for which info should be gathered.
+ *
+ * NOTE: The order of addition implies the order of the {@link FacetResult}s
+ * @param facetRequest facet request to be added.
+ */
+ public void addFacetRequest(FacetRequest facetRequest) {
+ if (facetRequest == null) {
+ throw new IllegalArgumentException("Provided facetRequest must not be null");
+ }
+ facetRequests.add(facetRequest);
+ }
+
+ @Override
+ public String toString() {
+ final char TAB = '\t';
+ final char NEWLINE = '\n';
+
+ StringBuilder sb = new StringBuilder("IndexingParams: ");
+ sb.append(NEWLINE).append(TAB).append(getFacetIndexingParams());
+
+ sb.append(NEWLINE).append("FacetRequests:");
+ for (FacetRequest facetRequest : getFacetRequests()) {
+ sb.append(NEWLINE).append(TAB).append(facetRequest);
+ }
+
+ return sb.toString();
+ }
+
+ /**
+ * @return the cldCache in effect
+ */
+ public CategoryListCache getClCache() {
+ return clCache;
+ }
+
+ /**
+ * Set Cached Category Lists data to be used in Faceted search.
+ * @param clCache the cldCache to set
+ */
+ public void setClCache(CategoryListCache clCache) {
+ this.clCache = clCache;
+ }
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/params/ScoreFacetRequest.java b/modules/facet/src/java/org/apache/lucene/facet/search/params/ScoreFacetRequest.java
new file mode 100644
index 00000000000..dcb723a1a4a
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/params/ScoreFacetRequest.java
@@ -0,0 +1,63 @@
+package org.apache.lucene.facet.search.params;
+
+import org.apache.lucene.index.IndexReader;
+
+import org.apache.lucene.facet.search.FacetArrays;
+import org.apache.lucene.facet.search.aggregator.Aggregator;
+import org.apache.lucene.facet.search.aggregator.ScoringAggregator;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Facet request for weighting facets according to document scores.
+ *
+ * @lucene.experimental
+ */
+public class ScoreFacetRequest extends FacetRequest {
+
+ /** Create a score facet request for a given node in the taxonomy. */
+ public ScoreFacetRequest(CategoryPath path, int num) {
+ super(path, num);
+ }
+
+ @Override
+ public Aggregator createAggregator(boolean useComplements,
+ FacetArrays arrays, IndexReader reader,
+ TaxonomyReader taxonomy) {
+ assert !useComplements : "complements are not supported by this FacetRequest";
+ return new ScoringAggregator(arrays.getFloatArray());
+ }
+
+ @Override
+ public double getValueOf(FacetArrays arrays, int ordinal) {
+ return arrays.getFloatArray()[ordinal];
+ }
+
+ @Override
+ public boolean supportsComplements() {
+ return false;
+ }
+
+ @Override
+ public boolean requireDocumentScore() {
+ return true;
+ }
+
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/params/association/AssociationFloatSumFacetRequest.java b/modules/facet/src/java/org/apache/lucene/facet/search/params/association/AssociationFloatSumFacetRequest.java
new file mode 100644
index 00000000000..ce7a1c93bab
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/params/association/AssociationFloatSumFacetRequest.java
@@ -0,0 +1,70 @@
+package org.apache.lucene.facet.search.params.association;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+
+import org.apache.lucene.facet.search.FacetArrays;
+import org.apache.lucene.facet.search.aggregator.Aggregator;
+import org.apache.lucene.facet.search.aggregator.association.AssociationFloatSumAggregator;
+import org.apache.lucene.facet.search.params.FacetRequest;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Facet request for weighting facets according to their float association by
+ * summing the association values.
+ *
+ * @lucene.experimental
+ */
+public class AssociationFloatSumFacetRequest extends FacetRequest {
+
+ /**
+ * Create a float association facet request for a given node in the
+ * taxonomy.
+ */
+ public AssociationFloatSumFacetRequest(CategoryPath path, int num) {
+ super(path, num);
+ }
+
+ @Override
+ public Aggregator createAggregator(boolean useComplements,
+ FacetArrays arrays, IndexReader reader,
+ TaxonomyReader taxonomy) throws IOException {
+ assert !useComplements : "complements are not supported by this FacetRequest";
+ return new AssociationFloatSumAggregator(reader, arrays.getFloatArray());
+ }
+
+ @Override
+ public double getValueOf(FacetArrays arrays, int ordinal) {
+ return arrays.getFloatArray()[ordinal];
+ }
+
+ @Override
+ public boolean supportsComplements() {
+ return false;
+ }
+
+ @Override
+ public boolean requireDocumentScore() {
+ return false;
+ }
+
+}
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/params/association/AssociationIntSumFacetRequest.java b/modules/facet/src/java/org/apache/lucene/facet/search/params/association/AssociationIntSumFacetRequest.java
new file mode 100644
index 00000000000..32ee7881e3d
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/params/association/AssociationIntSumFacetRequest.java
@@ -0,0 +1,70 @@
+package org.apache.lucene.facet.search.params.association;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+
+import org.apache.lucene.facet.search.FacetArrays;
+import org.apache.lucene.facet.search.aggregator.Aggregator;
+import org.apache.lucene.facet.search.aggregator.association.AssociationIntSumAggregator;
+import org.apache.lucene.facet.search.params.FacetRequest;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Facet request for weighting facets according to their integer association by
+ * summing the association values.
+ *
+ * @lucene.experimental
+ */
+public class AssociationIntSumFacetRequest extends FacetRequest {
+
+ /**
+ * Create an integer association facet request for a given node in the
+ * taxonomy.
+ */
+ public AssociationIntSumFacetRequest(CategoryPath path, int num) {
+ super(path, num);
+ }
+
+ @Override
+ public Aggregator createAggregator(boolean useComplements,
+ FacetArrays arrays, IndexReader reader,
+ TaxonomyReader taxonomy) throws IOException {
+ assert !useComplements : "complements are not supported by this FacetRequest";
+ return new AssociationIntSumAggregator(reader, arrays.getIntArray());
+ }
+
+ @Override
+ public double getValueOf(FacetArrays arrays, int ordinal) {
+ return arrays.getIntArray()[ordinal];
+ }
+
+ @Override
+ public boolean supportsComplements() {
+ return false;
+ }
+
+ @Override
+ public boolean requireDocumentScore() {
+ return false;
+ }
+
+}
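Usage mirrors the other requests; for example, summing integer associations (e.g., editorially assigned weights, an invented use case) under an assumed "Tags" facet:

    import org.apache.lucene.facet.search.params.FacetSearchParams;
    import org.apache.lucene.facet.search.params.association.AssociationIntSumFacetRequest;
    import org.apache.lucene.facet.taxonomy.CategoryPath;

    class AssociationRequestExample {
      static FacetSearchParams tagWeights() {
        FacetSearchParams fsp = new FacetSearchParams();
        fsp.addFacetRequest(new AssociationIntSumFacetRequest(new CategoryPath("Tags"), 10));
        // float-valued associations work the same way via AssociationFloatSumFacetRequest
        return fsp;
      }
    }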
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/params/package.html b/modules/facet/src/java/org/apache/lucene/facet/search/params/package.html
new file mode 100644
index 00000000000..7957d9b9ecf
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/params/package.html
@@ -0,0 +1,8 @@
+
+
+ Parameters for Faceted Search
+
+
\ No newline at end of file
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/results/FacetResult.java b/modules/facet/src/java/org/apache/lucene/facet/search/results/FacetResult.java
new file mode 100644
index 00000000000..af0d32cd2a8
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/results/FacetResult.java
@@ -0,0 +1,103 @@
+package org.apache.lucene.facet.search.results;
+
+import org.apache.lucene.facet.search.params.FacetRequest;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Result of faceted search.
+ *
+ * @lucene.experimental
+ */
+public class FacetResult {
+
+ private final FacetRequest facetRequest;
+ private final FacetResultNode rootNode;
+ private final int numValidDescendants;
+
+ public FacetResult(FacetRequest facetRequest, FacetResultNode rootNode, int numValidDescendants) {
+ this.facetRequest = facetRequest;
+ this.rootNode = rootNode;
+ this.numValidDescendants = numValidDescendants;
+ }
+
+ /**
+ * Facet result node matching the root of the {@link #getFacetRequest() facet request}.
+ * @see #getFacetRequest()
+ * @see FacetRequest#getCategoryPath()
+ */
+ public final FacetResultNode getFacetResultNode() {
+ return this.rootNode;
+ }
+
+ /**
+ * Number of descendants of {@link #getFacetResultNode() root facet result node},
+ * up till the requested depth, which are valid by the
+ * {@link FacetRequest#createFacetResultsHandler(org.apache.lucene.facet.taxonomy.TaxonomyReader)
+ * results handler in effect}; typically, those with value != 0.
+ * This number does not include the root node.
+ * @see #getFacetRequest()
+ * @see FacetRequest#getDepth()
+ */
+ public final int getNumValidDescendants() {
+ return this.numValidDescendants;
+ }
+
+ /**
+ * Request for which this result was obtained.
+ */
+ public final FacetRequest getFacetRequest() {
+ return this.facetRequest;
+ }
+
+ /**
+ * String representation of this facet result.
+ * Use with caution: might return a very long string.
+ * @param prefix prefix for each result line
+ * @see #toString()
+ */
+ public String toString(String prefix) {
+ StringBuilder sb = new StringBuilder();
+ String nl = "";
+
+ // request
+ if (this.facetRequest != null) {
+ sb.append(nl).append(prefix).append("Request: ").append(
+ this.facetRequest.toString());
+ nl = "\n";
+ }
+
+ // total facets
+ sb.append(nl).append(prefix).append("Num valid Descendants (up to specified depth): ").append(
+ this.numValidDescendants);
+ nl = "\n";
+
+ // result node
+ if (this.rootNode != null) {
+ sb.append(nl).append(this.rootNode.toString(prefix + "\t"));
+ }
+
+ return sb.toString();
+ }
+
+ @Override
+ public String toString() {
+ return toString("");
+ }
+
+}
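A rendering sketch, assuming a list of results produced by an accumulator (the accumulator API is elided from this excerpt):

    import java.util.List;

    import org.apache.lucene.facet.search.results.FacetResult;

    class ResultPrinter {
      static void print(List<FacetResult> results) {
        for (FacetResult res : results) {
          // prints the request, the valid-descendant count, and the node tree
          System.out.println(res.toString("  "));
        }
      }
    }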
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/results/FacetResultNode.java b/modules/facet/src/java/org/apache/lucene/facet/search/results/FacetResultNode.java
new file mode 100644
index 00000000000..eff9f2b4a45
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/results/FacetResultNode.java
@@ -0,0 +1,110 @@
+package org.apache.lucene.facet.search.results;
+
+import java.io.IOException;
+
+import org.apache.lucene.facet.search.FacetResultsHandler;
+import org.apache.lucene.facet.search.params.FacetRequest;
+import org.apache.lucene.facet.search.sampling.SampleFixer;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Result of faceted search for a certain taxonomy node.
+ *
+ * @lucene.experimental
+ */
+public interface FacetResultNode {
+
+ /**
+ * String representation of this facet result node.
+ * Use with caution: might return a very long string.
+ * @param prefix prefix for each result line
+ */
+ public String toString(String prefix);
+
+ /**
+ * Ordinal of the category of this result.
+ */
+ public int getOrdinal();
+
+ /**
+ * Category path of the category of this result, or null if not computed,
+ * because the application did not request to compute it.
+ * To force computing the label in case not yet computed use
+ * {@link #getLabel(TaxonomyReader)}.
+ * @see FacetRequest#getNumLabel()
+ * @see #getLabel(TaxonomyReader)
+ */
+ public CategoryPath getLabel();
+
+ /**
+ * Category path of the category of this result.
+ * If not already computed, will be computed now.
+ */
+
+ Results of Faceted Search
+
+
+
\ No newline at end of file
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/sampling/SampleFixer.java b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/SampleFixer.java
new file mode 100644
index 00000000000..dc6a3a2eff1
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/SampleFixer.java
@@ -0,0 +1,44 @@
+package org.apache.lucene.facet.search.sampling;
+
+import java.io.IOException;
+
+import org.apache.lucene.facet.search.ScoredDocIDs;
+import org.apache.lucene.facet.search.results.FacetResult;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Fixer of sample facet accumulation results
+ *
+ * @lucene.experimental
+ */
+public interface SampleFixer {
+
+ /**
+ * Alter the input result, fixing it to account for the sampling. An
+ * implementation can compute accurate or estimated counts for the sampled facets.
+ * For example, a faster correction could just multiply by a compensating factor.
+ *
+ * @param origDocIds
+ * full set of matching documents.
+ * @param fres
+ * sample result to be fixed.
+ * @throws IOException
+ */
+ public void fixResult(ScoredDocIDs origDocIds, FacetResult fres) throws IOException;
+}
\ No newline at end of file
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/sampling/Sampler.java b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/Sampler.java
new file mode 100644
index 00000000000..debebeafd58
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/Sampler.java
@@ -0,0 +1,238 @@
+package org.apache.lucene.facet.search.sampling;
+
+import java.io.IOException;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.lucene.index.IndexReader;
+
+import org.apache.lucene.facet.search.FacetArrays;
+import org.apache.lucene.facet.search.ScoredDocIDs;
+import org.apache.lucene.facet.search.aggregator.Aggregator;
+import org.apache.lucene.facet.search.params.FacetRequest;
+import org.apache.lucene.facet.search.params.FacetSearchParams;
+import org.apache.lucene.facet.search.results.FacetResult;
+import org.apache.lucene.facet.search.results.FacetResultNode;
+import org.apache.lucene.facet.search.results.MutableFacetResultNode;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.facet.util.RandomSample;
+import org.apache.lucene.facet.util.ScoredDocIdsUtils;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Sampling definition for facets accumulation
+ */
+
+ /**
+ * Get a fixer of sample facet accumulation results. The default
+ * implementation returns a {@code TakmiSampleFixer}, which is adequate only for
+ * counting. For any other accumulator, provide a different fixer.
+ */
+ public SampleFixer getSampleFixer(
+ IndexReader indexReader, TaxonomyReader taxonomyReader,
+ FacetSearchParams searchParams) {
+ return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams);
+ }
+
+ /**
+ * Result of sample computation
+ */
+ public final static class SampleResult {
+ public final ScoredDocIDs docids;
+ public final double actualSampleRatio;
+ protected SampleResult(ScoredDocIDs docids, double actualSampleRatio) {
+ this.docids = docids;
+ this.actualSampleRatio = actualSampleRatio;
+ }
+ }
+
+ /**
+ * Return the sampling params in effect
+ */
+ public final SamplingParams getSamplingParams() {
+ return samplingParams;
+ }
+
+ /**
+ * Trim the input facet result.
+ * Note: It is only valid to call this method with a result obtained for a
+ * facet request created through {@link #overSampledSearchParams(FacetSearchParams)}.
+ *
+ * @throws IllegalArgumentException
+ * if called with results not obtained for requests created
+ * through {@link #overSampledSearchParams(FacetSearchParams)}
+ */
+ public FacetResult trimResult(FacetResult facetResult) throws IllegalArgumentException {
+ double overSampleFactor = getSamplingParams().getOversampleFactor();
+ if (overSampleFactor <= 1) { // no factoring done?
+ return facetResult;
+ }
+
+ OverSampledFacetRequest sampledFreq = null;
+
+ try {
+ sampledFreq = (OverSampledFacetRequest)facetResult.getFacetRequest();
+ } catch (ClassCastException e) {
+ throw new IllegalArgumentException(
+ "It is only valid to call this method with result obtained for a" +
+ "facet request created through sampler.overSamlpingSearchParams()",
+ e);
+ }
+
+ FacetRequest origFrq = sampledFreq.orig;
+
+ MutableFacetResultNode trimmedRootNode = MutableFacetResultNode.toImpl(facetResult.getFacetResultNode());
+ trimmedRootNode.trimSubResults(origFrq.getNumResults());
+
+ return new FacetResult(origFrq, trimmedRootNode, facetResult.getNumValidDescendants());
+ }
+
+ /**
+ * Over-sampled search params, wrapping each request with an over-sampled one.
+ */
+ public FacetSearchParams overSampledSearchParams(FacetSearchParams original) {
+ FacetSearchParams res = original;
+ // Sampling is in effect: adjust the search params to compensate for the statistical error introduced by sampling
+ double overSampleFactor = getSamplingParams().getOversampleFactor();
+ if (overSampleFactor > 1) { // any factoring to do?
+ res = new FacetSearchParams(original.getFacetIndexingParams());
+ for (FacetRequest frq: original.getFacetRequests()) {
+ int overSampledNumResults = (int) Math.ceil(frq.getNumResults() * overSampleFactor);
+ res.addFacetRequest(new OverSampledFacetRequest(frq, overSampledNumResults));
+ }
+ }
+ return res;
+ }
+
+ /**
+ * Wrapping a facet request for over sampling.
+ * Implementation detail: even if the original request is a count request, no
+ * statistics will be computed for it as the wrapping is not a count request.
+ * This is ok, as the sampling accumulator is later computing the statistics
+ * over the original requests.
+ */
+ private static class OverSampledFacetRequest extends FacetRequest {
+ final FacetRequest orig;
+ public OverSampledFacetRequest(FacetRequest orig, int num) {
+ super(orig.getCategoryPath(), num);
+ this.orig = orig;
+ }
+
+ @Override
+ public Aggregator createAggregator(boolean useComplements,
+ FacetArrays arrays, IndexReader indexReader,
+ TaxonomyReader taxonomy) throws IOException {
+ return orig.createAggregator(useComplements, arrays, indexReader,
+ taxonomy);
+ }
+
+ @Override
+ public double getValueOf(FacetArrays arrays, int idx) {
+ return orig.getValueOf(arrays, idx);
+ }
+
+ @Override
+ public boolean requireDocumentScore() {
+ return orig.requireDocumentScore();
+ }
+ }
+}
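A typical round trip through overSampledSearchParams() and trimResult(), sketched under the assumption that sampler, searchParams, and an already accumulated sampledResult are in scope:

    // Wrap each request with an oversampled one (numResults * oversampleFactor).
    FacetSearchParams overSampled = sampler.overSampledSearchParams(searchParams);
    // ... run facets accumulation using 'overSampled' ...
    // Trim each oversampled result back to the originally requested size.
    FacetResult trimmed = sampler.trimResult(sampledResult);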
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/sampling/SamplingAccumulator.java b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/SamplingAccumulator.java
new file mode 100644
index 00000000000..fa48c684479
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/SamplingAccumulator.java
@@ -0,0 +1,143 @@
+package org.apache.lucene.facet.search.sampling;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.index.IndexReader;
+
+import org.apache.lucene.facet.search.FacetResultsHandler;
+import org.apache.lucene.facet.search.FacetsAccumulator;
+import org.apache.lucene.facet.search.FloatArrayAllocator;
+import org.apache.lucene.facet.search.IntArrayAllocator;
+import org.apache.lucene.facet.search.SamplingWrapper;
+import org.apache.lucene.facet.search.ScoredDocIDs;
+import org.apache.lucene.facet.search.StandardFacetsAccumulator;
+import org.apache.lucene.facet.search.params.FacetSearchParams;
+import org.apache.lucene.facet.search.results.FacetResult;
+import org.apache.lucene.facet.search.results.FacetResultNode;
+import org.apache.lucene.facet.search.sampling.Sampler.SampleResult;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Facets accumulation with sampling.
+ */
+
+ /**
+ * Check validity of sampling settings, making sure that
+ * {@code minSampleSize <= maxSampleSize <= samplingThreshold} and that
+ * {@code 0 < samplingRatio <= 1}.
+ *
+ * @return true if valid, false otherwise
+ */
+ public boolean validate() {
+ return
+ samplingThreshold >= maxSampleSize &&
+ maxSampleSize >= minSampleSize &&
+ sampleRatio > 0 &&
+ sampleRatio < 1;
+ }
+
+ /**
+ * Return the oversampleFactor. When sampling, we would collect that many more
+ * results, so that later, when selecting top out of these, chances are higher
+ * to get actual best results. Note that having this value larger than 1 only
+ * makes sense when using a SampleFixer which finds accurate results, such as
+ * {@code TakmiSampleFixer}. When this value is smaller than 1, it is
+ * ignored and no oversampling takes place.
+ */
+ public final double getOversampleFactor() {
+ return oversampleFactor;
+ }
+
+ /**
+ * @param oversampleFactor the oversampleFactor to set
+ * @see #getOversampleFactor()
+ */
+ public void setOversampleFactor(double oversampleFactor) {
+ this.oversampleFactor = oversampleFactor;
+ }
+
+}
\ No newline at end of file
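For example, a configuration satisfying validate()'s invariants might look as follows; the setters other than setOversampleFactor() are assumed to exist for the fields checked above:

    SamplingParams params = new SamplingParams();
    params.setMinSampleSize(100);        // smallest sample to bother with
    params.setMaxSampleSize(10000);      // cap on the sample size
    params.setSamplingThreshold(20000);  // only sample above this many matches
    params.setSampleRatio(0.1);          // sample roughly 10% of matching docs
    params.setOversampleFactor(2.0);     // collect 2x results, trim later
    assert params.validate();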
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/sampling/TakmiSampleFixer.java b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/TakmiSampleFixer.java
new file mode 100644
index 00000000000..300721a870d
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/TakmiSampleFixer.java
@@ -0,0 +1,180 @@
+package org.apache.lucene.facet.search.sampling;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.Bits;
+
+import org.apache.lucene.facet.search.DrillDown;
+import org.apache.lucene.facet.search.ScoredDocIDs;
+import org.apache.lucene.facet.search.ScoredDocIDsIterator;
+import org.apache.lucene.facet.search.params.FacetSearchParams;
+import org.apache.lucene.facet.search.results.FacetResult;
+import org.apache.lucene.facet.search.results.FacetResultNode;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Fix sampling results by counting the intersection between two lists: a
+ * TermDocs (list of documents in a certain category) and a DocIdSetIterator
+ * (list of documents matching the query).
+ *
+ *
+ * @lucene.experimental
+ */
+// TODO (Facet): implement also an estimated fixing by ratio (taking into
+// account "translation" of counts!)
+class TakmiSampleFixer implements SampleFixer {
+
+ private TaxonomyReader taxonomyReader;
+ private IndexReader indexReader;
+ private FacetSearchParams searchParams;
+
+ public TakmiSampleFixer(IndexReader indexReader,
+ TaxonomyReader taxonomyReader, FacetSearchParams searchParams) {
+ this.indexReader = indexReader;
+ this.taxonomyReader = taxonomyReader;
+ this.searchParams = searchParams;
+ }
+
+ public void fixResult(ScoredDocIDs origDocIds, FacetResult fres)
+ throws IOException {
+ FacetResultNode topRes = fres.getFacetResultNode();
+ fixResultNode(topRes, origDocIds);
+ }
+
+ /**
+ * Fix result node count, and, recursively, fix all its children
+ *
+ * @param facetResNode
+ * result node to be fixed
+ * @param docIds
+ * docids in effect
+ * @throws IOException
+ */
+ private void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds)
+ throws IOException {
+ recount(facetResNode, docIds);
+ for (FacetResultNode frn : facetResNode.getSubResults()) {
+ fixResultNode(frn, docIds);
+ }
+ }
+
+ /**
+ * Internal utility: recount for a facet result node
+ *
+ * @param fresNode
+ * result node to be recounted
+ * @param docIds
+ * full set of matching documents.
+ * @throws IOException
+ */
+ private void recount(FacetResultNode fresNode, ScoredDocIDs docIds)
+ throws IOException {
+ // TODO (Facet): change from void to return the new, smaller docSet, and use
+ // that for the children, as this will make their intersection ops faster.
+ // can do this only when the new set is "sufficiently" smaller.
+
+ /* We need the category's path name in order to do its recounting.
+ * If it is missing, because the option to label only part of the
+ * facet results was exercised, we need to compute it anyway, so
+ * in essence sampling with recounting spends some extra cycles on
+ * labeling results for which labels are not required. */
+ CategoryPath catPath = fresNode.getLabel(taxonomyReader); // force labeling
+
+ Term drillDownTerm = DrillDown.term(searchParams, catPath);
+ // TODO (Facet): avoid Multi*?
+ Bits deletedDocs = MultiFields.getDeletedDocs(indexReader);
+ int updatedCount = countIntersection(MultiFields.getTermDocsEnum(indexReader, deletedDocs, drillDownTerm.field(), drillDownTerm.bytes()),
+ docIds.iterator());
+
+ fresNode.setValue(updatedCount);
+ }
+
+ /**
+ * Count the size of the intersection between two lists: a TermDocs (list of
+ * documents in a certain category) and a DocIdSetIterator (list of documents
+ * matching a query).
+ */
+ private static int countIntersection(DocsEnum p1, ScoredDocIDsIterator p2)
+ throws IOException {
+ // The documentation of both TermDocs and DocIdSetIterator claims
+ // that we must call next() before doc(). So we do, and if one of the
+ // lists is empty, we obviously return 0.
+ if (p1 == null || p1.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
+ return 0;
+ }
+ if (!p2.next()) {
+ return 0;
+ }
+
+ int d1 = p1.docID();
+ int d2 = p2.getDocID();
+
+ int count = 0;
+ for (;;) {
+ if (d1 == d2) {
+ ++count;
+ if (p1.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
+ break; // end of list 1, nothing more in intersection
+ }
+ d1 = p1.docID();
+ if (!advance(p2, d1)) {
+ break; // end of list 2, nothing more in intersection
+ }
+ d2 = p2.getDocID();
+ } else if (d1 < d2) {
+ if (p1.advance(d2) == DocIdSetIterator.NO_MORE_DOCS) {
+ break; // end of list 1, nothing more in intersection
+ }
+ d1 = p1.docID();
+ } else /* d1>d2 */ {
+ if (!advance(p2, d1)) {
+ break; // end of list 2, nothing more in intersection
+ }
+ d2 = p2.getDocID();
+ }
+ }
+ return count;
+ }
+
+ /**
+ * Utility: advance the iterator until it finds (or exceeds) a specific
+ * document.
+ *
+ * @param iterator
+ * iterator being advanced
+ * @param targetDoc
+ * target of advancing
+ * @return false if iterator exhausted, true otherwise.
+ */
+ private static boolean advance(ScoredDocIDsIterator iterator, int targetDoc) {
+ while (iterator.next()) {
+ if (iterator.getDocID() >= targetDoc) {
+ return true; // target reached
+ }
+ }
+ return false; // exhausted
+ }
+}
\ No newline at end of file
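countIntersection() above is the classic two-pointer merge over two sorted doc-ID lists. Stripped of the Lucene iterator APIs, the same scheme over plain sorted arrays reads:

    // Reference sketch: count elements common to two sorted int arrays.
    static int countIntersection(int[] a, int[] b) {
      int i = 0, j = 0, count = 0;
      while (i < a.length && j < b.length) {
        if (a[i] == b[j]) {
          count++;   // common element found
          i++;
          j++;
        } else if (a[i] < b[j]) {
          i++;       // advance whichever list is behind
        } else {
          j++;
        }
      }
      return count;
    }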
diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/sampling/package.html b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/package.html
new file mode 100644
index 00000000000..9ea9f97e1b4
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/package.html
@@ -0,0 +1,8 @@
+<html>
+<head>
+<title>Sampling for facets accumulation</title>
+</head>
+<body>
+ Sampling for facets accumulation
+</body>
+</html>
\ No newline at end of file
diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java
new file mode 100644
index 00000000000..389cd1f6e30
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java
@@ -0,0 +1,1053 @@
+package org.apache.lucene.facet.taxonomy;
+
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.Serializable;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A CategoryPath holds a sequence of string components, specifying the
+ * hierarchical name of a category.
+ */
+
+ /**
+ * Append a subpath of this path, with components separated by the given
+ * delimiter character, to the given Appendable. {@code start} specifies the
+ * first component in the subpath, and {@code end} is one past the last
+ * component. If {@code start} is negative, 0 is assumed, and if {@code end}
+ * is negative or past the end of the path, the path is taken until the end.
+ * Otherwise, if {@code end <= start}, nothing is appended. Nothing is
+ * appended also in the case that the path is empty.
+ */
+ public void appendTo(Appendable out, char delimiter, int start, int end)
+ throws IOException {
+ if (start < 0) {
+ start = 0;
+ }
+ if (end < 0 || end > ncomponents) {
+ end = ncomponents;
+ }
+ if (end <= start) {
+ return; // just append nothing...
+ }
+ for (int i = (start == 0 ? 0 : ends[start - 1]); i < ends[start]; i++) {
+ out.append(chars[i]);
+ }
+ for (int j = start + 1; j < end; j++) {
+ out.append(delimiter);
+ for (int i = ends[j - 1]; i < ends[j]; i++) {
+ out.append(chars[i]);
+ }
+ }
+ }
+
+ /**
+ * Build a string representation of the path, with its components separated
+ * by the given delimiter character. The resulting string is returned as a
+ * new String object. To avoid this temporary object creation, consider
+ * using {@link #appendTo(Appendable, char)} instead.
+ * <p>
+ * {@code start} specifies the first component in the subpath, and
+ * {@code end} is one past the last component. If {@code start} is
+ * negative, 0 is assumed, and if {@code end} is negative or past the
+ * end of the path, the path is taken until the end. Otherwise, if
+ * {@code end <= start}, an empty string is returned. An empty string is
+ * returned also in the case that the path is empty.
+ */
+ public String toString(char delimiter, int start, int end) {
+ if (start < 0) {
+ start = 0;
+ }
+ if (end < 0 || end > ncomponents) {
+ end = ncomponents;
+ }
+ if (end <= start) {
+ return "";
+ }
+ int startchar = (start == 0) ? 0 : ends[start - 1];
+ StringBuilder sb = new StringBuilder(ends[end - 1] - startchar
+ + (end - start) - 1);
+ try {
+ this.appendTo(sb, delimiter, start, end);
+ } catch (IOException e) {
+ // can't happen, because sb.append() never actually throws an
+ // exception
+ }
+ return sb.toString();
+ }
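For example, assuming the varargs CategoryPath constructor used elsewhere in this module:

    CategoryPath p = new CategoryPath("books", "authors", "Mark Twain");
    p.toString('/');       // "books/authors/Mark Twain"
    p.toString('/', 1, 3); // "authors/Mark Twain"
    p.toString('/', 2, 1); // "" (end <= start)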
+
+ /**
+ * Return the i'th component of the path, in a new String object. If there
+ * is no i'th component, a null is returned.
+ */
+ public String getComponent(int i) {
+ if (i < 0 || i >= ncomponents) {
+ return null;
+ }
+ if (i == 0) {
+ return new String(chars, 0, ends[0]);
+ }
+ return new String(chars, ends[i - 1], ends[i] - ends[i - 1]);
+ }
+
+ /**
+ * Return the last component of the path, in a new String object. If the
+ * path is empty, a null is returned.
+ */
+ public String lastComponent() {
+ if (ncomponents == 0) {
+ return null;
+ }
+ if (ncomponents == 1) {
+ return new String(chars, 0, ends[0]);
+ }
+ return new String(chars, ends[ncomponents - 2], ends[ncomponents - 1]
+ - ends[ncomponents - 2]);
+ }
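Continuing the same hypothetical example:

    CategoryPath p = new CategoryPath("books", "authors", "Mark Twain");
    p.getComponent(0); // "books"
    p.getComponent(9); // null (no such component)
    p.lastComponent(); // "Mark Twain"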
+
+ /**
+ * Copies the specified number of components from this category path to the
+ * specified character array, with the components separated by a given
+ * delimiter character. The array must be large enough to hold the
+ * components and separators - the amount of needed space can be calculated
+ * with {@link #charsNeededForFullPath()}.
+ *
+ *
+ * Notes about concurrent access to the taxonomy:
+ */
+
+ /**
+ * getPath() puts the path of the category with the given ordinal in the
+ * given CategoryPath object. If a category with the given ordinal does not
+ * exist, the method returns {@code false}. Otherwise, the method returns
+ * {@code true}.
+ */
+ public boolean getPath(int ordinal, CategoryPath result) throws IOException;
+
+ /**
+ * refresh() re-reads the taxonomy information if there were any changes to
+ * the taxonomy since this instance was opened or last refreshed. Calling
+ * refresh() is more efficient than close()ing the old instance and opening a
+ * new one.
+ */
+
+ /**
+ * getYoungestChildArray() returns an int array of size getSize()
+ * listing the ordinal of the youngest (highest numbered) child
+ * category of each category in the taxonomy. The value for a leaf
+ * category (a category without children) is
+ * {@code INVALID_ORDINAL}.
+ */
+ public int[] getYoungestChildArray();
+ /**
+ * getOlderSiblingArray() returns an int array of size getSize()
+ * listing for each category the ordinal of its immediate older
+ * sibling (the sibling in the taxonomy tree with the highest ordinal
+ * below that of the given ordinal). The value for a category with no
+ * older sibling is {@code INVALID_ORDINAL}.
+ */
+ public int[] getOlderSiblingArray();
+ }
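The two arrays are intended to be used together. A sketch of enumerating the children of a given parent ordinal, youngest first (taxonomyReader and parentOrdinal are assumed to be in scope):

    ChildrenArrays ca = taxonomyReader.getChildrenArrays();
    int[] youngestChild = ca.getYoungestChildArray();
    int[] olderSibling = ca.getOlderSiblingArray();
    // Follow the youngest child, then its older siblings, until exhausted.
    for (int child = youngestChild[parentOrdinal];
         child != TaxonomyReader.INVALID_ORDINAL;
         child = olderSibling[child]) {
      // visit 'child' here
    }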
+
+ /**
+ * getChildrenArrays() returns a {@link ChildrenArrays} object which can
+ * be used together to efficiently enumerate the children of any category.
+ */
+
+ /**
+ * @param openMode
+ * Specifies how to open a taxonomy for writing: {@code APPEND}
+ * means open an existing index for append (failing if the index does
+ * not yet exist). {@code CREATE} means create a new index (first
+ * deleting the old one if it already existed).
+ * {@code APPEND_OR_CREATE} appends to an existing index if there
+ * is one, otherwise it creates a new index.
+ * @param cache
+ * A {@link TaxonomyWriterCache} implementation which determines
+ * the in-memory caching policy. See for example
+ * {@link LruTaxonomyWriterCache} and {@link Cl2oTaxonomyWriterCache}.
+ * If null or missing, {@link #defaultTaxonomyWriterCache()} is used.
+ * @throws CorruptIndexException
+ * if the taxonomy is corrupted.
+ * @throws LockObtainFailedException
+ * if the taxonomy is locked by another writer. If it is known
+ * that no other concurrent writer is active, the lock might
+ * have been left around by an old dead process, and should be
+ * removed using {@link #unlock(Directory)}.
+ * @throws IOException
+ * if another error occurred.
+ */
+ public LuceneTaxonomyWriter(Directory directory, OpenMode openMode,
+ TaxonomyWriterCache cache)
+ throws CorruptIndexException, LockObtainFailedException,
+ IOException {
+
+ openLuceneIndex(directory, openMode);
+ reader = null;
+
+ parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream);
+ parentStreamField.setOmitNorms(true);
+ fullPathField = new Field(Consts.FULL, "", Store.YES, Index.NOT_ANALYZED_NO_NORMS);
+ fullPathField.setOmitTermFreqAndPositions(true);
+
+ this.nextID = indexWriter.maxDoc();
+
+ if (cache==null) {
+ cache = defaultTaxonomyWriterCache();
+ }
+ this.cache = cache;
+
+ if (nextID == 0) {
+ cacheIsComplete = true;
+ // Make sure that the taxonomy always contains the root category
+ // with category id 0.
+ addCategory(new CategoryPath());
+ refreshReader();
+ } else {
+ // There are some categories on the disk, which we have not yet
+ // read into the cache, and therefore the cache is incomplete.
+ // We chose not to read all the categories into the cache now,
+ // to avoid terrible performance when a taxonomy index is opened
+ // to add just a single category. We will do it later, after we
+ // notice a few cache misses.
+ cacheIsComplete = false;
+ }
+ cacheMisses = 0;
+ }
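A minimal usage sketch for this constructor path (the directory location is illustrative only):

    Directory taxoDir = FSDirectory.open(new File("/tmp/taxo")); // example path
    LuceneTaxonomyWriter taxoWriter =
        new LuceneTaxonomyWriter(taxoDir, OpenMode.CREATE);
    int ordinal = taxoWriter.addCategory(
        new CategoryPath("books", "authors", "Mark Twain"));
    taxoWriter.close();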
+
+ /**
+ * A hook for extensions of this class to provide their own
+ * {@link IndexWriter} implementation or instance. Extending classes can
+ * instantiate and configure the {@link IndexWriter} as they see fit,
+ * including setting a {@link org.apache.lucene.index.MergeScheduler}, or
+ * {@link org.apache.lucene.index.IndexDeletionPolicy}, different RAM size
+ * etc.
+ * NOTE: the instance this method returns will be closed upon calling
+ * {@link #close()}. If you wish to do something different, you should
+ * override {@link #closeLuceneIndex()}.
+ *
+ * @param directory the {@link Directory} on top of which an
+ * {@link IndexWriter} should be opened.
+ * @param openMode see {@link OpenMode}
+ */
+ protected void openLuceneIndex (Directory directory, OpenMode openMode)
+ throws CorruptIndexException, LockObtainFailedException, IOException {
+ // Make sure we use a MergePolicy which merges segments in-order and thus
+ // keeps the doc IDs ordered as well (this is crucial for the taxonomy
+ // index).
+ IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_30,
+ new KeywordAnalyzer()).setOpenMode(openMode).setMergePolicy(
+ new LogByteSizeMergePolicy());
+ indexWriter = new IndexWriter(directory, config);
+ }
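As the hook's javadoc suggests, an extending class may substitute its own configuration. A hypothetical sketch that keeps the in-order merge policy but enlarges the RAM buffer:

    class CustomTaxonomyWriter extends LuceneTaxonomyWriter {
      CustomTaxonomyWriter(Directory dir, OpenMode mode)
          throws CorruptIndexException, LockObtainFailedException, IOException {
        super(dir, mode);
      }

      @Override
      protected void openLuceneIndex(Directory directory, OpenMode openMode)
          throws CorruptIndexException, LockObtainFailedException, IOException {
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_30,
            new KeywordAnalyzer()).setOpenMode(openMode)
            .setMergePolicy(new LogByteSizeMergePolicy()) // keep doc IDs ordered
            .setRAMBufferSizeMB(64.0);                    // custom RAM size
        indexWriter = new IndexWriter(directory, config);
      }
    }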
+
+ // Currently overridden by a unit test that verifies that every index we open
+ // is close()ed.
+ /**
+ * Open an {@link IndexReader} from the {@link #indexWriter} member, by
+ * calling {@link IndexWriter#getReader()}. Extending classes can override
+ * this method to return their own {@link IndexReader}.
+ */
+ protected IndexReader openReader() throws IOException {
+ return IndexReader.open(indexWriter, true);
+ }
+
+ /**
+ * Creates a new instance with a default cache as defined by
+ * {@link #defaultTaxonomyWriterCache()}.
+ */
+ public LuceneTaxonomyWriter(Directory directory, OpenMode openMode)
+ throws CorruptIndexException, LockObtainFailedException, IOException {
+ this(directory, openMode, defaultTaxonomyWriterCache());
+ }
+
+ /**
+ * Defines the default {@link TaxonomyWriterCache} to use in constructors
+ * which do not specify one.
+ */
+
+ /**
+ * NOTE: if you override this method, you should include a
+ * {@code super.closeResources()} call in your implementation.
+ */
+ protected synchronized void closeResources() throws IOException {
+ if (reader != null) {
+ reader.close();
+ reader = null;
+ }
+ if (cache != null) {
+ cache.close();
+ cache = null;
+ }
+ }
+
+ /**
+ * A hook for extending classes to control closing the {@link IndexWriter}
+ * returned by {@link #openLuceneIndex}.
+ */
+ protected void closeLuceneIndex() throws CorruptIndexException, IOException {
+ if (indexWriter != null) {
+ indexWriter.close();
+ indexWriter = null;
+ }
+ }
+
+ /**
+ * Look up the given category in the cache and/or the on-disk storage,
+ * returning the category's ordinal, or a negative number in case the
+ * category does not yet exist in the taxonomy.
+ */
+ protected int findCategory(CategoryPath categoryPath) throws IOException {
+ // If we can find the category in our cache, we can return the
+ // response directly from it:
+ int res = cache.get(categoryPath);
+ if (res >= 0) {
+ return res;
+ }
+ // If we know that the cache is complete, i.e., contains every category
+ // which exists, we can return -1 immediately. However, if the cache is
+ // not complete, we need to check the disk.
+ if (cacheIsComplete) {
+ return -1;
+ }
+ cacheMisses++;
+ // After a few cache misses, it makes sense to read all the categories
+ // from disk and into the cache. The reason not to do this on the first
+ // cache miss (or even when opening the writer) is that it will
+ // significantly slow down the case when a taxonomy is opened just to
+ // add one category. The idea of only spending a long time on reading
+ // after enough time was spent on cache misses is known as an "online
+ // algorithm".
+ if (perhapsFillCache()) {
+ return cache.get(categoryPath);
+ }
+
+ // We need to get an answer from the on-disk index. If a reader
+ // is not yet open, do it now:
+ if (reader == null) {
+ reader = openReader();
+ }
+
+ // TODO (Facet): avoid Multi*?
+ Bits deletedDocs = MultiFields.getDeletedDocs(reader);
+ DocsEnum docs = MultiFields.getTermDocsEnum(reader, deletedDocs, Consts.FULL,
+ new BytesRef(categoryPath.toString(delimiter)));
+ if (docs == null || docs.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
+ return -1; // category does not exist in taxonomy
+ }
+ // Note: we do NOT add to the cache the fact that the category
+ // does not exist. The reason is that our only use for this
+ // method is just before we actually add this category. If
+ // in the future this usage changes, we should consider caching
+ // the fact that the category is not in the taxonomy.
+ addToCache(categoryPath, docs.docID());
+ return docs.docID();
+ }
+
+ /**
+ * Look up the given prefix of the given category in the cache and/or the
+ * on-disk storage, returning that prefix's ordinal, or a negative number in
+ * case the category does not yet exist in the taxonomy.
+ */
+ private int findCategory(CategoryPath categoryPath, int prefixLen)
+ throws IOException {
+ int res = cache.get(categoryPath, prefixLen);
+ if (res >= 0) {
+ return res;
+ }
+ if (cacheIsComplete) {
+ return -1;
+ }
+ cacheMisses++;
+ if (perhapsFillCache()) {
+ return cache.get(categoryPath, prefixLen);
+ }
+ if (reader == null) {
+ reader = openReader();
+ }
+ Bits deletedDocs = MultiFields.getDeletedDocs(reader);
+ DocsEnum docs = MultiFields.getTermDocsEnum(reader, deletedDocs, Consts.FULL,
+ new BytesRef(categoryPath.toString(delimiter, prefixLen)));
+ if (docs == null || docs.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
+ return -1; // category does not exist in taxonomy
+ }
+ addToCache(categoryPath, prefixLen, docs.docID());
+ return docs.docID();
+ }
+
+ // TODO (Facet): addCategory() is synchronized. This means that if indexing is
+ // multi-threaded, a new category that needs to be written to disk (and
+ // potentially even trigger a lengthy merge) locks out other addCategory()
+ // calls - even those which could immediately return a cached value.
+ // We definitely need to fix this situation!
+ public synchronized int addCategory(CategoryPath categoryPath)
+ throws IOException {
+ // If the category is already in the cache and/or the taxonomy, we
+ // should return its existing ordinal:
+ int res = findCategory(categoryPath);
+ if (res < 0) {
+ // This is a new category, and we need to insert it into the index
+ // (and the cache). Actually, we might also need to add some of
+ // the category's ancestors before we can add the category itself
+ // (while keeping the invariant that a parent is always added to
+ // the taxonomy before its child). internalAddCategory() does all
+ // this recursively:
+ res = internalAddCategory(categoryPath, categoryPath.length());
+ }
+ return res;
+
+ }
+
+ /**
+ * Add a new category into the index (and the cache), and return its new
+ * ordinal.
+ */
+
+ /**
+ * When set to {@code 0}, the entire taxonomy is read
+ * into the cache on first use, without fetching individual categories
+ * first.
+ */
+<html>
+<body>
+ Taxonomy implemented using a Lucene-Index
+</body>
+</html>
\ No newline at end of file
diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/package.html b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/package.html
new file mode 100644
index 00000000000..ab92496bd8c
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/package.html
@@ -0,0 +1,32 @@
+<html>
+<head>
+<title>Taxonomy of Categories</title>
+</head>
+<body>
+ Facets are defined using a hierarchy of categories, known as a
+ <i>Taxonomy</i>.
+ <p>
+ For example, in a book store application, a Taxonomy could have the
+ following hierarchy:
+
+
+
+
+
+
+
+ The Taxonomy translates category-paths into category-ordinal and vice versa.
+</body>
+</html>
\ No newline at end of file
diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/TaxonomyWriterCache.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/TaxonomyWriterCache.java
new file mode 100644
index 00000000000..8d80ce1f91b
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/TaxonomyWriterCache.java
@@ -0,0 +1,115 @@
+package org.apache.lucene.facet.taxonomy.writercache;
+
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * TaxonomyWriterCache is a relatively simple interface for a cache of
+ * category->ordinal mappings, used in TaxonomyWriter implementations
+ * (such as {@link LuceneTaxonomyWriter}).
+ */
+
+ /**
+ * Check whether the cache has room for the given number of additional
+ * entries. If it does, this method should return
+ * {@code true}. Otherwise, it should return
+ * {@code false}.
+ * If this method returned {@code true}, the following n put()
+ * calls should return false (meaning that the cache was not cleared).
+ */
+ public boolean hasRoom(int numberOfEntries);
+
+}
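A sketch of how a writer might consult hasRoom() before a bulk insert; put(CategoryPath, int) is assumed to be part of this interface, as the javadoc above implies:

    // 'cache' and 'mappings' (CategoryPath -> ordinal) are assumed in scope.
    if (cache.hasRoom(mappings.size())) {
      for (Map.Entry<CategoryPath, Integer> e : mappings.entrySet()) {
        cache.put(e.getKey(), e.getValue());
      }
    }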
diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CharBlockArray.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CharBlockArray.java
new file mode 100644
index 00000000000..13e0112c3de
--- /dev/null
+++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CharBlockArray.java
@@ -0,0 +1,195 @@
+package org.apache.lucene.facet.taxonomy.writercache.cl2o;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.OutputStream;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Similar to {@link StringBuilder}, but with a more efficient growing strategy:
+ * this class grows by allocating additional char-array blocks instead of
+ * reallocating and copying its whole buffer.
+ *
+ * @lucene.experimental
+ */
+class CharBlockArray implements Appendable, Serializable, CharSequence {
+
+ private static final long serialVersionUID = 1L;
+
+ private final static int DefaultBlockSize = 32 * 1024; // 32 KB default size
+
+ final static class Block implements Serializable, Cloneable {
+ private static final long serialVersionUID = 1L;
+
+ char[] chars;
+ int length;
+
+ Block(int size) {
+ this.chars = new char[size];
+ this.length = 0;
+ }
+ }
+
+ List