diff --git a/src/main/java/org/elasticsearch/index/codec/CodecModule.java b/src/main/java/org/elasticsearch/index/codec/CodecModule.java index 9c06153498a..83db5846e1e 100644 --- a/src/main/java/org/elasticsearch/index/codec/CodecModule.java +++ b/src/main/java/org/elasticsearch/index/codec/CodecModule.java @@ -34,6 +34,38 @@ import org.elasticsearch.index.codec.postingsformat.PreBuiltPostingsFormatProvid import java.util.Map; /** + * The {@link CodecModule} creates and loads the {@link CodecService} and + * {@link PostingsFormatService} allowing low level data-structure + * specialization on a Lucene Segment basis. + *

+ * The codec module is the authoritative source for built-in and custom + * {@link PostingsFormatProvider}. During module bootstrap it processes the + * index settings underneath the + * {@value PostingsFormatProvider#POSTINGS_FORMAT_SETTINGS_PREFIX} and + * instantiates the corresponding {@link PostingsFormatProvider} instances. To + * configure a custom provider implementation, the class should reside in the + * org.elasticsearch.index.codec.postingsformat package and the + * classname should be suffixed with PostingsFormatProvider.
+ * For example to expose the Elastic-Fantastic format provider one needs to + * provide the following configuration settings and classes: + *

    + *
  1. create a {@link PostingsFormatProvider} subclass in the package + * org.elasticsearch.index.codec.postingsformat
  2. + * + *
  3. name the subclass ElasticFantasticPostingsFormatProvider
  4. + * + *
  5. configure the custom format in your index settings under + * index.codec.postings_format.elastic_fantastic.type : "ElasticFantastic" + *
  6. + * + *
  7. provide any postings format settings for this custom format under the + * same key i.e. + * index.codec.postings_format.elastic_fantastic.performance : "crazy_fast" + *
  8. + *
+ * + * @see CodecService + * */ public class CodecModule extends AbstractModule { @@ -55,7 +87,7 @@ public class CodecModule extends AbstractModule { Map> postingFormatProviders = Maps.newHashMap(customProviders); - Map postingsFormatsSettings = indexSettings.getGroups("index.codec.postings_format"); + Map postingsFormatsSettings = indexSettings.getGroups(PostingsFormatProvider.POSTINGS_FORMAT_SETTINGS_PREFIX); for (Map.Entry entry : postingsFormatsSettings.entrySet()) { String name = entry.getKey(); Settings settings = entry.getValue(); diff --git a/src/main/java/org/elasticsearch/index/codec/CodecService.java b/src/main/java/org/elasticsearch/index/codec/CodecService.java index b08b97d94e2..ce0c0250be2 100644 --- a/src/main/java/org/elasticsearch/index/codec/CodecService.java +++ b/src/main/java/org/elasticsearch/index/codec/CodecService.java @@ -33,6 +33,12 @@ import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.settings.IndexSettings; /** + * Since Lucene 4.0 low level index segments are read and written through a + * codec layer that allows to use use-case specific file formats & + * data-structures per field. ElasticSearch exposes the full + * {@link Codec} capabilities through this {@link CodecService}. + * + * @see PostingsFormatService */ public class CodecService extends AbstractIndexComponent { diff --git a/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java b/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java index db4b3a2c9c2..0e8db9989a7 100644 --- a/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java +++ b/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java @@ -25,7 +25,12 @@ import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider; import org.elasticsearch.index.mapper.MapperService; /** - * This one is the "default" codec we use. 
+ * {@link PerFieldMappingPostingFormatCodec This codec} is the default + * codec for Elasticsearch. It utilizes the + * {@link MapperService} to look up a {@link PostingsFormat} per field. This + * allows users to change the low level postings format for individual fields + * per index in real time via the mapping API. If no specific postings format is + * configured for a specific field the default postings format is used. */ // LUCENE UPGRADE: make sure to move to a new codec depending on the lucene version public class PerFieldMappingPostingFormatCodec extends Lucene40Codec { diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/AbstractPostingsFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/AbstractPostingsFormatProvider.java index 79255114f15..7dd2a12dd79 100644 --- a/src/main/java/org/elasticsearch/index/codec/postingsformat/AbstractPostingsFormatProvider.java +++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/AbstractPostingsFormatProvider.java @@ -19,7 +19,10 @@ package org.elasticsearch.index.codec.postingsformat; +import org.apache.lucene.codecs.PostingsFormat; + /** + * Simple abstract {@link PostingsFormatProvider} that requires a name under which its {@link PostingsFormat} is exposed. */ public abstract class AbstractPostingsFormatProvider implements PostingsFormatProvider { diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/DefaultPostingsFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/DefaultPostingsFormatProvider.java index 5ec926a4c6d..151f7288289 100644 --- a/src/main/java/org/elasticsearch/index/codec/postingsformat/DefaultPostingsFormatProvider.java +++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/DefaultPostingsFormatProvider.java @@ -28,6 +28,18 @@ import org.elasticsearch.common.settings.Settings; /** * The default postingsformat, maps to {@link Lucene40PostingsFormat}. + *
    + *
  • min_block_size: the minimum block size the default Lucene term + * dictionary uses to encode on-disk blocks.
  • + * + *
  • max_block_size: the maximum block size the default Lucene term + * dictionary uses to encode on-disk blocks.
  • + * + *
  • freq_cut_off: the document frequency cut off where pulsing + * in-lines posting lists into the term dictionary. Terms with a document + * frequency less or equal to the cutoff will be in-lined. The default is + * 1
  • + *
*/ // LUCENE UPGRADE: Upgrade Lucene40PostingsFormat to next version public class DefaultPostingsFormatProvider extends AbstractPostingsFormatProvider { diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/DirectPostingsFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/DirectPostingsFormatProvider.java index 5be8445c158..3e5fc22738e 100644 --- a/src/main/java/org/elasticsearch/index/codec/postingsformat/DirectPostingsFormatProvider.java +++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/DirectPostingsFormatProvider.java @@ -26,6 +26,22 @@ import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; /** + * A {@link PostingsFormatProvider} for {@link DirectPostingsFormat}. This + * postings format uses an on-disk storage for its terms and posting lists and + * streams its data during segment merges but loads its entire postings, terms + * and positions into memory for faster search performance. This format has a + * significant memory footprint and should be used with care. This postings + * format offers the following parameters: + *
    + *
  • min_skip_count: the minimum number of terms with a shared prefix to + * allow a skip pointer to be written. The default is 8
  • + * + *
  • low_freq_cutoff: terms with a lower document frequency use a + * single array object representation for postings and positions.
  • + *
+ * + * @see DirectPostingsFormat + * */ public class DirectPostingsFormatProvider extends AbstractPostingsFormatProvider { diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/MemoryPostingsFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/MemoryPostingsFormatProvider.java index ffd3aea3054..cde577b625f 100644 --- a/src/main/java/org/elasticsearch/index/codec/postingsformat/MemoryPostingsFormatProvider.java +++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/MemoryPostingsFormatProvider.java @@ -27,6 +27,16 @@ import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; /** + * A {@link PostingsFormatProvider} for Lucenes {@link MemoryPostingsFormat}. + * This postings format offers the following parameters: + *
    + *
  • pack_fst: true iff the in memory structure should + * be packed once it is built. Packing will reduce the size of the data-structure + * in memory but requires more memory during building. Default is false
  • + * + *
  • acceptable_overhead_ratio: the compression overhead used to + * compress internal structures. See {@link PackedInts} for details. Default is {@value PackedInts#DEFAULT}
  • + *
*/ public class MemoryPostingsFormatProvider extends AbstractPostingsFormatProvider { @@ -39,6 +49,7 @@ public class MemoryPostingsFormatProvider extends AbstractPostingsFormatProvider super(name); this.packFst = postingsFormatSettings.getAsBoolean("pack_fst", false); this.acceptableOverheadRatio = postingsFormatSettings.getAsFloat("acceptable_overhead_ratio", PackedInts.DEFAULT); + // TODO this should really be an ENUM? this.postingsFormat = new MemoryPostingsFormat(packFst, acceptableOverheadRatio); } diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingFormats.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingFormats.java index fa980314826..9efac108026 100644 --- a/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingFormats.java +++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingFormats.java @@ -30,6 +30,34 @@ import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat; import org.elasticsearch.common.collect.MapBuilder; /** + * This class represents the set of Elasticsearch "build-in" + * {@link PostingsFormatProvider.Factory postings format factories} + *
    + *
  • direct: a postings format that uses disk-based storage but loads + * its terms and postings directly into memory. Note this postings format is + * very memory intensive and has certain limitations that don't allow segments to + * grow beyond 2.1GB; see {@link DirectPostingsFormat} for details.
  • + * + *
  • memory: a postings format that stores its entire terms, postings, + * positions and payloads in a finite state transducer. This format should only + * be used for primary keys or with fields where each term is contained in a + * very low number of documents.
  • + * + *
  • pulsing: a postings format that in-lines the posting lists for very low + * frequent terms in the term dictionary. This is useful to improve lookup + * performance for low-frequent terms.
  • + * + *
  • bloom_default: a postings format that uses a bloom filter to + * improve term lookup performance. This is useful for primary keys or fields + * that are used as a delete key
  • + * + *
  • bloom_pulsing: a postings format that combines the advantages of + * bloom and pulsing to further improve lookup performance
  • + * + *
  • default: the default Elasticsearch postings format offering best + * general purpose performance. This format is used if no postings format is + * specified in the field mapping.
  • + *
*/ public class PostingFormats { diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatProvider.java index 0f2314f8959..d46e8976685 100644 --- a/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatProvider.java +++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatProvider.java @@ -23,22 +23,64 @@ import org.apache.lucene.codecs.PostingsFormat; import org.elasticsearch.ElasticSearchIllegalArgumentException; import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.codec.CodecModule; import org.elasticsearch.index.settings.IndexSettings; import java.util.Map; /** + * A {@link PostingsFormatProvider} acts as a named container for specific + * {@link PostingsFormat} implementations. Custom {@link PostingsFormat} + * implementations can be exposed via + * {@link CodecModule#addPostingFormat(String, Class)} + *

+ * Each {@link PostingsFormatProvider} must provide a unique name for its + * postings format in order to map the postings format to a specific field via + * the mapping API. The name provided via {@link #name()} is used to lookup the + * postings format in {@link PostingsFormatService#get(String)} and should be + * identical to the values used in the field mappings. + *

+ *

+ * {@link PostingsFormatProvider} instances are initialized with a + * {@link Settings} subset below the + * {@value PostingsFormatProvider#POSTINGS_FORMAT_SETTINGS_PREFIX} prefix and + * will only see the sub-tree below their mapping name. For instance a postings + * format ElasticFantastic will see settings below + * index.codec.postings_format.elastic_fantastic given that the + * postings format is exposed via + * index.codec.postings_format.elastic_fantastic.type : "ElasticFantastic". + *

+ * + * @see CodecModule */ public interface PostingsFormatProvider { - + public static final String POSTINGS_FORMAT_SETTINGS_PREFIX = "index.codec.postings_format"; + + /** + * A helper class to lookup {@link PostingsFormatProvider providers} by their unique {@link PostingsFormatProvider#name() name} + */ public static class Helper { + /** + * Looks up and creates {@link PostingsFormatProvider} for the given name. + *

+ * The settings for the created {@link PostingsFormatProvider} are taken from the given index settings. + * All settings with the {@value PostingsFormatProvider#POSTINGS_FORMAT_SETTINGS_PREFIX} prefix + * and the format's name as the key are passed to the factory. + *

+ * + * @param indexSettings the index settings to configure the postings format + * @param name the name of the postings format to lookup + * @param postingFormatFactories the factory mapping to lookup the {@link Factory} to create the {@link PostingsFormatProvider} + * @return a fully configured {@link PostingsFormatProvider} for the given name. + * @throws ElasticSearchIllegalArgumentException if the no {@link PostingsFormatProvider} for the given name parameter could be found. + */ public static PostingsFormatProvider lookup(@IndexSettings Settings indexSettings, String name, Map postingFormatFactories) throws ElasticSearchIllegalArgumentException { Factory factory = postingFormatFactories.get(name); if (factory == null) { throw new ElasticSearchIllegalArgumentException("failed to find postings_format [" + name + "]"); } - Settings settings = indexSettings.getGroups("index.codec.postings_format").get(name); + Settings settings = indexSettings.getGroups(POSTINGS_FORMAT_SETTINGS_PREFIX).get(name); if (settings == null) { settings = ImmutableSettings.Builder.EMPTY_SETTINGS; } @@ -46,10 +88,22 @@ public interface PostingsFormatProvider { } } + /** + * Returns this providers {@link PostingsFormat} instance. + */ PostingsFormat get(); + /** + * Returns the name of this providers {@link PostingsFormat} + */ String name(); + /** + * A simple factory used to create {@link PostingsFormatProvider} used by + * delegating providers like {@link BloomFilterPostingsFormatProvider} or + * {@link PulsingPostingsFormatProvider}. Those providers wrap other + * postings formats to enrich their capabilities. 
+ */ public interface Factory { PostingsFormatProvider create(String name, Settings settings); } diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatService.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatService.java index 34d1b9ef807..f4d470998eb 100644 --- a/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatService.java +++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatService.java @@ -27,11 +27,18 @@ import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.AbstractIndexComponent; import org.elasticsearch.index.Index; +import org.elasticsearch.index.codec.CodecService; import org.elasticsearch.index.settings.IndexSettings; import java.util.Map; /** + * The {@link PostingsFormatService} provides access to + * all configured {@link PostingsFormatProvider} instances by + * {@link PostingsFormatProvider#name() name}. 
+ * + * @see CodecService + * */ public class PostingsFormatService extends AbstractIndexComponent { @@ -51,7 +58,7 @@ public class PostingsFormatService extends AbstractIndexComponent { MapBuilder providers = MapBuilder.newMapBuilder(); - Map postingsFormatSettings = indexSettings.getGroups("index.codec.postings_format"); + Map postingsFormatSettings = indexSettings.getGroups(PostingsFormatProvider.POSTINGS_FORMAT_SETTINGS_PREFIX); for (Map.Entry entry : postingFormatFactories.entrySet()) { String name = entry.getKey(); PostingsFormatProvider.Factory factory = entry.getValue(); diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/PulsingPostingsFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/PulsingPostingsFormatProvider.java index bb212703ef3..f3856ae906a 100644 --- a/src/main/java/org/elasticsearch/index/codec/postingsformat/PulsingPostingsFormatProvider.java +++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/PulsingPostingsFormatProvider.java @@ -27,6 +27,22 @@ import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; /** + * A {@link PostingsFormatProvider} for Lucenes {@link Pulsing40PostingsFormat}. + * The pulsing implementation in-lines the posting lists for very low frequent + * terms in the term dictionary. This is useful to improve lookup performance + * for low-frequent terms. This postings format offers the following parameters: + *
    + *
  • min_block_size: the minimum block size the default Lucene term + * dictionary uses to encode on-disk blocks.
  • + * + *
  • max_block_size: the maximum block size the default Lucene term + * dictionary uses to encode on-disk blocks.
  • + * + *
  • freq_cut_off: the document frequency cut off where pulsing + * in-lines posting lists into the term dictionary. Terms with a document + * frequency less or equal to the cutoff will be in-lined. The default is + * 1
  • + *
*/ // LUCENE UPGRADE: Upgrade Pulsing40PostingsFormat to next version public class PulsingPostingsFormatProvider extends AbstractPostingsFormatProvider {