diff --git a/src/main/java/org/elasticsearch/index/codec/CodecModule.java b/src/main/java/org/elasticsearch/index/codec/CodecModule.java
index 9c06153498a..83db5846e1e 100644
--- a/src/main/java/org/elasticsearch/index/codec/CodecModule.java
+++ b/src/main/java/org/elasticsearch/index/codec/CodecModule.java
@@ -34,6 +34,38 @@ import org.elasticsearch.index.codec.postingsformat.PreBuiltPostingsFormatProvid
import java.util.Map;
/**
+ * The {@link CodecModule} creates and loads the {@link CodecService} and
+ * {@link PostingsFormatService} allowing low level data-structure
+ * specialization on a Lucene Segment basis.
+ *
+ * The codec module is the authoritative source for built-in and custom
+ * {@link PostingsFormatProvider}. During module bootstrap it processes the
+ * index settings underneath the
+ * {@value PostingsFormatProvider#POSTINGS_FORMAT_SETTINGS_PREFIX} and
+ * instantiates the corresponding {@link PostingsFormatProvider} instances. To
+ * configure a custom provider implementation, the class should reside in the
+ * org.elasticsearch.index.codec.postingsformat package and the
+ * class name should be suffixed with PostingsFormatProvider.
+ * For example, to expose the Elastic-Fantastic format provider one needs to
+ * provide the following configuration settings and classes:
+ *
+ * - create a {@link PostingsFormatProvider} subclass in the package
+ * org.elasticsearch.index.codec.postingsformat
+ *
+ * - name the subclass ElasticFantasticPostingsFormatProvider
+ *
+ * - configure the custom format in your index settings under
+ * index.codec.postings_format.elastic_fantastic.type : "ElasticFantastic"
+ *
+ *
+ * - provide any postings format settings for this custom format under the
+ * same key, i.e.
+ * index.codec.postings_format.elastic_fantastic.performance : "crazy_fast"
+ *
+ *
+ *
+ * @see CodecService
+ *
*/
public class CodecModule extends AbstractModule {
@@ -55,7 +87,7 @@ public class CodecModule extends AbstractModule {
Map> postingFormatProviders = Maps.newHashMap(customProviders);
- Map postingsFormatsSettings = indexSettings.getGroups("index.codec.postings_format");
+ Map postingsFormatsSettings = indexSettings.getGroups(PostingsFormatProvider.POSTINGS_FORMAT_SETTINGS_PREFIX);
for (Map.Entry entry : postingsFormatsSettings.entrySet()) {
String name = entry.getKey();
Settings settings = entry.getValue();
diff --git a/src/main/java/org/elasticsearch/index/codec/CodecService.java b/src/main/java/org/elasticsearch/index/codec/CodecService.java
index b08b97d94e2..ce0c0250be2 100644
--- a/src/main/java/org/elasticsearch/index/codec/CodecService.java
+++ b/src/main/java/org/elasticsearch/index/codec/CodecService.java
@@ -33,6 +33,12 @@ import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.settings.IndexSettings;
/**
+ * Since Lucene 4.0 low level index segments are read and written through a
+ * codec layer that allows using use-case specific file formats and
+ * data-structures per field. ElasticSearch exposes the full
+ * {@link Codec} capabilities through this {@link CodecService}.
+ *
+ * @see PostingsFormatService
*/
public class CodecService extends AbstractIndexComponent {
diff --git a/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java b/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java
index db4b3a2c9c2..0e8db9989a7 100644
--- a/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java
+++ b/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java
@@ -25,7 +25,12 @@ import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
import org.elasticsearch.index.mapper.MapperService;
/**
- * This one is the "default" codec we use.
+ * {@link PerFieldMappingPostingFormatCodec This codec} is the default
+ * codec for Elasticsearch. It utilizes the
+ * {@link MapperService} to lookup a {@link PostingsFormat} per field. This
+ * allows users to change the low level postings format for individual fields
+ * per index in real time via the mapping API. If no specific postings format is
+ * configured for a specific field the default postings format is used.
*/
// LUCENE UPGRADE: make sure to move to a new codec depending on the lucene version
public class PerFieldMappingPostingFormatCodec extends Lucene40Codec {
diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/AbstractPostingsFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/AbstractPostingsFormatProvider.java
index 79255114f15..7dd2a12dd79 100644
--- a/src/main/java/org/elasticsearch/index/codec/postingsformat/AbstractPostingsFormatProvider.java
+++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/AbstractPostingsFormatProvider.java
@@ -19,7 +19,10 @@
package org.elasticsearch.index.codec.postingsformat;
+import org.apache.lucene.codecs.PostingsFormat;
+
/**
+ * Simple abstract provider of a {@link PostingsFormat} that requires a name for the provider.
*/
public abstract class AbstractPostingsFormatProvider implements PostingsFormatProvider {
diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/DefaultPostingsFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/DefaultPostingsFormatProvider.java
index 5ec926a4c6d..151f7288289 100644
--- a/src/main/java/org/elasticsearch/index/codec/postingsformat/DefaultPostingsFormatProvider.java
+++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/DefaultPostingsFormatProvider.java
@@ -28,6 +28,18 @@ import org.elasticsearch.common.settings.Settings;
/**
* The default postingsformat, maps to {@link Lucene40PostingsFormat}.
+ *
+ * - min_block_size: the minimum block size the default Lucene term
+ * dictionary uses to encode on-disk blocks.
+ *
+ * - max_block_size: the maximum block size the default Lucene term
+ * dictionary uses to encode on-disk blocks.
+ *
+ * - freq_cut_off: the document frequency cut off where pulsing
+ * in-lines posting lists into the term dictionary. Terms with a document
+ * frequency less than or equal to the cutoff will be in-lined. The default is
+ * 1
+ *
*/
// LUCENE UPGRADE: Upgrade Lucene40PostingsFormat to next version
public class DefaultPostingsFormatProvider extends AbstractPostingsFormatProvider {
diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/DirectPostingsFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/DirectPostingsFormatProvider.java
index 5be8445c158..3e5fc22738e 100644
--- a/src/main/java/org/elasticsearch/index/codec/postingsformat/DirectPostingsFormatProvider.java
+++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/DirectPostingsFormatProvider.java
@@ -26,6 +26,22 @@ import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
/**
+ * A {@link PostingsFormatProvider} for {@link DirectPostingsFormat}. This
+ * postings format uses an on-disk storage for its terms and posting lists and
+ * streams its data during segment merges but loads its entire postings, terms
+ * and positions into memory for faster search performance. This format has a
+ * significant memory footprint and should be used with care. This postings
+ * format offers the following parameters:
+ *
+ * - min_skip_count: the minimum number of terms with a shared prefix to
+ * allow a skip pointer to be written. The default is 8
+ *
+ * - low_freq_cutoff: terms with a lower document frequency use a
+ * single array object representation for postings and positions.
+ *
+ *
+ * @see DirectPostingsFormat
+ *
*/
public class DirectPostingsFormatProvider extends AbstractPostingsFormatProvider {
diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/MemoryPostingsFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/MemoryPostingsFormatProvider.java
index ffd3aea3054..cde577b625f 100644
--- a/src/main/java/org/elasticsearch/index/codec/postingsformat/MemoryPostingsFormatProvider.java
+++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/MemoryPostingsFormatProvider.java
@@ -27,6 +27,16 @@ import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
/**
+ * A {@link PostingsFormatProvider} for Lucene's {@link MemoryPostingsFormat}.
+ * This postings format offers the following parameters:
+ *
+ * - pack_fst: true iff the in-memory structure should
+ * be packed once it is built. Packing will reduce the size of the data-structure
+ * in memory but requires more memory during building. Default is false
+ *
+ * - acceptable_overhead_ratio: the compression overhead used to
+ * compress internal structures. See {@link PackedInts} for details. Default is {@value PackedInts#DEFAULT}
+ *
*/
public class MemoryPostingsFormatProvider extends AbstractPostingsFormatProvider {
@@ -39,6 +49,7 @@ public class MemoryPostingsFormatProvider extends AbstractPostingsFormatProvider
super(name);
this.packFst = postingsFormatSettings.getAsBoolean("pack_fst", false);
this.acceptableOverheadRatio = postingsFormatSettings.getAsFloat("acceptable_overhead_ratio", PackedInts.DEFAULT);
+ // TODO this should really be an ENUM?
this.postingsFormat = new MemoryPostingsFormat(packFst, acceptableOverheadRatio);
}
diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingFormats.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingFormats.java
index fa980314826..9efac108026 100644
--- a/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingFormats.java
+++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingFormats.java
@@ -30,6 +30,34 @@ import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;
import org.elasticsearch.common.collect.MapBuilder;
/**
+ * This class represents the set of Elasticsearch "built-in"
+ * {@link PostingsFormatProvider.Factory postings format factories}:
+ *
+ * - direct: a postings format that uses disk-based storage but loads
+ * its terms and postings directly into memory. Note this postings format is
+ * very memory intensive and has certain limitations that don't allow segments to
+ * grow beyond 2.1GB see {@link DirectPostingsFormat} for details.
+ *
+ * - memory: a postings format that stores its entire terms, postings,
+ * positions and payloads in a finite state transducer. This format should only
+ * be used for primary keys or with fields where each term is contained in a
+ * very low number of documents.
+ *
+ * - pulsing: a postings format that in-lines the posting lists for very low
+ * frequent terms in the term dictionary. This is useful to improve lookup
+ * performance for low-frequent terms.
+ *
+ * - bloom_default: a postings format that uses a bloom filter to
+ * improve term lookup performance. This is useful for primary keys or fields
+ * that are used as a delete key.
+ *
+ * - bloom_pulsing: a postings format that combines the advantages of
+ * bloom and pulsing to further improve lookup performance
+ *
+ * - default: the default Elasticsearch postings format offering best
+ * general purpose performance. This format is used if no postings format is
+ * specified in the field mapping.
+ *
*/
public class PostingFormats {
diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatProvider.java
index 0f2314f8959..d46e8976685 100644
--- a/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatProvider.java
+++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatProvider.java
@@ -23,22 +23,64 @@ import org.apache.lucene.codecs.PostingsFormat;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.codec.CodecModule;
import org.elasticsearch.index.settings.IndexSettings;
import java.util.Map;
/**
+ * A {@link PostingsFormatProvider} acts as a named container for specific
+ * {@link PostingsFormat} implementations. Custom {@link PostingsFormat}
+ * implementations can be exposed via
+ * {@link CodecModule#addPostingFormat(String, Class)}
+ *
+ * Each {@link PostingsFormatProvider} must provide a unique name for its
+ * postings format in order to map the postings format to a specific field via
+ * the mapping API. The name provided via {@link #name()} is used to lookup the
+ * postings format in {@link PostingsFormatService#get(String)} and should be
+ * identical to the values used in the field mappings.
+ *
+ *
+ * {@link PostingsFormatProvider} instances are initialized with a
+ * {@link Settings} subset below the
+ * {@value PostingsFormatProvider#POSTINGS_FORMAT_SETTINGS_PREFIX} prefix and
+ * will only see the sub-tree below their mapping name. For instance a postings
+ * format ElasticFantastic will see settings below
+ * index.codec.postings_format.elastic_fantastic given that the
+ * postings format is exposed via
+ * index.codec.postings_format.elastic_fantastic.type : "ElasticFantastic".
+ *
+ *
+ * @see CodecModule
*/
public interface PostingsFormatProvider {
-
+ public static final String POSTINGS_FORMAT_SETTINGS_PREFIX = "index.codec.postings_format";
+
+ /**
+ * A helper class to lookup {@link PostingsFormatProvider providers} by their unique {@link PostingsFormatProvider#name() name}
+ */
public static class Helper {
+ /**
+ * Looks up and creates a {@link PostingsFormatProvider} for the given name.
+ *
+ * The settings for the created {@link PostingsFormatProvider} are taken from the given index settings.
+ * All settings with the {@value PostingsFormatProvider#POSTINGS_FORMAT_SETTINGS_PREFIX} prefix
+ * and the format's name as the key are passed to the factory.
+ *
+ *
+ * @param indexSettings the index settings to configure the postings format
+ * @param name the name of the postings format to lookup
+ * @param postingFormatFactories the factory mapping to lookup the {@link Factory} to create the {@link PostingsFormatProvider}
+ * @return a fully configured {@link PostingsFormatProvider} for the given name.
+ * @throws ElasticSearchIllegalArgumentException if no {@link PostingsFormatProvider} for the given name parameter could be found.
+ */
public static PostingsFormatProvider lookup(@IndexSettings Settings indexSettings, String name, Map postingFormatFactories) throws ElasticSearchIllegalArgumentException {
Factory factory = postingFormatFactories.get(name);
if (factory == null) {
throw new ElasticSearchIllegalArgumentException("failed to find postings_format [" + name + "]");
}
- Settings settings = indexSettings.getGroups("index.codec.postings_format").get(name);
+ Settings settings = indexSettings.getGroups(POSTINGS_FORMAT_SETTINGS_PREFIX).get(name);
if (settings == null) {
settings = ImmutableSettings.Builder.EMPTY_SETTINGS;
}
@@ -46,10 +88,22 @@ public interface PostingsFormatProvider {
}
}
+ /**
+ * Returns this provider's {@link PostingsFormat} instance.
+ */
PostingsFormat get();
+ /**
+ * Returns the name of this provider's {@link PostingsFormat}.
+ */
String name();
+ /**
+ * A simple factory used to create {@link PostingsFormatProvider} instances, used by
+ * delegating providers like {@link BloomFilterPostingsFormatProvider} or
+ * {@link PulsingPostingsFormatProvider}. Those providers wrap other
+ * postings formats to enrich their capabilities.
+ */
public interface Factory {
PostingsFormatProvider create(String name, Settings settings);
}
diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatService.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatService.java
index 34d1b9ef807..f4d470998eb 100644
--- a/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatService.java
+++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatService.java
@@ -27,11 +27,18 @@ import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.AbstractIndexComponent;
import org.elasticsearch.index.Index;
+import org.elasticsearch.index.codec.CodecService;
import org.elasticsearch.index.settings.IndexSettings;
import java.util.Map;
/**
+ * The {@link PostingsFormatService} provides access to
+ * all configured {@link PostingsFormatProvider} instances by
+ * {@link PostingsFormatProvider#name() name}.
+ *
+ * @see CodecService
+ *
*/
public class PostingsFormatService extends AbstractIndexComponent {
@@ -51,7 +58,7 @@ public class PostingsFormatService extends AbstractIndexComponent {
MapBuilder providers = MapBuilder.newMapBuilder();
- Map postingsFormatSettings = indexSettings.getGroups("index.codec.postings_format");
+ Map postingsFormatSettings = indexSettings.getGroups(PostingsFormatProvider.POSTINGS_FORMAT_SETTINGS_PREFIX);
for (Map.Entry entry : postingFormatFactories.entrySet()) {
String name = entry.getKey();
PostingsFormatProvider.Factory factory = entry.getValue();
diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/PulsingPostingsFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/PulsingPostingsFormatProvider.java
index bb212703ef3..f3856ae906a 100644
--- a/src/main/java/org/elasticsearch/index/codec/postingsformat/PulsingPostingsFormatProvider.java
+++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/PulsingPostingsFormatProvider.java
@@ -27,6 +27,22 @@ import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
/**
+ * A {@link PostingsFormatProvider} for Lucene's {@link Pulsing40PostingsFormat}.
+ * The pulsing implementation in-lines the posting lists for very low frequent
+ * terms in the term dictionary. This is useful to improve lookup performance
+ * for low-frequent terms. This postings format offers the following parameters:
+ *
+ * - min_block_size: the minimum block size the default Lucene term
+ * dictionary uses to encode on-disk blocks.
+ *
+ * - max_block_size: the maximum block size the default Lucene term
+ * dictionary uses to encode on-disk blocks.
+ *
+ * - freq_cut_off: the document frequency cut off where pulsing
+ * in-lines posting lists into the term dictionary. Terms with a document
+ * frequency less than or equal to the cutoff will be in-lined. The default is
+ * 1
+ *
*/
// LUCENE UPGRADE: Upgrade Pulsing40PostingsFormat to next version
public class PulsingPostingsFormatProvider extends AbstractPostingsFormatProvider {