Add JavaDocs for Codecs, PostingsFormat and related services/modules
This commit is contained in:
parent
c09ee82ef5
commit
840eaf983d
|
@ -34,6 +34,38 @@ import org.elasticsearch.index.codec.postingsformat.PreBuiltPostingsFormatProvid
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The {@link CodecModule} creates and loads the {@link CodecService} and
|
||||
* {@link PostingsFormatService} allowing low level data-structure
|
||||
* specialization on a Lucene Segment basis.
|
||||
* <p>
|
||||
* The codec module is the authoritative source for built-in and custom
|
||||
* {@link PostingsFormatProvider}. During module bootstrap it processes the
|
||||
* index settings underneath the
|
||||
* {@value PostingsFormatProvider#POSTINGS_FORMAT_SETTINGS_PREFIX} and
|
||||
* instantiates the corresponding {@link PostingsFormatProvider} instances. To
|
||||
* configure a custom provider implementation the class should reside in the
|
||||
* <tt>org.elasticsearch.index.codec.postingsformat</tt> package and the
|
||||
* classname should be suffixed with <tt>PostingsFormatProvider</tt>. <br>
|
||||
* For example to expose the Elastic-Fantastic format provider one needs to
|
||||
* provide the following configuration settings and classes:
|
||||
* <ol>
|
||||
* <li>create a {@link PostingsFormatProvider} subclass in the package
|
||||
* <tt>org.elasticsearch.index.codec.postingsformat</tt></li>
|
||||
*
|
||||
* <li>name the subclass <tt>ElasticFantasticPostingsFormatProvider</tt></li>
|
||||
*
|
||||
* <li>configure the custom format in your index settings under
|
||||
* <tt>index.codec.postings_format.elastic_fantastic.type : "ElasticFantastic"</tt>
|
||||
* </li>
|
||||
*
|
||||
* <li>provide any postings format settings for this custom format under the
|
||||
* same key, i.e.
|
||||
* <tt>index.codec.postings_format.elastic_fantastic.performance : "crazy_fast"</tt>
|
||||
* </li>
|
||||
* </ol>
|
||||
*
|
||||
* @see CodecService
|
||||
*
|
||||
*/
|
||||
public class CodecModule extends AbstractModule {
|
||||
|
||||
|
@ -55,7 +87,7 @@ public class CodecModule extends AbstractModule {
|
|||
|
||||
Map<String, Class<? extends PostingsFormatProvider>> postingFormatProviders = Maps.newHashMap(customProviders);
|
||||
|
||||
Map<String, Settings> postingsFormatsSettings = indexSettings.getGroups("index.codec.postings_format");
|
||||
Map<String, Settings> postingsFormatsSettings = indexSettings.getGroups(PostingsFormatProvider.POSTINGS_FORMAT_SETTINGS_PREFIX);
|
||||
for (Map.Entry<String, Settings> entry : postingsFormatsSettings.entrySet()) {
|
||||
String name = entry.getKey();
|
||||
Settings settings = entry.getValue();
|
||||
|
|
|
@ -33,6 +33,12 @@ import org.elasticsearch.index.mapper.MapperService;
|
|||
import org.elasticsearch.index.settings.IndexSettings;
|
||||
|
||||
/**
|
||||
* Since Lucene 4.0 low level index segments are read and written through a
|
||||
* codec layer that allows the use of use-case specific file formats and
|
||||
* data-structures per field. ElasticSearch exposes the full
|
||||
* {@link Codec} capabilities through this {@link CodecService}.
|
||||
*
|
||||
* @see PostingsFormatService
|
||||
*/
|
||||
public class CodecService extends AbstractIndexComponent {
|
||||
|
||||
|
|
|
@ -25,7 +25,12 @@ import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
|
|||
import org.elasticsearch.index.mapper.MapperService;
|
||||
|
||||
/**
|
||||
* This one is the "default" codec we use.
|
||||
* {@link PerFieldMappingPostingFormatCodec This codec} is the default
|
||||
* codec for Elasticsearch. It utilizes the
|
||||
* {@link MapperService} to lookup a {@link PostingsFormat} per field. This
|
||||
* allows users to change the low level postings format for individual fields
|
||||
* per index in real time via the mapping API. If no specific postings format is
|
||||
* configured for a specific field the default postings format is used.
|
||||
*/
|
||||
// LUCENE UPGRADE: make sure to move to a new codec depending on the lucene version
|
||||
public class PerFieldMappingPostingFormatCodec extends Lucene40Codec {
|
||||
|
|
|
@ -19,7 +19,10 @@
|
|||
|
||||
package org.elasticsearch.index.codec.postingsformat;
|
||||
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
|
||||
/**
|
||||
* Simple abstract {@link PostingsFormatProvider} requiring a name for the provider.
|
||||
*/
|
||||
public abstract class AbstractPostingsFormatProvider implements PostingsFormatProvider {
|
||||
|
||||
|
|
|
@ -28,6 +28,18 @@ import org.elasticsearch.common.settings.Settings;
|
|||
|
||||
/**
|
||||
* The default postings format, maps to {@link Lucene40PostingsFormat}.
|
||||
* <ul>
|
||||
* <li><tt>min_block_size</tt>: the minimum block size the default Lucene term
|
||||
* dictionary uses to encode on-disk blocks.</li>
|
||||
*
|
||||
* <li><tt>max_block_size</tt>: the maximum block size the default Lucene term
|
||||
* dictionary uses to encode on-disk blocks.</li>
|
||||
*
|
||||
* <li><tt>freq_cut_off</tt>: the document frequency cut off where pulsing
|
||||
* in-lines posting lists into the term dictionary. Terms with a document
|
||||
* frequency less than or equal to the cutoff will be in-lined. The default is
|
||||
* <tt>1</tt></li>
|
||||
* </ul>
|
||||
*/
|
||||
// LUCENE UPGRADE: Upgrade Lucene40PostingsFormat to next version
|
||||
public class DefaultPostingsFormatProvider extends AbstractPostingsFormatProvider {
|
||||
|
|
|
@ -26,6 +26,22 @@ import org.elasticsearch.common.inject.assistedinject.Assisted;
|
|||
import org.elasticsearch.common.settings.Settings;
|
||||
|
||||
/**
|
||||
* A {@link PostingsFormatProvider} for {@link DirectPostingsFormat}. This
|
||||
* postings format uses an on-disk storage for its terms and posting lists and
|
||||
* streams its data during segment merges but loads its entire postings, terms
|
||||
* and positions into memory for faster search performance. This format has a
|
||||
* significant memory footprint and should be used with care. <br> This postings
|
||||
* format offers the following parameters:
|
||||
* <ul>
|
||||
* <li><tt>min_skip_count</tt>: the minimum number of terms with a shared prefix to
|
||||
* allow a skip pointer to be written. The default is <tt>8</tt></li>
|
||||
*
|
||||
* <li><tt>low_freq_cutoff</tt>: terms with a lower document frequency use a
|
||||
* single array object representation for postings and positions.</li>
|
||||
* </ul>
|
||||
*
|
||||
* @see DirectPostingsFormat
|
||||
*
|
||||
*/
|
||||
public class DirectPostingsFormatProvider extends AbstractPostingsFormatProvider {
|
||||
|
||||
|
|
|
@ -27,6 +27,16 @@ import org.elasticsearch.common.inject.assistedinject.Assisted;
|
|||
import org.elasticsearch.common.settings.Settings;
|
||||
|
||||
/**
|
||||
* A {@link PostingsFormatProvider} for Lucene's {@link MemoryPostingsFormat}.
|
||||
* This postings format offers the following parameters:
|
||||
* <ul>
|
||||
* <li><tt>pack_fst</tt>: <code>true</code> iff the in memory structure should
|
||||
* be packed once it is built. Packing will reduce the size of the data-structure
|
||||
* in memory but requires more memory during building. Default is <code>false</code></li>
|
||||
*
|
||||
* <li><tt>acceptable_overhead_ratio</tt>: the compression overhead used to
|
||||
* compress internal structures. See {@link PackedInts} for details. Default is {@value PackedInts#DEFAULT}</li>
|
||||
* </ul>
|
||||
*/
|
||||
public class MemoryPostingsFormatProvider extends AbstractPostingsFormatProvider {
|
||||
|
||||
|
@ -39,6 +49,7 @@ public class MemoryPostingsFormatProvider extends AbstractPostingsFormatProvider
|
|||
super(name);
|
||||
this.packFst = postingsFormatSettings.getAsBoolean("pack_fst", false);
|
||||
this.acceptableOverheadRatio = postingsFormatSettings.getAsFloat("acceptable_overhead_ratio", PackedInts.DEFAULT);
|
||||
// TODO this should really be an ENUM?
|
||||
this.postingsFormat = new MemoryPostingsFormat(packFst, acceptableOverheadRatio);
|
||||
}
|
||||
|
||||
|
|
|
@ -30,6 +30,34 @@ import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;
|
|||
import org.elasticsearch.common.collect.MapBuilder;
|
||||
|
||||
/**
|
||||
* This class represents the set of Elasticsearch "built-in"
|
||||
* {@link PostingsFormatProvider.Factory postings format factories}
|
||||
* <ul>
|
||||
* <li><b>direct</b>: a postings format that uses disk-based storage but loads
|
||||
* its terms and postings directly into memory. Note this postings format is
|
||||
* very memory intensive and has certain limitations that don't allow segments to
|
||||
* grow beyond 2.1GB see {@link DirectPostingsFormat} for details.</li>
|
||||
*
|
||||
* <li><b>memory</b>: a postings format that stores its entire terms, postings,
|
||||
* positions and payloads in a finite state transducer. This format should only
|
||||
* be used for primary keys or with fields where each term is contained in a
|
||||
* very low number of documents.</li>
|
||||
*
|
||||
* <li><b>pulsing</b>: a postings format that in-lines the posting lists for very low
|
||||
* frequent terms in the term dictionary. This is useful to improve lookup
|
||||
* performance for low-frequent terms.</li>
|
||||
*
|
||||
* <li><b>bloom_default</b>: a postings format that uses a bloom filter to
|
||||
* improve term lookup performance. This is useful for primary keys or fields
|
||||
* that are used as a delete key</li>
|
||||
*
|
||||
* <li><b>bloom_pulsing</b>: a postings format that combines the advantages of
|
||||
* <b>bloom</b> and <b>pulsing</b> to further improve lookup performance</li>
|
||||
*
|
||||
* <li><b>default</b>: the default Elasticsearch postings format offering best
|
||||
* general purpose performance. This format is used if no postings format is
|
||||
* specified in the field mapping.</li>
|
||||
* </ul>
|
||||
*/
|
||||
public class PostingFormats {
|
||||
|
||||
|
|
|
@ -23,22 +23,64 @@ import org.apache.lucene.codecs.PostingsFormat;
|
|||
import org.elasticsearch.ElasticSearchIllegalArgumentException;
|
||||
import org.elasticsearch.common.settings.ImmutableSettings;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.codec.CodecModule;
|
||||
import org.elasticsearch.index.settings.IndexSettings;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* A {@link PostingsFormatProvider} acts as a named container for specific
|
||||
* {@link PostingsFormat} implementations. Custom {@link PostingsFormat}
|
||||
* implementations can be exposed via
|
||||
* {@link CodecModule#addPostingFormat(String, Class)}
|
||||
* <p>
|
||||
* Each {@link PostingsFormatProvider} must provide a unique name for its
|
||||
* postings format in order to map the postings format to a specific field via
|
||||
* the mapping API. The name provided via {@link #name()} is used to lookup the
|
||||
* postings format in {@link PostingsFormatService#get(String)} and should be
|
||||
* identical to the values used in the field mappings.
|
||||
* </p>
|
||||
* <p>
|
||||
* {@link PostingsFormatProvider} instances are initialized with a
|
||||
* {@link Settings} subset below the
|
||||
* {@value PostingsFormatProvider#POSTINGS_FORMAT_SETTINGS_PREFIX} prefix and
|
||||
* will only see the sub-tree below their mapping name. For instance a postings
|
||||
* format <tt>ElasticFantastic</tt> will see settings below
|
||||
* <tt>index.codec.postings_format.elastic_fantastic</tt> given that the
|
||||
* postings format is exposed via
|
||||
* <tt>index.codec.postings_format.elastic_fantastic.type : "ElasticFantastic"</tt>.
|
||||
* </p>
|
||||
*
|
||||
* @see CodecModule
|
||||
*/
|
||||
public interface PostingsFormatProvider {
|
||||
|
||||
public static final String POSTINGS_FORMAT_SETTINGS_PREFIX = "index.codec.postings_format";
|
||||
|
||||
/**
|
||||
* A helper class to lookup {@link PostingsFormatProvider providers} by their unique {@link PostingsFormatProvider#name() name}
|
||||
*/
|
||||
public static class Helper {
|
||||
|
||||
/**
|
||||
* Looks up and creates {@link PostingsFormatProvider} for the given name.
|
||||
* <p>
|
||||
* The settings for the created {@link PostingsFormatProvider} are taken from the given index settings.
|
||||
* All settings with the {@value PostingsFormatProvider#POSTINGS_FORMAT_SETTINGS_PREFIX} prefix
|
||||
* and the format's name as the key are passed to the factory.
|
||||
* </p>
|
||||
*
|
||||
* @param indexSettings the index settings to configure the postings format
|
||||
* @param name the name of the postings format to lookup
|
||||
* @param postingFormatFactories the factory mapping to lookup the {@link Factory} to create the {@link PostingsFormatProvider}
|
||||
* @return a fully configured {@link PostingsFormatProvider} for the given name.
|
||||
* @throws ElasticSearchIllegalArgumentException if no {@link PostingsFormatProvider} for the given name parameter could be found.
|
||||
*/
|
||||
public static PostingsFormatProvider lookup(@IndexSettings Settings indexSettings, String name, Map<String, Factory> postingFormatFactories) throws ElasticSearchIllegalArgumentException {
|
||||
Factory factory = postingFormatFactories.get(name);
|
||||
if (factory == null) {
|
||||
throw new ElasticSearchIllegalArgumentException("failed to find postings_format [" + name + "]");
|
||||
}
|
||||
Settings settings = indexSettings.getGroups("index.codec.postings_format").get(name);
|
||||
Settings settings = indexSettings.getGroups(POSTINGS_FORMAT_SETTINGS_PREFIX).get(name);
|
||||
if (settings == null) {
|
||||
settings = ImmutableSettings.Builder.EMPTY_SETTINGS;
|
||||
}
|
||||
|
@ -46,10 +88,22 @@ public interface PostingsFormatProvider {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns this provider's {@link PostingsFormat} instance.
|
||||
*/
|
||||
PostingsFormat get();
|
||||
|
||||
/**
|
||||
* Returns the name of this provider's {@link PostingsFormat}
|
||||
*/
|
||||
String name();
|
||||
|
||||
/**
|
||||
* A simple factory used to create {@link PostingsFormatProvider} used by
|
||||
* delegating providers like {@link BloomFilterPostingsFormatProvider} or
|
||||
* {@link PulsingPostingsFormatProvider}. Those providers wrap other
|
||||
* postings formats to enrich their capabilities.
|
||||
*/
|
||||
public interface Factory {
|
||||
PostingsFormatProvider create(String name, Settings settings);
|
||||
}
|
||||
|
|
|
@ -27,11 +27,18 @@ import org.elasticsearch.common.settings.ImmutableSettings;
|
|||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.AbstractIndexComponent;
|
||||
import org.elasticsearch.index.Index;
|
||||
import org.elasticsearch.index.codec.CodecService;
|
||||
import org.elasticsearch.index.settings.IndexSettings;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The {@link PostingsFormatService} provides access to
|
||||
* all configured {@link PostingsFormatProvider} instances by
|
||||
* {@link PostingsFormatProvider#name() name}.
|
||||
*
|
||||
* @see CodecService
|
||||
*
|
||||
*/
|
||||
public class PostingsFormatService extends AbstractIndexComponent {
|
||||
|
||||
|
@ -51,7 +58,7 @@ public class PostingsFormatService extends AbstractIndexComponent {
|
|||
|
||||
MapBuilder<String, PostingsFormatProvider> providers = MapBuilder.newMapBuilder();
|
||||
|
||||
Map<String, Settings> postingsFormatSettings = indexSettings.getGroups("index.codec.postings_format");
|
||||
Map<String, Settings> postingsFormatSettings = indexSettings.getGroups(PostingsFormatProvider.POSTINGS_FORMAT_SETTINGS_PREFIX);
|
||||
for (Map.Entry<String, PostingsFormatProvider.Factory> entry : postingFormatFactories.entrySet()) {
|
||||
String name = entry.getKey();
|
||||
PostingsFormatProvider.Factory factory = entry.getValue();
|
||||
|
|
|
@ -27,6 +27,22 @@ import org.elasticsearch.common.inject.assistedinject.Assisted;
|
|||
import org.elasticsearch.common.settings.Settings;
|
||||
|
||||
/**
|
||||
* A {@link PostingsFormatProvider} for Lucene's {@link Pulsing40PostingsFormat}.
|
||||
* The pulsing implementation in-lines the posting lists for very low frequent
|
||||
* terms in the term dictionary. This is useful to improve lookup performance
|
||||
* for low-frequent terms. This postings format offers the following parameters:
|
||||
* <ul>
|
||||
* <li><tt>min_block_size</tt>: the minimum block size the default Lucene term
|
||||
* dictionary uses to encode on-disk blocks.</li>
|
||||
*
|
||||
* <li><tt>max_block_size</tt>: the maximum block size the default Lucene term
|
||||
* dictionary uses to encode on-disk blocks.</li>
|
||||
*
|
||||
* <li><tt>freq_cut_off</tt>: the document frequency cut off where pulsing
|
||||
* in-lines posting lists into the term dictionary. Terms with a document
|
||||
* frequency less than or equal to the cutoff will be in-lined. The default is
|
||||
* <tt>1</tt></li>
|
||||
* </ul>
|
||||
*/
|
||||
// LUCENE UPGRADE: Upgrade Pulsing40PostingsFormat to next version
|
||||
public class PulsingPostingsFormatProvider extends AbstractPostingsFormatProvider {
|
||||
|
|
Loading…
Reference in New Issue