Add JavaDocs for Codecs, PostingsFormat and related services/modules

This commit is contained in:
Simon Willnauer 2012-11-16 17:18:58 +01:00
parent c09ee82ef5
commit 840eaf983d
11 changed files with 195 additions and 5 deletions

View File

@ -34,6 +34,38 @@ import org.elasticsearch.index.codec.postingsformat.PreBuiltPostingsFormatProvid
import java.util.Map;
/**
* The {@link CodecModule} creates and loads the {@link CodecService} and
* {@link PostingsFormatService} allowing low level data-structure
* specialization on a Lucene Segment basis.
* <p>
* The codec module is the authoritative source for built-in and custom
* {@link PostingsFormatProvider} implementations. During module bootstrap it
* processes the index settings underneath the
* {@value PostingsFormatProvider#POSTINGS_FORMAT_SETTINGS_PREFIX} prefix and
* instantiates the corresponding {@link PostingsFormatProvider} instances. To
* configure a custom provider implementation, the class should reside in the
* <tt>org.elasticsearch.index.codec.postingsformat</tt> package and the
* classname should be suffixed with <tt>PostingsFormatProvider</tt>. <br>
* For example, to expose the Elastic-Fantastic format provider one needs to
* provide the following configuration settings and classes:
* <ol>
* <li>create a {@link PostingsFormatProvider} subclass in the package
* <tt>org.elasticsearch.index.codec.postingsformat</tt></li>
*
* <li>name the subclass <tt>ElasticFantasticPostingsFormatProvider</tt></li>
*
* <li>configure the custom format in your index settings under
* <tt>index.codec.postings_format.elastic_fantastic.type : "ElasticFantastic"</tt>
* </li>
*
* <li>provide any postings format settings for this custom format under the
* same key, i.e.
* <tt>index.codec.postings_format.elastic_fantastic.performance : "crazy_fast"</tt>
* </li>
* </ol>
*
* @see CodecService
*
*/
public class CodecModule extends AbstractModule {
@ -55,7 +87,7 @@ public class CodecModule extends AbstractModule {
Map<String, Class<? extends PostingsFormatProvider>> postingFormatProviders = Maps.newHashMap(customProviders);
Map<String, Settings> postingsFormatsSettings = indexSettings.getGroups("index.codec.postings_format");
Map<String, Settings> postingsFormatsSettings = indexSettings.getGroups(PostingsFormatProvider.POSTINGS_FORMAT_SETTINGS_PREFIX);
for (Map.Entry<String, Settings> entry : postingsFormatsSettings.entrySet()) {
String name = entry.getKey();
Settings settings = entry.getValue();

View File

@ -33,6 +33,12 @@ import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.settings.IndexSettings;
/**
* Since Lucene 4.0 low level index segments are read and written through a
* codec layer that allows the use of use-case specific file formats and
* data-structures per field. ElasticSearch exposes the full
* {@link Codec} capabilities through this {@link CodecService}.
*
* @see PostingsFormatService
*/
public class CodecService extends AbstractIndexComponent {

View File

@ -25,7 +25,12 @@ import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
import org.elasticsearch.index.mapper.MapperService;
/**
* This one is the "default" codec we use.
* {@link PerFieldMappingPostingFormatCodec This codec} is the default
* codec for Elasticsearch. It utilizes the
* {@link MapperService} to look up a {@link PostingsFormat} per field. This
* allows users to change the low level postings format for individual fields
* per index in real time via the mapping API. If no specific postings format is
* configured for a specific field the default postings format is used.
*/
// LUCENE UPGRADE: make sure to move to a new codec depending on the lucene version
public class PerFieldMappingPostingFormatCodec extends Lucene40Codec {

View File

@ -19,7 +19,10 @@
package org.elasticsearch.index.codec.postingsformat;
import org.apache.lucene.codecs.PostingsFormat;
/**
* Simple abstract {@link PostingsFormatProvider} requiring a name for the provider.
*/
public abstract class AbstractPostingsFormatProvider implements PostingsFormatProvider {

View File

@ -28,6 +28,18 @@ import org.elasticsearch.common.settings.Settings;
/**
* The default postings format; maps to {@link Lucene40PostingsFormat}.
* <ul>
* <li><tt>min_block_size</tt>: the minimum block size the default Lucene term
* dictionary uses to encode on-disk blocks.</li>
*
* <li><tt>max_block_size</tt>: the maximum block size the default Lucene term
* dictionary uses to encode on-disk blocks.</li>
*
* <li><tt>freq_cut_off</tt>: the document frequency cut off where pulsing
* in-lines posting lists into the term dictionary. Terms with a document
* frequency less than or equal to the cutoff will be in-lined. The default is
* <tt>1</tt>.</li>
* </ul>
*/
// LUCENE UPGRADE: Upgrade Lucene40PostingsFormat to next version
public class DefaultPostingsFormatProvider extends AbstractPostingsFormatProvider {

View File

@ -26,6 +26,22 @@ import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
/**
* A {@link PostingsFormatProvider} for {@link DirectPostingsFormat}. This
* postings format uses an on-disk storage for its terms and posting lists and
* streams its data during segment merges but loads its entire postings, terms
* and positions into memory for faster search performance. This format has a
* significant memory footprint and should be used with care. <br>
* This postings format offers the following parameters:
* <ul>
* <li><tt>min_skip_count</tt>: the minimum number of terms with a shared prefix to
* allow a skip pointer to be written. The default is <tt>8</tt>.</li>
*
* <li><tt>low_freq_cutoff</tt>: terms with a lower document frequency use a
* single array object representation for postings and positions.</li>
* </ul>
*
* @see DirectPostingsFormat
*
*/
public class DirectPostingsFormatProvider extends AbstractPostingsFormatProvider {

View File

@ -27,6 +27,16 @@ import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
/**
* A {@link PostingsFormatProvider} for Lucene's {@link MemoryPostingsFormat}.
* This postings format offers the following parameters:
* <ul>
* <li><tt>pack_fst</tt>: <code>true</code> iff the in memory structure should
* be packed once it is built. Packing will reduce the size of the data-structure
* in memory but requires more memory during building. Default is <code>false</code>.</li>
*
* <li><tt>acceptable_overhead_ratio</tt>: the compression overhead used to
* compress internal structures. See {@link PackedInts} for details. Default is {@value PackedInts#DEFAULT}</li>
* </ul>
*/
public class MemoryPostingsFormatProvider extends AbstractPostingsFormatProvider {
@ -39,6 +49,7 @@ public class MemoryPostingsFormatProvider extends AbstractPostingsFormatProvider
super(name);
this.packFst = postingsFormatSettings.getAsBoolean("pack_fst", false);
this.acceptableOverheadRatio = postingsFormatSettings.getAsFloat("acceptable_overhead_ratio", PackedInts.DEFAULT);
// TODO this should really be an ENUM?
this.postingsFormat = new MemoryPostingsFormat(packFst, acceptableOverheadRatio);
}

View File

@ -30,6 +30,34 @@ import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;
import org.elasticsearch.common.collect.MapBuilder;
/**
* This class represents the set of Elasticsearch "built-in"
* {@link PostingsFormatProvider.Factory postings format factories}:
* <ul>
* <li><b>direct</b>: a postings format that uses disk-based storage but loads
* its terms and postings directly into memory. Note that this postings format is
* very memory intensive and has certain limitations that don't allow segments to
* grow beyond 2.1GB; see {@link DirectPostingsFormat} for details.</li>
*
* <li><b>memory</b>: a postings format that stores its entire terms, postings,
* positions and payloads in a finite state transducer. This format should only
* be used for primary keys or with fields where each term is contained in a
* very low number of documents.</li>
*
* <li><b>pulsing</b>: a postings format that in-lines the posting lists for
* very low-frequency terms in the term dictionary. This is useful to improve
* lookup performance for low-frequency terms.</li>
*
* <li><b>bloom_default</b>: a postings format that uses a bloom filter to
* improve term lookup performance. This is useful for primary keys or fields
* that are used as a delete key.</li>
*
* <li><b>bloom_pulsing</b>: a postings format that combines the advantages of
* <b>bloom</b> and <b>pulsing</b> to further improve lookup performance</li>
*
* <li><b>default</b>: the default Elasticsearch postings format offering best
* general purpose performance. This format is used if no postings format is
* specified in the field mapping.</li>
* </ul>
*/
public class PostingFormats {

View File

@ -23,22 +23,64 @@ import org.apache.lucene.codecs.PostingsFormat;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.codec.CodecModule;
import org.elasticsearch.index.settings.IndexSettings;
import java.util.Map;
/**
* A {@link PostingsFormatProvider} acts as a named container for specific
* {@link PostingsFormat} implementations. Custom {@link PostingsFormat}
* implementations can be exposed via
* {@link CodecModule#addPostingFormat(String, Class)}
* <p>
* Each {@link PostingsFormatProvider} must provide a unique name for its
* postings format in order to map the postings format to a specific field via
* the mapping API. The name provided via {@link #name()} is used to lookup the
* postings format in {@link PostingsFormatService#get(String)} and should be
* identical to the values used in the field mappings.
* </p>
* <p>
* {@link PostingsFormatProvider} instances are initialized with a
* {@link Settings} subset below the
* {@value PostingsFormatProvider#POSTINGS_FORMAT_SETTINGS_PREFIX} prefix and
* will only see the sub-tree below their mapping name. For instance a postings
* format <tt>ElasticFantastic</tt> will see settings below
* <tt>index.codec.postings_format.elastic_fantastic</tt> given that the
* postings format is exposed via
* <tt>index.codec.postings_format.elastic_fantastic.type : "ElasticFantastic"</tt>.
* </p>
*
* @see CodecModule
*/
public interface PostingsFormatProvider {
public static final String POSTINGS_FORMAT_SETTINGS_PREFIX = "index.codec.postings_format";
/**
* A helper class to lookup {@link PostingsFormatProvider providers} by their unique {@link PostingsFormatProvider#name() name}
*/
public static class Helper {
/**
* Looks up and creates {@link PostingsFormatProvider} for the given name.
* <p>
* The settings for the created {@link PostingsFormatProvider} are taken from the given index settings.
* All settings with the {@value PostingsFormatProvider#POSTINGS_FORMAT_SETTINGS_PREFIX} prefix
* and the format's name as the key are passed to the factory.
* </p>
*
* @param indexSettings the index settings to configure the postings format
* @param name the name of the postings format to lookup
* @param postingFormatFactories the factory mapping to lookup the {@link Factory} to create the {@link PostingsFormatProvider}
* @return a fully configured {@link PostingsFormatProvider} for the given name.
* @throws ElasticSearchIllegalArgumentException if no {@link PostingsFormatProvider} for the given name parameter could be found.
*/
public static PostingsFormatProvider lookup(@IndexSettings Settings indexSettings, String name, Map<String, Factory> postingFormatFactories) throws ElasticSearchIllegalArgumentException {
Factory factory = postingFormatFactories.get(name);
if (factory == null) {
throw new ElasticSearchIllegalArgumentException("failed to find postings_format [" + name + "]");
}
Settings settings = indexSettings.getGroups("index.codec.postings_format").get(name);
Settings settings = indexSettings.getGroups(POSTINGS_FORMAT_SETTINGS_PREFIX).get(name);
if (settings == null) {
settings = ImmutableSettings.Builder.EMPTY_SETTINGS;
}
@ -46,10 +88,22 @@ public interface PostingsFormatProvider {
}
}
/**
* Returns this provider's {@link PostingsFormat} instance.
*/
PostingsFormat get();
/**
* Returns the name of this provider's {@link PostingsFormat}.
*/
String name();
/**
* A simple factory used to create {@link PostingsFormatProvider} used by
* delegating providers like {@link BloomFilterPostingsFormatProvider} or
* {@link PulsingPostingsFormatProvider}. Those providers wrap other
* postings formats to enrich their capabilities.
*/
public interface Factory {
/**
* Creates a {@link PostingsFormatProvider} from the given name and settings.
*
* @param name presumably the unique name the created provider is registered under (confirm against callers)
* @param settings the settings handed to the created provider; presumably the
* sub-tree for this format, as the call site is not fully visible here
* @return the created {@link PostingsFormatProvider}
*/
PostingsFormatProvider create(String name, Settings settings);
}

View File

@ -27,11 +27,18 @@ import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.AbstractIndexComponent;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.codec.CodecService;
import org.elasticsearch.index.settings.IndexSettings;
import java.util.Map;
/**
* The {@link PostingsFormatService} provides access to
* all configured {@link PostingsFormatProvider} instances by
* {@link PostingsFormatProvider#name() name}.
*
* @see CodecService
*
*/
public class PostingsFormatService extends AbstractIndexComponent {
@ -51,7 +58,7 @@ public class PostingsFormatService extends AbstractIndexComponent {
MapBuilder<String, PostingsFormatProvider> providers = MapBuilder.newMapBuilder();
Map<String, Settings> postingsFormatSettings = indexSettings.getGroups("index.codec.postings_format");
Map<String, Settings> postingsFormatSettings = indexSettings.getGroups(PostingsFormatProvider.POSTINGS_FORMAT_SETTINGS_PREFIX);
for (Map.Entry<String, PostingsFormatProvider.Factory> entry : postingFormatFactories.entrySet()) {
String name = entry.getKey();
PostingsFormatProvider.Factory factory = entry.getValue();

View File

@ -27,6 +27,22 @@ import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
/**
* A {@link PostingsFormatProvider} for Lucene's {@link Pulsing40PostingsFormat}.
* The pulsing implementation in-lines the posting lists for very low-frequency
* terms in the term dictionary. This is useful to improve lookup performance
* for low-frequency terms. This postings format offers the following parameters:
* <ul>
* <li><tt>min_block_size</tt>: the minimum block size the default Lucene term
* dictionary uses to encode on-disk blocks.</li>
*
* <li><tt>max_block_size</tt>: the maximum block size the default Lucene term
* dictionary uses to encode on-disk blocks.</li>
*
* <li><tt>freq_cut_off</tt>: the document frequency cut off where pulsing
* in-lines posting lists into the term dictionary. Terms with a document
* frequency less than or equal to the cutoff will be in-lined. The default is
* <tt>1</tt>.</li>
* </ul>
*/
// LUCENE UPGRADE: Upgrade Pulsing40PostingsFormat to next version
public class PulsingPostingsFormatProvider extends AbstractPostingsFormatProvider {