Add JavaDocs for Codecs, PostingsFormat and related services/modules
This commit is contained in:
parent
c09ee82ef5
commit
840eaf983d
|
@ -34,6 +34,38 @@ import org.elasticsearch.index.codec.postingsformat.PreBuiltPostingsFormatProvid
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* The {@link CodecModule} creates and loads the {@link CodecService} and
|
||||||
|
* {@link PostingsFormatService} allowing low level data-structure
|
||||||
|
* specialization on a Lucene Segment basis.
|
||||||
|
* <p>
|
||||||
|
* The codec module is the authoritative source for built-in and custom
|
||||||
|
* {@link PostingsFormatProvider}. During module bootstrap it processes the
|
||||||
|
* index settings underneath the
|
||||||
|
* {@value PostingsFormatProvider#POSTINGS_FORMAT_SETTINGS_PREFIX} prefix and
|
||||||
|
* instantiates the corresponding {@link PostingsFormatProvider} instances. To
|
||||||
|
* configure a custom provider implementation, the class should reside in the
|
||||||
|
* <tt>org.elasticsearch.index.codec.postingsformat</tt> package and the
|
||||||
|
* classname should be suffixed with <tt>PostingsFormatProvider</tt>. <br>
|
||||||
|
* For example, to expose the Elastic-Fantastic format provider one needs to
|
||||||
|
* provide the following configuration settings and classes:
|
||||||
|
* <ol>
|
||||||
|
* <li>create a {@link PostingsFormatProvider} subclass in the package
|
||||||
|
* <tt>org.elasticsearch.index.codec.postingsformat</tt></li>
|
||||||
|
*
|
||||||
|
* <li>name the subclass <tt>ElasticFantasticPostingsFormatProvider</tt></li>
|
||||||
|
*
|
||||||
|
* <li>configure the custom format in your index settings under
|
||||||
|
* <tt>index.codec.postings_format.elastic_fantastic.type : "ElasticFantastic"</tt>
|
||||||
|
* </li>
|
||||||
|
*
|
||||||
|
* <li>provide any postings format settings for this custom format under the
|
||||||
|
* same key, i.e.
|
||||||
|
* <tt>index.codec.postings_format.elastic_fantastic.performance : "crazy_fast"</tt>
|
||||||
|
* </li>
|
||||||
|
* </ol>
|
||||||
|
*
|
||||||
|
* @see CodecService
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
public class CodecModule extends AbstractModule {
|
public class CodecModule extends AbstractModule {
|
||||||
|
|
||||||
|
@ -55,7 +87,7 @@ public class CodecModule extends AbstractModule {
|
||||||
|
|
||||||
Map<String, Class<? extends PostingsFormatProvider>> postingFormatProviders = Maps.newHashMap(customProviders);
|
Map<String, Class<? extends PostingsFormatProvider>> postingFormatProviders = Maps.newHashMap(customProviders);
|
||||||
|
|
||||||
Map<String, Settings> postingsFormatsSettings = indexSettings.getGroups("index.codec.postings_format");
|
Map<String, Settings> postingsFormatsSettings = indexSettings.getGroups(PostingsFormatProvider.POSTINGS_FORMAT_SETTINGS_PREFIX);
|
||||||
for (Map.Entry<String, Settings> entry : postingsFormatsSettings.entrySet()) {
|
for (Map.Entry<String, Settings> entry : postingsFormatsSettings.entrySet()) {
|
||||||
String name = entry.getKey();
|
String name = entry.getKey();
|
||||||
Settings settings = entry.getValue();
|
Settings settings = entry.getValue();
|
||||||
|
|
|
@ -33,6 +33,12 @@ import org.elasticsearch.index.mapper.MapperService;
|
||||||
import org.elasticsearch.index.settings.IndexSettings;
|
import org.elasticsearch.index.settings.IndexSettings;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Since Lucene 4.0 low level index segments are read and written through a
|
||||||
|
* codec layer that allows the use of use-case specific file formats and
|
||||||
|
* data-structures per field. ElasticSearch exposes the full
|
||||||
|
* {@link Codec} capabilities through this {@link CodecService}.
|
||||||
|
*
|
||||||
|
* @see PostingsFormatService
|
||||||
*/
|
*/
|
||||||
public class CodecService extends AbstractIndexComponent {
|
public class CodecService extends AbstractIndexComponent {
|
||||||
|
|
||||||
|
|
|
@ -25,7 +25,12 @@ import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
|
||||||
import org.elasticsearch.index.mapper.MapperService;
|
import org.elasticsearch.index.mapper.MapperService;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This one is the "default" codec we use.
|
* {@link PerFieldMappingPostingFormatCodec This codec} is the default
|
||||||
|
* codec for Elasticsearch. It utilizes the
|
||||||
|
* {@link MapperService} to lookup a {@link PostingsFormat} per field. This
|
||||||
|
* allows users to change the low level postings format for individual fields
|
||||||
|
* per index in real time via the mapping API. If no specific postings format is
|
||||||
|
* configured for a specific field the default postings format is used.
|
||||||
*/
|
*/
|
||||||
// LUCENE UPGRADE: make sure to move to a new codec depending on the lucene version
|
// LUCENE UPGRADE: make sure to move to a new codec depending on the lucene version
|
||||||
public class PerFieldMappingPostingFormatCodec extends Lucene40Codec {
|
public class PerFieldMappingPostingFormatCodec extends Lucene40Codec {
|
||||||
|
|
|
@ -19,7 +19,10 @@
|
||||||
|
|
||||||
package org.elasticsearch.index.codec.postingsformat;
|
package org.elasticsearch.index.codec.postingsformat;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.PostingsFormat;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Simple abstract {@link PostingsFormatProvider} requiring a name for the provider.
|
||||||
*/
|
*/
|
||||||
public abstract class AbstractPostingsFormatProvider implements PostingsFormatProvider {
|
public abstract class AbstractPostingsFormatProvider implements PostingsFormatProvider {
|
||||||
|
|
||||||
|
|
|
@ -28,6 +28,18 @@ import org.elasticsearch.common.settings.Settings;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The default postingsformat, maps to {@link Lucene40PostingsFormat}.
|
* The default postingsformat, maps to {@link Lucene40PostingsFormat}.
|
||||||
|
* <ul>
|
||||||
|
* <li><tt>min_block_size</tt>: the minimum block size the default Lucene term
|
||||||
|
* dictionary uses to encode on-disk blocks.</li>
|
||||||
|
*
|
||||||
|
* <li><tt>max_block_size</tt>: the maximum block size the default Lucene term
|
||||||
|
* dictionary uses to encode on-disk blocks.</li>
|
||||||
|
*
|
||||||
|
* <li><tt>freq_cut_off</tt>: the document frequency cut off where pulsing
|
||||||
|
* in-lines posting lists into the term dictionary. Terms with a document
|
||||||
|
* frequency less or equal to the cutoff will be in-lined. The default is
|
||||||
|
* <tt>1</tt></li>
|
||||||
|
* </ul>
|
||||||
*/
|
*/
|
||||||
// LUCENE UPGRADE: Upgrade Lucene40PostingsFormat to next version
|
// LUCENE UPGRADE: Upgrade Lucene40PostingsFormat to next version
|
||||||
public class DefaultPostingsFormatProvider extends AbstractPostingsFormatProvider {
|
public class DefaultPostingsFormatProvider extends AbstractPostingsFormatProvider {
|
||||||
|
|
|
@ -26,6 +26,22 @@ import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* A {@link PostingsFormatProvider} for {@link DirectPostingsFormat}. This
|
||||||
|
* postings format uses an on-disk storage for its terms and posting lists and
|
||||||
|
* streams its data during segment merges but loads its entire postings, terms
|
||||||
|
* and positions into memory for faster search performance. This format has a
|
||||||
|
* significant memory footprint and should be used with care. <br> This postings
|
||||||
|
* format offers the following parameters:
|
||||||
|
* <ul>
|
||||||
|
* <li><tt>min_skip_count</tt>: the minimum number of terms with a shared prefix to
|
||||||
|
* allow a skip pointer to be written. The default is <tt>8</tt></li>
|
||||||
|
*
|
||||||
|
* <li><tt>low_freq_cutoff</tt>: terms with a lower document frequency use a
|
||||||
|
* single array object representation for postings and positions.</li>
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
* @see DirectPostingsFormat
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
public class DirectPostingsFormatProvider extends AbstractPostingsFormatProvider {
|
public class DirectPostingsFormatProvider extends AbstractPostingsFormatProvider {
|
||||||
|
|
||||||
|
|
|
@ -27,6 +27,16 @@ import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* A {@link PostingsFormatProvider} for Lucene's {@link MemoryPostingsFormat}.
|
||||||
|
* This postings format offers the following parameters:
|
||||||
|
* <ul>
|
||||||
|
* <li><tt>pack_fst</tt>: <code>true</code> iff the in memory structure should
|
||||||
|
* be packed once it is built. Packing will reduce the size of the data-structure
|
||||||
|
* in memory but requires more memory during building. Default is <code>false</code></li>
|
||||||
|
*
|
||||||
|
* <li><tt>acceptable_overhead_ratio</tt>: the compression overhead used to
|
||||||
|
* compress internal structures. See {@link PackedInts} for details. Default is {@value PackedInts#DEFAULT}</li>
|
||||||
|
* </ul>
|
||||||
*/
|
*/
|
||||||
public class MemoryPostingsFormatProvider extends AbstractPostingsFormatProvider {
|
public class MemoryPostingsFormatProvider extends AbstractPostingsFormatProvider {
|
||||||
|
|
||||||
|
@ -39,6 +49,7 @@ public class MemoryPostingsFormatProvider extends AbstractPostingsFormatProvider
|
||||||
super(name);
|
super(name);
|
||||||
this.packFst = postingsFormatSettings.getAsBoolean("pack_fst", false);
|
this.packFst = postingsFormatSettings.getAsBoolean("pack_fst", false);
|
||||||
this.acceptableOverheadRatio = postingsFormatSettings.getAsFloat("acceptable_overhead_ratio", PackedInts.DEFAULT);
|
this.acceptableOverheadRatio = postingsFormatSettings.getAsFloat("acceptable_overhead_ratio", PackedInts.DEFAULT);
|
||||||
|
// TODO this should really be an ENUM?
|
||||||
this.postingsFormat = new MemoryPostingsFormat(packFst, acceptableOverheadRatio);
|
this.postingsFormat = new MemoryPostingsFormat(packFst, acceptableOverheadRatio);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -30,6 +30,34 @@ import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;
|
||||||
import org.elasticsearch.common.collect.MapBuilder;
|
import org.elasticsearch.common.collect.MapBuilder;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* This class represents the set of Elasticsearch "built-in"
|
||||||
|
* {@link PostingsFormatProvider.Factory postings format factories}
|
||||||
|
* <ul>
|
||||||
|
* <li><b>direct</b>: a postings format that uses disk-based storage but loads
|
||||||
|
* its terms and postings directly into memory. Note this postings format is
|
||||||
|
* very memory intensive and has certain limitations that don't allow segments to
|
||||||
|
* grow beyond 2.1GB; see {@link DirectPostingsFormat} for details.</li>
|
||||||
|
*
|
||||||
|
* <li><b>memory</b>: a postings format that stores its entire terms, postings,
|
||||||
|
* positions and payloads in a finite state transducer. This format should only
|
||||||
|
* be used for primary keys or with fields where each term is contained in a
|
||||||
|
* very low number of documents.</li>
|
||||||
|
*
|
||||||
|
* <li><b>pulsing</b>: a postings format that in-lines the posting lists for very low
|
||||||
|
* frequent terms in the term dictionary. This is useful to improve lookup
|
||||||
|
* performance for low-frequent terms.</li>
|
||||||
|
*
|
||||||
|
* <li><b>bloom_default</b>: a postings format that uses a bloom filter to
|
||||||
|
* improve term lookup performance. This is useful for primary keys or fields
|
||||||
|
* that are used as a delete key</li>
|
||||||
|
*
|
||||||
|
* <li><b>bloom_pulsing</b>: a postings format that combines the advantages of
|
||||||
|
* <b>bloom</b> and <b>pulsing</b> to further improve lookup performance</li>
|
||||||
|
*
|
||||||
|
* <li><b>default</b>: the default Elasticsearch postings format offering best
|
||||||
|
* general purpose performance. This format is used if no postings format is
|
||||||
|
* specified in the field mapping.</li>
|
||||||
|
* </ul>
|
||||||
*/
|
*/
|
||||||
public class PostingFormats {
|
public class PostingFormats {
|
||||||
|
|
||||||
|
|
|
@ -23,22 +23,64 @@ import org.apache.lucene.codecs.PostingsFormat;
|
||||||
import org.elasticsearch.ElasticSearchIllegalArgumentException;
|
import org.elasticsearch.ElasticSearchIllegalArgumentException;
|
||||||
import org.elasticsearch.common.settings.ImmutableSettings;
|
import org.elasticsearch.common.settings.ImmutableSettings;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.index.codec.CodecModule;
|
||||||
import org.elasticsearch.index.settings.IndexSettings;
|
import org.elasticsearch.index.settings.IndexSettings;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* A {@link PostingsFormatProvider} acts as a named container for specific
|
||||||
|
* {@link PostingsFormat} implementations. Custom {@link PostingsFormat}
|
||||||
|
* implementations can be exposed via
|
||||||
|
* {@link CodecModule#addPostingFormat(String, Class)}
|
||||||
|
* <p>
|
||||||
|
* Each {@link PostingsFormatProvider} must provide a unique name for its
|
||||||
|
* postings format in order to map the postings format to a specific field via
|
||||||
|
* the mapping API. The name provided via {@link #name()} is used to lookup the
|
||||||
|
* postings format in {@link PostingsFormatService#get(String)} and should be
|
||||||
|
* identical to the values used in the field mappings.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* {@link PostingsFormatProvider} instances are initialized with a
|
||||||
|
* {@link Settings} subset below the
|
||||||
|
* {@value PostingsFormatProvider#POSTINGS_FORMAT_SETTINGS_PREFIX} prefix and
|
||||||
|
* will only see the sub-tree below their mapping name. For instance a postings
|
||||||
|
* format <tt>ElasticFantastic</tt> will see settings below
|
||||||
|
* <tt>index.codec.postings_format.elastic_fantastic</tt> given that the
|
||||||
|
* postings format is exposed via
|
||||||
|
* <tt>index.codec.postings_format.elastic_fantastic.type : "ElasticFantastic"</tt>.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @see CodecModule
|
||||||
*/
|
*/
|
||||||
public interface PostingsFormatProvider {
|
public interface PostingsFormatProvider {
|
||||||
|
public static final String POSTINGS_FORMAT_SETTINGS_PREFIX = "index.codec.postings_format";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A helper class to lookup {@link PostingsFormatProvider providers} by their unique {@link PostingsFormatProvider#name() name}
|
||||||
|
*/
|
||||||
public static class Helper {
|
public static class Helper {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Looks up and creates {@link PostingsFormatProvider} for the given name.
|
||||||
|
* <p>
|
||||||
|
* The settings for the created {@link PostingsFormatProvider} are taken from the given index settings.
|
||||||
|
* All settings with the {@value PostingsFormatProvider#POSTINGS_FORMAT_SETTINGS_PREFIX} prefix
|
||||||
|
* and the format's name as the key are passed to the factory.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param indexSettings the index settings to configure the postings format
|
||||||
|
* @param name the name of the postings format to lookup
|
||||||
|
* @param postingFormatFactories the factory mapping to lookup the {@link Factory} to create the {@link PostingsFormatProvider}
|
||||||
|
* @return a fully configured {@link PostingsFormatProvider} for the given name.
|
||||||
|
* @throws ElasticSearchIllegalArgumentException if no {@link PostingsFormatProvider} for the given name parameter could be found.
|
||||||
|
*/
|
||||||
public static PostingsFormatProvider lookup(@IndexSettings Settings indexSettings, String name, Map<String, Factory> postingFormatFactories) throws ElasticSearchIllegalArgumentException {
|
public static PostingsFormatProvider lookup(@IndexSettings Settings indexSettings, String name, Map<String, Factory> postingFormatFactories) throws ElasticSearchIllegalArgumentException {
|
||||||
Factory factory = postingFormatFactories.get(name);
|
Factory factory = postingFormatFactories.get(name);
|
||||||
if (factory == null) {
|
if (factory == null) {
|
||||||
throw new ElasticSearchIllegalArgumentException("failed to find postings_format [" + name + "]");
|
throw new ElasticSearchIllegalArgumentException("failed to find postings_format [" + name + "]");
|
||||||
}
|
}
|
||||||
Settings settings = indexSettings.getGroups("index.codec.postings_format").get(name);
|
Settings settings = indexSettings.getGroups(POSTINGS_FORMAT_SETTINGS_PREFIX).get(name);
|
||||||
if (settings == null) {
|
if (settings == null) {
|
||||||
settings = ImmutableSettings.Builder.EMPTY_SETTINGS;
|
settings = ImmutableSettings.Builder.EMPTY_SETTINGS;
|
||||||
}
|
}
|
||||||
|
@ -46,10 +88,22 @@ public interface PostingsFormatProvider {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns this provider's {@link PostingsFormat} instance.
|
||||||
|
*/
|
||||||
PostingsFormat get();
|
PostingsFormat get();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the name of this provider's {@link PostingsFormat}
|
||||||
|
*/
|
||||||
String name();
|
String name();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A simple factory used to create {@link PostingsFormatProvider} used by
|
||||||
|
* delegating providers like {@link BloomFilterPostingsFormatProvider} or
|
||||||
|
* {@link PulsingPostingsFormatProvider}. Those providers wrap other
|
||||||
|
* postings formats to enrich their capabilities.
|
||||||
|
*/
|
||||||
public interface Factory {
|
public interface Factory {
|
||||||
PostingsFormatProvider create(String name, Settings settings);
|
PostingsFormatProvider create(String name, Settings settings);
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,11 +27,18 @@ import org.elasticsearch.common.settings.ImmutableSettings;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.index.AbstractIndexComponent;
|
import org.elasticsearch.index.AbstractIndexComponent;
|
||||||
import org.elasticsearch.index.Index;
|
import org.elasticsearch.index.Index;
|
||||||
|
import org.elasticsearch.index.codec.CodecService;
|
||||||
import org.elasticsearch.index.settings.IndexSettings;
|
import org.elasticsearch.index.settings.IndexSettings;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* The {@link PostingsFormatService} provides access to
|
||||||
|
* all configured {@link PostingsFormatProvider} instances by
|
||||||
|
* {@link PostingsFormatProvider#name() name}.
|
||||||
|
*
|
||||||
|
* @see CodecService
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
public class PostingsFormatService extends AbstractIndexComponent {
|
public class PostingsFormatService extends AbstractIndexComponent {
|
||||||
|
|
||||||
|
@ -51,7 +58,7 @@ public class PostingsFormatService extends AbstractIndexComponent {
|
||||||
|
|
||||||
MapBuilder<String, PostingsFormatProvider> providers = MapBuilder.newMapBuilder();
|
MapBuilder<String, PostingsFormatProvider> providers = MapBuilder.newMapBuilder();
|
||||||
|
|
||||||
Map<String, Settings> postingsFormatSettings = indexSettings.getGroups("index.codec.postings_format");
|
Map<String, Settings> postingsFormatSettings = indexSettings.getGroups(PostingsFormatProvider.POSTINGS_FORMAT_SETTINGS_PREFIX);
|
||||||
for (Map.Entry<String, PostingsFormatProvider.Factory> entry : postingFormatFactories.entrySet()) {
|
for (Map.Entry<String, PostingsFormatProvider.Factory> entry : postingFormatFactories.entrySet()) {
|
||||||
String name = entry.getKey();
|
String name = entry.getKey();
|
||||||
PostingsFormatProvider.Factory factory = entry.getValue();
|
PostingsFormatProvider.Factory factory = entry.getValue();
|
||||||
|
|
|
@ -27,6 +27,22 @@ import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* A {@link PostingsFormatProvider} for Lucene's {@link Pulsing40PostingsFormat}.
|
||||||
|
* The pulsing implementation in-lines the posting lists for very low frequent
|
||||||
|
* terms in the term dictionary. This is useful to improve lookup performance
|
||||||
|
* for low-frequent terms. This postings format offers the following parameters:
|
||||||
|
* <ul>
|
||||||
|
* <li><tt>min_block_size</tt>: the minimum block size the default Lucene term
|
||||||
|
* dictionary uses to encode on-disk blocks.</li>
|
||||||
|
*
|
||||||
|
* <li><tt>max_block_size</tt>: the maximum block size the default Lucene term
|
||||||
|
* dictionary uses to encode on-disk blocks.</li>
|
||||||
|
*
|
||||||
|
* <li><tt>freq_cut_off</tt>: the document frequency cut off where pulsing
|
||||||
|
* in-lines posting lists into the term dictionary. Terms with a document
|
||||||
|
* frequency less or equal to the cutoff will be in-lined. The default is
|
||||||
|
* <tt>1</tt></li>
|
||||||
|
* </ul>
|
||||||
*/
|
*/
|
||||||
// LUCENE UPGRADE: Upgrade Pulsing40PostingsFormat to next version
|
// LUCENE UPGRADE: Upgrade Pulsing40PostingsFormat to next version
|
||||||
public class PulsingPostingsFormatProvider extends AbstractPostingsFormatProvider {
|
public class PulsingPostingsFormatProvider extends AbstractPostingsFormatProvider {
|
||||||
|
|
Loading…
Reference in New Issue