[core] add best_compression option for Lucene 5.0

Upgrades Lucene to the latest snapshot, and adds support for the BEST_COMPRESSION
parameter now available (with backwards compatibility, etc.) in Lucene.
This option uses deflate, tuned for highly compressible data.

index.codec::
The default value compresses stored data with LZ4 compression, but
this can be set to best_compression for a higher compression ratio,
at the expense of slower stored fields performance.

IMO it's safest to implement as a named codec here, because ES already
has logic to handle this correctly, and because it's unrealistic to have
a plethora of options to Lucene's default codec... we are practically
limited in Lucene to what we can support with back compat, so I don't
think we should overengineer this and add additional unnecessary plumbing.

See also:
https://issues.apache.org/jira/browse/LUCENE-5914
https://issues.apache.org/jira/browse/LUCENE-6089
https://issues.apache.org/jira/browse/LUCENE-6090
https://issues.apache.org/jira/browse/LUCENE-6100

Closes #8863
This commit is contained in:
Robert Muir 2014-12-10 22:05:54 -05:00
parent aa644e3ad7
commit a2ffe494ae
8 changed files with 73 additions and 7 deletions

View File

@ -41,6 +41,11 @@ otherwise it is written in non-compound format.
refresh operation will be executed. Defaults to `1s`. Can be set to `-1` refresh operation will be executed. Defaults to `1s`. Can be set to `-1`
in order to disable it. in order to disable it.
`index.codec`::
The `default` value compresses stored data with LZ4 compression, but
this can be set to `best_compression` for a higher compression ratio,
at the expense of slower stored fields performance.
`index.shard.check_on_startup`:: `index.shard.check_on_startup`::
Should shard consistency be checked upon opening. Should shard consistency be checked upon opening.
When `true`, the shard will be checked, preventing it from being open in When `true`, the shard will be checked, preventing it from being open in

View File

@ -32,7 +32,7 @@
<properties> <properties>
<lucene.version>5.0.0</lucene.version> <lucene.version>5.0.0</lucene.version>
<lucene.maven.version>5.0.0-snapshot-1642891</lucene.maven.version> <lucene.maven.version>5.0.0-snapshot-1644303</lucene.maven.version>
<tests.jvms>auto</tests.jvms> <tests.jvms>auto</tests.jvms>
<tests.shuffle>true</tests.shuffle> <tests.shuffle>true</tests.shuffle>
<tests.output>onerror</tests.output> <tests.output>onerror</tests.output>
@ -54,7 +54,7 @@
</repository> </repository>
<repository> <repository>
<id>Lucene snapshots</id> <id>Lucene snapshots</id>
<url>https://download.elasticsearch.org/lucenesnapshots/1642891</url> <url>https://download.elasticsearch.org/lucenesnapshots/1644303</url>
</repository> </repository>
</repositories> </repositories>

View File

@ -20,7 +20,11 @@
package org.elasticsearch.index.codec; package org.elasticsearch.index.codec;
import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableMap;
import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene50.Lucene50Codec;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.elasticsearch.ElasticsearchIllegalArgumentException; import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.common.collect.MapBuilder; import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.Inject;
@ -50,6 +54,7 @@ public class CodecService extends AbstractIndexComponent {
private final ImmutableMap<String, Codec> codecs; private final ImmutableMap<String, Codec> codecs;
public final static String DEFAULT_CODEC = "default"; public final static String DEFAULT_CODEC = "default";
public final static String BEST_COMPRESSION_CODEC = "best_compression";
public CodecService(Index index) { public CodecService(Index index) {
this(index, ImmutableSettings.Builder.EMPTY_SETTINGS); this(index, ImmutableSettings.Builder.EMPTY_SETTINGS);
@ -68,9 +73,17 @@ public class CodecService extends AbstractIndexComponent {
this.mapperService = mapperService; this.mapperService = mapperService;
MapBuilder<String, Codec> codecs = MapBuilder.<String, Codec>newMapBuilder(); MapBuilder<String, Codec> codecs = MapBuilder.<String, Codec>newMapBuilder();
if (mapperService == null) { if (mapperService == null) {
codecs.put(DEFAULT_CODEC, Codec.getDefault()); codecs.put(DEFAULT_CODEC, new Lucene50Codec());
codecs.put(BEST_COMPRESSION_CODEC, new Lucene50Codec(Mode.BEST_COMPRESSION));
} else { } else {
codecs.put(DEFAULT_CODEC, new PerFieldMappingPostingFormatCodec(mapperService, codecs.put(DEFAULT_CODEC,
new PerFieldMappingPostingFormatCodec(Mode.BEST_SPEED,
mapperService,
postingsFormatService.get(PostingsFormatService.DEFAULT_FORMAT).get(),
docValuesFormatService.get(DocValuesFormatService.DEFAULT_FORMAT).get(), logger));
codecs.put(BEST_COMPRESSION_CODEC,
new PerFieldMappingPostingFormatCodec(Mode.BEST_COMPRESSION,
mapperService,
postingsFormatService.get(PostingsFormatService.DEFAULT_FORMAT).get(), postingsFormatService.get(PostingsFormatService.DEFAULT_FORMAT).get(),
docValuesFormatService.get(DocValuesFormatService.DEFAULT_FORMAT).get(), logger)); docValuesFormatService.get(DocValuesFormatService.DEFAULT_FORMAT).get(), logger));
} }

View File

@ -23,6 +23,7 @@ import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50Codec; import org.apache.lucene.codecs.lucene50.Lucene50Codec;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
import org.elasticsearch.common.logging.ESLogger; import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.index.codec.docvaluesformat.DocValuesFormatProvider; import org.elasticsearch.index.codec.docvaluesformat.DocValuesFormatProvider;
@ -49,7 +50,8 @@ public class PerFieldMappingPostingFormatCodec extends Lucene50Codec {
assert Codec.forName(Lucene.LATEST_CODEC).getClass().isAssignableFrom(PerFieldMappingPostingFormatCodec.class) : "PerFieldMappingPostingFormatCodec must subclass the latest lucene codec: " + Lucene.LATEST_CODEC; assert Codec.forName(Lucene.LATEST_CODEC).getClass().isAssignableFrom(PerFieldMappingPostingFormatCodec.class) : "PerFieldMappingPostingFormatCodec must subclass the latest lucene codec: " + Lucene.LATEST_CODEC;
} }
public PerFieldMappingPostingFormatCodec(MapperService mapperService, PostingsFormat defaultPostingFormat, DocValuesFormat defaultDocValuesFormat, ESLogger logger) { public PerFieldMappingPostingFormatCodec(Lucene50StoredFieldsFormat.Mode compressionMode, MapperService mapperService, PostingsFormat defaultPostingFormat, DocValuesFormat defaultDocValuesFormat, ESLogger logger) {
super(compressionMode);
this.mapperService = mapperService; this.mapperService = mapperService;
this.logger = logger; this.logger = logger;
this.defaultPostingFormat = defaultPostingFormat; this.defaultPostingFormat = defaultPostingFormat;

View File

@ -20,6 +20,7 @@
package org.elasticsearch.index.engine.internal; package org.elasticsearch.index.engine.internal;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import org.apache.lucene.index.*; import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexWriter.IndexReaderWarmer; import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
import org.apache.lucene.search.*; import org.apache.lucene.search.*;
@ -69,6 +70,7 @@ import org.elasticsearch.threadpool.ThreadPool;
import java.io.Closeable; import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
import java.lang.reflect.Method;
import java.util.*; import java.util.*;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicBoolean;
@ -1054,10 +1056,13 @@ public class InternalEngine implements Engine {
} }
} }
// TODO: can we please remove this method?!
private void waitForMerges(boolean flushAfter) { private void waitForMerges(boolean flushAfter) {
try { try {
currentIndexWriter().waitForMerges(); Method method = IndexWriter.class.getDeclaredMethod("waitForMerges");
} catch (IOException e) { method.setAccessible(true);
method.invoke(currentIndexWriter());
} catch (ReflectiveOperationException e) {
throw new OptimizeFailedEngineException(shardId, e); throw new OptimizeFailedEngineException(shardId, e);
} }
if (flushAfter) { if (flushAfter) {

View File

@ -33,7 +33,15 @@ import org.apache.lucene.codecs.lucene46.Lucene46Codec;
import org.apache.lucene.codecs.lucene49.Lucene49Codec; import org.apache.lucene.codecs.lucene49.Lucene49Codec;
import org.apache.lucene.codecs.lucene50.Lucene50Codec; import org.apache.lucene.codecs.lucene50.Lucene50Codec;
import org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat; import org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.store.Directory;
import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
@ -173,6 +181,34 @@ public class CodecTests extends ElasticsearchSingleNodeLuceneTestCase {
assertThat(documentMapper.rootMapper(VersionFieldMapper.class).docValuesFormatProvider(), instanceOf(PreBuiltDocValuesFormatProvider.class)); assertThat(documentMapper.rootMapper(VersionFieldMapper.class).docValuesFormatProvider(), instanceOf(PreBuiltDocValuesFormatProvider.class));
assertThat(documentMapper.rootMapper(VersionFieldMapper.class).docValuesFormatProvider().get(), instanceOf(Lucene410DocValuesFormat.class)); assertThat(documentMapper.rootMapper(VersionFieldMapper.class).docValuesFormatProvider().get(), instanceOf(Lucene410DocValuesFormat.class));
} }
public void testDefault() throws Exception {
Codec codec = createCodecService().codec("default");
assertCompressionEquals(Mode.BEST_SPEED, codec);
}
public void testBestCompression() throws Exception {
Codec codec = createCodecService().codec("best_compression");
assertCompressionEquals(Mode.BEST_COMPRESSION, codec);
}
// write some docs with it, inspect .si to see this was the used compression
private void assertCompressionEquals(Mode expected, Codec actual) throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(null);
iwc.setCodec(actual);
IndexWriter iw = new IndexWriter(dir, iwc);
iw.addDocument(new Document());
iw.commit();
iw.close();
DirectoryReader ir = DirectoryReader.open(dir);
SegmentReader sr = (SegmentReader) ir.leaves().get(0).reader();
String v = sr.getSegmentInfo().info.getAttribute(Lucene50StoredFieldsFormat.MODE_KEY);
assertNotNull(v);
assertEquals(expected, Mode.valueOf(v));
ir.close();
dir.close();
}
private static CodecService createCodecService() { private static CodecService createCodecService() {
return createCodecService(ImmutableSettings.Builder.EMPTY_SETTINGS); return createCodecService(ImmutableSettings.Builder.EMPTY_SETTINGS);

View File

@ -187,6 +187,7 @@ public class StoreTest extends ElasticsearchLuceneTestCase {
IOUtils.close(verifyingOutput, dir); IOUtils.close(verifyingOutput, dir);
} }
// TODO: remove this, its too fragile. just use a static old index instead.
private static final class OldSIMockingCodec extends FilterCodec { private static final class OldSIMockingCodec extends FilterCodec {
protected OldSIMockingCodec() { protected OldSIMockingCodec() {
@ -232,6 +233,7 @@ public class StoreTest extends ElasticsearchLuceneTestCase {
} }
} }
output.writeStringSet(files); output.writeStringSet(files);
output.writeStringStringMap(si.getAttributes());
CodecUtil.writeFooter(output); CodecUtil.writeFooter(output);
success = true; success = true;
} finally { } finally {
@ -245,6 +247,7 @@ public class StoreTest extends ElasticsearchLuceneTestCase {
} }
} }
// IF THIS TEST FAILS ON UPGRADE GO LOOK AT THE OldSIMockingCodec!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@Test @Test
public void testWriteLegacyChecksums() throws IOException { public void testWriteLegacyChecksums() throws IOException {
final ShardId shardId = new ShardId(new Index("index"), 1); final ShardId shardId = new ShardId(new Index("index"), 1);

View File

@ -315,6 +315,8 @@ public abstract class ElasticsearchIntegrationTest extends ElasticsearchTestCase
randomSettingsBuilder.put(SETTING_NUMBER_OF_SHARDS, numberOfShards()) randomSettingsBuilder.put(SETTING_NUMBER_OF_SHARDS, numberOfShards())
.put(SETTING_NUMBER_OF_REPLICAS, numberOfReplicas()); .put(SETTING_NUMBER_OF_REPLICAS, numberOfReplicas());
randomSettingsBuilder.put("index.codec", randomFrom("default", "best_compression"));
XContentBuilder mappings = null; XContentBuilder mappings = null;
if (frequently() && randomDynamicTemplates()) { if (frequently() && randomDynamicTemplates()) {
mappings = XContentFactory.jsonBuilder().startObject().startObject("_default_"); mappings = XContentFactory.jsonBuilder().startObject().startObject("_default_");