[core] add best_compression option for Lucene 5.0

Upgrades Lucene to the latest snapshot, and adds support for the
BEST_COMPRESSION parameter now available (with backwards compatibility,
etc.) in Lucene. This option uses DEFLATE, tuned for highly compressible data.

`index.codec`::
The `default` value compresses stored data with LZ4 compression, but
this can be set to `best_compression` for a higher compression ratio,
at the expense of slower stored fields performance.

IMO it's safest to implement this as a named codec here, because ES already
has logic to handle named codecs correctly, and because it's unrealistic to have
a plethora of options to Lucene's default codec... we are practically
limited in Lucene to what we can support with back compat, so I don't
think we should overengineer this and add additional unnecessary plumbing.
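
For reference, a minimal sketch of how an index would opt in, assuming the
1.x Java client API (the `client` variable and the index name "my_index"
are hypothetical, not part of this change):

    import org.elasticsearch.client.Client;
    import org.elasticsearch.common.settings.ImmutableSettings;
    import org.elasticsearch.common.settings.Settings;

    // select the DEFLATE-based stored fields codec for a new index;
    // indexes that do not set index.codec keep the LZ4-based "default"
    Settings settings = ImmutableSettings.settingsBuilder()
            .put("index.codec", "best_compression")
            .build();
    // assumes an existing Client instance named `client`
    client.admin().indices().prepareCreate("my_index")
            .setSettings(settings)
            .get();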

See also:
https://issues.apache.org/jira/browse/LUCENE-5914
https://issues.apache.org/jira/browse/LUCENE-6089
https://issues.apache.org/jira/browse/LUCENE-6090
https://issues.apache.org/jira/browse/LUCENE-6100

Closes #8863
Robert Muir, 2014-12-10 22:05:54 -05:00
parent aa644e3ad7
commit a2ffe494ae
8 changed files with 73 additions and 7 deletions

docs/reference/index-modules.asciidoc

@@ -41,6 +41,11 @@ otherwise it is written in non-compound format.
refresh operation will be executed. Defaults to `1s`. Can be set to `-1`
in order to disable it.
+ `index.codec`::
+ The `default` value compresses stored data with LZ4 compression, but
+ this can be set to `best_compression` for a higher compression ratio,
+ at the expense of slower stored fields performance.
`index.shard.check_on_startup`::
Should shard consistency be checked upon opening.
When `true`, the shard will be checked, preventing it from being open in

pom.xml

@@ -32,7 +32,7 @@
<properties>
<lucene.version>5.0.0</lucene.version>
- <lucene.maven.version>5.0.0-snapshot-1642891</lucene.maven.version>
+ <lucene.maven.version>5.0.0-snapshot-1644303</lucene.maven.version>
<tests.jvms>auto</tests.jvms>
<tests.shuffle>true</tests.shuffle>
<tests.output>onerror</tests.output>
@@ -54,7 +54,7 @@
</repository>
<repository>
<id>Lucene snapshots</id>
- <url>https://download.elasticsearch.org/lucenesnapshots/1642891</url>
+ <url>https://download.elasticsearch.org/lucenesnapshots/1644303</url>
</repository>
</repositories>

src/main/java/org/elasticsearch/index/codec/CodecService.java

@@ -20,7 +20,11 @@
package org.elasticsearch.index.codec;
import com.google.common.collect.ImmutableMap;
import org.apache.lucene.codecs.Codec;
+ import org.apache.lucene.codecs.lucene50.Lucene50Codec;
+ import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
+ import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.common.inject.Inject;
@@ -50,6 +54,7 @@ public class CodecService extends AbstractIndexComponent {
private final ImmutableMap<String, Codec> codecs;
public final static String DEFAULT_CODEC = "default";
+ public final static String BEST_COMPRESSION_CODEC = "best_compression";
public CodecService(Index index) {
this(index, ImmutableSettings.Builder.EMPTY_SETTINGS);
@@ -68,9 +73,17 @@
this.mapperService = mapperService;
MapBuilder<String, Codec> codecs = MapBuilder.<String, Codec>newMapBuilder();
if (mapperService == null) {
- codecs.put(DEFAULT_CODEC, Codec.getDefault());
+ codecs.put(DEFAULT_CODEC, new Lucene50Codec());
+ codecs.put(BEST_COMPRESSION_CODEC, new Lucene50Codec(Mode.BEST_COMPRESSION));
} else {
- codecs.put(DEFAULT_CODEC, new PerFieldMappingPostingFormatCodec(mapperService,
+ codecs.put(DEFAULT_CODEC,
+ new PerFieldMappingPostingFormatCodec(Mode.BEST_SPEED,
+ mapperService,
postingsFormatService.get(PostingsFormatService.DEFAULT_FORMAT).get(),
docValuesFormatService.get(DocValuesFormatService.DEFAULT_FORMAT).get(), logger));
+ codecs.put(BEST_COMPRESSION_CODEC,
+ new PerFieldMappingPostingFormatCodec(Mode.BEST_COMPRESSION,
+ mapperService,
+ postingsFormatService.get(PostingsFormatService.DEFAULT_FORMAT).get(),
+ docValuesFormatService.get(DocValuesFormatService.DEFAULT_FORMAT).get(), logger));
}
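
A side note on the wiring: codecs are resolved by name from this map, and the
two constants above are the registered names. A sketch of the lookup behavior,
assuming an existing CodecService instance named `codecService`:

    // "default" keeps LZ4 stored fields, "best_compression" switches to DEFLATE
    Codec fast = codecService.codec(CodecService.DEFAULT_CODEC);
    Codec small = codecService.codec(CodecService.BEST_COMPRESSION_CODEC);
    // an unregistered name fails fast with ElasticsearchIllegalArgumentException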

src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java

@@ -23,6 +23,7 @@ import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50Codec;
+ import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.index.codec.docvaluesformat.DocValuesFormatProvider;
@@ -49,7 +50,8 @@ public class PerFieldMappingPostingFormatCodec extends Lucene50Codec {
assert Codec.forName(Lucene.LATEST_CODEC).getClass().isAssignableFrom(PerFieldMappingPostingFormatCodec.class) : "PerFieldMappingPostingFormatCodec must subclass the latest lucene codec: " + Lucene.LATEST_CODEC;
}
- public PerFieldMappingPostingFormatCodec(MapperService mapperService, PostingsFormat defaultPostingFormat, DocValuesFormat defaultDocValuesFormat, ESLogger logger) {
+ public PerFieldMappingPostingFormatCodec(Lucene50StoredFieldsFormat.Mode compressionMode, MapperService mapperService, PostingsFormat defaultPostingFormat, DocValuesFormat defaultDocValuesFormat, ESLogger logger) {
+     super(compressionMode);
this.mapperService = mapperService;
this.logger = logger;
this.defaultPostingFormat = defaultPostingFormat;
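
The shape of the change, roughly: the per-field codec keeps choosing postings
and doc values formats per field, while the stored fields mode is now fixed once
by the super(compressionMode) call. A standalone sketch of that Lucene pattern
(not the actual ES class):

    import org.apache.lucene.codecs.Codec;
    import org.apache.lucene.codecs.PostingsFormat;
    import org.apache.lucene.codecs.lucene50.Lucene50Codec;
    import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;

    Codec codec = new Lucene50Codec(Mode.BEST_COMPRESSION) {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            // per-field postings decisions live here; stored fields
            // compression was already fixed by the constructor argument
            return PostingsFormat.forName("Lucene50");
        }
    };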

src/main/java/org/elasticsearch/index/engine/internal/InternalEngine.java

@@ -20,6 +20,7 @@
package org.elasticsearch.index.engine.internal;
import com.google.common.collect.Lists;
import org.apache.lucene.index.*;
+ import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
import org.apache.lucene.search.*;
@@ -69,6 +70,7 @@ import org.elasticsearch.threadpool.ThreadPool;
import java.io.Closeable;
import java.io.IOException;
+ import java.lang.reflect.Method;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
@@ -1054,10 +1056,13 @@ public class InternalEngine implements Engine {
}
}
+ // TODO: can we please remove this method?!
private void waitForMerges(boolean flushAfter) {
try {
- currentIndexWriter().waitForMerges();
- } catch (IOException e) {
+ Method method = IndexWriter.class.getDeclaredMethod("waitForMerges");
+ method.setAccessible(true);
+ method.invoke(currentIndexWriter());
+ } catch (ReflectiveOperationException e) {
throw new OptimizeFailedEngineException(shardId, e);
}
if (flushAfter) {
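
The removed body was a direct currentIndexWriter().waitForMerges() call;
presumably that method is no longer publicly accessible in this Lucene
snapshot, hence the reflective workaround above. The same pattern in
isolation, with a hypothetical helper name:

    import java.lang.reflect.Method;

    // call a non-public no-arg method by name; ReflectiveOperationException
    // covers NoSuchMethodException, IllegalAccessException and
    // InvocationTargetException
    static void invokeHidden(Object target, String methodName) throws ReflectiveOperationException {
        Method m = target.getClass().getDeclaredMethod(methodName);
        m.setAccessible(true);
        m.invoke(target);
    }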

src/test/java/org/elasticsearch/index/codec/CodecTests.java

@@ -33,7 +33,15 @@ import org.apache.lucene.codecs.lucene46.Lucene46Codec;
import org.apache.lucene.codecs.lucene49.Lucene49Codec;
import org.apache.lucene.codecs.lucene50.Lucene50Codec;
import org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat;
+ import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
+ import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
+ import org.apache.lucene.document.Document;
+ import org.apache.lucene.index.DirectoryReader;
+ import org.apache.lucene.index.IndexWriter;
+ import org.apache.lucene.index.IndexWriterConfig;
+ import org.apache.lucene.index.SegmentReader;
+ import org.apache.lucene.store.Directory;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
@@ -173,6 +181,34 @@ public class CodecTests extends ElasticsearchSingleNodeLuceneTestCase {
assertThat(documentMapper.rootMapper(VersionFieldMapper.class).docValuesFormatProvider(), instanceOf(PreBuiltDocValuesFormatProvider.class));
assertThat(documentMapper.rootMapper(VersionFieldMapper.class).docValuesFormatProvider().get(), instanceOf(Lucene410DocValuesFormat.class));
}
+ public void testDefault() throws Exception {
+     Codec codec = createCodecService().codec("default");
+     assertCompressionEquals(Mode.BEST_SPEED, codec);
+ }
+
+ public void testBestCompression() throws Exception {
+     Codec codec = createCodecService().codec("best_compression");
+     assertCompressionEquals(Mode.BEST_COMPRESSION, codec);
+ }
+
+ // write some docs with it, inspect .si to see this was the used compression
+ private void assertCompressionEquals(Mode expected, Codec actual) throws Exception {
+     Directory dir = newDirectory();
+     IndexWriterConfig iwc = newIndexWriterConfig(null);
+     iwc.setCodec(actual);
+     IndexWriter iw = new IndexWriter(dir, iwc);
+     iw.addDocument(new Document());
+     iw.commit();
+     iw.close();
+     DirectoryReader ir = DirectoryReader.open(dir);
+     SegmentReader sr = (SegmentReader) ir.leaves().get(0).reader();
+     String v = sr.getSegmentInfo().info.getAttribute(Lucene50StoredFieldsFormat.MODE_KEY);
+     assertNotNull(v);
+     assertEquals(expected, Mode.valueOf(v));
+     ir.close();
+     dir.close();
+ }
private static CodecService createCodecService() {
return createCodecService(ImmutableSettings.Builder.EMPTY_SETTINGS);

src/test/java/org/elasticsearch/index/store/StoreTest.java

@@ -187,6 +187,7 @@ public class StoreTest extends ElasticsearchLuceneTestCase {
IOUtils.close(verifyingOutput, dir);
}
+ // TODO: remove this, its too fragile. just use a static old index instead.
private static final class OldSIMockingCodec extends FilterCodec {
protected OldSIMockingCodec() {
@@ -232,6 +233,7 @@ public class StoreTest extends ElasticsearchLuceneTestCase {
}
}
output.writeStringSet(files);
+ output.writeStringStringMap(si.getAttributes());
CodecUtil.writeFooter(output);
success = true;
} finally {
@@ -245,6 +247,7 @@
}
}
+ // IF THIS TEST FAILS ON UPGRADE GO LOOK AT THE OldSIMockingCodec!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@Test
public void testWriteLegacyChecksums() throws IOException {
final ShardId shardId = new ShardId(new Index("index"), 1);

src/test/java/org/elasticsearch/test/ElasticsearchIntegrationTest.java

@@ -315,6 +315,8 @@ public abstract class ElasticsearchIntegrationTest extends ElasticsearchTestCase
randomSettingsBuilder.put(SETTING_NUMBER_OF_SHARDS, numberOfShards())
.put(SETTING_NUMBER_OF_REPLICAS, numberOfReplicas());
randomSettingsBuilder.put("index.codec", randomFrom("default", "best_compression"));
XContentBuilder mappings = null;
if (frequently() && randomDynamicTemplates()) {
mappings = XContentFactory.jsonBuilder().startObject().startObject("_default_");