Specialise the default codec to reuse Lucene41 files in the common case.

Closes #2799
This commit is contained in:
Simon Willnauer 2013-03-18 22:58:57 +01:00
parent 54e7e309a5
commit 747ce36915
10 changed files with 266 additions and 25 deletions

View File

@ -263,7 +263,7 @@ public abstract class TransportBroadcastOperationAction<Request extends Broadcas
} else {
try {
onOperation(shard, shardOperation(shardRequest));
} catch (Exception e) {
} catch (Throwable e) {
onOperation(shard, shardIt, e);
}
}

View File

@ -21,7 +21,9 @@ package org.elasticsearch.index.codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene42.Lucene42Codec;
import org.elasticsearch.ElasticSearchIllegalStateException;
import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
import org.elasticsearch.index.mapper.FieldMappers;
import org.elasticsearch.index.mapper.MapperService;
/**
@ -45,7 +47,11 @@ public class PerFieldMappingPostingFormatCodec extends Lucene42Codec {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
PostingsFormatProvider postingsFormat = mapperService.indexName(field).mapper().postingsFormatProvider();
final FieldMappers indexName = mapperService.indexName(field);
if (indexName == null) {
throw new ElasticSearchIllegalStateException("no index mapper found for field: [" + field + "]");
}
PostingsFormatProvider postingsFormat = indexName.mapper().postingsFormatProvider();
return postingsFormat != null ? postingsFormat.get() : defaultPostingFormat;
}
}

View File

@ -21,6 +21,7 @@ package org.elasticsearch.index.codec.postingsformat;
import org.apache.lucene.codecs.*;
import org.apache.lucene.index.*;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Bits;
@ -82,7 +83,7 @@ public final class BloomFilterPostingsFormat extends PostingsFormat {
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state)
public BloomFilteredFieldsConsumer fieldsConsumer(SegmentWriteState state)
throws IOException {
if (delegatePostingsFormat == null) {
throw new UnsupportedOperationException("Error - " + getClass().getName()
@ -94,14 +95,19 @@ public final class BloomFilterPostingsFormat extends PostingsFormat {
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state)
public BloomFilteredFieldsProducer fieldsProducer(SegmentReadState state)
throws IOException {
return new BloomFilteredFieldsProducer(state);
}
public class BloomFilteredFieldsProducer extends FieldsProducer {
public final class BloomFilteredFieldsProducer extends FieldsProducer {
private FieldsProducer delegateFieldsProducer;
HashMap<String, BloomFilter> bloomsByFieldName = new HashMap<String, BloomFilter>();
// for internal use only
FieldsProducer getDelegate() {
return delegateFieldsProducer;
}
public BloomFilteredFieldsProducer(SegmentReadState state)
throws IOException {
@ -119,15 +125,18 @@ public final class BloomFilterPostingsFormat extends PostingsFormat {
// Load the delegate postings format
PostingsFormat delegatePostingsFormat = PostingsFormat.forName(bloomIn
.readString());
this.delegateFieldsProducer = delegatePostingsFormat
.fieldsProducer(state);
int numBlooms = bloomIn.readInt();
for (int i = 0; i < numBlooms; i++) {
int fieldNum = bloomIn.readInt();
BloomFilter bloom = BloomFilter.deserialize(bloomIn);
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
bloomsByFieldName.put(fieldInfo.name, bloom);
if (state.context.context != IOContext.Context.MERGE) {
// if we merge we don't need to load the bloom filters
for (int i = 0; i < numBlooms; i++) {
int fieldNum = bloomIn.readInt();
BloomFilter bloom = BloomFilter.deserialize(bloomIn);
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
bloomsByFieldName.put(fieldInfo.name, bloom);
}
}
IOUtils.close(bloomIn);
success = true;
@ -332,7 +341,7 @@ public final class BloomFilterPostingsFormat extends PostingsFormat {
}
class BloomFilteredFieldsConsumer extends FieldsConsumer {
final class BloomFilteredFieldsConsumer extends FieldsConsumer {
private FieldsConsumer delegateFieldsConsumer;
private Map<FieldInfo, BloomFilter> bloomFilters = new HashMap<FieldInfo, BloomFilter>();
private SegmentWriteState state;
@ -345,6 +354,11 @@ public final class BloomFilterPostingsFormat extends PostingsFormat {
// this.delegatePostingsFormat=delegatePostingsFormat;
this.state = state;
}
// for internal use only
FieldsConsumer getDelegate() {
return delegateFieldsConsumer;
}
@Override
public TermsConsumer addField(FieldInfo field) throws IOException {
@ -449,4 +463,8 @@ public final class BloomFilterPostingsFormat extends PostingsFormat {
}
public PostingsFormat getDelegate() {
return this.delegatePostingsFormat;
}
}

View File

@ -0,0 +1,84 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.codec.postingsformat;
import java.io.IOException;
import java.util.Iterator;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat.BloomFilteredFieldsConsumer;
import org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat.BloomFilteredFieldsProducer;
import org.elasticsearch.index.mapper.internal.UidFieldMapper;
/**
* This is the default postings format for ElasticSearch that special cases
* the <tt>_uid</tt> field to use a bloom filter while all other fields
* will use a {@link Lucene41PostingsFormat}. This format will reuse the underlying
* {@link Lucene41PostingsFormat} and it's files also for the <tt>_uid</tt> saving up to
* 5 files per segment in the default case.
*/
public final class ElasticSearch090PostingsFormat extends PostingsFormat {

    /** Bloom-filter wrapper around the stock Lucene 4.1 postings format. */
    private final BloomFilterPostingsFormat bloomPostings;

    public ElasticSearch090PostingsFormat() {
        super("es090");
        bloomPostings = new BloomFilterPostingsFormat(new Lucene41PostingsFormat(), BloomFilter.Factory.DEFAULT);
    }

    /** Returns the raw {@link Lucene41PostingsFormat} instance wrapped by this format. */
    public PostingsFormat getDefaultWrapped() {
        return bloomPostings.getDelegate();
    }

    @Override
    public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
        return new UidFilteringFieldsConsumer(bloomPostings.fieldsConsumer(state));
    }

    @Override
    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
        // we can just return the delegate here since we didn't record bloom filters for
        // the other fields.
        return bloomPostings.fieldsProducer(state);
    }

    /**
     * Routes only the <tt>_uid</tt> field through the bloom-filtered consumer;
     * every other field is written straight through to the wrapped delegate.
     */
    private static final class UidFilteringFieldsConsumer extends FieldsConsumer {

        private final BloomFilteredFieldsConsumer bloomConsumer;

        UidFilteringFieldsConsumer(BloomFilteredFieldsConsumer bloomConsumer) {
            this.bloomConsumer = bloomConsumer;
        }

        @Override
        public TermsConsumer addField(FieldInfo field) throws IOException {
            if (UidFieldMapper.NAME.equals(field.name)) {
                // only go through bloom for the UID field
                return bloomConsumer.addField(field);
            }
            return bloomConsumer.getDelegate().addField(field);
        }

        @Override
        public void close() throws IOException {
            bloomConsumer.close();
        }
    }
}

View File

@ -23,10 +23,6 @@ import com.google.common.collect.ImmutableCollection;
import com.google.common.collect.ImmutableMap;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat;
import org.apache.lucene.codecs.memory.DirectPostingsFormat;
import org.apache.lucene.codecs.memory.MemoryPostingsFormat;
import org.apache.lucene.codecs.pulsing.Pulsing41PostingsFormat;
import org.elasticsearch.common.collect.MapBuilder;
/**
@ -69,14 +65,15 @@ public class PostingFormats {
for (String luceneName : PostingsFormat.availablePostingsFormats()) {
buildInPostingFormatsX.put(luceneName, new PreBuiltPostingsFormatProvider.Factory(PostingsFormat.forName(luceneName)));
}
buildInPostingFormatsX.put("direct", new PreBuiltPostingsFormatProvider.Factory("direct", new DirectPostingsFormat()));
buildInPostingFormatsX.put("memory", new PreBuiltPostingsFormatProvider.Factory("memory", new MemoryPostingsFormat()));
final ElasticSearch090PostingsFormat defaultFormat = new ElasticSearch090PostingsFormat();
buildInPostingFormatsX.put("direct", new PreBuiltPostingsFormatProvider.Factory("direct", PostingsFormat.forName("Direct")));
buildInPostingFormatsX.put("memory", new PreBuiltPostingsFormatProvider.Factory("memory", PostingsFormat.forName("Memory")));
// LUCENE UPGRADE: Need to change this to the relevant ones on a lucene upgrade
buildInPostingFormatsX.put("pulsing", new PreBuiltPostingsFormatProvider.Factory("pulsing", new Pulsing41PostingsFormat()));
buildInPostingFormatsX.put("default", new PreBuiltPostingsFormatProvider.Factory("default", new Lucene41PostingsFormat()));
buildInPostingFormatsX.put("pulsing", new PreBuiltPostingsFormatProvider.Factory("pulsing", PostingsFormat.forName("Pulsing41")));
buildInPostingFormatsX.put("default", new PreBuiltPostingsFormatProvider.Factory("default", defaultFormat));
buildInPostingFormatsX.put("bloom_pulsing", new PreBuiltPostingsFormatProvider.Factory("bloom_pulsing", wrapInBloom(new Pulsing41PostingsFormat())));
buildInPostingFormatsX.put("bloom_default", new PreBuiltPostingsFormatProvider.Factory("bloom_default", wrapInBloom(new Lucene41PostingsFormat())));
buildInPostingFormatsX.put("bloom_pulsing", new PreBuiltPostingsFormatProvider.Factory("bloom_pulsing", wrapInBloom(PostingsFormat.forName("Pulsing41"))));
buildInPostingFormatsX.put("bloom_default", new PreBuiltPostingsFormatProvider.Factory("bloom_default", wrapInBloom(PostingsFormat.forName("Lucene41"))));
builtInPostingFormats = buildInPostingFormatsX.immutableMap();
}

View File

@ -60,6 +60,9 @@ public class PreBuiltPostingsFormatProvider implements PostingsFormatProvider {
}
public PreBuiltPostingsFormatProvider(String name, PostingsFormat postingsFormat) {
if (postingsFormat == null) {
throw new IllegalArgumentException("PostingsFormat must not be null");
}
this.name = name;
this.postingsFormat = postingsFormat;
}

View File

@ -134,7 +134,7 @@ public class UidFieldMapper extends AbstractFieldMapper<Uid> implements Internal
@Override
protected String defaultPostingFormat() {
return "bloom_default";
return "default";
}
@Override

View File

@ -1 +1,2 @@
org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat
org.elasticsearch.index.codec.postingsformat.ElasticSearch090PostingsFormat

View File

@ -69,8 +69,10 @@ public class CodecTests {
public void testResolveDefaultPostingFormats() throws Exception {
PostingsFormatService postingsFormatService = createCodecService().postingsFormatService();
assertThat(postingsFormatService.get("default"), instanceOf(PreBuiltPostingsFormatProvider.class));
assertThat(postingsFormatService.get("default").get(), instanceOf(ElasticSearch090PostingsFormat.class));
// Should fail when upgrading Lucene with codec changes
assertThat(postingsFormatService.get("default").get(), instanceOf(((PerFieldPostingsFormat) Codec.getDefault().postingsFormat()).getPostingsFormatForField(null).getClass()));
assertThat(((ElasticSearch090PostingsFormat)postingsFormatService.get("default").get()).getDefaultWrapped(), instanceOf(((PerFieldPostingsFormat) Codec.getDefault().postingsFormat()).getPostingsFormatForField(null).getClass()));
assertThat(postingsFormatService.get("Lucene41"), instanceOf(PreBuiltPostingsFormatProvider.class));
// Should fail when upgrading Lucene with codec changes
assertThat(postingsFormatService.get("Lucene41").get(), instanceOf(((PerFieldPostingsFormat) Codec.getDefault().postingsFormat()).getPostingsFormatForField(null).getClass()));
@ -126,7 +128,7 @@ public class CodecTests {
CodecService codecService = createCodecService(indexSettings);
DocumentMapper documentMapper = codecService.mapperService().documentMapperParser().parse(mapping);
assertThat(documentMapper.mappers().name("field1").mapper().postingsFormatProvider(), instanceOf(PreBuiltPostingsFormatProvider.class));
assertThat(documentMapper.mappers().name("field1").mapper().postingsFormatProvider().get(), instanceOf(Lucene41PostingsFormat.class));
assertThat(documentMapper.mappers().name("field1").mapper().postingsFormatProvider().get(), instanceOf(ElasticSearch090PostingsFormat.class));
assertThat(documentMapper.mappers().name("field2").mapper().postingsFormatProvider(), instanceOf(DefaultPostingsFormatProvider.class));
DefaultPostingsFormatProvider provider = (DefaultPostingsFormatProvider) documentMapper.mappers().name("field2").mapper().postingsFormatProvider();

View File

@ -0,0 +1,130 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.unit.index.codec.postingformat;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.not;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene42.Lucene42Codec;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat;
import org.elasticsearch.index.codec.postingsformat.ElasticSearch090PostingsFormat;
import org.elasticsearch.index.mapper.internal.UidFieldMapper;
import org.testng.annotations.Test;
/**
* Simple smoke test for {@link ElasticSearch090PostingsFormat}
*/
public class DefaultPostingsFormatTests {

    /** Codec that forces every field through the ES default postings format. */
    private final class TestCodec extends Lucene42Codec {

        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            return new ElasticSearch090PostingsFormat();
        }
    }

    /** Opens an {@link IndexWriter} on the given directory configured with the given codec. */
    private IndexWriter newWriter(Directory dir, Codec codec) throws IOException {
        IndexWriterConfig config = new IndexWriterConfig(Lucene.VERSION, new WhitespaceAnalyzer(Lucene.VERSION));
        config.setCodec(codec);
        return new IndexWriter(dir, config);
    }

    @Test
    public void testUseDefault() throws IOException {
        Directory dir = new RAMDirectory();
        IndexWriter writer = newWriter(dir, new TestCodec());
        writer.addDocument(Arrays.asList(new TextField("foo", "bar", Store.YES), new TextField(UidFieldMapper.NAME, "1234", Store.YES)));
        writer.commit();
        DirectoryReader reader = DirectoryReader.open(writer, false);
        List<AtomicReaderContext> leaves = reader.leaves();
        assertThat(leaves.size(), equalTo(1));
        AtomicReader leaf = leaves.get(0).reader();
        Terms fooTerms = leaf.terms("foo");
        Terms uidTerms = leaf.terms(UidFieldMapper.NAME);
        assertThat(fooTerms.size(), equalTo(1l));
        // only the _uid field should be wrapped in a bloom filter
        assertThat(fooTerms, not(instanceOf(BloomFilterPostingsFormat.BloomFilteredFieldsProducer.BloomFilteredTerms.class)));
        assertThat(uidTerms, instanceOf(BloomFilterPostingsFormat.BloomFilteredFieldsProducer.BloomFilteredTerms.class));
        reader.close();
        writer.close();
        dir.close();
    }

    @Test
    public void testNoUIDField() throws IOException {
        Directory dir = new RAMDirectory();
        IndexWriter writer = newWriter(dir, new TestCodec());
        for (int i = 0; i < 100; i++) {
            writer.addDocument(Arrays.asList(new TextField("foo", "foo bar foo bar", Store.YES), new TextField("some_other_field", "1234", Store.YES)));
        }
        writer.forceMerge(1);
        writer.commit();
        DirectoryReader reader = DirectoryReader.open(writer, false);
        List<AtomicReaderContext> leaves = reader.leaves();
        assertThat(leaves.size(), equalTo(1));
        AtomicReader leaf = leaves.get(0).reader();
        Terms fooTerms = leaf.terms("foo");
        Terms otherTerms = leaf.terms("some_other_field");
        assertThat(fooTerms.size(), equalTo(2l));
        // without a _uid field no terms should be bloom-filtered
        assertThat(fooTerms, not(instanceOf(BloomFilterPostingsFormat.BloomFilteredFieldsProducer.BloomFilteredTerms.class)));
        assertThat(otherTerms, not(instanceOf(BloomFilterPostingsFormat.BloomFilteredFieldsProducer.BloomFilteredTerms.class)));
        TermsEnum termsEnum = fooTerms.iterator(null);
        Set<String> pending = new HashSet<String>(Arrays.asList("foo", "bar"));
        while (termsEnum.next() != null) {
            pending.remove(termsEnum.term().utf8ToString());
        }
        assertThat(pending.size(), equalTo(0));
        reader.close();
        writer.close();
        dir.close();
    }
}