Specialise the default codec to reuse Lucene41 files in the common case.

Closes #2799
This commit is contained in:
Simon Willnauer 2013-03-18 22:58:57 +01:00
parent 54e7e309a5
commit 747ce36915
10 changed files with 266 additions and 25 deletions

View File

@ -263,7 +263,7 @@ public abstract class TransportBroadcastOperationAction<Request extends Broadcas
} else {
try {
onOperation(shard, shardOperation(shardRequest));
} catch (Exception e) {
} catch (Throwable e) {
onOperation(shard, shardIt, e);
}
}

View File

@ -21,7 +21,9 @@ package org.elasticsearch.index.codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene42.Lucene42Codec;
import org.elasticsearch.ElasticSearchIllegalStateException;
import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
import org.elasticsearch.index.mapper.FieldMappers;
import org.elasticsearch.index.mapper.MapperService;
/**
@ -45,7 +47,11 @@ public class PerFieldMappingPostingFormatCodec extends Lucene42Codec {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
PostingsFormatProvider postingsFormat = mapperService.indexName(field).mapper().postingsFormatProvider();
final FieldMappers indexName = mapperService.indexName(field);
if (indexName == null) {
throw new ElasticSearchIllegalStateException("no index mapper found for field: [" + field + "]");
}
PostingsFormatProvider postingsFormat = indexName.mapper().postingsFormatProvider();
return postingsFormat != null ? postingsFormat.get() : defaultPostingFormat;
}
}

View File

@ -21,6 +21,7 @@ package org.elasticsearch.index.codec.postingsformat;
import org.apache.lucene.codecs.*;
import org.apache.lucene.index.*;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Bits;
@ -82,7 +83,7 @@ public final class BloomFilterPostingsFormat extends PostingsFormat {
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state)
public BloomFilteredFieldsConsumer fieldsConsumer(SegmentWriteState state)
throws IOException {
if (delegatePostingsFormat == null) {
throw new UnsupportedOperationException("Error - " + getClass().getName()
@ -94,14 +95,19 @@ public final class BloomFilterPostingsFormat extends PostingsFormat {
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state)
public BloomFilteredFieldsProducer fieldsProducer(SegmentReadState state)
throws IOException {
return new BloomFilteredFieldsProducer(state);
}
public class BloomFilteredFieldsProducer extends FieldsProducer {
public final class BloomFilteredFieldsProducer extends FieldsProducer {
private FieldsProducer delegateFieldsProducer;
HashMap<String, BloomFilter> bloomsByFieldName = new HashMap<String, BloomFilter>();
// for internal use only
FieldsProducer getDelegate() {
return delegateFieldsProducer;
}
public BloomFilteredFieldsProducer(SegmentReadState state)
throws IOException {
@ -119,15 +125,18 @@ public final class BloomFilterPostingsFormat extends PostingsFormat {
// Load the delegate postings format
PostingsFormat delegatePostingsFormat = PostingsFormat.forName(bloomIn
.readString());
this.delegateFieldsProducer = delegatePostingsFormat
.fieldsProducer(state);
int numBlooms = bloomIn.readInt();
for (int i = 0; i < numBlooms; i++) {
int fieldNum = bloomIn.readInt();
BloomFilter bloom = BloomFilter.deserialize(bloomIn);
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
bloomsByFieldName.put(fieldInfo.name, bloom);
if (state.context.context != IOContext.Context.MERGE) {
// if we merge we don't need to load the bloom filters
for (int i = 0; i < numBlooms; i++) {
int fieldNum = bloomIn.readInt();
BloomFilter bloom = BloomFilter.deserialize(bloomIn);
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
bloomsByFieldName.put(fieldInfo.name, bloom);
}
}
IOUtils.close(bloomIn);
success = true;
@ -332,7 +341,7 @@ public final class BloomFilterPostingsFormat extends PostingsFormat {
}
class BloomFilteredFieldsConsumer extends FieldsConsumer {
final class BloomFilteredFieldsConsumer extends FieldsConsumer {
private FieldsConsumer delegateFieldsConsumer;
private Map<FieldInfo, BloomFilter> bloomFilters = new HashMap<FieldInfo, BloomFilter>();
private SegmentWriteState state;
@ -345,6 +354,11 @@ public final class BloomFilterPostingsFormat extends PostingsFormat {
// this.delegatePostingsFormat=delegatePostingsFormat;
this.state = state;
}
// for internal use only
FieldsConsumer getDelegate() {
return delegateFieldsConsumer;
}
@Override
public TermsConsumer addField(FieldInfo field) throws IOException {
@ -449,4 +463,8 @@ public final class BloomFilterPostingsFormat extends PostingsFormat {
}
public PostingsFormat getDelegate() {
return this.delegatePostingsFormat;
}
}

View File

@ -0,0 +1,84 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.codec.postingsformat;
import java.io.IOException;
import java.util.Iterator;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat.BloomFilteredFieldsConsumer;
import org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat.BloomFilteredFieldsProducer;
import org.elasticsearch.index.mapper.internal.UidFieldMapper;
/**
* This is the default postings format for ElasticSearch that special cases
* the <tt>_uid</tt> field to use a bloom filter while all other fields
* will use a {@link Lucene41PostingsFormat}. This format will reuse the underlying
* {@link Lucene41PostingsFormat} and it's files also for the <tt>_uid</tt> saving up to
* 5 files per segment in the default case.
*/
public final class ElasticSearch090PostingsFormat extends PostingsFormat {

    /** Bloom-filter wrapper around the stock Lucene 4.1 postings format. */
    private final BloomFilterPostingsFormat bloomPostings;

    public ElasticSearch090PostingsFormat() {
        super("es090");
        bloomPostings = new BloomFilterPostingsFormat(new Lucene41PostingsFormat(), BloomFilter.Factory.DEFAULT);
    }

    /** Returns the raw {@link Lucene41PostingsFormat} instance wrapped by this format. */
    public PostingsFormat getDefaultWrapped() {
        return bloomPostings.getDelegate();
    }

    @Override
    public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
        return new UidFilteringFieldsConsumer(bloomPostings.fieldsConsumer(state));
    }

    @Override
    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
        // we can just return the delegate here since we didn't record bloom filters for
        // the other fields.
        return bloomPostings.fieldsProducer(state);
    }

    /**
     * Routes only the <tt>_uid</tt> field through the bloom-filtered consumer;
     * every other field is written straight through to the wrapped delegate.
     */
    private static final class UidFilteringFieldsConsumer extends FieldsConsumer {

        private final BloomFilteredFieldsConsumer bloomConsumer;

        UidFilteringFieldsConsumer(BloomFilteredFieldsConsumer bloomConsumer) {
            this.bloomConsumer = bloomConsumer;
        }

        @Override
        public TermsConsumer addField(FieldInfo field) throws IOException {
            if (UidFieldMapper.NAME.equals(field.name)) {
                // only go through bloom for the UID field
                return bloomConsumer.addField(field);
            }
            return bloomConsumer.getDelegate().addField(field);
        }

        @Override
        public void close() throws IOException {
            bloomConsumer.close();
        }
    }
}

View File

@ -23,10 +23,6 @@ import com.google.common.collect.ImmutableCollection;
import com.google.common.collect.ImmutableMap;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat;
import org.apache.lucene.codecs.memory.DirectPostingsFormat;
import org.apache.lucene.codecs.memory.MemoryPostingsFormat;
import org.apache.lucene.codecs.pulsing.Pulsing41PostingsFormat;
import org.elasticsearch.common.collect.MapBuilder;
/**
@ -69,14 +65,15 @@ public class PostingFormats {
for (String luceneName : PostingsFormat.availablePostingsFormats()) {
buildInPostingFormatsX.put(luceneName, new PreBuiltPostingsFormatProvider.Factory(PostingsFormat.forName(luceneName)));
}
buildInPostingFormatsX.put("direct", new PreBuiltPostingsFormatProvider.Factory("direct", new DirectPostingsFormat()));
buildInPostingFormatsX.put("memory", new PreBuiltPostingsFormatProvider.Factory("memory", new MemoryPostingsFormat()));
final ElasticSearch090PostingsFormat defaultFormat = new ElasticSearch090PostingsFormat();
buildInPostingFormatsX.put("direct", new PreBuiltPostingsFormatProvider.Factory("direct", PostingsFormat.forName("Direct")));
buildInPostingFormatsX.put("memory", new PreBuiltPostingsFormatProvider.Factory("memory", PostingsFormat.forName("Memory")));
// LUCENE UPGRADE: Need to change this to the relevant ones on a lucene upgrade
buildInPostingFormatsX.put("pulsing", new PreBuiltPostingsFormatProvider.Factory("pulsing", new Pulsing41PostingsFormat()));
buildInPostingFormatsX.put("default", new PreBuiltPostingsFormatProvider.Factory("default", new Lucene41PostingsFormat()));
buildInPostingFormatsX.put("pulsing", new PreBuiltPostingsFormatProvider.Factory("pulsing", PostingsFormat.forName("Pulsing41")));
buildInPostingFormatsX.put("default", new PreBuiltPostingsFormatProvider.Factory("default", defaultFormat));
buildInPostingFormatsX.put("bloom_pulsing", new PreBuiltPostingsFormatProvider.Factory("bloom_pulsing", wrapInBloom(new Pulsing41PostingsFormat())));
buildInPostingFormatsX.put("bloom_default", new PreBuiltPostingsFormatProvider.Factory("bloom_default", wrapInBloom(new Lucene41PostingsFormat())));
buildInPostingFormatsX.put("bloom_pulsing", new PreBuiltPostingsFormatProvider.Factory("bloom_pulsing", wrapInBloom(PostingsFormat.forName("Pulsing41"))));
buildInPostingFormatsX.put("bloom_default", new PreBuiltPostingsFormatProvider.Factory("bloom_default", wrapInBloom(PostingsFormat.forName("Lucene41"))));
builtInPostingFormats = buildInPostingFormatsX.immutableMap();
}

View File

@ -60,6 +60,9 @@ public class PreBuiltPostingsFormatProvider implements PostingsFormatProvider {
}
public PreBuiltPostingsFormatProvider(String name, PostingsFormat postingsFormat) {
if (postingsFormat == null) {
throw new IllegalArgumentException("PostingsFormat must not be null");
}
this.name = name;
this.postingsFormat = postingsFormat;
}

View File

@ -134,7 +134,7 @@ public class UidFieldMapper extends AbstractFieldMapper<Uid> implements Internal
@Override
protected String defaultPostingFormat() {
return "bloom_default";
return "default";
}
@Override

View File

@ -1 +1,2 @@
org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat
org.elasticsearch.index.codec.postingsformat.ElasticSearch090PostingsFormat

View File

@ -69,8 +69,10 @@ public class CodecTests {
public void testResolveDefaultPostingFormats() throws Exception {
PostingsFormatService postingsFormatService = createCodecService().postingsFormatService();
assertThat(postingsFormatService.get("default"), instanceOf(PreBuiltPostingsFormatProvider.class));
assertThat(postingsFormatService.get("default").get(), instanceOf(ElasticSearch090PostingsFormat.class));
// Should fail when upgrading Lucene with codec changes
assertThat(postingsFormatService.get("default").get(), instanceOf(((PerFieldPostingsFormat) Codec.getDefault().postingsFormat()).getPostingsFormatForField(null).getClass()));
assertThat(((ElasticSearch090PostingsFormat)postingsFormatService.get("default").get()).getDefaultWrapped(), instanceOf(((PerFieldPostingsFormat) Codec.getDefault().postingsFormat()).getPostingsFormatForField(null).getClass()));
assertThat(postingsFormatService.get("Lucene41"), instanceOf(PreBuiltPostingsFormatProvider.class));
// Should fail when upgrading Lucene with codec changes
assertThat(postingsFormatService.get("Lucene41").get(), instanceOf(((PerFieldPostingsFormat) Codec.getDefault().postingsFormat()).getPostingsFormatForField(null).getClass()));
@ -126,7 +128,7 @@ public class CodecTests {
CodecService codecService = createCodecService(indexSettings);
DocumentMapper documentMapper = codecService.mapperService().documentMapperParser().parse(mapping);
assertThat(documentMapper.mappers().name("field1").mapper().postingsFormatProvider(), instanceOf(PreBuiltPostingsFormatProvider.class));
assertThat(documentMapper.mappers().name("field1").mapper().postingsFormatProvider().get(), instanceOf(Lucene41PostingsFormat.class));
assertThat(documentMapper.mappers().name("field1").mapper().postingsFormatProvider().get(), instanceOf(ElasticSearch090PostingsFormat.class));
assertThat(documentMapper.mappers().name("field2").mapper().postingsFormatProvider(), instanceOf(DefaultPostingsFormatProvider.class));
DefaultPostingsFormatProvider provider = (DefaultPostingsFormatProvider) documentMapper.mappers().name("field2").mapper().postingsFormatProvider();

View File

@ -0,0 +1,130 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.unit.index.codec.postingformat;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.not;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene42.Lucene42Codec;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat;
import org.elasticsearch.index.codec.postingsformat.ElasticSearch090PostingsFormat;
import org.elasticsearch.index.mapper.internal.UidFieldMapper;
import org.testng.annotations.Test;
/**
* Simple smoke test for {@link ElasticSearch090PostingsFormat}
*/
public class DefaultPostingsFormatTests {

    /** Codec that forces every field through the ES default postings format. */
    private final class TestCodec extends Lucene42Codec {

        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            return new ElasticSearch090PostingsFormat();
        }
    }

    /** Opens an {@link IndexWriter} on the given directory configured with the given codec. */
    private IndexWriter newWriter(Directory dir, Codec codec) throws IOException {
        IndexWriterConfig config = new IndexWriterConfig(Lucene.VERSION, new WhitespaceAnalyzer(Lucene.VERSION));
        config.setCodec(codec);
        return new IndexWriter(dir, config);
    }

    @Test
    public void testUseDefault() throws IOException {
        Directory dir = new RAMDirectory();
        IndexWriter writer = newWriter(dir, new TestCodec());
        writer.addDocument(Arrays.asList(new TextField("foo", "bar", Store.YES), new TextField(UidFieldMapper.NAME, "1234", Store.YES)));
        writer.commit();
        DirectoryReader reader = DirectoryReader.open(writer, false);
        List<AtomicReaderContext> leaves = reader.leaves();
        assertThat(leaves.size(), equalTo(1));
        AtomicReader leaf = leaves.get(0).reader();
        Terms fooTerms = leaf.terms("foo");
        Terms uidTerms = leaf.terms(UidFieldMapper.NAME);
        assertThat(fooTerms.size(), equalTo(1l));
        // only the _uid field should be wrapped in a bloom filter
        assertThat(fooTerms, not(instanceOf(BloomFilterPostingsFormat.BloomFilteredFieldsProducer.BloomFilteredTerms.class)));
        assertThat(uidTerms, instanceOf(BloomFilterPostingsFormat.BloomFilteredFieldsProducer.BloomFilteredTerms.class));
        reader.close();
        writer.close();
        dir.close();
    }

    @Test
    public void testNoUIDField() throws IOException {
        Directory dir = new RAMDirectory();
        IndexWriter writer = newWriter(dir, new TestCodec());
        for (int i = 0; i < 100; i++) {
            writer.addDocument(Arrays.asList(new TextField("foo", "foo bar foo bar", Store.YES), new TextField("some_other_field", "1234", Store.YES)));
        }
        writer.forceMerge(1);
        writer.commit();
        DirectoryReader reader = DirectoryReader.open(writer, false);
        List<AtomicReaderContext> leaves = reader.leaves();
        assertThat(leaves.size(), equalTo(1));
        AtomicReader leaf = leaves.get(0).reader();
        Terms fooTerms = leaf.terms("foo");
        Terms otherTerms = leaf.terms("some_other_field");
        assertThat(fooTerms.size(), equalTo(2l));
        // without a _uid field no terms should be bloom-filtered
        assertThat(fooTerms, not(instanceOf(BloomFilterPostingsFormat.BloomFilteredFieldsProducer.BloomFilteredTerms.class)));
        assertThat(otherTerms, not(instanceOf(BloomFilterPostingsFormat.BloomFilteredFieldsProducer.BloomFilteredTerms.class)));
        TermsEnum termsEnum = fooTerms.iterator(null);
        Set<String> pending = new HashSet<String>(Arrays.asList("foo", "bar"));
        while (termsEnum.next() != null) {
            pending.remove(termsEnum.term().utf8ToString());
        }
        assertThat(pending.size(), equalTo(0));
        reader.close();
        writer.close();
        dir.close();
    }
}